# 7.1 Handling Missing Data

In [2]:
import pandas as pd 
import numpy as np

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [6]:
string_data[1] = pd.NA
string_data.isnull()

0     True
1     True
2     True
3    False
dtype: bool

## 7.1.1 Filtering Out Missing Data

In [8]:
from numpy import nan as NA

data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [11]:
data.notnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [10]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data = pd.DataFrame([[1.,6.5,3.], 
                     [1.,NA,NA], 
                     [NA,NA,NA], 
                     [NA,6.5,3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
data.dropna(how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
data.dropna(axis = 1, how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
data.dropna(axis = 1, how = 'all').dropna(axis = 0, how = 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [18]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1] = NA
df.iloc[:2,2] = NA
df

Unnamed: 0,0,1,2
0,-0.071707,,
1,1.068716,,
2,-0.452546,,1.021532
3,0.187533,,-0.774949
4,-0.694325,-0.455999,0.461452
5,0.682559,1.479124,1.120273
6,-1.034461,-0.518801,-1.772283


In [19]:
df.dropna(axis = 0)

Unnamed: 0,0,1,2
4,-0.694325,-0.455999,0.461452
5,0.682559,1.479124,1.120273
6,-1.034461,-0.518801,-1.772283


In [20]:
df.dropna(axis = 0, thresh = 2)

Unnamed: 0,0,1,2
2,-0.452546,,1.021532
3,0.187533,,-0.774949
4,-0.694325,-0.455999,0.461452
5,0.682559,1.479124,1.120273
6,-1.034461,-0.518801,-1.772283


## 7.1.2 Filling in Missing Data

In [21]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.071707,0.0,0.0
1,1.068716,0.0,0.0
2,-0.452546,0.0,1.021532
3,0.187533,0.0,-0.774949
4,-0.694325,-0.455999,0.461452
5,0.682559,1.479124,1.120273
6,-1.034461,-0.518801,-1.772283


In [22]:
df.fillna({1: 0.5, 
           2: 0})

Unnamed: 0,0,1,2
0,-0.071707,0.5,0.0
1,1.068716,0.5,0.0
2,-0.452546,0.5,1.021532
3,0.187533,0.5,-0.774949
4,-0.694325,-0.455999,0.461452
5,0.682559,1.479124,1.120273
6,-1.034461,-0.518801,-1.772283


In [23]:
_ = df.fillna(0, inplace = True)
df

Unnamed: 0,0,1,2
0,-0.071707,0.0,0.0
1,1.068716,0.0,0.0
2,-0.452546,0.0,1.021532
3,0.187533,0.0,-0.774949
4,-0.694325,-0.455999,0.461452
5,0.682559,1.479124,1.120273
6,-1.034461,-0.518801,-1.772283


In [24]:
df = pd.DataFrame(np.random.randn(6,3))
df.iloc[2:,1] = NA
df.iloc[4:,2] = NA
df

Unnamed: 0,0,1,2
0,-1.497585,1.212359,-1.896243
1,1.02445,-2.085406,-0.174656
2,-2.040181,,0.989439
3,0.17965,,0.297747
4,-1.252829,,
5,0.423227,,


In [25]:
df.fillna(method = 'ffill')

Unnamed: 0,0,1,2
0,-1.497585,1.212359,-1.896243
1,1.02445,-2.085406,-0.174656
2,-2.040181,-2.085406,0.989439
3,0.17965,-2.085406,0.297747
4,-1.252829,-2.085406,0.297747
5,0.423227,-2.085406,0.297747


In [26]:
df.fillna(method = 'bfill')

Unnamed: 0,0,1,2
0,-1.497585,1.212359,-1.896243
1,1.02445,-2.085406,-0.174656
2,-2.040181,,0.989439
3,0.17965,,0.297747
4,-1.252829,,
5,0.423227,,


In [27]:
df.fillna(method = 'ffill', limit = 2)

Unnamed: 0,0,1,2
0,-1.497585,1.212359,-1.896243
1,1.02445,-2.085406,-0.174656
2,-2.040181,-2.085406,0.989439
3,0.17965,-2.085406,0.297747
4,-1.252829,,0.297747
5,0.423227,,0.297747


In [28]:
data = pd.Series([1.,NA,3.5,NA,7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [29]:
data.mean()

3.8333333333333335

In [30]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 7.2 Data Transformation

## 7.2.1 Removing Duplicates

In [31]:
data = pd.DataFrame({'k1':['one','two'] * 3 + ['two'],
                     'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [32]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [33]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [34]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [35]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [36]:
data.drop_duplicates(['k2'])

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [37]:
data.drop_duplicates(['k1','k2'], keep = 'last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


## 7.2.2 Transforming Data Using a Function or Mapping

In [38]:
data = pd.DataFrame({'food':['bacon','pulled pork','bacon',
                             'Pastrami','corned beef','Bacon',
                             'pastrami','honey ham','nova lox'],
                     'ounces':[4,3,12,6,7.5,8,3,5,6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [39]:
meat_to_animal = {
    'bacon' : 'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox': 'salmon'
}

In [40]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [42]:
# The map method on a Series accepts a function or dict-like object containing a mapping 
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [44]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Using map is a convenient way to perform element-wise transformations and other data cleaning related operations

## 7.2.3 Replacing Values

In [45]:
data = pd.Series([1.,-999.,2.,-999.,-1000.,3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [46]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [47]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [49]:
data.replace([-999, -1000],
             [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [50]:
data.replace({-999: np.nan,
              -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

## 7.2.4 Renaming Axis Indexes

In [51]:
data = pd.DataFrame(np.arange(12).reshape((3,4)),
                    index = ['Ohio','Colorado','New York'],
                    columns = ['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [52]:
transform = lambda x: x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [54]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [55]:
data.rename(index = str.title, columns = str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [56]:
data.rename(index = {'OHIO': 'INDIANA'},
            columns = {'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [57]:
data.rename(index = {'OHIO':'INDIANA'}, 
            inplace = True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## 7.2.5 Discretization and Binning

In [58]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]

cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [59]:
type(cats)

pandas.core.arrays.categorical.Categorical

The object pandas returns is a special Categorical object. The output describes the bins computed by pandas.cut. You can treat if like an array of strings indicating the bin name. Internally it contains a categories array specifying the distinct category names along with a labeling for the ages data in the codes attribute

In [63]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [64]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [65]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [66]:
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [67]:
pd.cut(ages, bins = [18, 26, 36, 61, 100], right = False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [68]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senoir']

pd.cut(ages, bins, labels = group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senoir, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senoir]

In [70]:
pd.cut(ages, bins, labels = group_names).value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senoir        1
dtype: int64

In [71]:
data = np.random.rand(20)

pd.cut(data, bins = 4, precision = 2)

[(0.25, 0.48], (0.0075, 0.25], (0.72, 0.96], (0.72, 0.96], (0.25, 0.48], ..., (0.25, 0.48], (0.48, 0.72], (0.48, 0.72], (0.0075, 0.25], (0.0075, 0.25]]
Length: 20
Categories (4, interval[float64]): [(0.0075, 0.25] < (0.25, 0.48] < (0.48, 0.72] < (0.72, 0.96]]

In [72]:
data = np.random.randn(1000)

cats = pd.qcut(data, 4)
cats

[(-0.688, 0.0643], (0.703, 4.113], (-0.688, 0.0643], (0.0643, 0.703], (-0.688, 0.0643], ..., (-0.688, 0.0643], (-2.859, -0.688], (0.703, 4.113], (-0.688, 0.0643], (0.703, 4.113]]
Length: 1000
Categories (4, interval[float64]): [(-2.859, -0.688] < (-0.688, 0.0643] < (0.0643, 0.703] < (0.703, 4.113]]

In [73]:
pd.value_counts(cats)

(0.703, 4.113]      250
(0.0643, 0.703]     250
(-0.688, 0.0643]    250
(-2.859, -0.688]    250
dtype: int64

In [74]:
cats.value_counts()

(-2.859, -0.688]    250
(-0.688, 0.0643]    250
(0.0643, 0.703]     250
(0.703, 4.113]      250
dtype: int64

In [75]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-1.273, 0.0643], (1.415, 4.113], (-1.273, 0.0643], (0.0643, 1.415], (-1.273, 0.0643], ..., (-1.273, 0.0643], (-1.273, 0.0643], (0.0643, 1.415], (-1.273, 0.0643], (1.415, 4.113]]
Length: 1000
Categories (4, interval[float64]): [(-2.859, -1.273] < (-1.273, 0.0643] < (0.0643, 1.415] < (1.415, 4.113]]

In [76]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-2.859, -1.273]    100
(-1.273, 0.0643]    400
(0.0643, 1.415]     400
(1.415, 4.113]      100
dtype: int64

## 7.2.6 Detecting and Filtering Outliers

In [77]:
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.01367,-0.007769,0.025712,-0.075328
std,1.025571,0.990324,0.990472,0.980218
min,-2.776624,-3.604173,-3.338282,-2.591883
25%,-0.647689,-0.673692,-0.631187,-0.768725
50%,0.04331,-0.025538,0.0186,-0.103471
75%,0.687216,0.665822,0.700813,0.572454
max,4.413689,2.75705,3.436654,3.32723


In [78]:
data

Unnamed: 0,0,1,2,3
0,1.683154,0.366238,-0.437225,1.270393
1,0.229333,-1.263995,-1.163191,1.052016
2,0.881827,-0.249234,-2.425877,0.699156
3,0.012753,0.895358,-0.699776,-1.348860
4,1.131795,0.183718,1.031610,-0.956636
...,...,...,...,...
995,-1.093044,1.592552,-0.343079,-1.393141
996,1.261601,-0.093438,-0.947772,0.857894
997,-0.331583,-0.732874,-1.609683,-0.878360
998,-1.107969,-1.328181,-1.615729,-0.340583


In [79]:
col = data[2]
col[np.abs(col)> 3]

90    -3.125781
236    3.436654
347   -3.302047
659    3.194808
780   -3.338282
Name: 2, dtype: float64

In [80]:
# select all rows having a valye exceeding 3 or -3 
data[(np.abs(data) > 3).any(axis = 1)]

Unnamed: 0,0,1,2,3
90,1.226201,0.653337,-3.125781,0.972325
204,3.184598,-0.258595,-0.093578,0.566035
236,-2.659082,-0.842058,3.436654,0.213854
243,0.26794,-3.604173,-1.157686,-0.860695
245,0.91405,-3.487085,0.059785,0.179756
334,3.032231,0.298442,-1.401059,-0.455487
347,0.394724,-0.590721,-3.302047,-0.596958
563,3.088035,0.070075,1.203351,0.291944
659,-1.144053,1.088326,3.194808,-1.182958
780,0.266397,1.052922,-3.338282,2.09318


In [81]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.011951,-0.006678,0.025846,-0.075655
std,1.019546,0.986711,0.985993,0.979135
min,-2.776624,-3.0,-3.0,-2.591883
25%,-0.647689,-0.673692,-0.631187,-0.768725
50%,0.04331,-0.025538,0.0186,-0.103471
75%,0.687216,0.665822,0.700813,0.572454
max,3.0,2.75705,3.0,3.0


In [82]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,1.0,-1.0,-1.0,1.0
2,1.0,-1.0,-1.0,1.0
3,1.0,1.0,-1.0,-1.0
4,1.0,1.0,1.0,-1.0


## 7.2.7 Permutation and Random Sampling

In [83]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5,4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [84]:
sampler = np.random.permutation(5)
sampler

array([2, 1, 4, 0, 3])

In [85]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15


In [88]:
df.iloc[sampler,:]

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
4,16,17,18,19
0,0,1,2,3
3,12,13,14,15


In [89]:
df.sample(n = 3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7


In [90]:
choices = pd.Series([5,7,-1,6,4])
draws = choices.sample(n = 10, replace = True)
draws

4    4
3    6
4    4
3    6
3    6
2   -1
3    6
2   -1
3    6
4    4
dtype: int64

## 7.2.8 Computing Indicator / Dummy Variables

In [91]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'], 
                   'data1':range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [92]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [93]:
dummies = pd.get_dummies(df['key'], prefix = 'key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [94]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy 

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [95]:
mnames = ['movie_id', 'title','genres']

movies = pd.read_table('/Users/boyuan/Desktop/OneDrive/Python for data analysis 2nd/datasets/movielens/movies.dat', 
                       sep = '::', 
                       header = None, 
                       names = mnames, 
                       engine='python')
movies[:10]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [97]:
# extract the list of unique genres in the dataset 
all_genres = []

for x in movies.genres:
    all_genres.extend(x.split('|'))
    
genres = pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [98]:
zero_matrix = np.zeros((len(movies), len(genres)))
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [99]:
dummies = pd.DataFrame(zero_matrix, columns = genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
gen = movies.genres[0]
gen

"Animation|Children's|Comedy"

In [101]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [102]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2])

In [103]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [104]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [105]:
np.random.seed(12345)

values = np.random.rand(10)

values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [106]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [107]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# 7.3 String Manipulation

## 7.3.1 String Object Methods

In [109]:
val = 'a,b, guido'
val.split(sep = ',')

['a', 'b', ' guido']

In [110]:
type(val)

str

In [112]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [113]:
first, second, third = pieces

first + '::' + second + '::' + third

'a::b::guido'

In [114]:
'::'.join(pieces)

'a::b::guido'

In [115]:
'guido' in val

True

In [116]:
'gu' in val

True

In [117]:
val.index(',')

1

In [118]:
val.find(':')

-1

In [119]:
val.index(':')

ValueError: substring not found

In [121]:
try:
    val.index(':')
except ValueError:
    pass

In [122]:
val.count(',')

2

In [124]:
val.replace(',', '::')

'a::b:: guido'

In [125]:
val.replace(',','')

'ab guido'

## 7.3.2 Regular Expressions

Regular expressions provide a flexible way to search or match (often more complex) string patterns in text. A single expression, commonly called a regex, is a string formed according to the regular expression language. python's built-in re module is responsible for applying regular expressions to strings

The re module functions fall into three categories: pattern matching, substitution, and splitting. Naturally these are all related. A regex describes a pattern to locate in the text, which can then be used for many purposes

In [126]:
import re

# split a string with a variable number of whitespaces characters (tabs, spaces, and newlines)
text = "foo    bar\t baz   \tqux"

re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

When you call re.split('\s+', text), the regular expression is first compiled, and then its split method is called on the passed text. You can compile the regex yourself with re.compile, forming a reusable regex object

In [127]:
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [128]:
regex.findall(text)

['    ', '\t ', '   \t']

To avoid unwanted escaping with \ in a regular expression, use raw string literals like r'C:\x' instead of the equivalent 'C:\\x'

Creating a regex object with re.compile is highly recommended if you intend to apply the same expression to many strings. Doing so will save CPU cycles

match and search are closely related to findall. While findall returns all matches in a string, search returns only the first match. More rigidly, match only matches at the beginning of the string

In [129]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [130]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [131]:
regex = re.compile(pattern, flags = re.IGNORECASE)

In [132]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [133]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [134]:
text[m.start():m.end()]

'dave@google.com'

In [138]:
print(regex.match(text))

None


In [139]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [140]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [141]:
regex = re.compile(pattern, flags = re.IGNORECASE)

In [142]:
m = regex.match('wesm@bright.net')

In [143]:
m.groups()

('wesm', 'bright', 'net')

In [144]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [145]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



## 7.3.3 Vectorized String Functions in pandas

In [146]:
data = {'Dave': 'dave@google.com', 
        'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 
        'Wes': np.nan}

data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [147]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [152]:
data.values

array(['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', nan],
      dtype=object)

Series has array-oriented methods for string operations that skip NA values. These are accessed through Series's str attribute

In [153]:
# check whether each email address has 'gmail' in it with str.contains

data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [154]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [155]:
data.str.findall(pattern, flags = re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [160]:
matches = data.str.match(pattern, flags = re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [None]:
matches.str.get(1)

In [None]:
matches.str[0]

In [166]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [169]:
pd.Series.str?

[0;31mInit signature:[0m [0mpd[0m[0;34m.[0m[0mSeries[0m[0;34m.[0m[0mstr[0m[0;34m([0m[0mdata[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Vectorized string functions for Series and Index. NAs stay NA unless
handled otherwise by a particular method. Patterned after Python's string
methods, with some inspiration from R's stringr package.

Examples
--------
>>> s.str.split('_')
>>> s.str.replace('_', '')
[0;31mFile:[0m           ~/anaconda3/envs/sds/lib/python3.8/site-packages/pandas/core/strings.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     


# 7.4 Conclusion