Steps for Data Analysis:
    1. Data Loading
    2. Data Cleaning
    3. Data Transforming
    4. Data Rearrangment

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 7.1 Handling Missing Data

In [2]:
string_data = pd.Series(data=['Apple', 'Orange', np.nan, 'Banana'])

In [3]:
string_data

0     Apple
1    Orange
2       NaN
3    Banana
dtype: object

NaN(Not a Number)

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

NA (Not Available)

In [5]:
string_data[0] = None

In [6]:
string_data

0      None
1    Orange
2       NaN
3    Banana
dtype: object

In [7]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

Table 7-1. NA handling methods

In [8]:
df = pd.DataFrame(data=np.random.randn(1024,5), index=np.arange(1024), columns=list('ABCDE'))

In [9]:
df

Unnamed: 0,A,B,C,D,E
0,0.174528,-1.593428,0.246567,1.342950,-0.521441
1,0.351270,-0.355010,0.472036,-0.664019,1.610938
2,1.822580,-0.900391,-0.243351,-1.076532,0.103432
3,0.633548,1.701642,1.182074,2.012732,-0.763786
4,1.473584,-0.330367,1.005004,-0.190411,0.018640
...,...,...,...,...,...
1019,-1.782064,1.415784,-1.142679,-0.850528,1.445037
1020,-0.583946,-0.491979,0.881464,0.279406,-1.246850
1021,-0.943032,1.281033,-0.619047,1.357003,-1.437074
1022,-0.401629,1.139404,-0.946334,-1.041762,-1.718283


In [10]:
df.loc[5][['C', 'D']] = np.nan
df.loc[1021]['A'] = np.nan
df.loc[5] = np.nan

In [11]:
df

Unnamed: 0,A,B,C,D,E
0,0.174528,-1.593428,0.246567,1.342950,-0.521441
1,0.351270,-0.355010,0.472036,-0.664019,1.610938
2,1.822580,-0.900391,-0.243351,-1.076532,0.103432
3,0.633548,1.701642,1.182074,2.012732,-0.763786
4,1.473584,-0.330367,1.005004,-0.190411,0.018640
...,...,...,...,...,...
1019,-1.782064,1.415784,-1.142679,-0.850528,1.445037
1020,-0.583946,-0.491979,0.881464,0.279406,-1.246850
1021,,1.281033,-0.619047,1.357003,-1.437074
1022,-0.401629,1.139404,-0.946334,-1.041762,-1.718283


In [12]:
df[(df.isnull()).any(1)]

Unnamed: 0,A,B,C,D,E
5,,,,,
1021,,1.281033,-0.619047,1.357003,-1.437074


In [13]:
df.dropna(thresh=2)

Unnamed: 0,A,B,C,D,E
0,0.174528,-1.593428,0.246567,1.342950,-0.521441
1,0.351270,-0.355010,0.472036,-0.664019,1.610938
2,1.822580,-0.900391,-0.243351,-1.076532,0.103432
3,0.633548,1.701642,1.182074,2.012732,-0.763786
4,1.473584,-0.330367,1.005004,-0.190411,0.018640
...,...,...,...,...,...
1019,-1.782064,1.415784,-1.142679,-0.850528,1.445037
1020,-0.583946,-0.491979,0.881464,0.279406,-1.246850
1021,,1.281033,-0.619047,1.357003,-1.437074
1022,-0.401629,1.139404,-0.946334,-1.041762,-1.718283


In [14]:
df.fillna(value=0)

Unnamed: 0,A,B,C,D,E
0,0.174528,-1.593428,0.246567,1.342950,-0.521441
1,0.351270,-0.355010,0.472036,-0.664019,1.610938
2,1.822580,-0.900391,-0.243351,-1.076532,0.103432
3,0.633548,1.701642,1.182074,2.012732,-0.763786
4,1.473584,-0.330367,1.005004,-0.190411,0.018640
...,...,...,...,...,...
1019,-1.782064,1.415784,-1.142679,-0.850528,1.445037
1020,-0.583946,-0.491979,0.881464,0.279406,-1.246850
1021,0.000000,1.281033,-0.619047,1.357003,-1.437074
1022,-0.401629,1.139404,-0.946334,-1.041762,-1.718283


## Filtering Out Missing Data

In [15]:
data = pd.Series(data=[1, 5, np.nan, 7, 9, np.nan])

In [16]:
data

0    1.0
1    5.0
2    NaN
3    7.0
4    9.0
5    NaN
dtype: float64

In [17]:
data.dropna()

0    1.0
1    5.0
3    7.0
4    9.0
dtype: float64

equivalent to:

In [18]:
data[data.notnull()]

0    1.0
1    5.0
3    7.0
4    9.0
dtype: float64

In [19]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [20]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
cleaned = data.dropna()

In [22]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [23]:
#Passing how='all' will only drop rows that are all NA:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [24]:
data[4] = np.nan

In [25]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [26]:
#To drop columns in the same way, pass axis=1
data.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [27]:
df = pd.DataFrame(data=np.random.randn(5,3))

In [28]:
df

Unnamed: 0,0,1,2
0,0.655129,1.629357,0.996413
1,0.551365,0.849895,-0.851984
2,-0.891503,0.06279,-0.613856
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


In [29]:
df.iloc[:2,1] = np.nan

In [30]:
df.iloc[:3,2] = np.nan

In [31]:
df

Unnamed: 0,0,1,2
0,0.655129,,
1,0.551365,,
2,-0.891503,0.06279,
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


In [32]:
df.dropna()

Unnamed: 0,0,1,2
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


In [33]:
#Keep only the rows with at least thresh='2' non-NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.891503,0.06279,
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


## Filling In Missing Data

Rather than filtering out missing data (and potentially discarding other data along
with it), you may want to fill in the “holes” in any number of ways.

In [34]:
df

Unnamed: 0,0,1,2
0,0.655129,,
1,0.551365,,
2,-0.891503,0.06279,
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


In [35]:
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.655129,0.0,0.0
1,0.551365,0.0,0.0
2,-0.891503,0.06279,0.0
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


In [36]:
#Calling fillna with a dict, you can use a different fill value for each column:
df.fillna(value={0:100, 1:200, 2:300})

Unnamed: 0,0,1,2
0,0.655129,200.0,300.0
1,0.551365,200.0,300.0
2,-0.891503,0.06279,300.0
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


In [37]:
df.fillna(value=0, inplace=True)

In [38]:
df

Unnamed: 0,0,1,2
0,0.655129,0.0,0.0
1,0.551365,0.0,0.0
2,-0.891503,0.06279,0.0
3,0.833369,0.028153,1.673192
4,-1.012075,-0.35962,2.014329


In [39]:
df = pd.DataFrame(np.random.randn(7, 4))

In [40]:
df

Unnamed: 0,0,1,2,3
0,1.089226,0.93818,0.133069,0.782136
1,1.452777,1.106967,1.532692,0.050871
2,0.810425,-1.452205,-0.787204,-1.737212
3,-0.288386,0.563872,-1.055143,0.525982
4,0.276271,0.170269,0.513419,0.569988
5,1.45333,1.353726,-0.837788,0.635617
6,0.666619,-0.664585,-1.758453,-1.133159


In [41]:
df.loc[2:4,2] = np.nan

In [42]:
df.iloc[-1:-5:-1,0] = np.nan

In [43]:
df

Unnamed: 0,0,1,2,3
0,1.089226,0.93818,0.133069,0.782136
1,1.452777,1.106967,1.532692,0.050871
2,0.810425,-1.452205,,-1.737212
3,,0.563872,,0.525982
4,,0.170269,,0.569988
5,,1.353726,-0.837788,0.635617
6,,-0.664585,-1.758453,-1.133159


In [44]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2,3
0,1.089226,0.93818,0.133069,0.782136
1,1.452777,1.106967,1.532692,0.050871
2,0.810425,-1.452205,1.532692,-1.737212
3,0.810425,0.563872,1.532692,0.525982
4,0.810425,0.170269,1.532692,0.569988
5,0.810425,1.353726,-0.837788,0.635617
6,0.810425,-0.664585,-1.758453,-1.133159


In [45]:
df.fillna(method='bfill')

Unnamed: 0,0,1,2,3
0,1.089226,0.93818,0.133069,0.782136
1,1.452777,1.106967,1.532692,0.050871
2,0.810425,-1.452205,-0.837788,-1.737212
3,,0.563872,-0.837788,0.525982
4,,0.170269,-0.837788,0.569988
5,,1.353726,-0.837788,0.635617
6,,-0.664585,-1.758453,-1.133159


In [46]:
df.fillna(method='ffill', limit=3)

Unnamed: 0,0,1,2,3
0,1.089226,0.93818,0.133069,0.782136
1,1.452777,1.106967,1.532692,0.050871
2,0.810425,-1.452205,1.532692,-1.737212
3,0.810425,0.563872,1.532692,0.525982
4,0.810425,0.170269,1.532692,0.569988
5,0.810425,1.353726,-0.837788,0.635617
6,,-0.664585,-1.758453,-1.133159


In [47]:
ser = pd.Series(data=[2, 5.3, np.nan, 7, np.nan, 87, 1.25])

In [48]:
ser

0     2.00
1     5.30
2      NaN
3     7.00
4      NaN
5    87.00
6     1.25
dtype: float64

In [49]:
ser.fillna(value=np.mean(ser))

0     2.00
1     5.30
2    20.51
3     7.00
4    20.51
5    87.00
6     1.25
dtype: float64

Table 7-2. fillna function arguments

# 7.2 Data Transformation

## Removing Duplicates

In [50]:
df = pd.DataFrame(data={'k1': ['one', 'two']*3 + ['two'], 'k2': [1, 1, 2, 2, 3, 3, 3]})

In [51]:
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3
6,two,3


In [52]:
# duplicated returns a boolean Series indicating whether each row is a duplicate (has been observed in a previous row) or not:
df.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [53]:
df.duplicated(keep='last')

0    False
1    False
2    False
3    False
4    False
5     True
6    False
dtype: bool

In [54]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3


In [55]:
df.drop_duplicates(keep='last')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
6,two,3


In [56]:
df['k3'] = range(7)

In [57]:
df

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
5,two,3,5
6,two,3,6


In [58]:
df.drop_duplicates(['k1', 'k2'])

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
5,two,3,5


In [59]:
df.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
6,two,3,6


## Transforming Data Using a Function or Mapping

In [60]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'], 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [61]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [62]:
d = {'bacon':'pig', 'pulled pork':'pig', 'pastrami': 'cow', 'corned beef':'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}

In [63]:
lowered_case = data['food'].str.lower()

In [64]:
lowered_case

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [65]:
data['animal'] = lowered_case.map(d)

In [66]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [67]:
lowered_case.map(d)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [68]:
data['food'].map(lambda x: d[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Using map is a convenient way to perform element-wise transformations and other
data cleaning–related operations.


## Replacing Values

In [69]:
ser = pd.Series(data=[8.52, 5, -85, 41.02, -85, 1000, 45, -9.0023, -85])

In [70]:
ser

0       8.5200
1       5.0000
2     -85.0000
3      41.0200
4     -85.0000
5    1000.0000
6      45.0000
7      -9.0023
8     -85.0000
dtype: float64

In [71]:
ser.replace(-85, 0)

0       8.5200
1       5.0000
2       0.0000
3      41.0200
4       0.0000
5    1000.0000
6      45.0000
7      -9.0023
8       0.0000
dtype: float64

In [72]:
ser.replace([-85, -9.0023], np.nan)

0       8.52
1       5.00
2        NaN
3      41.02
4        NaN
5    1000.00
6      45.00
7        NaN
8        NaN
dtype: float64

In [73]:
ser.replace([-85, -9.0023, 1000], [np.nan, np.nan, 0])

0     8.52
1     5.00
2      NaN
3    41.02
4      NaN
5     0.00
6    45.00
7      NaN
8      NaN
dtype: float64

In [74]:
ser.replace({-85:np.nan, -9.0023:np.nan, 1000:0})

0     8.52
1     5.00
2      NaN
3    41.02
4      NaN
5     0.00
6    45.00
7      NaN
8      NaN
dtype: float64

In [75]:
ser.replace(ser[ser<0], np.nan)

0       8.52
1       5.00
2        NaN
3      41.02
4        NaN
5    1000.00
6      45.00
7        NaN
8        NaN
dtype: float64

In [76]:
ser

0       8.5200
1       5.0000
2     -85.0000
3      41.0200
4     -85.0000
5    1000.0000
6      45.0000
7      -9.0023
8     -85.0000
dtype: float64

## Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or mapping of some form to produce new, differently labeled objects

In [77]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [78]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [79]:
data.index = data.index.map(lambda x: str.upper(x))

In [80]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [81]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [82]:
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [83]:
data.rename(index={'OHIO': 'NY'}, inplace=True)

In [84]:
data

Unnamed: 0,one,two,three,four
NY,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


## Discretization and Binning

In [85]:
ages = [20, 22, 25, 27, 21, 69, 62, 58, 44, 23, 37, 31, 61, 45, 41, 32]

In [86]:
bins = [18, 25, 35, 60, 100]

In [87]:
cats = pd.cut(ages, bins)

In [88]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 16
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

The object pandas returns is a special Categorical object. The output you see
describes the bins computed by pandas.cut. 

In [89]:
cats.codes # .codes attribute returns the label of catgories

array([0, 0, 0, 1, 0, 3, 3, 2, 2, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [90]:
pd.value_counts(cats)

(35, 60]     5
(18, 25]     5
(60, 100]    3
(25, 35]     3
dtype: int64

Consistent with mathematical notation for intervals, a parenthesis means that the side
is open, while the square bracket means it is closed (inclusive).

In [91]:
bins

[18, 25, 35, 60, 100]

In [92]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 16
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [93]:
labels = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [94]:
pd.cut(ages, bins, labels=labels)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 16
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

If you pass an integer number of bins to cut instead of explicit bin edges, it will compute equal-length bins based on the minimum and maximum values in the data.
Consider the case of some uniformly distributed data chopped into fourths:

In [95]:
pd.cut(np.random.randn(10), 4, precision=2) #The precision=2 option limits the decimal precision to two digits.

[(-1.15, -0.47], (-0.47, 0.21], (-1.15, -0.47], (-1.15, -0.47], (-1.15, -0.47], (-0.47, 0.21], (-0.47, 0.21], (0.88, 1.56], (-1.15, -0.47], (-1.15, -0.47]]
Categories (4, interval[float64]): [(-1.15, -0.47] < (-0.47, 0.21] < (0.21, 0.88] < (0.88, 1.56]]

In [96]:
#qcuts returns the same size of bins (in Quartiles)
data = np.random.randn(1000)

In [97]:
data

array([ 0.07716193,  1.58686213,  0.96432038, -0.19701665,  0.98604389,
       -0.85781643, -1.31978042, -0.24707433, -0.68826381,  1.60774317,
       -1.19722342, -1.70494442,  2.32880145, -0.8162864 ,  0.86872226,
        1.66384409,  0.929642  ,  0.13282988, -0.81133042,  0.01334731,
       -0.84387386, -0.12357223, -1.78087267,  0.72681232, -2.33699113,
       -0.82943531,  1.31826699, -1.61925022,  1.13007576,  0.97743705,
       -0.46308572, -0.09255231,  0.23329226, -0.20930388, -0.24774335,
       -1.91156144, -2.16362943,  0.67365037,  0.72821414, -1.03454989,
       -0.13305451,  0.88872025,  2.51149903,  0.02819735,  1.25557686,
       -1.1951189 ,  0.10791966,  0.32720737,  1.79758532, -0.48875235,
       -0.37161276,  1.00869567, -0.95538653, -0.04460734,  0.07790759,
       -0.80216312,  1.08177005, -0.26313038,  1.07361978, -0.27700509,
        0.08866099, -0.19696793,  0.16646537,  0.84667954, -0.03931377,
        0.88057662,  0.09695848, -0.87497156, -0.36085318, -1.09

In [98]:
cats = pd.qcut(data, 4) #distribute data into 4 quartiles

In [99]:
cats.value_counts()

(-3.771, -0.702]     250
(-0.702, -0.0379]    250
(-0.0379, 0.653]     250
(0.653, 2.73]        250
dtype: int64

In [100]:
pd.qcut(data, 6).value_counts()

(-3.771, -0.995]     167
(-0.995, -0.443]     167
(-0.443, -0.0379]    166
(-0.0379, 0.376]     167
(0.376, 0.961]       166
(0.961, 2.73]        167
dtype: int64

In [101]:
pd.qcut(data, [0, 0.20, 0.40, 0.60, 0.80, 1]).value_counts()

(-3.771, -0.844]    200
(-0.844, -0.254]    200
(-0.254, 0.22]      200
(0.22, 0.791]       200
(0.791, 2.73]       200
dtype: int64

## Detecting and Filtering Outliers

In [102]:
data = pd.DataFrame(data=np.random.randn(1000, 4))

In [103]:
data

Unnamed: 0,0,1,2,3
0,0.416640,0.795114,-0.743435,1.111822
1,-0.446112,-0.018319,-0.657937,1.465648
2,0.877370,0.179955,0.838285,0.962178
3,-1.626815,0.605235,-0.281931,-1.364533
4,1.173010,0.793466,1.010796,0.382796
...,...,...,...,...
995,-1.380163,-0.244628,-0.527992,2.215075
996,-1.017890,0.632673,-0.662727,0.790172
997,-0.353166,1.614957,-0.999091,0.278586
998,0.721523,0.375522,0.531194,0.364769


In [104]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.016652,0.047012,-0.054445,-0.00336
std,1.042345,1.030181,0.986705,0.975111
min,-3.707654,-3.268194,-3.246817,-2.943873
25%,-0.692866,-0.651794,-0.698734,-0.681022
50%,-0.027341,0.063683,-0.042863,-0.045815
75%,0.721098,0.739424,0.610927,0.672786
max,3.667291,3.082116,2.985314,3.510821


In [105]:
col_2 = data[2]

In [106]:
col_2[np.abs(col_2) > 3]

19    -3.246817
501   -3.025978
Name: 2, dtype: float64

In [107]:
#To select all rows having a value exceeding 3 or –3
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
19,-0.049701,-1.000427,-3.246817,0.558937
121,0.139453,-3.268194,-0.126937,0.382325
157,-0.192505,3.082116,-1.317084,0.482158
213,0.660247,0.603671,-1.112599,3.510821
501,-1.800008,1.422605,-3.025978,0.092752
743,3.667291,1.205293,-1.073897,-0.629957
779,0.657757,-3.079998,0.589954,-0.946266
840,-3.065402,-0.330182,-0.03744,-2.007581
877,3.164556,-0.302388,0.388238,-1.123943
956,-3.707654,-0.167246,0.408062,-1.159327


In [108]:
data[(np.abs(data) > 3)] = np.sign(data) * 3

In [109]:
data

Unnamed: 0,0,1,2,3
0,0.416640,0.795114,-0.743435,1.111822
1,-0.446112,-0.018319,-0.657937,1.465648
2,0.877370,0.179955,0.838285,0.962178
3,-1.626815,0.605235,-0.281931,-1.364533
4,1.173010,0.793466,1.010796,0.382796
...,...,...,...,...
995,-1.380163,-0.244628,-0.527992,2.215075
996,-1.017890,0.632673,-0.662727,0.790172
997,-0.353166,1.614957,-0.999091,0.278586
998,0.721523,0.375522,0.531194,0.364769


In [110]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.016711,0.047278,-0.054173,-0.003871
std,1.037238,1.028872,0.985859,0.973401
min,-3.0,-3.0,-3.0,-2.943873
25%,-0.692866,-0.651794,-0.698734,-0.681022
50%,-0.027341,0.063683,-0.042863,-0.045815
75%,0.721098,0.739424,0.610927,0.672786
max,3.0,3.0,2.985314,3.0


In [117]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,-1.0,-1.0,-1.0,1.0
2,1.0,1.0,1.0,1.0
3,-1.0,1.0,-1.0,-1.0
4,1.0,1.0,1.0,1.0


## Permutation and Random Sampling

Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do
using the numpy.random.permutation function.

In [118]:
df = pd.DataFrame(data=(np.arange(5*4).reshape(5, 4)))

In [119]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [130]:
sampler = np.random.permutation(df.shape[0])

In [131]:
sampler

array([4, 1, 2, 0, 3])

In [135]:
df.take(sampler)

Unnamed: 0,0,1,2,3
4,16,17,18,19
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15


In [134]:
#To select a random subset without replacement, you can use the sample method on Series and DataFrame
df.sample(n=3)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
1,4,5,6,7


In [139]:
choices = ser = pd.Series([5, 7, -1, 6, 4])

In [140]:
draws = ser.sample(n=10, replace=True)

In [141]:
draws

4    4
2   -1
0    5
4    4
4    4
1    7
3    6
4    4
4    4
2   -1
dtype: int64

## Computing Indicator/Dummy Variables

Another type of transformation for statistical modeling or machine learning applications is converting a categorical variable into a “dummy” or “indicator” matrix. If a
column in a DataFrame has k distinct values, you would derive a matrix or DataFrame with k columns containing all 1s and 0s.

In [142]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})

In [143]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [145]:
dummies = pd.get_dummies(df['key'])

In [146]:
dummies

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [185]:
df_with_dummy = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(df_with_dummy)

In [184]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [151]:
mnames = ['id','movie_id', 'title', 'genre']

In [162]:
movies = pd.read_table('movielens/movies.dat', sep='::', header=None, names=mnames)

  """Entry point for launching an IPython kernel.


In [163]:
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [171]:
all_genre = []

In [172]:
for genre in movies['genre']:
    all_genre.extend(genre.split('|'))

In [178]:
pd.value_counts(all_genre)

Drama          1603
Comedy         1200
Action          503
Thriller        492
Romance         471
Horror          343
Adventure       283
Sci-Fi          276
Children's      251
Crime           211
War             143
Documentary     127
Musical         114
Mystery         106
Animation       105
Fantasy          68
Western          68
Film-Noir        44
dtype: int64

In [198]:
genre = pd.unique(all_genre)

In [199]:
genre

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [201]:
zero_matrix = np.zeros((len(movies), len(genre)))

In [202]:
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [203]:
dummies = pd.DataFrame(data=zero_matrix, columns=genre)

In [204]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [205]:
for i, gen in enumerate(movies['genre']):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [206]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [208]:
final_dummy = movies[['movie_id', 'title']].join(dummies)

In [209]:
final_dummy

Unnamed: 0,movie_id,title,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story (1995),1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,3949,Requiem for a Dream (2000),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,3950,Tigerland (2000),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,3951,Two Family House (2000),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


A useful recipe for statistical applications is to combine get_dummies with a discretization function like cut:

In [210]:
np.random.seed(12345)

In [211]:
values = np.random.rand(10)

In [212]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [213]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [214]:
dummies = pd.get_dummies(pd.cut(values, bins))

In [215]:
dummies

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# 7.3 String Manipulation

## String Object Methods

In [216]:
val = 'a, b, guido, github'

In [217]:
val.split(',') #string can be broken into pieces with split()

['a', ' b', ' guido', ' github']

In [219]:
#split is often combined with strip to trim whitespace (including line breaks):
pieces = [x.strip() for x in val.split(',')]

In [220]:
pieces

['a', 'b', 'guido', 'github']

In [221]:
first, second, third, fourth = pieces

In [224]:
first + "::" + second + "::" + third + "::" + fourth #Not a practical generic method

'a::b::guido::github'

In [225]:
"::".join(pieces)

'a::b::guido::github'

In [226]:
val

'a, b, guido, github'

In [233]:
'guido' in val

True

In [234]:
val.find('g')

6

In [229]:
val.find(':')

-1

In [231]:
val.index('g')

6

In [237]:
"val.index(':')" #Raises an exception of ValueError

"val.index(':')"

In [238]:
val.count(',')

3

In [239]:
val.count('g')

2

In [240]:
val.count(':')

0

In [241]:
val

'a, b, guido, github'

In [242]:
val.replace(', ', '::')

'a::b::guido::github'

In [246]:
a = val.replace(',', ' ')

In [249]:
a

'a  b  guido  github'

See Table 7-3 for a listing of some of Python’s string methods.

In [250]:
val = "Dilory Pam Pah Pah Pa Rara Ra"

In [254]:
val.endswith('a Ra')

True

In [257]:
val.startswith('Dilo')

True

In [260]:
val.rfind('Ra') #index of 'R'

27

## Regular Expressions

Regular expressions provide a flexible way to search or match (often more complex)
string patterns in text. A single expression, commonly called a regex, is a string
formed according to the regular expression language. 

The re module functions fall into three categories: pattern matching, substitution,
and splitting. 

In [263]:
import re

In [267]:
text = "My    name     is          Danish Hudani"

In [276]:
regex = re.compile('\s+') #One or more white spaces

In [277]:
regex.split(text)

['My', 'name', 'is', 'Danish', 'Hudani']

In [278]:
regex.findall(text)

['    ', '     ', '          ', ' ']

In [279]:
print(r'C:\x') #instead of

C:\x


In [280]:
print('C:\\x')

C:\x


In [311]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [312]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+.[A-Z]'

In [313]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [314]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [315]:
m = regex.search(text) #searches the 1st object

In [316]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [322]:
text[m.start():m.end()]

'dave@google.com'

In [324]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [333]:
pattern =  r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [334]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [335]:
m = regex.match('danish.hud95@gmail.com')

In [336]:
m

<re.Match object; span=(0, 22), match='danish.hud95@gmail.com'>

In [337]:
m.groups()

('danish.hud95', 'gmail', 'com')

In [338]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [342]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



Table 7-4. Regular expression methods