Steps for Data Analysis:
    1. Data Loading
    2. Data Cleaning
    3. Data Transforming
    4. Data Rearrangment

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 7.1 Handling Missing Data

In [2]:
string_data = pd.Series(data=['Apple', 'Orange', np.nan, 'Banana'])

In [3]:
string_data

0     Apple
1    Orange
2       NaN
3    Banana
dtype: object

NaN(Not a Number)

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

NA (Not Available)

In [5]:
string_data[0] = None

In [6]:
string_data

0      None
1    Orange
2       NaN
3    Banana
dtype: object

In [7]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

*Table 7-1. NA handling methods*

![NA handling methods](Img/7.1.png)

In [8]:
df = pd.DataFrame(data=np.random.randn(1024,5), index=np.arange(1024), columns=list('ABCDE'))

In [9]:
df

Unnamed: 0,A,B,C,D,E
0,2.271313,1.767124,-0.417139,-1.179323,-0.511787
1,1.458493,1.604584,0.532143,-0.268574,0.086024
2,0.285559,1.862293,-0.487194,1.222941,-0.649823
3,-0.978494,-0.667646,-0.172639,-0.873012,0.477971
4,-1.204393,1.192339,0.125861,-0.013035,1.099539
...,...,...,...,...,...
1019,0.844554,-1.030583,0.291111,0.912779,0.179775
1020,-0.654604,2.466272,0.761705,0.598039,1.341811
1021,-0.698075,-0.639144,-2.116148,-0.688732,0.650226
1022,-0.647693,1.001485,-2.591511,-0.328065,0.439573


In [10]:
df.loc[5][['C', 'D']] = np.nan
df.loc[1021]['A'] = np.nan
df.loc[5] = np.nan

In [11]:
df

Unnamed: 0,A,B,C,D,E
0,2.271313,1.767124,-0.417139,-1.179323,-0.511787
1,1.458493,1.604584,0.532143,-0.268574,0.086024
2,0.285559,1.862293,-0.487194,1.222941,-0.649823
3,-0.978494,-0.667646,-0.172639,-0.873012,0.477971
4,-1.204393,1.192339,0.125861,-0.013035,1.099539
...,...,...,...,...,...
1019,0.844554,-1.030583,0.291111,0.912779,0.179775
1020,-0.654604,2.466272,0.761705,0.598039,1.341811
1021,,-0.639144,-2.116148,-0.688732,0.650226
1022,-0.647693,1.001485,-2.591511,-0.328065,0.439573


In [12]:
df[(df.isnull()).any(1)]

Unnamed: 0,A,B,C,D,E
5,,,,,
1021,,-0.639144,-2.116148,-0.688732,0.650226


In [13]:
df.dropna(thresh=2)

Unnamed: 0,A,B,C,D,E
0,2.271313,1.767124,-0.417139,-1.179323,-0.511787
1,1.458493,1.604584,0.532143,-0.268574,0.086024
2,0.285559,1.862293,-0.487194,1.222941,-0.649823
3,-0.978494,-0.667646,-0.172639,-0.873012,0.477971
4,-1.204393,1.192339,0.125861,-0.013035,1.099539
...,...,...,...,...,...
1019,0.844554,-1.030583,0.291111,0.912779,0.179775
1020,-0.654604,2.466272,0.761705,0.598039,1.341811
1021,,-0.639144,-2.116148,-0.688732,0.650226
1022,-0.647693,1.001485,-2.591511,-0.328065,0.439573


In [14]:
df.fillna(value=0)

Unnamed: 0,A,B,C,D,E
0,2.271313,1.767124,-0.417139,-1.179323,-0.511787
1,1.458493,1.604584,0.532143,-0.268574,0.086024
2,0.285559,1.862293,-0.487194,1.222941,-0.649823
3,-0.978494,-0.667646,-0.172639,-0.873012,0.477971
4,-1.204393,1.192339,0.125861,-0.013035,1.099539
...,...,...,...,...,...
1019,0.844554,-1.030583,0.291111,0.912779,0.179775
1020,-0.654604,2.466272,0.761705,0.598039,1.341811
1021,0.000000,-0.639144,-2.116148,-0.688732,0.650226
1022,-0.647693,1.001485,-2.591511,-0.328065,0.439573


## Filtering Out Missing Data

In [15]:
data = pd.Series(data=[1, 5, np.nan, 7, 9, np.nan])

In [16]:
data

0    1.0
1    5.0
2    NaN
3    7.0
4    9.0
5    NaN
dtype: float64

In [17]:
data.dropna()

0    1.0
1    5.0
3    7.0
4    9.0
dtype: float64

equivalent to:

In [18]:
data[data.notnull()]

0    1.0
1    5.0
3    7.0
4    9.0
dtype: float64

In [19]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [20]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
cleaned = data.dropna()

In [22]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [23]:
#Passing how='all' will only drop rows that are all NA:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [24]:
data[4] = np.nan

In [25]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [26]:
#To drop columns in the same way, pass axis=1
data.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [27]:
df = pd.DataFrame(data=np.random.randn(5,3))

In [28]:
df

Unnamed: 0,0,1,2
0,-0.271334,1.100488,-1.06518
1,1.045262,-1.477195,0.495599
2,2.265197,0.175053,0.493201
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


In [29]:
df.iloc[:2,1] = np.nan

In [30]:
df.iloc[:3,2] = np.nan

In [31]:
df

Unnamed: 0,0,1,2
0,-0.271334,,
1,1.045262,,
2,2.265197,0.175053,
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


In [32]:
df.dropna()

Unnamed: 0,0,1,2
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


In [33]:
#Keep only the rows with at least thresh='2' non-NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,2.265197,0.175053,
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


## Filling In Missing Data

Rather than filtering out missing data (and potentially discarding other data along
with it), you may want to fill in the “holes” in any number of ways.

In [34]:
df

Unnamed: 0,0,1,2
0,-0.271334,,
1,1.045262,,
2,2.265197,0.175053,
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


In [35]:
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-0.271334,0.0,0.0
1,1.045262,0.0,0.0
2,2.265197,0.175053,0.0
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


In [36]:
#Calling fillna with a dict, you can use a different fill value for each column:
df.fillna(value={0:100, 1:200, 2:300})

Unnamed: 0,0,1,2
0,-0.271334,200.0,300.0
1,1.045262,200.0,300.0
2,2.265197,0.175053,300.0
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


In [37]:
df.fillna(value=0, inplace=True)

In [38]:
df

Unnamed: 0,0,1,2
0,-0.271334,0.0,0.0
1,1.045262,0.0,0.0
2,2.265197,0.175053,0.0
3,2.066989,0.42277,-1.425019
4,0.520064,-1.13092,-0.738951


In [39]:
df = pd.DataFrame(np.random.randn(7, 4))

In [40]:
df

Unnamed: 0,0,1,2,3
0,0.368689,1.319802,-0.093204,-2.176661
1,-1.595528,-0.308678,0.439589,0.969215
2,-0.269263,1.308736,0.747822,0.358399
3,0.470667,-0.221211,0.292496,-0.820597
4,-0.989582,-0.429522,-0.863013,-0.680278
5,0.230411,-1.232555,1.576345,-1.099205
6,-0.005711,-0.110127,0.7522,-1.111825


In [41]:
df.loc[2:4,2] = np.nan

In [42]:
df.iloc[-1:-5:-1,0] = np.nan

In [43]:
df

Unnamed: 0,0,1,2,3
0,0.368689,1.319802,-0.093204,-2.176661
1,-1.595528,-0.308678,0.439589,0.969215
2,-0.269263,1.308736,,0.358399
3,,-0.221211,,-0.820597
4,,-0.429522,,-0.680278
5,,-1.232555,1.576345,-1.099205
6,,-0.110127,0.7522,-1.111825


In [44]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2,3
0,0.368689,1.319802,-0.093204,-2.176661
1,-1.595528,-0.308678,0.439589,0.969215
2,-0.269263,1.308736,0.439589,0.358399
3,-0.269263,-0.221211,0.439589,-0.820597
4,-0.269263,-0.429522,0.439589,-0.680278
5,-0.269263,-1.232555,1.576345,-1.099205
6,-0.269263,-0.110127,0.7522,-1.111825


In [45]:
df.fillna(method='bfill')

Unnamed: 0,0,1,2,3
0,0.368689,1.319802,-0.093204,-2.176661
1,-1.595528,-0.308678,0.439589,0.969215
2,-0.269263,1.308736,1.576345,0.358399
3,,-0.221211,1.576345,-0.820597
4,,-0.429522,1.576345,-0.680278
5,,-1.232555,1.576345,-1.099205
6,,-0.110127,0.7522,-1.111825


In [46]:
df.fillna(method='ffill', limit=3)

Unnamed: 0,0,1,2,3
0,0.368689,1.319802,-0.093204,-2.176661
1,-1.595528,-0.308678,0.439589,0.969215
2,-0.269263,1.308736,0.439589,0.358399
3,-0.269263,-0.221211,0.439589,-0.820597
4,-0.269263,-0.429522,0.439589,-0.680278
5,-0.269263,-1.232555,1.576345,-1.099205
6,,-0.110127,0.7522,-1.111825


In [47]:
ser = pd.Series(data=[2, 5.3, np.nan, 7, np.nan, 87, 1.25])

In [48]:
ser

0     2.00
1     5.30
2      NaN
3     7.00
4      NaN
5    87.00
6     1.25
dtype: float64

In [49]:
ser.fillna(value=np.mean(ser))

0     2.00
1     5.30
2    20.51
3     7.00
4    20.51
5    87.00
6     1.25
dtype: float64

*Table 7-2. fillna function arguments*

![Fillna function arguments](Img/7.2.png)

# 7.2 Data Transformation

## Removing Duplicates

In [50]:
df = pd.DataFrame(data={'k1': ['one', 'two']*3 + ['two'], 'k2': [1, 1, 2, 2, 3, 3, 3]})

In [51]:
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3
6,two,3


In [52]:
# duplicated returns a boolean Series indicating whether each row is a duplicate (has been observed in a previous row) or not:
df.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [53]:
df.duplicated(keep='last')

0    False
1    False
2    False
3    False
4    False
5     True
6    False
dtype: bool

In [54]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3


In [55]:
df.drop_duplicates(keep='last')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
6,two,3


In [56]:
df['k3'] = range(7)

In [57]:
df

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
5,two,3,5
6,two,3,6


In [58]:
df.drop_duplicates(['k1', 'k2'])

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
5,two,3,5


In [59]:
df.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
6,two,3,6


## Transforming Data Using a Function or Mapping

In [60]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'], 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [61]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [62]:
d = {'bacon':'pig', 'pulled pork':'pig', 'pastrami': 'cow', 'corned beef':'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}

In [63]:
lowered_case = data['food'].str.lower()

In [64]:
lowered_case

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [65]:
data['animal'] = lowered_case.map(d)

In [66]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [67]:
lowered_case.map(d)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [68]:
data['food'].map(lambda x: d[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Using map is a convenient way to perform element-wise transformations and other
data cleaning–related operations.


## Replacing Values

In [69]:
ser = pd.Series(data=[8.52, 5, -85, 41.02, -85, 1000, 45, -9.0023, -85])

In [70]:
ser

0       8.5200
1       5.0000
2     -85.0000
3      41.0200
4     -85.0000
5    1000.0000
6      45.0000
7      -9.0023
8     -85.0000
dtype: float64

In [71]:
ser.replace(-85, 0)

0       8.5200
1       5.0000
2       0.0000
3      41.0200
4       0.0000
5    1000.0000
6      45.0000
7      -9.0023
8       0.0000
dtype: float64

In [72]:
ser.replace([-85, -9.0023], np.nan)

0       8.52
1       5.00
2        NaN
3      41.02
4        NaN
5    1000.00
6      45.00
7        NaN
8        NaN
dtype: float64

In [73]:
ser.replace([-85, -9.0023, 1000], [np.nan, np.nan, 0])

0     8.52
1     5.00
2      NaN
3    41.02
4      NaN
5     0.00
6    45.00
7      NaN
8      NaN
dtype: float64

In [74]:
ser.replace({-85:np.nan, -9.0023:np.nan, 1000:0})

0     8.52
1     5.00
2      NaN
3    41.02
4      NaN
5     0.00
6    45.00
7      NaN
8      NaN
dtype: float64

In [75]:
ser.replace(ser[ser<0], np.nan)

0       8.52
1       5.00
2        NaN
3      41.02
4        NaN
5    1000.00
6      45.00
7        NaN
8        NaN
dtype: float64

In [76]:
ser

0       8.5200
1       5.0000
2     -85.0000
3      41.0200
4     -85.0000
5    1000.0000
6      45.0000
7      -9.0023
8     -85.0000
dtype: float64

## Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or mapping of some form to produce new, differently labeled objects

In [77]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [78]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [79]:
data.index = data.index.map(lambda x: str.upper(x))

In [80]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [81]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [82]:
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [83]:
data.rename(index={'OHIO': 'NY'}, inplace=True)

In [84]:
data

Unnamed: 0,one,two,three,four
NY,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


## Discretization and Binning

In [85]:
ages = [20, 22, 25, 27, 21, 69, 62, 58, 44, 23, 37, 31, 61, 45, 41, 32]

In [86]:
bins = [18, 25, 35, 60, 100]

In [87]:
cats = pd.cut(ages, bins)

In [88]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 16
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

The object pandas returns is a special Categorical object. The output you see
describes the bins computed by pandas.cut. 

In [89]:
cats.codes # .codes attribute returns the label of catgories

array([0, 0, 0, 1, 0, 3, 3, 2, 2, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [90]:
pd.value_counts(cats)

(35, 60]     5
(18, 25]     5
(60, 100]    3
(25, 35]     3
dtype: int64

Consistent with mathematical notation for intervals, a parenthesis means that the side
is open, while the square bracket means it is closed (inclusive).

In [91]:
bins

[18, 25, 35, 60, 100]

In [92]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 16
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [93]:
labels = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [94]:
pd.cut(ages, bins, labels=labels)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 16
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

If you pass an integer number of bins to cut instead of explicit bin edges, it will compute equal-length bins based on the minimum and maximum values in the data.
Consider the case of some uniformly distributed data chopped into fourths:

In [95]:
pd.cut(np.random.randn(10), 4, precision=2) #The precision=2 option limits the decimal precision to two digits.

[(-0.21, 0.48], (-0.9, -0.21], (-0.21, 0.48], (-0.21, 0.48], (-0.21, 0.48], (-2.29, -1.6], (-0.21, 0.48], (-0.9, -0.21], (-0.21, 0.48], (-2.29, -1.6]]
Categories (4, interval[float64]): [(-2.29, -1.6] < (-1.6, -0.9] < (-0.9, -0.21] < (-0.21, 0.48]]

In [96]:
#qcuts returns the same size of bins (in Quartiles)
data = np.random.randn(1000)

In [97]:
data

array([-1.01224132e+00, -1.57479488e+00,  1.74901454e+00,  2.92874494e-01,
        3.51519442e-01,  1.50019891e+00,  9.63262814e-01,  3.24696302e-01,
        2.04070033e-01,  7.80163572e-01,  6.98819726e-01, -2.70500455e-01,
        1.70749358e+00,  7.54517913e-02, -3.27933306e-01, -8.29544450e-01,
        9.59907171e-02,  2.12002660e+00,  3.10556998e-03, -1.33888425e+00,
       -2.65608603e-01, -7.40400968e-01,  8.57085509e-01, -1.75656376e+00,
        1.97033541e+00,  1.10489350e+00, -5.82088343e-01,  7.19128241e-01,
       -1.75335127e+00,  9.49032463e-01, -5.50332797e-01,  1.02504690e-01,
       -1.51201173e+00,  6.20827871e-01,  3.88291050e-02,  3.49093003e-01,
        1.71714578e+00,  1.21223606e-01,  2.46137594e-01, -6.49114413e-01,
        3.61860382e-01, -2.19978617e+00, -8.00279796e-01, -5.40811071e-01,
        7.41142412e-01, -1.04309971e+00, -1.57226938e+00, -5.07590035e-02,
        4.19188825e-01, -1.11473144e+00,  5.20107232e-01, -2.60506435e-01,
       -6.59915025e-01, -

In [98]:
cats = pd.qcut(data, 4) #distribute data into 4 quartiles

In [99]:
cats.value_counts()

(-2.916, -0.695]    250
(-0.695, 0.0527]    250
(0.0527, 0.745]     250
(0.745, 2.88]       250
dtype: int64

In [100]:
pd.qcut(data, 6).value_counts()

(-2.916, -0.962]    167
(-0.962, -0.437]    167
(-0.437, 0.0527]    166
(0.0527, 0.451]     167
(0.451, 1.024]      166
(1.024, 2.88]       167
dtype: int64

In [101]:
pd.qcut(data, [0, 0.20, 0.40, 0.60, 0.80, 1]).value_counts()

(-2.916, -0.878]    200
(-0.878, -0.235]    200
(-0.235, 0.304]     200
(0.304, 0.914]      200
(0.914, 2.88]       200
dtype: int64

## Detecting and Filtering Outliers

In [102]:
data = pd.DataFrame(data=np.random.randn(1000, 4))

In [103]:
data

Unnamed: 0,0,1,2,3
0,-1.033564,1.605062,-0.046327,1.046000
1,0.114094,-0.146643,-0.634795,0.107142
2,-0.041166,0.570417,-1.040250,-0.049690
3,-0.768343,-0.446371,-1.135226,1.280530
4,1.368125,0.085290,-0.268204,-0.892880
...,...,...,...,...
995,1.733191,0.574954,0.905160,0.426486
996,1.128897,0.271849,2.169926,1.024474
997,-1.506079,-0.727875,1.646877,-1.351133
998,-0.383771,0.344013,-1.221490,-0.307183


In [104]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.0128,-0.018389,-0.073276,0.003789
std,0.989185,0.988575,0.996988,0.998883
min,-3.244585,-3.062549,-3.541081,-3.116853
25%,-0.686993,-0.671735,-0.71366,-0.652165
50%,-0.00605,-0.015986,-0.082956,0.004676
75%,0.612208,0.655395,0.612366,0.698817
max,3.378623,3.304657,3.013443,3.36163


In [105]:
col_2 = data[2]

In [106]:
col_2[np.abs(col_2) > 3]

340    3.013443
505   -3.541081
Name: 2, dtype: float64

In [107]:
#To select all rows having a value exceeding 3 or –3
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
32,-3.244585,0.805529,0.313707,0.764875
83,-3.012515,-0.191263,-1.832611,-0.441237
205,0.026849,3.304657,-1.476,-0.412711
299,3.216721,-0.53738,-1.516994,0.188431
340,1.635482,-1.142992,3.013443,-0.367412
461,-0.785647,-3.062549,-2.248448,-0.552984
489,0.070704,-1.801743,2.298547,3.36163
505,-0.983523,-0.523826,-3.541081,0.570007
603,3.148897,0.259233,-1.974085,-0.497529
619,3.378623,1.517292,1.389523,1.058873


In [108]:
data[(np.abs(data) > 3)] = np.sign(data) * 3

In [109]:
data

Unnamed: 0,0,1,2,3
0,-1.033564,1.605062,-0.046327,1.046000
1,0.114094,-0.146643,-0.634795,0.107142
2,-0.041166,0.570417,-1.040250,-0.049690
3,-0.768343,-0.446371,-1.135226,1.280530
4,1.368125,0.085290,-0.268204,-0.892880
...,...,...,...,...
995,1.733191,0.574954,0.905160,0.426486
996,1.128897,0.271849,2.169926,1.024474
997,-1.506079,-0.727875,1.646877,-1.351133
998,-0.383771,0.344013,-1.221490,-0.307183


In [110]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.013287,-0.018631,-0.072748,0.003681
std,0.985996,0.987405,0.995207,0.996874
min,-3.0,-3.0,-3.0,-3.0
25%,-0.686993,-0.671735,-0.71366,-0.652165
50%,-0.00605,-0.015986,-0.082956,0.004676
75%,0.612208,0.655395,0.612366,0.698817
max,3.0,3.0,3.0,3.0


In [111]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,1.0,-1.0,-1.0,1.0
2,-1.0,1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,1.0
4,1.0,1.0,-1.0,-1.0


## Permutation and Random Sampling

Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do
using the numpy.random.permutation function.

In [112]:
df = pd.DataFrame(data=(np.arange(5*4).reshape(5, 4)))

In [113]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [114]:
sampler = np.random.permutation(df.shape[0])

In [115]:
sampler

array([2, 1, 4, 3, 0])

In [116]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
4,16,17,18,19
3,12,13,14,15
0,0,1,2,3


In [117]:
#To select a random subset without replacement, you can use the sample method on Series and DataFrame
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
3,12,13,14,15


In [118]:
choices = ser = pd.Series([5, 7, -1, 6, 4])

In [119]:
draws = ser.sample(n=10, replace=True)

In [120]:
draws

3    6
0    5
2   -1
4    4
0    5
2   -1
3    6
0    5
2   -1
2   -1
dtype: int64

## Computing Indicator/Dummy Variables

Another type of transformation for statistical modeling or machine learning applications is converting a categorical variable into a “dummy” or “indicator” matrix. If a
column in a DataFrame has k distinct values, you would derive a matrix or DataFrame with k columns containing all 1s and 0s.

In [121]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})

In [122]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [123]:
dummies = pd.get_dummies(df['key'])

In [124]:
dummies

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [125]:
df_with_dummy = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(df_with_dummy)

In [126]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [127]:
mnames = ['movie_id', 'title', 'genre']

In [128]:
movies = pd.read_table('movielens/movies.dat', sep='::', header=None, names=mnames)

  """Entry point for launching an IPython kernel.


In [129]:
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [130]:
all_genre = []

In [131]:
for genre in movies['genre']:
    all_genre.extend(genre.split('|'))

In [132]:
pd.value_counts(all_genre)

Drama          1603
Comedy         1200
Action          503
Thriller        492
Romance         471
Horror          343
Adventure       283
Sci-Fi          276
Children's      251
Crime           211
War             143
Documentary     127
Musical         114
Mystery         106
Animation       105
Fantasy          68
Western          68
Film-Noir        44
dtype: int64

In [133]:
genre = pd.unique(all_genre)

In [134]:
genre

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [135]:
zero_matrix = np.zeros((len(movies), len(genre)))

In [136]:
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [137]:
dummies = pd.DataFrame(data=zero_matrix, columns=genre)

In [138]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
for i, gen in enumerate(movies['genre']):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [140]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
final_dummy = movies[['movie_id', 'title']].join(dummies)

In [142]:
final_dummy

Unnamed: 0,movie_id,title,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story (1995),1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,3949,Requiem for a Dream (2000),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,3950,Tigerland (2000),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,3951,Two Family House (2000),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


A useful recipe for statistical applications is to combine get_dummies with a discretization function like cut:

In [143]:
np.random.seed(12345)

In [144]:
values = np.random.rand(10)

In [145]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [146]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [147]:
dummies = pd.get_dummies(pd.cut(values, bins))

In [148]:
dummies

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


# 7.3 String Manipulation

## String Object Methods

In [149]:
val = 'a, b, guido, github'

In [150]:
val.split(',') #string can be broken into pieces with split()

['a', ' b', ' guido', ' github']

In [151]:
#split is often combined with strip to trim whitespace (including line breaks):
pieces = [x.strip() for x in val.split(',')]

In [152]:
pieces

['a', 'b', 'guido', 'github']

In [153]:
first, second, third, fourth = pieces

In [154]:
first + "::" + second + "::" + third + "::" + fourth #Not a practical generic method

'a::b::guido::github'

In [155]:
"::".join(pieces)

'a::b::guido::github'

In [156]:
val

'a, b, guido, github'

In [157]:
'guido' in val

True

In [158]:
val.find('g')

6

In [159]:
val.find(':')

-1

In [160]:
val.index('g')

6

In [161]:
"val.index(':')" #Raises an exception of ValueError

"val.index(':')"

In [162]:
val.count(',')

3

In [163]:
val.count('g')

2

In [164]:
val.count(':')

0

In [165]:
val

'a, b, guido, github'

In [166]:
val.replace(', ', '::')

'a::b::guido::github'

In [167]:
a = val.replace(',', ' ')

In [168]:
a

'a  b  guido  github'

*See Table 7-3 for a listing of some of Python’s string methods.*

![Listing of some of Python’s string methods](Img/7.3.png)

In [169]:
val = "Dilory Pam Pah Pah Pa Rara Ra"

In [170]:
val.endswith('a Ra')

True

In [171]:
val.startswith('Dilo')

True

In [172]:
val.rfind('Ra') #index of 'R'

27

## Regular Expressions

Regular expressions provide a flexible way to search or match (often more complex)
string patterns in text. A single expression, commonly called a regex, is a string
formed according to the regular expression language. 

The re module functions fall into three categories: pattern matching, substitution,
and splitting. 

In [173]:
import re

In [174]:
text = "My    name     is          Danish Hudani"

In [175]:
regex = re.compile('\s+') #One or more white spaces

In [176]:
regex.split(text)

['My', 'name', 'is', 'Danish', 'Hudani']

In [177]:
regex.findall(text)

['    ', '     ', '          ', ' ']

In [178]:
print(r'C:\x') #instead of

C:\x


In [179]:
print('C:\\x')

C:\x


In [180]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [181]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+.[A-Z]'

In [182]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [183]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [184]:
m = regex.search(text) #searches the 1st object

In [185]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [186]:
text[m.start():m.end()]

'dave@google.com'

In [187]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [188]:
pattern =  r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [189]:
regex = re.compile(pattern, flags=re.IGNORECASE)

In [190]:
m = regex.match('danish.hud95@gmail.com')

In [191]:
m

<re.Match object; span=(0, 22), match='danish.hud95@gmail.com'>

In [192]:
m.groups()

('danish.hud95', 'gmail', 'com')

In [193]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [194]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



*Table 7-4. Regular expression methods*

![Regular expression methods](Img/7.4.png)

## Vectorized String Functions in pandas

In [210]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob': 'rob@gmail.com', 'Wes': np.nan}

In [211]:
data

{'Dave': 'dave@google.com',
 'Steve': 'steve@gmail.com',
 'Rob': 'rob@gmail.com',
 'Wes': nan}

In [212]:
data = pd.Series(data)

In [213]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [214]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [215]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [216]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [217]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [220]:
matches = data.str.match(pattern, flags=re.IGNORECASE)

In [223]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [258]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

*Table 7-5. Partial listing of vectorized string methods*

![Partial listing of vectorized string methods](Img/7.5a.png)

![Partial listing of vectorized string methods](Img/7.5b.png)