Steps for Data Analysis:
    1. Data Loading
    2. Data Cleaning
    3. Data Transforming
    4. Data Rearrangment

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 7.1 Handling Missing Data

In [2]:
string_data = pd.Series(data=['Apple', 'Orange', np.nan, 'Banana'])

In [3]:
string_data

0     Apple
1    Orange
2       NaN
3    Banana
dtype: object

NaN(Not a Number)

In [4]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

NA (Not Available)

In [5]:
string_data[0] = None

In [6]:
string_data

0      None
1    Orange
2       NaN
3    Banana
dtype: object

In [7]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

Table 7-1. NA handling methods

In [8]:
df = pd.DataFrame(data=np.random.randn(1024,5), index=np.arange(1024), columns=list('ABCDE'))

In [9]:
df

Unnamed: 0,A,B,C,D,E
0,-0.085053,0.171346,-2.231905,-0.870569,0.050929
1,-0.721617,-0.068463,-0.514313,0.924806,-0.046560
2,-0.550596,0.557059,-0.522690,1.603810,-1.076933
3,0.331983,-0.656144,-1.910244,0.731161,0.310146
4,2.177783,-0.485098,-0.573985,-1.376808,0.143210
...,...,...,...,...,...
1019,0.485381,0.216378,1.514993,1.218844,1.419047
1020,-0.694140,0.217277,0.858736,-1.283879,-0.472882
1021,-0.431617,-1.234836,-0.840876,0.278349,0.367145
1022,-0.026553,-0.616987,-1.813501,-1.084858,0.056444


In [10]:
df.loc[5][['C', 'D']] = np.nan
df.loc[1021]['A'] = np.nan
df.loc[5] = np.nan

In [11]:
df

Unnamed: 0,A,B,C,D,E
0,-0.085053,0.171346,-2.231905,-0.870569,0.050929
1,-0.721617,-0.068463,-0.514313,0.924806,-0.046560
2,-0.550596,0.557059,-0.522690,1.603810,-1.076933
3,0.331983,-0.656144,-1.910244,0.731161,0.310146
4,2.177783,-0.485098,-0.573985,-1.376808,0.143210
...,...,...,...,...,...
1019,0.485381,0.216378,1.514993,1.218844,1.419047
1020,-0.694140,0.217277,0.858736,-1.283879,-0.472882
1021,,-1.234836,-0.840876,0.278349,0.367145
1022,-0.026553,-0.616987,-1.813501,-1.084858,0.056444


In [12]:
df[df.isnull()]

Unnamed: 0,A,B,C,D,E
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
...,...,...,...,...,...
1019,,,,,
1020,,,,,
1021,,,,,
1022,,,,,


In [13]:
df.dropna(thresh=2)

Unnamed: 0,A,B,C,D,E
0,-0.085053,0.171346,-2.231905,-0.870569,0.050929
1,-0.721617,-0.068463,-0.514313,0.924806,-0.046560
2,-0.550596,0.557059,-0.522690,1.603810,-1.076933
3,0.331983,-0.656144,-1.910244,0.731161,0.310146
4,2.177783,-0.485098,-0.573985,-1.376808,0.143210
...,...,...,...,...,...
1019,0.485381,0.216378,1.514993,1.218844,1.419047
1020,-0.694140,0.217277,0.858736,-1.283879,-0.472882
1021,,-1.234836,-0.840876,0.278349,0.367145
1022,-0.026553,-0.616987,-1.813501,-1.084858,0.056444


In [15]:
df.fillna(value=0)

Unnamed: 0,A,B,C,D,E
0,-0.085053,0.171346,-2.231905,-0.870569,0.050929
1,-0.721617,-0.068463,-0.514313,0.924806,-0.046560
2,-0.550596,0.557059,-0.522690,1.603810,-1.076933
3,0.331983,-0.656144,-1.910244,0.731161,0.310146
4,2.177783,-0.485098,-0.573985,-1.376808,0.143210
...,...,...,...,...,...
1019,0.485381,0.216378,1.514993,1.218844,1.419047
1020,-0.694140,0.217277,0.858736,-1.283879,-0.472882
1021,0.000000,-1.234836,-0.840876,0.278349,0.367145
1022,-0.026553,-0.616987,-1.813501,-1.084858,0.056444


## Filtering Out Missing Data

In [17]:
data = pd.Series(data=[1, 5, np.nan, 7, 9, np.nan])

In [18]:
data

0    1.0
1    5.0
2    NaN
3    7.0
4    9.0
5    NaN
dtype: float64

In [22]:
data.dropna()

0    1.0
1    5.0
3    7.0
4    9.0
dtype: float64

equivalent to:

In [21]:
data[data.notnull()]

0    1.0
1    5.0
3    7.0
4    9.0
dtype: float64

In [23]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan], [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [24]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [26]:
cleaned = data.dropna()

In [27]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [29]:
#Passing how='all' will only drop rows that are all NA:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [30]:
data[4] = np.nan

In [31]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [32]:
#To drop columns in the same way, pass axis=1
data.dropna(how='all', axis=1)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [33]:
df = pd.DataFrame(data=np.random.randn(5,3))

In [34]:
df

Unnamed: 0,0,1,2
0,-1.032759,-0.875559,0.153805
1,1.127416,-0.165365,0.235686
2,0.410472,0.187333,-0.567676
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


In [38]:
df.iloc[:2,1] = np.nan

In [39]:
df.iloc[:3,2] = np.nan

In [40]:
df

Unnamed: 0,0,1,2
0,-1.032759,,
1,1.127416,,
2,0.410472,0.187333,
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


In [41]:
df.dropna()

Unnamed: 0,0,1,2
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


In [47]:
#Keep only the rows with at least thresh='2' non-NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.410472,0.187333,
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


## Filling In Missing Data

Rather than filtering out missing data (and potentially discarding other data along
with it), you may want to fill in the “holes” in any number of ways.

In [49]:
df

Unnamed: 0,0,1,2
0,-1.032759,,
1,1.127416,,
2,0.410472,0.187333,
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


In [50]:
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-1.032759,0.0,0.0
1,1.127416,0.0,0.0
2,0.410472,0.187333,0.0
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


In [51]:
#Calling fillna with a dict, you can use a different fill value for each column:
df.fillna(value={0:100, 1:200, 2:300})

Unnamed: 0,0,1,2
0,-1.032759,200.0,300.0
1,1.127416,200.0,300.0
2,0.410472,0.187333,300.0
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


In [58]:
df.fillna(value=0, inplace=True)

In [59]:
df

Unnamed: 0,0,1,2
0,-1.032759,0.0,0.0
1,1.127416,0.0,0.0
2,0.410472,0.187333,0.0
3,1.367596,0.144934,0.604169
4,0.152236,0.770223,0.703361


In [60]:
df = pd.DataFrame(np.random.randn(7, 4))

In [61]:
df

Unnamed: 0,0,1,2,3
0,-0.602468,-0.306963,-0.592741,-0.475788
1,0.513004,0.484631,0.591842,1.113649
2,-0.266285,-0.657614,-0.432998,1.109626
3,-0.101062,0.671304,-1.256782,0.076683
4,1.186549,-1.76696,-1.493829,1.366777
5,0.135172,0.792066,-1.152333,2.994277
6,0.946757,1.69028,0.585947,0.279398


In [64]:
df.loc[2:4,2] = np.nan

In [67]:
df.iloc[-1:-5:-1,0] = np.nan

In [68]:
df

Unnamed: 0,0,1,2,3
0,-0.602468,-0.306963,-0.592741,-0.475788
1,0.513004,0.484631,0.591842,1.113649
2,-0.266285,-0.657614,,1.109626
3,,0.671304,,0.076683
4,,-1.76696,,1.366777
5,,0.792066,-1.152333,2.994277
6,,1.69028,0.585947,0.279398


In [69]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2,3
0,-0.602468,-0.306963,-0.592741,-0.475788
1,0.513004,0.484631,0.591842,1.113649
2,-0.266285,-0.657614,0.591842,1.109626
3,-0.266285,0.671304,0.591842,0.076683
4,-0.266285,-1.76696,0.591842,1.366777
5,-0.266285,0.792066,-1.152333,2.994277
6,-0.266285,1.69028,0.585947,0.279398


In [71]:
df.fillna(method='bfill')

Unnamed: 0,0,1,2,3
0,-0.602468,-0.306963,-0.592741,-0.475788
1,0.513004,0.484631,0.591842,1.113649
2,-0.266285,-0.657614,-1.152333,1.109626
3,,0.671304,-1.152333,0.076683
4,,-1.76696,-1.152333,1.366777
5,,0.792066,-1.152333,2.994277
6,,1.69028,0.585947,0.279398


In [73]:
df.fillna(method='ffill', limit=3)

Unnamed: 0,0,1,2,3
0,-0.602468,-0.306963,-0.592741,-0.475788
1,0.513004,0.484631,0.591842,1.113649
2,-0.266285,-0.657614,0.591842,1.109626
3,-0.266285,0.671304,0.591842,0.076683
4,-0.266285,-1.76696,0.591842,1.366777
5,-0.266285,0.792066,-1.152333,2.994277
6,,1.69028,0.585947,0.279398


In [76]:
ser = pd.Series(data=[2, 5.3, np.nan, 7, np.nan, 87, 1.25])

In [77]:
ser

0     2.00
1     5.30
2      NaN
3     7.00
4      NaN
5    87.00
6     1.25
dtype: float64

In [78]:
ser.fillna(value=np.mean(ser))

0     2.00
1     5.30
2    20.51
3     7.00
4    20.51
5    87.00
6     1.25
dtype: float64

Table 7-2. fillna function arguments

# 7.2 Data Transformation

## Removing Duplicates

In [84]:
df = pd.DataFrame(data={'k1': ['one', 'two']*3 + ['two'], 'k2': [1, 1, 2, 2, 3, 3, 3]})

In [85]:
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3
6,two,3


In [90]:
# duplicated returns a boolean Series indicating whether each row is a duplicate (has been observed in a previous row) or not:
df.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [91]:
df.duplicated(keep='last')

0    False
1    False
2    False
3    False
4    False
5     True
6    False
dtype: bool

In [87]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
5,two,3


In [92]:
df.drop_duplicates(keep='last')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,2
4,one,3
6,two,3


In [94]:
df['k3'] = range(7)

In [95]:
df

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
5,two,3,5
6,two,3,6


In [99]:
df.drop_duplicates(['k1', 'k2'])

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
5,two,3,5


In [101]:
df.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,k3
0,one,1,0
1,two,1,1
2,one,2,2
3,two,2,3
4,one,3,4
6,two,3,6


## Transforming Data Using a Function or Mapping

In [102]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'], 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [103]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [104]:
d = {'bacon':'pig', 'pulled pork':'pig', 'pastrami': 'cow', 'corned beef':'cow', 'honey ham': 'pig', 'nova lox': 'salmon'}

In [109]:
lowered_case = data['food'].str.lower()

In [110]:
lowered_case

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [111]:
data['animal'] = lowered_case.map(d)

In [112]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [113]:
lowered_case.map(d)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [116]:
data['food'].map(lambda x: d[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Using map is a convenient way to perform element-wise transformations and other
data cleaning–related operations.


## Replacing Values

In [119]:
ser = pd.Series(data=[8.52, 5, -85, 41.02, -85, 1000, 45, -9.0023, -85])

In [120]:
ser

0       8.5200
1       5.0000
2     -85.0000
3      41.0200
4     -85.0000
5    1000.0000
6      45.0000
7      -9.0023
8     -85.0000
dtype: float64

In [121]:
ser.replace(-85, 0)

0       8.5200
1       5.0000
2       0.0000
3      41.0200
4       0.0000
5    1000.0000
6      45.0000
7      -9.0023
8       0.0000
dtype: float64

In [123]:
ser.replace([-85, -9.0023], np.nan)

0       8.52
1       5.00
2        NaN
3      41.02
4        NaN
5    1000.00
6      45.00
7        NaN
8        NaN
dtype: float64

In [124]:
ser.replace([-85, -9.0023, 1000], [np.nan, np.nan, 0])

0     8.52
1     5.00
2      NaN
3    41.02
4      NaN
5     0.00
6    45.00
7      NaN
8      NaN
dtype: float64

In [126]:
ser.replace({-85:np.nan, -9.0023:np.nan, 1000:0})

0     8.52
1     5.00
2      NaN
3    41.02
4      NaN
5     0.00
6    45.00
7      NaN
8      NaN
dtype: float64

In [132]:
ser.replace(ser[ser<0], np.nan)

0       8.52
1       5.00
2        NaN
3      41.02
4        NaN
5    1000.00
6      45.00
7        NaN
8        NaN
dtype: float64

In [131]:
ser

0       8.5200
1       5.0000
2     -85.0000
3      41.0200
4     -85.0000
5    1000.0000
6      45.0000
7      -9.0023
8     -85.0000
dtype: float64

## Renaming Axis Indexes

Like values in a Series, axis labels can be similarly transformed by a function or mapping of some form to produce new, differently labeled objects

In [133]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [134]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [142]:
data.index = data.index.map(lambda x: str.upper(x))

In [143]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [144]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [149]:
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [152]:
data.rename(index={'OHIO': 'NY'}, inplace=True)

In [153]:
data

Unnamed: 0,one,two,three,four
NY,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11
