# Data Cleaning and Prepartion

### Handling Missing Data

In [9]:
import numpy as np
import pandas as pd
from numpy import nan as NA

data = pd.DataFrame([[1., 6.5, NA], [1., NA, NA], [NA, NA, NA], [NA, 6.5, NA]])
display(data)
print()
cleaned_data = data.dropna(how="all")
display(cleaned_data)
print()
display(data.dropna(how="any"))
print()
display(data.dropna(axis=1, how="any"))
print()
display(data.dropna(axis=1, how="all"))

Unnamed: 0,0,1,2
0,1.0,6.5,
1,1.0,,
2,,,
3,,6.5,





Unnamed: 0,0,1,2
0,1.0,6.5,
1,1.0,,
3,,6.5,





Unnamed: 0,0,1,2





0
1
2
3





Unnamed: 0,0,1
0,1.0,6.5
1,1.0,
2,,
3,,6.5


# Filling Missing Values

In [2]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4,1] = NA
df.iloc[:2,2] = NA
display(df)

Unnamed: 0,0,1,2
0,1.188976,,
1,-0.274115,,
2,-1.029359,,0.200778
3,-0.145914,,0.231221
4,0.167976,-0.648441,0.597405
5,-0.112498,0.555877,0.922957
6,2.301042,-0.503551,0.433272


In [3]:
display(df.fillna(1.5))

Unnamed: 0,0,1,2
0,1.188976,1.5,1.5
1,-0.274115,1.5,1.5
2,-1.029359,1.5,0.200778
3,-0.145914,1.5,0.231221
4,0.167976,-0.648441,0.597405
5,-0.112498,0.555877,0.922957
6,2.301042,-0.503551,0.433272


In [4]:
display(df.fillna({1:0.5, 2: 0}))

Unnamed: 0,0,1,2
0,1.188976,0.5,0.0
1,-0.274115,0.5,0.0
2,-1.029359,0.5,0.200778
3,-0.145914,0.5,0.231221
4,0.167976,-0.648441,0.597405
5,-0.112498,0.555877,0.922957
6,2.301042,-0.503551,0.433272


In [5]:
display(df.fillna(method="ffill"))

Unnamed: 0,0,1,2
0,1.188976,,
1,-0.274115,,
2,-1.029359,,0.200778
3,-0.145914,,0.231221
4,0.167976,-0.648441,0.597405
5,-0.112498,0.555877,0.922957
6,2.301042,-0.503551,0.433272


In [6]:
display(df.fillna(method="ffill", axis=1))

Unnamed: 0,0,1,2
0,1.188976,1.188976,1.188976
1,-0.274115,-0.274115,-0.274115
2,-1.029359,-1.029359,0.200778
3,-0.145914,-0.145914,0.231221
4,0.167976,-0.648441,0.597405
5,-0.112498,0.555877,0.922957
6,2.301042,-0.503551,0.433272


# Removing Duplicates

In [42]:
data1 = pd.DataFrame({
    "k1": ["one", "two"] * 3 + ["two"] , 
    "k2": [1, 1, 2, 3, 3, 4, 4]
})
display(data1)
display(data1.duplicated())
data2 = data1.drop_duplicates()
display(data2)
data1["v1"] = ["one", "two", "one", "four", "one", "six", "two"]
display(data1)
data3 = data1.drop_duplicates(["k1", "v1"])
display(data3)

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Unnamed: 0,k1,k2,v1
0,one,1,one
1,two,1,two
2,one,2,one
3,two,3,four
4,one,3,one
5,two,4,six
6,two,4,two


Unnamed: 0,k1,k2,v1
0,one,1,one
1,two,1,two
3,two,3,four
5,two,4,six


In [43]:
df1 = pd.DataFrame(np.random.randn(7,3))
df1.iloc[:4, 1] = np.nan
df1.iloc[:2, 2] = np.nan
display(df1)

Unnamed: 0,0,1,2
0,0.859401,,
1,-3.140355,,
2,-0.960563,,-0.857806
3,-0.053654,,-1.671158
4,1.26777,0.673622,-0.050284
5,0.644894,0.655371,0.100124
6,0.383865,0.208045,-1.166006


In [46]:
df1 = df1.replace(np.nan, -999)
display(df1)


Unnamed: 0,0,1,2
0,0.859401,-999.0,-999.0
1,-3.140355,-999.0,-999.0
2,-0.960563,-999.0,-0.857806
3,-0.053654,-999.0,-1.671158
4,1.26777,0.673622,-0.050284
5,0.644894,0.655371,0.100124
6,0.383865,0.208045,-1.166006


In [47]:
df1 = df1.replace(-999, -9)
display(df1)

Unnamed: 0,0,1,2
0,0.859401,-9.0,-9.0
1,-3.140355,-9.0,-9.0
2,-0.960563,-9.0,-0.857806
3,-0.053654,-9.0,-1.671158
4,1.26777,0.673622,-0.050284
5,0.644894,0.655371,0.100124
6,0.383865,0.208045,-1.166006


In [49]:
df1 = df1.replace([-999, -9], [np.nan, 0])
display(df1)

Unnamed: 0,0,1,2
0,0.859401,0.0,0.0
1,-3.140355,0.0,0.0
2,-0.960563,0.0,-0.857806
3,-0.053654,0.0,-1.671158
4,1.26777,0.673622,-0.050284
5,0.644894,0.655371,0.100124
6,0.383865,0.208045,-1.166006


# Renaming Axis Indexes using Function Mapping

In [54]:
data1 = pd.DataFrame(np.arange(12).reshape(3,4) , index=["Ohio", "Colorado","New york"], columns=["one", "two", "three", "four"])
display(data1)
transform = lambda x : x[:4].upper()
data1 = data1.index.map(transform)
display(data1)


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New york,8,9,10,11


Index(['OHIO', 'COLO', 'NEW '], dtype='object')

# Detecting and Filtering Outliers

In [64]:
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()
col = data[2]
col[np.abs(col) >3]

669    3.845438
762    3.092596
862   -3.020107
909   -3.077627
Name: 2, dtype: float64

In [74]:
outliers = data[(np.abs(data) > 3).any(1)]
display(outliers)
print(len(outliers), len(data))

Unnamed: 0,0,1,2,3
45,3.183837,0.922139,-0.786521,0.220701
251,4.55312,1.133316,1.293548,0.961999
318,0.100682,2.002224,-1.791008,-3.477822
352,-0.007563,-3.127492,0.623534,-0.694067
392,0.533208,-3.073389,1.694808,0.734441
537,-3.09395,-0.703691,0.950561,1.707716
550,0.487049,1.28597,-0.134216,-3.115799
556,-0.586285,0.900436,-0.635282,3.035562
669,0.758357,0.302023,3.845438,0.195395
762,-1.164822,0.137433,3.092596,-0.424055


13 1000


# Permutation and Random Sampling

In [87]:
df = pd.DataFrame(np.arange(5*4).reshape(5, 4))
print(df.shape)
sampler = np.random.permutation(4)
display(df)
df = df[sampler]
display(df)

(5, 4)


Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


Unnamed: 0,1,2,3,0
0,1,2,3,0
1,5,6,7,4
2,9,10,11,8
3,13,14,15,12
4,17,18,19,16


# Regular Expressions

In [8]:
import re

text = "foo   bar\t baz \tqux"
print(text.split())
print(re.split("\s+", text))      #first compiled and then split method is called.
rgx = re.compile("\s+")          # we compile it first now our time is saved split method can be called without compiling everytime.
print(rgx.split(text))



['foo', 'bar', 'baz', 'qux']
['foo', 'bar', 'baz', 'qux']
['foo', 'bar', 'baz', 'qux']


# Vectorization Strings in Pandas

In [27]:
import pandas as pd
import numpy as np
import re

data = ["simpleEmail@email.com","simple.email@email.com","plus+symbol@email.com","ash-symbol@email.com"
        "q@email.com","\"unsual\"@email.com","dash-symbol@email-dash.com","test@emailServer.com"
        "" "@email.com","user@[IPv6:2001:DB8::1]","example@localhost","example@s.solutions","12345@email.com"]
data = pd.Series(data)
print(data)
pattern =  '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'
print(data.str.findall(pattern, flags = re.IGNORECASE))
data[data.str.match(pattern , flags = re.IGNORECASE)]



0               simpleEmail@email.com
1              simple.email@email.com
2               plus+symbol@email.com
3     ash-symbol@email.comq@email.com
4                  "unsual"@email.com
5          dash-symbol@email-dash.com
6      test@emailServer.com@email.com
7             user@[IPv6:2001:DB8::1]
8                   example@localhost
9                 example@s.solutions
10                    12345@email.com
dtype: object
0          [(simpleEmail, email, com)]
1         [(simple.email, email, com)]
2          [(plus+symbol, email, com)]
3          [(ash-symbol, email, comq)]
4                                   []
5     [(dash-symbol, email-dash, com)]
6           [(test, emailServer, com)]
7                                   []
8                                   []
9                 [(example, s, solu)]
10               [(12345, email, com)]
dtype: object


0               simpleEmail@email.com
1              simple.email@email.com
2               plus+symbol@email.com
3     ash-symbol@email.comq@email.com
5          dash-symbol@email-dash.com
6      test@emailServer.com@email.com
9                 example@s.solutions
10                    12345@email.com
dtype: object

In [28]:
#Hierarchial indexing plays important role in reshaping data and group based operations like formating a pivot table.
#For example you could rearange a data into dataframe using its unstack method.

In [35]:
data = pd.Series(np.random.randn(9), index = [["a", "a", "a", "b", "b", "c", "c", "d", "d"], [1, 2, 3, 1, 3, 1,2, 2, 3]])
print(data)

a  1   -1.154226
   2   -1.084253
   3   -0.394760
b  1   -0.280824
   3    0.948456
c  1    1.104620
   2   -0.299841
d  2    0.203768
   3    1.309887
dtype: float64


In [48]:
data = data.unstack()
display(data)


Unnamed: 0,1,2,3
a,-1.154226,-1.084253,-0.39476
b,-0.280824,,0.948456
c,1.10462,-0.299841,
d,,0.203768,1.309887


In [56]:
frame = pd.DataFrame(np.arange(12).reshape(4,3), index = [["a","a","b","b"], [1,2,1,2]],
                     columns=[["Ohio","Ohio","Colorado"], ["Green","Red", "Green"]])
display(frame)
frame.index.names = ["key1", "key2"]
frame.columns.names = ["State", "Color"]
display(frame)
frame["Ohio"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Unnamed: 0_level_0,Color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [63]:
frame.swaplevel("key1","key2")
#frame = frame.swaplevel("key1","key2")

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [66]:
frame.sort_index(1)

Unnamed: 0_level_0,State,Colorado,Ohio,Ohio
Unnamed: 0_level_1,Color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [67]:
frame.swaplevel(0,1).sort_index(0)

Unnamed: 0_level_0,State,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11
