In [3]:
import pandas as pd
import numpy as np
float_data =pd.Series([1, -3.5, np.nan, 0])

In [4]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

### Filtering out Missing Data

In [7]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data1 =pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])

In [12]:
data1.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [15]:
data1.dropna(how ='all')
data[4] =NA
data1.dropna(axis=1, how ='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
df =pd.DataFrame(np.random.standard_normal((7,3)))
df.iloc[:4,1] =NA
df.iloc[:2,2] =NA

In [19]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.683572,,0.375947
3,0.563339,,1.144469
4,1.75449,1.127736,-0.453405
5,2.354803,-0.979093,0.839817
6,1.301139,-0.973327,-0.100995


### Filling in Missing Data

In [20]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.447927,0.0,0.0
1,-0.291675,0.0,0.0
2,-1.683572,0.0,0.375947
3,0.563339,0.0,1.144469
4,1.75449,1.127736,-0.453405
5,2.354803,-0.979093,0.839817
6,1.301139,-0.973327,-0.100995


In [25]:
df.fillna(0)
df

Unnamed: 0,0,1,2
0,-0.447927,0.0,0.0
1,-0.291675,0.0,0.0
2,-1.683572,,0.375947
3,0.563339,,1.144469
4,1.75449,,
5,2.354803,,
6,1.301139,,


In [26]:
df.iloc[2:, 1] =NA
df.iloc[4:, 2] =NA

In [27]:

df.fillna(method ='ffill')

Unnamed: 0,0,1,2
0,-0.447927,0.0,0.0
1,-0.291675,0.0,0.0
2,-1.683572,0.0,0.375947
3,0.563339,0.0,1.144469
4,1.75449,0.0,1.144469
5,2.354803,0.0,1.144469
6,1.301139,0.0,1.144469


In [28]:
df.fillna(method ='ffill', limit =2)

Unnamed: 0,0,1,2
0,-0.447927,0.0,0.0
1,-0.291675,0.0,0.0
2,-1.683572,0.0,0.375947
3,0.563339,0.0,1.144469
4,1.75449,,1.144469
5,2.354803,,1.144469
6,1.301139,,


In [29]:
data =pd.Series([1, NA, 3.7, NA, 7])

In [30]:
data.fillna(data.mean())

0    1.0
1    3.9
2    3.7
3    3.9
4    7.0
dtype: float64

### Data Transform
Removing Duplicates

In [34]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [36]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [37]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [40]:
data['v'] =range(7)
data


Unnamed: 0,k1,k2,v
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [42]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v
0,one,1,0
1,two,1,1


### Transforming Data using mapping () Function:

In [48]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork','bacon','Pastrami', 'corned beef','Bacon','pastrami', 'honey ham','nova lox'],'ounces': [4, 3, 12, 6, 7.5, 8, 3,5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [59]:
meat_to_animal = {
 'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'
}


In [60]:
lowercased = data['food'].str.lower()

lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [67]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [65]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [66]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


### Replacing Values
We replace values with new values using replace () function which create a new Series (unless we use inplace =Ture which alter in present list)

In [70]:
data =pd.Series([2., -999, 3, 7, -999])
data

0      2.0
1   -999.0
2      3.0
3      7.0
4   -999.0
dtype: float64

In [71]:
data.replace(-999, np.nan)

0    2.0
1    NaN
2    3.0
3    7.0
4    NaN
dtype: float64

In [72]:
### To Replace multiple values at time use list
data.replace([-999, 7], np.nan)


0    2.0
1    NaN
2    3.0
3    NaN
4    NaN
dtype: float64

In [74]:
### To replace multiple values with multiple substitute
data.replace([-999, 7], [555, 989])

0      2.0
1    555.0
2      3.0
3    989.0
4    555.0
dtype: float64

In [75]:
### To replace we can use Dictionary to replace with substitute
data.replace({-999: 111, 7:222})

0      2.0
1    111.0
2      3.0
3    222.0
4    111.0
dtype: float64

### Rename indexing of DataFrame 
Like list, Series name of axis can be modify with function or other way. We also rename index in dataframe inplace without creating new.

In [84]:
df =pd.DataFrame(np.arange(12).reshape(3,4), index = ['pakpattan', 'sahiwal', 'lahore'], columns = ['one', 'tow', 'three', 'four'])
df

Unnamed: 0,one,tow,three,four
pakpattan,0,1,2,3
sahiwal,4,5,6,7
lahore,8,9,10,11


In [90]:
### it support map() function
transform = lambda x: x[:4].upper()
df.index.map(transform)



Index(['PAKP', 'SAHI', 'LAHO'], dtype='object')

In [92]:
df.index =df.index.map(transform)
df

Unnamed: 0,one,tow,three,four
PAKP,0,1,2,3
SAHI,4,5,6,7
LAHO,8,9,10,11


In [93]:
### if we want to create a transform version of DataFrame without the original Data set. Then we use rename function
df.rename(index =str.title, columns =str.upper)

Unnamed: 0,ONE,TOW,THREE,FOUR
Pakp,0,1,2,3
Sahi,4,5,6,7
Laho,8,9,10,11


In [94]:
### rename can be use with dic like object, providing a new value for subset of axis
df.rename(index ={'PAKP': 'Pakpattan', }, columns ={'three': 3})

Unnamed: 0,one,tow,3,four
Pakpattan,0,1,2,3
SAHI,4,5,6,7
LAHO,8,9,10,11


# Discretization and Binning
Continous Data often discretization or binning into group for data analysis. For example we have data of age we want to group them into discrete buckets

In [95]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]


In [97]:
# we want to group age between binn like 15 to 25, 26 to 35, 36 t0 45.as
bin =[18, 25, 35, 60,100]
# use cut function
bins_group =pd.cut(ages, bin)
bins_group

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [100]:
bins_group.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [106]:
print(bins_group.categories)
print(bins_group.categories[0])
print(pd.value_counts(bins_group))

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
(18, 25]
(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64


In [107]:
pd.cut(ages, [18, 25, 35, 60, 100], right =False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

### Detecting and Filtering Output.

In [118]:
data =pd.DataFrame(np.random.standard_normal((10000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,10000.0,10000.0,10000.0,10000.0
mean,0.013354,0.006502,0.014074,0.001215
std,0.999599,1.008564,1.010535,0.999878
min,-3.381609,-4.202444,-4.72028,-3.809224
25%,-0.664511,-0.676017,-0.677245,-0.66436
50%,0.00978,0.008444,0.019746,0.011649
75%,0.685219,0.68742,0.703263,0.672793
max,5.416407,4.045231,3.566536,4.57492


In [119]:
col = data[2]
col[col.abs() > 3]

1244   -4.720280
1634    3.055333
1857    3.027749
1858    3.384144
1860   -3.535051
2106   -3.507081
2334   -3.115923
2418   -3.249062
2634    3.280970
2726    3.566536
3134    3.021571
3317    3.103224
3563   -3.239505
3581   -3.780339
3839   -3.132421
4009   -4.235843
5365    3.486033
5519   -3.281584
5622    3.016226
5845    3.191365
6588    3.094208
6804    3.172114
6931    3.261221
7232    3.308325
7297   -3.002846
7700    3.017346
8265   -3.250495
8699   -3.373629
9326    3.214285
9545    3.179980
Name: 2, dtype: float64

In [122]:
data[(data.abs()>3).any(1)]

Unnamed: 0,0,1,2,3
50,-0.874542,0.131974,0.393842,3.067226
198,3.202462,0.542144,1.654696,-0.552803
238,0.007305,3.296556,0.177155,0.549131
284,0.496725,1.297629,1.819345,-3.139133
343,-0.324570,-3.972893,0.769736,0.402900
...,...,...,...,...
9216,0.485842,0.561676,0.261952,3.544583
9326,1.044235,-1.284277,3.214285,-1.383278
9545,-0.638026,0.820533,3.179980,0.483713
9612,-0.331750,0.239120,-0.021625,3.011448


In [124]:
data[data.abs()>3] =np.sign(data)*3
data.describe()

Unnamed: 0,0,1,2,3
count,10000.0,10000.0,10000.0,10000.0
mean,0.012697,0.006268,0.014378,0.001264
std,0.996273,1.004802,1.007259,0.99734
min,-3.0,-3.0,-3.0,-3.0
25%,-0.664511,-0.676017,-0.677245,-0.66436
50%,0.00978,0.008444,0.019746,0.011649
75%,0.685219,0.68742,0.703263,0.672793
max,3.0,3.0,3.0,3.0


In [127]:
# The statement np.sign(data) produces 1 and –1 values based on whether the values in data are positive or negative
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,-1.0,1.0,1.0,1.0
2,1.0,-1.0,1.0,-1.0
3,-1.0,1.0,-1.0,1.0
4,1.0,1.0,1.0,1.0


### Permutation and Random Sampling
permutation (randomly reording ) Series or row in Data Frame. Possibly using np.random.permutation(). Calling Permutation with axis to permute to reoder the axis

In [130]:
df =pd.DataFrame(np.arange(5*7).reshape(5,7))
sample =np.random.permutation(5)
sample

array([3, 4, 2, 1, 0])

In [132]:
df
df.take(sample)

Unnamed: 0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6


In [134]:
df.iloc[:, np.random.permutation(7)]

Unnamed: 0,5,3,4,2,1,0,6
0,5,3,4,2,1,0,6
1,12,10,11,9,8,7,13
2,19,17,18,16,15,14,20
3,26,24,25,23,22,21,27
4,33,31,32,30,29,28,34


In [137]:
# To Select random Subset with out replacement use sample method
df.sample(3)

Unnamed: 0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20


In [142]:
# To Generate a sample with replacement using sample with Inplace =True
df =pd.Series([1,4,6,7,8,])
Sample1 =df.sample(10, replace =True)
Sample1

4    8
3    7
3    7
2    6
4    8
2    6
0    1
2    6
4    8
0    1
dtype: int64

### Computing Indicater/ Dummy Variable
Another transformation for machine Learning and Statistical analysis is converting a Categorical. If a column in DataFrame has k distic values. We would derive matrix or DataFrame with k column has 1s and 0s values

In [145]:
df =pd.DataFrame({'keys':['a', 'B', 'c', 'a', 'a', 'b', 'b', 'c', 'c'], 'data1':range(9)})
pd.get_dummies(df['keys'])

Unnamed: 0,B,a,b,c
0,0,1,0,0
1,1,0,0,0
2,0,0,0,1
3,0,1,0,0
4,0,1,0,0
5,0,0,1,0
6,0,0,1,0
7,0,0,0,1
8,0,0,0,1


In [151]:
# some time we want to add prefix to the dummy variable
dummies = pd.get_dummies(df['keys'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy


Unnamed: 0,data1,key_B,key_a,key_b,key_c
0,0,0,1,0,0
1,1,1,0,0,0
2,2,0,0,0,1
3,3,0,1,0,0
4,4,0,1,0,0
5,5,0,0,1,0
6,6,0,0,1,0
7,7,0,0,0,1
8,8,0,0,0,1
