# handling missing data

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Small Series of names with one missing entry (np.nan) at position 2.
names = ['ayesha', 'aqsa', np.nan, 'khadija']
string_data = pd.Series(names)

In [4]:
string_data

0     ayesha
1       aqsa
2        NaN
3    khadija
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# Overwrite the first entry with None; the isnull() call that follows reports it as missing too.
string_data[0]=None

In [7]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

# filtering out missing data

In [8]:
from numpy import nan as NA

In [10]:
# Numeric Series with missing values at positions 1 and 3 (NA is numpy's nan).
values = [1, NA, 3.5, NA, 7]
data = pd.Series(values)

In [11]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [14]:
# 4x3 frame mixing a complete row, partially missing rows and a fully missing row.
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)

In [15]:
cleaned=data.dropna()

In [16]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [18]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [19]:
# Add a new column labelled 4 in which every value is missing.
data[4]=NA

In [20]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [22]:
# 7x3 standard-normal sample; no seed is set in this cell, so the values differ on each run.
noise = np.random.randn(7, 3)
df = pd.DataFrame(noise)

In [23]:
# Mark the first four rows of the second column (position 1) as missing.
df.iloc[:4,1]=NA

In [24]:
# Mark the first two rows of the third column (position 2) as missing.
df.iloc[:2,2]=NA

In [25]:
df

Unnamed: 0,0,1,2
0,-0.586701,,
1,-0.33463,,
2,-1.089112,,-0.013649
3,0.305493,,-1.394577
4,-1.168818,-0.917824,-0.193333
5,0.698757,-0.01069,1.673743
6,-0.79315,0.100512,-1.771277


In [26]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.168818,-0.917824,-0.193333
5,0.698757,-0.01069,1.673743
6,-0.79315,0.100512,-1.771277


In [27]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.089112,,-0.013649
3,0.305493,,-1.394577
4,-1.168818,-0.917824,-0.193333
5,0.698757,-0.01069,1.673743
6,-0.79315,0.100512,-1.771277


# filling & replacing data

In [28]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.586701,0.0,0.0
1,-0.33463,0.0,0.0
2,-1.089112,0.0,-0.013649
3,0.305493,0.0,-1.394577
4,-1.168818,-0.917824,-0.193333
5,0.698757,-0.01069,1.673743
6,-0.79315,0.100512,-1.771277


In [29]:
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,-0.586701,0.5,0.0
1,-0.33463,0.5,0.0
2,-1.089112,0.5,-0.013649
3,0.305493,0.5,-1.394577
4,-1.168818,-0.917824,-0.193333
5,0.698757,-0.01069,1.673743
6,-0.79315,0.100512,-1.771277


In [30]:
# Fill the remaining gaps with 0 and rebind df to the filled frame.
# Rebinding avoids inplace=True and avoids assigning to `_`, which IPython
# uses as its last-output shortcut.
df = df.fillna(0)

In [31]:
df

Unnamed: 0,0,1,2
0,-0.586701,0.0,0.0
1,-0.33463,0.0,0.0
2,-1.089112,0.0,-0.013649
3,0.305493,0.0,-1.394577
4,-1.168818,-0.917824,-0.193333
5,0.698757,-0.01069,1.673743
6,-0.79315,0.100512,-1.771277


In [32]:
# Fresh 6x3 standard-normal sample (unseeded in this cell) for the forward-fill examples.
noise = np.random.randn(6, 3)
df = pd.DataFrame(noise)

In [33]:
# From the third row onward, mark the second column (position 1) as missing.
df.iloc[2:,1]=NA

In [34]:
# From the fifth row onward, mark the third column (position 2) as missing.
df.iloc[4:,2]=NA

In [35]:
df

Unnamed: 0,0,1,2
0,0.926201,-2.056741,0.060426
1,-0.742062,0.348396,0.164469
2,-0.497,,0.21256
3,0.122651,,0.033849
4,1.402472,,
5,-1.434623,,


In [37]:
# Forward-fill: each gap takes the last valid value above it in the same column.
# DataFrame.ffill() is used because fillna's `method` argument is deprecated.
df.ffill()

Unnamed: 0,0,1,2
0,0.926201,-2.056741,0.060426
1,-0.742062,0.348396,0.164469
2,-0.497,0.348396,0.21256
3,0.122651,0.348396,0.033849
4,1.402472,0.348396,0.033849
5,-1.434623,0.348396,0.033849


In [38]:
# Forward-fill at most two consecutive gaps per column; longer runs stay NaN.
# DataFrame.ffill() is used because fillna's `method` argument is deprecated.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.926201,-2.056741,0.060426
1,-0.742062,0.348396,0.164469
2,-0.497,0.348396,0.21256
3,0.122651,0.348396,0.033849
4,1.402472,,0.033849
5,-1.434623,,0.033849


In [39]:
# Float Series with two gaps, used next to demonstrate filling with the mean.
observations = [1., NA, 3.5, NA, 7]
data = pd.Series(observations)

In [40]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# removing duplicates

In [42]:
# Two key columns; the last row repeats the ('two', 4) pair of the row before it.
data = pd.DataFrame(
    {
        'k1': ['one', 'two'] * 3 + ['two'],
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)

In [44]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [45]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [46]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [47]:
# Add a v1 column holding 0..6, one value for each of the seven rows.
data['v1']=range(7)

In [48]:
# De-duplicate on k1 only, keeping the first row seen for each k1 value.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [49]:
# De-duplicate on the (k1, k2) pair, keeping the last occurrence instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


# replacing values

In [50]:
# Float Series in which -999 and -1000 are the values swapped out by the replace() calls that follow.
readings = [1., -999, 2., -999., -1000., 3.]
data = pd.Series(readings)

In [51]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [52]:
data.replace(-999,NA)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [53]:
data.replace([-999,-1000],NA)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [54]:
data.replace([-999,-1000],[NA,1])

0    1.0
1    NaN
2    2.0
3    NaN
4    1.0
5    3.0
dtype: float64

In [55]:
data.replace({-999:NA,-1000:1})

0    1.0
1    NaN
2    2.0
3    NaN
4    1.0
5    3.0
dtype: float64

# detecting & filtering outliers

In [56]:
# 1000x4 standard-normal sample (unseeded in this cell) used for the outlier examples.
noise = np.random.randn(1000, 4)
data = pd.DataFrame(noise)

In [57]:
data


Unnamed: 0,0,1,2,3
0,0.012748,-1.028518,1.083392,1.667220
1,-0.931497,-0.836565,-1.149882,0.677617
2,0.664061,0.072385,1.865665,-1.306061
3,-0.362204,-0.779193,-2.099137,0.325219
4,-1.330795,0.503229,-0.585742,0.078305
...,...,...,...,...
995,-0.485662,-0.150861,-1.020490,-0.043397
996,0.247427,-1.169759,-0.666863,-0.776718
997,0.340604,-0.451387,-0.658429,1.437849
998,-0.219532,1.092696,-0.907714,-0.659904


In [58]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.037726,-0.0251,-0.015391,0.007899
std,0.983859,1.030438,0.992747,0.979286
min,-3.543958,-3.177522,-3.346195,-2.913966
25%,-0.650107,-0.700256,-0.678878,-0.674992
50%,0.052896,-0.032473,-0.055388,-0.01196
75%,0.707446,0.675141,0.711678,0.685318
max,2.92814,3.100155,2.885872,2.924347


In [59]:
col=data[2]

In [60]:
col[np.abs(col)>3]

319   -3.346195
Name: 2, dtype: float64

In [61]:
# Select every row in which at least one column exceeds 3 in absolute value.
# axis is passed by keyword: the positional form any(1) is deprecated and
# no longer accepted by newer pandas releases.
data[(np.abs(data)>3).any(axis=1)]

  data[(np.abs(data)>3).any(1)]


Unnamed: 0,0,1,2,3
97,-0.708778,3.100155,-0.956387,1.972205
278,-3.543958,-0.158649,-1.245278,1.801196
319,0.501291,3.052386,-3.346195,-0.256823
910,0.147573,-3.008614,-0.998426,-0.936855
941,0.865259,-3.177522,1.499537,-0.37163


In [62]:
# Cap outliers in place: every value beyond +/-3 becomes 3 carrying its original sign.
outlier_mask = np.abs(data) > 3
data[outlier_mask] = np.sign(data) * 3

In [63]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.03827,-0.025066,-0.015045,0.007899
std,0.982025,1.02943,0.991644,0.979286
min,-3.0,-3.0,-3.0,-2.913966
25%,-0.650107,-0.700256,-0.678878,-0.674992
50%,0.052896,-0.032473,-0.055388,-0.01196
75%,0.707446,0.675141,0.711678,0.685318
max,2.92814,3.0,2.885872,2.924347


In [64]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,-1.0,-1.0,-1.0,1.0
2,1.0,1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,1.0
4,-1.0,1.0,-1.0,1.0
