# Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Sample array with one missing entry; the NaN makes every element a float.
val1 = np.array([1, np.nan, 7, 1, 8])
val1

array([ 1., nan,  7.,  1.,  8.])

Machine learning models cannot take unstructured data as input. Data cleaning converts unstructured data into structured data so that a good model can be built.

In [4]:
# Adding anything to NaN yields NaN.
6+np.nan

nan

In [5]:
# Multiplying by NaN also yields NaN.
7*np.nan

nan

NumPy does provide some special aggregations that ignore these missing values:

In [6]:
np.nansum(val1)   # sums the array while skipping the NaN entry

17.0

NaN is specifically a floating-point value; there is no equivalent NaN value for integers.

In [7]:
# Both np.nan and None are displayed as NaN, and the integers become float64.
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

Operating on null values

- `isnull()`: Generate a boolean mask indicating missing values
- `notnull()`: Opposite of `isnull()`
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed

In [16]:
# Toy table: the second record is entirely missing and the third has no
# pre-test score.
raw_data = {
    'firstname': ['dhara', np.nan, 'raj'],
    'last_name': ['Patel', np.nan, 'Sharma'],
    'age': [42, np.nan, 24],
    'sex': ['m', np.nan, 'f'],
    'pretestScore': [25, np.nan, np.nan],
    'posttestscore': [12, np.nan, 24],
}
df = pd.DataFrame(data=raw_data)
df

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0
1,,,,,,
2,raj,Sharma,24.0,f,,24.0


In [17]:
# Keep only the rows that have no missing value in any column.
df_no_missing = df.dropna(how="any")
df_no_missing

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0


In [18]:
# Drop a row only when every one of its values is missing.
df_cleaned = df.dropna(axis=0, how="all")
df_cleaned

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0
2,raj,Sharma,24.0,f,,24.0


In [19]:
# Show a copy in which every missing value is replaced by 0; df itself is unchanged.
df.fillna(value=0)

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0
1,0,0,0.0,0,0.0,0.0
2,raj,Sharma,24.0,f,0.0,24.0


In [20]:
# Fill the missing pretestScore values with the column mean. The result is
# assigned back to the column: calling an in-place method on a column
# selection may act on a temporary copy and leave df unchanged.
df["pretestScore"] = df["pretestScore"].fillna(df["pretestScore"].mean())
df

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0
1,,,,,25.0,
2,raj,Sharma,24.0,f,25.0,24.0


In [22]:
# Fill missing posttestscore values with the mean score of the row's 'sex'
# group. Rows whose 'sex' is itself missing belong to no group, so their
# score stays NaN. Assigning back avoids an in-place call on a column
# selection, which may act on a temporary copy and leave df unchanged.
df["posttestscore"] = df["posttestscore"].fillna(
    df.groupby("sex")["posttestscore"].transform("mean")
)
df

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0
1,,,,,25.0,
2,raj,Sharma,24.0,f,25.0,24.0


# select some rows but ignore the missing data points

In [26]:
# Keep the rows where both 'age' and 'sex' are present.
df.loc[df['age'].notna() & df['sex'].notna()]

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0
2,raj,Sharma,24.0,f,25.0,24.0


In [27]:
# Backfill: each gap takes the next valid value further down its column.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill()

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25.0,12.0
1,raj,Sharma,24.0,f,25.0,24.0
2,raj,Sharma,24.0,f,25.0,24.0


In [28]:
# Forward fill along each row (axis=1): a gap takes the nearest valid value
# to its left. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
df.ffill(axis=1)

Unnamed: 0,firstname,last_name,age,sex,pretestScore,posttestscore
0,dhara,Patel,42.0,m,25,12
1,,,,,25,25
2,raj,Sharma,24.0,f,25,24


# Hierarchical Indexing

In [None]:
# Example data for the hierarchical-indexing cells that follow, which index
# the frame by its 'regiment' and 'company' columns.
df = pd.DataFrame({
    'regiment': ['Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons'],
    'company': ['1st', '2nd', '1st', '2nd'],
    'pretestScore': [4, 24, 31, 2],
    'posttestscore': [25, 94, 57, 62],
})
df

In [None]:
# Preview a two-level index while also keeping both key columns in the frame.
df.set_index(keys=['regiment', 'company'], drop=False)

In [None]:
# Index the frame by regiment, then by company. Reassigning the result
# (rather than using inplace=True) makes the change to df explicit.
df = df.set_index(['regiment', 'company'])

# set the hierarchical index to be by regiment, and then by company

In [None]:
# Show the frame with the two index levels exchanged; df itself is not modified.
df.swaplevel(i='regiment', j='company')

In [None]:
# Sum the columns within each regiment. The `level` argument of
# DataFrame.sum was removed in pandas 2.0; grouping on the index level is
# the supported equivalent.
df.groupby(level='regiment').sum()