# Handling Missing Data

In [2]:
import pandas as pd
import numpy as np

In [11]:
# Demo data: a single float column 'nums' holding 1..10 with the value at
# position 4 replaced by NaN, so every imputation strategy has one gap to fill.
df_num = pd.Series(
    [1, 2, 3, 4, np.nan, 6, 7, 8, 9, 10],
    name='nums',
).to_frame()
df_num

Unnamed: 0,nums
0,1.0
1,2.0
2,3.0
3,4.0
4,
5,6.0
6,7.0
7,8.0
8,9.0
9,10.0


In [5]:
from sklearn.impute import SimpleImputer

In [10]:
# Mean imputation: the NaN in 'nums' is replaced by the average of the
# values that are present.
imp_mean = SimpleImputer(strategy='mean')
imp_mean.fit(df_num)               # learn the per-column mean
out1 = imp_mean.transform(df_num)  # plain array with the gap filled
# Wrap the array back into a DataFrame carrying the original column labels.
df_num2 = pd.DataFrame(data=out1, columns=df_num.columns)
df_num2

Unnamed: 0,nums
0,1.0
1,2.0
2,3.0
3,4.0
4,5.555556
5,6.0
6,7.0
7,8.0
8,9.0
9,10.0


In [18]:
# Mode (most-frequent) imputation.
# df_num has no repeated value, so work on a copy whose first entry is
# overwritten with 6; 6 then appears twice and becomes the mode.
df_num3 = df_num.copy()
df_num3.at[0, 'nums'] = 6
imp_mode = SimpleImputer(strategy='most_frequent')
imp_mode.fit(df_num3)                # learn the most frequent value
out2 = imp_mode.transform(df_num3)   # plain array with the gap filled
# Rebind df_num3 to the imputed result, labelled with the original columns.
df_num3 = pd.DataFrame(data=out2, columns=df_num.columns)
df_num3

Unnamed: 0,nums
0,6.0
1,2.0
2,3.0
3,4.0
4,6.0
5,6.0
6,7.0
7,8.0
8,9.0
9,10.0


In [23]:
# Median imputation: the NaN in 'nums' is replaced by the median of the
# values that are present.
imp_median = SimpleImputer(strategy='median')
imp_median.fit(df_num)               # learn the per-column median
out3 = imp_median.transform(df_num)  # plain array with the gap filled
# Wrap the array back into a DataFrame carrying the original column labels.
df_num4 = pd.DataFrame(data=out3, columns=df_num.columns)
df_num4

Unnamed: 0,nums
0,1.0
1,2.0
2,3.0
3,4.0
4,6.0
5,6.0
6,7.0
7,8.0
8,9.0
9,10.0


In [24]:
# Constant imputation: every missing entry is replaced by the fixed
# value given in fill_value (5 here) instead of a statistic of the column.
imp_const = SimpleImputer(strategy='constant', fill_value=5)
imp_const.fit(df_num)
out4 = imp_const.transform(df_num)  # plain array with the gap filled
# Wrap the array back into a DataFrame carrying the original column labels.
df_num5 = pd.DataFrame(data=out4, columns=df_num.columns)
df_num5

Unnamed: 0,nums
0,1.0
1,2.0
2,3.0
3,4.0
4,5.0
5,6.0
6,7.0
7,8.0
8,9.0
9,10.0


In [22]:
# Demo data: a single text column 'cats' with one missing entry at position 4.
df_cat = pd.DataFrame(
    ['apple', 'ball', 'cat', 'dog', np.nan, 'fan', 'giraffe', 'hockey'],
    columns=['cats'],
)
df_cat

Unnamed: 0,cats
0,apple
1,ball
2,cat
3,dog
4,
5,fan
6,giraffe
7,hockey


In [27]:
# Constant imputation on text data: the missing category is replaced by the
# fixed string given in fill_value.
imp_cat = SimpleImputer(strategy='constant', fill_value='elephant')
imp_cat.fit(df_cat)
out5 = imp_cat.transform(df_cat)  # plain array with the gap filled
# Wrap the array back into a DataFrame carrying the original column label.
df_cat2 = pd.DataFrame(data=out5, columns=df_cat.columns)
df_cat2

Unnamed: 0,cats
0,apple
1,ball
2,cat
3,dog
4,elephant
5,fan
6,giraffe
7,hockey


In [31]:
# Constant imputation plus a missing-value indicator: besides filling the gap
# with 'elephant', add_indicator=True makes the imputer append a mask column
# that flags which rows were originally missing.
# NOTE: this cell rebinds imp_cat, out5 and df_cat2 from the previous
# constant-fill cell; re-run that cell to get the indicator-free versions back.
imp_cat = SimpleImputer(strategy='constant', fill_value='elephant', add_indicator=True)
out5 = imp_cat.fit_transform(df_cat)
df_cat2 = pd.DataFrame(out5, columns=list(df_cat.columns)+['is_filled'])
# One array holds both the strings and the mask, so the flag column comes back
# as a generic object column; cast it so 'is_filled' is a real boolean column
# usable directly for filtering and aggregation.
df_cat2['is_filled'] = df_cat2['is_filled'].astype(bool)
df_cat2

Unnamed: 0,cats,is_filled
0,apple,False
1,ball,False
2,cat,False
3,dog,False
4,elephant,True
5,fan,False
6,giraffe,False
7,hockey,False
