# Handling Missing Values

### Load the data set

In [37]:
import seaborn as sns
import numpy as np
import pandas as pd
# Load the Titanic passenger table through seaborn's dataset loader,
# then preview the first five rows.
df = sns.load_dataset(name="titanic")
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# Boolean mask of the whole frame: True wherever a cell is missing.
# (`isna` is the same method as `isnull`.)
df.isna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
887,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
889,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
# Number of missing entries in each column (True counts as 1 when summed).
df.isna().sum(axis=0)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

### Handling Missing Values

In [None]:
# 1. Drop every row (data point) that contains at least one missing value.
#    Comparing the shape before and after shows how many rows this discards.
print(df.shape)
df.dropna(axis=0, how="any").shape

(891, 15)


(182, 15)

In [13]:
# 2. Column-wise deletion: drop every column that contains at least one
#    missing value, keeping all rows.
df.dropna(axis="columns")

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,class,who,adult_male,alive,alone
0,0,3,male,1,0,7.2500,Third,man,True,no,False
1,1,1,female,1,0,71.2833,First,woman,False,yes,False
2,1,3,female,0,0,7.9250,Third,woman,False,yes,True
3,1,1,female,1,0,53.1000,First,woman,False,yes,False
4,0,3,male,0,0,8.0500,Third,man,True,no,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Second,man,True,no,True
887,1,1,female,0,0,30.0000,First,woman,False,yes,True
888,0,3,female,1,2,23.4500,Third,woman,False,no,False
889,1,1,male,0,0,30.0000,First,man,True,yes,True


### Imputing Missing Values

#### 1. Mean Value Imputation:
Works well when the data is normally distributed and there are no outliers.

In [21]:
# Mean imputation: replace the missing values in `age` with the column mean
# and store the result in a new column so the original `age` stays untouched.
# NOTE: `mean` has to be *called*. Passing `df['age'].mean` (no parentheses)
# hands fillna the bound method itself, so the gaps would be filled with that
# object instead of a number.
df['age_mean'] = df['age'].fillna(df['age'].mean())
# Per-column missing counts: `age_mean` should be 0 while `age` is unchanged.
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
age_mean         0
dtype: int64

#### 2. Median Value Imputation:
Use it when the dataset contains outliers, since the median is less sensitive to them than the mean.

In [22]:
# Median imputation: replace the missing values in `age` with the column
# median and store the result in a new column.
# NOTE: `median` has to be *called*. Passing `df['age'].median` (no
# parentheses) hands fillna the bound method itself rather than a number.
df["age_median"] = df['age'].fillna(df['age'].median())
# Per-column missing counts: `age_median` should be 0 while `age` is unchanged.
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
age_mean         0
age_median       0
dtype: int64

#### Mode Value Imputation:
Used to fill missing values in categorical variables

In [35]:
# Mode imputation for the categorical `embarked` column: find its most
# frequent category and use that to fill the gaps.
print(df['embarked'].unique())  # distinct categories, including nan
# Series.mode skips missing values by default (dropna=True), so the column
# does not need to be filtered first; [0] takes the first reported mode.
mode = df['embarked'].mode(dropna=True)[0]
print(mode)
df['embarked_mode'] = df['embarked'].fillna(value=mode)
# Per-column missing counts after imputation.
df.isnull().sum()

['S' 'C' 'Q' nan]
S


survived           0
pclass             0
sex                0
age              177
sibsp              0
parch              0
fare               0
embarked           2
class              0
who                0
adult_male         0
deck             688
embark_town        2
alive              0
alone              0
age_mean           0
age_median         0
embarked_mode      0
dtype: int64

#### Random Sampling Technique:
Draw random values from the column's observed (non-missing) entries and use them to replace the missing values.

In [51]:
# Random-sample imputation: every missing `deck` entry is replaced by a value
# drawn, with replacement, from the observed (non-missing) `deck` entries.
deck_is_missing = df['deck'].isnull()
# Seeded generator so a fresh "Restart & Run All" reproduces the same imputed
# column; the unseeded global np.random state gave different values each run.
rng = np.random.default_rng(42)
random_values = rng.choice(
    df['deck'].dropna().to_numpy(),
    size=int(deck_is_missing.sum()),
    replace=True,
)
# Index the draws by the row labels of the missing entries so fillna can
# line each sampled value up with the gap it fills.
df['deck_random'] = df['deck'].fillna(
    pd.Series(random_values, index=df.index[deck_is_missing])
)
# Per-column missing counts: `deck_random` should be 0 while `deck` is unchanged.
df.isnull().sum()



survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
deck_random      0
dtype: int64