In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from numpy import random

### There are 3 types of missing data

1- Missing Completely at Random (MCAR) - The missingness of a field is completely random: whether a value is missing cannot be predicted from any other value in the data (observed or missing).

2- Missing at Random (MAR) - The missingness of a field can be explained by the observed values in other columns, but not by the (unobserved) value of that column itself.

3- Missing Not at Random (MNAR) - The missingness depends on the missing value itself, i.e. there is a reason why the respondent didn't fill in that field, and hence the data is not missing at random. For example, if someone is obese, they are less likely to disclose their weight.

### Different ways to handle it:


1- do nothing / drop

2- imputation using mean, mode or median

3- imputation using 0 or global constant

4- fill it manually

5- imputation using ML algo.(costly)

6- MICE

### MICE (Multivariate Imputation By Chained Equations)

In [71]:
# Load the toy MICE data set.
# A raw string keeps the backslashes of the Windows path literal; in a normal
# string `\D` and `\m` are invalid escape sequences, which newer Python
# versions warn about. The resulting path is unchanged.
# NOTE(review): this is an absolute local path - consider a configurable data directory.
df = pd.read_csv(r'D:\Data_sets\mice.csv')
df

Unnamed: 0,age,experience,salary,loan
0,25.0,,50.0,0
1,27.0,3.0,,1
2,29.0,5.0,110.0,1
3,31.0,7.0,140.0,0
4,33.0,9.0,170.0,1
5,,11.0,200.0,0


In [72]:
# Keep only the three numeric columns that take part in the imputation.
feature_columns = ['age', 'experience', 'salary']
data = df[feature_columns]

### step 1 - fill the missing data with mean

In [73]:
# Baseline imputation: every NaN is replaced by the mean of its own column.
column_means = data.mean()
d1 = data.fillna(value=column_means)

In [74]:
# Show the mean-imputed frame; it is the baseline that the regression-imputed
# frame is compared against at the end of the pass.
d1

Unnamed: 0,age,experience,salary
0,25.0,7.0,50.0
1,27.0,3.0,134.0
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,29.0,11.0,200.0


### step 2 -
Take one column at a time: remove the value that was filled with the mean, and use the other columns to predict that missing value with an ML algorithm (e.g. linear regression). Repeat this step for every column in which a value was filled with the mean.

In [75]:
# Age pass: leave `age` missing (it is the prediction target) and mean-fill
# the two predictor columns. `fillna` already returns a DataFrame.
mean_fill = {
    'experience': df['experience'].mean(),
    'salary': df['salary'].mean(),
}
d2 = data.fillna(value=mean_fill)

In [76]:
# Show d2: experience and salary are mean-filled, age is still missing.
d2

Unnamed: 0,age,experience,salary
0,25.0,7.0,50.0
1,27.0,3.0,134.0
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,,11.0,200.0


In [77]:
from sklearn.linear_model import LinearRegression

## for age column

In [78]:
# Training set for the age model: drop every row that still has a NaN.
d3 = d2.dropna(axis=0, how='any')

In [79]:
# Show the training rows: d2 without the row whose age is missing.
d3

Unnamed: 0,age,experience,salary
0,25.0,7.0,50.0
1,27.0,3.0,134.0
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0


In [80]:
# Linear regression estimator; this same object is re-fitted for each column
# (age, experience, salary) in the cells below.
lr1=LinearRegression()

In [81]:
# Fit age as a function of experience and salary on the complete rows.
age_predictors = d3[['experience', 'salary']]
age_target = d3['age']
lr1.fit(age_predictors, age_target)

LinearRegression()

In [83]:
# Predict the missing age from the row in d2 that actually lacks it
# (experience=11, salary=200), instead of hand-copying those numbers.
# Passing a DataFrame with the same column names used in `fit` also avoids
# scikit-learn's "X does not have valid feature names" warning.
age_missing_rows = d2.loc[d2['age'].isna(), ['experience', 'salary']]
lr1.predict(age_missing_rows)

array([36.25316456])

## for experience column

In [84]:
# Experience pass: fill age with the regression estimate from the age pass
# (36.253... rounded to 36.25), mean-fill salary, and leave `experience`
# missing so it can be predicted. `fillna` already returns a DataFrame.
experience_pass_fill = {
    'age': 36.25,
    'salary': df['salary'].mean(),
}
d2 = data.fillna(value=experience_pass_fill)

In [85]:
# Show d2: age and salary are filled, experience is still missing.
d2

Unnamed: 0,age,experience,salary
0,25.0,,50.0
1,27.0,3.0,134.0
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,36.25,11.0,200.0


In [86]:
# Training set for the experience model: drop every row that still has a NaN.
d3 = d2.dropna(axis=0, how='any')

In [87]:
# Show the training rows: d2 without the row whose experience is missing.
d3

Unnamed: 0,age,experience,salary
1,27.0,3.0,134.0
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,36.25,11.0,200.0


In [88]:
# Re-fit the same estimator: experience as a function of age and salary.
experience_predictors = d3[['age', 'salary']]
experience_target = d3['experience']
lr1.fit(experience_predictors, experience_target)

LinearRegression()

In [89]:
# Predict the missing experience from the row in d2 that actually lacks it
# (age=25, salary=50), instead of hand-copying those numbers.
# Passing a DataFrame with the same column names used in `fit` also avoids
# scikit-learn's "X does not have valid feature names" warning.
experience_missing_rows = d2.loc[d2['experience'].isna(), ['age', 'salary']]
lr1.predict(experience_missing_rows)

array([1.8521521])

## for salary column

In [90]:
# Salary pass: fill age and experience with the regression estimates obtained
# so far (rounded to 36.25 and 1.85) and leave `salary` missing so it can be
# predicted. `fillna` already returns a DataFrame.
salary_pass_fill = {
    'age': 36.25,
    'experience': 1.85,
}
d2 = data.fillna(value=salary_pass_fill)

In [91]:
# Show d2: age and experience are filled, salary is still missing.
d2

Unnamed: 0,age,experience,salary
0,25.0,1.85,50.0
1,27.0,3.0,
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,36.25,11.0,200.0


In [94]:
# Training set for the salary model: drop every row that still has a NaN.
d3 = d2.dropna(axis=0, how='any')

In [95]:
# Show the training rows: d2 without the row whose salary is missing.
d3

Unnamed: 0,age,experience,salary
0,25.0,1.85,50.0
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,36.25,11.0,200.0


In [96]:
# Re-fit the same estimator: salary as a function of age and experience.
salary_predictors = d3[['age', 'experience']]
salary_target = d3['salary']
lr1.fit(salary_predictors, salary_target)

LinearRegression()

In [97]:
# Predict the missing salary from the row in d2 that actually lacks it
# (age=27, experience=3), instead of hand-copying those numbers.
# Passing a DataFrame with the same column names used in `fit` also avoids
# scikit-learn's "X does not have valid feature names" warning.
salary_missing_rows = d2.loc[d2['salary'].isna(), ['age', 'experience']]
lr1.predict(salary_missing_rows)

array([72.79560693])

In [98]:
# End of the first pass: fill every originally missing value with its
# regression estimate (the predictions above, rounded to two decimals).
# `fillna` already returns a DataFrame.
regression_fill = {
    'age': 36.25,
    'experience': 1.85,
    'salary': 72.79,
}
d2 = data.fillna(value=regression_fill)

In [99]:
# Show the frame after one full pass: each originally missing value now holds
# its regression-based estimate.
d2

Unnamed: 0,age,experience,salary
0,25.0,1.85,50.0
1,27.0,3.0,72.79
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,36.25,11.0,200.0


### Now subtract the first data frame (the one filled with the mean values) from it

In [100]:
# Mean-imputed baseline, shown again for comparison with d2.
d1

Unnamed: 0,age,experience,salary
0,25.0,7.0,50.0
1,27.0,3.0,134.0
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,29.0,11.0,200.0


In [101]:
# Regression-imputed frame from this pass, shown for comparison with d1.
d2

Unnamed: 0,age,experience,salary
0,25.0,1.85,50.0
1,27.0,3.0,72.79
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,36.25,11.0,200.0


### Step 3 - Difference of the datasets

In [102]:
# Element-wise change between this pass (d2) and the mean-imputed baseline (d1);
# only the imputed cells are non-zero.
d2.sub(d1)

Unnamed: 0,age,experience,salary
0,0.0,-5.15,0.0
1,0.0,0.0,-61.21
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,7.25,0.0,0.0


### Now take the latest imputed data frame (d2) as the new base dataset and repeat the same steps until all the values in the difference dataset are zero or close to zero. The imputed values in the dataset from the last iteration are then your estimates for the missing values.