# Iterative Imputer
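* MICE (Multivariate Imputation by Chained Equations) is the technique behind iterative imputation; it is appropriate when data is MAR (Missing At Random)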
* How does MICE work?
  * Fill all the NaN values with mean of respective cols
  * Set the originally missing values of col1 back to NaN, train any ML algorithm on the rows where col1 is observed (using the other cols as inputs), and use it to predict the missing values of col1
  * Repeat the above procedure for other columns
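  * Take the difference between the imputed values of the previous iteration (starting with Iteration 0, the mean-filled data) and those of the latest iteration
  * Repeat the above procedure until the difference is close to 0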


In [21]:
# Load only the columns used in this notebook from the training CSV
import numpy as np
import pandas as pd

# The order given to `usecols` does not affect the resulting column order;
# the frame keeps the order in which the columns appear in the file.
selected_columns = ['Age', 'Pclass', 'Fare', 'Survived']
df = pd.read_csv('train.csv', usecols=selected_columns)

# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [22]:
# Percentage of missing values per column:
# the mean of the boolean NaN mask is the fraction of missing entries.
missing_fraction = df.isna().mean()
missing_fraction * 100

Unnamed: 0,0
Survived,0.0
Pclass,0.0
Age,19.86532
Fare,0.0


In [23]:
# Separate the target from the features, then hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split

Y = df['Survived']
X = df.drop(columns=['Survived'])

# A fixed random_state makes the split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [29]:
# Impute missing feature values with IterativeImputer driven by linear regression
from sklearn.linear_model import LinearRegression
# The experimental enabling import has to come before importing IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

regressor = LinearRegression()
imputer = IterativeImputer(estimator=regressor, random_state=42)

# The imputer is fitted on the training split only; the test split is merely
# transformed with the already-fitted imputer.
X_train_i = imputer.fit_transform(X_train)
X_test_i = imputer.transform(X_test)

In [30]:
# Train LogisticRegression on the imputed training data and measure test accuracy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the fitted estimator, so construction and training are chained.
lr = LogisticRegression().fit(X_train_i, Y_train)
Y_pred = lr.predict(X_test_i)

# Share of test rows whose predicted label equals the true label.
accuracy_score(Y_test, Y_pred)

0.6927374301675978