In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('E:\\Notes\\Statistics\\Feature Engineering\\Datasets\\train.csv', usecols=['Age','Pclass','Fare','Survived'])
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [3]:
df.isnull().mean() * 100

Survived     0.00000
Pclass       0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [13]:
X = df.drop(columns=['Survived'])
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

**Applying KNN Imputer**

In [14]:
knn = KNNImputer(n_neighbors=3, weights='distance')

X_train_trf = knn.fit_transform(X_train)
X_test_trf = knn.fit_transform(X_test)

In [17]:
lr = LogisticRegression()
lr.fit(X_train_trf, y_train)

y_pred= lr.predict(X_test_trf)

accuracy_score(y_test, y_pred)

0.7430167597765364

In [None]:
from sklearn.model_selection import cross_val_score

knn = KNNImputer(n_neighbors=3, weights='distance') # n_neighbors represent K=3

X_trf = knn.fit_transform(X)


print(np.mean(cross_val_score(lr, X_trf, y, cv=10, scoring='accuracy')))

0.7048938826466917


**Comparing with Simple Imputer**

In [18]:
si = SimpleImputer()

X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.fit_transform(X_test)

In [26]:
lr = LogisticRegression()
lr.fit(X_train_trf2, y_train)

y_pred2 = lr.predict(X_test_trf2)

accuracy_score(y_test, y_pred2)

0.7374301675977654

**MICE (Multivariate Imputation by Chained Equations)**

In [30]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

data = {'R&D Spend': [120,150,np.nan,140,130],
        'Administration': [90,np.nan,100,95,105],
        'Marketing Spend': [200,180,220,np.nan,190]}

df = pd.DataFrame(data)

In [None]:
mice_imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=42)
df_imputed = pd.DataFrame(mice_imputer.fit_transform(df), columns=df.columns)

In [33]:
print('Before Imputation:\n', df)
print('\n After MICE Imputation:\n', df_imputed)

Before Imputation:
    R&D Spend  Administration  Marketing Spend
0      120.0            90.0            200.0
1      150.0             NaN            180.0
2        NaN           100.0            220.0
3      140.0            95.0              NaN
4      130.0           105.0            190.0

 After MICE Imputation:
     R&D Spend  Administration  Marketing Spend
0  120.000000       90.000000        200.00000
1  150.000000       97.492744        180.00000
2   82.233954      100.000000        220.00000
3  140.000000       95.000000        186.66503
4  130.000000      105.000000        190.00000
