In [48]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDClassifier

# SimpleImputer

In [49]:
from sklearn.impute import SimpleImputer

In [50]:
# Toy feature matrix (4 samples x 2 features); the first column of the
# last row is missing and will be imputed.
X = np.array(
    [
        [10, 3],
        [0, 4],
        [5, 3],
        [np.nan, 3],
    ]
)

In [51]:
# Show the raw matrix before imputation; its last row holds a NaN in the first column.
X

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [nan,  3.]])

In [52]:
# Mean imputation: every NaN is replaced by the mean of its column,
# computed from the non-missing values seen during fit.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)

# Learn the column means, then display the imputed matrix.
imputer.fit(X)
imputer.transform(X)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

In [53]:
# New data whose last row is missing in *both* columns.
X_test = np.array(
    [
        [10, 3],
        [0, 4],
        [5, 3],
        [np.nan, np.nan],
    ]
)

# Only transform (no refit): the NaNs are filled with the column means
# the imputer learned from the previously fitted X.
imputer.transform(X_test)

array([[10.  ,  3.  ],
       [ 0.  ,  4.  ],
       [ 5.  ,  3.  ],
       [ 5.  ,  3.25]])

# KNNImputer

In [54]:
from sklearn.impute import KNNImputer

In [55]:
# Toy matrix for nearest-neighbour imputation: the last sample is missing
# its first feature but has a known second feature (20).
X = np.array(
    [
        [1, 100],
        [2, 30],
        [3, 15],
        [np.nan, 20],
    ]
)

In [56]:
# With a single neighbour, the NaN is filled with the value taken from the
# closest sample (closeness is measured on the features that are present).
imputer = KNNImputer(n_neighbors=1)
imputer.fit(X)
imputer.transform(X)

array([[  1., 100.],
       [  2.,  30.],
       [  3.,  15.],
       [  3.,  20.]])

# MissingIndicator

In [57]:
from sklearn.impute import MissingIndicator

In [58]:
# Toy matrix whose last sample is entirely missing (NaN in both features).
X = np.array(
    [
        [1, 100],
        [2, 30],
        [3, 15],
        [np.nan, np.nan],
    ]
)

In [59]:
# Boolean mask marking which entries of X are missing (True = missing).
missing_mask = MissingIndicator().fit_transform(X)
missing_mask

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

In [60]:
from sklearn.pipeline import make_union

# Run both transformers on the same input and concatenate their outputs
# column-wise: the imputed features (NaN -> -99) followed by the
# missing-value indicator columns.
pipeline = make_union(
    SimpleImputer(strategy="constant", fill_value=-99),
    MissingIndicator(),
)

pipeline.fit_transform(X)

array([[  1., 100.,   0.,   0.],
       [  2.,  30.,   0.,   0.],
       [  3.,  15.,   0.,   0.],
       [-99., -99.,   1.,   1.]])

# Application: Titanic

In [61]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns

In [62]:
# Load the Titanic dataset bundled with seaborn and keep two features
# ('pclass', 'age') to predict the 'survived' target.
titanic = sns.load_dataset("titanic")
X, y = titanic[["pclass", "age"]], titanic["survived"]

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [63]:
# Impute missing values with KNN, then classify with a linear model trained by SGD.
# SGDClassifier shuffles the training data on every epoch, so without a fixed
# random_state each run (and each grid-search fold) gives different scores and
# possibly a different "best" n_neighbors. The seed matches the one used for
# train_test_split so the whole experiment is reproducible after Restart & Run All.
# NOTE(review): SGDClassifier is documented as sensitive to feature scaling;
# consider adding a scaler step if accuracy matters here.
model = make_pipeline(KNNImputer(), SGDClassifier(random_state=42))

In [64]:
# Hyper-parameter grid for the search: try 1 to 4 neighbours for the
# pipeline's KNN imputation step (key format: "<step name>__<parameter>").
params = {"knnimputer__n_neighbors": list(range(1, 5))}

In [65]:
# Exhaustive search over `params`, scoring each candidate with 5-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=params, cv=5)

In [66]:
# Run the grid search on the training split only; the test split stays untouched.
grid.fit(X=X_train, y=y_train)

In [68]:
# Parameter combination that achieved the best mean cross-validation score.
grid.best_params_

{'knnimputer__n_neighbors': 1}