In [39]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [18]:
# Load only the target column and the three feature columns used in this notebook.
df = pd.read_csv('train.csv', usecols=['Pclass', 'Age', 'Fare', 'Survived'])
# Preview one random row; the fixed seed keeps the preview stable across
# Restart & Run All (same seed as the train/test split below).
df.sample(random_state=0)

Unnamed: 0,Survived,Pclass,Age,Fare
318,1,1,31.0,164.8667


In [19]:
# Percentage of missing values per column.
df.isna().mean().mul(100)

Survived     0.00000
Pclass       0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [20]:
# Target is the Survived column; every other loaded column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

In [21]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [22]:
# Dimensions of the training feature matrix as (rows, columns).
X_train.shape

(712, 3)

In [25]:
def knnImputerAccuracy(neigh,wt):
    """Print the test accuracy of logistic regression after KNN imputation.

    Fits a ``KNNImputer`` on the notebook-level ``X_train``, uses it to fill
    missing values in both ``X_train`` and ``X_test``, trains a default
    ``LogisticRegression`` on the imputed training data and prints its
    accuracy against ``y_test``.

    Parameters
    ----------
    neigh : int
        Forwarded to ``KNNImputer`` as ``n_neighbors``.
    wt : str
        Forwarded to ``KNNImputer`` as ``weights``.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    imputer = KNNImputer(n_neighbors=neigh, weights=wt)

    # Learn the imputation from the training split only, then reuse the
    # fitted imputer on the test split.
    train_imputed = imputer.fit_transform(X_train)
    test_imputed = imputer.transform(X_test)

    model = LogisticRegression().fit(train_imputed, y_train)

    print(accuracy_score(y_test, model.predict(test_imputed)))

In [85]:
# KNN imputation with 5 neighbours and distance weighting.
knnImputerAccuracy(5,'distance')

0.7318435754189944


In [69]:
# KNN imputation with a single neighbour and uniform weighting.
knnImputerAccuracy(1,'uniform')

0.7374301675977654


In [75]:
def Accuracy():
    """Print the test accuracy of logistic regression after simple imputation.

    Fits a default-configured ``SimpleImputer`` on the notebook-level
    ``X_train``, applies it to ``X_train`` and ``X_test``, trains a default
    ``LogisticRegression`` on the imputed training data and prints its
    accuracy against ``y_test``. Takes no arguments and returns ``None``.
    """
    imputer = SimpleImputer()

    # Fit on the training split only; the test split is transformed with the
    # statistics learned from training data.
    train_imputed = imputer.fit_transform(X_train)
    test_imputed = imputer.transform(X_test)

    model = LogisticRegression().fit(train_imputed, y_train)

    print(accuracy_score(y_test, model.predict(test_imputed)))

In [76]:
# Baseline for comparison: default SimpleImputer followed by logistic regression.
Accuracy()

0.7318435754189944


# Grid search with cross-validation to find the best KNNImputer hyperparameters

In [58]:
# Two-step pipeline: fill missing values with KNN, then classify.
# The step names are the prefixes used in the parameter grid below.
clf = Pipeline(steps=[
    ('knn', KNNImputer()),
    ('classifier', LogisticRegression()),
])

# Try every neighbour count from 1 to 100 with both weighting schemes.
param_grid = dict(
    knn__n_neighbors=np.arange(1, 101),
    knn__weights=['distance', 'uniform'],
)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid)

In [59]:
# Run the cross-validated search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
print("Best params : ")
print(grid_search.best_params_)

Best params : 
{'knn__n_neighbors': np.int64(3), 'knn__weights': 'distance'}


In [60]:
# Cross-validation score of the best parameter combination, rounded to 3 decimals.
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.698


In [61]:
# Tabulate every tried combination, best mean CV score first.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("mean_test_score", ascending=False)
)
cv_results[['param_knn__n_neighbors','param_knn__weights','mean_test_score']]

Unnamed: 0,param_knn__n_neighbors,param_knn__weights,mean_test_score
4,3,distance,0.698099
21,11,uniform,0.696691
71,36,uniform,0.696681
51,26,uniform,0.696681
65,33,uniform,0.696681
...,...,...,...
16,9,distance,0.691067
17,9,uniform,0.691067
14,8,distance,0.689668
13,7,uniform,0.689658
