In [33]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer , SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [34]:
# Load only the target column and the three features used in this notebook.
df = pd.read_csv(
    "train.csv",
    usecols=["Age", "Pclass", "Fare", "Survived"],
)

In [35]:
# Peek at the first rows to confirm the selected columns loaded as expected.
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [36]:
# Count missing values per column to see which features need imputing.
df.isna().sum()

Survived      0
Pclass        0
Age         177
Fare          0
dtype: int64

In [37]:
# Separate the label from the predictors.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [38]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Column dtypes and non-null counts of the training split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  712 non-null    int64  
 1   Age     572 non-null    float64
 2   Fare    712 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 22.2 KB


##### Before applying the KNN imputer we need to scale the data: KNN is distance-based, so an unscaled "Fare" would dominate the distance and lead to bad neighbors.

In [40]:
from sklearn.preprocessing import StandardScaler

In [41]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused unchanged on the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
# Imputer that fills a missing value from its 3 nearest rows,
# with neighbors weighted by distance.
knn = KNNImputer(n_neighbors=3, weights="distance")

In [43]:
# Fit the imputer on the scaled training split and fill its gaps in one step.
X_train_trf = knn.fit_transform(X=X_train_scaled)
# The test split is filled using the imputer already fitted on the training data.
X_test_trf = knn.transform(X=X_test_scaled)

In [44]:
# Logistic-regression classifier for the KNN-imputed features, with the iteration cap set to 1000.
lr = LogisticRegression(max_iter=1000)

In [45]:
# Train on the scaled, KNN-imputed training features.
lr.fit(X_train_trf , y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [46]:
# Predict survival for the scaled, KNN-imputed test features.
y_pred = lr.predict(X_test_trf)

In [47]:
# Test-set accuracy of the KNN-imputed baseline. Kept in a named variable (like
# `acc_si` for the mean-imputed model) so the scores can be compared later
# instead of existing only as cell output.
acc_knn = accuracy_score(y_test, y_pred)
acc_knn

0.7430167597765364

In [48]:
## Let's compare it with SimpleImputer

In [49]:
from sklearn.impute import SimpleImputer

# Baseline imputer: replace each missing value with its column's mean.
si = SimpleImputer(strategy="mean")

In [50]:
# Learn the column means on the training split, then fill both splits with them.
si.fit(X_train)
X_train_si = si.transform(X_train)
X_test_si = si.transform(X_test)

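⚠ Notice:
We do NOT scale before SimpleImputer because mean imputation is computed per column and is not distance-based, so feature scale does not affect it. Scaling is still applied afterwards, before fitting the logistic regression.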

In [51]:
# Standardise the mean-imputed features before they reach the classifier.
# NOTE: this rebinds `scaler`, replacing the scaler fitted for the KNN path.
scaler = StandardScaler().fit(X_train_si)

X_train_si_scaled = scaler.transform(X_train_si)
X_test_si_scaled = scaler.transform(X_test_si)

In [52]:
# Fresh classifier trained on the mean-imputed, scaled features
# (rebinds `lr`, so the KNN-path model is no longer reachable by that name).
lr = LogisticRegression(max_iter=1000).fit(X_train_si_scaled, y_train)

y_pred_si = lr.predict(X_test_si_scaled)

In [53]:
# Test-set accuracy of the mean-imputed model; the bare name displays it.
acc_si = accuracy_score(y_true=y_test, y_pred=y_pred_si)
acc_si

0.7374301675977654

## Let's tune the KNN-imputer pipeline with GridSearchCV

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [55]:
# Scale -> impute -> classify, bundled into a single estimator. The scaler comes
# first so the imputer's neighbor distances are computed on standardised features.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("imputer", KNNImputer()),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

In [56]:
# Search space for the pipeline. Each key is "<step name>__<parameter>",
# matching the step names used when the pipeline was built.
param_grid = dict(
    imputer__n_neighbors=[3, 5, 7, 9],
    imputer__weights=["uniform", "distance"],
    model__C=[0.01, 0.1, 1, 10],
)

In [58]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold search over `param_grid`, ranked by accuracy;
# n_jobs=-1 lets the search use all available processors.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

In [59]:
# Run the search on the raw training split; scaling and imputation happen inside the pipeline.
grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step..._iter=1000))])
,param_grid,"{'imputer__n_neighbors': [3, 5, ...], 'imputer__weights': ['uniform', 'distance'], 'model__C': [0.01, 0.1, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [60]:
# Hyper-parameter combination selected by the search.
grid.best_params_

{'imputer__n_neighbors': 5, 'imputer__weights': 'uniform', 'model__C': 0.1}

In [61]:
# Mean cross-validated accuracy of the selected combination (training data only).
grid.best_score_

0.6980399881808333

In [62]:
# Keep a handle on the pipeline configured with the best parameters found by the search.
best_model = grid.best_estimator_

In [63]:
# Score the tuned pipeline on the held-out test set. `X_test` is passed raw
# because the pipeline applies its own scaling and imputation.
# NOTE: this rebinds `y_pred`, which previously held the KNN-baseline predictions.
y_pred = best_model.predict(X_test)
# Stored under a name (mirroring `acc_si`) so the tuned score stays available
# for comparison instead of existing only as cell output.
acc_grid = accuracy_score(y_test, y_pred)
acc_grid

0.7374301675977654