In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [94]:
df = pd.read_csv('titanic.csv', usecols=['Age', 'Pclass', 'Fare', 'Survived'])
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Age       714 non-null    float64
 3   Fare      891 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 28.0 KB


In [96]:
df.isnull().mean() * 100

Survived     0.00000
Pclass       0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [97]:
y = df.Survived
X = df.drop(['Survived'], axis=1)

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
X_train.head(10)

Unnamed: 0,Pclass,Age,Fare
331,1,45.5,28.5
733,2,23.0,13.0
382,3,32.0,7.925
704,3,26.0,7.8542
813,3,6.0,31.275
118,1,24.0,247.5208
536,1,45.0,26.55
361,2,29.0,27.7208
29,3,,7.8958
55,1,,35.5


In [131]:
knn = KNNImputer(n_neighbors=8, weights='uniform')
X_train_transf = pd.DataFrame(knn.fit_transform(X_train), columns=X_train.columns)
X_test_transf = pd.DataFrame(knn.transform(X_test), columns=X_test.columns)

n_neighbors param defines how many neighbors you want to take into consideration while calculating distance, while weights takes in one of two values 'uniform' or 'distance'. Uniform treats all the nearest neighbors same and the value is obtained by by the average(x+y/2) while distance is different to uniform and gives slight advantage to nearest neighbors for calculating distance. 

In [133]:
X_train_transf.head(10)

Unnamed: 0,Pclass,Age,Fare
0,1.0,45.5,28.5
1,2.0,23.0,13.0
2,3.0,32.0,7.925
3,3.0,26.0,7.8542
4,3.0,6.0,31.275
5,1.0,24.0,247.5208
6,1.0,45.0,26.55
7,2.0,29.0,27.7208
8,3.0,30.375,7.8958
9,1.0,33.5,35.5


In [135]:
lr = LogisticRegression()
lr.fit(X_train_transf, y_train)
pred = lr.predict(X_test_transf)
pred

array([0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0], dtype=int64)

In [137]:
accuracy_score(y_test, pred)

0.7430167597765364