# Pre-processing

In [22]:
import pandas as pd
import numpy as np

In [23]:
df_train = pd.read_csv('Data/train.csv')
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [24]:
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']

In [25]:
# Training dataset
df_train = df_train.drop(columns_to_drop, axis=1)

df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())
df_train['Fare'] = df_train['Fare'].fillna(df_train['Fare'].mean())
df_train = pd.get_dummies(df_train, columns=['Sex'])

df_train

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,0,3,22.000000,1,0,7.2500,False,True
1,1,1,38.000000,1,0,71.2833,True,False
2,1,3,26.000000,0,0,7.9250,True,False
3,1,1,35.000000,1,0,53.1000,True,False
4,0,3,35.000000,0,0,8.0500,False,True
...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,False,True
887,1,1,19.000000,0,0,30.0000,True,False
888,0,3,29.699118,1,2,23.4500,True,False
889,1,1,26.000000,0,0,30.0000,False,True


In [26]:
# Testing dataset
df_test = pd.read_csv('Data/test.csv')

new_col = pd.read_csv('Data/gender_submission.csv')['Survived']
df_test.insert(0, 'Survived', new_col)

df_test = df_test.drop(columns_to_drop, axis=1)

df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
df_test = pd.get_dummies(df_test, columns=['Sex'])

df_test

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male
0,0,3,34.50000,0,0,7.8292,False,True
1,1,3,47.00000,1,0,7.0000,True,False
2,0,2,62.00000,0,0,9.6875,False,True
3,0,3,27.00000,0,0,8.6625,False,True
4,1,3,22.00000,1,1,12.2875,True,False
...,...,...,...,...,...,...,...,...
413,0,3,30.27259,0,0,8.0500,False,True
414,1,1,39.00000,0,0,108.9000,True,False
415,0,3,38.50000,0,0,7.2500,False,True
416,0,3,30.27259,0,0,8.0500,False,True


In [27]:
X_train = df_train.drop(['Survived'], axis=1)
y_train = df_train['Survived']

X_test = df_test.drop(['Survived'], axis=1)
y_test = df_test['Survived']

In [28]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [29]:
# Dimensionality reduction
from sklearn.decomposition import PCA

pca = PCA(n_components=4, random_state=1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

X_train

array([[-1.18429605, -1.00930622,  0.26963839,  0.25752958],
       [ 2.0190514 ,  1.52234108, -0.43966594,  0.05576715],
       [ 0.99910855, -0.65898368, -1.87538493,  0.10498592],
       ...,
       [ 2.15185232, -1.30522685, -0.24080524, -1.15114014],
       [-0.81139794,  1.11197361,  0.45143025,  0.84414101],
       [-1.47231082, -0.29363187, -0.25857945, -0.14018678]])

# Modeling

In [30]:
# Model tuning
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=1)

params = {
    'n_neighbors': range(1, 15),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

model = KNN()
grid = GridSearchCV(model, params, cv=kf)
grid.fit(X_train, y_train)

model = grid.best_estimator_

print(grid.best_params_)

{'metric': 'manhattan', 'n_neighbors': 4, 'weights': 'uniform'}


In [31]:
y_pred = model.predict(X_test)

# Model Evaluation

In [32]:
# Cross-validation
from sklearn.model_selection import cross_val_score

X_concat = np.concatenate((X_train, X_test))
y_concat = np.concatenate((y_train, y_test))

accuracy_by_cv = cross_val_score(model, X_concat, y_concat, cv=kf).mean()*100

In [33]:
# Accuracy by test data
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

negative_in_all = accuracy_score(y_test, [0]*len(y_test))*100
accuracy_by_test_data = accuracy_score(y_test, y_pred)*100

In [34]:
print(f'Accuracy by cross-validation (reliable): {accuracy_by_cv:.2f}%')
print(f'Accuracy only on provided test data (unreliable): {accuracy_by_test_data:.2f}%')
print(f'Negative in all cases: {negative_in_all:.2f}%\n')

print(classification_report(y_test, y_pred))

df_cm = pd.DataFrame(confusion_matrix(y_test, y_pred), index=['Negative', 'Positive'], columns=['Negative', 'Positive'])
df_cm

Accuracy by cross-validation (reliable): 84.80%
Accuracy only on provided test data (unreliable): 81.10%
Negative in all cases: 63.64%

              precision    recall  f1-score   support

           0       0.81      0.91      0.86       266
           1       0.81      0.63      0.71       152

    accuracy                           0.81       418
   macro avg       0.81      0.77      0.78       418
weighted avg       0.81      0.81      0.81       418



Unnamed: 0,Negative,Positive
Negative,243,23
Positive,56,96
