In [142]:
import seaborn as sns
import sklearn as sk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix

In [143]:
#Read the raw passenger table from the CSV file and preview its first rows
titanic_path = 'titanic.csv'
df = pd.read_csv(titanic_path)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Cabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,C85
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,C123
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,


In [144]:
#Keep only the columns used further down and drop every row that has a missing
#value in any of them. dropna() returns a new frame, so df itself is left untouched.
df_subset = df[["PassengerId", "Age", "Survived", "Pclass", "SibSp", "Parch", "Sex"]].dropna()
df_subset.head()

Unnamed: 0,PassengerId,Age,Survived,Pclass,SibSp,Parch,Sex
0,1,22.0,0,3,1,0,male
1,2,38.0,1,1,1,0,female
2,3,26.0,1,3,0,0,female
3,4,35.0,1,1,1,0,female
4,5,35.0,0,3,0,0,male


In [145]:
#One-hot encode "Sex": get_dummies builds one indicator column per category ("female", "male")
sex_column = df_subset["Sex"]
df_dummy = pd.get_dummies(sex_column)
df_dummy.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [146]:
#DataFrame.update() only overwrites columns that exist in BOTH frames. df_subset and
#df_dummy share no column labels, so calling it here would not change anything.
#Instead, verify the row alignment that the index-based merge in the next cell relies on.
assert df_subset.index.equals(df_dummy.index), "df_dummy rows are not aligned with df_subset"

In [147]:
#Attach the "female"/"male" indicator columns by matching on the row index.
#Indicator columns left over from an earlier run of this cell are dropped first:
#without that, a re-run makes merge() emit "female_x"/"female_y" and the column
#selection below fails with a KeyError.
df_subset = pd.merge(
    df_subset.drop(columns=df_dummy.columns, errors="ignore"),
    df_dummy,
    left_index=True,
    right_index=True,
)
#Keep the modelling columns only; the text column "Sex" is replaced by its indicators.
df_subset = df_subset[["PassengerId", "Age", "Survived", "Pclass", "SibSp", "Parch", "female", "male"]]
df_subset.head()

Unnamed: 0,PassengerId,Age,Survived,Pclass,SibSp,Parch,female,male
0,1,22.0,0,3,1,0,0,1
1,2,38.0,1,1,1,0,1,0
2,3,26.0,1,3,0,0,1,0
3,4,35.0,1,1,1,0,1,0
4,5,35.0,0,3,0,0,0,1


In [148]:
feature_cols = ["Age", "Pclass", "SibSp", "Parch", "female", "male"] #columns that form the X matrix
#normalize() rescales each ROW (one passenger) to unit L2 norm; it does not put the feature columns on a common scale.
#NOTE(review): if per-feature scaling was the intent, a scaler fitted on the training split would be needed - confirm.
X = normalize(df_subset[feature_cols])
y = df_subset['Survived'] #target: 1 = survived, 0 = did not survive
#hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [149]:
N_NEIGHBORS = 10 #number of neighbours that vote on each prediction (scikit-learn's default would be 5)
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS) #create the KNN-classifier
knn.fit(X_train, y_train) #fit() trains the estimator in place and returns it, so no re-assignment is needed
knn.score(X_test, y_test) #mean accuracy on the held-out test data

0.813953488372093

### Calculate accuracy, precision, and recall for survival

In [150]:
#Predict the survival label of every passenger in the test split
y_test_pred = knn.predict(X_test)
#Cross-tabulate the true labels (rows) against the predicted labels (columns)
cm = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
cm

array([[119,  15],
       [ 25,  56]])

In [151]:
#Wrap the raw counts in a labelled DataFrame so the matrix is easy to read:
#rows are the actual outcome, columns are the predicted outcome.
actual_labels = ['did_not_survive', 'survived']
predicted_labels = ['did_not_survive_predicted', 'Survived_predicted']
conf_matrix = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)
conf_matrix

Unnamed: 0,did_not_survive_predicted,Survived_predicted
did_not_survive,119,15
survived,25,56


In [152]:
#Accuracy: share of all test-set predictions that are correct (the diagonal of the confusion matrix)
tn, fp, fn, tp = cm.ravel() #scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
accuracy = (tn + tp) / (tn + fp + fn + tp) #must agree with knn.score(X_test, y_test)
accuracy

0.813953488372093

In [153]:
#precision for survival: of the passengers PREDICTED to survive, the share that actually survived
tn, fp, fn, tp = cm.ravel() #scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
precision = tp / (tp + fp) #TN/(TN+FP) would be specificity, not precision
precision

0.7887323943661971

In [154]:
#recall for survival: of the passengers who ACTUALLY survived, the share the model found
tn, fp, fn, tp = cm.ravel() #scikit-learn lays a binary confusion matrix out as [[TN, FP], [FN, TP]]
recall = tp / (tp + fn) #TN/(TN+FN) would be the negative predictive value, not recall
recall

0.691358024691358