In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
# Load seaborn's bundled Titanic passenger table, then turn the numeric
# target into readable class labels: 0 -> 'did not survive', anything else -> 'survived'.
titanic = sns.load_dataset('titanic')
titanic['survived'] = np.where(titanic['survived'] == 0, 'did not survive', 'survived')

In [2]:
# Dimensions of the raw dataset as a (rows, columns) tuple.
titanic.shape

(891, 15)

In [3]:
# Preview the first rows to see which columns exist and how values are encoded.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,did not survive,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,survived,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,survived,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,survived,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,did not survive,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


We are going to learn machine learning concepts with the Titanic dataset, which describes the passengers of one of the most infamous shipwrecks in history.

As a fun fact, this is one of the most famous datasets in machine learning, alongside the MNIST and Iris datasets!

Data preprocessing
Machine learning is nothing without fine data preprocessing and feature engineering.
Modify the titanic dataset by:

Selecting only the useful features (i.e. columns): survived, pclass, sex, age, embark_town
Removing rows with NaN data
Use factorize to recode features sex (gender) and embark_town (the harbour city) into numerical data, because ML needs (and loves) numerical data.

In [4]:
# Keep the target plus the four features the model will use; all other columns are dropped.
selected_columns = ['survived', 'pclass', 'sex', 'age', 'embark_town']
titanic = titanic.loc[:, selected_columns]
titanic.head()

Unnamed: 0,survived,pclass,sex,age,embark_town
0,did not survive,3,male,22.0,Southampton
1,survived,1,female,38.0,Cherbourg
2,survived,3,female,26.0,Southampton
3,survived,1,female,35.0,Southampton
4,did not survive,3,male,35.0,Southampton


In [5]:
# Discard every row that has at least one missing value.
titanic = titanic.dropna()

In [6]:
# Recode the two text features as integer codes.
# factorize numbers categories in order of first appearance.
for column in ("sex", "embark_town"):
    titanic[column] = pd.factorize(titanic[column])[0]

In [7]:
# Display the frame after NaN removal and the factorize recoding of sex / embark_town.
titanic

Unnamed: 0,survived,pclass,sex,age,embark_town
0,did not survive,3,0,22.0,0
1,survived,1,1,38.0,1
2,survived,3,1,26.0,0
3,survived,1,1,35.0,0
4,did not survive,3,0,35.0,0
...,...,...,...,...,...
885,did not survive,3,1,39.0,2
886,did not survive,2,0,27.0,0
887,survived,1,1,19.0,0
889,survived,1,0,26.0,1


KNN classification with Scikit-Learn

Train Test Split Data
First you have to split the titanic dataframe into two separate objects:

y with the feature to be predicted (i.e. survived)
X with the other features that will be used for the model (all numeric features + sex recoded with factorize + embark_town recoded with factorize)

In [8]:
# Features the classifier may look at, and the label it has to predict.
feature_columns = ['pclass', 'sex', 'age', 'embark_town']
X = titanic.loc[:, feature_columns]
y = titanic.loc[:, "survived"]

And then from X and y, you need to separate them for training and testing your model :

Use 75% of data for training, the rest for testing
Please split data with random_state = 55

In [9]:
from sklearn.model_selection import train_test_split

# 75% train / 25% test; test_size=0.25 is scikit-learn's default, written out
# here to match the instructions. random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=55)

for caption, subset in (
    ("The length of the initial dataset is :", X),
    ("The length of the train dataset is   :", X_train),
    ("The length of the test dataset is    :", X_test),
):
    print(caption, len(subset))

The length of the initial dataset is : 712
The length of the train dataset is   : 534
The length of the test dataset is    : 178


Model initialization
CONGRATS !!! You are going to develop your first ML model for KNN classification.
For that, please create a model object that initialises your model with the KNN classifier

In [10]:
# Check dtypes and non-null counts of the training features before fitting.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 534 entries, 423 to 583
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pclass       534 non-null    int64  
 1   sex          534 non-null    int64  
 2   age          534 non-null    float64
 3   embark_town  534 non-null    int64  
dtypes: float64(1), int64(3)
memory usage: 20.9 KB


In [11]:
# KNeighborsClassifier is already imported in the first cell of the notebook,
# so it is not imported again here.
# Create an (unfitted) KNN classifier with scikit-learn's default hyperparameters.
model = KNeighborsClassifier()

Model fitting
Now you have to fit your model on the training data.

In [14]:
# Train on bare NumPy arrays (the prediction cell below also passes a bare array).
train_features = X_train.to_numpy()
train_labels = y_train.to_numpy()
model.fit(train_features, train_labels)
print(model)

KNeighborsClassifier()


Make predictions
Your model is ready for prediction!

Make prediction for yourself !
Change the data below and evaluate your chance of survival ...

In [15]:
# Passenger profile to score, in the same column order as X:
# pclass, sex code, age, embark_town code (codes as produced by factorize above).
my_class = 3
my_sex = 0
my_age = 28
my_town = 0

# A nested list yields the (1, 4) single-sample matrix directly.
my_data = np.array([[my_class, my_sex, my_age, my_town]])
print(model.predict(my_data))

['did not survive']


In [16]:
# Show the predicted probability of each class for the single sample in my_data.
class_probabilities = model.predict_proba(my_data)[0]
for label, probability in zip(model.classes_, class_probabilities):
    print("Prediction probability for:", label, "is", probability)

Prediction probability for: did not survive is 1.0
Prediction probability for: survived is 0.0


Model evaluation
Last but not least, you should evaluate the accuracy of your model.
You can compute the accuracy score directly with the well-known .score() method. Remember to compare the train score and the test score to check for overfitting. Keep in mind that other metrics are available to evaluate classification models, such as precision, recall and F1 score, all of which can be derived from the confusion matrix.

In [19]:
# Accuracy on the training set, then on the held-out test set.
# A train score far above the test score would point to overfitting.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features.values, labels.values))

0.8352059925093633
0.7921348314606742


In [21]:
# You can execute this code to get the confusion matrix

from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test.values)

# Pass labels explicitly: the matrix rows/columns are then guaranteed to follow
# model.classes_, the same order (and count) used for the captions below.
# Without it, the order and size are inferred from the labels present in
# y_true / y_pred, and a class missing from the test fold would make the
# matrix shape disagree with the captions.
pd.DataFrame(data = confusion_matrix(y_true = y_test, y_pred = y_pred, labels = model.classes_),
             index = model.classes_ + " ACTUAL",
             columns = model.classes_ + " PREDICTED")

Unnamed: 0,did not survive PREDICTED,survived PREDICTED
did not survive ACTUAL,98,18
survived ACTUAL,19,43


Hyperparameters - improve your model
Let's play with the n_neighbors and weights hyperparameters of the model.

Evaluate the accuracy score (only on the test set) of your models by adjusting the hyperparameter n_neighbors from 2 to 10, and changing weights between uniform and distance.
Which values of n_neighbors and weights lead to the best score?
NB: you could use a loop to test the different hyperparameter values

Tip: you should find as optimal value n_neighbors = 2 and weights = 'distance', with a test score value of 0.809.

In [22]:
# Try every (n_neighbors, weights) combination and report train / test accuracy.
# `model` is rebound on each iteration, so after the loop it holds the last combination tried.
for n_neighbors in range(2, 11):
    for weights in ("uniform", "distance"):
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
        model.fit(X_train, y_train)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        print("For ", n_neighbors, "neighbors and weight=", weights,
              ": train score", train_score,
              "and test score:", test_score)

For  2 neighbors and weight= uniform : train score 0.8314606741573034 and test score: 0.7528089887640449
For  2 neighbors and weight= distance : train score 0.8895131086142322 and test score: 0.797752808988764
For  3 neighbors and weight= uniform : train score 0.8726591760299626 and test score: 0.797752808988764
For  3 neighbors and weight= distance : train score 0.9101123595505618 and test score: 0.797752808988764
For  4 neighbors and weight= uniform : train score 0.8389513108614233 and test score: 0.7752808988764045
For  4 neighbors and weight= distance : train score 0.9119850187265918 and test score: 0.7921348314606742
For  5 neighbors and weight= uniform : train score 0.8352059925093633 and test score: 0.7921348314606742
For  5 neighbors and weight= distance : train score 0.9101123595505618 and test score: 0.7921348314606742
For  6 neighbors and weight= uniform : train score 0.8164794007490637 and test score: 0.7640449438202247
For  6 neighbors and weight= distance : train score 0.

In [23]:
# Refit with the combination suggested by the exercise and report train, then test accuracy.
model = KNeighborsClassifier(n_neighbors=2, weights='distance')
model.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(model.score(features, labels))

0.8895131086142322
0.797752808988764


Let's go back to data processing to improve our model

In [24]:
# Start again from a fresh copy of the raw dataset for the second preprocessing variant.
titanic2 = sns.load_dataset('titanic')
# 0 -> 'did not survive', anything else -> 'survived'
titanic2['survived'] = titanic2['survived'].eq(0).map({True: 'did not survive', False: 'survived'})
titanic2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,did not survive,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,survived,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,survived,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,survived,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,did not survive,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


Please do the same data processing as before, but this time encode embark_town (the harbour city) with get_dummies instead of factorize. Then initialize, fit and score your model. Is it better?

In [25]:
# Same preparation as before: keep the target and four features, drop incomplete rows,
# and recode sex as integer codes. embark_town stays as text for get_dummies below.
kept_columns = ['survived', 'pclass', 'sex', 'age', 'embark_town']
titanic2 = titanic2[kept_columns].dropna()
titanic2["sex"] = pd.factorize(titanic2["sex"])[0]

In [26]:
# Display the cleaned frame; embark_town is still a text column at this point.
titanic2

Unnamed: 0,survived,pclass,sex,age,embark_town
0,did not survive,3,0,22.0,Southampton
1,survived,1,1,38.0,Cherbourg
2,survived,3,1,26.0,Southampton
3,survived,1,1,35.0,Southampton
4,did not survive,3,0,35.0,Southampton
...,...,...,...,...,...
885,did not survive,3,1,39.0,Queenstown
886,did not survive,2,0,27.0,Southampton
887,survived,1,1,19.0,Southampton
889,survived,1,0,26.0,Cherbourg


In [27]:
# Preview the one-hot encoding of embark_town: one indicator column per town.
pd.get_dummies(titanic2["embark_town"])

Unnamed: 0,Cherbourg,Queenstown,Southampton
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
885,0,1,0
886,0,0,1
887,0,0,1
889,1,0,0


In [28]:
# Append the one-hot town columns next to the existing ones (rows are matched on the index).
town_dummies = pd.get_dummies(titanic2["embark_town"])
titanic2 = pd.concat([titanic2, town_dummies], axis="columns")
titanic2

Unnamed: 0,survived,pclass,sex,age,embark_town,Cherbourg,Queenstown,Southampton
0,did not survive,3,0,22.0,Southampton,0,0,1
1,survived,1,1,38.0,Cherbourg,1,0,0
2,survived,3,1,26.0,Southampton,0,0,1
3,survived,1,1,35.0,Southampton,0,0,1
4,did not survive,3,0,35.0,Southampton,0,0,1
...,...,...,...,...,...,...,...,...
885,did not survive,3,1,39.0,Queenstown,0,1,0
886,did not survive,2,0,27.0,Southampton,0,0,1
887,survived,1,1,19.0,Southampton,0,0,1
889,survived,1,0,26.0,Cherbourg,1,0,0


In [29]:
# The text column is now redundant with its one-hot columns, so remove it.
titanic2 = titanic2.drop(columns=["embark_town"])

In [30]:
# Same target as before; the features now use the three one-hot town columns.
feature_columns2 = ['pclass', 'sex', 'age', 'Cherbourg', 'Queenstown', 'Southampton']
X2 = titanic2[feature_columns2]
y2 = titanic2["survived"]

# Same seed as the first split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=55)

for caption, subset in (
    ("The length of the initial dataset is :", X2),
    ("The length of the train dataset is   :", X_train2),
    ("The length of the test dataset is    :", X_test2),
):
    print(caption, len(subset))

The length of the initial dataset is : 712
The length of the train dataset is   : 534
The length of the test dataset is    : 178


In [31]:
# Same hyperparameters as the previous model, trained on the one-hot encoded features.
model2 = KNeighborsClassifier(n_neighbors=2, weights='distance')
model2.fit(X_train2, y_train2)
for features, labels in ((X_train2, y_train2), (X_test2, y_test2)):
    print(model2.score(features, labels))

0.8895131086142322
0.8089887640449438
