<a href="https://colab.research.google.com/github/FelixIG15/klasa_felixivander/blob/master/Module%206%20Task%20Felix%20Ivander.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Collection

In [1]:
import os
import tarfile
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
PATH = os.path.join("datasets", "titanic")
URL = DOWNLOAD_ROOT + "datasets/titanic/train.csv"

def fetch_titanic_data(url=URL, path=PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    csv_path = os.path.join(path, "titanic.csv")
    urllib.request.urlretrieve(url, csv_path)


def load_titanic_data(path=PATH):
    csv_path = os.path.join(path, "titanic.csv")
    return pd.read_csv(csv_path)



See the column explanation: https://www.kaggle.com/c/titanic/data

In [2]:
fetch_titanic_data()
titanic = load_titanic_data()
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Data Preparation

## Separate survival label from the data

In [4]:
df = pd.DataFrame(titanic)
X= df.drop('Survived', axis=1)
label= df['Survived'].copy()

In [5]:
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen 'Carrie'",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Drop the following columns: PassengerId, Name, Ticket, and Cabin

In [6]:
X= X.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

In [7]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [8]:
label.value_counts()

0    549
1    342
Name: Survived, dtype: int64

## Fill the missing age with its median.

In [9]:
X.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [10]:
age_median = X['Age'].median()
X['Age'].fillna(age_median, inplace=True)
X.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64

## One-hot encode the following columns: pclass, sex, and embarked.

In [11]:
from sklearn.preprocessing import OneHotEncoder
onehot_cat = ['Pclass','Sex','Embarked']
X_cat = X[onehot_cat]

In [12]:
X_cat

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S
...,...,...,...
886,2,male,S
887,1,female,S
888,3,female,S
889,1,male,C


In [13]:
cat_encoder = OneHotEncoder(sparse=False)
X_cat_1hot = cat_encoder.fit_transform(X_cat)
X_cat_1hot

array([[0., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.]])

In [14]:
cat_df = pd.DataFrame(X_cat_1hot, columns=cat_encoder.get_feature_names_out())
X= pd.concat([X, cat_df], axis=1).drop(onehot_cat, axis=1)

In [15]:
X

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,22.0,1,0,7.2500,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,38.0,1,0,71.2833,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,26.0,0,0,7.9250,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,35.0,1,0,53.1000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,35.0,0,0,8.0500,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
887,19.0,0,0,30.0000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
888,28.0,1,2,23.4500,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
889,26.0,0,0,30.0000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


## Scale the features with standard scaler

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [17]:
X_transform = pd.DataFrame(scaler.fit_transform(X.values[:,[0,3]]), columns=X.columns[[0,3]], index=X.index)
X.drop(X.columns[[0,3]], axis=1, inplace=True)
scaled_X = pd.concat([X, X_transform], axis=1)

In [18]:
scaled_X

Unnamed: 0,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Age,Fare
0,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.565736,-0.502445
1,1,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.663861,0.786845
2,0,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.258337,-0.488854
3,1,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.433312,0.420730
4,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.433312,-0.486337
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.181487,-0.386671
887,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.796285,-0.044381
888,1,2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.104637,-0.176263
889,0,0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-0.258337,-0.044381


In [19]:
cols = scaled_X.columns.tolist()
cols = cols[-2:] + cols[:-2]
scaled_X = scaled_X[cols]
scaled_X

Unnamed: 0,Age,Fare,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,-0.565736,-0.502445,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.663861,0.786845,1,0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,-0.258337,-0.488854,0,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.433312,0.420730,1,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.433312,-0.486337,0,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.181487,-0.386671,0,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
887,-0.796285,-0.044381,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
888,-0.104637,-0.176263,1,2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
889,-0.258337,-0.044381,0,0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


# Compare different classification algorithms:

## Data Preparation

In [23]:
X = scaled_X.copy().to_numpy()
y = label.to_numpy()

# classification into 2 labels
# label count is already 2, to maintain flexibility, we still do the split using median
y_cat = y.copy()
y_cat[y > np.median(y)] = 1
y_cat[y <= np.median(y)] = 0

print(f'Features shape:{X.shape}\nTarget Shape:{y.shape}')
print(f'Label shape:\n1:{y_cat[y_cat == 1].shape}\n0:{y_cat[y_cat == 0].shape}')

Features shape:(891, 13)
Target Shape:(891,)
Label shape:
1:(342,)
0:(549,)


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, y_cat_train, y_cat_test  = train_test_split(X, y, y_cat, test_size=0.2, random_state=15)
print(X_train.shape, y_train.shape, y_cat_train.shape)
print(X_test.shape, y_test.shape, y_cat_test.shape)

(712, 13) (712,) (712,)
(179, 13) (179,) (179,)


In [63]:
mean_time= []
mean_accuracy= []
mean_f1= []

## Logistic Regression

In [64]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

### 5-fold Cross Validation

In [65]:
from sklearn.model_selection import cross_validate
scores = cross_validate(log_reg, X, y_cat, scoring=["accuracy", "f1"], cv=5)

print(f"Time: {scores['fit_time']}")
print(f"Mean: {scores['fit_time'].mean()}")
print(f"Std: {scores['fit_time'].std()}\n")

print(f"Accuracy: {scores['test_accuracy']}")
print(f"Mean: {scores['test_accuracy'].mean()}")
print(f"Std: {scores['test_accuracy'].std()}\n")

print(f"F1: {scores['test_f1']}")
print(f"Mean: {scores['test_f1'].mean()}")
print(f"Std: {scores['test_f1'].std()}")

mean_time.append(scores['fit_time'].mean())
mean_accuracy.append(scores['test_accuracy'].mean())
mean_f1.append(scores['test_f1'].mean())

Time: [0.01493764 0.01330948 0.0132618  0.01272345 0.0144248 ]
Mean: 0.013731431961059571
Std: 0.0008185762874309063

Accuracy: [0.7877095  0.78651685 0.78651685 0.76966292 0.83146067]
Mean: 0.7923733601154981
Std: 0.02065880487624462

F1: [0.72058824 0.71641791 0.72058824 0.672      0.765625  ]
Mean: 0.7190438762071993
Std: 0.02964656192055655


## K-Nearest Neighbours (KNN)

In [66]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)

### 5-fold Cross Validation

In [67]:
from sklearn.model_selection import cross_validate
scores = cross_validate(knn, X, y_cat, scoring=["accuracy", "f1"], cv=5)

print(f"Time: {scores['fit_time']}")
print(f"Mean: {scores['fit_time'].mean()}")
print(f"Std: {scores['fit_time'].std()}\n")

print(f"Accuracy: {scores['test_accuracy']}")
print(f"Mean: {scores['test_accuracy'].mean()}")
print(f"Std: {scores['test_accuracy'].std()}\n")

print(f"F1: {scores['test_f1']}")
print(f"Mean: {scores['test_f1'].mean()}")
print(f"Std: {scores['test_f1'].std()}")

mean_time.append(scores['fit_time'].mean())
mean_accuracy.append(scores['test_accuracy'].mean())
mean_f1.append(scores['test_f1'].mean())

Time: [0.00202584 0.00123119 0.00202179 0.00131178 0.00125504]
Mean: 0.0015691280364990234
Std: 0.0003721755288412743

Accuracy: [0.74301676 0.76404494 0.80337079 0.78651685 0.81460674]
Mean: 0.7823112171238467
Std: 0.02600732536058306

F1: [0.60344828 0.66666667 0.70588235 0.66071429 0.72268908]
Mean: 0.6718801313628899
Std: 0.0414064609941378


## Decision Tree

In [50]:
from sklearn.tree import DecisionTreeClassifier
tree_class = DecisionTreeClassifier()

### 5-fold Cross Validation

In [70]:
from sklearn.model_selection import cross_validate
scores = cross_validate(tree_class, X, y_cat, scoring=["accuracy", "f1"], cv=5)

print(f"Time: {scores['fit_time']}")
print(f"Mean: {scores['fit_time'].mean()}")
print(f"Std: {scores['fit_time'].std()}\n")

print(f"Accuracy: {scores['test_accuracy']}")
print(f"Mean: {scores['test_accuracy'].mean()}")
print(f"Std: {scores['test_accuracy'].std()}\n")

print(f"F1: {scores['test_f1']}")
print(f"Mean: {scores['test_f1'].mean()}")
print(f"Std: {scores['test_f1'].std()}")

mean_time.append(scores['fit_time'].mean())
mean_accuracy.append(scores['test_accuracy'].mean())
mean_f1.append(scores['test_f1'].mean())

Time: [0.00323987 0.00209808 0.00221777 0.00192666 0.00203013]
Mean: 0.0023025035858154296
Std: 0.0004781352161151006

Accuracy: [0.77094972 0.79213483 0.83146067 0.74719101 0.79775281]
Mean: 0.7878978093026175
Std: 0.02813210865561013

F1: [0.69172932 0.73381295 0.77941176 0.65116279 0.74285714]
Mean: 0.7197947942418516
Std: 0.044228014517833966


## Random Forest

In [52]:
from sklearn.ensemble import RandomForestClassifier 
forest_class = RandomForestClassifier()

### 5-fold Cross Validation

In [71]:
from sklearn.model_selection import cross_validate
scores = cross_validate(forest_class, X, y_cat, scoring=["accuracy", "f1"], cv=5)

print(f"Time: {scores['fit_time']}")
print(f"Mean: {scores['fit_time'].mean()}")
print(f"Std: {scores['fit_time'].std()}\n")

print(f"Accuracy: {scores['test_accuracy']}")
print(f"Mean: {scores['test_accuracy'].mean()}")
print(f"Std: {scores['test_accuracy'].std()}\n")

print(f"F1: {scores['test_f1']}")
print(f"Mean: {scores['test_f1'].mean()}")
print(f"Std: {scores['test_f1'].std()}")

mean_time.append(scores['fit_time'].mean())
mean_accuracy.append(scores['test_accuracy'].mean())
mean_f1.append(scores['test_f1'].mean())

Time: [0.17858434 0.16645408 0.17708635 0.17051506 0.17381096]
Mean: 0.17329015731811523
Std: 0.004407807601349382

Accuracy: [0.7877095  0.80337079 0.85393258 0.7752809  0.80898876]
Mean: 0.8058565061829137
Std: 0.026801047825486934

F1: [0.72463768 0.73282443 0.80597015 0.67741935 0.76388889]
Mean: 0.7409481003243333
Std: 0.04271316416216401


## XGBoost

In [54]:
import xgboost as xgb
xgb_class = xgb.XGBClassifier(  )

### 5-fold Cross Validation

In [72]:
from sklearn.model_selection import cross_validate
scores = cross_validate(xgb_class, X, y_cat, scoring=["accuracy", "f1"], cv=5)

print(f"Time: {scores['fit_time']}")
print(f"Mean: {scores['fit_time'].mean()}")
print(f"Std: {scores['fit_time'].std()}\n")

print(f"Accuracy: {scores['test_accuracy']}")
print(f"Mean: {scores['test_accuracy'].mean()}")
print(f"Std: {scores['test_accuracy'].std()}\n")

print(f"F1: {scores['test_f1']}")
print(f"Mean: {scores['test_f1'].mean()}")
print(f"Std: {scores['test_f1'].std()}")

mean_time.append(scores['fit_time'].mean())
mean_accuracy.append(scores['test_accuracy'].mean())
mean_f1.append(scores['test_f1'].mean())

Time: [0.04971695 0.04633594 0.04627585 0.04676437 0.04673743]
Mean: 0.04716610908508301
Std: 0.0012910215697465221

Accuracy: [0.80446927 0.81460674 0.82022472 0.80898876 0.84831461]
Mean: 0.8193208210407381
Std: 0.015433723109464765

F1: [0.72868217 0.736      0.75       0.72131148 0.79699248]
Mean: 0.7465972254310959
Std: 0.026921664925166938


## SVM

In [56]:
from sklearn.svm import SVC
svc = SVC()

### 5-fold Cross Validation

In [73]:
from sklearn.model_selection import cross_validate
scores = cross_validate(svc, X, y_cat, scoring=["accuracy", "f1"], cv=5)

print(f"Time: {scores['fit_time']}")
print(f"Mean: {scores['fit_time'].mean()}")
print(f"Std: {scores['fit_time'].std()}\n")

print(f"Accuracy: {scores['test_accuracy']}")
print(f"Mean: {scores['test_accuracy'].mean()}")
print(f"Std: {scores['test_accuracy'].std()}\n")

print(f"F1: {scores['test_f1']}")
print(f"Mean: {scores['test_f1'].mean()}")
print(f"Std: {scores['test_f1'].std()}")

mean_time.append(scores['fit_time'].mean())
mean_accuracy.append(scores['test_accuracy'].mean())
mean_f1.append(scores['test_f1'].mean())

Time: [0.02119327 0.01850462 0.01978874 0.01979256 0.02039218]
Mean: 0.01993427276611328
Std: 0.000880898757653237

Accuracy: [0.83240223 0.81460674 0.81460674 0.81460674 0.87640449]
Mean: 0.830525390747599
Std: 0.023952556723287544

F1: [0.77272727 0.74418605 0.7480916  0.71794872 0.83333333]
Mean: 0.7632573947148774
Std: 0.03910731264644871


## Neural Networks

In [60]:
from sklearn.neural_network import MLPClassifier
mlpclass = MLPClassifier(hidden_layer_sizes=[32, 32], max_iter=1000)

### 5-fold Cross Validation

In [74]:
from sklearn.model_selection import cross_validate
scores = cross_validate(mlpclass, X, y_cat, scoring=["accuracy", "f1"], cv=5)

print(f"Time: {scores['fit_time']}")
print(f"Mean: {scores['fit_time'].mean()}")
print(f"Std: {scores['fit_time'].std()}\n")

print(f"Accuracy: {scores['test_accuracy']}")
print(f"Mean: {scores['test_accuracy'].mean()}")
print(f"Std: {scores['test_accuracy'].std()}\n")

print(f"F1: {scores['test_f1']}")
print(f"Mean: {scores['test_f1'].mean()}")
print(f"Std: {scores['test_f1'].std()}")

mean_time.append(scores['fit_time'].mean())
mean_accuracy.append(scores['test_accuracy'].mean())
mean_f1.append(scores['test_f1'].mean())

Time: [2.43196869 2.65038562 1.83281994 2.41256547 2.43396902]
Mean: 2.3523417472839356
Std: 0.2739939477468927

Accuracy: [0.75977654 0.78651685 0.83146067 0.7752809  0.83707865]
Mean: 0.7980227229929069
Std: 0.030840909606074187

F1: [0.66666667 0.70769231 0.76190476 0.66666667 0.78518519]
Mean: 0.7176231176231175
Std: 0.04861530163967652


In [75]:
models = ['Logistic Regression','KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'SVM', 'Neural Networks']
result_df = pd.DataFrame(list(zip(models,mean_time, mean_accuracy, mean_f1)), columns =['Classifier','Mean Time','Mean Accuracy','Mean F1'])

result_df

Unnamed: 0,Classifier,Mean Time,Mean Accuracy,Mean F1
0,Logistic Regression,0.013731,0.792373,0.719044
1,KNN,0.001569,0.782311,0.67188
2,Decision Tree,0.002303,0.787898,0.719795
3,Random Forest,0.17329,0.805857,0.740948
4,XGBoost,0.047166,0.819321,0.746597
5,SVM,0.019934,0.830525,0.763257
6,Neural Networks,2.352342,0.798023,0.717623
