Goal of this case study is to predict survivors based on Titanic dataset and compare different alghoritms with different hiperparameters. At the end we can choose the best one for this task.

In [25]:
import pandas as pd
import numpy as np
import seaborn as sns

In [26]:
data = pd.read_csv('D:/Dane/walickida/Desktop/Realizowane prace/PYTHON/Zjazd 9/Titanic_case_study-main/titanic.csv', 
                   decimal=',')
data.head() 

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
0,3,0,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,
1,3,0,"Abbott, Master. Eugene Joseph",male,13.0,0,2,C.A. 2673,20.25,,S,"East Providence, RI"
2,3,0,"Abbott, Mr. Rossmore Edward",male,16.0,1,1,C.A. 2673,20.25,,S,"East Providence, RI"
3,3,1,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35.0,1,1,C.A. 2673,20.25,,S,"East Providence, RI"
4,3,1,"Abelseth, Miss. Karen Marie",female,16.0,0,0,348125,7.65,,S,"Norway Los Angeles, CA"


1. Data preparation

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1046 entries, 0 to 1045
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1046 non-null   int64  
 1   survived   1046 non-null   int64  
 2   name       1046 non-null   object 
 3   sex        1046 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1046 non-null   int64  
 6   parch      1046 non-null   int64  
 7   ticket     1046 non-null   object 
 8   fare       1045 non-null   float64
 9   cabin      272 non-null    object 
 10  embarked   1044 non-null   object 
 11  home.dest  685 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 98.2+ KB


In [28]:
y = data['survived'] #it's our variable that we'd like to predict

In [102]:
data['ticket'].unique() #it's something like ID of ticket, we can't reasonably 
#use this information

KeyError: 'ticket'

In [30]:
data.drop(columns='ticket', axis=1, inplace=True)

In [31]:
data.drop(columns='fare', axis=1, inplace=True)
#First thing we can see is that fare - ticket price is object Dtype and should be float. 
#But I decided not to use this variable at all, beacouse: 1) we can't be sure about currency and
#correctness (probably it's the summarized price for all people from the ticket) 
#AND this variable is higly correlated with cabin and pclass. 

In [32]:
#There is big amount of Nan's BUT it could be very useful information so we'll try to do sth
#We'll only use the letter sign - we assume that it's a deck designation. 
data.cabin.str[0].value_counts()

C    86
B    63
D    42
E    38
A    19
F    18
G     5
T     1
Name: cabin, dtype: int64

In [33]:
data["deck"] = data.cabin.str[0].fillna("Unknown").replace({"G":"F", "T":"F"})
#There is very little
#samples in classes T and G so we can add them to class F. Nan's we turn into class Unknown.

In [34]:
data.drop(['cabin'], axis=1, inplace=True) #now we can delete column cabin

In [35]:
data['embarked'].fillna(data['embarked'].value_counts().index[0], inplace=True)
#Column 'embarked' it's the port from which passengers departed. It seems as usefull information
#but it's probably highly correlated with cabin (now deck) and it's probably more informative but 
#on the other hand it can be more informative for us than ID's of cabins that we've alreade 
#established it's hard to locate on ship
#We'll use this column and fill Nan's with most common value which is 'S'. There is only 2 Nan's
#so it shouldn't be a mistake.

In [36]:
#home.dest (home destination) is the column that 
#we will not use as it does not carry any information related to the disaster

In [37]:
X = pd.get_dummies(data [['pclass', 'sex', 'age', 'sibsp', 'parch', 
                          'embarked', 'deck']], drop_first=True)
X.head() #'sex','embarked' and 'deck' are categorical columns so they should be encoded.

Unnamed: 0,pclass,age,sibsp,parch,sex_male,embarked_Q,embarked_S,deck_B,deck_C,deck_D,deck_E,deck_F,deck_Unknown
0,3,42.0,0,0,1,0,1,0,0,0,0,0,1
1,3,13.0,0,2,1,0,1,0,0,0,0,0,1
2,3,16.0,1,1,1,0,1,0,0,0,0,0,1
3,3,35.0,1,1,0,0,1,0,0,0,0,0,1
4,3,16.0,0,0,0,0,1,0,0,0,0,0,1


2. Modeling

2.1 Decision Tree Classifier

In [38]:
from sklearn.model_selection import train_test_split

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
#now we split data into training and testing. We can assume basing on dataset size that 80:20 
#should be correct ratio

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold

In [89]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)
accuracy_score(y_test, y_pred_dt)
#~70% accuracy when we fit Decision Tree Classifier without parameter optimization

0.7142857142857143

In [93]:
kfold = StratifiedKFold(10, shuffle=True, random_state=123)

model_dt = DecisionTreeClassifier()
params = {"min_samples_leaf": np.arange(1, 21),
          'max_depth': np.arange(1, 16),
         'criterion': ['gini', 'entropy']}
optimizer_dt = GridSearchCV(model_dt, 
                         params, 
                         scoring="accuracy", 
                         cv=kfold, 
                         n_jobs=-1)
optimizer_dt.fit(X_train, y_train)

y_pred_dt = optimizer_dt.best_estimator_.predict(X_test)
print(optimizer_dt.best_params_)
print(accuracy_score(y_test, y_pred_dt))

{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5}
0.7714285714285715


In [99]:
model_dt = DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_leaf=5)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)
accuracy_score(y_test, y_pred_dt)
#Decision Tree Classifier with optimal parameters. Accuracy increases to ~79%

0.7904761904761904

2.2 Logistic regression

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [100]:
model_lr = LogisticRegression(max_iter=10e5)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
accuracy_score(y_test, y_pred_lr)
#~78% accuracy when we use Linear Regression model without optimization

0.7857142857142857

In [47]:
#Optimization
kfold = StratifiedKFold(10, shuffle=True, random_state=123)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='saga'))
])
params = [
    {
        'model__penalty': ['none']
    },
    {
        'model__penalty': ['l1', 'l2'],
        'model__C': [0.01, 0.1, 1, 10, 100, 1000]
    }
]

optimizer_lr = GridSearchCV(pipeline,
                           params,
                           scoring='accuracy',
                           cv=kfold,
                           n_jobs=-1)
optimizer_lr.fit(X_train, y_train)
optimizer_lr.best_params_

{'model__C': 0.1, 'model__penalty': 'l2'}

In [48]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(penalty='l2', C=0.1))
])
pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)
accuracy_score(y_test, y_pred_lr)
#after optimization we are at the same level of accuracy

0.7714285714285715

In [50]:
from sklearn import metrics

In [52]:
cm = metrics.confusion_matrix(y_test, y_pred_lr)
print(cm)

[[95 22]
 [26 67]]


In [63]:
#Optimization
kfold = StratifiedKFold(10, shuffle=True, random_state=123)

solvers_list = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
C_params = [10e-2, 10e-1, 1, 10, 10e2, 10e3]

for solver in solvers_list:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LogisticRegression(solver=solver))
    ])
    if solver in ['lbfgs', 'newton-cg', 'sag']:
        params = [
            {
                'model__penalty': ['none']
            },
            {
                'model__penalty': ['l2'],
                'model__C': C_params
            }
        ]
        
    elif solver == 'liblinear':
            params = [
            {
                'model__penalty': ['l1', 'l2'],
                'model__C': C_params
            }
        ]
    elif solver == 'saga':
            params = [
            {
                'model__penalty': ['none']
            },
            {
                'model__penalty': ['l1', 'l2'],
                'model__C': C_params
            }
        ]

    optimizer_lr = GridSearchCV(pipeline,
                                   params,
                                   scoring='accuracy',
                                   cv=kfold,
                                   n_jobs=-1)
    optimizer_lr.fit(X_train, y_train)
    print({'solver': solver,
           'best params': optimizer_lr.best_params_})
    y_pred_lr = optimizer_lr.predict(X_test)
    acc = accuracy_score(y_test, y_pred_lr)
    print(f'Accuracy: {acc}')

{'solver': 'lbfgs', 'best params': {'model__C': 0.1, 'model__penalty': 'l2'}}
Accuracy: 0.7714285714285715
{'solver': 'liblinear', 'best params': {'model__C': 1.0, 'model__penalty': 'l2'}}
Accuracy: 0.7761904761904762
{'solver': 'newton-cg', 'best params': {'model__C': 0.1, 'model__penalty': 'l2'}}
Accuracy: 0.7714285714285715
{'solver': 'sag', 'best params': {'model__C': 0.1, 'model__penalty': 'l2'}}
Accuracy: 0.7714285714285715
{'solver': 'saga', 'best params': {'model__C': 0.1, 'model__penalty': 'l2'}}
Accuracy: 0.7714285714285715


In [67]:
#as we can see the best logistic regression parameters for this task is solver liblinear with C parameter 1 and l2 model penalty
#We can create new model with optimal parameters
model_lr_opt = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver = 'liblinear', C=1, penalty = 'l2'))
])

model_lr_opt.fit(X_train, y_train)
y_pred_lr_opt = model_lr_opt.predict(X_test)
acc_lr_opt = accuracy_score(y_pred_lr_opt, y_test)
print(acc_lr_opt)

0.7761904761904762
