In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
titanic = pd.read_csv('train.csv', index_col='PassengerId')
titanic.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Exploring Dataset

In [3]:
titanic.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [5]:
titanic['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [6]:
titanic['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [7]:
titanic['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [8]:
fem_survived = ((titanic['Sex'] == 'female') & (titanic['Survived'] == 1)).sum()
male_survived = ((titanic['Sex'] == 'male') & (titanic['Survived'] == 1)).sum()

print('Female survival ratio:', fem_survived / (titanic['Sex'] == 'female').sum())
print('Male survival ratio:', fem_survived / (titanic['Sex'] == 'male').sum())

Female survival ratio: 0.7420382165605095
Male survival ratio: 0.4038128249566724


In [9]:
for i in range(1, 4):
    class_surival_count = ((titanic['Pclass'] == i) & (titanic['Survived'] == 1)).sum()
    class_num = (titanic['Pclass'] == i).sum()
    print(f'Class {i} Survival ratio', class_surival_count / class_num)

Class 1 Survival ratio 0.6296296296296297
Class 2 Survival ratio 0.47282608695652173
Class 3 Survival ratio 0.24236252545824846


# Cleaning Dataset

- The columns "Name" and "Ticket" are unlikely to affect the predictions, so we will remove them
- The cabin column is missing too much data to be useful, so it will be removed
- We will drop the 2 rows where embarked it null
- The numerical attributes will normailzed and filled with the median in case of null value
- The Sex column will be encoded into 0, 1
- The Embarked column will be One Hot Encoded


In [10]:
# Droping rows with embarked null values
titanic.dropna(subset=['Embarked'], axis=0, inplace=True)
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Name      889 non-null    object 
 3   Sex       889 non-null    object 
 4   Age       712 non-null    float64
 5   SibSp     889 non-null    int64  
 6   Parch     889 non-null    int64  
 7   Ticket    889 non-null    object 
 8   Fare      889 non-null    float64
 9   Cabin     202 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.3+ KB


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numerical attributes pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

pipeline = ColumnTransformer([
    ('num', num_pipeline, ['Fare', 'Age', 'SibSp', 'Parch', 'Pclass']),
    ('sex', OrdinalEncoder(), ['Sex']),
    ('embarked', OneHotEncoder(), ['Embarked'])
])

# Modeling

We going to see the initial accuracy of some classifiers
- Random Forest Classifier
- Support Vector Machine
- Naive Bayes
- K-Nearest Neighbors

In [12]:
from sklearn.model_selection import cross_val_score

def cross_validation_results(model):
    """ Given a model, the function returns the cross validation scores along with the mean and std. """
    
    scores = cross_val_score(model, titanic_prepaired, titanic_labels, scoring='accuracy', cv=10)

    print(scores)
    print(f'Mean: {np.mean(scores)}')
    print(f'Standard Deviation: {np.std(scores)}')

## Random Forest Classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier

titanic_prepaired = pipeline.fit_transform(titanic)
titanic_labels = titanic['Survived']

random_forest = RandomForestClassifier(random_state=42)
cross_validation_results(random_forest)

[0.73033708 0.83146067 0.75280899 0.83146067 0.85393258 0.83146067
 0.82022472 0.76404494 0.82022472 0.85227273]
Mean: 0.8088227783452503
Standard Deviation: 0.04125873032840215


## Support Vector Machine

In [14]:
from sklearn.svm import SVC

svm_classifier = SVC(gamma='auto', random_state=42)
cross_validation_results(svm_classifier)

[0.80898876 0.84269663 0.76404494 0.87640449 0.80898876 0.78651685
 0.82022472 0.78651685 0.86516854 0.85227273]
Mean: 0.821182328907048
Standard Deviation: 0.03518537537751646


## Naive Bayes

In [15]:
from sklearn.naive_bayes import GaussianNB

naive_bayes = GaussianNB()
cross_validation_results(naive_bayes)

[0.75280899 0.75280899 0.76404494 0.7752809  0.7752809  0.78651685
 0.82022472 0.79775281 0.79775281 0.81818182]
Mean: 0.7840653728294178
Standard Deviation: 0.023154279033843102


## K-Nearest Neighbors

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
cross_validation_results(knn)

[0.78651685 0.82022472 0.73033708 0.78651685 0.83146067 0.80898876
 0.84269663 0.79775281 0.83146067 0.77272727]
Mean: 0.8008682328907047
Standard Deviation: 0.03201595481658051


# Exploring Random Forest Parameters

Most of them seem comparable, we will explore the hyper parameters of
- Random Forest
- SVM
- KNN

## Random Forest Hyper Parameters

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=9),
    }

random_forest = RandomForestClassifier(random_state=42)
rnd_search = RandomizedSearchCV(random_forest, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='accuracy', random_state=42)
rnd_search.fit(titanic_prepaired, titanic_labels)

best_random_forest = rnd_search.best_estimator_
cross_validation_results(best_random_forest)

[0.73033708 0.84269663 0.74157303 0.80898876 0.87640449 0.82022472
 0.82022472 0.7752809  0.83146067 0.85227273]
Mean: 0.8099463738508682
Standard Deviation: 0.04491113534620495


## SVM Hypter Parameters

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'C': [0.125, 0.25, 0.5, 1, 2],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto']
    },
]

svm_classifier = SVC(random_state=42)
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy',return_train_score=True)
grid_search.fit(titanic_prepaired, titanic_labels)

best_svm = grid_search.best_estimator_
cross_validation_results(best_svm)

[0.79775281 0.85393258 0.75280899 0.87640449 0.83146067 0.78651685
 0.82022472 0.78651685 0.8988764  0.85227273]
Mean: 0.8256767109295199
Standard Deviation: 0.043193589911323674


In [19]:
param_grid = [
    {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }
]

knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy',return_train_score=True)
grid_search.fit(titanic_prepaired, titanic_labels)

best_knn = grid_search.best_estimator_
cross_validation_results(best_knn)

[0.78651685 0.82022472 0.73033708 0.78651685 0.83146067 0.80898876
 0.84269663 0.79775281 0.83146067 0.77272727]
Mean: 0.8008682328907047
Standard Deviation: 0.03201595481658051


## Result

SVM seem to be the most promising, so we will use it for predictions on the test set

#  Test Dataset

In [20]:
titanic_test = pd.read_csv('test.csv', index_col='PassengerId')
titanic_test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [21]:
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [22]:
# Getting predictions and storing them to disk
svm_predictions = best_svm.predict(pipeline.fit_transform(titanic_test))
predictions_df = pd.DataFrame(
    data=np.c_[titanic_test.index.values, svm_predictions],
    columns=['PassengerId','Survived']
)
predictions_df.to_csv('predictions.csv', index=False)