In [3]:
import numpy as np
import pandas as pd
import os

from sklearn import tree
from sklearn.preprocessing import LabelEncoder

## Setup

In [39]:
# Read both CSV files in one pass: the training frame first, then the frame
# that predictions are required for.
data_training, data_goal = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/train.csv", "dataset/test.csv")
)

## Data exploration

In [5]:
# Summarise the training frame: number of rows, non-null count and dtype per column.
data_training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Preview the first five rows of the training frame.
data_training.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Display the column labels of the training frame.
data_training.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
# Pairwise correlation between the numeric columns of the training frame.
# The numeric columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops text columns (Name, Sex, Ticket, ...)
# silently and raises on them instead. select_dtypes() behaves the same on
# older and newer pandas versions.
data_training.select_dtypes(include="number").corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [40]:
# Same summary for the frame loaded from test.csv (row count, non-null counts, dtypes).
data_goal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## Data cleaning
We must make sure to remove:
- Empty cells
- Data in wrong format
- Wrong data
- Duplicates

### Duplicate rows check

In [10]:
# Count the rows that are exact copies of an earlier row.
int(data_training.duplicated().sum())

0

### Column by column analysis

In [11]:
# Print every column label on a single comma-separated line.
print('The collumns are:', ", ".join(data_training.columns.tolist()))

The collumns are: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked


#### PassengerId

In [12]:
# PassengerId sanity checks: no missing identifiers and no identifier used twice.
print('Has nans:', data_training['PassengerId'].isna().any())
print('Has duplicates', not data_training['PassengerId'].is_unique)

Has nans: False
Has duplicates False


#### Survived

In [13]:
# Distinct values of the Survived column, in order of first appearance.
print('Unique values: ', pd.unique(data_training['Survived']))

Unique values:  [0 1]


#### Pclass

In [14]:
# Distinct values of the Pclass column, in order of first appearance.
print('Unique values: ', pd.unique(data_training['Pclass']))

Unique values:  [3 1 2]


#### Name
We can reasonably doubt that the names of the passengers had any impact on whether or not they survived; therefore, we will not include this feature.

In [15]:
# True if any passenger name occurs more than once in the training frame.
# The label names what is actually printed (a duplicate flag, not a list of
# unique values), matching the PassengerId check.
print('Has duplicates:', data_training['Name'].duplicated().any())

Unique values:  False


#### Sex

In [16]:
# Distinct raw labels of the Sex column before encoding.
print('Initial unique values:', pd.unique(data_training['Sex']))

Initial unique values: ['male' 'female']


In [17]:
# Encode the Sex labels as integer codes in a new Sex_Num column.
# The encoder is fitted first, then applied to the same column.
labelEncoder_sex = LabelEncoder().fit(data_training['Sex'])
data_training['Sex_Num'] = labelEncoder_sex.transform(data_training['Sex'])
print('New unique values:', pd.unique(data_training['Sex_Num']))

New unique values: [1 0]


#### Age

In [18]:
# Simple imputation strategy: every missing age is replaced by the mean of the
# ages that are present.
data_training['Age'] = data_training['Age'].fillna(data_training['Age'].mean())

print('Age range:', data_training['Age'].min(), 'to', data_training['Age'].max())

Age range: 0.42 to 80.0


#### SibSp

In [19]:
# Distinct values of the SibSp column, in order of first appearance.
print('Unique values:', pd.unique(data_training['SibSp']))

Unique values: [1 0 3 4 2 5 8]


#### Parch

In [20]:
# Distinct values of the Parch column, in order of first appearance.
print('Unique values:', pd.unique(data_training['Parch']))

Unique values: [0 1 2 5 3 4 6]


#### Ticket
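For the moment, we will not use the raw ticket value, as it is not yet clear how it could be used: most values are distinct (681 different tickets for 891 passengers). We only extract the ticket prefix code from it below.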

In [21]:
# How many different ticket strings appear in the training frame.
print('Number unique values:', data_training['Ticket'].nunique(dropna=False))

Number unique values: 681


In [22]:
# Ticket code: everything before the last space-separated token of the ticket,
# upper-cased, with spaces, dots and slashes removed. A ticket made of a single
# token yields an empty code.
data_training['Ticket_Code'] = (
    data_training['Ticket']
    .str.split(' ', expand=False)
    .str[:-1]                       # drop the trailing token
    .str.join(" ")
    .str.upper()
    .str.replace(" ", "", regex=False)
    .str.replace(".", "", regex=False)
    .str.replace("/", "", regex=False)
)
print('Unique Ticket codes:\n', pd.unique(data_training['Ticket_Code']))

Unique Ticket codes:
 ['A5' 'PC' 'STONO2' '' 'PP' 'CA' 'SCPARIS' 'SCA4' 'A4' 'SP' 'SOC' 'WC'
 'SOTONOQ' 'WEP' 'C' 'SOP' 'FA' 'FCC' 'SWPP' 'SCOW' 'PPP' 'SC' 'SCAH' 'AS'
 'SCAHBASLE' 'SOPP' 'FC' 'SOTONO2' 'CASOTON']


In [23]:
# Manual adjustments after examining the data: the 'FC' code is folded into
# 'FCC' and the 'AS' code into 'A5' (whole-value matches only).
data_training['Ticket_Code'] = data_training['Ticket_Code'].replace({'FC': 'FCC', 'AS': 'A5'})

In [25]:
# Encode the cleaned ticket codes as integers in Ticket_Code_Num, then show the
# labels learned by the encoder.
labelEncoder_ticket_code = LabelEncoder().fit(data_training['Ticket_Code'])
data_training['Ticket_Code_Num'] = labelEncoder_ticket_code.transform(data_training['Ticket_Code'])
labelEncoder_ticket_code.classes_.tolist()

['',
 'A4',
 'A5',
 'C',
 'CA',
 'CASOTON',
 'FA',
 'FCC',
 'PC',
 'PP',
 'PPP',
 'SC',
 'SCA4',
 'SCAH',
 'SCAHBASLE',
 'SCOW',
 'SCPARIS',
 'SOC',
 'SOP',
 'SOPP',
 'SOTONO2',
 'SOTONOQ',
 'SP',
 'STONO2',
 'SWPP',
 'WC',
 'WEP']

#### Fare

In [26]:
# Fare overview: number of distinct fares and the lowest / highest fare paid.
print('Number unique values:', data_training['Fare'].nunique(dropna=False))
print('Fare range :', data_training['Fare'].min(), 'to', data_training['Fare'].max())

Number unique values: 248
Fare range : 0.0 to 512.3292


#### Cabin

In [27]:
# Distinct raw values of the Cabin column (missing values included).
print('Unique values:', pd.unique(data_training['Cabin']))

Unique values: [nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C148']


In [28]:
# Cabin_Code is the first character of the Cabin string; rows without a cabin
# stay NaN because the .str accessor propagates missing values.
data_training['Cabin_Code'] = data_training['Cabin'].str[0]

In [29]:
# We create a feature "Cabin Code Num": one integer code per Cabin_Code label.
# NOTE(review): Cabin_Code is NaN for rows without a cabin, and the encoder
# keeps NaN as a class of its own (it is listed last in classes_), so "no cabin
# recorded" becomes the highest code instead of being imputed or dropped.
labelEncoder_cabin_code = LabelEncoder()
data_training['Cabin_Code_Num'] = labelEncoder_cabin_code.fit_transform(data_training['Cabin_Code'])

In [30]:
# Display the labels learned by the cabin-code encoder; a label's position in
# this array is its integer code.
labelEncoder_cabin_code.classes_

array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', nan], dtype=object)

#### Embarked

In [31]:
# Distinct raw values of the Embarked column (missing values included).
print('Unique values:', pd.unique(data_training['Embarked']))

Unique values: ['S' 'C' 'Q' nan]


In [32]:
# We create a feature "Embarked Num": one integer code per Embarked label.
# NOTE(review): Embarked still contains NaN at this point (see the unique
# values printed for this column), and those rows are passed to the encoder
# as-is. Presumably NaN ends up as a separate class, as it does for the cabin
# code. TODO: confirm, and decide whether to impute instead.
labelEncoder_embarked = LabelEncoder()
data_training['Embarked_Num'] = labelEncoder_embarked.fit_transform(data_training['Embarked'])

## Experimenting with models

In [33]:
# Feature matrix: the numeric / encoded columns prepared above.
X = data_training[[
    'Pclass', 'Sex_Num', 'Age', 'SibSp', 'Parch',
    'Ticket_Code_Num', 'Fare', 'Cabin_Code_Num', 'Embarked_Num',
]]
# Target vector: the Survived flag.
Y = data_training['Survived']

In [34]:
# Correlation matrix of the model features together with the Survived target.
data_training[[
    'Pclass', 'Sex_Num', 'Age', 'SibSp', 'Parch',
    'Ticket_Code_Num', 'Fare', 'Cabin_Code_Num', 'Embarked_Num',
    'Survived',
]].corr()

Unnamed: 0,Pclass,Sex_Num,Age,SibSp,Parch,Ticket_Code_Num,Fare,Cabin_Code_Num,Embarked_Num,Survived
Pclass,1.0,0.1319,-0.331339,0.083081,0.018443,-0.027514,-0.5495,0.746616,0.157112,-0.338481
Sex_Num,0.1319,1.0,0.084153,-0.114631,-0.245489,0.023097,-0.182333,0.123076,0.104057,-0.543351
Age,-0.331339,0.084153,1.0,-0.232625,-0.179191,0.019969,0.091566,-0.249134,-0.022239,-0.069809
SibSp,0.083081,-0.114631,-0.232625,1.0,0.414838,-0.015351,0.159651,0.04154,0.066654,-0.035322
Parch,0.018443,-0.245489,-0.179191,0.414838,1.0,0.005542,0.216225,-0.032548,0.038322,0.081629
Ticket_Code_Num,-0.027514,0.023097,0.019969,-0.015351,0.005542,1.0,0.074236,0.013002,-0.016145,-0.011526
Fare,-0.5495,-0.182333,0.091566,0.159651,0.216225,0.074236,1.0,-0.523013,-0.221226,0.257307
Cabin_Code_Num,0.746616,0.123076,-0.249134,0.04154,-0.032548,0.013002,-0.523013,1.0,0.187015,-0.301116
Embarked_Num,0.157112,0.104057,-0.022239,0.066654,0.038322,-0.016145,-0.221226,0.187015,1.0,-0.163517
Survived,-0.338481,-0.543351,-0.069809,-0.035322,0.081629,-0.011526,0.257307,-0.301116,-0.163517,1.0


In [35]:
# Fit a decision tree classifier on the feature matrix X and target Y.
# random_state is pinned so that re-running the notebook yields the same tree:
# the tree builder permutes the candidate features at each split, so splits of
# equal quality can otherwise be resolved differently from run to run.
clf_decision_tree = tree.DecisionTreeClassifier(random_state=42)
clf_decision_tree = clf_decision_tree.fit(X, Y)