# Titanic Tutorial

In [102]:
import numpy as np
import pandas as pd
import os
import re
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
classifier = RandomForestClassifier()


In [103]:
train_data = pd.read_csv(Path("Resources/train.csv"))
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [104]:
test_data = pd.read_csv(Path("Resources/test.csv"))

In [105]:
# Codify sex as female = 0, male = 1
train_copy = train_data.copy()
train_copy['Sex'] = le.fit_transform(train_copy['Sex'])
train_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


In [106]:
# Codify the Embarked to numerical as well
train_copy['Embarked'] = le.fit_transform(train_copy['Embarked'])
train_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [107]:
# Codify the cabins, since NaN 
train_copy['Cabin'] = train_copy.Cabin.fillna("0")
train_copy['Cabin'] = le.fit_transform(train_copy['Cabin'])
len(train_copy.groupby('Cabin'))

148

In [108]:
NoCabin = train_copy.loc[train_copy.Cabin == 0]
len(NoCabin)

print(f"Only {(1 - len(NoCabin)/len(train_copy))*100 :,.1f}% of cabins were known from passengers in the training set.")


Only 22.9% of cabins were known from passengers in the training set.


In [109]:
train_copy.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,82,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,56,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,2


In [110]:
train_copy.Ticket.head(20)

# Some of the tickets have letters in their names. Most do not. In order to evaluate them based only on the numeric value, need \
#  to remove anything before the space/last value.

0            A/5 21171
1             PC 17599
2     STON/O2. 3101282
3               113803
4               373450
5               330877
6                17463
7               349909
8               347742
9               237736
10             PP 9549
11              113783
12           A/5. 2151
13              347082
14              350406
15              248706
16              382652
17              244373
18              345763
19                2649
Name: Ticket, dtype: object

In [111]:
# Clean the Tickets by using only the last number sequence
# '(\d+)$'
train_tickets = train_copy.copy()


In [112]:
train_tickets.Ticket = [re.findall('\s*(\d+)$', ticket) for ticket in train_copy.Ticket]
train_tickets.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,[21171],7.25,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,[17599],71.2833,82,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,[3101282],7.925,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,[113803],53.1,56,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,[373450],8.05,0,2


In [113]:
train_tickets.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int32
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin            int32
Embarked         int32
dtype: object

In [114]:
print(type(train_tickets.Ticket))

<class 'pandas.core.series.Series'>


In [115]:
train_tickets["Ticket"].tail()

886    [211536]
887    [112053]
888      [6607]
889    [111369]
890    [370376]
Name: Ticket, dtype: object

In [116]:
train_tickets.Ticket.isna().sum()

0

In [117]:

train_tickets.Ticket = train_tickets.Ticket.str[0]


In [119]:
train_tickets.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,21171,7.25,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,17599,71.2833,82,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,3101282,7.925,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,56,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,2


In [137]:
X = train_tickets.copy()

In [138]:
X.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,21171,7.25,0,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,17599,71.2833,82,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,3101282,7.925,0,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,56,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,2


AttributeError: 'Series' object has no attribute 'Ticket'