# Titanic Submission by Troy Samra

I drew on Titanic guides found online for inspiration and for help with using pandas and scikit-learn.


In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
# Keep both frames in one list so later cells can apply the same step to each.
combine = [train, test]

In [3]:
# Mean survival rate per passenger class, highest rate first. Survival is
# strongly related to class, so Pclass is used in the model.
(
    train[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [4]:
# Mean survival rate per sex, highest rate first. Survival is very strongly
# related to sex, so Sex is also used in the model.
(
    train[["Sex", "Survived"]]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [5]:
# Ticket and Cabin are not used by the model; remove them from both frames
# so the remaining cells work with smaller tables.
train, test = (
    frame.drop(columns=['Ticket', 'Cabin']) for frame in (train, test)
)
# drop() returns new frames, so the shared list has to be rebuilt.
combine = [train, test]

In [6]:
# Name is not used by the model and is removed from both frames.
# PassengerId is removed from the training frame only; the test frame keeps
# it because the submission file is keyed on it.
train = train.drop(columns=['Name', 'PassengerId'])
test = test.drop(columns=['Name'])
# drop() returns new frames, so the shared list has to be rebuilt.
combine = [train, test]

In [7]:
# Encode Sex as an integer (female -> 1, male -> 0) in both frames.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    encoded = dataset['Sex'].map(sex_codes)
    dataset['Sex'] = encoded.astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [8]:
# Fill missing ages with the median age of passengers of the same sex and
# passenger class. The medians are computed separately within each dataset
# (training ages fill training rows, test ages fill test rows).
for dataset in combine:
    # Median of the known ages in each (Sex, Pclass) group, aligned row by
    # row with the dataset. Groups that do not occur are simply absent, so
    # an empty group can no longer raise while computing its guess.
    group_median = dataset.groupby(['Sex', 'Pclass'])['Age'].transform('median')

    # Round the group median to the nearest 0.5 (exact halves round up).
    age_guess = np.floor(group_median / 0.5 + 0.5) * 0.5

    # Only rows with no recorded age take the guess; every age is then
    # truncated to a whole number of years.
    dataset['Age'] = dataset['Age'].fillna(age_guess).astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22,1,0,7.25,S
1,1,1,1,38,1,0,71.2833,C
2,1,3,1,26,0,0,7.925,S
3,1,1,1,35,1,0,53.1,S
4,0,3,0,35,0,0,8.05,S


In [9]:
# Split the training ages into 5 equal-width bands and show the mean
# survival rate of each band, ordered from youngest to oldest.
train['AgeRange'] = pd.cut(train['Age'], bins=5)
(
    train[['AgeRange', 'Survived']]
    .groupby(['AgeRange'], as_index=False)
    .mean()
    .sort_values(by='AgeRange', ascending=True)
)

Unnamed: 0,AgeRange,Survived
0,"(-0.08, 16.0]",0.55
1,"(16.0, 32.0]",0.337374
2,"(32.0, 48.0]",0.412037
3,"(48.0, 64.0]",0.434783
4,"(64.0, 80.0]",0.090909


In [10]:
# Replace each age with the index (0-4) of its 16-year band so the decision
# tree works with a small ordinal value instead of the raw age.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
for dataset in combine:
    # Bands are closed on the right: age <= 16 -> 0, 17-32 -> 1,
    # 33-48 -> 2, 49-64 -> 3, age > 64 -> 4.
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_edges, labels=False)
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeRange
0,0,3,0,1,1,0,7.25,S,"(16.0, 32.0]"
1,1,1,1,2,1,0,71.2833,C,"(32.0, 48.0]"
2,1,3,1,1,0,0,7.925,S,"(16.0, 32.0]"
3,1,1,1,2,1,0,53.1,S,"(32.0, 48.0]"
4,0,3,0,2,0,0,8.05,S,"(32.0, 48.0]"


In [11]:
# Mean survival rate for each age band code, highest rate first.
(
    train[["Age", "Survived"]]
    .groupby(['Age'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Age,Survived
0,0,0.55
3,3,0.434783
2,2,0.412037
1,1,0.337374
4,4,0.090909


In [12]:
# AgeRange was only needed to inspect the bands; Age now holds the band
# code, so the helper column is removed from the training frame.
train = train.drop(columns=['AgeRange'])
# drop() returned a new frame, so the shared list has to be rebuilt.
combine = [train, test]
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,1,1,0,7.25,S
1,1,1,1,2,1,0,71.2833,C
2,1,3,1,1,0,0,7.925,S
3,1,1,1,2,1,0,53.1,S
4,0,3,0,2,0,0,8.05,S


In [13]:
# FamilySize = siblings/spouses + parents/children + the passenger themself.
for dataset in combine:
    dataset['FamilySize'] = 1 + dataset['SibSp'] + dataset['Parch']

# Mean survival rate for each family size, highest rate first.
(
    train[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [14]:
# IsAlone is 1 for passengers travelling with no family (FamilySize == 1)
# and 0 for everyone else.
for dataset in combine:
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

# Mean survival rate for passengers with family (0) and without (1).
train[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [15]:
# IsAlone is the family feature kept for the model, so Parch, SibSp and
# FamilySize are removed from both frames.
train, test = (
    frame.drop(columns=['Parch', 'SibSp', 'FamilySize'])
    for frame in (train, test)
)
# drop() returns new frames, so the shared list has to be rebuilt.
combine = [train, test]

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,IsAlone
0,0,3,0,1,7.25,S,0
1,1,1,1,2,71.2833,C,0
2,1,3,1,1,7.925,S,1
3,1,1,1,2,53.1,S,0
4,0,3,0,2,8.05,S,1


In [16]:
# Passengers with no recorded port are assumed to have boarded at the port
# that is most common in the training data.
freq_port = train['Embarked'].mode()[0]

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(value=freq_port)

# Mean survival rate for each port of embarkation, highest rate first.
(
    train[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [17]:
# Encode the port of embarkation as an integer (S -> 0, C -> 1, Q -> 2)
# in both frames.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for dataset in combine:
    encoded = dataset['Embarked'].map(port_codes)
    dataset['Embarked'] = encoded.astype(int)

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,IsAlone
0,0,3,0,1,7.25,0,0
1,1,1,1,2,71.2833,1,0
2,1,3,1,1,7.925,0,1
3,1,1,1,2,53.1,0,0
4,0,3,0,2,8.05,0,1


In [18]:
# Fill missing fares in the test set with the median test fare
# (median() already ignores missing values).
# The result is assigned back rather than calling fillna(inplace=True) on
# the selected column: that chained in-place form acts on an intermediate
# object, triggers a pandas warning, and does not update `test` when
# Copy-on-Write is enabled.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,IsAlone
0,892,3,0,2,7.8292,2,1
1,893,3,1,2,7.0,0,0
2,894,2,0,3,9.6875,2,1
3,895,3,0,1,8.6625,0,1
4,896,3,1,1,12.2875,0,0


In [19]:
# Split the training fares into 4 quantile bands (quartiles) and show the
# mean survival rate of each band, ordered from cheapest to most expensive.
train['FareRange'] = pd.qcut(train['Fare'], q=4)
(
    train[['FareRange', 'Survived']]
    .groupby(['FareRange'], as_index=False)
    .mean()
    .sort_values(by='FareRange', ascending=True)
)

Unnamed: 0,FareRange,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [20]:
# Replace each fare with the index (0-3) of its band so the decision tree
# works with a small ordinal value instead of the raw fare. The thresholds
# are the fixed values 7.91, 14.454 and 31.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
for dataset in combine:
    # One pass with right-closed bands (fare <= 7.91 -> 0, ..., fare > 31 -> 3)
    # instead of successive in-place overwrites of the same column, whose
    # correctness depended on the order of the statements.
    fare_band = pd.cut(dataset['Fare'], bins=fare_edges, labels=False)
    dataset['Fare'] = fare_band.astype(int)

# FareRange was only needed to inspect the bands; remove it before modelling.
train = train.drop(['FareRange'], axis=1)
# drop() returned a new frame, so the shared list has to be rebuilt.
combine = [train, test]

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,IsAlone
0,0,3,0,1,0,0,0
1,1,1,1,2,3,1,0
2,1,3,1,1,1,0,1
3,1,1,1,2,3,0,0
4,0,3,0,2,1,0,1


In [21]:
# Mean survival rate for each fare band code, highest rate first.
(
    train[["Fare", "Survived"]]
    .groupby(['Fare'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,Fare,Survived
3,3,0.581081
2,2,0.445415
1,1,0.308756
0,0,0.197309


In [22]:
# Build the model inputs: the label column and the remaining feature columns
# of the training frame, plus the test features without the PassengerId key.
Y_train = train["Survived"]
X_train = train.drop(columns="Survived")
X_test = test.drop(columns="PassengerId").copy()
# Show the shapes to confirm both feature tables have the same columns count.
X_train.shape, Y_train.shape, X_test.shape

((891, 6), (891,), (418, 6))

In [23]:
# Decision Tree
# Fit a decision tree on the training features and predict survival for the
# test passengers. random_state is fixed so that repeated runs build the same
# tree and therefore produce the same predictions.

decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)

# NOTE: this is the accuracy on the training data itself, in percent; it is
# not an estimate of accuracy on unseen passengers.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 1)
acc_decision_tree

85.6

In [24]:
# Write the predictions to a CSV file for submission: one row per test
# passenger with its PassengerId and predicted Survived value, no index column.
submission = test[["PassengerId"]].assign(Survived=Y_pred)
submission.to_csv('submission.csv', index=False)