This a program to predict the survivors of the Titanic that sunk many years ago. It is written in Python and uses data obtained from Kaggle.com








# 1. Import libraries
We will import all the relevant libraries that will be used in this program

In [1]:
import numpy as np 
import pandas as pd 

import sklearn
from sklearn.impute import SimpleImputer
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# **2. Import data**
The data is read in with the Pandas library

In [2]:
train_data = pd.read_csv ('train.csv')
test_data = pd.read_csv ('test.csv')

# 3. A peek at the contents of the dataset

In [3]:
#view the first 10 rows of training data
train_data.head (10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
#view data types
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
#view the first 10 rows of test data
test_data.head (10)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [6]:
#view data types
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Make copies of original datasets

In [7]:
train_data_og=train_data
test_data_og=test_data

# **4. Data Preprocessing**
Data processing is needed to clean the dataset  of noise inorder to feed it into the classifier. First we will address the issue of missing data that is present in the datset as seen above. We will also consider scaling the contents of some columns inorder to have a dataset with uniform values. We will also employ a feature selection method to remove redundant features in the datas set. 

4.1 Address the missing data

In [8]:
# Identify all columns with missing data in both training and testing data
print(train_data.columns[train_data.isna().any()].tolist())
print(test_data.columns[test_data.isna().any()].tolist())

['Age', 'Cabin', 'Embarked']
['Age', 'Fare', 'Cabin']


4.1.1 Replace missing data in training data

In [9]:
#Replace missing data in 'Age' column of training data  with median value
#train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
train_data.Age.fillna(train_data["Age"].median(), inplace=True)
#Replace missing data in 'Cabin' column of training data with mode value since it is categorical data
#train_data['Cabin'] = train_data['Cabin'].fillna(train_data['Cabin'].mode())
train_data.Cabin.fillna(train_data["Cabin"].mode(), inplace=True)
#Replace missing data in 'Embarked' column of training data with mode value since it is categorical data
#train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode())
train_data.Embarked.fillna(train_data["Embarked"].mode(), inplace=True)
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,G6,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


4.1.2 Replace missing data in test data

In [10]:
#Replace missing data in 'Age' column of test data  with median value
#test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
test_data.Age.fillna(test_data["Age"].median(), inplace=True)
#Replace missing data in 'Cabin' column of test data with mode value since it is categorical data
#test_data['Cabin'] = test_data['Cabin'].fillna(test_data['Cabin'].mode())
test_data.Cabin.fillna(test_data["Cabin"].mode(), inplace=True)
#Replace missing data in 'Fare' column of training data with median value 
#test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())
test_data.Fare.fillna(test_data["Fare"].median(), inplace=True)
test_data.head(10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,B57 B59 B63 B66,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [11]:
print(train_data.columns[train_data.isnull().any()].tolist())
print(test_data.columns[test_data.isnull().any()].tolist())

['Cabin', 'Embarked']
['Cabin']


4.2 Transform columns with non-numerical values. From the data our non-numerical features are 'Sex', 'Cabin', 'Name','Ticket', 'Embarked'

View details about categorical featured columns

In [12]:
print (pd.unique(train_data['Sex']))
print (pd.unique(train_data['Cabin']))
print (pd.unique(train_data['Ticket']))
print (pd.unique(train_data['Embarked']))

['male' 'female']
['B96 B98' 'C85' 'G6' 'C123' nan 'E46' 'C103' 'D56' 'A6' 'C23 C25 C27'
 'B78' 'D33' 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12'
 'D26' 'C110' 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19'
 'A7' 'C49' 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35'
 'C87' 'B77' 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D'
 'C22 C26' 'C106' 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32'
 'B18' 'C124' 'C91' 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'E10' 'E44'
 'A34' 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20'
 'B79' 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101'
 'C68' 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48'
 'E58' 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24'
 'C90' 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3'
 'D6' 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24'
 'C50' 'B42' 'C148']
['A/5 21171' 'PC 17599' '

In [13]:
#Check if there are still anymore NaN values
print(train_data.columns[train_data.isnull().any()].tolist())
print(test_data.columns[test_data.isnull().any()].tolist())

['Cabin', 'Embarked']
['Cabin']


We will perform manual feature selection here and drop features that seem irrelevant to this problem these are 'Name','Passenger ID' and 'Ticket'. 'Cabin' will also be dropped because it still has NaN values in both training and testing data after data imputation

In [14]:
#Drop Name and PassengerId columns along
train_data = train_data.drop(['Name','PassengerId', 'Ticket', 'Cabin'], axis=1)
test_data = test_data.drop(['Name', 'PassengerId', 'Ticket','Cabin'], axis=1)

In [15]:
#Handling the 'Sex' column
train_data['Sex'] = train_data['Sex'].apply(lambda x: 1 if x == 'male' else 0)
test_data['Sex'] = test_data['Sex'].apply(lambda x: 1 if x == 'male' else 0)
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,S
1,1,1,0,38.0,1,0,71.2833,C
2,1,3,0,26.0,0,0,7.9250,S
3,1,1,0,35.0,1,0,53.1000,S
4,0,3,1,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,S
887,1,1,0,19.0,0,0,30.0000,S
888,0,3,0,28.0,1,2,23.4500,S
889,1,1,1,26.0,0,0,30.0000,C


In [16]:
#Handling the 'Embarked' column
train_data['Embarked'] = train_data['Embarked'].apply(lambda x: 0 if x == 'S' else 1 if x=='C' else 0)
test_data['Embarked'] = test_data['Embarked'].apply(lambda x: 0 if x == 'S' else 1 if x=='C' else 0)
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.2500,0
1,1,1,0,38.0,1,0,71.2833,1
2,1,3,0,26.0,0,0,7.9250,0
3,1,1,0,35.0,1,0,53.1000,0
4,0,3,1,35.0,0,0,8.0500,0
...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,0
887,1,1,0,19.0,0,0,30.0000,0
888,0,3,0,28.0,1,2,23.4500,0
889,1,1,1,26.0,0,0,30.0000,1


# 5. Select feature values and target variables in the training data

In [17]:
#Select x and y  variables
y= train_data['Survived']
X = train_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
#split x and y for training and validating the model
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.30, random_state=40)

X_test = test_data[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]


# 6. ML Classifier

6.1 Train the model

In [18]:
#fit the ML model on input data and respective outputs
model = DecisionTreeClassifier()
model.fit (X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

6.2 Test the model

In [19]:
#test the model with validation data
y_testing = model.predict(X_val)

In [20]:
#Evaluate the performance of the model
print(accuracy_score(y_val,y_testing))

0.8097014925373134


 # 7. Make predictions

In [21]:
#Make predictions using test data
y_pred = model.predict(X_test)

# 8. Submissions

In [22]:
#Write results of predictions to csv file
predictions_file = pd.DataFrame({'PassengerId': test_data_og['PassengerId'],'Survived': y_pred})
predictions_file.head(10)
predictions_file.to_csv('titanic_predictions.csv', index=False)
print("Done with predictions!")


Done with predictions!
