# Project 3: Titanic survival dataset
----
In this notebook, we explore some common ML models to predict whether a passenger survived the Titanic disaster.

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
from sklearn import linear_model, metrics, model_selection, svm, preprocessing, ensemble, neighbors, tree

In [2]:
# Load the training set and discard rows in which every field is missing.
df = pd.read_csv('./titanicTrain.csv').dropna(how='all')

# Reduce each name to the text in front of the first comma.
df['name'] = df['name'].apply(lambda full_name: full_name.split(sep=',')[0])

# Replace missing values with placeholders: 0 for the numeric columns and the
# literal string 'nan' for the text columns.
df = df.assign(**{
    column: df[column].fillna(value=placeholder)
    for column, placeholder in {
        'age': 0,
        'cabin': 'nan',
        'embarked': 'nan',
        'boat': 'nan',
        'body': 0,
    }.items()
})

# The 'home.dest' column is not used as a feature.
df = df.drop(['home.dest'], axis=1)
df.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body
0,1.0,1.0,Allen,female,29.0,0.0,0.0,24160,211.3375,B5,S,2,0.0
1,1.0,1.0,Allison,male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11,0.0
2,1.0,0.0,Allison,female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,0.0
3,1.0,0.0,Allison,male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0
4,1.0,0.0,Allison,female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,0.0
5,1.0,1.0,Anderson,male,48.0,0.0,0.0,19952,26.55,E12,S,3,0.0
6,1.0,1.0,Andrews,female,63.0,1.0,0.0,13502,77.9583,D7,S,10,0.0
7,1.0,0.0,Andrews,male,39.0,0.0,0.0,112050,0.0,A36,S,,0.0
8,1.0,1.0,Appleton,female,53.0,2.0,0.0,11769,51.4792,C101,S,D,0.0
9,1.0,0.0,Artagaveytia,male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0


In [3]:
# Load the question (test) set and discard rows in which every field is missing.
test = pd.read_csv('titanicQuestion.csv').dropna(how='all')

# Reduce each name to the text in front of the first comma.
test['name'] = test['name'].apply(lambda full_name: full_name.split(sep=',')[0])

# Replace missing values with placeholders: 0 for the numeric columns and the
# literal string 'nan' for the text columns. 'fare' is included here because
# it has a single missing value in this file.
test = test.assign(**{
    column: test[column].fillna(value=placeholder)
    for column, placeholder in {
        'age': 0,
        'cabin': 'nan',
        'embarked': 'nan',
        'boat': 'nan',
        'body': 0,
        'fare': 0,
    }.items()
})

# The 'home.dest' column is not used as a feature.
test = test.drop(['home.dest'], axis=1)
test.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body
0,3,,McCormack,male,0.0,0,0,367228,7.75,,Q,,0.0
1,3,,McCoy,female,0.0,2,0,367226,23.25,,Q,16.0,0.0
2,3,,McCoy,female,0.0,2,0,367226,23.25,,Q,16.0,0.0
3,3,,McCoy,male,0.0,2,0,367226,23.25,,Q,16.0,0.0
4,3,,McDermott,female,0.0,0,0,330932,7.7875,,Q,13.0,0.0
5,3,,McEvoy,male,0.0,0,0,36568,15.5,,Q,,0.0
6,3,,McGovern,female,0.0,0,0,330931,7.8792,,Q,13.0,0.0
7,3,,McGowan,female,15.0,0,0,330923,8.0292,,Q,,0.0
8,3,,McGowan,female,35.0,0,0,9232,7.75,,Q,,0.0
9,3,,McMahon,male,0.0,0,0,370372,7.75,,Q,,0.0


In [4]:
# Target vector: the 'survived' label of every training row.
Y = df['survived'].values

# Feature matrices as plain arrays, with the label column removed.
data = df.drop('survived', axis=1).values
test_data = test.drop('survived', axis=1).values

# Stack both sets so each encoder below sees every value that occurs in
# either of them.
all_data = np.concatenate([test_data, data])

# Turn the categorical columns (name, gender, ticket, cabin, embarked, boat,
# body) into integer codes, e.g. male => 1, female => 0.
# The column positions refer to the frame after 'survived' has been dropped.
le = preprocessing.LabelEncoder()
for i in (1, 2, 6, 8, 9, 10, 11):
    # Fit on the combined column, then encode train and test with the same mapping.
    data[:, i] = le.fit(all_data[:, i]).transform(data[:, i])
    test_data[:, i] = le.transform(test_data[:, i])


### Evaluate different ML models:
We use 5-fold cross-validation to evaluate each model.

In [5]:
# Linear SVM classifier:
# result: unable to predict well
# possible reason: too much non-continuous categorical data
#
# random_state is fixed (same seed as the tree-based models in this notebook)
# so that repeated runs give the same cross-validation scores; without it the
# solver's internal shuffling makes the scores vary from run to run.
LSVM = svm.LinearSVC(random_state=9487)
result_LSVM = model_selection.cross_validate(
    LSVM, data, Y,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
)
print('5-fold validation acc:', result_LSVM['test_score'].mean())
result_LSVM

5-fold validation acc: 0.713874146854


{'fit_time': array([ 0.09056473,  0.05454183,  0.05904222,  0.07654834,  0.07355332]),
 'score_time': array([ 0.04002762,  0.03802395,  0.01250768,  0.0060041 ,  0.00100207]),
 'test_score': array([ 0.59701493,  0.88059701,  0.74      ,  0.87437186,  0.47738693]),
 'train_score': array([ 0.46808511,  0.87234043,  0.7225    ,  0.81772784,  0.63420724])}

In [6]:
# Non-linear SVM classifier (default kernel):
# result: overfits the training set (100% training accuracy) while the
# validation accuracy stays low
SVM = svm.SVC()
result_SVM = model_selection.cross_validate(
    estimator=SVM,
    X=data,
    y=Y,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
print('5-fold validation acc:', np.mean(result_SVM['test_score']))
result_SVM

5-fold validation acc: 0.577001550039


{'fit_time': array([ 0.08005643,  0.06554556,  0.06905174,  0.11007786,  0.10307193]),
 'score_time': array([ 0.01951289,  0.01351142,  0.01300955,  0.01901412,  0.02051473]),
 'test_score': array([ 0.5721393 ,  0.58208955,  0.575     ,  0.57286432,  0.58291457]),
 'train_score': array([ 1.,  1.,  1.,  1.,  1.])}

In [7]:
# Stochastic Gradient Descent:
# a linear classifier like the linear SVM above, but trained with SGD
# result: fails to predict well
# possible reason: too much non-continuous categorical data
#
# random_state is fixed (same seed as the tree-based models in this notebook)
# so that repeated runs give the same cross-validation scores; SGD shuffles
# the training data, so unseeded runs produce different scores each time.
SGD = linear_model.SGDClassifier(random_state=9487)
result_SGD = model_selection.cross_validate(
    SGD, data, Y,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    return_train_score=True,
)
print('5-fold validation acc:', result_SGD['test_score'].mean())
result_SGD

5-fold validation acc: 0.580006975174


{'fit_time': array([ 0.01100612,  0.00898647,  0.01100683,  0.01348948,  0.0030005 ]),
 'score_time': array([ 0.01000714,  0.02701998,  0.01200819,  0.00300241,  0.00200605]),
 'test_score': array([ 0.62686567,  0.56716418,  0.505     ,  0.55778894,  0.64321608]),
 'train_score': array([ 0.73216521,  0.52315394,  0.535     ,  0.66167291,  0.69787765])}

In [8]:
# Random forest:
# an ensemble model that classifies with multiple decision trees
# performs pretty well
RF = ensemble.RandomForestClassifier(random_state=9487)
result_RF = model_selection.cross_validate(
    estimator=RF,
    X=data,
    y=Y,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
print('5-fold validation acc:', np.mean(result_RF['test_score']))
result_RF

5-fold validation acc: 0.973969499237


{'fit_time': array([ 0.02149534,  0.02101564,  0.02051568,  0.0220139 ,  0.02251697]),
 'score_time': array([ 0.00200129,  0.00200152,  0.00200081,  0.00200415,  0.00200129]),
 'test_score': array([ 0.99004975,  0.97512438,  0.97      ,  0.97487437,  0.95979899]),
 'train_score': array([ 0.99749687,  1.        ,  0.99625   ,  0.99875156,  0.99500624])}

In [9]:
# Decision tree:
# performs well for a single model
# it probably bases most of its decisions on the boat variable
Tree = tree.DecisionTreeClassifier(random_state=9487)
result_Tree = model_selection.cross_validate(
    estimator=Tree,
    X=data,
    y=Y,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
print('5-fold validation acc:', np.mean(result_Tree['test_score']))
result_Tree

5-fold validation acc: 0.923928598215


{'fit_time': array([ 0.00550437,  0.00400376,  0.00500202,  0.00500083,  0.00400186]),
 'score_time': array([ 0.00150061,  0.00100064,  0.00100064,  0.00100255,  0.00100183]),
 'test_score': array([ 0.96517413,  0.93034826,  0.9       ,  0.91959799,  0.90452261]),
 'train_score': array([ 1.,  1.,  1.,  1.,  1.])}

In [10]:
# AdaBoost classifier:
# boosts a sequence of weak learners (50 by default), each one focusing on
# the training entries the previous ones found most difficult
AB = ensemble.AdaBoostClassifier(random_state=9487)
result_AB = model_selection.cross_validate(
    estimator=AB,
    X=data,
    y=Y,
    scoring='accuracy',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
print('5-fold validation acc:', np.mean(result_AB['test_score']))
result_AB

5-fold validation acc: 0.966944323608


{'fit_time': array([ 0.12910986,  0.18512845,  0.15761113,  0.1656158 ,  0.17862725]),
 'score_time': array([ 0.00748515,  0.01301003,  0.01251054,  0.00902462,  0.00950837]),
 'test_score': array([ 0.9800995 ,  0.9800995 ,  0.97      ,  0.96482412,  0.93969849]),
 'train_score': array([ 0.98748436,  0.98623279,  0.99      ,  0.99001248,  0.99625468])}

In [11]:
# Vote among the top three classifiers above (AdaBoost, Decision tree, Random forest).
# The estimator name strings are runtime identifiers and are kept exactly as they were.
Vote = ensemble.VotingClassifier(estimators=[('AdaBoost', AB), ('Decision Tree', Tree), ('Randon Forest', RF)], n_jobs=-1)
Vote.fit(data, Y)

# make prediction
prediction = Vote.predict(test_data)

# Re-read the raw question file so the submission keeps its original columns
# and values. `test` had fully-empty rows dropped when it was loaded, so it
# can hold fewer rows than the raw file; attach the predictions by index label
# instead of by position so a dropped row cannot shift or break the assignment.
# Rows without a prediction are left as NaN.
tmp = pd.read_csv('titanicQuestion.csv')
tmp['survived'] = pd.Series(prediction, index=test.index)

# write to csv
tmp.to_csv('MyPrediciton.csv', index=False)