In [101]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

In [102]:
# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LassoLarsCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [103]:
# Modelling Helpers
from sklearn.model_selection import cross_val_score, GridSearchCV, ShuffleSplit, StratifiedShuffleSplit
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.preprocessing import Imputer, Normalizer, scale, StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.externals.joblib import parallel_backend

In [104]:
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

In [105]:
# get titanic & test csv files as a DataFrame
train = pd.read_csv("C:/Users/Eugene/Desktop/Output/Titanic/train.csv")
test  = pd.read_csv("C:/Users/Eugene/Desktop/Output/Titanic/test.csv")
ids = test['PassengerId']
full = train.append( test , ignore_index = True )
print ('Datasets:' , 'full:' , full.shape , 'train set:' , train.shape)

Datasets: full: (1309, 12) train set: (891, 12)


In [106]:
# Data  check
print('Name', full[pd.isnull(full['Name'])==True].empty)
print('PassengerId', full[pd.isnull(full['PassengerId'])==True].empty)
print('Survived', train[pd.isnull(train['Survived'])==True].empty)
print('Pclass', full[pd.isnull(full['Pclass'])==True].empty)
print('Sex', full[pd.isnull(full['Sex'])==True].empty)
print('Age', full[pd.isnull(full['Age'])==True].empty)
print('SibSp', full[pd.isnull(full['SibSp'])==True].empty)
print('Parch', full[pd.isnull(full['Parch'])==True].empty)
print('Ticket', full[pd.isnull(full['Ticket'])==True].empty)
print('Fare', full[pd.isnull(full['Fare'])==True].empty)
print('Cabin', full[pd.isnull(full['Cabin'])==True].empty)
print('Embarked', full[pd.isnull(full['Embarked'])==True].empty)
# This allows to identify that Age,Cabin,Embark fields require some additional improvement (unless dropped)
# But we won't drop them because there are correlations between survival and age and cabin (deck) so we will
# try to do our best to fill them in
#It appeares that some of these tickets are group tickets, meaning that Fare per passenger should be ajusted

Name True
PassengerId True
Survived True
Pclass True
Sex True
Age False
SibSp True
Parch True
Ticket True
Fare False
Cabin False
Embarked False


In [107]:
repeating_tickets = pd.Series.value_counts(full['Ticket'])[pd.Series.value_counts(full['Ticket'])>1]
ticket_tags=repeating_tickets.index.tolist()
repeating_surnames=pd.Series.value_counts(train.Name.str.extract('([A-Z][a-z]*)\,', expand=False)).index.tolist()
for ticket_tag in ticket_tags:
    full.loc[full['Ticket'] == ticket_tag, ['Fare']] = full[full['Ticket'] == ticket_tag]['Fare'].mul(1/repeating_tickets[ticket_tag])
# category Names contains too much detail, lets take out the essential: Family surname (to buld family variable)
# and title that will give insight into passenger status as well as the age.
# Finding Group tickets and changing Ticket fare per group member rather than total
#Because loc was used it is better to move to the other Cell

In [108]:
# Fares now are per-person normalized. Lets see If we can restrore some cabin data from the tickets.
for ticket_tag in ticket_tags:
    if not np.all(pd.isnull(full[full['Ticket'] == ticket_tag] ['Cabin'])):
        if np.any(pd.isnull(full[full['Ticket'] == ticket_tag] ['Cabin'])):
            fillnan_value  = full[full['Ticket'] == ticket_tag].Cabin.dropna().mode()[0]
            full.loc[full['Ticket'] == ticket_tag,['Cabin']] = full[full['Ticket'] == ticket_tag].Cabin.fillna(fillnan_value)
fillnan_value='U'
full['Cabin']=full['Cabin'].replace('', fillnan_value)
full['Cabin']=full['Cabin'].replace(np.nan, fillnan_value)
print('Cabin', full[pd.isnull(full['Cabin'])==True].empty)
# with that cabin variable is set and ready to be turned into a categorial "deck"d
full.Cabin = full.Cabin.map( lambda c : c[0] )
cabin_dummies = pd.get_dummies(full['Cabin'], prefix='Cabin') 
full = pd.concat([full, cabin_dummies], axis=1)

Cabin True


In [109]:
# Fares and ports time. My guess that ticket fare strongly depends on emberkment port
# Also both of the missing values seem to be from the same port
common_port=full.Embarked.dropna().mode()[0]
full.Embarked = full.Embarked.fillna(common_port)
fill_fare_value = full.groupby('Embarked').mean()['Fare'][2]
full.Fare = full.Fare.fillna(fill_fare_value)

In [110]:
# Lets work on a putting together SibSp and Parch columns. They represent a family size for a given passenger
# Also, it would be preffered to categorize them as propability of survival based on the family size is 
# not linear, and variable itself takes small range of values (0-7) to model it with curve.
full[ 'FamilySize' ] = full[ 'Parch' ] + full[ 'SibSp' ] + 1
full[ 'Family_Single' ] = full[ 'FamilySize' ].map( lambda s : 1 if s == 1 else 0 )
full[ 'Family_Small' ]  = full[ 'FamilySize' ].map( lambda s : 1 if 2 <= s <= 4 else 0 )
full[ 'Family_Large' ]  = full[ 'FamilySize' ].map( lambda s : 1 if 5 <= s else 0 )
full[ 'Embarked_C' ] = full[ 'Embarked' ].map( lambda s : 1 if s == 'C' else 0 )
full[ 'Embarked_Q' ] = full[ 'Embarked' ].map( lambda s : 1 if s == 'Q' else 0 )
full[ 'Embarked_S' ] = full[ 'Embarked' ].map( lambda s : 1 if s == 'S' else 0 )
full[ 'Pclass_First' ] = full[ 'Pclass' ].map( lambda s : 1 if s == 1 else 0 )
full[ 'Pclass_Second' ] = full[ 'Pclass' ].map( lambda s : 1 if s == 2 else 0 )
full[ 'Pclass_Third' ] = full[ 'Pclass' ].map( lambda s : 1 if s == 3 else 0 )
#full=full.drop(['Parch','SibSp','FamilySize','Ticket','PassengerId'],axis=1, inplace=False)
print(full.columns)

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket', 'Cabin_A', 'Cabin_B',
       'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_T',
       'Cabin_U', 'FamilySize', 'Family_Single', 'Family_Small',
       'Family_Large', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Pclass_First', 'Pclass_Second', 'Pclass_Third'],
      dtype='object')


In [111]:
# Data preparation

full['Title'] = full.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
full['Title'] = full['Title'].replace(['Capt', 'Col','Dr', 'Major', 'Rev'], 'Crew')
full['Title'] = full['Title'].replace(['Don'], 'Mr')
full['Title'] = full['Title'].replace(['Mlle','Ms','Dona'],'Miss')
full['Title'] = full['Title'].replace(['Mme'], 'Mrs')
full['Title'] = full['Title'].replace(['Lady','Countess','Sir', 'Jonkheer'], 'Noble')
full[ 'Title_Mr' ] = full[ 'Title' ].map( lambda s : 1 if s == 'Mr' else 0 )
full[ 'Title_Miss' ] = full[ 'Title' ].map( lambda s : 1 if s == 'Miss' else 0 )
full[ 'Title_Mrs' ] = full[ 'Title' ].map( lambda s : 1 if s == 'Mrs' else 0 )
full[ 'Title_Noble' ] = full[ 'Title' ].map( lambda s : 1 if s == 'Noble' else 0 )
full[ 'Title_Crew' ] = full[ 'Title' ].map( lambda s : 1 if s == 'Crew' else 0 )
full[ 'Sex_male' ] = full[ 'Sex' ].map( lambda s : 1 if s == 'male' else 0 )
full[ 'Sex_female' ] = full[ 'Sex' ].map( lambda s : 1 if s == 'female' else 0 )

In [112]:
# Last step is to fill in Age values. As Title in the name clearly correlates with age of a person (Master)
# I would need to calculate grouped median values rather than using median age of a whole data
grouped_full =full.dropna().groupby(['Sex', 'Pclass','Title'])
grouped_median_full = grouped_full.median()
grouped_median_full = grouped_median_full.reset_index()[['Sex', 'Pclass', 'Title',  'Age']]                                                 
grouped_median_full.head()
def fill_age(row):
    condition = (
        (grouped_median_full['Sex'] == row['Sex']) & 
        (grouped_median_full['Title'] == row['Title']) & 
        (grouped_median_full['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_full[condition]['Age'].values[0]
full['Age'] = full.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)

In [113]:
#Time to drop some columns
full=full.drop(['Parch', 'SibSp', 'FamilySize', 'Ticket', 'PassengerId',\
                'Embarked','Pclass','Name','Title','Cabin', 'Sex'], axis=1, inplace=False)
train=full[:891]
test=full[891:].drop(['Survived'],axis=1)
print(full.columns)
print(test.columns)


Index(['Age', 'Fare', 'Survived', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D',
       'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_U', 'Family_Single',
       'Family_Small', 'Family_Large', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Pclass_First', 'Pclass_Second', 'Pclass_Third',
       'Title_Mr', 'Title_Miss', 'Title_Mrs', 'Title_Noble', 'Title_Crew',
       'Sex_male', 'Sex_female'],
      dtype='object')
Index(['Age', 'Fare', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E',
       'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_U', 'Family_Single',
       'Family_Small', 'Family_Large', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Pclass_First', 'Pclass_Second', 'Pclass_Third',
       'Title_Mr', 'Title_Miss', 'Title_Mrs', 'Title_Noble', 'Title_Crew',
       'Sex_male', 'Sex_female'],
      dtype='object')


In [187]:
#actuall fitting of survival
acc_score = make_scorer(accuracy_score)
cv = ShuffleSplit(n_splits=5, test_size=0.8)
x_norm = x_survival = train.drop('Survived',axis=1)
test_norm=test
y_survival = train['Survived']
scaler=StandardScaler()

#Predictors
logreg = LogisticRegression()
logreg.fit(x_norm, y_survival)
print('logistic reg score: ',logreg.score(x_survival, y_survival))

svc_surv=SVC(gamma=0.1,C=1)
svc_surv.fit(x_survival, y_survival)
print('SVC score: ',svc_surv.score(x_survival, y_survival))

x_norm[['Fare','Age']] = scaler.fit_transform(x_norm[['Fare','Age']])
svc_nsurv = SVC(gamma=1e-1,C=1)
svc_nsurv.fit(x_norm, y_survival)
print('Normalized SVC score: ',svc_nsurv.score(x_survival, y_survival))

knn = KNeighborsClassifier(algorithm='auto', leaf_size=26, metric='minkowski', 
                           metric_params=None, n_jobs=1, n_neighbors=6, p=2, 
                           weights='uniform')
knn.fit(x_norm, y_survival)
print(k,' nn score: ',knn.score(x_survival, y_survival))

gaussian = GaussianNB()
gaussian.fit(x_survival, y_survival)
print('Naive Bayes',gaussian.score(x_survival, y_survival))

bestdecision_tree = DecisionTreeClassifier(max_depth=10, max_features='log2',\
                                           max_leaf_nodes=9, min_samples_leaf=9)
decision_tree = DecisionTreeClassifier(max_depth=17, max_features='log2',\
                                       max_leaf_nodes=8, min_samples_leaf=8)
decision_tree.fit(x_survival, y_survival)
bestdecision_tree.fit(x_survival, y_survival)
print('DecisionTree ',decision_tree.score(x_survival, y_survival))
print('Best DecisionTree ',bestdecision_tree.score(x_survival, y_survival))

random_forest = RandomForestClassifier(n_estimators=205, max_depth=7, min_samples_split=6,\
                                       max_features='log2', min_samples_leaf=6)
random_forest.fit(x_survival, y_survival)
print('Random Forest',random_forest.score(x_survival, y_survival))

method=[
        logreg, svc_surv, svc_nsurv,
        knn ,gaussian, decision_tree,
        bestdecision_tree, random_forest
        ]
scores=np.zeros(8)
scores[0] = cross_val_score(logreg,x_survival,y_survival,scoring=acc_score,cv=cv).mean()
scores[1] = cross_val_score(svc_surv,x_survival,y_survival,scoring=acc_score,cv=cv).mean()
scores[2] = cross_val_score(svc_surv,x_norm,y_survival,scoring=acc_score,cv=cv).mean()
scores[3] = cross_val_score(knn,x_survival,y_survival,scoring=acc_score,cv=cv).mean()
scores[4] = cross_val_score(gaussian,x_survival,y_survival,scoring=acc_score,cv=cv).mean()
scores[5] = cross_val_score(decision_tree,x_survival,y_survival,scoring=acc_score,cv=cv).mean()
scores[6] = cross_val_score(bestdecision_tree,x_survival,y_survival,scoring=acc_score,cv=cv).mean()
scores[7] = cross_val_score(random_forest,x_survival,y_survival,scoring=acc_score,cv=cv).mean()
print('Accuracy Logistic ', scores[0])
print('Accuracy SVC ',scores[1])
print('Accuracy NSVC ',scores[2])
print('Accuracy NN ',scores[3])
print('Accuracy Bayes ',scores[4])
print('Accuracy Tree ',scores[5])
print('Accuracy Best Tree ',scores[6])
print('Accuracy Best forest ',scores[7])
methodId=np.argmax(scores)
print('Best method', str(method[methodId]), ' with score: ', scores[methodId])
test_norm[['Fare','Age']] = scaler.fit_transform(test[['Fare','Age']])
predictions = method[methodId].predict(test_norm).astype(int)
output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions})
output.to_csv('titanic-predictions.csv', index = False)

logistic reg score:  0.8372615039281706
SVC score:  0.8967452300785634
Normalized SVC score:  0.8395061728395061
20  nn score:  0.8507295173961841
Naive Bayes 0.7789001122334456
DecisionTree  0.7845117845117845
Best DecisionTree  0.7542087542087542
Random Forest 0.8529741863075196
Accuracy Logistic  0.8095371669004209
Accuracy SVC  0.8039270687237027
Accuracy NSVC  0.8196353436185133
Accuracy NN  0.8022440392706873
Accuracy Bayes  0.7450210378681626
Accuracy Tree  0.7831697054698457
Accuracy Best Tree  0.7640953716690042
Accuracy Best forest  0.7896213183730716
Best method SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)  with score:  0.8196353436185133


In [175]:
#parameters fiddling
#x_survival[['Fare','Age']] = scaler.fit_transform(x_survival[['Fare','Age']])
#C_range = np.logspace(-2,7, 10)
#gamma_range = np.logspace(-3,3, 7)
md_range=np.arange(1,10); md_range[0] = 6
#mxf_range=np.arange(1,18); mxf_range[0] = 12
#lnodes_range=np.arange(1,10); lnodes_range[0] = 9
#msleaf_range=np.arange(10); msleaf_range[0] = 8
forest_range=np.arange(205,210)
sample_split=np.arange(6,7)
sample_leaf=np.arange(6,10)
boot_range=[True]
#SVC_param_grid = dict(gamma=gamma_range, C=C_range)
#DT_param_grid = dict( max_depth=md_range,max_features=mxf_range,max_leaf_nodes=lnodes_range,min_samples_leaf=msleaf_range)
RF_param_grid=dict(n_estimators=forest_range, max_depth=md_range,\
                   min_samples_split=sample_split, min_samples_leaf=sample_leaf)
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.9)
r_f=RandomForestClassifier()
grid = GridSearchCV(r_f, param_grid=RF_param_grid, cv=cv, verbose=1, n_jobs=-1,  scoring=acc_score)
grid.fit(x_survival, y_survival)
print("The best parameters for method are %s with a score of %0.4f"
     % (grid.best_params_, grid.best_score_))

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1800 out of 1800 | elapsed:  4.0min finished


The best parameters for method are {'n_estimators': 205, 'min_samples_leaf': 6, 'max_depth': 7, 'min_samples_split': 6} with a score of 0.7964
