In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
df_train=pd.read_csv("/Users/nabilch/Downloads/titanic/train.csv")
df_test=pd.read_csv("/Users/nabilch/Downloads/titanic/test.csv")
df_all=pd.concat([df_train, df_test])

check and fix missing values and data exploration

In [2]:
missing_data_train=pd.isnull(df_train).sum()
missing_data_test=pd.isnull(df_test).sum()
summary_train=df_train.describe()
summary_test=df_test.describe()
median_fare=np.nanmedian(df_all['Fare'])
df_test["Fare"]=df_test["Fare"].fillna(median_fare)
df_all['Title']=df_all.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
df_train['Title']=df_train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
df_test['Title']=df_test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
all_titles=(np.unique(df_all['Title'], return_counts=True))
titles=["Master", "Miss", "Mr", "Mrs"]
df_all.loc[(~df_all['Title'].isin(titles)), 'Title'] = "Other"
titledummies=pd.get_dummies(df_all[['Title']], prefix_sep='_', dtype=int)
df_all=pd.concat([df_all, titledummies], axis=1)
df_test.loc[(~df_test['Title'].isin(titles)), 'Title'] = "Other"
titledummies=pd.get_dummies(df_test[['Title']], prefix_sep='_', dtype=int)
df_test=pd.concat([df_test, titledummies], axis=1)
df_train.loc[(~df_train['Title'].isin(titles)), 'Title'] = "Other"
titledummies=pd.get_dummies(df_train[['Title']], prefix_sep='_', dtype=int)
df_train=pd.concat([df_train, titledummies], axis=1)

title_age = df_all.groupby('Title')['Age'].median()
titles=["Master", "Miss", "Mr", "Mrs", "Other"]
for title in titles:
    df_all.loc[(df_all['Age'].isnull()) & (df_all['Title'] == title), 'Age'] = title_age[title]
    df_train.loc[(df_train['Age'].isnull()) & (df_train['Title'] == title), 'Age'] = title_age[title]
    df_test.loc[(df_test['Age'].isnull()) & (df_test['Title'] == title), 'Age'] = title_age[title]
df_train['Age_group'] = pd.cut(df_train['Age'], bins=[0,10,20,40,120], labels=['Children','Teenage','Adult','Elder'])
df_test['Age_group'] = pd.cut(df_test['Age'], bins=[0,10,20,40,120], labels=['Children','Teenage','Adult','Elder'])
df_all['Age_group'] = pd.cut(df_all['Age'], bins=[0,10,20,40,120], labels=['Children','Teenage','Adult','Elder'])
df_train['Fare_bin'] = pd.cut(df_train['Fare'], bins=[0,7.91,14.45,31,120], labels=['Low_fare','median_fare','Average_fare','high_fare'])
df_test['Fare_bin'] = pd.cut(df_test['Fare'], bins=[0,7.91,14.45,31,120], labels=['Low_fare','median_fare','Average_fare','high_fare'])
df_all['Fare_bin'] = pd.cut(df_all['Fare'], bins=[0,7.91,14.45,31,120], labels=['Low_fare','median_fare','Average_fare','high_fare'])
df_all['Embarked']=df_all['Embarked'].fillna('S')
df_test['Embarked']=df_test['Embarked'].fillna('S')
df_train['Embarked']=df_train['Embarked'].fillna('S')
df_train = pd.get_dummies(df_train, columns = ["Sex", 'Pclass', "Embarked", "Age_group", 'Fare_bin'], prefix_sep="_", dtype=int)
df_test = pd.get_dummies(df_test, columns = ["Sex", 'Pclass', "Embarked", "Age_group", 'Fare_bin'], prefix_sep="_", dtype=int)
df_all['family']=df_all['SibSp']+df_all['Parch']+1
df_train['family']=df_train['SibSp']+df_train['Parch']+1
df_test['family']=df_test['SibSp']+df_test['Parch']+1

In [3]:
df_train=df_train.drop(['Age', 'Fare', 'Title', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis= 1)
df_test=df_test.drop(['Age', 'Fare', 'Title', 'Name', 'Ticket', 'Cabin'], axis= 1)
predictors=df_train.drop(['Survived'], axis= 1)
target=df_train['Survived']
X_train, X_val, y_train, y_val=train_test_split(predictors, target, test_size=.2, random_state=10)
kfold=KFold(n_splits=10,random_state=0,shuffle=True)

In [4]:
log_reg_model=LogisticRegression()
log_reg_model.fit(X_train, y_train)
print('Logistic accuracy train: ', log_reg_model.score(X_train, y_train))
print('Logistic accuracy validation: ', accuracy_score(y_val, log_reg_model.predict(X_val)))
cv_result = cross_val_score(log_reg_model,predictors,target, cv = kfold, scoring = "accuracy")
print('Logistic accuracy cross-validation: ', cv_result.mean())
log_reg_model.fit(predictors, target)
df_test_predictors=df_test.drop(['PassengerId'], axis= 1)
predictions=log_reg_model.predict(df_test_predictors)
df_submission=pd.DataFrame({'PassengerID': df_test['PassengerId'], 'Survived': predictions})
df_submission.to_csv('log_reg_titanic.csv', index=False)

Logistic accuracy train:  0.8202247191011236
Logistic accuracy validation:  0.8547486033519553
Logistic accuracy cross-validation:  0.8192883895131086


In [5]:
rand_forest_model = RandomForestClassifier(n_estimators=20, max_depth=7)
rand_forest_model.fit(X_train,y_train)
print('Random forest accuracy train: ', rand_forest_model.score(X_train, y_train))
print('Random forest accuracy validation: ', accuracy_score(y_val, rand_forest_model.predict(X_val)))
cv_result = cross_val_score(rand_forest_model, predictors,target, cv = kfold, scoring = "accuracy")
print('Random forest accuracy cross-validation: ', cv_result.mean())
rand_forest_model.fit(predictors, target)
predictions=rand_forest_model.predict(df_test_predictors)
df_submission=pd.DataFrame({'PassengerID': df_test['PassengerId'], 'Survived': predictions})
df_submission.to_csv('rand_forest_titanic.csv', index=False)

Random forest accuracy train:  0.8623595505617978
Random forest accuracy validation:  0.8268156424581006
Random forest accuracy cross-validation:  0.8192883895131086


In [6]:
grad_boost_model = GradientBoostingClassifier(n_estimators=20,max_depth=4)
grad_boost_model.fit(X_train,y_train)
print('XGBoost accuracy train: ', grad_boost_model.score(X_train, y_train))
print('XGBoost accuracy validation: ', accuracy_score(y_val, grad_boost_model.predict(X_val)))
cv_result = cross_val_score(grad_boost_model, predictors,target, cv = kfold, scoring = "accuracy")
print('XGBoost accuracy cross-validation: ', cv_result.mean())
grad_boost_model.fit(predictors, target)
predictions=grad_boost_model.predict(df_test_predictors)
df_submission=pd.DataFrame({'PassengerID': df_test['PassengerId'], 'Survived': predictions})
df_submission.to_csv('grad_boost_titanic.csv', index=False)

XGBoost accuracy train:  0.8497191011235955
XGBoost accuracy validation:  0.8268156424581006
XGBoost accuracy cross-validation:  0.8215480649188514
