In [141]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [142]:
df = pd.read_csv('titanic.csv', decimal=",")

In [143]:
total = df.isnull().sum().sort_values(ascending=False)
percent_1 = df.isnull().sum()/len(df)*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(10)

Unnamed: 0,Total,%
cabin,774,74.0
home.dest,361,34.5
embarked,2,0.2
fare,1,0.1
ticket,0,0.0
parch,0,0.0
sibsp,0,0.0
age,0,0.0
sex,0,0.0
name,0,0.0


In [144]:
k = df['embarked'].describe()
df['embarked'] = df['embarked'].fillna(k['top'])

i = (df['fare'] == np.nan).index[0]
list = df.index[df['pclass'] == df['pclass'][i]].tolist()
df['fare'] = df['fare'].fillna(np.mean(df['fare'][list]))

import re
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

df['cabin'] = df['cabin'].fillna("U0")
df['deck'] = df['cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
df['deck'] = df['deck'].map(deck)
df['deck'] = df['deck'].fillna(0)
df['deck'] = df['deck'].astype(int)
df = df.drop(['cabin'], axis=1)

#categorical age and fare
df['age_cat'] = pd.qcut(df['age'],7, labels=[0, 1, 2, 3, 4, 5, 6])
df['fare_cat'] = pd.qcut(df['fare'],7, labels=[0, 1, 2, 3, 4, 5, 6])

In [145]:
df = df.drop(['ticket', 'name', 'home.dest'], axis = 1)

In [146]:
df_reg = pd.get_dummies(df.drop(['fare', 'age'], axis = 1), columns = ['embarked', 'sex', 'age_cat', 'fare_cat', 'deck'], drop_first=True)
df_tree = pd.get_dummies(df.drop(['fare_cat', 'age_cat', ], axis = 1), columns = ['embarked', 'sex', 'deck'])

In [147]:
X_tree = df_tree.drop(['survived'], axis=1)
X_reg = df_reg.drop(['survived'], axis=1)
y_tree = df_tree['survived']
y_reg = df_reg['survived']
X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(X_tree, y_tree, test_size=0.3, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

In [148]:
np.linspace(2,40,39)

array([ 2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,
       15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
       28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40.])

In [155]:
grid_tree = {"max_depth":[5, 10, 15, 20, 25]}

decision_tree = DecisionTreeClassifier()
decision_tree_cv = GridSearchCV(decision_tree, grid_tree)

decision_tree_cv.fit(X_train_tree, y_train_tree)  
y_pred_tree = decision_tree_cv.predict(X_test_tree)
score_decision_tree = accuracy_score(y_pred_tree, y_test_tree)

In [156]:
scaler = StandardScaler()
X_train_reg_std = scaler.fit_transform(X_train_reg)
X_test_reg_std = scaler.fit_transform(X_test_reg)

In [157]:
grid_reg = {"C":[0.001, 0.01, 0.1, 1, 10, 100]}

logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, grid_reg)

logreg_cv.fit(X_train_reg_std, y_train_reg)
y_pred_reg = logreg_cv.predict(X_test_reg_std)
score_log_reg = accuracy_score(y_pred_reg, y_test_reg)

In [158]:
print("Decision tree accuracy: ", score_decision_tree)
print("Logistic regression accuracy: ", score_log_reg)
print("Logistic regression c parameter from gridsearch ", logreg_cv.best_params_)
print("Decision tree max_depth parameter from gridsearch ", decision_tree_cv.best_params_)

Decision tree accuracy:  0.7707006369426752
Logistic regression accuracy:  0.802547770700637
Logistic regression c parameter from gridsearch  {'C': 10}
Decision tree max_depth parameter from gridsearch  {'max_depth': 5}
