In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import jaccard_score, f1_score, confusion_matrix, accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

In [None]:
#Function to draw confusion matrices

def plot_confusion_matrix(y,y_predict):
    """Draw a labelled confusion-matrix heatmap and show it.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_predict : array-like
        Predicted class labels, aligned with ``y``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``; nothing is returned.
    """
    cm = confusion_matrix(y, y_predict)
    # Draw on a dedicated figure/axes pair instead of whatever axes pyplot
    # currently considers "active".
    _, ax = plt.subplots()
    # fmt='d' prints the cell counts as plain integers; the default '.2g'
    # format shows counts of 100 or more in scientific notation (e.g. 1e+02).
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['Did not Survived', 'Survived'])
    ax.yaxis.set_ticklabels(['Did not Survived', 'Survived'])
    plt.show()

In [None]:
# Load the training split (it carries the `Survived` column used as the target later on).
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
train_data.head(n=5)

In [None]:
# Load the competition test split that the submissions are predicted for.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")
test_data.head(n=5)

In [None]:
# Column dtypes of the raw training frame, inspected before any cleaning or encoding.
train_data.dtypes

In [None]:
#train_data.info

In [None]:
# Pairwise relationship grid over the training frame, coloured by the `Survived` label.
sns.pairplot(data=train_data, hue="Survived")

In [None]:
#sns.catplot(y="Fare", x="Survived", hue='Sex', kind="bar", data=train_data)

In [None]:
#sns.lmplot(y="Age", x="Survived", hue='Sex', data=train_data)

In [None]:
# Fill the gaps in the training frame:
#   * Embarked -> its most frequent value (noted as 'S' by the original author)
#   * Age      -> the column mean (noted as 29 by the original author)
#
# The filled column is assigned back to the frame. Calling
# replace(..., inplace=True) on `train_data[col]` acts on an intermediate
# Series; with pandas Copy-on-Write that no longer updates `train_data`.
embarked_mode = train_data['Embarked'].value_counts().idxmax()
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

# Percentage of missing values per column, relative to the total number of rows
# (dividing by count() would compare against the non-null rows only).
train_data.isnull().mean()*100

In [None]:
# Remove the columns with "non-repeating values", one-hot encode the
# categorical columns, then cast the whole frame to float.
train_data = (
    pd.get_dummies(
        data=train_data.drop(columns=['Name', 'Ticket', 'Cabin']),
        columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'],
    )
    .astype(float)
)
train_data.head()

In [None]:
# Sanity check: evaluates to True if any cell of the encoded training frame is still NaN.

train_data.isna().to_numpy().any()

In [None]:
# Separate the target vector from the feature matrix (every column except `Survived`).
# NOTE(review): PassengerId is presumably still among the feature columns at
# this point - confirm that an identifier is meant to be a model input.

y = train_data['Survived'].to_numpy()
features = train_data.drop(columns='Survived').to_numpy()

In [None]:
# Standardise the feature matrix; `transform` keeps the fitted scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so the hold-out rows influence the scaling statistics.

transform = preprocessing.StandardScaler()
features = transform.fit_transform(features)

In [None]:
# Hold out 20% of the rows for model comparison; the fixed seed keeps the split repeatable.

x_train, x_test, y_train, y_test = train_test_split(
    features,
    y,
    random_state=10,
    test_size=0.2,
)

In [None]:
# Baseline k-nearest-neighbours with the library's default parameters,
# scored on the hold-out split.

KNN = KNeighborsClassifier().fit(x_train, y_train)
predictions_knn = KNN.predict(x_test)

knn_acc = accuracy_score(y_test, predictions_knn)
knn_jac = jaccard_score(y_test, predictions_knn, pos_label=0)
# Default (binary) F1, matching how F1 is computed for the other models that
# share the comparison table; a weighted average here is not comparable.
knn_f1 = f1_score(y_test, predictions_knn)
#print(' Accuracy:', knn_acc, '\n', 'Jaccard:', knn_jac, '\n', 'F1:', knn_f1)

In [None]:
# Baseline decision tree, scored on the hold-out split.
# Without a seed the accuracy moved between 69% and 88% from run to run
# (original author's observation), so random_state is pinned to the same
# seed used for the split to make this baseline reproducible.

Tree = DecisionTreeClassifier(random_state=10).fit(x_train,y_train)
predictions_tree = Tree.predict(x_test)

tree_acc = accuracy_score(y_test, predictions_tree)
tree_jac = jaccard_score(y_test, predictions_tree, pos_label=0)
tree_f1 = f1_score(y_test, predictions_tree)
#print(' Accuracy:', tree_acc, '\n', 'Jaccard:', tree_jac, '\n', 'F1:', tree_f1)

In [None]:
# Baseline logistic regression with default settings, scored on the hold-out split.
LR = LogisticRegression()
LR.fit(x_train, y_train)
predictions_lr = LR.predict(x_test)

log_acc, log_jac, log_f1 = (
    accuracy_score(y_test, predictions_lr),
    jaccard_score(y_test, predictions_lr, pos_label=0),
    f1_score(y_test, predictions_lr),
)
#print(' Accuracy:', log_acc, '\n', 'Jaccard:', log_jac, '\n', 'F1:', log_f1)

In [None]:
# Baseline support-vector classifier with default settings, scored on the hold-out split.
SVM = svm.SVC()
SVM.fit(x_train, y_train)
predictions_svm = SVM.predict(x_test)

svm_acc, svm_jac, svm_f1 = (
    accuracy_score(y_test, predictions_svm),
    jaccard_score(y_test, predictions_svm, pos_label=0),
    f1_score(y_test, predictions_svm),
)
#print(' Accuracy:', svm_acc, '\n', 'Jaccard:', svm_jac, '\n', 'F1:', svm_f1)

In [None]:
# Random forest with the tuned settings taken from the tutorial, kept as the
# reference to beat. (Original note: sadly, it still looks better.)

tree_rnd = RandomForestClassifier(max_depth=10, n_estimators=1000, random_state=10)
tree_rnd.fit(x_train, y_train)
predictions_rnd = tree_rnd.predict(x_test)

rnd_acc = accuracy_score(y_test, predictions_rnd)
rnd_f1 = f1_score(y_test, predictions_rnd)
rnd_jac = jaccard_score(y_test, predictions_rnd, pos_label=0)
print(' Accuracy:', rnd_acc, '\n', 'Jaccard:', rnd_jac, '\n', 'F1:', rnd_f1)

In [None]:
# Side-by-side hold-out metrics for the four baseline models, one row per model.
Report = pd.DataFrame(
    [
        ('kNN', knn_acc, knn_jac, knn_f1),
        ('DTree', tree_acc, tree_jac, tree_f1),
        ('LogReg', log_acc, log_jac, log_f1),
        ('SVM', svm_acc, svm_jac, svm_f1),
    ],
    columns=['Model', 'Accuracy Score', 'Jaccard Score', 'F1-score'],
)
Report

In [None]:
# Tune the best-looking models, starting with the decision tree:
# exhaustive 5-fold grid search over the tree's main hyper-parameters.

parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': list(range(2, 20, 2)),
     'max_features': ['log2', 'sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10]}
# The grid includes a random splitter and feature sub-sampling, both of which
# draw random numbers; seeding the estimator makes the reported best
# parameters repeatable between runs.
Tree = DecisionTreeClassifier(random_state=10)

tree_cv = GridSearchCV(estimator=Tree, cv=5, param_grid=parameters).fit(x_train, y_train)

print("best parameters: ",tree_cv.best_params_)
print("accuracy: ","%.5f" % tree_cv.best_score_)

In [None]:
# Decision tree with hyper-parameters gathered from various grid-search runs,
# scored on the hold-out split.
# max_features='sqrt' samples candidate features at random, so the estimator
# is seeded to keep the printed metrics and the confusion matrix repeatable.

Tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=12,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=5,
    splitter='best',
    random_state=10,
).fit(x_train,y_train)
predictions_tree = Tree.predict(x_test)

tree_acc = accuracy_score(y_test, predictions_tree)
tree_jac = jaccard_score(y_test, predictions_tree, pos_label=0)
tree_f1 = f1_score(y_test, predictions_tree)
print(' Accuracy:', tree_acc, '\n', 'Jaccard:', tree_jac, '\n', 'F1:', tree_f1)
plot_confusion_matrix(y_test,predictions_tree)

In [None]:
# Logistic-regression tuning: 5-fold grid search over the regularisation
# strength and the solver, always with the l2 penalty.

parameters = dict(
    C=[0.001, 0.01, 0.1, 1],
    penalty=['l2'],
    solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
)
LR = LogisticRegression()

logreg_cv = GridSearchCV(estimator=LR, param_grid=parameters, cv=5)
logreg_cv.fit(x_train, y_train)

print("best parameters: ",logreg_cv.best_params_)
print("accuracy: ","%.5f" % logreg_cv.best_score_)

In [None]:
# Logistic regression with the newton-cholesky solver, which gave the best
# hold-out accuracy even though the grid search recommended lbfgs.

LR = LogisticRegression(solver='newton-cholesky')
LR.fit(x_train, y_train)
predictions_lr = LR.predict(x_test)

log_acc, log_jac, log_f1 = (
    accuracy_score(y_test, predictions_lr),
    jaccard_score(y_test, predictions_lr, pos_label=0),
    f1_score(y_test, predictions_lr),
)
print(' Accuracy:', log_acc, '\n', 'Jaccard:', log_jac, '\n', 'F1:', log_f1)
plot_confusion_matrix(y_test,predictions_lr)

In [None]:
# None of the single models was satisfying, so compare three VotingClassifier
# ensembles. Each ensemble is fitted on the training split and scored on the
# hold-out split. Fitting and scoring on the same hold-out rows (as an earlier
# version of this cell did, reporting ~96%) measures memorisation, not
# generalisation.

def _evaluate_voting(estimators, voting):
    """Fit a VotingClassifier on the train split and score it on the hold-out split.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Members of the ensemble, passed straight to ``VotingClassifier``.
    voting : str
        ``'hard'`` or ``'soft'``.

    Returns
    -------
    tuple
        ``(model, predictions, cv_scores, accuracy, jaccard, f1)`` where
        ``predictions`` are for ``x_test``, ``cv_scores`` is the 5-fold
        cross-validation result on the training split, and ``jaccard`` is
        computed with ``pos_label=0``.
    """
    model = VotingClassifier(estimators=estimators, voting=voting).fit(x_train, y_train)
    predictions = model.predict(x_test)
    # Cross-validate on the training rows only, so the hold-out split stays unseen.
    cv_scores = cross_val_score(model, x_train, y_train, cv=5)
    return (
        model,
        predictions,
        cv_scores,
        accuracy_score(y_test, predictions),
        jaccard_score(y_test, predictions, pos_label=0),
        f1_score(y_test, predictions),
    )


# "Blank" members: library defaults. The tree is seeded so results are repeatable.
knn1 = KNeighborsClassifier()
tree1 = DecisionTreeClassifier(random_state=10)
lr1 = LogisticRegression()
svm1 = svm.SVC()

# "Tuned" members: hyper-parameters chosen in the tuning cells.
knn2 = KNeighborsClassifier(n_neighbors=8)
tree2 = DecisionTreeClassifier(criterion='entropy', max_depth=12, max_features='sqrt', min_samples_leaf=4, min_samples_split=5, splitter='best', random_state=10)
lr2 = LogisticRegression(solver='newton-cholesky')

#First run: Hard with Blank
votcl1, predictions_votcl1, crv1, votcl1_acc, votcl1_jac, votcl1_f1 = _evaluate_voting(
    [('kNN',knn1),('DTree',tree1),('LogReg',lr1),('SVM',svm1)], 'hard')

#Second run: Soft with Blank
votcl2, predictions_votcl2, crv2, votcl2_acc, votcl2_jac, votcl2_f1 = _evaluate_voting(
    [('kNN',knn1),('DTree',tree1),('LogReg',lr1)], 'soft')

#Third run: Soft with Tuned
votcl3, predictions_votcl3, crv3, votcl3_acc, votcl3_jac, votcl3_f1 = _evaluate_voting(
    [('kNN',knn2),('DTree',tree2),('LogReg',lr2)], 'soft')

Report = {'Model': ['Hard with Blank','Soft with Blank','Soft with Tuned'],
          'Acc': [votcl1_acc, votcl2_acc, votcl3_acc],
          'Jac': [votcl1_jac, votcl2_jac, votcl3_jac],
          'F1': [votcl1_f1, votcl2_f1, votcl3_f1],
          'CVS': [crv1.mean(),crv2.mean(),crv3.mean()]}
Report = pd.DataFrame(Report)
Report

In [None]:
# Confusion matrix of the third voting run ("Soft with Tuned") against the y_test labels.
plot_confusion_matrix(y_test,predictions_votcl3)

In [None]:
# Prepare the competition test frame with the same steps as the training frame.
# Differences from the training data:
#   * Parch contains rows with the value 9; they are recoded to 0
#     (presumably so that no Parch_9 dummy column appears - verify).
#   * Fare has a few NaNs, filled with the test-set mean.
#   * Age is filled with the test-set mean, which is higher here (about 30).
#
# Each cleaned column is assigned back to the frame. Calling
# replace(..., inplace=True) on `test_data[col]` acts on an intermediate
# Series; with pandas Copy-on-Write that no longer updates `test_data`.

test_data['Embarked'] = test_data['Embarked'].fillna("S")
test_data['Parch'] = test_data['Parch'].replace(9, 0)
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
test_data = test_data.drop(columns=['Name','Ticket','Cabin'])
test_data = pd.get_dummies(data=test_data, columns=['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked'])
test_data = test_data.astype(float)
test_data.head()
#test_data.shape

In [None]:
# Sanity check: evaluates to True if the prepared test frame still contains any NaN.

#test_data.isnull().sum()/test_data.count()*100
test_data.isna().to_numpy().any()

In [None]:
# Build the feature matrix used for the submissions.
#
# The submission models are trained on `features`, which was standardised by
# the fitted `transform` scaler, so the test rows must pass through that same
# scaler; feeding raw values to those models produces meaningless predictions.
# The columns are also aligned to the training layout first, so a dummy column
# that is missing from (or extra in) the test frame cannot shift the features.

train_columns = train_data.drop(columns='Survived').columns
X_test = transform.transform(
    test_data.reindex(columns=train_columns, fill_value=0.0).to_numpy()
)

In [None]:
#Submission 1. Tuned Decision Tree (want to believe it can be better than randomtree)
# Trained on the full training set. max_features='sqrt' samples features at
# random, so the tree is seeded to make the submission file reproducible.

m1 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=12,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=5,
    splitter='best',
    random_state=10,
).fit(features,y)
p1 = m1.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': p1})
# Ids and predictions are floats at this point; the submission uses integers.
output= output.astype(int)
output.to_csv('submission_dtree.csv', index=False)

#RESULT: 62.2%

In [None]:
# Submission 2: the tuned logistic regression (why not), refitted on the full training set.

m2 = LogisticRegression(solver='newton-cholesky')
m2.fit(features, y)
p2 = m2.predict(X_test)
# Both columns are cast to int before the CSV is written.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': p2}).astype(int)
output.to_csv('submission_lr.csv', index=False)

#RESULT: 45.9%

In [None]:
# Submission 3: soft voting over the default ("blank") members, which gave the
# best results on the training data; refitted on the full training set.

m3 = VotingClassifier(
    voting='soft',
    estimators=[('kNN', knn1), ('DTree', tree1), ('LogReg', lr1)],
)
m3.fit(features, y)
p3 = m3.predict(X_test)
# Both columns are cast to int before the CSV is written.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': p3}).astype(int)
output.to_csv('submission_vote_blank.csv', index=False)

#RESULT: 66.0%

In [None]:
# Submission 4: soft voting over the tuned members (kind of interested in the
# results); refitted on the full training set.

m4 = VotingClassifier(
    voting='soft',
    estimators=[('kNN', knn2), ('DTree', tree2), ('LogReg', lr2)],
)
m4.fit(features, y)
p4 = m4.predict(X_test)
# Both columns are cast to int before the CSV is written.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': p4}).astype(int)
output.to_csv('submission_vote_tuned.csv', index=False)

#RESULT: 71.5%

In [None]:
# Submission 5: the tuned random forest from the tutorial (just checking),
# refitted on the full training set.

m5 = RandomForestClassifier(max_depth=10, n_estimators=1000, random_state=10)
m5.fit(features, y)
p5 = m5.predict(X_test)
# Both columns are cast to int before the CSV is written.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': p5}).astype(int)
output.to_csv('submission_tutorial_tuned.csv', index=False)

#RESULT: 73.2%