In [32]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing, tree
from sklearn.metrics import auc, confusion_matrix, roc_auc_score, roc_curve, precision_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from yellowbrick.model_selection import LearningCurve
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import regex as re

# 1. Data loading and exploring

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
whole_data = pd.concat([train_data, test_data], ignore_index=True)

In [3]:
whole_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
whole_data.shape

(1309, 12)

In [5]:
whole_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [6]:
whole_data.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
whole_data.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [8]:
whole_data.Sex.value_counts(dropna=False)

male      843
female    466
Name: Sex, dtype: int64

In [9]:
whole_data.Embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: Embarked, dtype: int64

# 2. Feature engineering

In [10]:
whole_data = whole_data.drop(columns=['Name', 'Ticket', 'Cabin'])

In [11]:
whole_data = pd.get_dummies(whole_data)

In [12]:
whole_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [13]:
whole_data = whole_data.drop(columns='Sex_female')

In [14]:
whole_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

# 3. Filling in missing values

In [15]:
whole_data.isnull().sum()

PassengerId      0
Survived       418
Pclass           0
Age            263
SibSp            0
Parch            0
Fare             1
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
dtype: int64

In [16]:
whole_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0.0,3,22.0,1,0,7.25,1,0,0,1
1,2,1.0,1,38.0,1,0,71.2833,0,1,0,0
2,3,1.0,3,26.0,0,0,7.925,0,0,0,1
3,4,1.0,1,35.0,1,0,53.1,0,0,0,1
4,5,0.0,3,35.0,0,0,8.05,1,0,0,1


In [17]:
whole_data[whole_data.Fare.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_Q,Embarked_S
1043,1044,,3,60.5,0,0,,1,0,0,1


In [18]:
whole_data.Fare.fillna(whole_data[whole_data.Pclass == 3].Fare.mean(), inplace=True)

In [26]:
whole_data.Age.fillna(whole_data.Age.mean(), inplace=True)

In [27]:
# Selecting data to be predicted
X_pred = whole_data[whole_data.Survived.isnull()].drop(['Survived'], axis=1)

# Selecting training data
train_data = whole_data[whole_data.Survived.notnull()]

In [28]:
train_data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [29]:
X_pred.isnull().sum()

PassengerId    0
Pclass         0
Age            0
SibSp          0
Parch          0
Fare           0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

# 4. Prediction

In [30]:
# Fetching data
X, y = train_data.drop('Survived', axis=1), train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

__Simple model__

In [33]:
from sklearn.dummy import DummyClassifier
bm = DummyClassifier()
bm.fit(X_train, y_train)
bm.score(X_test, y_test)

0.503731343283582

In [34]:
precision_score(y_test, bm.predict(X_test))

0.3627450980392157

__Building a model using a more suitable selection method__

In [55]:
X_train_test = pd.concat([X_train, X_test])
y_train_test = pd.concat([y_train, y_test])
models, models_scores = [], []

In [56]:
# RandomForest
rf_model = RandomForestClassifier(
    n_estimators=100, random_state=42
)
rf_model.fit(X_train, y_train)
models.append(rf_model)
models_scores.append(np.mean([rf_model.score(X_test, y_test),
precision_score(y_test, rf_model.predict(X_test))]))

In [57]:
# LogisticRegression
log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)
models.append(log_reg_model)
models_scores.append(np.mean([log_reg_model.score(X_test, y_test),
precision_score(y_test, log_reg_model.predict(X_test))]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
# DecisionTreeClassifier
dec_tree_model = DecisionTreeClassifier()
dec_tree_model.fit(X_train, y_train)
models.append(dec_tree_model)
models_scores.append(np.mean([dec_tree_model.score(X_test, y_test),
precision_score(y_test, dec_tree_model.predict(X_test))]))

In [59]:
# KNeighborsClassifier
k_neigh_model = KNeighborsClassifier()
k_neigh_model.fit(X_train, y_train)
models.append(k_neigh_model)
models_scores.append(np.mean([k_neigh_model.score(X_test, y_test),
precision_score(y_test, k_neigh_model.predict(X_test))]))

In [60]:
# GaussianNB
gauss_nb_model = GaussianNB()
gauss_nb_model.fit(X_train, y_train)
models.append(gauss_nb_model)
models_scores.append(np.mean([gauss_nb_model.score(X_test, y_test),
precision_score(y_test, gauss_nb_model.predict(X_test))]))

In [61]:
# SVC
svc_model = SVC()
svc_model.fit(X_train, y_train)
models.append(svc_model)
models_scores.append(np.mean([svc_model.score(X_test, y_test),
precision_score(y_test, svc_model.predict(X_test))]))

In [62]:
better_model = models[models_scores.index(max(models_scores))]

In [63]:
result = better_model.predict(X_pred)
submission = pd.DataFrame({'PassengerId':X_pred.PassengerId,'Survived':result})
submission.Survived = submission.Survived.astype(int)
print(submission.shape)
filename = 'Titanic Predictions №2.csv'
submission.to_csv(filename,index=False)
print('Saved file: ' + filename)

(418, 2)
Saved file: Titanic Predictions №2.csv


In [64]:
submission.head()

Unnamed: 0,PassengerId,Survived
891,892,0
892,893,0
893,894,0
894,895,0
895,896,0


In [65]:
models

[RandomForestClassifier(random_state=42),
 LogisticRegression(),
 DecisionTreeClassifier(),
 KNeighborsClassifier(),
 GaussianNB(),
 SVC()]

In [71]:
for i in models:
    result = i.predict(X_pred)
    submission = pd.DataFrame({'PassengerId':X_pred.PassengerId,'Survived':result})
    submission.Survived = submission.Survived.astype(int)
    filename = 'Titanic Predictions ' + str(i)[:3] + '.csv'
    submission.to_csv(filename,index=False)

In [67]:
models_scores

[0.8215174129353233,
 0.7844112769485904,
 0.7478473019517795,
 0.6394278606965174,
 0.7746607869742198,
 0.6478544776119404]