In [None]:
# Mount Google Drive at /content/drive; the CSV files read below live under
# that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Imports

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
%matplotlib inline

### Reading the data

In [None]:
# Load the labelled training set and lower-case every column name in one step.
trainData = pd.read_csv("/content/drive/My Drive/train.csv").rename(columns=str.lower)


In [None]:
# Load the unlabelled test set and lower-case every column name in one step.
testData = pd.read_csv("/content/drive/My Drive/titanic_test.csv").rename(columns=str.lower)

In [None]:
# Series.fillna returns a new Series instead of changing the column, so the
# result has to be assigned back; otherwise the missing ports stay NaN.
# Missing embarkation ports are replaced with 'S' in both sets.
trainData["embarked"] = trainData["embarked"].fillna('S')
testData["embarked"] = testData["embarked"].fillna('S')

0      Q
1      S
2      Q
3      S
4      S
      ..
413    S
414    C
415    S
416    S
417    C
Name: embarked, Length: 418, dtype: object

## Data Cleaning and Feature Selection

### Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except "survived"; the target is "survived".
# random_state fixes the shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    trainData.drop("survived", axis=1),
    trainData["survived"],
    random_state=42,
)

In [None]:
# Indicator columns for "sex" on the training split, with the first level dropped.
dummies = pd.get_dummies(x_train["sex"], drop_first=True)


Unnamed: 0,male
298,1
884,1
247,0
478,1
305,1
...,...
106,0
270,1
860,1
435,0


Rather than dropping the cabin data, let's make it useful by recording only whether a cabin value exists in each row.

### Cabin

In [None]:
def cabinExists(dataFrame):
  """Replace the "cabin" column with a 0/1 presence flag.

  Missing values (and values already equal to 0) become 0; every other value
  becomes 1. The frame is modified in place and the same object is returned,
  so calling the function twice on the same frame gives the same result.
  """
  has_cabin = dataFrame["cabin"].fillna(0).apply(lambda value: int(value != 0))
  dataFrame["cabin"] = has_cabin
  return dataFrame

In [None]:
# Turn "cabin" into a 0/1 presence flag on both the training and the test frame.
trainData = cabinExists(trainData)
testData = cabinExists(testData)


### Title from name

In [None]:
# Raw title (as written in the "name" column) -> normalized title group.
# "o" and "r" presumably stand for officer-like and royalty-like titles — TODO confirm.
normalized_titles = {
    "Capt": "o",
    "Col": "o",
    "Major": "o",
    "Jonkheer": "r",
    "Don": "r",
    "Sir": "r",
    "Dr": "o",
    "Rev": "o",
    "the Countess": "r",
    "Dona": "r",
    "Mme": "mrs",
    "Mlle": "Miss",
    "Ms": "mrs",
    "Mr": "Mr",
    "Mrs": "mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "mrs",
}

def normalize_titles(dataFrame):
  """Add a "title" column derived from the "name" column.

  The title is the text between the first comma and the following period of
  the name, stripped of surrounding whitespace and then looked up in
  ``normalized_titles``. Titles missing from that mapping become NaN.
  The frame is modified in place and returned.
  """
  def extract_title(full_name):
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()

  raw_titles = dataFrame['name'].map(extract_title)
  dataFrame['title'] = raw_titles.map(normalized_titles)
  return dataFrame

In [None]:
# Add the normalized "title" column to both frames.
trainData = normalize_titles(trainData)
testData = normalize_titles(testData)

### Name length

In [None]:
def name_length(dataFrame):
  """Add a binary "name_len" column: 1 when the name is longer than 25
  characters, otherwise 0. The frame is modified in place and returned.
  """
  def is_long(full_name):
    return 1 if len(full_name) > 25 else 0

  dataFrame['name_len'] = dataFrame['name'].map(is_long)
  return dataFrame

In [None]:
# Add the binary "name_len" column to both frames.
trainData = name_length(trainData)
testData = name_length(testData)

### Ticket number length

In [None]:
def ticket_token(dataFrame):
  """Add a "ticket_token" column holding the number of digits in each ticket.

  Every non-digit character is stripped from the ticket string before
  counting; the count never goes below 1 (tickets with no digits or a single
  digit both give 1). The frame is modified in place and returned.
  """
  def digit_count(ticket):
    digits_only = re.sub(r'\D', '', ticket)
    return max(len(digits_only), 1)

  dataFrame["ticket_token"] = dataFrame["ticket"].map(digit_count)
  return dataFrame

In [None]:
# Add the "ticket_token" digit-count column to the training split.
x_train = ticket_token(x_train)

In [None]:
# Add the "ticket_token" digit-count column to both full frames.
trainData = ticket_token(trainData)
testData = ticket_token(testData)

### Fill age

In [None]:
def fill_age(trainSet, testSet = None):
  """Impute missing numeric values with medians computed on ``trainSet``.

  Parameters:
    trainSet: frame the per-column medians are computed from.
    testSet:  optional frame to fill. When omitted, ``trainSet`` itself is filled.

  Returns:
    A new frame (the inputs are not modified): ``trainSet`` filled with its own
    medians when ``testSet`` is None, otherwise ``testSet`` filled with
    ``trainSet``'s medians.

  Each numeric column is filled with the median of the same column, so a
  missing fare gets the fare median rather than the age median. Non-numeric
  columns are left as they are.
  """
  # numeric_only=True: text columns have no median, and current pandas raises
  # on them instead of silently skipping them.
  medians = trainSet.median(numeric_only=True)
  target = trainSet if testSet is None else testSet
  # Filling with a Series keyed by column name only touches the columns that
  # have a median; keys that are not columns of `target` are ignored.
  return target.fillna(medians)

In [None]:
# Single-argument call: the training split is filled from its own medians.
x_train = fill_age(x_train)

In [None]:
# Fill missing values in both full frames.
# NOTE(review): testData is passed as the only argument, so it is filled from
# its own medians rather than from trainData's — confirm this is intended.
trainData = fill_age(trainData)
testData = fill_age(testData)

In [None]:
# Interaction feature: age multiplied by passenger class, row by row.
trainData['age*p'] = trainData['age'].mul(trainData['pclass'])
testData['age*p'] = testData['age'].mul(testData['pclass'])

### Age Categories


In [None]:
def fare_categorize(trainSet):
  """Bin the "age" column into labelled groups stored in "age_cat".

  NOTE: despite its name, this function works on age, not on fare.
  Bins are right-inclusive: (0, 5] babies, (5, 12] children, (12, 18] teenage,
  (18, 25] student, (25, 35] young, (35, 60] adult, (60, 120] senior.
  Ages outside (0, 120] get NaN. The frame is modified in place and returned.
  """
  age_edges = (0, 5, 12, 18, 25, 35, 60, 120)
  age_labels = ['babies', 'children', 'teenage', 'student', 'young', 'adult', 'senior']
  trainSet["age_cat"] = pd.cut(trainSet["age"], bins=age_edges, labels=age_labels)
  return trainSet

In [None]:
# Despite its name, fare_categorize adds the "age_cat" column (age bins).
x_train = fare_categorize(x_train)

In [None]:
# Despite its name, fare_categorize adds the "age_cat" column (age bins).
trainData = fare_categorize(trainData)
testData = fare_categorize(testData)

### Fare Categories

In [None]:
def age_categorize(trainSet):
  """Bin the "fare" column into labelled groups stored in "fare_cat".

  NOTE: despite its name, this function works on fare, not on age.
  Bins are right-inclusive: (-1, 0] NoInf, (0, 8] quart_1, (8, 15] quart_2,
  (15, 31] quart_3, (31, 600] quart_4. Fares outside (-1, 600] get NaN.
  The frame is modified in place and returned.
  """
  fare_edges = (-1, 0, 8, 15, 31, 600)
  fare_labels = ['NoInf', 'quart_1', 'quart_2', 'quart_3', 'quart_4']
  trainSet["fare_cat"] = pd.cut(trainSet["fare"], bins=fare_edges, labels=fare_labels)
  return trainSet

In [None]:
# Despite its name, age_categorize adds the "fare_cat" column (fare bins).
x_train = age_categorize(x_train)

In [None]:
# Despite its name, age_categorize adds the "fare_cat" column (fare bins).
trainData = age_categorize(trainData)
testData = age_categorize(testData)

### Family Boarded

In [None]:
def family(dataFrame):
  """Add a binary "family" column: 1 when the passenger has at least one
  relative aboard ("sibsp" + "parch" greater than 0), otherwise 0.
  The frame is modified in place and returned.
  """
  relatives = dataFrame["sibsp"] + dataFrame["parch"]
  dataFrame["family"] = relatives.map(lambda count: 1 if count > 0 else 0)
  return dataFrame

In [None]:
# Add the binary "family" flag to the training split.
x_train = family(x_train)

In [None]:
# Add the binary "family" flag to both full frames.
trainData = family(trainData)
testData = family(testData)

### Is Alone?

In [None]:
# A passenger is alone when they travel with no siblings/spouse and no
# parents/children, i.e. sibsp + parch == 0. The comparison is evaluated row
# by row and stored as 0/1; a plain `if` on the whole column cannot do this
# because it yields one truth value for the frame, not one per passenger.
testData['isAlone'] = ((testData["sibsp"] + testData["parch"]) == 0).astype(int)
trainData['isAlone'] = ((trainData["sibsp"] + trainData["parch"]) == 0).astype(int)

### Encoding categorical data

In [None]:
def get_dummies_t(dataFrame):
  """One-hot encode the low-cardinality columns of ``dataFrame``.

  A column is encoded when it has fewer than 10 distinct values and is either
  of object dtype or has more than 2 distinct values. "sibsp", "parch" and
  "ticket_token" are never encoded. Each encoded column is replaced by dummy
  columns named ``<column>_<value>`` appended at the end of the frame; the
  first level is dropped for every column except "title", which keeps all of
  its levels.

  Returns the encoded frame. The frame passed in is not modified; when no
  column qualifies, that same object is returned.
  """
  never_encoded = ("sibsp", "parch", "ticket_token")
  for column in dataFrame.columns:
    values = dataFrame[column]
    distinct = values.nunique()
    if distinct >= 10:
      continue
    is_text = values.dtype == np.dtype('O')
    if not is_text and distinct <= 2:
      continue
    if column in never_encoded:
      continue
    # "title" keeps every level; all other columns drop their first level.
    indicator_columns = pd.get_dummies(values, prefix=column, drop_first=(column != "title"))
    dataFrame = dataFrame.join(indicator_columns).drop(columns=column)
  return dataFrame

In [None]:
# One-hot encode the low-cardinality columns; each frame is encoded on its
# own, so the resulting dummy columns depend on the values present in it.
trainData = get_dummies_t(trainData)
testData = get_dummies_t(testData)

In [None]:
# Preview the first rows of the encoded training frame.
trainData.head()

Unnamed: 0,passengerid,survived,name,age,sibsp,parch,ticket,fare,cabin,name_len,ticket_token,age*p,family,isAlone,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S,title_Master,title_Miss,title_Mr,title_mrs,title_o,title_r,age_cat_children,age_cat_teenage,age_cat_student,age_cat_young,age_cat_adult,age_cat_senior,fare_cat_quart_1,fare_cat_quart_2,fare_cat_quart_3,fare_cat_quart_4
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,0,0,6,66.0,1,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,1,1,5,38.0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,0,0,8,78.0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,1,1,6,35.0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,0,0,6,105.0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0


### Drop text columns

In [None]:
def drop_text(dataFrame):
  """Remove every object-dtype column from ``dataFrame``.

  The frame is modified in place and the same object is returned.
  """
  text_columns = [name for name in dataFrame.columns if dataFrame[name].dtype == object]
  dataFrame.drop(columns=text_columns, inplace = True)
  return dataFrame

In [None]:
# Remove the remaining object-dtype columns from the training split.
x_train = drop_text(x_train)

In [None]:
# Remove the remaining object-dtype columns from both full frames.
trainData = drop_text(trainData)
testData = drop_text(testData)

In [None]:
# The passenger id is removed from the training frame only; testData keeps it.
trainData = trainData.drop(columns=['passengerid'])

In [None]:
# Remove the "ticket_token" column from both frames.
trainData = trainData.drop(columns=['ticket_token'])
testData = testData.drop(columns=['ticket_token'])

In [None]:
# List the columns currently in the training frame.
trainData.columns

Index(['survived', 'age', 'sibsp', 'parch', 'fare', 'name_len', 'ticket_token',
       'family', 'pclass_2', 'pclass_3', 'sex_male', 'cabin_B', 'cabin_C',
       'cabin_D', 'cabin_E', 'cabin_F', 'cabin_G', 'cabin_M', 'cabin_T',
       'embarked_Q', 'embarked_S', 'title_Master', 'title_Miss', 'title_Mr',
       'title_mrs', 'title_o', 'title_r', 'age_cat_children',
       'age_cat_teenage', 'age_cat_student', 'age_cat_young', 'age_cat_adult',
       'age_cat_senior', 'fare_cat_quart_1', 'fare_cat_quart_2',
       'fare_cat_quart_3', 'fare_cat_quart_4'],
      dtype='object')

In [None]:
# List the columns currently in the training frame (same check as the previous cell).
trainData.columns

Index(['survived', 'age', 'sibsp', 'parch', 'fare', 'cabin', 'name_len',
       'ticket_token', 'age*p', 'family', 'isAlone', 'pclass_2', 'pclass_3',
       'sex_male', 'embarked_Q', 'embarked_S', 'title_Master', 'title_Miss',
       'title_Mr', 'title_mrs', 'title_o', 'title_r', 'age_cat_children',
       'age_cat_teenage', 'age_cat_student', 'age_cat_young', 'age_cat_adult',
       'age_cat_senior', 'fare_cat_quart_1', 'fare_cat_quart_2',
       'fare_cat_quart_3', 'fare_cat_quart_4'],
      dtype='object')

### Test Preparation

In [None]:
from sklearn.preprocessing import StandardScaler

def scale_test(x_train, x_test = None):
  """Standardize a frame with a StandardScaler fitted on ``x_train``.

  Returns the scaled ``x_train`` when ``x_test`` is omitted, otherwise the
  scaled ``x_test``. The result is a new DataFrame that reuses the column
  names of the scaled frame; the input's row index is not passed along.
  """
  fitted_scaler = StandardScaler().fit(x_train)
  target = x_train if x_test is None else x_test
  return pd.DataFrame(fitted_scaler.transform(target), columns=target.columns)

In [None]:
#x_train = scale_test(x_train)

In [None]:
def prepare_test(x_train, x_test):
  """Run a held-out feature frame through the feature-engineering steps.

  Parameters:
    x_train: training frame; only used as the source of the medians that
             fill_age uses to fill ``x_test``.
    x_test:  raw held-out frame to transform.

  Returns:
    The transformed frame. Several steps modify the frame they receive in
    place, so the ``x_test`` passed in may be changed as well.
  """
  x_test = cabinExists(x_test)
  x_test = normalize_titles(x_test)
  x_test = name_length(x_test)
  x_test = ticket_token(x_test)
  x_test = fill_age(x_train, x_test)
  # fare_categorize (despite its name) adds the "age_cat" age bins. Without
  # this step the result has no age_cat_* columns, unlike the training frame.
  # It has to run after fill_age so that missing ages are binned too.
  x_test = fare_categorize(x_test)
  # age_categorize (despite its name) adds the "fare_cat" fare bins.
  x_test = age_categorize(x_test)
  x_test = family(x_test)
  x_test = get_dummies_t(x_test)
  x_test = drop_text(x_test)
  #x_test = scale_test(x_train, x_test)

  return x_test

In [None]:
# Display the training split.
x_train

Unnamed: 0,passengerid,age,sibsp,parch,fare,cabin,name_len,ticket_token,family,pclass_2,pclass_3,sex_male,embarked_Q,embarked_S,title_Master,title_Miss,title_Mr,title_mrs,title_o,title_r,age_cat_children,age_cat_teenage,age_cat_student,age_cat_young,age_cat_adult,age_cat_senior,fare_cat_quart_1,fare_cat_quart_2,fare_cat_quart_3,fare_cat_quart_4
298,299,28.00,0,0,30.5000,1,0,5,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
884,885,25.00,0,0,7.0500,0,0,6,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
247,248,24.00,0,2,14.5000,0,1,6,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0
478,479,22.00,0,0,7.5208,0,0,6,0,0,1,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
305,306,0.92,1,2,151.5500,1,1,6,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,107,21.00,0,0,7.6500,0,1,6,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0
270,271,28.00,0,0,31.0000,0,0,6,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0
860,861,41.00,2,0,14.1083,0,0,6,1,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0
435,436,14.00,1,2,120.0000,1,0,6,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1


## PREDICTIONS

### Imports

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [None]:
# Model inputs: training features without the label, the label itself, and
# the test features without the passenger id.
X_train = trainData.drop(columns="survived")
Y_train = trainData["survived"]
X_test  = testData.drop(columns="passengerid").copy()

In [None]:
# Linear classifier trained with stochastic gradient descent.
# A hard cap of 5 epochs with no stopping tolerance and no seed leaves the
# model unconverged and its accuracy different on every run; use a normal
# iteration budget with a tolerance and pin the seed.
sgd = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

# Accuracy (in percent) measured on the training data itself.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

In [None]:
# Training accuracy (in percent) of the SGD classifier.
acc_sgd

43.1

In [None]:
# rf = RandomForestClassifier(max_features='auto', oob_score=True, random_state=1, n_jobs=-1)

# param_grid = { "criterion" : ["gini", "entropy"], "min_samples_leaf" : [1, 5, 10], "min_samples_split" : [2, 4, 10, 12, 16], "n_estimators": [50, 100, 400, 700, 1000]}

# gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=3, n_jobs=-1)

# gs = gs.fit(X_train, Y_train)
# Y_prediction_r = rf.predict(X_test)
# round(gs.score(X_train, Y_train) * 100, 2)

In [None]:
# Random forest with fixed hyper-parameters and a fixed seed.
# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# scikit-learn 1.3 no longer accepts 'auto'.
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=700,
                            min_samples_split=12,
                            min_samples_leaf=1,
                            max_features='sqrt',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)
rf.fit(X_train, Y_train)
Y_prediction_r = rf.predict(X_test)

# Accuracy (in percent) measured on the training data itself.
acc_random_forest = round(rf.score(X_train, Y_train) * 100, 2)
acc_random_forest

90.8

In [None]:
# Plain 100-tree random forest.
# NOTE: this cell reassigns Y_prediction_r and acc_random_forest, so any later
# cell that reads them sees this model's values. random_state pins the forest
# so those predictions are the same on every run.
random_forest = RandomForestClassifier(n_estimators=100, random_state=1)
random_forest.fit(X_train, Y_train)

Y_prediction_r = random_forest.predict(X_test)

# Accuracy (in percent) measured on the training data itself.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

98.99

In [None]:
# Logistic regression. The default iteration limit is too low for the lbfgs
# solver to converge on these unscaled features, so give it more iterations.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, Y_train)

Y_pred_winner = logreg.predict(X_test)

# Accuracy (in percent) measured on the training data itself.
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



84.62

In [None]:
# 3-nearest-neighbours classifier; fit() returns the estimator, so it is chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
# Accuracy (in percent) measured on the training data itself.
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

84.51

In [None]:
# Gaussian naive Bayes; fit() returns the estimator, so it is chained.
gaussian = GaussianNB().fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
# Accuracy (in percent) measured on the training data itself.
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

79.46

In [None]:
# Single decision tree; fit() returns the estimator, so it is chained.
decision_tree = DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy (in percent) measured on the training data itself.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

98.99

In [None]:
# Linear support vector classifier. The default dual solver runs out of
# iterations on these features; with more samples than features the primal
# problem (dual=False) is the better-suited formulation, and max_iter is
# raised so the solver can finish.
linear_svc = LinearSVC(dual=False, max_iter=10000)
linear_svc.fit(X_train, Y_train)

Y_pred = linear_svc.predict(X_test)

# Accuracy (in percent) measured on the training data itself.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc


Liblinear failed to converge, increase the number of iterations.



78.45

In [None]:
# Perceptron. Five passes over the data are not enough for it to converge,
# so allow up to 1000; training still stops early once the loss stops improving.
perceptron = Perceptron(max_iter=1000)
perceptron.fit(X_train, Y_train)

Y_pred = perceptron.predict(X_test)

# Accuracy (in percent) measured on the training data itself.
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron


Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.



59.26

In [None]:
# One (label, training accuracy) pair per model, ranked from best to worst
# and indexed by score.
results = pd.DataFrame(
    [
        ('Support Vector Machines', acc_linear_svc),
        ('KNN', acc_knn),
        ('Logistic Regression', acc_log),
        ('Random Forest', acc_random_forest),
        ('Naive Bayes', acc_gaussian),
        ('Perceptron', acc_perceptron),
        ('Stochastic Gradient Decent', acc_sgd),
        ('Decision Tree', acc_decision_tree),
    ],
    columns=['Model', 'Score'],
)
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
98.99,Random Forest
98.99,Decision Tree
83.73,KNN
83.73,Logistic Regression
80.36,Support Vector Machines
80.02,Perceptron
79.8,Stochastic Gradient Decent
79.24,Naive Bayes


In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of a 100-tree random forest on the
# training data. Note that this rebinds the name `rf`.
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring="accuracy")
print("Scores:", scores)
print("Mean:", np.mean(scores))
print("Standard Deviation:", np.std(scores))

Scores: [0.77777778 0.84269663 0.74157303 0.82022472 0.84269663 0.84269663
 0.82022472 0.78651685 0.86516854 0.84269663]
Mean: 0.8182272159800249
Standard Deviation: 0.03622712296432581


In [None]:
# Pair each test passenger id with its predicted label and write the table
# to a CSV file without the row index.
passenger_ids = testData['passengerid']
submission = pd.DataFrame({"PassengerId": passenger_ids,
                           "Survived": Y_prediction_r})
submission.to_csv('titanic_submission.csv', index=False)