<a href="https://colab.research.google.com/github/EmpyEmpt/titanic/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
!pip install -q sklearn
!pip install -q tensorflow_decision_forests

!pip install -q catboost
!pip install -q scikit-learn
%tensorflow_version 2.x  

In [131]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf

from sklearn.ensemble import RandomForestClassifier

from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense

## Data loading and prepocessing


In [132]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

### Data prepocessing

In [133]:
train_df = train_df.drop(['Ticket', 'Cabin', 'PassengerId'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)

train_df['Title'] = train_df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

for dataset in [train_df, test_df]:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in [train_df, test_df]:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0) 

train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1}).astype(int)
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1}).astype(int)

guess_ages = np.zeros((2,3))
for dataset in [train_df, test_df]:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)

train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

for dataset in [train_df, test_df]:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

train_df = train_df.drop(['AgeBand'], axis=1)
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1 
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

train_df = train_df.drop(['Parch', 'SibSp'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp'], axis=1)

freq_port = train_df.Embarked.dropna().mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(freq_port)
test_df['Embarked'] = test_df['Embarked'].fillna(freq_port)

train_df['Embarked'] = train_df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
test_df['Embarked'] = test_df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)

train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
for dataset in [train_df, test_df]:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_df = train_df.drop(['FareBand'], axis=1)

In [134]:
def split_dataset(dataset, test_ratio=0.20):
  """Splits a panda dataframe in two."""
  test_indices = np.random.rand(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]

In [135]:
def predict(model, input, output, model_type):
  if model_type is 'custom_dnn':
    data = input.drop(columns = ['PassengerId'])
    data = tf.convert_to_tensor(data, dtype=tf.int32)
  elif model_type is 'catboost':
     data = input.drop(columns = ['PassengerId'])
  elif model_type is 'random_forest_tf':
    data = tfdf.keras.pd_dataframe_to_tf_dataset(input)
  elif model_type == 'random_forest_skl':
    data = input.drop(columns = ['PassengerId'])


  pred = model.predict(data)
  pred_list = []
  for i in pred:
    if i >= 0.5:
      pred_list.append(1)
    else:
      pred_list.append(0)
  res = input['PassengerId'].to_frame()
  res['Survived'] = pred_list
  res.to_csv(output, index = False)

### Dataset split for quick rough evaluation

In [136]:
train_ds, test_ds = split_dataset(train_df)

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds, label='Survived')
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds, label='Survived')

train_, test_ = split_dataset(train_df)

trainY = train_['Survived']
trainX = train_.drop(columns = ['Survived'])

testY = test_['Survived']
testX = test_.drop(columns = ['Survived'])

trainX = tf.convert_to_tensor(trainX, dtype=tf.int32)
trainY = tf.convert_to_tensor(trainY, dtype=tf.int32)

testX = tf.convert_to_tensor(testX, dtype=tf.int32)
testY = tf.convert_to_tensor(testY, dtype=tf.int32)

  features_dataframe = dataframe.drop(label, 1)


### Complete dataset for final training

In [137]:
cmpl_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label='Survived')
cmpl_df = train_df.copy()

cmpl_y = train_df['Survived']
cmpl_y = tf.convert_to_tensor(cmpl_y, dtype=tf.int32)

cmpl_x = train_df.drop(columns = ['Survived'])
cmpl_x = tf.convert_to_tensor(cmpl_x, dtype=tf.int32)

  features_dataframe = dataframe.drop(label, 1)


# Custom DNN model

In [171]:
def build():
  model = Sequential()
  input_shape = (7,)
  model.add(Dense(49, activation = 'relu'))
  model.add(Dense(14, activation = 'relu'))
  model.add(Activation("softmax"))
  model.add(Dense(1))
  return model

In [172]:
modelDNN = build()
modelDNN.compile(optimizer='adam', loss=tf.keras.losses.MeanAbsoluteError(), metrics=['accuracy'])

In [173]:
modelDNN.fit(cmpl_x, 
          cmpl_y,
          epochs=10,
          shuffle=True,
          verbose = 0)
loss, acc = modelDNN.evaluate(cmpl_x, cmpl_y, verbose=2)

28/28 - 0s - loss: 0.2400 - accuracy: 0.8182 - 128ms/epoch - 5ms/step


In [142]:
predict(modelDNN, test_df, 'custom_dnn.csv', 'custom_dnn')

# RandomForestModel tf


In [143]:
modelRFTF = tfdf.keras.RandomForestModel(hyperparameter_template = 'benchmark_rank1', verbose = 0)
modelRFTF.compile(metrics=['accuracy'])
modelRFTF.fit(cmpl_ds, verbose = 0)

Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'winner_take_all': True, 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.


<keras.callbacks.History at 0x7f927f0b8690>

In [144]:
evaluation = modelRFTF.evaluate(cmpl_ds, return_dict=True)
loss, acc = modelDNN.evaluate(cmpl_x, cmpl_y, verbose=2)

28/28 - 0s - loss: 0.2844 - accuracy: 0.7834 - 46ms/epoch - 2ms/step


In [145]:
predict(modelRFTF, test_df, 'random_forest_tf.csv', 'random_forest_tf')

# RandomForestClassifier sklearn


In [160]:
rf = RandomForestClassifier()

rf.fit(cmpl_x, cmpl_y)
acc_log = round(rf.score(cmpl_x, cmpl_y) * 100, 2)
acc_log

88.55

In [None]:
predict(rf, test_df, 'random_forest_skl.csv', 'random_forest_skl')

# CatBoostClassifier 

## Data preparation for catboost 
(it does not like tensors)

In [153]:
train_x_cat, test_x_cat, train_y_cat, test_y_cat = train_test_split(x, y, train_size=0.80, random_state=42)
cmpl_y_cat = train_df['Survived']

cmpl_x_cat = train_df.drop(columns = ['Survived'])

In [154]:
categorical_features_indices = np.where(cmpl_x_cat.dtypes != float)[0]

## Training catboost

In [156]:
modelCat = CatBoostClassifier(
    custom_loss=[metrics.Accuracy()],
    random_seed=42,
    logging_level='Silent'
)

cv_params = modelCat.get_params()
cv_params.update({
    'loss_function': metrics.Logloss()
})

cv_data = cv(
    Pool(cmpl_x_cat, cmpl_y_cat, cat_features=categorical_features_indices),
    cv_params,
    plot=False
) 

modelCat.fit(
    cmpl_x_cat, cmpl_y_cat,
    cat_features=categorical_features_indices,
    eval_set=(cmpl_x_cat, cmpl_y_cat),
    plot=False
)

In [157]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.82±0.03 on step 85


In [158]:
predict(modelCat, test_df, 'catboost.csv', 'catboost')