# Titanic Dataset Analysis via CatBoost
Dataset: https://www.kaggle.com/c/titanic/overview <br>
Inspired by: https://www.kaggle.com/pavlofesenko/simplest-top-10-titanic-0-80861/notebook

In [None]:
!pip install -q kaggle

import os 
os.environ['KAGGLE_USERNAME'] = "filippoairaldi"
os.environ['KAGGLE_KEY'] = "51aebb4dfedacca5d50c2ab359457daa" 

!mkdir titanic_data
!kaggle competitions download -c titanic -p titanic_data

!pip install catboost
import pandas as pd
import catboost

In [7]:
def get_raw_data():
    """Read the Titanic CSV files from ``titanic_data/``.

    Returns a ``(train, test)`` tuple of DataFrames. The files are re-read on
    every call, so each call yields fresh frames.
    """
    train_df = pd.read_csv('titanic_data/train.csv')
    test_df = pd.read_csv('titanic_data/test.csv')
    return train_df, test_df

## Modeling

In [8]:
def highlight(value):
    """Return a CSS colour rule: green when ``value`` is at least 0.5, red otherwise."""
    if value >= 0.5:
        return 'color: green'
    return 'color: red'

# Survival rate per (Pclass, Embarked) row and Sex column, coloured by `highlight`.
train, test = get_raw_data()
train.pivot_table(values='Survived', index=['Pclass', 'Embarked'], columns='Sex').style.applymap(highlight)

Unnamed: 0_level_0,Sex,female,male
Pclass,Embarked,Unnamed: 2_level_1,Unnamed: 3_level_1
1,C,0.976744,0.404762
1,Q,1.0,0.0
1,S,0.958333,0.35443
2,C,1.0,0.2
2,Q,1.0,0.0
2,S,0.910448,0.154639
3,C,0.652174,0.232558
3,Q,0.727273,0.076923
3,S,0.375,0.128302


In [9]:
def split_data(train, test, features):
    """Select the model inputs and the target from the train/test frames.

    Parameters
    ----------
    train, test : DataFrame
        Frames containing every column named in ``features``; ``train`` must
        also contain ``Survived``.
    features : list of str
        Column names to keep, in the order the model should see them.

    Returns
    -------
    (x_train, y_train, x_test)
        Feature frames with missing values replaced by the empty string, and
        the ``Survived`` column of ``train``.
    """
    def _select(frame):
        # Missing values become '' in every selected column.
        return frame[features].fillna('')

    return _select(train), train['Survived'], _select(test)

def get_mdl(iters=100):
    """Build the CatBoost classifier shared by all experiments.

    Parameters
    ----------
    iters : int, default 100
        Value passed to CatBoost as ``iterations``.

    Returns
    -------
    catboost.CatBoostClassifier
        An unfitted classifier with a fixed seed, silent training, the
        Accuracy evaluation metric and the CrossEntropy loss.
    """
    # CatBoost will one-hot encode all the features with at most 4 categories.
    # The remaining ones (like Surname, which has a lot of them) will be
    # mean-encoded (and regularized via expanding mean).
    return catboost.CatBoostClassifier(
        one_hot_max_size=4,
        # Honour the caller's value; it used to be hard-coded to 100.
        iterations=iters,
        random_seed=0,
        verbose=False,
        eval_metric='Accuracy',
        loss_function='CrossEntropy',
    )

def show_cv_score(pool, mdl, fold_cnt=20):
    """Cross-validate ``mdl``'s parameters on ``pool`` and print the final accuracy.

    Parameters
    ----------
    pool : catboost.Pool
        Training data to cross-validate on.
    mdl : CatBoost model
        Only its parameters (``mdl.get_params()``) are used.
    fold_cnt : int, default 20
        Number of folds passed to ``catboost.cv`` as ``fold_count``.
    """
    # Honour the caller's fold count; it used to be hard-coded to 20.
    cv_scores = catboost.cv(pool, mdl.get_params(), fold_count=fold_cnt, plot=False)
    # The last row holds the mean test accuracy after the final iteration.
    print('CV score:', cv_scores['test-Accuracy-mean'].values[-1])

In [10]:
# Baseline experiment: raw Sex, Pclass and Embarked columns only.
x_train, y_train, _ = split_data(
    train, test, ['Sex', 'Pclass', 'Embarked'])

mdl = get_mdl()

# Columns 0 (Sex) and 2 (Embarked) are declared categorical.
pool = catboost.Pool(data=x_train, label=y_train, cat_features=[0, 2])
show_cv_score(pool, mdl)

CV score: 0.8113636363636365


### Adding Master feature


In [11]:
def process_master(df_tr, df_te):
    """Add a 0/1 ``Master`` column marking names that contain the title "Master.".

    Both frames are modified in place and returned as ``(df_tr, df_te)``.
    """
    # No equivalent flag is added for girls, since females already have a
    # higher survival rate.
    for frame in (df_tr, df_te):
        # regex=False matches the literal text "Master."; as a regular
        # expression the dot would match any character (e.g. "Masterson").
        frame['Master'] = frame.Name.str.contains('Master.', regex=False).astype('int')
    return df_tr, df_te

# Start again from the raw data and add the Master flag.
train, test = process_master(*get_raw_data())

# Survival rate per Pclass row and Master column, coloured by `highlight`.
train.pivot_table(values='Survived', index='Pclass', columns='Master').style.applymap(highlight)

Master,0,1
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.624413,1.0
2,0.445714,1.0
3,0.233261,0.392857


In [12]:
# Same experiment with the Master flag added as a fourth feature.
x_train, y_train, _ = split_data(
    train, test, ['Sex', 'Pclass', 'Embarked', 'Master'])

mdl = get_mdl()

# Columns 0 (Sex) and 2 (Embarked) are declared categorical.
pool = catboost.Pool(data=x_train, label=y_train, cat_features=[0, 2])
show_cv_score(pool, mdl)

CV score: 0.8293434343434345


### Adding Surname feature

In [13]:
def process_surname(df_tr, df_te):
    """Add a ``Surname`` column holding the part of ``Name`` before the first comma.

    Both frames are modified in place and returned as ``(df_tr, df_te)``.
    """
    for frame in (df_tr, df_te):
        name_parts = frame.Name.str.split(',')
        frame['Surname'] = name_parts.str.get(0)
    return df_tr, df_te

# Rebuild the frames from the raw data with the Master and Surname features.
train, test = process_master(*get_raw_data())
train, test = process_surname(train, test)

# Survival rate for the first 15 surnames of the pivot, sorted by rate.
train.pivot_table(values='Survived', index='Surname').iloc[:15].sort_values('Survived').style.applymap(highlight)

Unnamed: 0_level_0,Survived
Surname,Unnamed: 1_level_1
Abbing,0.0
Adahl,0.0
Adams,0.0
Ahlin,0.0
Alexander,0.0
Alhomaki,0.0
Ali,0.0
Allum,0.0
Allison,0.333333
Abbott,0.5


In [14]:
# Same experiment with Surname added as a fifth feature.
x_train, y_train, _ = split_data(
    train, test, ['Sex', 'Pclass', 'Embarked', 'Master', 'Surname'])

mdl = get_mdl()

# Columns 0 (Sex), 2 (Embarked) and 4 (Surname) are declared categorical.
pool = catboost.Pool(data=x_train, label=y_train, cat_features=[0, 2, 4])
show_cv_score(pool, mdl)

CV score: 0.8438383838383838


### Adding Title feature

In [15]:
# Maps each raw title found in a passenger name to one of three coarse groups.
# NOTE: the 'Offical' spelling is the actual category label used downstream,
# so it is kept as-is.
train_titles_dict = {
    **dict.fromkeys(
        ('Capt', 'Col', 'Rev', 'Dr', 'Major'),
        'Offical'),
    **dict.fromkeys(
        ('Don', 'Jonkheer', 'Sir', 'Dona', 'the Countess', 'Lady'),
        'Royalty'),
    **dict.fromkeys(
        ('Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms'),
        'Commoner'),
}

def extract_title_from_name(n):
    """Return the title from a name shaped like ``"Surname, Title. Given names"``.

    The title is the text between the first comma and the following period,
    with surrounding whitespace removed. Raises ``IndexError`` when ``n``
    contains no comma.
    """
    after_surname = n.split(',')[1]
    title, _, _ = after_surname.partition('.')
    return title.strip()
    
def process_title(df_tr, df_te):
    """Add a ``Title`` column with the title group of each passenger.

    The raw title is extracted from ``Name`` and mapped through
    ``train_titles_dict``; titles missing from that mapping become NaN.
    Both frames are modified in place and returned as ``(df_tr, df_te)``.
    """
    for frame in (df_tr, df_te):
        raw_titles = frame.Name.transform(extract_title_from_name)
        frame['Title'] = raw_titles.map(train_titles_dict)
    return df_tr, df_te

In [16]:
# Rebuild the frames from the raw data with all three engineered features.
train, test = process_master(*get_raw_data())
train, test = process_surname(train, test)
train, test = process_title(train, test)

# Survival rate per title group, sorted by rate and coloured by `highlight`.
train.pivot_table(values='Survived', index='Title').sort_values('Survived').style.applymap(highlight)

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Offical,0.277778
Commoner,0.384793
Royalty,0.6


In [17]:
# Same experiment with the Title group added as a sixth feature.
x_train, y_train, _ = split_data(
    train, test, ['Sex', 'Pclass', 'Embarked', 'Master', 'Surname', 'Title'])

mdl = get_mdl(iters=100)

# Columns 0 (Sex), 2 (Embarked), 4 (Surname) and 5 (Title) are declared categorical.
pool = catboost.Pool(data=x_train, label=y_train, cat_features=[0, 2, 4, 5])
show_cv_score(pool, mdl)

CV score: 0.8439393939393941


## Submission

In [18]:
# Final model: all six features, trained on the full training set.
x_train, y_train, x_test = split_data(
    train, test, ['Sex', 'Pclass', 'Embarked', 'Master', 'Surname', 'Title'])

mdl = get_mdl()
# Columns 0 (Sex), 2 (Embarked), 4 (Surname) and 5 (Title) are declared categorical.
mdl.fit(X=x_train, y=y_train, cat_features=[0, 2, 4, 5])

# Cast the predicted labels to int for the submission file.
pred = mdl.predict(x_test).astype('int')

In [19]:
# Pair each prediction with its passenger id and write the submission file.
# The ids come from the in-scope `test` frame (the one `pred` was computed
# from) rather than from a fresh read of both CSV files, which avoids a
# redundant disk read and keeps ids and predictions row-aligned.
output = pd.DataFrame({
    'PassengerId': test.PassengerId,
    'Survived': pred,
})
output.to_csv('titanic_data/results.csv', index=False)