# Titanic Dataset Analysis
Dataset: https://www.kaggle.com/c/titanic/overview <br>
Useful links: https://docs.fast.ai/tutorial.tabular
and https://docs.fast.ai/tabular.core.html


## Download from Kaggle

In [None]:
!pip install -q kaggle

import os 
os.environ['KAGGLE_USERNAME'] = "filippoairaldi"
os.environ['KAGGLE_KEY'] = "51aebb4dfedacca5d50c2ab359457daa" 

!mkdir titanic_data
!kaggle competitions download -c titanic -p titanic_data

!pip install fastai --upgrade
!pip install nbdev

Downloading gender_submission.csv to titanic_data
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 5.69MB/s]
Downloading test.csv to titanic_data
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 27.3MB/s]
Downloading train.csv to titanic_data
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 61.1MB/s]
Collecting fastai
[?25l  Downloading https://files.pythonhosted.org/packages/3a/6f/392cf2a6af43abaaf9f5f52c17a5cfdb06902a3535f03ff0ca1bdfcc9b80/fastai-2.1.10-py3-none-any.whl (190kB)
[K     |████████████████████████████████| 194kB 9.2MB/s 
Collecting fastcore>=1.3.8
[?25l  Downloading https://files.pythonhosted.org/packages/66/6e/0da19d76b90a8958081335e415c22ed652d879fec0a42e3a44d1fe15231b/fastcore-1.3.16-py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 6.5MB/s 
Installing collected packages: fastcore, fastai
  Found existing installation: fastai 1.0.61
    Uninstalling fastai-1.0.61:
      Successfully uninstalled fastai-

In [None]:
import re
import numpy as np
import pandas as pd
from fastai.tabular.all import *

In [None]:
def get_train_data():
    """Read the training split from ``titanic_data/train.csv`` into a fresh DataFrame."""
    train_csv = 'titanic_data/train.csv'
    return pd.read_csv(train_csv)
def get_test_data():
    """Read the test split from ``titanic_data/test.csv`` into a fresh DataFrame."""
    test_csv = 'titanic_data/test.csv'
    return pd.read_csv(test_csv)

In [None]:
# Load the raw training split and show its first five rows as a sanity check.
df = get_train_data()
df.head(n = 5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Pre-processing

### PassengerId

In [None]:
def process_passenger_id(df):
    """Return ``df`` without the ``PassengerId`` column.

    The identifier is dropped because it is considered insignificant for
    prediction. The input frame itself is left untouched; a new frame is
    returned.
    """
    return df.drop(columns = ['PassengerId'])

### Title and Name

In [None]:
# Maps the honorific found in a passenger's name to a coarse social group.
# Built from (group, titles) pairs so each group label is written only once;
# the resulting key order is the same as listing every title individually.
titles_dict = {
    title: group
    for group, titles in (
        ('Offical', ('Capt', 'Col', 'Rev', 'Dr', 'Major')),
        ('Royalty', ('Don', 'Jonkheer', 'Sir', 'Dona', 'the Countess', 'Lady')),
        ('Commoner', ('Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms')),
    )
    for title in titles
}

def process_name(df):
    """Derive ``Title``, ``IsChild`` and ``FamilySize`` from ``Name``, then drop ``Name``.

    * ``Title``: the honorific between the first comma and the following
      period of the name, mapped through ``titles_dict``; honorifics that are
      not in the dictionary (or names that cannot be parsed) become NaN.
    * ``IsChild``: True when ``Age`` is at most the oldest age found among
      training passengers whose name contains ``Master.``.
    * ``FamilySize``: how many training passengers share the surname (the text
      before the first comma); surnames absent from the training set give NaN.

    Both reference statistics always come from the training set, whatever
    frame is being processed, so no information leaks from the test data.

    The input frame is not modified; a new frame is returned.
    """
    def extract_title(name):
        # "Braund, Mr. Owen Harris" -> "Mr"
        try:
            return name.split(',')[1].split('.')[0].strip()
        except (AttributeError, IndexError):
            # Non-string name (e.g. NaN) or a name without a comma.
            return np.nan

    def extract_family_name(name):
        # "Braund, Mr. Owen Harris" -> "Braund"
        try:
            return name.split(',')[0]
        except AttributeError:
            # Non-string name (e.g. NaN).
            return np.nan

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    # Read the training split once and reuse it for both reference statistics.
    df_train = get_train_data() # must use only train dataset

    df['Title'] = df.Name.map(extract_title).map(titles_dict)

    # Match the literal text "Master." (regex = False) and treat missing names
    # as non-matching so the boolean mask never contains NaN.
    is_master = df_train.Name.str.contains('Master.', regex = False, na = False)
    train_max_male_child_age = df_train.loc[is_master, 'Age'].max()
    df['IsChild'] = df.Age <= train_max_male_child_age

    train_family_size_dict = (
        df_train.Name.map(extract_family_name).value_counts().to_dict()
    )
    df['FamilySize'] = df.Name.map(extract_family_name).map(train_family_size_dict)

    return df.drop('Name', axis = 1)

### Ticket

In [None]:
def process_ticket(df):
    """Return ``df`` without the ``Ticket`` column.

    The column is discarded because a random forest suggested that tickets
    are not influential. The input frame is left untouched.
    """
    return df.drop(columns = ['Ticket'])

### Cabin

In [None]:
def process_cabin(df):
    """Split ``Cabin`` into ``CabinHeight`` and ``CabinLength``, then drop ``Cabin``.

    Spaces are removed from the cabin string and the result must start with
    one or more capital letters followed by one or more digits:

    * ``CabinHeight`` receives the leading letters (e.g. ``'C'`` or ``'FG'``);
    * ``CabinLength`` receives the digits that follow, still as a string.

    Only the first letters/digits pair is used, so ``'C23 C25 C27'`` yields
    ``'C'`` and ``'23'``. Missing cabins and values that do not match the
    pattern (e.g. a letter with no number) give NaN in both columns.

    The input frame is not modified; a new frame is returned.
    """
    cabin_pattern = re.compile('^([A-Z]+)([0-9]+)')

    def cabin_part(cabin, group):
        # Missing cabins arrive as float NaN rather than str.
        if not isinstance(cabin, str):
            return np.nan
        match = cabin_pattern.search(cabin.replace(' ', ''))
        if match is None:
            return np.nan
        return match.group(group)

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()
    df['CabinHeight'] = df.Cabin.map(lambda cabin: cabin_part(cabin, 1))
    df['CabinLength'] = df.Cabin.map(lambda cabin: cabin_part(cabin, 2)) # seems more important than height
    return df.drop('Cabin', axis = 1)

### Transform to categories or continuous variables

In [None]:
def get_cat_cols():
    """Return a new list with the names of the categorical feature columns."""
    categorical = ('Pclass', 'Sex', 'Embarked', 'Title', 'CabinHeight', 'IsChild')
    return list(categorical)

def get_cont_cols():
    """Return a new list with the names of the continuous feature columns."""
    continuous = ('Age', 'CabinLength', 'Fare', 'Parch', 'SibSp', 'FamilySize')
    return list(continuous)

def get_cols_types():
    """Return a new ``{column: dtype}`` mapping for every feature column.

    Continuous columns come first, followed by the categorical ones.
    """
    dtypes = {}
    dtypes.update(Age = 'float32', CabinLength = 'float32', Fare = 'float32')
    dtypes.update(Parch = 'uint8', SibSp = 'uint8')
    # CabinLength (above) and FamilySize contain NaN, so they must be floats.
    dtypes['FamilySize'] = 'float32'
    categorical = ('Pclass', 'Sex', 'Embarked', 'Title', 'CabinHeight', 'IsChild')
    dtypes.update(dict.fromkeys(categorical, 'category'))
    return dtypes

def process_df_types(df):
    """Cast every feature column to its target dtype and order the ordinal categories.

    * All columns listed by ``get_cols_types()`` are cast with ``astype``.
    * ``Survived`` is cast to ``bool`` when present (it is absent from the
      test split).
    * ``Pclass`` becomes an ordered categorical with categories ``3, 2, 1``.
    * ``CabinHeight`` becomes an ordered categorical running from ``'G'`` to
      ``'A'`` (A highest).

    The input frame is not modified; a new frame is returned.
    """
    # astype returns a new frame, so everything below edits the copy only.
    df = df.astype(get_cols_types())

    if 'Survived' in df.columns:
        df['Survived'] = df['Survived'].astype('bool')

    # `set_categories(..., inplace = True)` no longer exists in pandas >= 2.0;
    # assigning the returned Series back works on every pandas version.
    df['Pclass'] = df['Pclass'].cat.set_categories([3, 2, 1], ordered = True)
    df['CabinHeight'] = df['CabinHeight'].cat.set_categories(
        ['G', 'FG', 'F', 'FE', 'E', 'D', 'C', 'B', 'A'], # A highest
        ordered = True)
    return df

# df_shrink_dtypes(df, int2uint = True) helpful function

### Fill missing values
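Fill missing continuous values with the median of the corresponding (sex, pclass, title) group, computed on the training set. If that group has no median for the column, the overall training median of the column is used instead.<br>
Categorical columns are left unfilled because NaN is treated as a category of its own; the only exception is `Embarked`, whose missing values are set to `S`, the most frequent value.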

In [None]:
def fill_missing_values(df):
    """Fill missing ``Embarked`` and continuous values using training-set statistics.

    * ``Embarked``: missing entries become ``'S'`` (most occurrent).
    * Continuous columns (``get_cont_cols()``): a missing entry is replaced by
      the training median of the row's ``(Sex, Pclass, Title)`` group. When
      the group does not exist in the training set, or its median is itself
      NaN, the overall training median of the column is used.

    The medians always come from the training split, whichever frame is being
    filled. ``df`` is expected to have already gone through the same
    processing steps that are applied to the training frame below.

    The input frame is not modified; a new frame is returned.
    """
    df_train = get_train_data() # always use train medians to fill any dataset
    df_train = process_passenger_id(df_train)
    df_train = process_name(df_train)
    df_train = process_ticket(df_train)
    df_train = process_cabin(df_train)
    df_train = process_df_types(df_train)

    cont_cols = get_cont_cols()
    grouping_cols = ['Sex', 'Pclass', 'Title']

    overall_train_median_dict = { c: df_train[c].median() for c in cont_cols }

    # Select the continuous columns *before* aggregating: since pandas 2.0 a
    # group-wise median over categorical columns raises instead of silently
    # dropping them. `observed = True` keeps only the key combinations that
    # occur in the data; absent combinations fall back to the overall median
    # below, exactly like a group whose median is NaN.
    group_medians = (
        df_train
        .groupby(grouping_cols, observed = True)[cont_cols]
        .median()
        .reset_index()
    )

    def first_or_default(arr, default):
        # No matching group, or the group has no value for this column.
        if arr.shape[0] == 0 or np.isnan(arr[0]):
            return default
        return arr[0]

    def fill_row_values(row):
        row = row.copy() # never write into the row object handed over by apply
        cond = ((group_medians.Sex == row.Sex) &
                (group_medians.Pclass == row.Pclass) &
                (group_medians.Title == row.Title))
        matches = group_medians[cond]
        for c in cont_cols:
            if not np.isnan(row[c]):
                continue
            val = matches[c].values
            row[c] = first_or_default(val, overall_train_median_dict[c])
        return row

    # Work on a copy so the caller's frame is left as it was.
    df = df.copy()
    df['Embarked'] = df['Embarked'].fillna('S') # most occurrent
    df[cont_cols] = df.apply(fill_row_values, axis = 1)[cont_cols]
    return df

### Dividing into train and validation set

In [None]:
def get_processed_train_data():
    """Load the training split and run it through every pre-processing step in order."""
    steps = (
        process_passenger_id,
        process_name,
        process_ticket,
        process_cabin,
        process_df_types,
        fill_missing_values,
    )
    frame = get_train_data()
    for step in steps:
        frame = step(frame)
    return frame

def convert_train_df_to_tabular(df, valid_pct = 0.33, seed = None):
    """Wrap the processed training frame in a fastai ``TabularPandas``.

    Parameters
    ----------
    df : processed training frame; must contain ``Survived`` plus every
        column named by ``get_cat_cols()`` and ``get_cont_cols()``.
    valid_pct : fraction of rows placed in the validation set (default 0.33).
    seed : optional seed for the random train/validation split. The default
        ``None`` leaves the split unseeded, so it differs between runs; pass
        an integer to get a reproducible split.

    Returns
    -------
    A ``TabularPandas`` with ``Survived`` as a categorical target and the
    ``Categorify``, ``FillMissing`` and ``Normalize`` procs applied.
    """
    splits = RandomSplitter(valid_pct = valid_pct, seed = seed)(range_of(df))
    # FillMissing is not expected to change anything here because the
    # continuous columns have already been filled manually.
    return TabularPandas(
        df,
        procs = [Categorify, FillMissing, Normalize],
        cat_names = get_cat_cols(),
        cont_names = get_cont_cols(),
        y_names = 'Survived',
        y_block = CategoryBlock(),
        splits = splits)

def get_train_to():
    """Build the training ``TabularPandas`` straight from the CSV on disk."""
    processed = get_processed_train_data()
    return convert_train_df_to_tabular(processed)

## Modeling

In [None]:
# Processed training data wrapped for fastai (train/validation split included).
to = get_train_to()

# NOTE(review): `set_random_states` is passed through as an extra keyword
# argument; confirm that the data loaders actually honour it.
dls = to.dataloaders(set_random_states = 0, bs = 256)

# Recorded result of an earlier run (train_loss, valid_loss, accuracy):
# layers = [512, 128], learning = (90, 1e-3) -> 0.369492 	0.331962 	0.898876

# Two hidden layers of 512 and 128 units, tracking accuracy on the validation set.
learn = tabular_learner(dls, layers = [512, 128], metrics = accuracy)

# 100 epochs of one-cycle training with a peak learning rate of 1e-3.
learn.fit_one_cycle(n_epoch = 100, lr_max = 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.724132,0.692956,0.5,00:00
1,0.715482,0.694868,0.44898,00:00
2,0.705481,0.696494,0.414966,00:00
3,0.693962,0.697118,0.418367,00:00
4,0.67943,0.69652,0.445578,00:00
5,0.663626,0.694293,0.445578,00:00
6,0.647317,0.690222,0.571429,00:00
7,0.628709,0.684389,0.619048,00:00
8,0.609696,0.676823,0.605442,00:00
9,0.591076,0.668027,0.622449,00:00


In [None]:
# learn.lr_find()

## Submission

In [None]:
def get_processed_test_data():
    """Load the test split and run it through the same pre-processing steps as the training data."""
    steps = (
        process_passenger_id,
        process_name,
        process_ticket,
        process_cabin,
        process_df_types,
        fill_missing_values,
    )
    frame = get_test_data()
    for step in steps:
        frame = step(frame)
    return frame

# Build a test data loader from the learner's own data loaders and the
# processed test frame.
dl_test = learn.dls.test_dl(test_items = get_processed_test_data())

# `output[0]` holds the predictions that the submission cell thresholds.
output = learn.get_preds(dl = dl_test)

In [None]:
# Column 1 of the predictions is read as the score of the "survived" class;
# a passenger is predicted to survive when that score reaches 0.5.
survived = output[0][:, 1] >= 0.5

# PassengerId was dropped during pre-processing, so it is read again from the
# raw test file; rows are matched by position.
df_output = pd.DataFrame({
    'PassengerId': get_test_data().PassengerId,
    'Survived': survived
})

# The submission stores survival as the integers 1 / 0 rather than booleans.
df_output['Survived'] = df_output['Survived'].map(lambda flag: 1 if flag else 0)
df_output.to_csv('titanic_data/results.csv', index = False)