# Imports

In [31]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

In [32]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Loading the Datasets

In [33]:
# Load the test split and summarise its columns and non-null counts.
titanic_test = pd.read_csv('data/test.csv')
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [34]:
# Load the training split (the one carrying the Survived column) and
# summarise its columns and non-null counts.
titanic = pd.read_csv('data/train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [35]:
# Keep both frames in one list so the feature-engineering loops can add
# the same columns to each of them in place.
combined = [titanic, titanic_test]

# General purpose helpers

In [36]:
def fit_eval_model(fit_data, features, target, prd, fit_args=None, scoring=None, cv=10):
    """
    Fit a sklearn-style predictor on a dataframe, optionally preceded
    by a diagnostic cross-validation run.

    - fit_data is a dataframe holding both the feature columns and
      the target column.
    - features is a list of columns in fit_data to use in fitting
      the model, it should _not_ include the target column.
    - target is a string containing the name of the variable
      that is to be predicted.
    - prd is a predictor object exposing a fit(X, y, **kwargs) method.
    - fit_args optional, a dictionary of additional parameters to
      the predictor's fit method (None means no extra parameters).
    - scoring optional, the scoring method to be used in CV,
      if not provided then CV is skipped.
    - cv optional, forwarded to cross_val_score's cv argument
      (defaults to 10, the value this helper always used).

    Returns prd itself, after it has been fitted on all of fit_data.
    """
    X = fit_data.loc[:, features]
    y = fit_data[target]

    if scoring:
        scores = cross_val_score(prd, X, y, scoring=scoring, cv=cv)
        # "{:.8}" truncates the scoring name to 8 characters to keep the
        # report line short (e.g. "neg_mean").
        print("CV {:.8}: {:0.2f} (+/- {:0.2f})"
              .format(scoring, scores.mean(), scores.std() * 2))

    # The final fit always uses every row of fit_data, whether or not
    # the CV diagnostic ran.
    prd.fit(X, y, **(fit_args or {}))
    return prd

# Cleaning, Wrangling, Engineering

In this section I deal with missing data and engineering new features from the columns already present in the datasets.

The order of those processes is a bit of a mess because some engineered features depend on a feature with missing data that must first be imputed using other engineered features, and so on.

Also not all features engineered ended up being of any use, those will probably get cleaned up at a convenient time.

## On missing data

### Training set
* 177 (~20%) of **ages** are missing. Age is expected to be a strong predictor, so I should work on remedying that; [Gertlowitz](http://gertlowitz.blogspot.com.br/2013/06/where-am-i-up-to-with-titanic-competion.html) used people's titles (Ms., Mr., etc.) to predict missing ages to good effect.

* 2(<1%) people are missing **embark** points. This shouldn't be too important, but my swarmplots indicated there might be something at work there.

* 687(~80%) people are missing **cabin** information. I expect deck location and floor to be a reasonable predictor, but there might not be a way to reliably estimate missing data.

### Test set
* 86(~20%) are missing **ages**.

* 1(<1%) is missing **fare**

* 327(~80%) are missing **cabin**.

## Feature Engineering I

### Titles

Following the work of Gertlowitz mentioned above I extract the titles out of people's names in order to exploit it for both age and survival estimation.

In [37]:
def get_titles (data):
    """
    Extract each passenger's title from the Name column.

    - data is a dataframe with a string column called Name.

    Returns a Series holding the first run of word characters that
    follows ", " in each name, or NaN where the pattern does not match.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence. The trailing '.' is deliberately left unescaped (it
    # matches any character), so "Rothes, the Countess. of ..." yields
    # 'the', which group_titles expects.
    return data['Name'].str.extract(r', (\w+).', expand=False)

In [38]:
# Add the raw title extracted from each passenger's name to both frames.
for dataset in combined:
    dataset['Title'] = get_titles(dataset)

Most titles are too infrequent or not really relevant. I'm merging them into Mr, Master, Mrs and Miss as proxies for adult male, young male, adult married female and unmarried, possibly young, female respectively. There are many models that did just fine without a Miss/Mrs distinction, however.

In [39]:
def group_titles (row):
    """
    Map a passenger's raw title onto one of the merged title groups.

    - row is a mapping (e.g. a dataframe row) with a Title entry and,
      for passengers titled 'Dr', a Sex entry.

    Rare titles are folded into 'Miss', 'Mrs' or 'Mr'; 'Dr' becomes
    'Mr' for males and 'Miss' otherwise; any other title is returned
    unchanged.
    """
    title = row['Title']

    # 'Dr' is the only title whose group depends on the passenger's sex.
    if title == 'Dr':
        return 'Mr' if row['Sex'] == 'male' else 'Miss'

    merged = {}
    merged.update(dict.fromkeys(['the', 'Ms', 'Mme', 'Mlle', 'Dona'], 'Miss'))
    merged.update(dict.fromkeys(['Lady'], 'Mrs'))
    merged.update(dict.fromkeys(
        ['Jonkheer', 'Capt', 'Col', 'Don', 'Major', 'Sir', 'Rev'], 'Mr'))

    # Titles outside the table (Mr, Mrs, Miss, Master, ...) pass through.
    return merged.get(title, title)

In [40]:
# Collapse the raw titles row by row (axis=1) into the merged groups.
for dataset in combined:
    dataset['Title'] = dataset.apply(group_titles, 1)

### Surname

[Deotte](https://www.kaggle.com/cdeotte/titanic-using-name-only-0-81818) did an awesome job based only on names. I decided to borrow some of his ideas on surnames and groups of passengers.

In [41]:
def get_surnames (data):
    """
    Extract each passenger's surname from the Name column.

    - data is a dataframe with a string column called Name.

    Returns a Series with everything that precedes the last comma of
    each name (the match is greedy), or NaN for names without a comma.
    """
    surname_pattern = '^(.+),'
    names = data['Name']
    return names.str.extract(surname_pattern, expand=False)

In [42]:
# Add the surname extracted from each passenger's name to both frames.
for dataset in combined:
    dataset['Surname'] = get_surnames(dataset)

### Deck

Another thing Gertlowitz did was extract the deck letter from cabin information.

In [43]:
def get_deck (data):
    """
    Extract the deck letter from the Cabin column.

    - data is a dataframe with a string column called Cabin.

    Returns a Series holding the first character of each cabin string
    when it is a word character, NaN otherwise (including missing
    cabins).
    """
    # Raw string: '\w' in a normal string literal is an invalid escape
    # sequence.
    return data.Cabin.str.extract(r'^(\w)', expand=False)

In [44]:
for dataset in combined:
    # 'Z' is a placeholder cabin for passengers without cabin information,
    # so their extracted deck letter becomes 'Z' as well.
    dataset['Cabin'] = dataset['Cabin'].fillna('Z')
    dataset['Deck'] = get_deck(dataset)

### Family Aboard

I think both Gertlowitz and Sehgal did this.

In [45]:
def get_nfam (data):
    """
    Compute the family size aboard for every passenger.

    - data is a dataframe with numeric Parch and SibSp columns.

    Returns a Series equal to Parch + SibSp + 1, the extra one counting
    the passenger themself.
    """
    relatives = data['Parch'] + data['SibSp']
    return relatives + 1

In [46]:
# Add the family-size feature to both frames.
for dataset in combined:
    dataset['Fam'] = get_nfam(dataset)

### Alone

Another feature due to Sehgal

In [47]:
def get_alone (fam):
    """
    Flag a passenger travelling without family.

    - fam is a single family-size value (as produced by get_nfam).

    Returns 1 when fam equals 1, 0 otherwise.
    """
    return 1 if fam == 1 else 0

In [48]:
# Add the 0/1 travelling-alone flag, derived element-wise from Fam.
for dataset in combined:
    dataset['Alone'] = dataset.Fam.apply(get_alone)

## Reencoding  the training set

For the used categorical features

In [49]:
def gen_fit_les (data):
    """
    Build one fitted LabelEncoder per object-typed column of data.

    Returns a dictionary mapping each such column's name to a
    LabelEncoder that has been fitted on that column's values;
    columns of any other dtype are left out.
    """
    encoders = {}

    for col in data:
        if data[col].dtype.name != 'object':
            continue

        encoder = LabelEncoder()
        encoder.fit(data[col])
        encoders[col] = encoder

    return encoders

def les_transform (data, le_dict):
    """
    Encode data's columns in place with the encoders in le_dict,
    a dictionary of column name -> fitted encoder such as the one
    generated by gen_fit_les.

    Each listed column is overwritten with its encoder's transform
    output; nothing is returned.
    """
    for col, le in le_dict.items():
        data[col] = le.transform(data[col])
        
def les_inverse_transform (data, le_dict):
    """
    Decode data's columns in place with the encoders in le_dict,
    a dictionary of column name -> fitted encoder such as the one
    generated by gen_fit_les.

    Each listed column is overwritten with its encoder's
    inverse_transform output; nothing is returned.
    """
    for col, le in le_dict.items():
        data[col] = le.inverse_transform(data[col])

In [50]:
# 'U' is a placeholder label for passengers with no embarkation port.
titanic.Embarked = titanic.Embarked.fillna('U')

# Because some labels are present in only one set, it's necessary
# to train the encoders on their concatenation
cat_cols_les = gen_fit_les(pd.concat(combined, sort=False))
# Overwrites every object column of the training frame with its encoded
# values, in place.
les_transform(titanic, cat_cols_les)

## Reencoding  the test set

In [51]:
# Same 'U' placeholder as used for the training frame.
titanic_test.Embarked = titanic_test.Embarked.fillna('U')

# Reuse the encoders fitted on both frames so codes agree across the sets.
les_transform(titanic_test, cat_cols_les)

## Imputation of fares

For the one lone passenger in the test set without fare information I'm using the fares for his Pclass in the training set: the fare is first averaged per ticket, and the mean of those per-ticket averages is used.

In [52]:
def imputate_fare(row, fares_source=None):
    """
    Return row's fare, estimating it when it is missing.

    A missing fare is replaced by the mean, over the tickets held by
    passengers of the same Pclass in the reference frame, of each
    ticket's mean fare.

    - row is a mapping (e.g. a dataframe row) with Fare and Pclass
      entries.
    - fares_source optional, a dataframe with Pclass, Ticket and Fare
      columns to estimate from; defaults to the global training
      frame `titanic`.
    """
    if not pd.isna(row['Fare']):
        return row['Fare']

    if fares_source is None:
        fares_source = titanic

    same_class = fares_source[fares_source['Pclass'] == row['Pclass']]
    # Select Fare before aggregating: only that column is needed, and
    # averaging every column raises on non-numeric ones in recent pandas.
    return same_class.groupby('Ticket')['Fare'].mean().mean()

In [53]:
# Row-wise (axis=1) pass that fills in the missing test-set fare.
titanic_test['Fare'] = titanic_test.apply(imputate_fare, 1)

## Imputation of ages

The age imputation models are trained on the whole set of data with valid ages, spanning both the training and test sets. Is that really kosher though?

In [54]:
# Columns the age regressor is allowed to see.
age_features = ['Pclass', 'Parch', 'SibSp', 'Title']

# Every passenger with a known age, from both the training and test frames.
known_age_parts = [
    titanic.loc[titanic['Age'].notna()],
    titanic_test.loc[titanic_test['Age'].notna()],
]
age_train = pd.concat(known_age_parts, sort=False)

In [55]:
# Fit the age regressor on every passenger with a known age; passing
# `scoring` makes the helper print a cross-validated (negated) mean
# absolute error before the final fit.
age_model = fit_eval_model(age_train, 
                            age_features,
                            'Age',
                            GradientBoostingRegressor(),
                            scoring='neg_mean_absolute_error'
                           )

CV neg_mean: -8.15 (+/- 0.94)


In [56]:
# Replace the missing training-set ages with the regressor's predictions.
train_age_missing = titanic['Age'].isna()
titanic.loc[train_age_missing, 'Age'] = age_model.predict(
    titanic.loc[train_age_missing, age_features]
)

In [57]:
# Replace the missing test-set ages with the regressor's predictions.
test_age_missing = titanic_test['Age'].isna()
titanic_test.loc[test_age_missing, 'Age'] = age_model.predict(
    titanic_test.loc[test_age_missing, age_features]
)

## Age*Pclass

This is due to [Sehgal](https://www.kaggle.com/startupsci/titanic-data-science-solutions)

In [58]:
# Interaction feature: age multiplied by passenger class, on both frames.
for dataset in combined:
    dataset['AgePclass'] = dataset['Age'] * dataset['Pclass']

# Training the model

I reserve 30% of the training set in order to perform validation experiments that may or may not get pushed into the repository.

In [287]:
# Earlier feature set, kept for reference:
#train_features = ['Title', 'Sex', 'AgePclass', 'Fare', 'Fam', 'Ticket']
train_features = ['Sex', 'Age', 'Fare', 'Ticket', 'Fam', 'Alone', 'Surname']

# Hold out 30% of the training frame for validation. The fixed seed keeps
# the split, and every metric computed on it, identical across
# Restart & Run All.
train_set, test_set = train_test_split(titanic[['Survived'] + train_features],
                                       test_size=0.3,
                                       random_state=0
                                      )

In [288]:
# Fit the survival classifier on the 70% split; passing `scoring` makes
# the helper print a cross-validated F1 score before the final fit.
model = fit_eval_model(train_set, 
                       train_features,
                       'Survived',
                       GradientBoostingClassifier(),
                       scoring='f1'
                      )

CV f1: 0.79 (+/- 0.12)


In [289]:
# Confusion matrix on the hold-out split: rows are actual classes,
# columns are predicted classes.
pred = model.predict(test_set[train_features])
confusion_matrix(test_set['Survived'], pred)

array([[147,  22],
       [ 21,  78]])

In [120]:
# Code each hold-out row by its (actual, predicted) pair:
# 0 = misclassified, 1 = correctly predicted non-survivor,
# 2 = correctly predicted survivor.
outcome_codes = {
    (0, 0): 1,
    (0, 1): 0,
    (1, 0): 0,
    (1, 1): 2,
}
test_set['predErr'] = pred
test_set['predErr'] = [
    outcome_codes[(actual, predicted)]
    for actual, predicted in zip(test_set['Survived'], test_set['predErr'])
]

In [122]:
# Switch matplotlib to the Qt5 backend for this plot.
%matplotlib qt5
# Pair plot of the training features for hold-out rows whose predErr code
# is non-zero, coloured by that code.
sns.pairplot(test_set.loc[test_set['predErr'] != 0, train_features + ['predErr']], hue='predErr')

<seaborn.axisgrid.PairGrid at 0x7efc006bcfd0>

# Applying the model to the test set

In [326]:
# Predict survival for the test frame with the same feature columns the
# model was fitted on.
predictions = model.predict(titanic_test.loc[:, train_features])

In [327]:
# Two-column output file: passenger id and predicted survival, written
# without the dataframe index.
results = pd.DataFrame(titanic_test['PassengerId'])
results['Survived'] = predictions
results.to_csv('predictions.csv', 
                  columns=('PassengerId', 'Survived'), index=False)

# Experiments and stuff

In [77]:
# Labels learned by the Sex encoder.
cat_cols_les['Sex'].classes_

array(['female', 'male'], dtype=object)

In [283]:
# Pair each training feature with its importance in the fitted model.
[*zip(train_features, model.feature_importances_)]

[('Sex', 0.14341193778147643),
 ('Age', 0.17842239554227896),
 ('Fare', 0.23742625427085695),
 ('Ticket', 0.3852047227195057),
 ('Fam', 0.05553468968588207)]

In [299]:
# Non-null counts per (Ticket, Surname) pair, to see which passengers
# share both a ticket and a surname.
titanic.groupby(['Ticket', 'Surname']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title,Deck,Fam,Alone,AgePclass
Ticket,Surname,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
110152,Cherry,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
110152,Maioni,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
110152,Rothes,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
110413,Taussig,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
110465,Clifford,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
110465,Porter,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
110564,Bjornstrom-Steffansson,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
110813,Warren,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
111240,Van der hoef,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
111320,Gee,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
