# Imports

In [147]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [148]:
%matplotlib inline

# Loading the Datasets

In [149]:
titanic_test = pd.read_csv('data/test.csv')
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [150]:
titanic = pd.read_csv('data/train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [151]:
combined = [titanic, titanic_test]

# Cleaning, Wrangling, Engineering

In this section I deal with missing data and engineering new features from the columns already present in the datasets.

The order of those processes is a bit of a mess because some engineered features depend on a feature with missing data that must be first imputated using other engineered features and so on.

## On missing data

### Training set
* 177(~20%) of **ages** are missing. Age is expected to be a strong predictor. I should work on remedying that, [Gertlowitz](http://gertlowitz.blogspot.com.br/2013/06/where-am-i-up-to-with-titanic-competion.html) used people's titles(Ms., Mr. etc.) to predict missing ages to good measure.

* 2(<1%) people are missing **embark** points. This shouldn't be too important, but my swarmplots indicated there might be something at work there.

* 687(~80%) people are missing **cabin** information. I expect deck location and floor to be a reasonable predictor, but there might not be a way to reliably estimate missing data.

### Test set
* 86(~20%) are missing **ages**.

* 1(<1%) is missing **fare**

* 327(~80%) are missing **cabin**.

## Feature Engineering I

### Titles

Following the work of Gertlowitz mentioned above I extract the titles out of people's names in order to exploit it for both age and survival estimation.

In [152]:
def get_titles (data):
    return data['Name'].str.extract(', (\w+).', expand=False)

In [153]:
for dataset in combined:
    dataset['Title'] = get_titles(dataset)

In [154]:
def group_titles (row):
    miss_titles = ['the', 'Ms', 'Mme', 'Mlle', 'Dona']
    mrs_titles = ['Lady']
    mr_titles = ['Jonkheer', 'Capt', 'Col', 'Don', 'Major', 'Sir', 'Rev']
    
    if row['Title'] in miss_titles:
        return 'Miss'
    elif row['Title'] in mr_titles:
        return 'Mr'
    elif row['Title'] in mrs_titles:
        return 'Mrs'
    elif row['Title'] == 'Dr':
        if row['Sex'] == 'male':
            return 'Mr'
        else:
            return 'Miss'
        
    else:
        return row['Title']

In [155]:
for dataset in combined:
    dataset['Title'] = dataset.apply(group_titles, 1)

### Deck

Another thing Gertlowitz did was extract the deck letter from cabin information.

In [156]:
def get_deck (data):
    return data.Cabin.fillna('Z').str.extract('^(\w)', expand=False)

In [157]:
for dataset in combined:
    dataset['Deck'] = get_deck(dataset)

### Family Aboard

I think both Gertlowitz and Sehgal did this.

In [158]:
def get_nfam (data):
    return data.Parch + data.SibSp + 1

In [159]:
for dataset in combined:
    dataset['Fam'] = get_nfam(dataset)

### Alone

Another feature due to Sehgal

In [160]:
def get_alone (fam):
    return {True: 1, False: 0}[fam == 1]

In [161]:
for dataset in combined:
    dataset['Alone'] = dataset.Fam.apply(get_alone)

## Defining the used subsets

### Training

In [162]:
used_features = ['Survived', 'Pclass', 'Title', 'Sex', 'Age',
                 'Fam', 'Ticket', 'Fare', 'Deck', 'Embarked', 'Alone']
data_train = titanic.loc[:, used_features]
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Title       891 non-null object
Sex         891 non-null object
Age         714 non-null float64
Fam         891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Deck        891 non-null object
Embarked    889 non-null object
Alone       891 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


### Test

In [163]:
test_features = list(used_features)
test_features.remove('Survived')
data_test = titanic_test.loc[:, test_features]
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
Pclass      418 non-null int64
Title       418 non-null object
Sex         418 non-null object
Age         332 non-null float64
Fam         418 non-null int64
Ticket      418 non-null object
Fare        417 non-null float64
Deck        418 non-null object
Embarked    418 non-null object
Alone       418 non-null int64
dtypes: float64(2), int64(3), object(5)
memory usage: 32.7+ KB


In [164]:
combined_used = [data_train, data_test]

## Reencoding  the training set

For the used categorical features

In [165]:
def gen_fit_les (data):
    """
    Generates a dictionary of LabelEncoders with an entry
    for each column in data that is of type object
    """
    
    cat_cols_les = { x: LabelEncoder() \
                    for x in data if data[x].dtype.name == 'object'}

    for item in cat_cols_les.items():
        col = item[0]
        le = item[1]
        
        le.fit(data[col])
        
    return cat_cols_les

def les_transform (data, le_dict):
    """
    Transform data's columns with a dictionary generated by
    gen_fit_les (above)
    """
    
    for item in le_dict.items():
        col = item[0]
        le = item[1]
        
        data[col] = le.transform(data[col])

In [166]:
data_train.Embarked = data_train.Embarked.fillna('U')

# Because some labels are present in only one set, it's necessary
# to train the encoders on their concatenation
cat_cols_les = gen_fit_les(pd.concat([data_train, data_test]))
les_transform(data_train, cat_cols_les)

## Reencoding  the test set

In [167]:
data_test.Embarked = data_test.Embarked.fillna('U')

les_transform(data_test, cat_cols_les)

## Imputation of fares

For the one lone passenger in the test set without fare information I'm using the mean of the fares for his Pclass in the training set.

In [168]:
def imputate_fare(row):
    if np.isnan(row['Fare']):
        return titanic[titanic.Pclass == row['Pclass']] \
                .groupby('Ticket').mean()['Fare'].mean()
    else:
        return row['Fare']

In [169]:
data_test['Fare'] = data_test.apply(imputate_fare, 1)

## Imputation of ages

Experimental imputation using a regression forest, because computation is cheap.

In [170]:
age_features = ['Pclass', 'Title', 'Fam', 'Ticket', 'Fare']

age_forest = RandomForestRegressor(n_jobs=-1)

# Is this really kosher?
age_train = pd.concat([
                        data_train[~np.isnan(data_train.Age)],
                        data_test[~np.isnan(data_test.Age)]
                      ])

scores = cross_val_score(age_forest,
                         age_train.loc[:, age_features],
                         age_train['Age'], 
                         cv=10)
print("CV Accuracy: {:0.2f} (+/- {:0.2f})" \
      .format(scores.mean(), scores.std() * 2))


age_forest.fit(age_train.loc[:, age_features],
               age_train['Age']);

CV Accuracy: 0.31 (+/- 0.21)


In [171]:
data_train.loc[np.isnan(titanic.Age), 'Age'] = \
    age_forest.predict(data_train[np.isnan(titanic.Age)] \
                       .loc[:, age_features])

In [172]:
data_test.loc[np.isnan(titanic_test.Age), 'Age'] = \
    age_forest.predict(data_test[np.isnan(titanic_test.Age)] \
                       .loc[:, age_features])

## Age*Pclass

This is due to [Sehgal](https://www.kaggle.com/startupsci/titanic-data-science-solutions)

In [173]:
used_features.remove('Age')
used_features.remove('Pclass')
used_features.append('AgePclass')

for dataset in combined_used:
    dataset['AgePclass'] = dataset['Age'] * dataset['Pclass']

# Training the model

Random forest classifier, just to get things going:

In [174]:
train_features = list(used_features)
train_features.remove('Survived')
train_features = ['Title', 'Sex', 'AgePclass', 'Fare', 'Ticket', 'Alone']

model = RandomForestClassifier(
                               n_jobs=-1
                              )
model.fit(data_train.loc[:, train_features], data_train['Survived'])
#model.score(test.iloc[:, 1:], test['Survived'])
scores = cross_val_score(model, 
                         data_train.iloc[:, 1:], 
                         data_train['Survived'], 
                         cv=10,
                         n_jobs=-1
                        )
print("CV Accuracy: {:0.2f} (+/- {:0.2f})" \
      .format(scores.mean(), scores.std() * 2))

CV Accuracy: 0.83 (+/- 0.04)


# Applying the model to the test set

In [29]:
predictions = model.predict(data_test.loc[:, train_features])

In [30]:
results = pd.DataFrame(titanic_test['PassengerId'])
results['Survived'] = predictions
results.to_csv('predictions.csv', 
                  columns=('PassengerId', 'Survived'), index=False)

# Experiments and stuff

In [117]:
[*zip(age_features, age_forest.feature_importances_)]

[('Pclass', 0.1304445892664015),
 ('Title', 0.25744065637321223),
 ('Fam', 0.11672885355080706),
 ('Ticket', 0.30182037746726315),
 ('Fare', 0.19356552334231608)]

In [118]:
[*zip(train_features, model.feature_importances_)]

[('Title', 0.09912321892544149),
 ('Sex', 0.17509023040807634),
 ('AgePclass', 0.2649289012184134),
 ('Fare', 0.19375925465290028),
 ('Ticket', 0.24579442631515702),
 ('Alone', 0.021303968480011558)]