# Imports

In [1]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
%matplotlib inline

# Loading the Datasets

In [3]:
titanic_test = pd.read_csv('data/test.csv')
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [4]:
titanic = pd.read_csv('data/train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Feature Engineering

### Titles

Following the work of Gertlowitz mentioned above I extract the titles out of people's names in order to exploit it for both age and survival estimation.

In [5]:
def get_titles (data):
    return data['Name'].str.extract(', (\w+).', expand=False)

In [6]:
titanic['Title'] = get_titles(titanic)

In [7]:
titanic_test['Title'] = get_titles(titanic_test)

### Deck

Another thing Gertlowitz did was extract the deck letter from cabin information.

In [8]:
def get_deck (data):
    return data.Cabin.fillna('Z').str.extract('^(\w)', expand=False)

In [9]:
titanic['Deck'] = get_deck(titanic)

In [10]:
titanic_test['Deck'] = get_deck(titanic_test)

## On missing data

### Training set
* 177(~20%) of **ages** are missing. Age is expected to be a strong predictor. I should work on remedying that, [Gertlowitz](http://gertlowitz.blogspot.com.br/2013/06/where-am-i-up-to-with-titanic-competion.html) used people's titles(Ms., Mr. etc.) to predict missing ages to good measure.

* 2(<1%) people are missing **embark** points. This shouldn't be too important, but my swarmplots indicated there might be something at work there.

* 687(~80%) people are missing **cabin** information. I expect deck location and floor to be a reasonable predictor, but there might not be a way to reliably estimate missing data.

### Test set
* 86(~20%) are missing **ages**.

* 1(<1%) is missing **fare**

* 327(~80%) are missing **cabin**.

## Defining the used subsets

### Training

In [11]:
used_features = ['Survived', 'Pclass', 'Title', 'Sex', 'Age',
                 'SibSp', 'Parch', 'Ticket', 'Fare', 'Deck', 'Embarked']
data_train = titanic.loc[:, used_features]
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Title       891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Deck        891 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


### Test

In [12]:
test_features = list(used_features)
test_features.remove('Survived')
data_test = titanic_test.loc[:, test_features]
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
Pclass      418 non-null int64
Title       418 non-null object
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Ticket      418 non-null object
Fare        417 non-null float64
Deck        418 non-null object
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 32.7+ KB


# Data preprocessing

## Reencoding  the training set

For the used categorical features

In [13]:
def gen_fit_les (data):
    """
    Generates a dictionary of LabelEncoders with an entry
    for each column in data that is of type object
    """
    
    cat_cols_les = { x: LabelEncoder() \
                    for x in data if data[x].dtype.name == 'object'}

    for item in cat_cols_les.items():
        col = item[0]
        le = item[1]
        
        le.fit(data[col])
        
    return cat_cols_les

def les_transform (data, le_dict):
    """
    Transform data's columns with a dictionary generated by
    gen_fit_les (above)
    """
    
    for item in le_dict.items():
        col = item[0]
        le = item[1]
        
        data[col] = le.transform(data[col])

In [14]:
data_train.Embarked = data_train.Embarked.fillna('U')

# Because some labels are present in only one set, it's necessary
# to train the encoders on their concatenation
cat_cols_les = gen_fit_les(pd.concat([data_train, data_test]))
les_transform(data_train, cat_cols_les)

## Reencoding  the test set

In [15]:
data_test.Embarked = data_test.Embarked.fillna('U')

les_transform(data_test, cat_cols_les)

## Imputation of ages

Experimental imputation using a regression forest, because computation is cheap.

In [23]:
age_features = list(used_features)
age_features.remove('Age')
age_features.remove('Survived')
age_features = ['Title', 'Parch', 'Ticket', 'Parch', 'Fare']

age_forest = RandomForestRegressor(n_jobs=-1)

scores = cross_val_score(age_forest,
                         data_train[~np.isnan(data_train.Age)].loc[:, age_features],
                         data_train[~np.isnan(data_train.Age)]['Age'], 
                         cv=10)
print("Accuracy: {:0.2f} (+/- {:0.2f})" \
      .format(scores.mean(), scores.std() * 2))


age_forest.fit(data_train[~np.isnan(data_train.Age)].loc[:, age_features],
               data_train[~np.isnan(data_train.Age)]['Age']);

Accuracy: 0.35 (+/- 0.30)


In [24]:
data_train.loc[np.isnan(titanic.Age), 'Age'] = \
    age_forest.predict(data_train[np.isnan(titanic.Age)] \
                       .loc[:, age_features])

In [25]:
data_test.loc[np.isnan(titanic_test.Age), 'Age'] = \
    age_forest.predict(data_test[np.isnan(titanic_test.Age)] \
                       .loc[:, age_features])

## Imputation of fares

For the one lone passenger in the test set without fare information I'm using the mean of the fares for his Pclass in the training set.

In [19]:
def imputate_fare(row):
    if np.isnan(row['Fare']):
        return titanic[titanic.Pclass == row['Pclass']] \
                .groupby('Ticket').mean()['Fare'].mean()
    else:
        return row['Fare']

In [20]:
data_test['Fare'] = data_test.apply(imputate_fare, 1)

# Training the model

Random forest classifier, just to get things going:

In [29]:
train_features = list(used_features)
train_features.remove('Survived')
train_features = ['Title', 'Sex', 'Age', 'Ticket', 'Fare']

model = RandomForestClassifier(
                               n_jobs=-1
                              )
model.fit(data_train.loc[:, train_features], data_train['Survived'])
#model.score(test.iloc[:, 1:], test['Survived'])
scores = cross_val_score(model, 
                         data_train.iloc[:, 1:], 
                         data_train['Survived'], 
                         cv=10,
                         n_jobs=-1
                        )
print("Accuracy: {:0.2f} (+/- {:0.2f})" \
      .format(scores.mean(), scores.std() * 2))

Accuracy: 0.82 (+/- 0.07)


With a random forest(or even a simple decision tree) and Pclass, Sex, Age, SibSp, Parch, Fare and Embarked as features 82% accuracy seems to be the best I can do.

# Applying the model to the test set

In [28]:
predictions = model.predict(data_test.loc[:, train_features])

In [None]:
results = pd.DataFrame(titanic_test['PassengerId'])
results['Survived'] = predictions
results.to_csv('predictions.csv', 
                  columns=('PassengerId', 'Survived'), index=False)

# Experiments and stuff

In [30]:
{x: y for x,y in zip(train_features, model.feature_importances_)}

{'Title': 0.12002156251845848,
 'Sex': 0.16039342583623706,
 'Age': 0.20061950971480497,
 'Ticket': 0.2777411084290061,
 'Fare': 0.2412243935014934}

In [31]:
{x: y for x,y in zip(age_features, age_forest.feature_importances_)}

{'Title': 0.2417933056848328,
 'Parch': 0.03708994190915848,
 'Ticket': 0.34124144592229466,
 'Fare': 0.264198197533486}