# Imports

In [1]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
%matplotlib inline

# Loading the Datasets

In [3]:
# Load data/test.csv; .info() prints each column's non-null count and dtype,
# which is what the missing-data notes further down are based on.
titanic_test = pd.read_csv('data/test.csv')
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [4]:
# Load data/train.csv (includes the 'Survived' column) and print its
# per-column non-null counts and dtypes.
titanic = pd.read_csv('data/train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## On missing data

### Training set
* 177 (~20%) of **ages** are missing. Age is expected to be a strong predictor, so I should work on remedying that. [Gertlowitz](http://gertlowitz.blogspot.com.br/2013/06/where-am-i-up-to-with-titanic-competion.html) used people's titles (Ms., Mr., etc.) to predict missing ages to good effect.

* 2(<1%) people are missing **embark** points. This shouldn't be too important, but my swarmplots indicated there might be something at work there.

* 687(~80%) people are missing **cabin** information. I expect deck location and floor to be a reasonable predictor, but there might not be a way to reliably estimate missing data.

### Test set
* 86(~20%) are missing **ages**.

* 1(<1%) is missing **fare**

* 327(~80%) are missing **cabin**.

## Defining the actually used subsets

### Training

In [5]:
# Columns kept for modelling. 'Survived' (the target) is deliberately first:
# the test subset is selected with used_features[1:] and the model cell
# slices its features with .iloc[:, 1:].
used_features = ['Survived', 'Pclass', 'Sex', 'Age',
                 'SibSp', 'Parch', 'Fare', 'Embarked']
# Explicit copy: this frame is re-encoded and imputed in place by later cells,
# so make it unambiguously independent of the raw `titanic` frame (which
# imputate_fare still reads) instead of relying on .loc returning a copy.
titanic_tr = titanic.loc[:, used_features].copy()
titanic_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


### Test

In [6]:
# Same feature columns as the training subset, minus the leading 'Survived'
# target, which the test file does not contain.
# Explicit copy: later cells overwrite Embarked, Sex, Age and Fare in place,
# and `titanic_test` is still needed untouched for the PassengerId column.
titanic_te = titanic_test.loc[:, used_features[1:]].copy()
titanic_te.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


# Data preprocessing

## Reencoding  the training set

Label-encode the categorical features in use (`Embarked` and `Sex`).

In [7]:
# Missing embarkation values get the placeholder category 'U' so that the
# encoder sees a label for every row.
titanic_tr['Embarked'] = titanic_tr['Embarked'].fillna('U')

# The fitted encoders are kept as variables: the test set is transformed with
# the very same label -> integer mapping in the next cell.
embarked_le = LabelEncoder().fit(titanic_tr['Embarked'])
titanic_tr['Embarked'] = embarked_le.transform(titanic_tr['Embarked'])

sex_le = LabelEncoder().fit(titanic_tr['Sex'])
titanic_tr['Sex'] = sex_le.transform(titanic_tr['Sex'])

## Reencoding the test set

In [8]:
# Reuse the encoders fitted on the training set so both frames share the same
# integer codes; missing ports get the same 'U' placeholder first.
titanic_te['Embarked'] = embarked_le.transform(titanic_te['Embarked'].fillna('U'))
titanic_te['Sex'] = sex_le.transform(titanic_te['Sex'])

# Missing Age and Fare values are imputed in the cells further down.

## Imputation of ages

Experimental imputation using a regression forest, because computation is cheap.

In [9]:
# Predictors for the age model: every used feature except 'Age' itself (the
# regression target) and 'Survived' (absent from the test set).
age_features = [col for col in used_features if col not in ('Age', 'Survived')]

# Rows with a known age, selected once and reused for both CV and the final fit.
age_known = titanic_tr['Age'].notna()
age_X = titanic_tr.loc[age_known, age_features]
age_y = titanic_tr.loc[age_known, 'Age']

# Fixed seed so the score and the imputed ages are reproducible across runs.
age_forest = RandomForestRegressor(n_jobs=-1, random_state=0)

# With no `scoring` argument cross_val_score uses the estimator's own score(),
# which for a regressor is R^2 (coefficient of determination), not accuracy.
scores = cross_val_score(age_forest, age_X, age_y, cv=10)
print("R^2: {:0.2f} (+/- {:0.2f})"
      .format(scores.mean(), scores.std() * 2))

# Final fit on every row with a known age; the next cells use it to predict
# the missing ages.
age_forest.fit(age_X, age_y);

Accuracy: 0.12 (+/- 0.27)


In [10]:
# Fill the missing training ages with the regression forest's predictions.
# Guarded so the cell can be re-run: once the gaps are filled the selection is
# empty, and predict() raises on an input with zero rows.
age_missing_tr = titanic_tr['Age'].isna()
if age_missing_tr.any():
    titanic_tr.loc[age_missing_tr, 'Age'] = \
        age_forest.predict(titanic_tr.loc[age_missing_tr, age_features])

In [11]:
# Fill the missing test-set ages with the forest fitted on the training data.
# Guarded so the cell can be re-run: once the gaps are filled the selection is
# empty, and predict() raises on an input with zero rows.
# NOTE(review): 'Fare' is one of age_features and the test set's single missing
# fare is only imputed in a later cell; this assumes no row lacks both Age and
# Fare -- confirm, or impute fares first.
age_missing_te = titanic_te['Age'].isna()
if age_missing_te.any():
    titanic_te.loc[age_missing_te, 'Age'] = \
        age_forest.predict(titanic_te.loc[age_missing_te, age_features])

## Imputation of fares

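For the one lone passenger in the test set without fare information I'm using an average fare for his Pclass in the training set: fares are first averaged per ticket, and the mean of those per-ticket averages is used, so each ticket counts once.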

In [12]:
def imputate_fare(row, reference=None):
    """Return the row's fare, substituting a class-typical fare when it is missing.

    The substitute is computed from the passengers in ``reference`` that share
    the row's ``Pclass``: fares are averaged per ``Ticket`` first, then the
    mean of those per-ticket averages is returned, so each ticket counts once.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Fare'`` and ``'Pclass'``. Intended for use with
        ``DataFrame.apply(imputate_fare, axis=1)``.
    reference : pandas.DataFrame, optional
        Frame with ``Pclass``, ``Ticket`` and ``Fare`` columns to derive the
        substitute from. Defaults to the notebook-level ``titanic`` frame.

    Returns
    -------
    float
        ``row['Fare']`` unchanged when present, otherwise the substitute
        (NaN if ``reference`` has no passenger of that class).
    """
    if not pd.isna(row['Fare']):
        return row['Fare']

    if reference is None:
        reference = titanic

    same_class = reference[reference['Pclass'] == row['Pclass']]
    # Select 'Fare' *before* aggregating: averaging the whole grouped frame
    # also tries to average text columns (Name, Sex, ...), which raises
    # TypeError in pandas >= 2.0.
    return same_class.groupby('Ticket')['Fare'].mean().mean()

In [None]:
# Row-wise pass over the test set (axis=1 hands each passenger to
# imputate_fare as a Series); rows that already have a fare keep it.
titanic_te['Fare'] = titanic_te.apply(imputate_fare, axis=1)

# Training the model

Random forest classifier, just to get things going:

In [None]:
# Features are every column after 'Survived', which is the first column of
# titanic_tr; the target is 'Survived' itself.
train_X = titanic_tr.iloc[:, 1:]
train_y = titanic_tr['Survived']

# Fixed seed so the CV score and the submitted predictions are reproducible.
model = RandomForestClassifier(criterion='entropy', n_jobs=3, random_state=0)
# Fit on the full training set; this fitted model produces the test predictions.
model.fit(train_X, train_y)

# 10-fold CV estimate of accuracy. cross_val_score fits clones of the
# estimator, so the model fitted above is left as is.
scores = cross_val_score(model, train_X, train_y, cv=10)
print("Accuracy: {:0.2f} (+/- {:0.2f})"
      .format(scores.mean(), scores.std() * 2))

With a random forest (or even a simple decision tree) and Pclass, Sex, Age, SibSp, Parch, Fare and Embarked as features, 82% accuracy seems to be the best I can do.

# Applying the model to the test set

In [None]:
# titanic_te holds used_features[1:], i.e. the same feature columns in the same
# order as the titanic_tr.iloc[:, 1:] frame the model was fitted on.
predictions = model.predict(titanic_te)

In [None]:
# Submission table: passenger ids from the raw test file next to the
# predicted survival labels, written without the index column.
results = pd.DataFrame({
    'PassengerId': titanic_test['PassengerId'],
    'Survived': predictions,
})
results.to_csv('predictions.csv',
               columns=['PassengerId', 'Survived'], index=False)

# Experiments and stuff