In [118]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [119]:
# Select matplotlib's Qt5 GUI backend, so figures open in external windows
# rather than being embedded inline in the notebook output.
%matplotlib qt5

In [137]:
# Read data/test.csv and print a summary of its columns, dtypes and
# non-null counts.
titanic_test = pd.read_csv(filepath_or_buffer='data/test.csv')
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [120]:
# Read data/train.csv and print a summary of its columns, dtypes and
# non-null counts.
titanic = pd.read_csv(filepath_or_buffer='data/train.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### On missing data

#### Training set
* 177 (~20%) of **ages** are missing. Age is expected to be a strong predictor, so I should work on remedying that. [Gertlowitz](http://gertlowitz.blogspot.com.br/2013/06/where-am-i-up-to-with-titanic-competion.html) used people's titles (Ms., Mr., etc.) to predict missing ages to good effect.

* 2 (<1%) people are missing **embark** points (889 of 891 are non-null). This shouldn't be too important, but my swarmplots indicated there might be something at work there.

* 687(~80%) people are missing **cabin** information. I expect deck location and floor to be a reasonable predictor, but there might not be a way to reliably estimate missing data.

#### Test set
* 86(~20%) are missing **ages**.

* 1(<1%) is missing **fare**

* 327(~80%) are missing **cabin**.

# Data cleaning

### Decategorizing data

* Sex: m,f -> 0,1

* Embarked: C,Q,S,nan -> 0,1,2,3

(It's really about forcing them into categories represented by numbers)

In [121]:
def decat_Embarked(item):
    """Translate an embarkation label into its integer category.

    'C' -> 0, 'Q' -> 1, 'S' -> 2 and 'U' -> 3. Any other value, including
    an unfilled NaN, raises KeyError.
    """
    codes = dict(C=0, Q=1, S=2, U=3)
    return codes[item]

def decat_Sex(item):
    """Translate a sex label into its integer category.

    'male' -> 0 and 'female' -> 1. Any other value raises KeyError.
    """
    codes = dict(male=0, female=1)
    return codes[item]

In [122]:
# Encode the categorical columns as integers, but only while they still hold
# the raw string labels. The decat_* helpers raise KeyError on values that are
# already encoded, so without these guards re-running the cell would fail.
if not pd.api.types.is_numeric_dtype(titanic.Embarked):
    # Missing embarkation points get the placeholder 'U', which encodes to 3.
    titanic.Embarked = titanic.Embarked.fillna('U').apply(decat_Embarked)

if not pd.api.types.is_numeric_dtype(titanic.Sex):
    titanic.Sex = titanic.Sex.apply(decat_Sex)

In [123]:
# Columns carried forward for modelling.
used_features = [
    'PassengerId', 'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
]
# Keep only the rows that have no missing value in any selected column.
titanic_valid = titanic[used_features].dropna()
titanic_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    714 non-null int64
Survived       714 non-null int64
Pclass         714 non-null int64
Sex            714 non-null int64
Age            714 non-null float64
SibSp          714 non-null int64
Parch          714 non-null int64
Fare           714 non-null float64
Embarked       714 non-null int64
dtypes: float64(2), int64(7)
memory usage: 55.8 KB


In [124]:
# Hold out part of the cleaned data; iloc[:, 1:] drops the leading
# PassengerId column. A fixed random_state pins the shuffle so the same
# split is produced on every run.
train, test = train_test_split(titanic_valid.iloc[:, 1:], random_state=42)

Random forest classifier, just to get things going:

In [136]:
# A fixed random_state makes the forest, and therefore the cross-validation
# scores printed below, repeatable from run to run.
model = RandomForestClassifier(criterion='entropy', n_jobs=3, random_state=42)
# Alternative evaluation on the single hold-out split (kept for reference):
#model.fit(train.iloc[:, 1:], train['Survived'])
#model.score(test.iloc[:, 1:], test['Survived'])
# Features are every column after PassengerId and Survived; 10-fold CV.
scores = cross_val_score(model, titanic_valid.iloc[:, 2:],
                         titanic_valid['Survived'], cv=10)
print("Accuracy: {:0.2f} (+/- {:0.2f})".format(scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.09)


With a random forest (or even a simple decision tree) using Pclass, Sex, Age, SibSp, Parch, Fare and Embarked as features, about 80–82% accuracy seems to be the best I can do (the cross-validated run above scored 0.80).

In [94]:
# Passengers whose Cabin value is present.
cabineers = titanic.dropna(subset=['Cabin'])

In [97]:
# Count passengers with a known cabin per ticket class. The column is named
# 'Pclass'; the previous key 'Pclas1s' was a typo and raises KeyError.
groups = cabineers.groupby('Pclass')
groups.size()

Pclass
1    176
2     16
3     12
dtype: int64

In [113]:
# Count how many passengers with a known cabin share each ticket number.
ticket_groups = cabineers.groupby(by='Ticket')
ticket_groups.size()

Ticket
110152               3
110413               3
110465               2
110564               1
110813               1
111240               1
111320               1
111361               2
111369               1
112050               1
112053               1
112058               1
112059               1
112277               1
113028               1
113043               1
113050               1
113051               1
113055               1
113056               1
113501               1
113503               1
113505               2
113509               1
113510               1
113514               1
113572               2
113760               4
113767               1
113773               1
                    ..
PC 17482             1
PC 17483             1
PC 17485             2
PC 17558             2
PC 17569             2
PC 17572             3
PC 17582             3
PC 17590             1
PC 17592             1
PC 17593             2
PC 17595             1
PC 17596             1
PC 1

0.047517827758518684