In [None]:
import pandas as pd

In [None]:
# Relative path prefix that is prepended to the CSV file names loaded below.
prefix = "../datasets/titanic/"

In [None]:
# Read the two CSV files found under `prefix`.
# NOTE(review): `labels` is not referenced by any other cell visible in this
# notebook; presumably the file holds baseline predictions rather than
# ground-truth labels -- confirm before relying on it.
data = pd.read_csv(prefix + "train.csv")
labels = pd.read_csv(prefix + "genderclassmodel.csv")

## Dataset information

From [here](https://www.kaggle.com/c/titanic/data).

```
VARIABLE DESCRIPTIONS:
survival        Survival
                (0 = No; 1 = Yes)
pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
name            Name
sex             Sex
age             Age
sibsp           Number of Siblings/Spouses Aboard
parch           Number of Parents/Children Aboard
ticket          Ticket Number
fare            Passenger Fare
cabin           Cabin
embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)

SPECIAL NOTES:
Pclass is a proxy for socio-economic status (SES)
 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower

Age is in Years; Fractional if Age less than One (1)
 If the Age is Estimated, it is in the form xx.5

With respect to the family relation variables (i.e. sibsp and parch)
some relations were ignored.  The following are the definitions used
for sibsp and parch.

Sibling:  Brother, Sister, Stepbrother, or Stepsister of Passenger Aboard Titanic
Spouse:   Husband or Wife of Passenger Aboard Titanic (Mistresses and Fiances Ignored)
Parent:   Mother or Father of Passenger Aboard Titanic
Child:    Son, Daughter, Stepson, or Stepdaughter of Passenger Aboard Titanic

Other family relatives excluded from this study include cousins,
nephews/nieces, aunts/uncles, and in-laws.  Some children travelled
only with a nanny, therefore parch=0 for them.  As well, some
travelled with very close friends or neighbors in a village, however,
the definitions do not support such relations.
```

In [None]:
# Preview the first rows of the training data.
data.head()

In [None]:
def count_nan(df):
    """Count the missing (NaN/None) values in each column of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by the column names of ``df``; each entry is the number of
        missing values found in that column.

    http://stackoverflow.com/a/26266451/2014591
    """
    # Boolean mask of missing cells; summing it adds up the True values
    # down each column.
    missing_mask = df.isnull()
    return missing_mask.sum()

In [None]:
# Missing-value count for every column of the training data.
count_nan(data)

In [None]:
# Number of rows in the training data.
len(data)

In [None]:
# Try a simple model: four predictor columns plus the "Survived" target.
features = [
    "Pclass",
    "Sex",
    "Fare",
    "Embarked",
    "Survived",
]
subset = data[features]

In [None]:
# Preview the first rows of the selected columns.
subset.head()

In [None]:
# Separate the target column from the predictor columns.
target_name = "Survived"
X = subset.drop(target_name, axis=1)
y = subset[target_name]

## The quick and easy experimentation procedure

1. Make your train test split
2. Encode the categorical variables, scale the variables
3. Fit and evaluate the model

In [None]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20.  Fall back to
# the old location only when running against a legacy installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out a test set using scikit-learn's default split size; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

## Analyzing who lived or died, based upon fare

In [None]:
# Plot a histogram of the fares
# http://stackoverflow.com/a/6873956/2014591   <- plotting two histograms at once
%matplotlib inline
import matplotlib.pyplot as plt

# Way to change sizes of plots
# http://stackoverflow.com/a/332311/2014591
from pylab import rcParams
# Default figure size (width, height) for the plots that follow.
rcParams['figure.figsize'] = (8, 5)

# Split the training rows by outcome.
survived = X_train[y_train == 1]
died = X_train[y_train == 0]

# Discard survivors whose fare is above 300 (presumably outliers that would
# stretch the x-axis -- confirm).
fare_over_300 = survived["Fare"] > 300
survived = survived[~fare_over_300]

# Stacked histogram of fares, one colour per outcome.
# http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hist
plt.hist(
    [survived["Fare"], died["Fare"]],
    bins=10,
    stacked=True,
    alpha=0.5,
    color=['g', 'r'],
    label=["Survived", "Died"]
)
plt.xlabel("Fare")
plt.ylabel("Number of Passengers")
plt.legend()

## Analyzing people who lived or died, based upon gender

In [None]:
# Break the training rows down by outcome, then by sex.
survived = X_train[y_train == 1]
died = X_train[y_train == 0]

# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the Python 2 print statement is a SyntaxError on Python 3.
print("SURVIVORS")
print(survived["Sex"].value_counts())  # absolute counts
# Fraction of all survivors (the denominator includes rows with a missing value).
print(survived["Sex"].value_counts() / len(survived))
print("")  # Newline

print("PEOPLE WHO DIED")
print(died["Sex"].value_counts())  # absolute counts
# Fraction of all passengers who died (same denominator convention as above).
print(died["Sex"].value_counts() / len(died))

## Analyzing people who lived or died, based upon which port they came from

In [None]:
# Break the training rows down by outcome, then by port of embarkation.
survived = X_train[y_train == 1]
died = X_train[y_train == 0]

# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the Python 2 print statement is a SyntaxError on Python 3.
print("SURVIVORS")
print(survived["Embarked"].value_counts())  # absolute counts
# Fraction of all survivors (the denominator includes rows with a missing value).
print(survived["Embarked"].value_counts() / len(survived))
print("")  # Newline

print("PEOPLE WHO DIED")
print(died["Embarked"].value_counts())  # absolute counts
# Fraction of all passengers who died (same denominator convention as above).
print(died["Embarked"].value_counts() / len(died))

In [None]:
# The distribution overall looks the same... 

## Build a baseline classifier that we want to beat

In [None]:
# Encode the categorical variables
_X_train = pd.get_dummies(X_train, columns=["Pclass", "Sex", "Embarked"])
_X_test = pd.get_dummies(X_test, columns=["Pclass", "Sex", "Embarked"])

# The two frames are encoded independently, so a category level that is
# missing from one split produces a different set of dummy columns.  Force
# the test frame onto the training layout: levels absent from the test split
# become all-zero columns, and levels never seen in training are dropped
# because the model has no feature for them.
_X_test = _X_test.reindex(columns=_X_train.columns, fill_value=0)

# Sanity check: both frames expose the same columns in the same order.
# Index.equals also copes with differing lengths, where an element-wise
# `==` comparison would raise instead of failing the assertion.
assert _X_train.columns.equals(_X_test.columns)

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline: an SVC with default hyper-parameters on the encoded features.
svc = SVC()
svc.fit(_X_train, y_train)
predictions = svc.predict(_X_test)

# Fraction of test passengers classified correctly.
accuracy_score(y_test, predictions)

In [None]:
## Let's try scaling some features now.
from sklearn.preprocessing import StandardScaler

# StandardScaler works on 2-D input of shape (n_samples, n_features), so
# select "Fare" with double brackets (a one-column DataFrame) rather than
# passing the 1-D Series.  The scaler is fitted on the training split only
# and then applied to both splits; ravel() flattens the (n, 1) result back
# into a single column.
scaler = StandardScaler().fit(_X_train[["Fare"]])
_X_train["Fare"] = scaler.transform(_X_train[["Fare"]]).ravel()
_X_test["Fare"] = scaler.transform(_X_test[["Fare"]]).ravel()

In [None]:
# Refit a default SVC on the current contents of _X_train / _X_test.
svc_refit = SVC()
svc_refit.fit(_X_train, y_train)
predictions = svc_refit.predict(_X_test)

# Fraction of test passengers classified correctly.
accuracy_score(y_test, predictions)

## Starting over... 

In the first attempt we dropped the features "PassengerId", "Name", "Age", "SibSp", "Parch", "Ticket", and "Cabin". A good guess is that "Age" is a good indicator of whether a passenger survived, so this time we add it back.

In [None]:
# Same columns as the first attempt, with "Age" added.
features = [
    "Pclass",
    "Sex",
    "Fare",
    "Embarked",
    "Survived",
    "Age",
]
subset = data[features]

In [None]:
# Preview the first rows of the selected columns (now including "Age").
subset.head()

In [None]:
# The target is "Survived"; the predictors are every other selected column.
y = subset["Survived"]
X = subset.drop(labels="Survived", axis=1)

In [None]:
# Split again with the same seed (random_state=0) now that X includes "Age".
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Plot a histogram of the ages
# http://stackoverflow.com/a/6873956/2014591   <- plotting two histograms at once
%matplotlib inline
import matplotlib.pyplot as plt

# Way to change sizes of plots
# http://stackoverflow.com/a/332311/2014591
from pylab import rcParams
# Default figure size (width, height) for the plots that follow.
rcParams['figure.figsize'] = 10, 5

# Split the training rows by outcome.
survived = X_train[y_train == 1]
died = X_train[y_train == 0]

# The histogram in the next cell plots only "Age", so drop just the rows
# whose age is missing.  A bare dropna() would also discard passengers with a
# known age but a missing value in some other column (e.g. "Embarked").
survived = survived.dropna(subset=["Age"])
died = died.dropna(subset=["Age"])

In [None]:
# Stacked histogram of ages, one colour per outcome.
# http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hist
# The labels must follow the same order as the data and colour lists:
# first the passengers who died (red), then the survivors (green).
plt.hist(
    [died["Age"], survived["Age"]],
    alpha=0.5,
    color=['r', 'g'],
    label=["Died", "Survived"],
    stacked=True
)

plt.xlabel("Age")
plt.ylabel("Number of Passengers")
plt.xlim(0, 100)
plt.legend()

In [None]:
# Preview the first rows of the training split.
X_train.head()

In [None]:
# Preview the first rows of the test split.
X_test.head()

## Filling in missing values

This is called [imputation](https://en.wikipedia.org/wiki/Imputation_(statistics)). In this case, we're going to use the mean of the age to impute. There are other strategies for imputation.

Ways to impute:
* Scikit-Learn: [Imputer](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html) (Disclaimer: haven't gotten it to work yet)
* Pandas: [DataFrame.fillna](http://pandas.pydata.org/pandas-docs/version/0.17.1/generated/pandas.DataFrame.fillna.html)

We're going to fill the missing values using the *mean* of the ages.

Imputation is an open research problem. There are various ways to go about imputing values, and not one of them is claimed to be the best.

## Count the number of missing values first

In [None]:
# Number of training rows whose "Age" is missing: Series.count() counts only
# the non-NaN entries, so the gap to the full length is the number of NaNs.
train_age = X_train["Age"]
len(train_age) - train_age.count()

In [None]:
# Mean age over the training split only (NaN entries are skipped).
mean = X_train["Age"].mean()

In [None]:
# Fill missing ages with the training-set mean in both splits.
# Calling fillna(..., inplace=True) on a column selected from the frame is a
# chained in-place update: it is not guaranteed to write back to the parent
# frame and is a no-op under pandas Copy-on-Write.  Build new frames with the
# imputed column instead; this also makes the cell safe to re-run.
X_train = X_train.assign(Age=X_train["Age"].fillna(mean))
X_test = X_test.assign(Age=X_test["Age"].fillna(mean))

In [None]:
# Encode the categorical variables
_X_train = pd.get_dummies(X_train, columns=["Pclass", "Sex", "Embarked"])
_X_test = pd.get_dummies(X_test, columns=["Pclass", "Sex", "Embarked"])

# The two frames are encoded independently, so a category level that is
# missing from one split produces a different set of dummy columns.  Force
# the test frame onto the training layout: levels absent from the test split
# become all-zero columns, and levels never seen in training are dropped
# because the model has no feature for them.
_X_test = _X_test.reindex(columns=_X_train.columns, fill_value=0)

# Sanity check: both frames expose the same columns in the same order.
# Index.equals also copes with differing lengths, where an element-wise
# `==` comparison would raise instead of failing the assertion.
assert _X_train.columns.equals(_X_test.columns)

In [None]:
# Random forest with a fixed seed so the score is reproducible.
clf = RandomForestClassifier(random_state=0)
clf.fit(_X_train, y_train)
predictions = clf.predict(_X_test)

# Fraction of test passengers classified correctly.
accuracy_score(y_test, predictions)