# Titanic Kaggle

In [1]:
import fancyimpute
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
import warnings

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Read both Kaggle CSV files; the generator keeps the loop variable out of
# the notebook namespace.
train, test = (pd.read_csv(path) for path in ('data/train.csv', 'data/test.csv'))

Using Theano backend.


The first step is to combine the training and test sets so that every data transformation and feature-engineering step is applied identically to both. Only the training set is labeled, so I assign a placeholder value of -999 to `Survived` for the rows that come from the test set.

In [2]:
# Stack the labelled and unlabelled passengers into one frame.
#   * `Train` flags the origin of each row (1 = train.csv, 0 = test.csv).
#   * test.csv has no `Survived` column, so it gets the placeholder -999 and
#     is then re-ordered to the training column layout before stacking.
#   * ignore_index = True gives the combined frame unique row labels; without
#     it the labels of both files (each starting at 0) would be duplicated.
data = pd.concat(
    [train.assign(Train = 1),
     test.assign(Train = 0, Survived = -999)[list(train) + ['Train']]],
    ignore_index = True
)

## Feature engineering

Dealing with `Name` - create last name, title features.

In [3]:
def extract_lastname(x):
    """Return the part of the name string [x] that precedes the first comma
    (the whole string when it contains no comma)."""
    return x.partition(',')[0]

def extract_title(x):
    """
    Pull the honorific out of a name string [x] and normalise it.

    The raw title is the text between the first comma and the following
    period, minus its first character (the space after the comma).
    'Mlle' and 'Ms' become 'Miss', 'Mme' becomes 'Mrs', and a fixed set of
    rarer honorifics is collapsed into 'Esteemed'. Any other title is
    returned unchanged.
    """
    synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    esteemed = ('Don', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt',
                'the Countess', 'Jonkheer', 'Dona')

    after_comma = x.split(',')[1]
    raw_title = after_comma.split('.')[0][1:]

    if raw_title in esteemed:
        return 'Esteemed'
    return synonyms.get(raw_title, raw_title)
    
# Replace the free-text `Name` column with two derived features:
# the family name and the normalised honorific.
data = (
    data
    .assign(LastName = lambda frame: frame['Name'].map(extract_lastname))
    .assign(Title = lambda frame: frame['Name'].map(extract_title))
    .drop('Name', axis = 1)
)

Dealing with the `Cabin` feature - creating a deck feature (the letter in the cabin name).

In [4]:
# Element-wise "first character" helper for arrays of strings.
first_letter = np.vectorize(lambda cabin: cabin[:1])

data = (
    data
    # Keep the missing value where `Cabin` is missing, otherwise take the
    # first character. The 'z' fill only exists so the slicing helper never
    # receives a NaN; those positions are taken from `Cabin` instead.
    .assign(Deck = lambda frame: np.where(
        frame['Cabin'].isnull(),
        frame['Cabin'],
        first_letter(frame['Cabin'].fillna('z'))))
    # Deck 'T' is treated as missing.
    .assign(Deck = lambda frame: np.where(
        frame['Deck'] != 'T', frame['Deck'], np.nan))
    .drop('Cabin', axis = 1)
)

Drop columns we don't need, convert Sex to a binary variable.

In [5]:
# Remove the columns that are not used as features and encode `Sex`
# numerically: 1 for 'male', 0 for every other value.
data = (
    data
    .drop(['Ticket', 'PassengerId'], axis = 1)
    .assign(Sex = lambda frame: (frame['Sex'] == 'male').astype(int))
)

Create dummy variables for categorical features.

In [6]:
def create_dummy_nans(data, col_name):
    """
    Turn a "missing" indicator column back into real missing values.

    [data] is a frame containing dummy columns that share the prefix
    [col_name] (e.g. 'Deck_'), one of which is the indicator
    [col_name] + 'nan'. For every row where that indicator equals 1, all
    other dummy columns with the prefix are set to NaN. The indicator column
    itself is removed from the result.

    Returns a new DataFrame; [data] is left untouched.
    Raises KeyError if the indicator column is absent.
    """
    nan_col = col_name + 'nan'

    # Snapshot the mask once, so the result does not depend on where the
    # indicator column sits relative to the other dummy columns.
    is_missing = np.array(data[nan_col] == 1.0)

    # Match on the prefix (not on any substring) and skip the indicator.
    dummy_cols = [col for col in list(data)
                  if col.startswith(col_name) and col != nan_col]

    # drop() returns a new frame, so the assignments below never reach the
    # caller's data.
    result = data.drop([nan_col], axis = 1)
    for dummy_col in dummy_cols:
        result[dummy_col] = np.where(is_missing, np.nan, result[dummy_col])
    return result

# One-hot encode the categorical columns. `Pclass` is cast to str before
# encoding. `Deck` and `Embarked` are encoded with an extra "nan" indicator
# column, which create_dummy_nans then converts back into missing values.
data = (
    data
    .assign(Pclass = lambda frame: frame['Pclass'].astype(str))
    .pipe(lambda frame: pd.get_dummies(
        frame, columns = ['Pclass', 'LastName', 'Title']))
    .pipe(lambda frame: pd.get_dummies(
        frame, columns = ['Deck'], dummy_na = True))
    .pipe(lambda frame: pd.get_dummies(
        frame, columns = ['Embarked'], dummy_na = True))
    .pipe(lambda frame: create_dummy_nans(frame, 'Deck_'))
    .pipe(lambda frame: create_dummy_nans(frame, 'Embarked_'))
)

Impute missing values.

In [7]:
def impute(data):
    """
    Fill the missing feature values of [data] with fancyimpute's KNN imputer.

    The `Survived` and `Train` columns are held out of the imputation and
    re-attached afterwards, unchanged. The result is a new DataFrame built
    from the imputed matrix, so it carries a fresh default index.

    Raises AssertionError if any missing value is left in the result.
    """
    features = data.drop(['Survived', 'Train'], axis = 1)
    feature_names = list(features)

    completed = fancyimpute.KNN().complete(np.array(features))

    results = pd.DataFrame(completed, columns = feature_names)
    results['Train'] = data['Train'].values
    results['Survived'] = data['Survived'].values

    remaining_nas = results.isnull().sum().sum()
    assert remaining_nas == 0, 'Not all NAs removed'
    return results

data = data.pipe(impute)
print 'Number of NAs:', data.isnull().sum().sum()

Computing pairwise distances between 1309 samples
Computing distances for sample #1/1309, elapsed time: 1.188
Computing distances for sample #101/1309, elapsed time: 1.646
Computing distances for sample #201/1309, elapsed time: 2.112
Computing distances for sample #301/1309, elapsed time: 2.595
Computing distances for sample #401/1309, elapsed time: 3.050
Computing distances for sample #501/1309, elapsed time: 3.554
Computing distances for sample #601/1309, elapsed time: 4.020
Computing distances for sample #701/1309, elapsed time: 4.482
Computing distances for sample #801/1309, elapsed time: 4.946
Computing distances for sample #901/1309, elapsed time: 5.436
Computing distances for sample #1001/1309, elapsed time: 5.944
Computing distances for sample #1101/1309, elapsed time: 6.413
Computing distances for sample #1201/1309, elapsed time: 6.900
Computing distances for sample #1301/1309, elapsed time: 7.356
Imputing row 1/1309 with 7 missing columns, elapsed time: 7.497
Imputing row 101

Split into separate training and predicting sets.

In [9]:
# Rows with Train == 1 came from train.csv and carry real labels; rows with
# Train == 0 are the passengers to predict. Both feature matrices drop the
# `Train` and `Survived` columns.
train = data[data['Train'] == 1]
outcomes = np.array(train['Survived'])
train = train.drop(['Train', 'Survived'], axis = 1)
to_predict = data[data['Train'] == 0].drop(['Train', 'Survived'], axis = 1)

Further, randomly split the training set into training and test sets using hold-out cross validation.

In [None]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(train, outcomes,
                                     test_size = 0.2, random_state = 50)

Use grid search to optimize the hyperparameters of a random forest classifier.

In [None]:
%%time

def train_test_model(model, hyperparameters, X_train, X_test, y_train, y_test,
                    folds = 5):
    """
    Given a [model] and a set of possible [hyperparameters], along with 
    matricies corresponding to hold-out cross-validation, returns a model w/ 
    optimized hyperparameters, and prints out model evaluation metrics.
    """
    optimized_model = GridSearchCV(model, hyperparameters, cv = folds, n_jobs = -1)
    optimized_model.fit(X_train, y_train)
    predicted = optimized_model.predict(X_test)
    print 'Optimized parameters:', optimized_model.best_params_
    print 'Model accuracy:', optimized_model.score(X_test, y_test), '\n'
    return optimized_model

# Candidate hyperparameter values for the grid search.
# min_samples_split starts at 2: a node needs at least two samples to be
# split, and scikit-learn releases that validate this parameter reject 1.
param_grid = {'n_estimators': [10, 50, 100, 300, 500, 800, 1000, 2000],
             'max_depth': [3, 7, None],
             'max_features': ['auto', 'log2', None],
             'min_samples_leaf': [1, 3, 10],
             'min_samples_split': [2, 10]}

# Seed the forest so candidates are compared on identical randomness and the
# search result can be reproduced.
rf_model = train_test_model(
    RandomForestClassifier(random_state = 50),
    param_grid, X_train, X_test, y_train, y_test
)

In [14]:
# Final model with fixed hyperparameters.
#   * min_samples_split is 2, the smallest value a split can actually use;
#     scikit-learn releases that validate the parameter reject 1.
#   * random_state is fixed so the cross-validation score and the submission
#     built from this model are reproducible.
rf_model = RandomForestClassifier(
    n_estimators = 10,
    max_depth = 7,
    max_features = None,
    min_samples_split = 2,
    min_samples_leaf = 1,
    n_jobs = -1,
    random_state = 50
)

# Mean score over 10-fold cross-validation on the full labelled set.
np.mean(cross_val_score(rf_model, np.array(train), outcomes, cv = 10))

0.83395358075133363

In [15]:
def create_submission(name, model, train, outcomes, to_predict,
                      test_path = 'data/test.csv', out_dir = 'submissions'):
    """
    Fit [model] on ([train], [outcomes]), predict a label for every row of
    [to_predict] and write the predictions to a CSV file.

    The output has the two columns `PassengerId` and `Survived`.
    The ids are read from the `PassengerId` column of [test_path], so
    [to_predict] must contain one row per row of that file, in the same
    order.

    name      -- file name of the submission, e.g. 'rf_model.csv'
    test_path -- CSV file that supplies the `PassengerId` column
    out_dir   -- directory the submission is written to; it is created when
                 it does not exist yet

    Returns None.
    """
    import os  # local import: only needed to prepare the output location

    model.fit(np.array(train), outcomes)
    predictions = model.predict(np.array(to_predict))

    results = pd.DataFrame(predictions, columns = ['Survived'])
    results['PassengerId'] = list(pd.read_csv(test_path)['PassengerId'])

    # Writing into a missing directory would fail, so create it first.
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    (results[['PassengerId', 'Survived']]
        .to_csv(os.path.join(out_dir, name), index = False))
    return None

# Fit the chosen forest on all labelled rows and write the Kaggle submission.
create_submission(
    name = 'rf_model3.csv',
    model = rf_model,
    train = train,
    outcomes = outcomes,
    to_predict = to_predict
)