# Titanic Kaggle - Data Processing and Modeling

In [1]:
from __future__ import division
import fancyimpute
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import warnings
import xgboost as xgb

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# NOTE(review): `train` is rebound later by the `split_data(...)` call and
# `test` is not referenced again in the visible cells, so these two reads look
# redundant with `ingest_data()` -- confirm before removing.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

Using Theano backend.


## Data processing

Functions

In [2]:
def ingest_data():
    """Read the Kaggle train and test CSVs and stack them into one frame.

    A `Train` flag column (1 for rows from train.csv, 0 for rows from
    test.csv) records where each row came from. Test rows get the placeholder
    `Survived = -999` so that both frames share the same columns, in the
    column order of the training frame. The original per-file row index is
    kept, so index labels repeat across the two halves.
    """
    labelled = pd.read_csv('data/train.csv')
    labelled['Train'] = 1

    unlabelled = pd.read_csv('data/test.csv')
    unlabelled['Train'] = 0
    unlabelled['Survived'] = -999
    # Reorder the test columns to match the training frame exactly.
    unlabelled = unlabelled[list(labelled)]

    return pd.concat([labelled, unlabelled])


def extract_lastname(x):
    """Return everything before the first comma of a passenger name, i.e. the
    last name in the "Last, Title. First" format. The whole string is returned
    when there is no comma.
    """
    return x.partition(',')[0]


def extract_title(x):
    """Get the person's title from their name, merging redundant or rare
    titles into a shared label.

    The title is the text between the first comma and the following period
    (minus the leading space). Titles not listed below are returned as-is.
    """
    merged_titles = {
        'Mlle': 'Miss', 'Ms': 'Miss',
        'Mme': 'Mrs',
        'Rev': 'Esteemed', 'Dr': 'Esteemed', 'Major': 'Esteemed',
        'Col': 'Esteemed', 'Capt': 'Esteemed', 'Jonkheer': 'Esteemed',
        'Dona': 'Esteemed',
        'Don': 'Royalty', 'Lady': 'Royalty', 'Sir': 'Royalty',
        'the Countess': 'Royalty',
    }
    after_comma = x.split(',')[1]
    # [1:] drops the single space that follows the comma.
    raw_title = after_comma.split('.')[0][1:]
    return merged_titles.get(raw_title, raw_title)


def _leading_char(text):
    """Return the first character of `text` ('' when `text` is empty)."""
    return text[:1]


# Element-wise version of `_leading_char` for arrays / Series of strings.
first_letter = np.vectorize(_leading_char)


def ticket_counts(data):
    """Keep ticket numbers only where 2 or more people shared a single ticket.

    Returns a new frame in which `Ticket` is kept for rows whose ticket value
    occurs more than once in `data` and is replaced by NaN otherwise (tickets
    that are already missing stay NaN). Column order is unchanged.

    The input frame is not modified; the previous version added a temporary
    `TicketCount` column to, and overwrote `Ticket` on, the caller's frame.
    """
    occurrences = data['Ticket'].map(data['Ticket'].value_counts())
    # A NaN count (missing ticket) compares False, so it falls through to NaN.
    is_shared = np.array(occurrences > 1)
    return data.assign(Ticket = np.where(is_shared, data['Ticket'], np.nan))


def create_dummy_nans(data, col_name):
    """Put np.nan back into a set of dummy columns and drop their `_nan`
    indicator column.

    `col_name` is the dummy prefix including the separator (e.g. 'Deck_').
    For every row where `<col_name>nan` equals 1, all other columns whose name
    starts with `col_name` are set to np.nan; the indicator column is then
    removed. Raises KeyError if the indicator column is missing.

    The missing-row mask is computed once, before any column is rewritten, so
    the result does not depend on where the indicator column sits among the
    dummies (the previous version overwrote the indicator while still reading
    it). The input frame is not modified.
    """
    nan_col = col_name + 'nan'
    is_missing = np.array(data[nan_col] == 1.0)
    dummy_cols = [col for col in list(data)
                  if col.startswith(col_name) and col != nan_col]

    result = data.drop([nan_col], axis = 1)
    for dummy_col in dummy_cols:
        result[dummy_col] = np.where(is_missing, np.nan, result[dummy_col])
    return result


def impute(data):
    """Fill every missing feature value using fancyimpute's MICE.

    `Survived` and `Train` are held out of the imputation and re-attached
    afterwards (in that order: `Train`, then `Survived`). The returned frame
    has a fresh 0..n-1 index and must contain no NaNs, otherwise an
    AssertionError is raised.
    """
    held_out = ['Survived', 'Train']
    features = data.drop(held_out, axis = 1)
    feature_names = list(features)

    completed = fancyimpute.MICE().complete(np.array(features))

    results = pd.DataFrame(completed, columns = feature_names)
    # Re-attach by position: `results` has a new index, so use plain lists.
    results['Train'] = list(data['Train'])
    results['Survived'] = list(data['Survived'])

    remaining_nas = results.isnull().sum().sum()
    assert remaining_nas == 0, 'Not all NAs removed'
    return results


def feature_engineering(data):
    """Turn the combined raw passenger frame into a numeric, NaN-free modeling
    frame.

    Steps, in order: derive LastName / Title / FamSize / Family from Name,
    SibSp and Parch; blank out tickets that are not shared (`ticket_counts`);
    derive Deck from the first letter of Cabin; encode Sex as 1 (male) / 0;
    one-hot encode Pclass, Family, Title, Ticket, Deck and Embarked, keeping
    missing Deck / Embarked values as NaN in the dummies
    (`create_dummy_nans`); drop the raw columns that are no longer needed;
    and finally impute the remaining NaNs (`impute`).

    `data` must still contain the `Survived` and `Train` columns, which
    `impute` expects to find.
    """
    return (data

        # Create last name, title, family size, and family features
        .assign(LastName = lambda x: x.Name.map(extract_lastname))
        .assign(Title = lambda x: x.Name.map(extract_title))
        .assign(FamSize = lambda x: x.SibSp + x.Parch + 1)
        .assign(Family = lambda x: [a + '_' + str(b) for a, b in zip(
                    list(x.LastName), list(x.FamSize))])
            
        # Create ticket counts for passengers sharing tickets
        .pipe(ticket_counts)

        # Turn the Cabin feature into a Deck feature (A-G)
        # The 'z' fill only keeps first_letter from seeing NaN; np.where puts
        # the original missing Cabin value back for those rows.
        .assign(Deck = lambda x: np.where(
            pd.notnull(x.Cabin), first_letter(x.Cabin.fillna('z')), x.Cabin))
        # Deck 'T' is treated as missing.
        .assign(Deck = lambda x: np.where(x.Deck == 'T', np.nan, x.Deck))

        # Turn Sex into a dummy variable
        .assign(Sex = lambda x: np.where(x.Sex == 'male', 1, 0))

        # Create dummy variables for the categorical features
        # Pclass is cast to str so it is encoded as a category, not a number.
        .assign(Pclass = lambda x: x.Pclass.astype(str))
        .pipe(pd.get_dummies, columns = ['Pclass', 'Family', 'Title', 'Ticket'])
        .pipe(pd.get_dummies, columns = ['Deck'], dummy_na = True)
        .pipe(pd.get_dummies, columns = ['Embarked'], dummy_na = True)
        .pipe(create_dummy_nans, 'Deck_')
        .pipe(create_dummy_nans, 'Embarked_')

        # Drop columns we don't need
        .drop(['Name', 'Cabin', 'PassengerId', 'SibSp', 'Parch', 'LastName'], axis = 1)

        # Impute NAs using MICE
        .pipe(impute)
    )


def split_data(data):
    """
    Undo the train/test stacking: return `(train, outcomes, to_predict)`.

    `train` and `to_predict` are the feature columns (everything except
    `Train` and `Survived`) of the rows flagged `Train == 1` and `Train == 0`
    respectively; `outcomes` is the `Survived` column of the training rows as
    a numpy array.
    """
    bookkeeping = ['Train', 'Survived']

    labelled_rows = data[data['Train'] == 1]
    unlabelled_rows = data[data['Train'] == 0]

    outcomes = np.array(labelled_rows['Survived'])
    train = labelled_rows.drop(bookkeeping, axis = 1)
    to_predict = unlabelled_rows.drop(bookkeeping, axis = 1)
    return train, outcomes, to_predict

Actions

In [3]:
# Features are engineered on train and test together (one combined frame) and
# the frame is split back into its two halves afterwards.
data = ingest_data()
# `data` is rebound here, so re-running only this line would feed the already
# engineered frame back in; re-run the whole cell instead.
data = feature_engineering(data)
train, outcomes, to_predict = split_data(data)

[MICE] Completing matrix with shape (1309, 1167)
[MICE] Starting imputation round 1/110, elapsed time 0.019
[MICE] Starting imputation round 2/110, elapsed time 2.703
[MICE] Starting imputation round 3/110, elapsed time 4.815
[MICE] Starting imputation round 4/110, elapsed time 6.975
[MICE] Starting imputation round 5/110, elapsed time 9.106
[MICE] Starting imputation round 6/110, elapsed time 11.229
[MICE] Starting imputation round 7/110, elapsed time 13.350
[MICE] Starting imputation round 8/110, elapsed time 15.471
[MICE] Starting imputation round 9/110, elapsed time 17.555
[MICE] Starting imputation round 10/110, elapsed time 19.637
[MICE] Starting imputation round 11/110, elapsed time 21.720
[MICE] Starting imputation round 12/110, elapsed time 23.809
[MICE] Starting imputation round 13/110, elapsed time 25.896
[MICE] Starting imputation round 14/110, elapsed time 27.984
[MICE] Starting imputation round 15/110, elapsed time 30.075
[MICE] Starting imputation round 16/110, elapsed t

## Modeling

In [4]:
def train_test_model(model, hyperparameters, X_train, X_test, y_train, y_test,
                    folds = 5):
    """
    Tune [model] over the [hyperparameters] grid and report how it performs.

    A GridSearchCV with [folds]-fold cross-validation is fitted on
    (X_train, y_train). Three lines are printed: the selected parameters, the
    score of the tuned model on the hold-out set (X_test, y_test), and the
    mean [folds]-fold cross-validation score of the best estimator over the
    training and hold-out rows stacked together. Note that this last figure
    includes the rows the parameters were tuned on.

    Returns the fitted GridSearchCV object (use `.best_estimator_` for the
    tuned model).

    The messages are built as single strings so the same text is printed
    whether `print` is the Python 2 statement or the Python 3 function.
    """
    optimized_model = GridSearchCV(model, hyperparameters, cv = folds, n_jobs = -1)
    optimized_model.fit(X_train, y_train)

    print('Optimized parameters: {0!s}'.format(optimized_model.best_params_))
    print('Model accuracy (hold-out): {0!s}'.format(
        optimized_model.score(X_test, y_test)))

    # np.append turns the (possibly DataFrame) splits into one stacked ndarray.
    kfold_score = np.mean(cross_val_score(
            optimized_model.best_estimator_, np.append(X_train, X_test, axis = 0),
            np.append(y_train, y_test), cv = folds, n_jobs = -1))
    # Trailing ' \n' reproduces the blank line the original output ended with.
    print('Model accuracy ({0}-fold): {1!s} \n'.format(str(folds), kfold_score))
    return optimized_model


def create_submission(name, model, train, outcomes, to_predict):
    """
    Fit [model] on ([train], [outcomes]), predict class labels for
    [to_predict], and write a Kaggle-format submission.

    The output file is 'submissions/' + [name] with the columns PassengerId
    (re-read from data/test.csv, matched to the predictions by row position)
    and Survived. Returns None.
    """
    model.fit(np.array(train), outcomes)
    predicted_labels = model.predict(np.array(to_predict))

    submission = pd.DataFrame(predicted_labels, columns = ['Survived'])
    passenger_ids = pd.read_csv('data/test.csv')['PassengerId']
    submission['PassengerId'] = list(passenger_ids)

    ordered = submission[['PassengerId', 'Survived']]
    ordered.to_csv('submissions/' + name, index = False)
    return None

In [5]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train, outcomes, test_size = 0.2, random_state = 50)

Random forest

In [6]:
%%time
rf_model = train_test_model(
    RandomForestClassifier(n_estimators = 800, random_state = 25), {
        # 2 is the smallest usable value: a node needs two samples to be
        # split. The previous candidate 1 is rejected with a ValueError by
        # scikit-learn >= 0.18 and behaved like 2 on the older releases.
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'max_depth': [3, None]}, 
    X_train, X_test, y_train, y_test)

Optimized parameters: {'min_samples_split': 3, 'max_depth': None, 'min_samples_leaf': 1}
Model accuracy (hold-out): 0.798882681564
Model accuracy (5-fold): 0.81925570974 

CPU times: user 4.92 s, sys: 152 ms, total: 5.07 s
Wall time: 44.8 s


KNN

In [7]:
%%time
kNN_model = train_test_model(
    KNeighborsClassifier(),
    # Candidate neighbourhood sizes: the odd numbers 1, 3, ..., 19.
    {'n_neighbors': np.arange(1, 20, 2)},
    X_train, X_test, y_train, y_test)

Optimized parameters: {'n_neighbors': 19}
Model accuracy (hold-out): 0.675977653631
Model accuracy (5-fold): 0.687981858204 

CPU times: user 768 ms, sys: 116 ms, total: 884 ms
Wall time: 1.59 s


Logistic regression

In [8]:
%%time
# Grid: 7 values of C x 2 class_weight settings = 14 candidates.
lr_model = train_test_model(
    LogisticRegression(random_state = 25), {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'class_weight': [None, 'balanced']}, 
    X_train, X_test, y_train, y_test)

Optimized parameters: {'C': 10, 'class_weight': 'balanced'}
Model accuracy (hold-out): 0.837988826816
Model accuracy (5-fold): 0.835017857732 

CPU times: user 1.24 s, sys: 148 ms, total: 1.39 s
Wall time: 1.87 s


SVM

In [9]:
%%time
# Grid: 7 values of C x 13 values of gamma (1e-9 ... 1e3) = 91 candidates.
# probability = True is kept so that predict_proba can be called on this model
# by weighted_average_ensemble later in the notebook.
svm_model = train_test_model(
    SVC(probability = True, random_state = 25), {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'gamma': np.logspace(-9, 3, 13)}, 
    X_train, X_test, y_train, y_test)

Optimized parameters: {'C': 1000, 'gamma': 0.0001}
Model accuracy (hold-out): 0.826815642458
Model accuracy (5-fold): 0.821534427955 

CPU times: user 12.3 s, sys: 332 ms, total: 12.6 s
Wall time: 3min 31s


Gradient boosted trees

In [10]:
%%time
# Grid: 4 max_depth x 3 min_child_weight x 5 gamma x 7 reg_alpha = 420
# candidates, each cross-validated by train_test_model; the recorded wall time
# for this cell is over an hour.
# Unlike the other model cells, the splits are passed as plain ndarrays here
# (presumably because XGBClassifier did not accept these DataFrames -- confirm).
gbt_model = train_test_model(
    xgb.XGBClassifier(learning_rate = 0.05, n_estimators = 200, seed = 25), {
        'max_depth': range(3, 10, 2), 
        'min_child_weight': range(1, 6, 2),
        'gamma': [i / 10.0 for i in range(0, 5)], 
        'reg_alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}, 
    np.array(X_train), np.array(X_test), y_train, y_test)

Optimized parameters: {'reg_alpha': 0.01, 'max_depth': 3, 'gamma': 0.1, 'min_child_weight': 1}
Model accuracy (hold-out): 0.787709497207
Model accuracy (5-fold): 0.822607665047 

CPU times: user 42.3 s, sys: 1.38 s, total: 43.6 s
Wall time: 1h 10min 6s


### Try submitting specific models before trying an ensemble

In [11]:
# Refit the tuned random forest on all labelled rows; writes submissions/ensemble_rf.csv.
create_submission('ensemble_rf.csv', rf_model.best_estimator_, train, outcomes, to_predict)

Public leaderboard score of `0.79426`

In [12]:
# Refit the tuned logistic regression on all labelled rows; writes submissions/ensemble_lr.csv.
create_submission('ensemble_lr.csv', lr_model.best_estimator_, train, outcomes, to_predict)

Public leaderboard score of `0.80383`

In [13]:
# Refit the tuned SVM on all labelled rows; writes submissions/ensemble_svm.csv.
create_submission('ensemble_svm.csv', svm_model.best_estimator_, train, outcomes, to_predict)

Public leaderboard score of `0.78947`

In [14]:
# Refit the tuned gradient boosted trees on all labelled rows; writes submissions/ensemble_gbt.csv.
create_submission('ensemble_gbt.csv', gbt_model.best_estimator_, train, outcomes, to_predict)

Public leaderboard score of `0.79426`

### Create a majority vote ensemble - give one of the best models two votes

In [15]:
def majority_vote_ensemble(name, models_votes, train, outcomes, to_predict):
    """Write a submission produced by weighted majority voting.

    `models_votes` is a list of (model, votes) pairs. Each model is refitted
    on (`train`, `outcomes`) and predicts a label for every row of
    `to_predict`; a prediction of 1 adds `votes` to that passenger's tally and
    a prediction of 0 subtracts `votes`. Passengers with a strictly positive
    tally are written as Survived = 1, all others (including ties) as 0.

    PassengerId is read from data/test.csv and matched by row position. The
    CSV is written to `name` exactly as given. Returns None.
    """
    signed_ballots = []
    for estimator, n_votes in models_votes:
        estimator.fit(np.array(train), outcomes)
        ballot = estimator.predict(np.array(to_predict))
        # Recode "did not survive" from 0 to -1 so that votes can be summed.
        ballot[ballot == 0] = -1
        signed_ballots.append((ballot, n_votes))

    # Tally the ballots, counting each one as many times as it has votes.
    submission = pd.read_csv('data/test.csv')[['PassengerId']]
    tally = pd.Series(0, index = submission.index)
    for ballot, n_votes in signed_ballots:
        for _ in range(0, n_votes):
            tally = tally + ballot

    submission = submission.assign(Survived = np.where(tally > 0, 1, 0))
    submission.to_csv(name, index = False)
    return None
    
    
# (estimator, votes) pairs. The random forest gets two votes and every other
# model one, for five votes in total, so a tie is impossible.
models_votes = [
    (rf_model.best_estimator_, 2),
    (lr_model.best_estimator_, 1),
    (svm_model.best_estimator_, 1),
    (gbt_model.best_estimator_, 1)
]


# The output path is used as given (no 'submissions/' prefix is added).
majority_vote_ensemble('submissions/ensemble_majority_vote.csv', 
                       models_votes, train, outcomes, to_predict)

Public leaderboard score of `0.82775`, my highest yet!

### Create a weighted average of probabilities ensemble

In [None]:
def weighted_average_ensemble(name, models_weights, train, outcomes, to_predict):
    """Write a submission from a weighted average of predicted probabilities.

    `models_weights` is a list of (model, weight) pairs. Each model is
    refitted on (`train`, `outcomes`) and its predicted probability of
    survival for every row of `to_predict` is averaged using the weights.
    Passengers whose averaged probability is >= 0.5 are written as
    Survived = 1, the rest as 0.

    PassengerId is read from data/test.csv and matched by row position. The
    CSV is written to `name` exactly as given. Returns None.
    """
    weights_sum = sum([mw[1] for mw in models_weights])
    model_results = []
    for model, weight in models_weights:
        model.fit(np.array(train), outcomes)
        # predict_proba columns follow the sorted class labels, so with 0/1
        # outcomes column 1 is P(Survived = 1). Column 0 -- used previously --
        # is P(Survived = 0) and inverted every prediction.
        probs = model.predict_proba(np.array(to_predict))[:, 1]
        model_results.append((probs, weight))
    
    # Assemble the ensemble
    ensemble = pd.read_csv('data/test.csv')[['PassengerId']].assign(Survived = 0)
    for probs, weight in model_results:
        ensemble = ensemble.assign(Survived = lambda x: x.Survived + probs * weight)
    (ensemble.assign(Survived = lambda x: x.Survived / weights_sum)
     .assign(Survived = lambda x: np.where(x.Survived >= 0.5, 1, 0))
     .to_csv(name, index = False))
    return None


# Simple average
# Simple average: every model gets the same weight, so the ensemble
# probability is the plain mean of the four models' probabilities.
models_weights = [
    (rf_model.best_estimator_, 1),
    (lr_model.best_estimator_, 1),
    (svm_model.best_estimator_, 1),
    (gbt_model.best_estimator_, 1)
]

# The output path is used as given (no 'submissions/' prefix is added).
weighted_average_ensemble('submissions/ensemble_simple_avg.csv', 
                          models_weights, train, outcomes, to_predict)