In [29]:
from tpot import TPOTClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [30]:
def load_and_labelize_titanic(filename, encoders=None):
    """Read csv and perform basic labeling encoding"""
    
    df = pd.read_csv(filename)
    if not encoders:
        encoders = {'Sex': LabelEncoder(), 
                    'Cabin': LabelEncoder(), 
                    'Embarked': LabelEncoder()}
        for column, encoder in encoders.items():
            encoder.fit(list(df[column].astype(str)) + ['UnknownLabel'])
            df[column] = encoder.transform(df[column].astype(str))
    else:
        for column, encoder in encoders.items():
            df.loc[~df[column].isin(encoder.classes_), column] = 'UnknownLabel'
            df[column] = encoder.transform(df[column].astype(str))
        
    df = df.fillna(-999)
    passenger_ids = df['PassengerId']
    df = df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
    return df, encoders, passenger_ids

In [31]:
train, encoders, _ = load_and_labelize_titanic('titanic/train.csv')

In [32]:
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, 
                     n_jobs=1, scoring='accuracy')
tpot.fit(train.drop('Survived', axis=1).values, train['Survived'])

Optimization Progress:  33%|███▎      | 100/300 [00:57<01:29,  2.24pipeline/s]

Generation 1 - Current best internal CV score: 0.8305358170670338


Optimization Progress:  50%|█████     | 150/300 [01:34<03:28,  1.39s/pipeline]

Generation 2 - Current best internal CV score: 0.8316971459191776


Optimization Progress:  67%|██████▋   | 200/300 [02:24<01:57,  1.18s/pipeline]

Generation 3 - Current best internal CV score: 0.8316971459191776


Optimization Progress:  83%|████████▎ | 250/300 [03:52<00:55,  1.12s/pipeline]

Generation 4 - Current best internal CV score: 0.8316971459191776


                                                                              

Generation 5 - Current best internal CV score: 0.8317038485531001

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.9, min_samples_leaf=2, min_samples_split=5, n_estimators=100)


TPOTClassifier(config_dict={'sklearn.naive_bayes.MultinomialNB': {'fit_prior': [True, False], 'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}, 'sklearn.feature_selection.VarianceThreshold': {'threshold': array([ 0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,  0.35,  0.4 ,  0.45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 , ...tpot.builtins.OneHotEncoder': {'sparse': [False], 'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25]}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=5, max_eval_time_mins=5,
        max_time_mins=None, mutation_rate=0.9, n_jobs=1, offspring_size=50,
        periodic_checkpoint_folder=None, population_size=50,
        random_state=None, scoring=None, subsample=1.0, verbosity=2,
        warm_start=False)

In [33]:
test, _, passenger_ids = load_and_labelize_titanic('titanic/test.csv', encoders)
results = tpot.predict(test)
results_df = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': results})
results_df.to_csv('titanic/predictions.csv', index=False)