In [1]:
import ml_framework
from ml_framework import dataset, preprocess, feature, model

In [2]:
import seaborn.apionly as sns
import pandas as pd

@dataset(train_valid_test=(0.6, 0.2, 0.2))
def raw_dataset():
    titanic_dataset = sns.load_dataset('titanic')

    # Drop NaN rows for simplicity
    titanic_dataset.dropna(inplace=True)

    # Extract X and y
    X = titanic_dataset.drop('survived', axis=1)
    y = titanic_dataset['survived']
    return X, y

# my_dataset is now a variable that holds the X values of the evaluated function
# (the test data's ground truth is locked away to prevent accidentially fitting to it)
raw_dataset.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [3]:
@preprocess
def preprocessed_dataset(X):
    # Encode categorical columns
    categorical_column_names = [
        'sex', 'embarked', 'class',
        'who', 'adult_male', 'deck',
        'embark_town', 'alive', 'alone'
    ]

    X = pd.get_dummies(X,
                       columns=categorical_column_names,
                       prefix=categorical_column_names)

    return X

preprocessed_dataset.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,...,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes,alone_False,alone_True
1,1,38.0,1,0,71.2833,1,0,1,0,0,...,0,0,0,1,0,0,0,1,1,0
3,1,35.0,1,0,53.1,1,0,0,0,1,...,0,0,0,0,0,1,0,1,1,0
6,1,54.0,0,0,51.8625,0,1,0,0,1,...,1,0,0,0,0,1,1,0,0,1
10,3,4.0,1,1,16.7,1,0,0,0,1,...,0,0,1,0,0,1,0,1,1,0
11,1,58.0,0,0,26.55,1,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1


In [4]:
from sklearn import decomposition
import numpy as np

@feature('pca')
def pca_feature(X):
    pca = decomposition.PCA(n_components=3)
    pca.fit(X)
    pca_out = pca.transform(X)
    
    pca_out = np.transpose(pca_out, (1, 0))
    return {'pca_0': pca_out[0], 'pca_1': pca_out[1], 'pca_2': pca_out[2]}

pca_feature.head()

Unnamed: 0,pca_0,pca_1,pca_2
1,-7.67143,2.211611,-1.524306
3,-25.798643,-1.153763,-1.330469
6,-27.419192,17.883788,1.170164
10,-61.59439,-32.894045,-0.423018
11,-52.795269,21.329436,-1.718752


In [5]:
import xgboost as xgb

@model('xgboost')
def xgboost_model():
    def define(num_columns):
        return None # xgboost models are not pre-defined
    
    def train(model, params, train, validation):
        params['objective'] = 'binary:logistic'
        params['eval_metric'] = 'logloss'
        
        d_train = xgb.DMatrix(train['X'], label=train['y'])
        d_valid = xgb.DMatrix(validation['X'], label=validation['y'])

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        return xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=50, verbose_eval=200)
    
    def predict(model, X):
        return model.predict(xgb.DMatrix(X))
    
    return define, train, predict

In [6]:
ml_framework.train('xgboost', {
    'max_depth': 7,
    'eta': 0.005
})

[0]	train-logloss:0.6885	valid-logloss:0.688485
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[200]	train-logloss:0.221082	valid-logloss:0.220172
[400]	train-logloss:0.087941	valid-logloss:0.087103
[600]	train-logloss:0.040761	valid-logloss:0.040085
[800]	train-logloss:0.022548	valid-logloss:0.022152
[1000]	train-logloss:0.019019	valid-logloss:0.019206
[1200]	train-logloss:0.018926	valid-logloss:0.019022
[1400]	train-logloss:0.018901	valid-logloss:0.018951
[1600]	train-logloss:0.018894	valid-logloss:0.018921
[1800]	train-logloss:0.018892	valid-logloss:0.018907
[2000]	train-logloss:0.018892	valid-logloss:0.0189
Stopping. Best iteration:
[2075]	train-logloss:0.018892	valid-logloss:0.018898



In [7]:
ml_framework.evaluate('xgboost')

0.02013238549635217

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

@model('keras_nn')
def keras_nn_model():
    def define(num_columns):
        model = Sequential()
        model.add(Dense(64, input_dim=num_columns, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.25))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
        return model

    
    def train(model, params, train, validation):
        model.fit(train['X'].values, train['y'].values,
                  epochs=50,
                  batch_size=5)
        
        return model
    
    def predict(model, X):
        return model.predict(X.values)
    
    return define, train, predict

In [17]:
ml_framework.train('keras_nn', {})

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [18]:
ml_framework.evaluate('keras_nn')

0.56353224844143202