In [1]:
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [2]:
DATA_FILE = 'data/original_train_data.h5'
METADATA = 'metadata.npy'

f = h5py.File(DATA_FILE, 'r')
data_x = f['x']
data_y = f['y']
metadata = np.load(METADATA).item()

In [3]:
import pandas as pd
from sklearn import metrics

results = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'LogLoss'])

In [4]:
from sklearn.model_selection import train_test_split

data_x = np.asarray(data_x)
data_x = data_x.reshape(data_x.shape[0], -1)
data_y = np.asarray(data_y)
X_train, X_val, y_train, y_val = train_test_split(data_x, data_y, test_size=0.15, random_state=42)

Gradient Boosting Model

In [5]:
from xgboost import XGBClassifier



In [6]:
n_estimators = 200
max_depth = 50
learning_rate = 0.1

gb_model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, silent=False)
gb_model.fit(data_x, data_y.argmax(1))

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=50,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [7]:
preds = gb_model.predict(X_val)
print(preds)

proba = gb_model.predict_proba(X_val)
print(proba)

y_val_reshape = y_val.argmax(1)

idx = len(results) + 1

results.loc[idx] = ['Gradient Boosting, n=%f' % n_estimators + ' depth=%f' % max_depth,
                       metrics.accuracy_score(y_val_reshape, preds),
                       metrics.precision_score(y_val_reshape, preds, average="micro"),
                       metrics.recall_score(y_val_reshape, preds, average="micro"),
                       metrics.f1_score(y_val_reshape, preds, average="micro"),
                       metrics.log_loss(y_val_reshape, proba)]

results

[7 0 6 0 4 7 6 7 0 0 0 4 6 5 0 6 0 0 0 3 0 0 0 7 0 7 5 7 0 0 5 7 7 0 0 0 7
 0 6 5 0 0 0 4 0 0 7 7 7 4 5 0 4 7 4 0 4 0 6 0 0 0 7 0 7 0 7 4 7 0 6 0 6 3
 0 0 1 4 0 7 2 7 2 0 0 0 7 4 4 4 0 1 1 0 5 0 7 5 5 0 4 0 0 4 0 1 0 4 0 0 2
 0 0 0 0 5 7 7 0 0 0 0 5 5 4 7 0 0 0 5 0 0 0 0 3 4 7 7 0 0 0 0 0 0 2 5 0 7
 0 0 0 7 5 0 7 6 0 4 0 7 2 7 0 0 7 0 3 0 1 0 0 0 0 0 7 6 7 0 0 0 7 4 7 7 7
 0 5 4 4 1 0 4 7 3 4 0 7 5 0 7 7 6 0 0 0 0 7 1 0 7 0 7 0 0 4 0 5 0 5 0 0 5
 4 1 0 0 0 0 7 4 5 0 7 0 0 4 0 0 1 5 2 0 0 0 0 7 0 0 0 0 0 6 0 4 4 0 0 4 0
 0 7 2 0 0 0 5 6 0 0 7 0 0 0 0 7 7 7 4 4 0 0 1 5 4 0 0 6 7 0 4 0 0 0 0 0 0
 0 7 0 6 0 4 1 0 0 2 0 4 6 0 6 0 0 0 7 0 7 6 4 5 5 5 7 1 0 0 7 0 2 6 0 0 7
 5 0 7 1 0 4 0 1 0 1 0 2 7 5 0 6 7 1 0 0 0 0 2 0 7 0 0 0 0 7 4 4 6 0 4 0 0
 7 0 2 4 4 0 7 0 0 0 2 7 0 0 4 0 0 7 0 4 0 7 4 0 0 0 7 1 1 1 0 5 5 0 0 6 5
 7 6 5 0 0 0 4 1 0 0 5 5 0 7 1 0 1 4 7 0 0 5 4 7 7 0 0 1 0 4 7 0 0 5 5 4 7
 0 0 7 0 7 4 0 0 4 7 0 2 0 0 0 7 4 0 1 0 0 0 4 6 6 2 5 0 7 0 4 6 0 2 0 5 0
 0 0 7 0 7 2 0 7 0 4 0 0 

[[  1.32511486e-03   2.66747724e-04   1.20993049e-04 ...,   3.63940140e-04
    8.01237402e-05   9.97595608e-01]
 [  9.98491883e-01   4.62035096e-05   6.29660353e-05 ...,   1.27557738e-04
    8.01819551e-05   1.03422825e-03]
 [  9.99830314e-04   2.61057459e-04   1.62297220e-04 ...,   6.13722077e-04
    9.96487737e-01   1.00173627e-03]
 ..., 
 [  9.98476803e-01   8.08840923e-05   7.48760722e-05 ...,   6.66264023e-05
    3.95513307e-05   5.04741154e-04]
 [  9.99345005e-01   1.04659754e-04   5.71017044e-05 ...,   1.71472071e-04
    2.59108383e-05   1.70765779e-04]
 [  1.83245036e-04   1.19380449e-04   9.95298445e-01 ...,   8.65925103e-05
    2.09242062e-04   3.92362196e-03]]


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,LogLoss
1,"Gradient Boosting, n=200.000000 depth=50.000000",1.0,1.0,1.0,1.0,0.002185


In [8]:
from sklearn.externals import joblib

joblib.dump(gb_model, "models/GB_Model_Full_200_50.h5")

['models/GB_Model_Full_200_50.h5']