In [1]:
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [2]:
DATA_FILE = 'data/original_train_data.h5'
METADATA = 'metadata.npy'

# Read both datasets fully into memory inside a context manager so the HDF5
# handle is closed as soon as loading finishes instead of staying open for
# the lifetime of the kernel. `[...]` materialises each dataset as a NumPy
# array, so later cells no longer depend on the file being open.
with h5py.File(DATA_FILE, 'r') as f:
    data_x = f['x'][...]
    data_y = f['y'][...]

# `.item()` extracts the single Python object stored in the .npy file, which
# means the file is pickled. NumPy >= 1.16.3 refuses to unpickle unless
# allow_pickle=True is passed explicitly. Only load metadata files you trust:
# unpickling can execute arbitrary code.
metadata = np.load(METADATA, allow_pickle=True).item()

In [3]:
import pandas as pd
from sklearn import metrics

# Empty scoreboard used to compare models: one name column followed by the
# five evaluation metrics recorded for each model.
RESULT_COLUMNS = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'LogLoss']
results = pd.DataFrame(columns=RESULT_COLUMNS)

In [4]:
from sklearn.model_selection import train_test_split

# Materialise the features as a NumPy array and flatten every sample into a
# single row, giving a 2-D (n_samples, n_features) matrix.
data_x = np.asarray(data_x)
data_x = np.reshape(data_x, (len(data_x), -1))
data_y = np.asarray(data_y)

# Hold out 15% of the samples for validation; the fixed random_state keeps
# the split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    data_x,
    data_y,
    test_size=0.15,
    random_state=42,
)

Gradient Boosting Model

In [7]:
from xgboost import XGBClassifier



In [8]:
# Hyper-parameters of the boosted-tree classifier. They stay as module-level
# names because the evaluation cell reuses them to label its results row.
n_estimators, max_depth, learning_rate = 200, 50, 0.1

# Collapse the 2-D label matrix into a 1-D vector of class indices by taking
# the arg-max along axis 1 (presumably one-hot labels -- confirm against the
# data file).
y_train_reshape = np.argmax(y_train, axis=1)
print(y_train.shape, y_train_reshape.shape, sep='\n')

xgb_params = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    silent=False,
)
gb_model = XGBClassifier(**xgb_params)
# fit() is the last expression so the notebook displays the fitted estimator.
gb_model.fit(X_train, y_train_reshape)

(3210, 8)
(3210,)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=50,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [9]:
# Hard class predictions and per-class probabilities for the validation split.
preds = gb_model.predict(X_val)
print(preds)

proba = gb_model.predict_proba(X_val)
print(proba)

# Reduce the 2-D validation labels to class indices so they are comparable
# with `preds`.
y_val_reshape = y_val.argmax(1)

# Next free 1-based row label in the shared scoreboard.
# NOTE(review): re-running this cell appends another row instead of
# replacing the previous one.
idx = len(results) + 1

# The hyper-parameters are integers, so format them with %d; %f rendered them
# as "n=200.000000 depth=50.000000".
# NOTE(review): with single-label multiclass predictions, micro-averaged
# precision, recall and F1 all coincide with accuracy, so these three columns
# carry no extra information. Consider average="macro" if per-class behaviour
# matters.
# `labels=` pins the column order of `proba` to the classes the model was
# trained on, so log_loss still works when the validation split happens to
# miss a class.
results.loc[idx] = ['Gradient Boosting, n=%d' % n_estimators + ' depth=%d' % max_depth,
                       metrics.accuracy_score(y_val_reshape, preds),
                       metrics.precision_score(y_val_reshape, preds, average="micro"),
                       metrics.recall_score(y_val_reshape, preds, average="micro"),
                       metrics.f1_score(y_val_reshape, preds, average="micro"),
                       metrics.log_loss(y_val_reshape, proba, labels=gb_model.classes_)]

results

[7 0 6 0 4 7 6 7 1 0 0 4 6 5 0 6 0 0 0 3 0 0 0 7 0 7 5 7 0 0 5 7 7 0 0 0 7
 0 6 5 0 0 0 4 0 0 0 7 7 0 5 0 4 7 4 0 4 0 6 0 0 0 7 0 7 0 7 4 0 0 0 0 6 3
 0 0 1 4 0 7 2 7 2 0 0 0 7 4 4 4 0 1 1 0 5 0 7 5 5 0 4 0 0 4 0 1 0 4 0 0 2
 0 0 0 0 5 7 7 0 0 0 0 5 5 4 7 0 0 0 5 0 0 0 0 3 4 7 7 0 0 0 0 0 0 0 5 0 7
 0 0 0 7 5 0 7 6 0 4 0 7 2 7 0 0 7 0 3 0 1 0 0 0 0 0 7 6 7 0 0 0 7 4 7 7 7
 0 5 4 4 1 0 0 7 3 0 0 7 5 0 7 7 6 0 0 0 0 7 1 0 7 0 7 0 0 4 0 5 0 0 0 0 5
 0 1 0 0 0 0 7 4 5 0 7 0 0 4 0 0 1 5 7 0 0 0 0 7 0 0 0 0 0 6 0 4 4 0 0 4 0
 0 7 2 0 0 0 5 6 0 0 7 0 0 0 0 7 7 7 0 4 0 0 1 5 4 0 0 6 7 0 4 0 0 0 0 0 0
 0 7 0 6 0 4 1 0 0 2 0 4 6 0 6 0 0 0 7 0 7 6 0 5 5 5 7 1 0 0 7 0 2 6 0 0 7
 5 0 7 1 0 4 0 0 0 1 0 2 7 5 0 6 7 1 0 0 0 0 0 0 7 0 0 0 0 7 0 4 6 0 4 0 0
 7 0 2 4 4 0 7 0 0 0 2 0 0 0 4 0 7 7 0 4 0 7 4 0 7 0 7 1 0 1 0 5 5 0 0 6 5
 7 6 5 0 0 0 4 1 0 0 5 5 0 7 1 0 1 4 7 0 0 5 4 7 7 0 0 1 0 4 7 0 0 5 5 1 7
 0 0 7 0 7 4 0 0 4 7 0 2 0 0 4 7 4 0 1 0 0 0 4 6 6 2 5 0 7 0 4 6 0 2 0 5 0
 0 0 7 0 7 0 0 7 0 4 0 0 

[[  1.41552379e-02   3.56436614e-03   3.81787220e-04 ...,   2.24787183e-03
    7.64663506e-04   9.77745891e-01]
 [  9.88884091e-01   1.22723781e-04   1.24228463e-04 ...,   4.91133018e-04
    3.20333638e-04   9.59310215e-03]
 [  6.78266538e-03   1.71843392e-03   6.69499743e-04 ...,   2.37684441e-03
    9.82036054e-01   5.12925163e-03]
 ..., 
 [  9.96762633e-01   1.45370679e-04   2.38086257e-04 ...,   1.01654987e-04
    7.72333951e-05   9.61314945e-04]
 [  9.99212861e-01   1.47608182e-04   3.24116736e-05 ...,   2.30838472e-04
    3.65355154e-05   1.74910994e-04]
 [  3.85983469e-04   4.24594386e-04   9.29917395e-01 ...,   2.14068597e-04
    5.97974868e-04   6.79094195e-02]]


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,LogLoss
1,"Random Forest, n=200.000000 depth=60.000000",0.850088,0.971774,0.850088,0.906867,1.113198
2,"Gradient Boosting, n=200.000000 depth=50.000000",0.955908,0.955908,0.955908,0.955908,0.18824


In [12]:
import os

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package and fall back to the
# vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): despite the ".h5" suffix this file is a joblib pickle, not an
# HDF5 file. The path is kept unchanged so existing loaders keep working.
MODEL_PATH = "models/GB_Model.h5"

# joblib.dump does not create missing parent directories.
model_dir = os.path.dirname(MODEL_PATH)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# dump() returns the list of written file names, which the notebook displays.
joblib.dump(gb_model, MODEL_PATH)

['models/GB_Model.h5']