In [1]:
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [2]:
DATA_FILE = 'data/original_train_data.h5'
METADATA = 'metadata.npy'

f = h5py.File(DATA_FILE, 'r')
data_x = f['x']
data_y = f['y']
metadata = np.load(METADATA).item()

In [3]:
import pandas as pd
from sklearn import metrics

results = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'LogLoss'])

In [4]:
from sklearn.model_selection import train_test_split

data_x = np.asarray(data_x)
data_x = data_x.reshape(data_x.shape[0], -1)
data_y = np.asarray(data_y)
X_train, X_val, y_train, y_val = train_test_split(data_x, data_y, test_size=0.15, random_state=42)

RANDOM FOREST MODEL

In [5]:
n_estimators = 200
max_depth = 60
model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=60, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [7]:
preds = model.predict(X_val)
print(preds.shape)
idx = len(results) + 1

results.loc[idx] = ['Random Forest, n=%f' % n_estimators + ' depth=%f' % max_depth,
                       metrics.accuracy_score(y_val, preds),
                       metrics.precision_score(y_val, preds, average="micro"),
                       metrics.recall_score(y_val, preds, average="micro"),
                       metrics.f1_score(y_val, preds, average="micro"),
                       metrics.log_loss(y_val, preds)]

results

(567, 8)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,LogLoss
1,"Random Forest, n=200.000000 depth=60.000000",0.850088,0.971774,0.850088,0.906867,1.113198
2,"Random Forest, n=200.000000 depth=60.000000",0.850088,0.971774,0.850088,0.906867,1.113198


Gradient Boosting Model

In [5]:
from xgboost import XGBClassifier



In [6]:
n_estimators = 10
max_depth = 3
learning_rate = 0.1

y_train_reshape = y_train.argmax(1)
print(y_train.shape)
print(y_train_reshape.shape)
gb_model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, silent=False)
gb_model.fit(X_train, y_train_reshape)

(3210, 8)
(3210,)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=10, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=False, subsample=1)

In [8]:
preds = gb_model.predict(X_val)
print(preds)

[0 7 6 7 4 7 6 0 0 0 7 0 0 5 0 6 0 0 0 3 0 0 0 7 0 7 0 7 0 0 0 7 7 0 0 0 7
 0 6 0 0 0 0 4 0 0 0 7 7 0 5 0 4 7 4 0 0 0 6 0 0 0 7 0 0 0 7 4 0 0 0 0 6 3
 0 0 1 0 0 2 2 7 7 0 0 0 7 4 0 4 0 1 0 0 5 0 7 5 5 0 4 0 0 4 0 1 0 7 6 0 2
 0 0 0 0 5 7 0 0 0 0 0 5 0 4 7 0 0 0 5 0 0 0 0 0 7 6 0 0 0 0 0 0 0 0 5 0 7
 0 0 0 7 0 0 7 6 0 0 0 7 0 7 0 0 7 0 3 0 1 0 0 0 0 0 7 6 0 0 0 0 7 4 7 7 7
 0 5 4 4 1 0 0 0 0 0 0 0 0 0 7 7 6 0 0 0 0 0 4 0 7 0 7 0 0 0 0 5 0 0 0 0 0
 0 1 0 0 0 0 7 7 5 0 0 0 0 4 0 0 0 5 7 0 0 0 0 0 0 0 0 0 0 6 0 4 4 0 0 4 0
 0 0 2 0 0 7 5 6 0 0 7 0 0 0 7 7 7 0 0 0 0 0 1 5 0 0 0 6 2 0 0 0 0 0 0 0 0
 0 0 0 6 0 0 0 0 0 7 0 4 6 0 6 0 0 0 7 0 0 6 0 5 5 0 7 0 0 0 7 0 2 6 0 0 0
 0 0 7 0 7 4 0 0 0 0 0 7 7 5 0 6 7 1 0 0 0 0 0 0 0 0 0 0 0 7 0 4 6 0 0 0 0
 7 0 7 4 4 0 7 0 0 0 0 0 0 0 0 0 0 7 0 4 0 7 4 0 0 0 7 1 0 1 0 7 5 0 0 6 5
 7 6 0 0 0 0 4 7 0 0 0 5 0 7 1 0 1 4 7 0 0 5 4 7 0 0 0 1 0 0 7 0 0 5 5 1 0
 0 0 7 0 7 4 0 0 4 0 0 7 0 0 4 7 4 0 1 0 0 0 4 6 6 7 0 0 0 0 4 6 0 2 0 0 0
 0 0 7 0 7 0 0 7 0 4 0 0 