In [1]:
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from keras.models import load_model
from keras import backend as K

%matplotlib inline

Using TensorFlow backend.


In [2]:
DATA_FILE = 'data/original_train_data.h5'
METADATA = 'metadata.npy'

# Read both datasets fully into memory while the file is open, then let the
# context manager close the HDF5 handle. Leaving the file open for the life of
# the kernel leaks the handle and keeps the file locked for other readers.
with h5py.File(DATA_FILE, 'r') as f:
    data_x = np.asarray(f['x'])
    data_y = np.asarray(f['y'])

# .item() unwraps the single Python object stored in the .npy file.
# allow_pickle=True is required by NumPy >= 1.16.3 to load object arrays.
# NOTE(review): unpickling can execute arbitrary code -- only load a metadata
# file that comes from a trusted source.
metadata = np.load(METADATA, allow_pickle=True).item()

In [3]:
# Make sure features and labels are plain in-memory NumPy arrays before splitting.
data_x, data_y = np.asarray(data_x), np.asarray(data_y)

# Hold out 15% of the samples for validation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    data_x,
    data_y,
    test_size=0.15,
    random_state=42,
)

In [4]:
# Empty comparison table; each evaluation cell appends one row per model.
results = pd.DataFrame(
    columns=[
        'Model',
        'Accuracy',
        'Precision',
        'Recall',
        'F1',
        'LogLoss',
    ]
)

CNN MODEL

In [5]:
def recall(y_true, y_pred):
    """Recall metric written with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded, so a prediction counts as
    positive once it rounds to 1.

    Args:
        y_true: ground-truth tensor.
        y_pred: predicted tensor, multiplied element-wise with ``y_true``.

    Returns:
        Number of true positives divided by the number of actual positives,
        with ``K.epsilon()`` added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision(y_true, y_pred):
    """Precision metric written with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded, so a prediction counts as
    positive once it rounds to 1.

    Args:
        y_true: ground-truth tensor.
        y_pred: predicted tensor, multiplied element-wise with ``y_true``.

    Returns:
        Number of true positives divided by the number of predicted positives,
        with ``K.epsilon()`` added to the denominator to avoid division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1(y_true, y_pred):
    """F1 score (harmonic mean of precision and recall) with Keras backend ops.

    Precision and recall are computed inline, using the same clip-and-round
    counting as the standalone ``precision`` and ``recall`` metrics, so this
    function does not depend on any other name in the notebook.

    Args:
        y_true: ground-truth tensor.
        y_pred: predicted tensor, multiplied element-wise with ``y_true``.

    Returns:
        ``2 * precision * recall / (precision + recall + K.epsilon())``.
        The epsilon keeps the result at 0 instead of NaN when there are no
        true positives (precision and recall both zero).
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = true_positives / (predicted_positives + K.epsilon())
    recall_value = true_positives / (possible_positives + K.epsilon())

    # Guard the denominator: without epsilon, 0 / 0 would produce NaN.
    return 2 * ((precision_value * recall_value) /
                (precision_value + recall_value + K.epsilon()))


# The saved model references its custom metrics by name, so the metric
# functions are passed to load_model under those names.
cnn_model = load_model(
    'models/CNN_Model.h5',
    custom_objects={'recall': recall, 'precision': precision, 'f1': f1},
)

# evaluate() returns the loss followed by the compiled metrics. The unpacking
# assumes the order accuracy, recall, precision, f1 -- TODO confirm against
# cnn_model.metrics_names.
# The metric values get a cnn_ prefix so they do not rebind the recall /
# precision / f1 functions to floats.
score, acc, cnn_recall, cnn_precision, cnn_f1 = cnn_model.evaluate(X_val, y_val, batch_size=64)

# predict() returns the same per-class outputs as the deprecated
# predict_proba(), and is available on every Keras model type.
cnn_proba = cnn_model.predict(X_val)

idx = len(results) + 1
results.loc[idx] = ['CNN', acc, cnn_precision, cnn_recall, cnn_f1, score]

results

 64/567 [==>...........................] - ETA: 6s

128/567 [=====>........................] - ETA: 4s


















 32/567 [>.............................] - ETA: 6s

 64/567 [==>...........................] - ETA: 4s

 96/567 [====>.........................] - ETA: 4s

128/567 [=====>........................] - ETA: 4s






























[[  1.26575690e-03   9.39438865e-03   2.53276230e-04 ...,   1.98720692e-04
    1.46540027e-04   9.88517284e-01]
 [  9.99981403e-01   1.15906516e-06   1.78377820e-06 ...,   4.93893731e-06
    5.73473017e-07   3.90833247e-06]
 [  3.40599254e-05   1.99182778e-05   8.83281700e-06 ...,   1.14394888e-05
    9.99841452e-01   6.01492502e-05]
 ..., 
 [  9.99257863e-01   3.62396931e-05   8.37494808e-05 ...,   8.12816215e-05
    1.63660225e-05   8.48591371e-05]
 [  9.93568659e-01   4.45203506e-04   1.05121449e-04 ...,   3.44426790e-03
    8.03063886e-05   4.12302936e-04]
 [  6.27813788e-05   4.58951436e-05   9.99611676e-01 ...,   1.28759020e-05
    8.72376950e-06   2.38754248e-04]]


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,LogLoss
1,CNN,0.96649,0.968142,0.964727,0.966421,0.143362


GRADIENT BOOST MODEL

In [6]:
# Flatten every sample to a single feature vector: (n_samples, ...) -> (n_samples, n_features).
data_x = np.asarray(data_x).reshape(len(data_x), -1)
data_y = np.asarray(data_y)

# Re-split with the same test_size and random_state as the earlier split.
# NOTE(review): this overwrites X_train / X_val / y_train / y_val with the
# flattened version, so cells above that expect the un-flattened arrays must
# not be re-run after this one without re-running the earlier split.
X_train, X_val, y_train, y_val = train_test_split(
    data_x,
    data_y,
    test_size=0.15,
    random_state=42,
)

In [7]:
# Despite the .h5 extension, this file is read with joblib (a pickle-based
# format), not with h5py or Keras. Only load it from a trusted source.
gb_model = joblib.load(filename="models/GB_Model.h5")



In [8]:
def get_preds_from_proba(proba):
    """Turn a 2-D array of per-class scores into one-hot predictions.

    Args:
        proba: NumPy array of shape (n_samples, n_classes).

    Returns:
        A new array with the same shape and dtype as ``proba`` holding 1 at
        each row's highest-scoring column (the first one on ties) and 0
        elsewhere. ``proba`` itself is not modified.
    """
    winning_class = proba.argmax(axis=1)
    row_index = np.arange(len(proba))

    one_hot = np.zeros_like(proba)
    one_hot[row_index, winning_class] = 1
    return one_hot

In [10]:
from xgboost import XGBClassifier

# Class probabilities from the gradient-boosting model, then one-hot
# predictions taken from each row's most probable class.
gb_proba = gb_model.predict_proba(X_val)
gb_preds = get_preds_from_proba(gb_proba)

# Append one row to the comparison table. Precision, recall and F1 are
# micro-averaged; log loss is computed on the raw probabilities.
idx = len(results) + 1
results.loc[idx] = [
    'Gradient Boosting',
    metrics.accuracy_score(y_val, gb_preds),
    metrics.precision_score(y_val, gb_preds, average="micro"),
    metrics.recall_score(y_val, gb_preds, average="micro"),
    metrics.f1_score(y_val, gb_preds, average="micro"),
    metrics.log_loss(y_val, gb_proba),
]

results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,LogLoss
1,CNN,0.96649,0.968142,0.964727,0.966421,0.143362
2,Gradient Boosting,0.955908,0.955908,0.955908,0.955908,0.18824


SOFT VOTING

In [11]:
from sklearn.ensemble import VotingClassifier
from keras.wrappers.scikit_learn import KerasClassifier

models_list = [gb_model, cnn_model]
weights = [1, 6]

# Soft voting: weighted mean of the two probability arrays. The weights follow
# the stacking order below, so gradient boosting gets 1 and the CNN gets 6.
proba = np.average(
    np.asarray([gb_proba, cnn_proba]),
    axis=0,
    weights=weights,
)

# One-hot predictions taken from each row's most probable class.
preds = get_preds_from_proba(proba)

# Append one row to the comparison table. Precision, recall and F1 are
# micro-averaged; log loss is computed on the averaged probabilities.
idx = len(results) + 1
results.loc[idx] = [
    'Soft Voting',
    metrics.accuracy_score(y_val, preds),
    metrics.precision_score(y_val, preds, average="micro"),
    metrics.recall_score(y_val, preds, average="micro"),
    metrics.f1_score(y_val, preds, average="micro"),
    metrics.log_loss(y_val, proba),
]

results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,LogLoss
1,CNN,0.96649,0.968142,0.964727,0.966421,0.143362
2,Gradient Boosting,0.955908,0.955908,0.955908,0.955908,0.18824
3,Soft Voting,0.96649,0.96649,0.96649,0.96649,0.140977
