# Ensembling

In [17]:
from fastai.tabular.all import *

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

path = Path('/Users/baranserajelahi/Codes/fraud-detection-pytorch-scikit-fastai/data')
Path.BASE_PATH = path

In [18]:
to = (path/'to.pkl').load()
xs, y             = to.train.xs, to.train.y
valid_xs, valid_y = to.valid.xs, to.valid.y

## Models

**Random Forest**

In [16]:
%%time
rf = RandomForestClassifier(n_jobs=-1 , max_samples=0.66, oob_score=True,
                            n_estimators=1000,  criterion='entropy', class_weight='balanced').fit(xs,y)

roc_auc_score(y, rf.predict_proba(xs)[:,1]), roc_auc_score(valid_y, rf.predict_proba(valid_xs)[:,1])

CPU times: user 2min 25s, sys: 2.52 s, total: 2min 27s
Wall time: 28.8 s


(0.9999999999999999, 0.8591078579310556)

**Gradient Boosted Tree**

In [19]:
%%time
gbt = HistGradientBoostingClassifier(loss='binary_crossentropy', verbose=1, l2_regularization=2.4,
                             learning_rate=0.03, max_depth=27, max_iter=1000, max_leaf_nodes=44,
                             min_samples_leaf=8,  scoring='roc_auc', tol=1e-8).fit(xs,y)

roc_auc_score(y, gbt.predict_proba(xs)[:,1]), roc_auc_score(valid_y, gbt.predict_proba(valid_xs)[:,1])

Binning 0.042 GB of training data: 0.269 s
Binning 0.005 GB of validation data: 0.019 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.85910, val score: 0.83711, in 0.085s
[2/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.86387, val score: 0.84390, in 0.073s
[3/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.86610, val score: 0.84655, in 0.082s
[4/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.87292, val score: 0.86242, in 0.057s
[5/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.87804, val score: 0.86661, in 0.074s
[6/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.87861, val score: 0.86396, in 0.091s
[7/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.88009, val score: 0.86565, in 0.064s
[8/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.87995, val score: 0.86607, in 0.077s
[9/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.88000, val score: 0.86682, in 0.085s
[10/1000] 1 tree, 

[85/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.93341, val score: 0.90388, in 0.075s
[86/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.93389, val score: 0.90411, in 0.078s
[87/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.93474, val score: 0.90429, in 0.074s
[88/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.93554, val score: 0.90425, in 0.074s
[89/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.93612, val score: 0.90435, in 0.097s
[90/1000] 1 tree, 44 leaves, max depth = 17, train score: 0.93663, val score: 0.90454, in 0.119s
[91/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.93704, val score: 0.90469, in 0.137s
[92/1000] 1 tree, 44 leaves, max depth = 18, train score: 0.93750, val score: 0.90539, in 0.104s
[93/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.93822, val score: 0.90585, in 0.087s
[94/1000] 1 tree, 44 leaves, max depth = 12, train score: 0.93860, val score: 0.90612, in 0.088s
[95/1000] 1 tree, 44 leaves, ma

[169/1000] 1 tree, 44 leaves, max depth = 17, train score: 0.96650, val score: 0.91299, in 0.103s
[170/1000] 1 tree, 44 leaves, max depth = 16, train score: 0.96675, val score: 0.91326, in 0.105s
[171/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.96693, val score: 0.91322, in 0.101s
[172/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.96700, val score: 0.91326, in 0.106s
[173/1000] 1 tree, 44 leaves, max depth = 19, train score: 0.96726, val score: 0.91310, in 0.119s
[174/1000] 1 tree, 44 leaves, max depth = 16, train score: 0.96757, val score: 0.91314, in 0.134s
[175/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.96790, val score: 0.91324, in 0.112s
[176/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.96801, val score: 0.91317, in 0.129s
[177/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.96835, val score: 0.91307, in 0.216s
[178/1000] 1 tree, 44 leaves, max depth = 13, train score: 0.96845, val score: 0.91309, in 0.119s
[179/1000] 1 tree, 4

(0.9651940400817189, 0.8598415063071321)

**Neural Network**

In [25]:
to_nn = (path/'to_nn_full.pkl').load()

dls = to_nn.dataloaders(1024)
learn = tabular_learner(dls, layers=[500,250])
learn.loss_func

FlattenedLoss of CrossEntropyLoss()

In [26]:
learn.load('nn_nf_full_stage-1')

FileNotFoundError: [Errno 2] No such file or directory: 'models/nn_nf_full_stage-1.pth'

## Ensemble

In [20]:
rf_preds_train = rf.predict_proba(xs)[:,1]
rf_preds_valid = rf.predict_proba(valid_xs)[:,1]

histgbt_preds_train = gbt.predict_proba(xs)[:,1]
histgbt_preds_valid = gbt.predict_proba(valid_xs)[:,1]

#nn_preds =
ens_preds_train =  (rf_preds_train + histgbt_preds_train) / 2
ens_preds_valid =  (rf_preds_valid + histgbt_preds_valid) / 2

In [21]:
roc_auc_score(y, ens_preds_train), roc_auc_score(valid_y, ens_preds_valid)

(0.9993501460215007, 0.8628004078406483)