# Ensembling

In [3]:
from fastai.tabular.all import *

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

# Data directory that every later cell resolves its .pkl files against.
# NOTE(review): this is a hard-coded absolute path for one machine; adjust it
# (or switch to the commented alternative) when running elsewhere.
path = Path('/home/jupyter/fraud-detection-pytorch-scikit-fastai/data')
# Alternative location for a local checkout:
#path = Path('/Users/baranserajelahi/Codes/fraud-detection-pytorch-scikit-fastai/data')
# Presumably the fastai convention that shortens displayed paths to be
# relative to this root -- confirm against the fastcore docs.
Path.BASE_PATH = path

In [4]:
# Load the saved, preprocessed tabular object. `.load()` is not a stdlib
# pathlib method -- presumably the pickle loader that fastai patches onto Path.
to = (path/'to_nn_full.pkl').load()

# Training split: feature frame and target.
xs = to.train.xs
y = to.train.y

# Validation split: feature frame and target.
valid_xs = to.valid.xs
valid_y = to.valid.y

## Models

**Random Forest**

In [11]:
%%time
# Random forest: 1000 entropy-split trees, each fit on a bootstrap sample of
# 66% of the training rows, with class weights balanced against label
# frequency. random_state is pinned so the AUCs reported below can be
# reproduced after a kernel restart (bootstrap and feature sampling are
# otherwise re-drawn on every run).
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    max_samples=0.66,
    class_weight='balanced',
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(xs, y)

# (train ROC AUC, validation ROC AUC) using the positive-class probability column.
roc_auc_score(y, rf.predict_proba(xs)[:,1]), roc_auc_score(valid_y, rf.predict_proba(valid_xs)[:,1])

CPU times: user 5min 10s, sys: 464 ms, total: 5min 10s
Wall time: 1min 33s


(0.9999983121220913, 0.8887045938380909)

**Gradient Boosted Tree**

In [7]:
%%time
# Histogram-based gradient boosting. The training log reports a separate
# validation score every iteration, i.e. the estimator holds out part of
# (xs, y) internally and monitors 'roc_auc' on it to decide when to stop.
# random_state is pinned so that internal split -- and therefore the stopping
# iteration and the AUCs below -- are reproducible across runs.
gbt = HistGradientBoostingClassifier(
    loss='binary_crossentropy',
    learning_rate=0.03,
    max_iter=1000,
    max_depth=27,
    max_leaf_nodes=44,
    min_samples_leaf=8,
    l2_regularization=2.4,
    scoring='roc_auc',
    tol=1e-8,
    verbose=1,
    random_state=42,
).fit(xs, y)

# (train ROC AUC, validation ROC AUC) using the positive-class probability column.
roc_auc_score(y, gbt.predict_proba(xs)[:,1]), roc_auc_score(valid_y, gbt.predict_proba(valid_xs)[:,1])

Binning 0.278 GB of training data: 1.840 s
Binning 0.031 GB of validation data: 0.026 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.84357, val score: 0.84713, in 0.208s
[2/1000] 1 tree, 44 leaves, max depth = 12, train score: 0.84652, val score: 0.84848, in 0.213s
[3/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.85400, val score: 0.84944, in 0.208s
[4/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.85932, val score: 0.86002, in 0.206s
[5/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.86384, val score: 0.86090, in 0.210s
[6/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.86402, val score: 0.86132, in 0.209s
[7/1000] 1 tree, 44 leaves, max depth = 9, train score: 0.86463, val score: 0.86371, in 0.210s
[8/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.86552, val score: 0.86385, in 0.308s
[9/1000] 1 tree, 44 leaves, max depth = 12, train score: 0.86551, val score: 0.86366, in 0.287s
[10/1000] 1 tree, 4

[85/1000] 1 tree, 44 leaves, max depth = 13, train score: 0.92416, val score: 0.89243, in 0.182s
[86/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.92455, val score: 0.89251, in 0.174s
[87/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.92484, val score: 0.89262, in 0.170s
[88/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.92523, val score: 0.89260, in 0.172s
[89/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.92558, val score: 0.89279, in 0.170s
[90/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.92586, val score: 0.89350, in 0.178s
[91/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.92625, val score: 0.89391, in 0.169s
[92/1000] 1 tree, 44 leaves, max depth = 12, train score: 0.92718, val score: 0.89409, in 0.171s
[93/1000] 1 tree, 44 leaves, max depth = 13, train score: 0.92827, val score: 0.89430, in 0.182s
[94/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.92862, val score: 0.89469, in 0.171s
[95/1000] 1 tree, 44 leaves, m

[169/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.95636, val score: 0.90270, in 0.185s
[170/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.95661, val score: 0.90267, in 0.255s
[171/1000] 1 tree, 44 leaves, max depth = 20, train score: 0.95692, val score: 0.90266, in 0.342s
[172/1000] 1 tree, 44 leaves, max depth = 20, train score: 0.95711, val score: 0.90258, in 0.192s
[173/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.95722, val score: 0.90279, in 0.170s
[174/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.95743, val score: 0.90315, in 0.178s
[175/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.95767, val score: 0.90333, in 0.183s
[176/1000] 1 tree, 44 leaves, max depth = 16, train score: 0.95786, val score: 0.90362, in 0.187s
[177/1000] 1 tree, 44 leaves, max depth = 12, train score: 0.95804, val score: 0.90375, in 0.169s
[178/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.95858, val score: 0.90359, in 0.189s
[179/1000] 1 tree, 4

(0.960113077134029, 0.895746772390276)

**Neural Network**

In [26]:
# Load the previously exported neural-network learner from the data directory.
# NOTE(review): the .pkl export is presumably unpickled on load -- only load
# files from a trusted source.
learn_inf = load_learner(path/'nn_full_weighted.pkl')

In [19]:
# Number of rows in the training feature frame.
len(xs)

48246

In [20]:
# Spot check on the first training row: take element [2] of predict()'s return
# value and entry [1] within it -- presumably the class-1 probability; confirm
# against the return order documented for fastai's Learner.predict.
learn_inf.predict(xs.iloc[0])[2][1]

tensor(3.5031e-06)

In [21]:
def df_predict(learner, df):
     return np.stack([learner.predict(df.iloc[row])[2][1] for row in range(len(df))])

In [23]:
# Neural-network class-1 probabilities for every training row. The result is
# not consumed by the ensemble cell further down, which averages only the two
# tree models.
preds = df_predict(learn_inf, xs)

Exception ignored in: <function _releaseLock at 0x7fde50faca70>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 221, in _releaseLock
    def _releaseLock():
KeyboardInterrupt


Exception ignored in: <function _releaseLock at 0x7fde50faca70>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/logging/__init__.py", line 221, in _releaseLock
    def _releaseLock():
KeyboardInterrupt


KeyboardInterrupt: 

## Ensemble

In [12]:
# Positive-class probabilities from each tree model, grouped by data split.
# Training split:
rf_preds_train = rf.predict_proba(xs)[:,1]
histgbt_preds_train = gbt.predict_proba(xs)[:,1]

# Validation split:
rf_preds_valid = rf.predict_proba(valid_xs)[:,1]
histgbt_preds_valid = gbt.predict_proba(valid_xs)[:,1]

# TODO: fold the neural-network probabilities in as a third member (nn_preds).
# Ensemble = unweighted mean of the two models' probabilities.
ens_preds_train = 0.5 * (rf_preds_train + histgbt_preds_train)
ens_preds_valid = 0.5 * (rf_preds_valid + histgbt_preds_valid)

In [13]:
# ROC AUC of the averaged ensemble, shown as (train, validation).
roc_auc_score(y, ens_preds_train), roc_auc_score(valid_y, ens_preds_valid)

(0.999575031200241, 0.8964089764580041)