# Ensembling

In [25]:
from fastai.tabular.all import *

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier

import os

# Data directory. Honour the FRAUD_DATA_DIR environment variable when it is set so the
# notebook can run on another machine; otherwise fall back to the original local path.
path = Path(os.environ.get(
    'FRAUD_DATA_DIR',
    '/Users/baranserajelahi/Codes/fraud-detection-pytorch-scikit-fastai/data',
))
# BASE_PATH is assigned on the Path class itself, so every Path object sees it
# (presumably fastcore uses it to display paths relative to the data directory).
Path.BASE_PATH = path

In [26]:
# Load the saved tabular object (presumably a pickled fastai TabularPandas — confirm)
# and pull the feature frame and target out of each split.
to = (path / 'to_nn_full.pkl').load()

# Training split
xs = to.train.xs
y = to.train.y

# Validation split
valid_xs = to.valid.xs
valid_y = to.valid.y

## Models

**Random Forest**

In [27]:
%%time
# Random forest on the training split.
#   max_samples=0.66         -> each tree is grown on a bootstrap sample of 66% of the rows
#   class_weight='balanced'  -> classes are re-weighted inversely to their frequency
#   random_state=42          -> fixes the bootstrap / feature sampling so the scores below
#                               can be reproduced after Restart & Run All
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    class_weight='balanced',
    max_samples=0.66,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
).fit(xs, y)

# (train ROC AUC, validation ROC AUC), scored on column 1 of predict_proba.
roc_auc_score(y, rf.predict_proba(xs)[:, 1]), roc_auc_score(valid_y, rf.predict_proba(valid_xs)[:, 1])

CPU times: user 5min 21s, sys: 7.08 s, total: 5min 28s
Wall time: 1min 22s


(0.9999992589894948, 0.8938047929538541)

**Gradient Boosted Tree**

In [28]:
%%time
# Histogram-based gradient boosted trees on the training split.
#
# `loss` is intentionally left at its default: for a binary target the default resolves to
# the binary log loss both in older scikit-learn releases ('auto') and in current ones
# ('log_loss'), whereas the explicit name 'binary_crossentropy' was removed in
# scikit-learn 1.3 and makes this cell fail there.
#
# random_state=42 fixes the internal train/validation split used for early stopping
# (scored with `scoring='roc_auc'`), so the number of fitted trees and the scores below
# can be reproduced after Restart & Run All.
gbt = HistGradientBoostingClassifier(
    learning_rate=0.03,
    max_iter=1000,
    max_leaf_nodes=44,
    max_depth=27,
    min_samples_leaf=8,
    l2_regularization=2.4,
    scoring='roc_auc',
    tol=1e-8,
    random_state=42,
    verbose=1,
).fit(xs, y)

# (train ROC AUC, validation ROC AUC), scored on column 1 of predict_proba.
roc_auc_score(y, gbt.predict_proba(xs)[:, 1]), roc_auc_score(valid_y, gbt.predict_proba(valid_xs)[:, 1])

Binning 0.242 GB of training data: 1.554 s
Binning 0.027 GB of validation data: 0.058 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.87028, val score: 0.83194, in 0.669s
[2/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.87330, val score: 0.83810, in 0.636s
[3/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.87887, val score: 0.83998, in 0.604s
[4/1000] 1 tree, 44 leaves, max depth = 10, train score: 0.87965, val score: 0.84296, in 0.614s
[5/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.88276, val score: 0.84328, in 0.483s
[6/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.88299, val score: 0.84369, in 0.439s
[7/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.88597, val score: 0.85403, in 0.589s
[8/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.88657, val score: 0.85798, in 0.815s
[9/1000] 1 tree, 44 leaves, max depth = 11, train score: 0.88889, val score: 0.86066, in 0.280s
[10/1000] 1 tre

[85/1000] 1 tree, 44 leaves, max depth = 13, train score: 0.93810, val score: 0.89886, in 0.341s
[86/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.93862, val score: 0.89885, in 0.397s
[87/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.93889, val score: 0.89921, in 0.374s
[88/1000] 1 tree, 44 leaves, max depth = 17, train score: 0.93966, val score: 0.89929, in 0.413s
[89/1000] 1 tree, 44 leaves, max depth = 14, train score: 0.94036, val score: 0.89991, in 0.378s
[90/1000] 1 tree, 44 leaves, max depth = 18, train score: 0.94067, val score: 0.89999, in 0.375s
[91/1000] 1 tree, 44 leaves, max depth = 16, train score: 0.94122, val score: 0.90003, in 0.355s
[92/1000] 1 tree, 44 leaves, max depth = 12, train score: 0.94155, val score: 0.90036, in 0.538s
[93/1000] 1 tree, 44 leaves, max depth = 21, train score: 0.94184, val score: 0.90052, in 0.578s
[94/1000] 1 tree, 44 leaves, max depth = 12, train score: 0.94220, val score: 0.90057, in 0.583s
[95/1000] 1 tree, 44 leaves, m

[169/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.96812, val score: 0.91280, in 0.654s
[170/1000] 1 tree, 44 leaves, max depth = 20, train score: 0.96829, val score: 0.91272, in 0.653s
[171/1000] 1 tree, 44 leaves, max depth = 20, train score: 0.96838, val score: 0.91272, in 0.906s
[172/1000] 1 tree, 44 leaves, max depth = 17, train score: 0.96867, val score: 0.91272, in 0.360s
[173/1000] 1 tree, 44 leaves, max depth = 18, train score: 0.96887, val score: 0.91284, in 0.378s
[174/1000] 1 tree, 44 leaves, max depth = 15, train score: 0.96925, val score: 0.91283, in 0.388s
[175/1000] 1 tree, 44 leaves, max depth = 13, train score: 0.96942, val score: 0.91260, in 0.479s
Fit 175 trees in 91.340 s, (7700 total leaves)
Time spent computing histograms: 51.614s
Time spent finding best splits:  9.172s
Time spent applying splits:      9.304s
Time spent predicting:           0.069s
CPU times: user 6min 19s, sys: 2min 46s, total: 9min 6s
Wall time: 1min 32s


(0.9602684678590151, 0.8904493780352221)

**Neural Network**

In [15]:
# Load the exported neural-network learner from the data directory (cpu=True is passed to load_learner).
# NOTE(review): load_learner unpickles the file, so only load exports that come from a trusted source.
learn_inf = load_learner(path/'nn_full_weighted.pkl', cpu=True)

In [24]:
# Number of rows in the training feature frame.
len(xs)

48378

In [39]:
# Probe the learner on the first training row.
# Element [2] of the result is presumably the per-class probability tensor and [1] the
# fraud-class entry — TODO confirm against the learner's vocab.
first_row_result = learn_inf.predict(xs.iloc[0])
first_row_result[2][1]

tensor(2.1046e-06)

In [None]:
# Per-tree class probabilities of the random forest on the validation split, stacked along a
# new leading axis -> shape (n_trees, n_rows, n_classes).
# `rf` is the fitted forest defined in this notebook; the previous name `m` was never defined.
np.stack([tree.predict_proba(valid_xs) for tree in rf.estimators_])

In [42]:
def df_predict(learner, df):
    """Return the class-1 probability predicted by `learner` for every row of `df`.

    Parameters
    ----------
    learner : object exposing ``predict(row)``; element ``[2][1]`` of the result is taken
        as the class-1 probability (this is how the notebook indexes fastai's
        ``Learner.predict`` output).
    df : pandas.DataFrame whose rows are passed to ``learner.predict`` one at a time
        via ``df.iloc[row]``.

    Returns
    -------
    numpy.ndarray of shape ``(len(df),)`` and dtype float; empty when `df` has no rows.

    Notes
    -----
    Predicting row by row is slow on large frames.
    """
    # float() turns a 0-d tensor (or a plain number) into a Python float, so the result is a
    # flat float array and an empty frame yields an empty array instead of an error.
    probs = [float(learner.predict(df.iloc[row])[2][1]) for row in range(len(df))]
    return np.array(probs, dtype=float)

In [None]:
# Run the loaded learner over every training row (df_predict calls learner.predict once per row).
df_predict(learn_inf, xs)

## Ensemble

In [29]:
# Column 1 of predict_proba from each tree model, computed for the training split first
# and the validation split second.
rf_preds_train, rf_preds_valid = (rf.predict_proba(split)[:, 1] for split in (xs, valid_xs))
histgbt_preds_train, histgbt_preds_valid = (gbt.predict_proba(split)[:, 1] for split in (xs, valid_xs))

# TODO: nn_preds — the neural-network predictions are not part of the ensemble yet.
# Ensemble = unweighted mean of the two models' probabilities.
ens_preds_train = 0.5 * (rf_preds_train + histgbt_preds_train)
ens_preds_valid = 0.5 * (rf_preds_valid + histgbt_preds_valid)

In [30]:
# ROC AUC of the averaged ensemble on the training and validation splits.
ens_auc_train = roc_auc_score(y, ens_preds_train)
ens_auc_valid = roc_auc_score(valid_y, ens_preds_valid)
ens_auc_train, ens_auc_valid

(0.9994092417249381, 0.8969180552301927)