# Explanatory Notebooks
please ensure to start your jupyter server from the "Notebooks" directory to comply with relative path in this demo

## Imports

In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import pickle

## Data loading

In [23]:
data = pd.read_csv('data/train.csv')

In [24]:
data.head(2)

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0


## Model fitting

### train test split 

In [25]:
np.random.seed(42)

In [26]:
train = data.loc[np.random.rand(len(data)) < 0.6]

In [27]:
test = data.loc[np.random.rand(len(data)) > 0.4]

Quick check on the target rate, to ensure this split is relevant : 

In [28]:
1.0*sum(train.TARGET )/len(train)

0.039698205904285654

In [29]:
1.0*sum(test.TARGET )/len(test)

0.039036871801600842

### Fitting the model

In [30]:
# Parameters for the 
params = {
    'max_depth':6, 
    'eta':0.1, 
    'n_estimators':100, 
    'objective':'binary:logistic', 
    'nthread':-1, 
    'min_child_weight':3, 
    'seed':1337,
    'eval_metric':'auc'
}

In [31]:
xgb_train = xgb.DMatrix(train.drop('TARGET', axis = 1), label = train.TARGET==1)
xgb_eval = xgb.DMatrix(test.drop('TARGET', axis = 1), label = test.TARGET==1)

In [32]:
model = xgb.train(params=params, 
          dtrain=xgb_train,
          num_boost_round=100,
          evals=[(xgb_train,'train'), (xgb_eval,'evaluation')], 
          verbose_eval=10
     )

[0]	train-auc:0.815937	evaluation-auc:0.81089
[10]	train-auc:0.848127	evaluation-auc:0.839665
[20]	train-auc:0.861809	evaluation-auc:0.851111
[30]	train-auc:0.873297	evaluation-auc:0.858172
[40]	train-auc:0.883245	evaluation-auc:0.865858
[50]	train-auc:0.889465	evaluation-auc:0.869843
[60]	train-auc:0.895267	evaluation-auc:0.873623
[70]	train-auc:0.897779	evaluation-auc:0.875331
[80]	train-auc:0.900066	evaluation-auc:0.876799
[90]	train-auc:0.901921	evaluation-auc:0.878025


In [43]:
model_sklearn = xgb.XGBClassifier()

In [44]:
type(model_sklearn)

xgboost.sklearn.XGBClassifier

In [45]:
model_sklearn.fit(train.drop('TARGET', axis = 1),train.TARGET==1)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [46]:
pickle.dump(model_sklearn, open("./data/model_sklearn.pickle.dat", "wb"))

In [20]:
type(model_sklearn)

xgboost.sklearn.XGBClassifier

In [101]:
# save model to file
pickle.dump(model, open("./data/model.pickle.dat", "wb"))

In [15]:
trees = model.get_dump(with_stats=True)

# Notes

* on peut aussi prendre la version scikit en recuperant model._Booster.get_dump
* garder la pr_auc dans un coin 
* un model sauvegardé et un model "dumpé" n'ont pas le même format. Réconcilier les deux, en commençant par un vrai modèle puis en faisant le cleaning pour un modèle dumpé
* comme on sait dans quelle feuille ils sont tombés, on peut faire des stats de proximité

## parcours d'un arbre avec les children (checker le souci avec le self, mais la méthode est cool : l'ensemble est un dictionnaire avec comme clé le numéro de la feuille !!!

# Annexes

In [None]:
model.dump_model('./data/model.dump', with_stats=True)

In [87]:
with open('./data/model.dump') as f:
    trees = f.read()

In [95]:
from feature_importance_gbm.tree_information import Tree

In [97]:
trees = model.get_dump(with_stats=True)

In [None]:
forest = []

In [91]:
trees.split('booster')

AttributeError: 'list' object has no attribute 'trim'

In [None]:
model.save_model('./data/model.save')

In [None]:
model.load_model('')

In [93]:
xgb.Booster.load_model()

TypeError: load_model() missing 1 required positional argument: 'self'

In [None]:
def pr_auc(preds, dtrain):
    labels = dtrain.get_label()
    precision, recall, thresholds = precision_recall_curve(labels, preds)
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'pr_auc', (xgb_train,'train')(xgb_train,'train')(xgb_train,'train')(xgb_train,'train')(xgb_train,'train')
