In [None]:
# Star import stays first so the explicit imports below still take precedence
# over any same-named symbol exported by `util`.
from util import *
from collections import ChainMap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the train/test features and labels from the CSV. `load_data` is not
# defined in this notebook (presumably it comes from `from util import *`).
data_splits = load_data('var_w1.csv')
x_train, x_test, y_train, y_test = data_splits
# Preview the first rows of the training features.
x_train.head()

In [None]:
# Stacked "GBT + LR" model:
#   1. split the training data in half,
#   2. fit a gradient-boosted ensemble on the first half,
#   3. one-hot encode the leaf index each sample reaches in every tree,
#   4. fit a logistic regression on those encoded leaves using the second half.
#
# The split and the booster are seeded so that Restart & Run All reproduces
# the same model and the same downstream metrics.
RANDOM_STATE = 42

x_train_bt, x_train_lr, y_train_bt, y_train_lr = train_test_split(
    x_train, y_train, test_size=0.5, random_state=RANDOM_STATE)

grd = GradientBoostingClassifier(n_estimators=100, verbose=1,
                                 random_state=RANDOM_STATE)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression(max_iter=1000, verbose=1)

grd.fit(x_train_bt, y_train_bt)

# `apply` returns an array indexed as [sample, estimator, class-slot];
# slot 0 is selected to get one leaf index per tree per sample.
grd_enc.fit(grd.apply(x_train_bt)[:, :, 0])

# The linear model is trained on the held-back half so it does not see the
# rows the trees were fitted on.
grd_lm.fit(grd_enc.transform(grd.apply(x_train_lr)[:, :, 0]), y_train_lr)

In [None]:
# Score the test set with the stacked model: tree leaves -> one-hot -> LR
# probability of the positive class (column 1 of predict_proba).
test_leaves = grd.apply(x_test)[:, :, 0]
y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(test_leaves))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

# ROC curve with labelled axes and a chance diagonal so the figure can be
# read on its own.
plt.figure()
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (test set)')
plt.legend(loc='best')
plt.show()

In [None]:
# Print the test-set score of the stacked model (from its predicted
# probabilities), then the score of the plain boosted ensemble.
# `fdr_prob` and `fdr` are not defined in this notebook (presumably they come
# from `from util import *`); their exact metric definition is not visible here.
stacked_fdr = fdr_prob(y_test, y_pred_grd_lm)
print(stacked_fdr)
gbt_fdr = fdr(grd, x_test, y_test)
print(gbt_fdr)

In [None]:
# Compare how the test-set deviance evolves over boosting iterations for
# several learning rates, all sharing the same base hyper-parameters.
original_param = {'max_depth': 5, 'n_estimators': 200, 'verbose': 1}
learning_rate_runs = [
    ('learning_rate=0.01', 'orange', {'learning_rate': 0.01}),
    ('learning_rate=0.05', 'turquoise', {'learning_rate': 0.05}),
    ('learning_rate=0.1', 'blue', {'learning_rate': 0.1}),
    ('learning_rate=0.2', 'gray', {'learning_rate': 0.2}),
]

for label, color, setting in learning_rate_runs:
    # Per-run overrides take precedence over the shared defaults.
    params = ChainMap(setting, original_param)
    clf = GradientBoostingClassifier(**params).fit(x_train, y_train)
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)

    # `clf.loss_` was deprecated in scikit-learn 1.1 and removed in 1.3.
    # The binomial deviance it returned equals twice the log-loss of the
    # positive-class probability, so compute that directly instead.
    # Assumes y_test is binary with labels in {0, 1}.
    for i, y_proba in enumerate(clf.staged_predict_proba(x_test)):
        test_deviance[i] = 2 * log_loss(y_test, y_proba[:, 1])

    # Plot every 5th iteration to keep the curves light.
    plt.plot((np.arange(test_deviance.shape[0]) + 1)[::5], test_deviance[::5],
             '-', color=color, label=label)

plt.legend(loc='upper left')
plt.xlabel('Boosting Iterations')
plt.ylabel('Test Set Deviance')

plt.show()

In [3]:
# Coarse grid search over tree count and depth (3 x 3 = 9 candidates, 5-fold CV).
# Two metrics are tracked per candidate; the one registered as 'FDR' selects
# the winner and is the one used for the final refit.
bt = GradientBoostingClassifier(verbose=1)
params = dict(n_estimators=[50, 100, 200], max_depth=[2, 5, 10])
scorers = {'FDR': fdr, 'ROC': 'roc_auc'}

grid = GridSearchCV(
    estimator=bt,
    param_grid=params,
    scoring=scorers,
    refit='FDR',
    cv=5,
    n_jobs=6,
    verbose=1,
)
grid.fit(x_train, y_train)

print(f"The best parameters are {grid.best_params_} "
      f"with a score of {grid.best_score_:0.5f}")

# Keep only the columns of the CV report that matter for the comparison.
report_columns = [
    'param_max_depth', 'param_n_estimators',
    'mean_test_FDR', 'std_test_FDR', 'rank_test_FDR',
    'mean_test_ROC', 'std_test_ROC',
]
pd.DataFrame(grid.cv_results_).filter(items=report_columns)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  45 out of  45 | elapsed: 30.0min finished


      Iter       Train Loss   Remaining Time 
         1           0.1022            1.69m
         2           0.1003            1.64m
         3           0.0993            1.62m
         4           0.0985            1.62m
         5           0.0979            1.61m
         6           0.0973            1.59m
         7           0.0968            1.57m
         8           0.0964            1.55m
         9           0.0960            1.54m
        10           0.0957            1.53m
        20           0.0938            1.38m
        30           0.0933            1.20m
        40           0.0930            1.02m
        50           0.0928           51.43s
        60           0.0926           41.17s
        70           0.0924           30.85s
        80           0.0922           20.57s
        90           0.0920           10.27s
       100           0.0919            0.00s
The best parameters are {'max_depth': 5, 'n_estimators': 100} with a score of 0.53208


Unnamed: 0,param_max_depth,param_n_estimators,mean_test_FDR,std_test_FDR,rank_test_FDR,mean_test_ROC,std_test_ROC
0,2,50,0.522316,0.010017,7,0.754292,0.00551
1,2,100,0.521793,0.010386,8,0.767748,0.010817
2,2,200,0.52345,0.011739,5,0.773917,0.008134
3,5,50,0.530772,0.009595,3,0.774771,0.007083
4,5,100,0.532079,0.009572,1,0.778038,0.007745
5,5,200,0.531992,0.009761,2,0.778257,0.008135
6,10,50,0.529464,0.010439,4,0.777839,0.008627
7,10,100,0.522317,0.009719,6,0.773738,0.006902
8,10,200,0.515343,0.009829,9,0.766066,0.007282


In [7]:
# Score the estimator that the grid search refit with its best parameters
# on the held-out test set.
best_gbt = grid.best_estimator_
fdr(best_gbt, x_test, y_test)

0.5231085244779186

In [9]:
# Refit a fresh booster with the best parameters found by the grid search and
# compare its train and test scores.
bt = GradientBoostingClassifier(**grid.best_params_).fit(x_train, y_train)

# A notebook cell only displays its last expression, so the train score used
# to be computed and silently discarded. Print it explicitly; the test score
# remains the cell's displayed value.
print(f"Train FDR: {fdr(bt, x_train, y_train)}")
fdr(bt, x_test, y_test)

0.5231085244779186

In [None]:
# Second, narrower search: start from the best parameters of the first grid
# search and vary only the tree depth (4 candidates, 2-fold CV).
bt = GradientBoostingClassifier(**grid.best_params_)
params = dict(max_depth=[5, 10, 15, 20])
depth_scorers = {'FDR': fdr, 'ROC': 'roc_auc'}

grid_2 = GridSearchCV(
    estimator=bt,
    param_grid=params,
    scoring=depth_scorers,
    refit='FDR',
    cv=2,
    n_jobs=4,
    verbose=1,
)
grid_2.fit(x_train, y_train)

print(f"The best parameters are {grid_2.best_params_} "
      f"with a score of {grid_2.best_score_:0.5f}")

# Condensed view of the cross-validation report.
depth_report_columns = [
    'param_max_depth', 'mean_test_FDR',
    'std_test_FDR', 'rank_test_FDR',
    'mean_test_ROC', 'std_test_ROC',
]
pd.DataFrame(grid_2.cv_results_).filter(items=depth_report_columns)

In [None]:
# Fit a gradient-boosted classifier with explicitly chosen hyper-parameters
# on the full training set, then score it on the test set.
# Note: despite the name `dt`, this is a GradientBoostingClassifier.
dt_settings = dict(
    verbose=1,
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
)
dt = GradientBoostingClassifier(**dt_settings)
dt.fit(x_train, y_train)
fdr(dt, x_test, y_test)

In [None]:
# Score the fitted `dt` model on the data it was trained on, for comparison
# with the test-set score computed in the previous cell.
fdr(dt, x_train, y_train)

In [None]:
# Fit an XGBoost classifier with its default settings on the full training set.
import xgboost as xgb

xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train, y_train)
