# Gradient Boosting Classifier 2

---

__This Notebook__

- Run a fuller hyperparameter grid search with `GridSearchCV`.

__Results__ 

- The best mean validation sensitivity is 0.974, which is not as good as the random forests.
- Strangely, the model overfits easily, and there seems to be no clear pattern in how the hyperparameters affect the results.



## Setup

In [1]:
import functools
import os
import re
import time
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.metrics import confusion_matrix

# Stamp the notebook with the date on which it was last run.
dt_object = datetime.now()
# isoformat(sep=' ', timespec='seconds') yields 'YYYY-MM-DD HH:MM:SS';
# the first half is the date, the second half the time of day.
day, T = dt_object.isoformat(sep=' ', timespec='seconds').split(' ')
print(f'Revised on: {day}')

Revised on: 2021-02-10


## Load

In [2]:
# load target vector
raw_path = os.path.join("data", "1_raw")
y_df = pd.read_csv(os.path.join(raw_path, 'y_train.csv'))
# The labels are taken from the first column. Series.ravel() is deprecated in
# recent pandas releases; to_numpy() is the supported way to get the values,
# and np.array(...) keeps y_array an independent copy as before.
y_array = np.array(y_df.iloc[:, 0].to_numpy())

# keep the original string labels in y_array and encode a copy
y = y_array.copy()

# transform y_array into int type: 'ham' -> 0, 'spam' -> 1
# (any other non-numeric label makes astype('int') raise)
y[y == 'ham'] = 0
y[y == 'spam'] = 1
y = y.astype('int')

# load matrix (stored in scipy's sparse .npz format)
proc_dir = os.path.join("data", "2_processed")
X_tfidf_svd800_spamcos = sp.load_npz(os.path.join(proc_dir, 'X_tfidf_svd800_spamcos.npz'))

## Helpful Functions


In [3]:
# train classifier and time it
def time_deco(func):
    """Decorator that prints the wall-clock time taken by each call to ``func``.

    The wrapper forwards all positional and keyword arguments to ``func`` and
    returns whatever ``func`` returns, so it can wrap functions of any
    signature. After the call it prints ``Elapsed: <m>m <s>s``.

    Args:
        func: callable to time

    Returns:
        The wrapping callable (it keeps ``func``'s name and docstring).
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        # split the elapsed seconds into whole minutes and remaining seconds
        m, s = divmod(time.time() - start, 60)
        print(f'Elapsed: {m:0.0f}m {s:0.0f}s')
        return result
    return wrapper


@time_deco
def fit_clf(clf, X=None, y=None):
    """Fit ``clf`` and print how long the fit took.

    Args:
        clf: estimator exposing ``fit(X, y)``
        X: training matrix; when omitted, the notebook-level ``X_train`` is used
        y: training targets; when omitted, the notebook-level ``y_train`` is used

    Raises:
        NameError: if ``X``/``y`` are omitted and ``X_train``/``y_train`` have
            not been defined in the notebook.
    """
    # Fall back to the globals only when no data is passed, so existing
    # fit_clf(clf) calls keep working.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    clf.fit(X, y)
    
# evaluate classifier
def eval_clf(y_val, y_pred):
    """Print the binary confusion matrix with accuracy, sensitivity and specificity.

    Args:
        y_val: true labels (the notebook encodes ham as 0 and spam as 1)
        y_pred: predicted labels, same encoding and length as ``y_val``

    Returns:
        None; the confusion matrix and the three metrics are printed.
    """
    # When the data uses the 0/1 encoding, pin the label order so the matrix is
    # always 2x2, even if one class is absent from both y_val and y_pred.
    # Without this a single-class input gives a 1x1 matrix and the unpack fails.
    # Other encodings keep the default label handling.
    present = np.unique(
        np.concatenate([np.asarray(y_val).ravel(), np.asarray(y_pred).ravel()])
    )
    labels = [0, 1] if set(present.tolist()) <= {0, 1} else None
    tn, fp, fn, tp = confusion_matrix(y_val,
                                      y_pred,
                                      labels=labels).ravel()
    confmat_df = pd.DataFrame(
        np.array(([tn, fp], [fn, tp])),
        columns=['pred_neg', 'pred_pos'],
        index=['cond_neg', 'cond_pos']
    )

    def _ratio(num, den):
        # A rate over an empty group is undefined; report nan instead of 0/0.
        return num / den if den else float('nan')

    # unpack metrics
    acc = _ratio(tp + tn, tp + tn + fp + fn)
    tpr = _ratio(tp, tp + fn)  # sensitivity / recall
    tnr = _ratio(tn, tn + fp)  # specificity
    # print results
    print(confmat_df)
    print(f'acc: {acc:0.4f}')
    print(f'tpr: {tpr:0.4f}')
    print(f'tnr: {tnr:0.4f}')

def extract_df(gd):
    """Flatten a fitted grid search's ``cv_results_`` into one DataFrame.

    Args:
        gd: object exposing ``cv_results_`` with a "params" entry plus the
            mean train/test scores for 'acc', 'tpr', 'tnr' and "mean_fit_time"

    Returns:
        DataFrame with one row per candidate: the parameter columns first,
        then the mean train scores, the mean validation scores (the
        ``mean_test_*`` entries renamed to ``mean_val_*``) and the mean fit time.
    """
    results = gd.cv_results_

    # (key in cv_results_, column name in the output), in output order.
    # The std_test_* entries are intentionally left out.
    column_map = (
        ("mean_train_acc", "mean_train_acc"),
        ("mean_train_tpr", "mean_train_tpr"),
        ("mean_train_tnr", "mean_train_tnr"),
        ("mean_test_acc", "mean_val_acc"),
        ("mean_test_tpr", "mean_val_tpr"),
        ("mean_test_tnr", "mean_val_tnr"),
        ("mean_fit_time", "mean_fit_time"),
    )

    frames = [pd.DataFrame(results["params"])]
    for key, label in column_map:
        frames.append(pd.DataFrame(results[key], columns=[label]))

    return pd.concat(frames, axis=1)

### Train Test Split

## Gradient Boosting Classifier


```
class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
    """Gradient Boosting for classification.
    GB builds an additive model in a
    forward stage-wise fashion; it allows for the optimization of
    arbitrary differentiable loss functions. In each stage ``n_classes_``
    regression trees are fit on the negative gradient of the
    binomial or multinomial deviance loss function. Binary classification
    is a special case where only a single regression tree is induced.
```

See [docs](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html), [code](https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/ensemble/_gb.py#L768)

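__Some defaults__ (for the scikit-learn version in use at the time of writing; newer releases rename `loss='deviance'` to `'log_loss'` and remove `min_impurity_split`):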


- `loss='deviance'`
- `learning_rate=0.1`
- `n_estimators=100`
- `subsample=1.0`
- `criterion='friedman_mse'`
- `min_samples_split=2`
- `min_samples_leaf=1`
- `min_weight_fraction_leaf=0.0`
- `max_depth=3`
- `min_impurity_decrease=0.0`
- `min_impurity_split=None`
- `init=None`
- `random_state=None`
- `max_features=None`
- `verbose=0`
- `max_leaf_nodes=None`
- `warm_start=False`
- `validation_fraction=0.1`
- `n_iter_no_change=None`
- `tol=0.0001`
- `ccp_alpha=0.0`

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier

## GridSearchCV

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

def gridsearch_wrapper(X, y, param_grid, k=5, n_jobs=6, random_state=42):
    """
    Performs a stratified k-fold grid search over GradientBoostingClassifier
    hyperparameters, then evaluates the refit best model on a held-out split.

    Steps:
        1. Split (X, y) into stratified training and validation parts.
        2. Run GridSearchCV on the training part, scoring accuracy ('acc'),
           sensitivity ('tpr') and specificity ('tnr'); the candidate with the
           best mean validation 'tpr' is refit on the whole training part.
        3. Print the best parameters and the validation-set metrics.

    Args:
        X: numeric matrix
        y: target variable, encoded as 0 (negative) / 1 (positive)
        param_grid : dict of hyperparameters for search
        k: number of CV folds
        n_jobs: number of parallel jobs for GridSearchCV (-1 uses all processors)
        random_state: seed used for both the train/validation split and the
            classifier (defaults to 42, the value that used to be hard-coded)

    Returns:
        The fitted GridSearchCV object.
    """
    # split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      stratify=y,
                                                      random_state=random_state)

    # setup scorers
    scorers = {
        'acc': make_scorer(accuracy_score),
        'tpr': make_scorer(recall_score, pos_label=1), # sensitivity, recall
        'tnr': make_scorer(recall_score, pos_label=0) # specificity, selectivity
    }

    # instantiate estimator
    clf = GradientBoostingClassifier(
        random_state=random_state
    )

    # instantiate k-fold gridsearch
    cv_folds = StratifiedKFold(n_splits=k)

    grid_search_clf = GridSearchCV(clf,
                                   param_grid,
                                   scoring=scorers,
                                   refit='tpr',  # model selection is driven by sensitivity
                                   cv=cv_folds,
                                   return_train_score=True,  # extract_df reads the mean_train_* entries
                                   n_jobs=n_jobs,
                                   verbose=1)

    # train models
    grid_search_clf.fit(X_train, y_train)

    # predict on the held-out validation set with the refit best estimator
    y_pred = grid_search_clf.predict(X_val)
    print(f'Best params: {grid_search_clf.best_params_}')

    # eval metrics
    print('Evaluation metrics:')
    eval_clf(y_val, y_pred)

    return grid_search_clf

In [6]:
# Small grid (presumably for quick trial runs of the search).
test_params = dict(
    n_estimators=[10, 25],
    learning_rate=[0.1, 1],
    max_depth=[1, 2, 3],
)

# Full grid: 3 x 4 x 3 = 36 candidates.
params = dict(
    n_estimators=[10, 50, 100],
    learning_rate=[0.001, 0.01, 0.1, 1],
    max_depth=[1, 2, 3],
)

In [7]:
# Full search: 10-fold CV over `params`, with n_jobs=-1 for parallel fitting.
gridsearch_clf = gridsearch_wrapper(
    X_tfidf_svd800_spamcos, y, params, k=10, n_jobs=-1
)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 29.9min finished


Best params: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 50}
Evaluation metrics:
          pred_neg  pred_pos
cond_neg       839         7
cond_pos         2       127
acc: 0.9908
tpr: 0.9845
tnr: 0.9917


In [8]:
# Tabulate the CV results, highest mean validation sensitivity first.
df = extract_df(gridsearch_clf)
df.sort_values('mean_val_tpr', ascending=False)

Unnamed: 0,learning_rate,max_depth,n_estimators,mean_train_acc,mean_train_tpr,mean_train_tnr,mean_val_acc,mean_val_tpr,mean_val_tnr,mean_fit_time
19,0.1,1,50,0.994796,0.980528,0.996978,0.992815,0.974224,0.995666,19.62068
26,0.1,3,100,1.0,1.0,1.0,0.993499,0.97166,0.996849,108.593983
25,0.1,3,50,1.0,1.0,1.0,0.993499,0.97166,0.996849,54.514598
29,1.0,1,100,0.995251,0.982532,0.997197,0.991449,0.969096,0.994879,39.499829
28,1.0,1,50,0.995251,0.982532,0.997197,0.991449,0.969096,0.994879,19.738885
27,1.0,1,10,0.995214,0.982532,0.997153,0.991449,0.969096,0.994879,3.97259
23,0.1,2,100,1.0,1.0,1.0,0.993158,0.969096,0.996849,73.330781
22,0.1,2,50,0.998822,0.991124,1.0,0.992474,0.969096,0.99606,36.902974
20,0.1,1,100,0.997265,0.98654,0.998905,0.992474,0.969096,0.996061,39.379844
35,1.0,3,100,1.0,1.0,1.0,0.990764,0.966532,0.994482,98.218061


In [12]:
# persist the fitted grid search object
save_path = os.path.join("data", "3_modeling", "02102021_gb_gridsearch.joblib")
# Create the target folder first so the dump does not fail when
# data/3_modeling does not exist yet.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
joblib.dump(gridsearch_clf, save_path)

['data\\3_modeling\\02102021_gb_gridsearch.joblib']

---