# Gradient Boosting Classifier 3

---

__This Notebook__

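- run GridSearchCV on a parameter grid similar to the one used by the previous best random forest models (`rf_params`)
- run a second, final grid (`final_params`: deeper trees, more features) to try to improve on the first search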

__Results__ 

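- best mean validation sensitivity 0.9769, a three-way tie among `max_depth=8, max_features=300` candidates, e.g. `{max_depth=8, max_features=300, min_samples_split=5, n_estimators=100}`; `best_params_` reports the tied candidate with `min_samples_split=3, n_estimators=100`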
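- final params overfit badly (every candidate scores 1.0 on the training folds while mean validation sensitivity drops to 0.9717 at best); even the rf params might be overfitting, since their top candidates also score 1.0 on the training folds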

__Next Steps__

- plot learning curves for best models, make sure they're not overfitting

## Setup

In [1]:
import re
import os
import time
import joblib 

import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.metrics import confusion_matrix

# Stamp the notebook with the date of the current run.
dt_object = datetime.fromtimestamp(time.time())
# Split the timestamp into its date and time-of-day (second resolution) parts.
day = dt_object.strftime('%Y-%m-%d')
T = dt_object.strftime('%H:%M:%S')
print(f'Revised on: {day}')

Revised on: 2021-02-10


## Load

In [2]:
# load target vector (the first column of y_train.csv is expected to hold
# the 'ham' / 'spam' labels encoded below)
raw_path = os.path.join("data", "1_raw")
y_df = pd.read_csv(os.path.join(raw_path, 'y_train.csv'))
# Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same
# 1-D values, and np.array() still hands us an independent copy.
y_array = np.array(y_df.iloc[:, 0].to_numpy())

# work on a copy so y_array keeps the original string labels
y = y_array.copy()

# transform the labels into int type: ham -> 0, spam -> 1
y[y == 'ham'] = 0
y[y == 'spam'] = 1
y = y.astype('int')

# load matrix (stored in scipy's sparse .npz format)
proc_dir = os.path.join("data", "2_processed")
X_tfidf_svd800_spamcos = sp.load_npz(os.path.join(proc_dir, 'X_tfidf_svd800_spamcos.npz'))

## Helpful Functions


In [3]:
# train classifier and time it
def time_deco(func):
    """Decorate ``func`` so that every call prints its wall-clock duration.

    Args:
        func: the callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``Elapsed: <m>m <s>s`` once it finishes, and
        returns whatever ``func`` returned.
    """
    import functools  # local import keeps this helper cell self-contained

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        m, s = divmod(time.time() - start, 60)
        print(f'Elapsed: {m:0.0f}m {s:0.0f}s')
        # hand the result back instead of silently dropping it
        return result
    return wrapper

@time_deco
def fit_clf(clf):
    """Fit ``clf`` on the training split; ``time_deco`` prints the elapsed time.

    Reads ``X_train`` and ``y_train`` from the enclosing (global) scope.
    NOTE(review): no cell visible in this notebook defines them at module
    level (the split inside ``gridsearch_wrapper`` is local) -- confirm they
    exist before calling this helper.
    """
    features, targets = X_train, y_train
    clf.fit(features, targets)
    
# evaluate classifier
def eval_clf(y_val, y_pred):
    """Print the confusion matrix with accuracy, sensitivity and specificity.

    Args:
        y_val: true binary labels, encoded 0 (negative) / 1 (positive).
        y_pred: predicted labels using the same encoding.

    Returns:
        None; the confusion matrix and the three metrics are printed.
    """
    # Pin the label order so the matrix is always 2x2. Without ``labels``,
    # inputs that contain a single class produce a 1x1 matrix and the
    # four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_val,
                                      y_pred,
                                      labels=[0, 1]).ravel()
    confmat_df = pd.DataFrame(
        np.array(([tn, fp], [fn, tp])),
        columns=['pred_neg', 'pred_pos'],
        index=['cond_neg', 'cond_pos']
    )

    def _ratio(num, den):
        # a metric is undefined (nan) when its denominator is empty
        return num / den if den else float('nan')

    # unpack metrics
    acc = _ratio(tp + tn, tp + tn + fp + fn)
    tpr = _ratio(tp, tp + fn)  # sensitivity / recall
    tnr = _ratio(tn, tn + fp)  # specificity
    # print results
    print(confmat_df)
    print(f'acc: {acc:0.4f}')
    print(f'tpr: {tpr:0.4f}')
    print(f'tnr: {tnr:0.4f}')

def extract_df(gd):
    """Flatten a fitted grid search's ``cv_results_`` into one DataFrame.

    Args:
        gd: object exposing ``cv_results_`` with a ``params`` entry and the
            mean train/test scores for the ``acc``, ``tpr`` and ``tnr``
            scorers plus ``mean_fit_time``.

    Returns:
        DataFrame with one row per candidate: the parameter columns first,
        then the mean training scores, the mean validation scores (the
        ``mean_test_*`` entries renamed to ``mean_val_*``) and the mean fit
        time. The ``std_test_*`` entries are not included.
    """
    results = gd.cv_results_
    # cv_results_ key -> output column name, in output order
    renames = {
        "mean_train_acc": "mean_train_acc",
        "mean_train_tpr": "mean_train_tpr",
        "mean_train_tnr": "mean_train_tnr",
        "mean_test_acc": "mean_val_acc",
        "mean_test_tpr": "mean_val_tpr",
        "mean_test_tnr": "mean_val_tnr",
        "mean_fit_time": "mean_fit_time",
    }
    param_frame = pd.DataFrame(results["params"])
    metric_frame = pd.DataFrame(
        {column: results[key] for key, column in renames.items()}
    )
    return pd.concat([param_frame, metric_frame], axis=1)

### Train Test Split

##  Gradient Boosting Classifier


```
class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
    """Gradient Boosting for classification.
    GB builds an additive model in a
    forward stage-wise fashion; it allows for the optimization of
    arbitrary differentiable loss functions. In each stage ``n_classes_``
    regression trees are fit on the negative gradient of the
    binomial or multinomial deviance loss function. Binary classification
    is a special case where only a single regression tree is induced.
```

See [docs](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html), [code](https://github.com/scikit-learn/scikit-learn/blob/95119c13a/sklearn/ensemble/_gb.py#L768)

__Some defaults:__


- `loss='deviance'`
- `learning_rate=0.1`
- `n_estimators=100`
- `subsample=1.0`
- `criterion='friedman_mse'`
- `min_samples_split=2`
- `min_samples_leaf=1`
- `min_weight_fraction_leaf=0.0`
- `max_depth=3`
- `min_impurity_decrease=0.0`
- `min_impurity_split=None`
- `init=None`
- `random_state=None`
- `max_features=None`
- `verbose=0`
- `max_leaf_nodes=None`
- `warm_start=False`
- `validation_fraction=0.1`
- `n_iter_no_change=None`
- `tol=0.0001`
- `ccp_alpha=0.0`

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier

## GridSearchCV

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

def gridsearch_wrapper(X, y, param_grid, k=5, n_jobs=6, random_state=42):
    """
    Performs a stratified k-fold grid search over GradientBoostingClassifier
    hyperparameters, then evaluates the refit best model on a held-out
    validation split and prints the results.

    Args:
        X: numeric matrix
        y: target variable (binary, 0 = negative / 1 = positive)
        param_grid : dict of hyperparameters for search
        k: number of CV folds
        n_jobs: number of logical cores
        random_state: seed shared by the train/validation split and the
            estimator (defaults to the previously hard-coded 42)

    Returns:
        The fitted GridSearchCV object, refit on the training split using
        the candidate with the best mean validation sensitivity ('tpr').
    """
    # split into training and validation sets (train_test_split's default
    # test size), stratified so both keep the class balance of y
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      stratify=y,
                                                      random_state=random_state)

    # setup scorers
    scorers = {
        'acc': make_scorer(accuracy_score),
        'tpr': make_scorer(recall_score, pos_label=1), # sensitivity, recall
        'tnr': make_scorer(recall_score, pos_label=0) # specificity, selectivity
    }

    # instantiate estimator
    clf = GradientBoostingClassifier(
        random_state=random_state
    )

    # instantiate k-fold gridsearch
    cv_folds = StratifiedKFold(n_splits=k)

    grid_search_clf = GridSearchCV(clf,
                                   param_grid,
                                   scoring=scorers,
                                   refit='tpr',  # best_* attributes and predict() follow sensitivity
                                   cv=cv_folds,
                                   return_train_score=True,  # needed to compare train vs validation scores
                                   n_jobs=n_jobs,
                                   verbose=1)

    # train models
    grid_search_clf.fit(X_train, y_train)

    # predict on the held-out validation split with the refit best model
    y_pred = grid_search_clf.predict(X_val)
    print(f'Best params: {grid_search_clf.best_params_}')

    # eval metrics
    print('Evaluation metrics:')
    eval_clf(y_val, y_pred)

    return grid_search_clf

In [6]:
# Tiny grid for quick smoke runs of the search pipeline.
test_params = dict(
    n_estimators=[10, 25],
    learning_rate=[0.1, 1],
    max_depth=[1, 2, 3],
)

# Boosting-oriented grid: number of stages, shrinkage and tree depth.
params = dict(
    n_estimators=[10, 50, 100],
    learning_rate=[0.001, 0.01, 0.1, 1],
    max_depth=[1, 2, 3],
)

# Grid modelled on the parameters of the earlier random forest searches.
rf_params = dict(
    n_estimators=[50, 100],
    max_features=[150, 300],
    max_depth=[1, 5, 8],
    min_samples_split=[3, 5],
)

In [7]:
# Search the random-forest-inspired grid with 10-fold CV (n_jobs=-1: all cores).
gridsearch_clf = gridsearch_wrapper(
    X=X_tfidf_svd800_spamcos,
    y=y,
    param_grid=rf_params,
    k=10,
    n_jobs=-1,
)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 20.5min finished


Best params: {'max_depth': 8, 'max_features': 300, 'min_samples_split': 3, 'n_estimators': 100}
Evaluation metrics:
          pred_neg  pred_pos
cond_neg       842         4
cond_pos         2       127
acc: 0.9938
tpr: 0.9845
tnr: 0.9953


In [8]:
# Tabulate the CV results, highest mean validation sensitivity first.
df = extract_df(gridsearch_clf)
df.sort_values('mean_val_tpr', ascending=False)

Unnamed: 0,max_depth,max_features,min_samples_split,n_estimators,mean_train_acc,mean_train_tpr,mean_train_tnr,mean_val_acc,mean_val_tpr,mean_val_tnr,mean_fit_time
23,8,300,5,100,1.0,1.0,1.0,0.994526,0.976856,0.997243,105.487126
22,8,300,5,50,1.0,1.0,1.0,0.994869,0.976856,0.997636,56.383824
21,8,300,3,100,1.0,1.0,1.0,0.993841,0.976856,0.996455,113.843175
19,8,150,5,100,1.0,1.0,1.0,0.994524,0.974224,0.997636,61.491556
13,5,300,3,100,1.0,1.0,1.0,0.993499,0.97166,0.996849,71.455355
20,8,300,3,50,1.0,1.0,1.0,0.994184,0.97166,0.997636,56.969746
18,8,150,5,50,1.0,1.0,1.0,0.994524,0.97166,0.99803,30.672955
15,5,300,5,100,1.0,1.0,1.0,0.993499,0.97166,0.996849,70.982826
14,5,300,5,50,1.0,1.0,1.0,0.993499,0.97166,0.996849,35.904575
12,5,300,3,50,1.0,1.0,1.0,0.993499,0.97166,0.996849,35.914619


In [9]:
# persist
save_path = os.path.join("data", "3_modeling", "02102021_gb_gridsearch2.joblib")
joblib.dump(gridsearch_clf, save_path)

['data\\3_modeling\\02102021_gb_gridsearch2.joblib']

In [10]:
# Final grid: fixed number of stages, deeper trees and more features per split.
final_params = dict(
    n_estimators=[50],
    max_features=[300, 500],
    max_depth=[10, 20],
    min_samples_split=[5, 10],
)

In [11]:
# Search the final grid with 10-fold CV (n_jobs=-1: all cores).
# Note: this rebinds gridsearch_clf, replacing the earlier rf_params search object.
gridsearch_clf = gridsearch_wrapper(
    X=X_tfidf_svd800_spamcos,
    y=y,
    param_grid=final_params,
    k=10,
    n_jobs=-1,
)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 21.2min finished


Best params: {'max_depth': 10, 'max_features': 300, 'min_samples_split': 10, 'n_estimators': 50}
Evaluation metrics:
          pred_neg  pred_pos
cond_neg       842         4
cond_pos         3       126
acc: 0.9928
tpr: 0.9767
tnr: 0.9953


In [12]:
# Tabulate the final-grid CV results, highest mean validation sensitivity first.
df = extract_df(gridsearch_clf)
df.sort_values('mean_val_tpr', ascending=False)

Unnamed: 0,max_depth,max_features,min_samples_split,n_estimators,mean_train_acc,mean_train_tpr,mean_train_tnr,mean_val_acc,mean_val_tpr,mean_val_tnr,mean_fit_time
1,10,300,10,50,1.0,1.0,1.0,0.993841,0.97166,0.997243,68.440274
5,20,300,10,50,1.0,1.0,1.0,0.993841,0.97166,0.997243,128.469588
0,10,300,5,50,1.0,1.0,1.0,0.993499,0.969096,0.997243,64.249456
3,10,500,10,50,1.0,1.0,1.0,0.993499,0.969096,0.997243,110.385718
4,20,300,5,50,1.0,1.0,1.0,0.993499,0.969096,0.997243,124.182556
6,20,500,5,50,1.0,1.0,1.0,0.993158,0.969096,0.996847,191.705846
7,20,500,10,50,1.0,1.0,1.0,0.993839,0.969096,0.997636,185.908019
2,10,500,5,50,1.0,1.0,1.0,0.992131,0.966532,0.99606,108.407338


In [13]:
# persist
save_path = os.path.join("data", "3_modeling", "02102021_gb_gridsearch3.joblib")
joblib.dump(gridsearch_clf, save_path)

['data\\3_modeling\\02102021_gb_gridsearch3.joblib']

---