# Gradient Boosting

In [315]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, KFold
from yellowbrick.classifier import ConfusionMatrix
import push_results as pr

In [316]:
infile1 = open('../data/processed/X_train_trans.pickle','rb')
X_train = pickle.load(infile1)
infile1.close()

infile2 = open('../data/processed/y_train.pickle','rb')
y_train = pickle.load(infile2)
infile2.close()

In [317]:
infile3 = open('../data/model_results/model_results.pickle','rb')
results_dict = pickle.load(infile3)
infile3.close()

In [318]:
# y_train[y_train == 'FF'] = 0
# y_train[y_train !=  0] = 1

In [321]:
y_train = np.array(y_train).reshape(-1, )
y_train[y_train == 'FF'] = 0
y_train[y_train != 0] = 1

In [329]:
y_train = y_train.astype('int')

In [330]:
# y_train[y_train > 0] = 1

In [331]:
# X_train = X_train.drop(columns = ['inning', 'pitch_number', 'opp_score', 'nats_score', 'nats_home1_away0', 
#                         'pitch_season', 'pitch_game', 'pitch_bat_gm', 'total_pitches',
#                        'abs', 'whiffs', 'swings', 'takes', 'k', 'walk', 'single', 'double', 'triple',
#                        'hr', 'line_drive', 'ground_ball', 'fly_ball', 'popup', 'rbi', 'sac', 'ba', 'slg',
#                        'iso', 'babip', 'stand_r1', 'if_standard', 'if_strategic', 'of_strategic'])

In [332]:
gbc = GradientBoostingClassifier()

In [333]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [334]:
gbc.score(X_train, y_train)

0.7153171738009283

In [335]:
y_pred = gbc.predict(X_train)

In [336]:
pr.push_results('gbc_def', 'GradientBoostingClassifier', 'Default', gbc.score(X_train, y_train))

# results_dict.update({'Gradient Boosting': gbc.score(X_train, y_train)})

## Cross validation

In [337]:
y_train = y_train.reshape(-1, )

In [338]:
scores = cross_val_score(gbc, X_train, y_train, cv=10, scoring= 'accuracy', verbose =1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    4.5s finished


In [339]:
scores

array([0.53092784, 0.56185567, 0.5257732 , 0.54639175, 0.56701031,
       0.53092784, 0.52061856, 0.55670103, 0.54123711, 0.59585492])

In [340]:
np.average(scores)

0.5477298221248865

In [341]:
# pr.push_results('gbc_cv', 'GradientBoostingClassifier', 'Cross Validation', gbc.score(X_train, y_train))


# results_dict.update({'GradientBoosting CV5': np.average(scores)})

## GridSearch

In [342]:
gb = GradientBoostingClassifier(verbose=-1, random_state=31)

params = {'n_estimators': range(2, 200, 10), 
          'max_depth': range(2,10), 
          'max_features': ['auto', 'sqrt', 'log2'], 
          'loss': ['deviance', 'exponential'], 
          'subsample': [.9, 1, 1.5], 
          'criterion': ['friedman_mse', 'mse', 'mae']
         }

gridsearch = (GridSearchCV(estimator = gb, 
                          param_grid = params,
                          n_jobs = -1,
                          verbose = 1,
                          cv = 5,
                          scoring = 'accuracy',
                          return_train_score= True))

# gridsearch = gridsearch.fit(X_train, y_train)

In [343]:
# gridsearch.best_score_

In [344]:
# results_dict.update({'GrandientBoosting GS': gridsearch.best_score_})

In [345]:
# gridsearch.cv_results_;

In [346]:
# gridsearch.cv_results_['params'][gridsearch.best_index_]

In [347]:
# colums = ['params', 'mean_test_score', 
#           'std_test_score', 'rank_test_score',
#           'mean_train_score', 'std_train_score']

# results = pd.DataFrame(gridsearch.cv_results_)[colums]
# df = results.sort_values(by = 'rank_test_score').head(10)

In [348]:
# df

In [349]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()

## This gridsearch took several hours, saving results below

In [350]:
# pickle_out = open('./gbc_results/best_from_gs.pickle', 'wb')
# pickle.dump(gridsearch.cv_results_['params'][gridsearch.best_index_], pickle_out)
# pickle_out.close()

# pickle_out = open('./gbc_results/all_gbc_results_df.pickle', 'wb')
# pickle.dump(df, pickle_out)
# pickle_out.close()

## Randomized Search

In [351]:
params = {'n_estimators': range(2, 500, 20), 
          'learning_rate': [.25, 0.1, .75, 1],
          'max_depth': range(2,10), 
          'max_features': ['auto', 'sqrt', 'log2'], 
          'loss': ['deviance', 'exponential'], 
          'subsample': [.9, 1, 1.5], 
          'criterion': ['friedman_mse', 'mse', 'mae'], 
          'min_samples_split': [1, 2, 5, 10, 20, 50, 200],
          'min_samples_leaf': [1, 5, 25, 50, 100, 200, 400]
         }

rs = RandomizedSearchCV(gb, 
                        params, 
                        n_jobs=-1, 
                        random_state=31, 
                        cv=5, 
                        return_train_score=True, 
                        n_iter = 25, 
                        verbose = 1)
rs.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.1s


      Iter       Train Loss      OOB Improve   Remaining Time 


[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  1.8min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                        criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                    

In [352]:
pd.DataFrame(rs.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_subsample', 'param_n_estimators', 'param_min_samples_split',
       'param_min_samples_leaf', 'param_max_features', 'param_max_depth',
       'param_loss', 'param_learning_rate', 'param_criterion', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [353]:
pd.DataFrame(rs.cv_results_)[['rank_test_score', 
                              'mean_train_score', 
                              'mean_test_score', 
                              'std_train_score']].sort_values(by='rank_test_score')

Unnamed: 0,rank_test_score,mean_train_score,mean_test_score,std_train_score
6,1,0.593476,0.549746,0.006844
14,2,0.573104,0.54614,0.005212
1,3,0.652271,0.517294,0.006418
21,4,0.708355,0.51005,0.00315
24,5,0.665034,0.508519,0.006271
16,6,0.774368,0.50644,0.009474
18,7,0.660264,0.503347,0.005684
0,8,0.994327,0.497677,0.001699
2,9,0.731692,0.49407,0.005999
5,10,0.777332,0.488913,0.008209


In [354]:
rs.best_params_

{'subsample': 0.9,
 'n_estimators': 2,
 'min_samples_split': 5,
 'min_samples_leaf': 50,
 'max_features': 'sqrt',
 'max_depth': 4,
 'loss': 'deviance',
 'learning_rate': 0.25,
 'criterion': 'mse'}

In [355]:
rs.score(X_train, y_train)

0.5982465188241362

In [356]:
pr.push_results('gbc_rs', 'GradientBoostingClassifier', 'Randomized Search', rs.score(X_train, y_train))

# results_dict.update({'GradientBoost Randomized Search': rs.score(X_train, y_train)})

In [357]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()

## Best model

In [358]:
gbc2 = GradientBoostingClassifier(subsample=1, 
                                n_estimators=12, 
                                max_features='sqrt', 
                                max_depth=5, 
                                loss='deviance', 
                                criterion='friedman_mse')

In [359]:
gbc2.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features='sqrt', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=12,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [360]:
gbc2.score(X_train, y_train)

0.6817947395564724

In [361]:
gbc2.feature_importances_

array([2.87218527e-02, 2.83684841e-02, 5.09028603e-02, 2.61206874e-02,
       3.04371061e-02, 0.00000000e+00, 3.97034660e-02, 1.01616256e-01,
       1.62658586e-01, 6.70588554e-03, 1.00046774e-02, 2.32544113e-02,
       2.50734828e-02, 1.05184346e-02, 1.58438672e-02, 3.19745577e-02,
       6.38420229e-03, 1.44665582e-02, 8.30816226e-03, 2.09632193e-02,
       2.01148079e-02, 2.85199357e-02, 1.14880901e-02, 1.85827621e-02,
       8.55067866e-03, 1.41116514e-02, 5.53759085e-02, 1.43499915e-02,
       3.03836572e-02, 9.23619394e-03, 3.60042885e-03, 2.94704560e-03,
       2.55288040e-03, 0.00000000e+00, 1.94104262e-02, 2.63160168e-02,
       2.07157612e-03, 4.32365077e-03, 1.19094050e-02, 4.68490244e-03,
       3.36262982e-03, 9.77358042e-03, 1.89655513e-02, 3.41801886e-03,
       2.08962247e-02, 0.00000000e+00, 3.51190416e-03, 0.00000000e+00,
       3.75563648e-03, 5.75798190e-03, 1.70661865e-06, 0.00000000e+00])

In [362]:
weights = pd.DataFrame(data = gbc2.feature_importances_, index = X_train.columns, columns = ['Weight'])
weights = weights.sort_values(by='Weight', ascending=False)
# weights.loc[weights.Weight <.006].index

In [363]:
X_train_select = X_train.drop(columns = ['fb:1_sb:0_tb:0', 'triple', '2_1', '2_0', 'fb:1_sb:1_tb:0', '1_0',
       '1_1', 'if_strategic', 'fb:0_sb:1_tb:0', 'fb:0_sb:1_tb:1',
       'nats_home1_away0', 'of_strategic', '3_1', 'fb:1_sb:1_tb:1',
       'fb:1_sb:0_tb:1', 'fb:0_sb:0_tb:1'])

In [364]:
params = {'n_estimators': range(2, 500, 25), 
          'max_depth': range(2,10), 
          'max_features': ['auto', 'sqrt', 'log2'], 
          'loss': ['deviance', 'exponential'], 
          'subsample': [.9, 1, 1.5], 
          'criterion': ['friedman_mse', 'mse', 'mae']
         }

rs = RandomizedSearchCV(gb, params, n_jobs=-1, random_state=31, cv=15, verbose=1)
rs.fit(X_train_select, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 15 folds for each of 10 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 119 out of 150 | elapsed:    4.2s remaining:    1.1s


KeyboardInterrupt: 

In [None]:
rs.best_params_

In [None]:
rs.score(X_train_select, y_train)

In [None]:
gbc3 = GradientBoostingClassifier(subsample=1, n_estimators=27, max_features='sqrt', 
                                  max_depth=5, loss='deviance', criterion='friedman_mse',
                                  verbose=1, random_state=31)


scores = cross_val_score(gbc3, X_train_select, y_train, cv=15, scoring= 'accuracy', n_jobs=-1, verbose = 1)

In [None]:
scores.mean()

In [None]:
gbc3.score(X_train_select, y_train)

In [None]:
y_pred = gbc3.predict(X_train_select)

In [None]:
cm = ConfusionMatrix(gbc3, classes=[0, 1, 2, 3, 4])
cm.fit(X_train_select, y_train)
cm.score(X_train_select, y_train)