# Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import push_results as pr

In [2]:
filepath = '../data/train_test_split/scherzer/transformed/'

infile1 = open(filepath + 'X_train_2019_trans.pickle','rb')
X_train = pickle.load(infile1)
infile1.close()

infile2 = open(filepath + 'y_train_2c_2019_trans.pickle','rb')
y_train = pickle.load(infile2)
infile2.close()

In [3]:
y_train = np.array(y_train).reshape(-1, )

## GBC for all types of pitches

In [4]:
gbc = GradientBoostingClassifier(random_state=31)

In [5]:
gbc.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=31, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [6]:
gbc.score(X_train, y_train)

0.7153171738009283

In [7]:
y_pred = gbc.predict(X_train)

In [8]:
# pr.push_results('gbc_def_v2', 'GradientBoostingClassifier', 'Default', gbc.score(X_train, y_train))

# # results_dict.update({'Gradient Boosting': gbc.score(X_train, y_train)})

## Cross validation

In [9]:
scores = cross_val_score(gbc, X_train, y_train, cv=5, scoring= 'accuracy')

In [10]:
np.average(scores)

0.5332667359279682

In [11]:
# pr.push_results('gbc_cv_v2', 'GradientBoostingClassifier', 'Cross Validation', gbc.score(X_train, y_train))


# # results_dict.update({'GradientBoosting CV5': np.average(scores)})

## GridSearch

In [12]:
gb = GradientBoostingClassifier(verbose=-1, random_state=31)

params = {'n_estimators': range(2, 200, 10), 
          'max_depth': range(2,10), 
          'max_features': ['auto', 'sqrt', 'log2'], 
          'loss': ['deviance', 'exponential'], 
          'subsample': [.9, 1, 1.5], 
          'criterion': ['friedman_mse', 'mse', 'mae']
         }

gridsearch = (GridSearchCV(estimator = gb, 
                          param_grid = params,
                          n_jobs = -1,
                          verbose = 1,
                          cv = 5,
                          scoring = 'accuracy',
                          return_train_score= True))

# gridsearch = gridsearch.fit(X_train, y_train)

In [13]:
# gridsearch.best_score_

In [14]:
# results_dict.update({'GrandientBoosting GS': gridsearch.best_score_})

In [15]:
# gridsearch.cv_results_;

In [16]:
# gridsearch.cv_results_['params'][gridsearch.best_index_]

In [17]:
# colums = ['params', 'mean_test_score', 
#           'std_test_score', 'rank_test_score',
#           'mean_train_score', 'std_train_score']

# results = pd.DataFrame(gridsearch.cv_results_)[colums]
# df = results.sort_values(by = 'rank_test_score').head(10)

In [18]:
# df

In [19]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()

## This gridsearch took several hours, saving results below

In [20]:
# pickle_out = open('./gbc_results/best_from_gs.pickle', 'wb')
# pickle.dump(gridsearch.cv_results_['params'][gridsearch.best_index_], pickle_out)
# pickle_out.close()

# pickle_out = open('./gbc_results/all_gbc_results_df.pickle', 'wb')
# pickle.dump(df, pickle_out)
# pickle_out.close()

## Randomized Search

In [21]:
params = {'n_estimators': range(2, 200, 10), 
          'max_depth': range(2,10), 
          'max_features': ['auto', 'sqrt', 'log2'], 
          'loss': ['deviance', 'exponential'], 
          'subsample': [.9, 1, 1.5], 
          'criterion': ['friedman_mse', 'mse', 'mae']
         }

rs = RandomizedSearchCV(gb, params, n_jobs=-1, random_state=31, cv=15)
rs.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 


RandomizedSearchCV(cv=15, error_score=nan,
                   estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                        criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                   

In [22]:
rs.best_params_

{'subsample': 1,
 'n_estimators': 92,
 'max_features': 'auto',
 'max_depth': 2,
 'loss': 'deviance',
 'criterion': 'friedman_mse'}

In [23]:
rs.score(X_train, y_train)

0.6482723053120165

In [24]:
pr.push_results('gbc_rs_v2', 'GradientBoostingClassifier', 'Randomized Search', rs.score(X_train, y_train))

# results_dict.update({'GradientBoost Randomized Search': rs.score(X_train, y_train)})

In [25]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()