# Gradient Boosting

In [7]:
# conda install -c conda-forge xgboost 

In [8]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import push_results as pr
import xgboost

In [43]:
infile1 = open('../data/processed/X_train2_trans.pickle','rb')
X_train = pickle.load(infile1)
infile1.close()

infile2 = open('../data/processed/y_train_trans.pickle','rb')
y_train = pickle.load(infile2)
infile2.close()

In [44]:
infile3 = open('../data/model_results/model_results.pickle','rb')
results_dict = pickle.load(infile3)
infile3.close()

In [45]:
gbc = GradientBoostingClassifier()

In [46]:
gbc.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [47]:
gbc.score(X_train, y_train)

0.6750902527075813

In [48]:
y_pred = gbc.predict(X_train)

In [49]:
pr.push_results('gbc_def_v3', 'GradientBoostingClassifier_reduced_pitch_types', 'Default', gbc.score(X_train, y_train))

# results_dict.update({'Gradient Boosting': gbc.score(X_train, y_train)})

## Cross validation

In [50]:
scores = cross_val_score(gbc, X_train, y_train, cv=5, scoring= 'accuracy')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [51]:
np.average(scores)

0.4162031487253257

In [52]:
pr.push_results('gbc_cv_v3', 'GradientBoostingClassifier_reduced_pitch_types', 'Cross Validation', np.average(scores))


# results_dict.update({'GradientBoosting CV5': np.average(scores)})

## GridSearch

In [53]:
gb = GradientBoostingClassifier(verbose=-1, random_state=31)

params = {'n_estimators': range(2, 200, 10), 
          'max_depth': range(2,10), 
          'max_features': ['auto', 'sqrt', 'log2'], 
          'loss': ['deviance', 'exponential'], 
          'subsample': [.9, 1, 1.5], 
          'criterion': ['friedman_mse', 'mse', 'mae']
         }

gridsearch = (GridSearchCV(estimator = gb, 
                          param_grid = params,
                          n_jobs = -1,
                          verbose = 1,
                          cv = 5,
                          scoring = 'accuracy',
                          return_train_score= True))

# gridsearch = gridsearch.fit(X_train, y_train)

In [54]:
# gridsearch.best_score_

In [55]:
# results_dict.update({'GrandientBoosting GS': gridsearch.best_score_})

In [56]:
# gridsearch.cv_results_;

In [57]:
# gridsearch.cv_results_['params'][gridsearch.best_index_]

In [58]:
# colums = ['params', 'mean_test_score', 
#           'std_test_score', 'rank_test_score',
#           'mean_train_score', 'std_train_score']

# results = pd.DataFrame(gridsearch.cv_results_)[colums]
# df = results.sort_values(by = 'rank_test_score').head(10)

In [59]:
# df

In [60]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()

## This gridsearch took several hours, saving results below

In [61]:
# pickle_out = open('./gbc_results/best_from_gs.pickle', 'wb')
# pickle.dump(gridsearch.cv_results_['params'][gridsearch.best_index_], pickle_out)
# pickle_out.close()

# pickle_out = open('./gbc_results/all_gbc_results_df.pickle', 'wb')
# pickle.dump(df, pickle_out)
# pickle_out.close()

## Randomized Search

In [62]:
params = {'n_estimators': range(2, 200, 10), 
          'max_depth': range(2,10), 
          'max_features': ['auto', 'sqrt', 'log2'], 
          'loss': ['deviance', 'exponential'], 
          'subsample': [.9, 1, 1.5], 
          'criterion': ['friedman_mse', 'mse', 'mae']
         }

rs = RandomizedSearchCV(gb, params, n_jobs=-1, random_state=31, cv=15)
rs.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 


  y = column_or_1d(y, warn=True)


RandomizedSearchCV(cv=15, error_score=nan,
                   estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                        criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                   

In [63]:
rs.best_params_

{'subsample': 1,
 'n_estimators': 12,
 'max_features': 'sqrt',
 'max_depth': 5,
 'loss': 'deviance',
 'criterion': 'friedman_mse'}

In [64]:
rs.score(X_train, y_train)

0.5404847859721505

In [65]:
pr.push_results('gbc_rs_v3', 'GradientBoostingClassifier_reduced_pitch_types', 'Randomized Search', rs.score(X_train, y_train))

# results_dict.update({'GradientBoost Randomized Search': rs.score(X_train, y_train)})

In [66]:
# pickle_out = open('../data/model_results/model_results.pickle', 'wb')
# pickle.dump(results_dict, pickle_out)
# pickle_out.close()

## xgboost

In [67]:
# multiclass classification
import pandas
import xgboost
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [68]:
# # load data
# data = pandas.read_csv('iris.csv', header=None)
# dataset = data.values

In [69]:
# #split data into X and y
# X = dataset[:,0:4]
# Y = dataset[:,4]

In [77]:
X = X_train
Y = y_train

In [78]:
# encode string class values as integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
# seed = 7
# test_size = 0.33
# X_train, X_test, y_train, y_test = model_selection.train_test_split(X, 
#                                                                     label_encoded_y, 
#                                                                     test_size=test_size, 
#                                                                     random_state=seed)

In [79]:
np.unique(y_train)

array([0, 1, 2, 3, 4])

In [80]:
# fit model no training data
model = xgboost.XGBClassifier()
model.fit(X_train, y_train)
print(model)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)


In [81]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [82]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 39.84%
