# Gradient Boosting All Classes

In [2]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import confusion_matrix
from yellowbrick.model_selection import ValidationCurve
from yellowbrick.features import RadViz
import matplotlib.pyplot as plt

In [3]:
# Directory holding the pre-computed train/test pickles.
filepath = '../data/train_test_split/'


def _load_pickle(name):
    """Unpickle `filepath + name` and return the stored object.

    The file is opened in a `with` block so the handle is closed even when
    unpickling raises.
    NOTE(review): pickle.load executes arbitrary code from the file; only use
    it on files produced by this project.
    """
    with open(filepath + name, 'rb') as infile:
        return pickle.load(infile)


X_train_full = _load_pickle('X_train_ac.pickle')
X_test_real = _load_pickle('X_test_ac.pickle')
y_train_full = _load_pickle('y_train_ac.pickle')
y_test_real = _load_pickle('y_test_ac.pickle')

In [4]:
# Carve the loaded training data into two equal, stratified halves.
# X_test / y_test are the held-out half of the training data; the *_real
# arrays loaded earlier are not involved in this split.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    shuffle=True,
    random_state=31,
    test_size=0.5,
)

In [5]:
# Coerce each label container to a flat integer vector.
y_train = np.asarray(y_train).astype('int').ravel()
y_test = np.asarray(y_test).astype('int').ravel()
y_test_real = np.asarray(y_test_real).astype('int').ravel()
# y_train_2c = np.array(y_train_2c).reshape(-1, )
# y_train_3c = np.array(y_train_3c).reshape(-1, )

## GBC for all types of pitches

In [17]:
# Baseline booster: library defaults, fixed seed, per-iteration progress log.
gbc = GradientBoostingClassifier(
    verbose=1,
    random_state=31,
)

In [None]:
# Fit the baseline booster on the training half.
# NOTE(review): in the recorded output the train loss jumps from ~3.9e5 to
# ~2.2e27 at iteration 4 and stays there. Investigate (extreme or unscaled
# feature values are one possibility) before trusting this model.
gbc.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1      389723.9432           58.85m
         2      388069.2373           59.44m
         3      386608.7432           59.14m
         4 2247829298733610225834655744.0000           58.38m
         5 2247829298733610225834655744.0000           57.61m
         6 2247829298733610225834655744.0000           57.00m
         7 2247829298733610225834655744.0000           56.96m
         8 2247829298733610225834655744.0000           56.39m
         9 2247829298733610225834655744.0000           56.13m
        10 2247829298733610225834655744.0000           55.43m


In [None]:
# Accuracy of the fitted baseline on the data it was trained on.
gbc.score(X_train, y_train)

In [None]:
# Accuracy of the fitted baseline on the held-out half of the training data.
gbc.score(X_test, y_test)

## Cross validation plots

In [None]:
# Five-fold cross-validated accuracy of the baseline on the training half.
scores = cross_val_score(
    gbc,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)

In [None]:
# Unweighted mean of the per-fold accuracies.
np.mean(scores)

In [None]:
# Learning rates to sweep.
parameters = [0.0001, 0.001, 0.01, 0.1]

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(
    model=gbc,
    param_name='learning_rate',  # direct estimator attribute (gbc is not a pipeline)
    param_range=parameters,
    X=X_train,
    y=y_train,
    cv=10,       # ten folds for this sweep; slow on large data
    n_jobs=-1,   # use every available core
    verbose=1,
    logx=False,  # linear x-axis
)

# Run the sweep, then write the figure to disk.
viz.fit(X_train, y_train)
viz.show(outpath='../viz/learning');

In [None]:
# Tree depths to sweep.
parameters = [2, 3, 4, 5, 10]

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(
    model=gbc,
    param_name='max_depth',  # direct estimator attribute (gbc is not a pipeline)
    param_range=parameters,
    X=X_train,
    y=y_train,
    cv=5,        # five folds; slow on large data
    n_jobs=-1,   # use every available core
    verbose=1,
    logx=False,  # linear x-axis
)

# Run the sweep, then write the figure to disk.
viz.fit(X_train, y_train)
viz.show(outpath='../viz/max_depth');

In [None]:
# Split thresholds to sweep: 5, 10, then 25..1175 in steps of 25.
parameters = [5, 10, *range(25, 1200, 25)]

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(
    model=gbc,
    param_name='min_samples_split',  # direct estimator attribute (gbc is not a pipeline)
    param_range=parameters,
    X=X_train,
    y=y_train,
    cv=5,        # five folds; slow on large data
    n_jobs=-1,   # use every available core
    verbose=1,
    logx=False,  # linear x-axis
)

# Run the sweep, then write the figure to disk.
viz.fit(X_train, y_train)
viz.show(outpath='../viz/min_samples_split');

In [None]:
# Leaf-size floors to sweep: 25..975 in steps of 25.
parameters = range(25, 1000, 25)

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(
    model=gbc,
    param_name='min_samples_leaf',  # direct estimator attribute (gbc is not a pipeline)
    param_range=parameters,
    X=X_train,
    y=y_train,
    cv=5,        # five folds; slow on large data
    n_jobs=-1,   # use every available core
    verbose=1,
    logx=False,  # linear x-axis
)

# Run the sweep, then write the figure to disk.
viz.fit(X_train, y_train)
viz.show(outpath='../viz/min_samples_leaf');

In [None]:
# Feature-subset sizes to sweep: 2..46.
# NOTE(review): presumably X_train has at least 46 columns; confirm.
parameters = range(2, 47)

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(
    model=gbc,
    param_name='max_features',  # direct estimator attribute (gbc is not a pipeline)
    param_range=parameters,
    X=X_train,
    y=y_train,
    cv=5,        # five folds; slow on large data
    n_jobs=-1,   # use every available core
    verbose=1,
    logx=False,  # linear x-axis
)

# Run the sweep, then write the figure to disk.
viz.fit(X_train, y_train)
viz.show(outpath='../viz/max_features');

In [None]:
# Number of rows in the training half.
X_train.shape[0]

In [None]:
# Leaf-count caps to sweep: 50..750 in steps of 50.
parameters = range(50, 800, 50)

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(
    model=gbc,
    param_name='max_leaf_nodes',  # direct estimator attribute (gbc is not a pipeline)
    param_range=parameters,
    X=X_train,
    y=y_train,
    cv=5,        # five folds; slow on large data
    n_jobs=-1,   # use every available core
    verbose=1,
    logx=False,  # linear x-axis
)

# Run the sweep, then write the figure to disk.
viz.fit(X_train, y_train)
viz.show(outpath='../viz/max_leaf_nodes');

In [None]:
# Ensemble sizes to sweep: 1-4, then 5..45 in steps of 5.
parameters = [1, 2, 3, 4, *range(5, 50, 5)]

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(
    model=gbc,
    param_name='n_estimators',  # direct estimator attribute (gbc is not a pipeline)
    param_range=parameters,
    X=X_train,
    y=y_train,
    cv=5,        # five folds; slow on large data
    n_jobs=-1,   # use every available core
    verbose=1,
    logx=False,  # linear x-axis
)

# Run the sweep, then write the figure to disk.
viz.fit(X_train, y_train)
viz.show(outpath='../viz/n_estimators');

In [None]:
# Row-subsampling fractions to sweep. GradientBoostingClassifier only accepts
# subsample in (0, 1], so the former 1.25 and 1.5 entries (which can only
# produce failed fits) are dropped.
parameters = [.001, .005, .01, .05, .75, 1.0]

# The figure is created up front; `ax` is not passed to the visualizer below.
fig, ax = plt.subplots(figsize=(15, 8))

viz = ValidationCurve(model = gbc,
                      X = X_train,
                      y = y_train,
                      param_name = 'subsample',  # direct estimator attribute (gbc is not a pipeline)
                      param_range = parameters,
                      cv = 5,        # five folds; slow on large data
                      verbose = 1,
                      n_jobs = -1,   # use every available core
                      logx = False   # linear x-axis
                      )

# Fit and show the visualizer
viz.fit(X_train, y_train)
viz.show(outpath = '../viz/subsample');

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import cross_validate

In [None]:
# Weak learner: a depth-7 tree with balanced class weights that considers
# ten features per split.
dc = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=7,
    max_features=10,
)

# 2000 slow-learning boosting rounds on top of that tree.
# NOTE(review): `base_estimator` is the keyword used by older scikit-learn
# releases; confirm against the installed version.
ada = AdaBoostClassifier(
    base_estimator=dc,
    learning_rate=.01,
    n_estimators=2000,
    random_state=31,
)

# Five-fold CV, keeping each fold's fitted estimator for later inspection.
cv = cross_validate(
    ada,
    X_train,
    y_train,
    cv=5,
    return_estimator=True,
    n_jobs=-1,
)

In [None]:
# NOTE(review): six class names are given here, while the other confusion
# matrices and the label_binarize call in this notebook use five; confirm
# which label set matches the data.
pitch_labels = ['Fastball', 'Cutter', 'Two-seam', 'Slider', 'Curve', 'Changeup']

# Use the estimator fitted on the second CV fold (index 1).
mod = cv['estimator'][1]
iris_cm = ConfusionMatrix(mod, classes=pitch_labels)

iris_cm.fit(X_train, y_train)
iris_cm.score(X_test, y_test)
iris_cm.show(outpath='../viz/testing_matrix');

# Per-fold test scores from the cross-validation run.
print(cv['test_score'])

## SVC

In [None]:
from sklearn.svm import SVC

# Sigmoid-kernel SVC with a hard cap of 1000 solver iterations.
model = SVC(C=.1, kernel = 'sigmoid', max_iter=1000)
model.fit(X_train, y_train)

# Keep the training accuracy; as a bare mid-cell expression it used to be
# computed and then thrown away without ever being displayed.
train_accuracy = model.score(X_train, y_train)

# Five-fold cross-validated scores for the same configuration.
cv = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)

# Show both results as the cell output.
train_accuracy, cv

## Randomized Search

In [None]:
##### Tested ranges

# params = {'min_samples_split': [2, range(5, 1000, 5)]}

## Model based on analysis from yellowbrick cv graphs
# gbc2 = GradientBoostingClassifier(learning_rate=.01, 
#                                   max_depth=5, 
#                                   max_features=2, 
#                                   min_samples_leaf=775, 
#                                   min_samples_split=200, 
#                                   n_estimators=30,
#                                   subsample=.75,
#                                   random_state=31)

In [None]:
# Seeded like the other boosters in this notebook so the search is repeatable.
gbc2 = GradientBoostingClassifier(random_state=31)

# Search space, mirroring the ranges explored in the validation curves.
# - The range(...) objects are concatenated into the lists; nested as a single
#   list element, a whole range object was sampled as one parameter value.
# - subsample must lie in (0, 1], so the former 1.25 / 1.5 entries are removed.
# NOTE(review): 'mse' and 'mae' are criterion names from older scikit-learn
# releases; confirm against the installed version.
params = {'subsample': [.001, .005, .01, .05, .75, 1.0],
          'n_estimators': [1, 2, 3, 4] + list(range(5, 50, 5)),
          'criterion': ['friedman_mse', 'mse', 'mae'],
          'max_leaf_nodes': range(50, 800, 50),
          'max_features': range(2, 47),
          'learning_rate': [.0001, .001, .01, .1],
          'max_depth': [2, 3, 4, 5, 10],
          'min_samples_leaf': range(25, 1000, 25),
          'min_samples_split': [2, 5, 10] + list(range(25, 1200, 25))}

# 10000 sampled candidates x 5 folds; refit=True retrains the best candidate
# on all of X_train so `rs` can score and predict directly.
rs = RandomizedSearchCV(gbc2, params, n_jobs=-1, random_state=31, cv=5, verbose = 1, n_iter=10000, refit=True)

rs.fit(X_train, y_train);

# Best configuration with its train and held-out scores.
rs.best_params_, rs.score(X_train, y_train), rs.score(X_test, y_test)

In [None]:
# Confusion matrix for the randomized-search result on the held-out half.
rs_labels = ['Fastball', 'Cutter', 'Slider', 'Curve', 'Changeup']
rs_cm = ConfusionMatrix(rs, classes=rs_labels)

rs_cm.fit(X_train, y_train)
rs_cm.score(X_test, y_test)
rs_cm.show(outpath='../viz/rs_matrix');

In [None]:
# Booster with hand-entered hyper-parameters.
# NOTE(review): presumably these are the best_params_ from the randomized
# search above; confirm.
gbcb = GradientBoostingClassifier(
    criterion='mse',
    learning_rate=0.0001,
    max_depth=2,
    max_features=16,
    max_leaf_nodes=150,
    min_samples_leaf=425,
    min_samples_split=5,
    n_estimators=3,
    subsample=0.001,
    random_state=31,
)

In [None]:
# Fit the hand-configured booster; the trailing semicolon hides the estimator repr.
gbcb.fit(X_train, y_train);

In [None]:
# Accuracy of the hand-configured booster on its training data.
gbcb.score(X_train, y_train)

In [None]:
# Accuracy of the hand-configured booster on the held-out half.
gbcb.score(X_test, y_test)

In [None]:
# Confusion matrix for the default-parameter booster `gbc` (not `gbcb`),
# saved as the "default" matrix.
default_labels = ['Fastball', 'Cutter', 'Slider', 'Curve', 'Changeup']
iris_cm = ConfusionMatrix(gbc, classes=default_labels)

iris_cm.fit(X_train, y_train)
iris_cm.score(X_test, y_test)
iris_cm.show(outpath='../viz/default_matrix');

## Model validation
Code adapted from the scikit-learn ROC example: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
# scipy.interp is a deprecated alias of numpy.interp and may be missing from
# newer SciPy releases; fall back to NumPy so the name `interp` stays bound.
try:
    from scipy import interp
except ImportError:
    from numpy import interp
from sklearn.metrics import roc_auc_score

In [None]:
# One-vs-rest indicator matrices for class ids 0-4.
# This rebinds y_train / y_test, so every cell run afterwards sees the
# binarized arrays instead of the integer label vectors.
class_ids = [0, 1, 2, 3, 4]
y_train, y_test = (
    label_binarize(labels, classes=class_ids) for labels in (y_train, y_test)
)

# One indicator column per class.
n_classes = y_train.shape[1]

In [None]:
# Wrap the baseline booster so one binary model is trained per class column.
classifier = OneVsRestClassifier(gbc)
classifier.fit(X_train, y_train)

# Per-class decision scores on the held-out half, used for the ROC curves.
y_score = classifier.decision_function(X_test)

In [None]:
# # Learn to predict each class against the other
# classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
#                                  random_state=31))

# y_score = classifier.fit(X_train, y_train).decision_function(X_test)

In [None]:
# Score of the one-vs-rest model on its training data (y_train is the binarized matrix here).
classifier.score(X_train, y_train)

In [None]:
# Score of the one-vs-rest model on the held-out half (y_test is the binarized matrix here).
classifier.score(X_test, y_test)

In [None]:
# Per-class ROC: false/true positive rates and area under each curve,
# keyed by class column index.
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(y_test[:, i], y_score[:, i])
    fpr[i] = class_fpr
    tpr[i] = class_tpr
    roc_auc[i] = auc(class_fpr, class_tpr)

In [None]:
# Micro-average: pool every (label, score) pair across classes into one curve.
micro_fpr, micro_tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
fpr["micro"] = micro_fpr
tpr["micro"] = micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

In [None]:
# One figure only: the extra bare plt.figure() call that preceded this one
# created an additional, empty figure.
plt.figure(figsize=(15,11))

# Line width, reused by the multi-class plot cell that follows.
lw = 2

# ROC curve for the class in column index 2, with its AUC in the legend.
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])

# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

# Save before show().
plt.savefig('../viz/roc_one_vs_rest.png')
plt.show()

## Multiclass ROC

In [None]:
# Macro-average ROC: pool every per-class FPR breakpoint into one sorted grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Interpolate each class's TPR onto that grid and average across classes.
# np.interp replaces the deprecated scipy.interp alias of the same function.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Line width is set here so this cell does not depend on the previous plot cell.
lw = 2

# One figure only: the extra bare plt.figure() call that preceded this one
# created an additional, empty figure.
plt.figure(figsize=(15,11))

# Averaged curves.
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

# One curve per class, cycling through five colours.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'blue', 'green'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")

# Save before show().
plt.savefig('../viz/roc_mc.png')
plt.show()

## Try MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Single hidden layer of 500 units, seeded, with a generous iteration cap.
# NOTE(review): if the ROC cells above ran first, y_train / y_test here are
# the label_binarize outputs rather than integer labels; confirm that is intended.
clf = MLPClassifier(
    hidden_layer_sizes=(500,),
    max_iter=10000,
    random_state=31,
)
clf.fit(X_train, y_train)

# (train score, held-out score)
tuple(clf.score(features, labels)
      for features, labels in ((X_train, y_train), (X_test, y_test)))
# clf.predict_proba(X_test[:1])

# clf.predict(X_test[:5, :])



In [None]:
# MLP search space. All entries are finite lists: 5 * 4 * 3 * 4 * 3 * 4 = 2880
# distinct combinations.
params = {
    'hidden_layer_sizes': [(100,), (200,), (300,), (500,), (1000,)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [0.0001, .001, .01, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, .1, 1],
}

# NOTE(review): n_iter (5000) is larger than the 2880-point grid, so this
# presumably ends up evaluating every combination; confirm against the
# RandomizedSearchCV documentation.
rsmlp = RandomizedSearchCV(
    estimator=clf,
    param_distributions=params,
    n_iter=5000,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the MLP search (semicolon hides the repr), then report the selected model's score on the training data.
rsmlp.fit(X_train, y_train);
rsmlp.score(X_train, y_train)

In [None]:
# Hyper-parameters of the best candidate found by the MLP search.
rsmlp.best_params_

In [None]:
# Score of the selected MLP on the held-out half.
rsmlp.score(X_test, y_test)

In [None]:
# Class labels as seen by the fitted search object.
rsmlp.classes_

In [None]:
# The search's error_score setting (none was passed above, so this shows the library default).
rsmlp.error_score

In [None]:
# Predicted probabilities from the selected MLP for every held-out row.
rsmlp.predict_proba(X_test)

In [None]:
# Display the held-out targets for comparison with the probabilities above
# (the binarized matrix if the ROC cells ran first).
y_test