# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn import model_selection, linear_model
import pickle
import os

# Dummy
from sklearn.dummy import DummyRegressor

# Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Random Forrest
from sklearn.ensemble import RandomForestRegressor

# K-Nearest-Neighbor
from sklearn.neighbors import KNeighborsRegressor

# Neural Network
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models,layers


# Ready Data

Make sure to upload pitch_data_clean.csv from first notebook

In [4]:
df = pd.read_csv('pitch_data_clean.csv')

target_columns = ['launch_speed', 'strikeout', 'hbp_bb', 'home_run']

target = df[target_columns]

# Walk rate does not seem significant so I will drop it
target = target.drop(['hbp_bb'], axis=1)

target_columns.append('Unnamed: 0')
df_ready = df.drop(target_columns, axis = 1)

df_ready


Unnamed: 0,pfx_x,pfx_z,release_speed,release_spin_rate,total_movement,pitch_type_CH,pitch_type_CS,pitch_type_CU,pitch_type_EP,pitch_type_FA,pitch_type_FC,pitch_type_FF,pitch_type_FO,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,pitch_type_ST,pitch_type_SV
0,1.171111,0.190556,83.400000,1403.055556,1.186513,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1.192632,0.446842,88.900000,1785.315789,1.273593,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1.323810,0.671905,88.000000,1736.333333,1.484563,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.175806,0.539355,81.100000,1925.064516,1.293609,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1.154265,0.748824,83.600000,1561.558824,1.375887,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9483,1.391290,0.269355,81.700000,2774.096774,1.417124,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
9484,1.294255,0.311830,83.100000,2278.714894,1.331291,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9485,1.180752,0.012857,84.700000,2655.150376,1.180822,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9486,1.352192,0.076577,81.200000,2926.051051,1.354359,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [5]:
target

Unnamed: 0,launch_speed,strikeout,home_run
0,74.200000,0.013699,0.000000
1,82.600000,0.097222,0.013889
2,77.700000,0.096386,0.012048
3,75.500000,0.071942,0.000000
4,84.500000,0.030043,0.017167
...,...,...,...
9483,83.100000,0.056911,0.016260
9484,80.500000,0.077892,0.009164
9485,81.400000,0.057915,0.001931
9486,79.000000,0.065591,0.009677


# Models

## Linear Regression

In [6]:
# Remove the pitch type one hot encoding
df_regression = df_ready[['pfx_x','pfx_z','release_speed','release_spin_rate','total_movement']]

reg = linear_model.LinearRegression().fit(df_regression, target)
i_index = 0
for i in ['launch_speed', 'strikeout', 'home_run']:
  j_index = 0
  print("\n{}:".format(i))
  print("intercept: {}".format(reg.intercept_[i_index]))
  for j in df_regression.columns:
    print("{}: {}".format(j,reg.coef_[i_index][j_index]))
    j_index += 1
  i_index += 1


launch_speed:
intercept: 67.75082426333452
pfx_x: 1.4324216503467457
pfx_z: 2.463089221279412
release_speed: 0.17913489511734443
release_spin_rate: -0.0003313859635558636
total_movement: -2.6287076671067284

strikeout:
intercept: 0.12409725923957052
pfx_x: -0.007460308427145197
pfx_z: -0.014328840027136696
release_speed: -0.0009080942118669589
release_spin_rate: 1.2607779122299212e-05
total_movement: 0.0017678730471317751

home_run:
intercept: 0.006566594547917
pfx_x: -0.0006123377138846421
pfx_z: 0.001957402743098887
release_speed: 3.186045424420807e-05
release_spin_rate: -7.478809157432896e-07
total_movement: -0.00040916544281270243


## Dummy

In [7]:
print('Dummy Regressor:\n')

dummy = DummyRegressor()
train_sizes, train_scores, test_scores, fit_times, score_times = learning_curve(dummy, df_ready, target, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], cv=10,return_times=True,scoring='neg_root_mean_squared_error',shuffle=True,random_state=0)
dummy_acc = 0-test_scores
print('acc: {}'.format(dummy_acc.mean(axis=1)))
print('train times: {}'.format(score_times[4].mean()))
print('fit times: {}'.format(fit_times[4].mean()))
print('total times: {}\n\n'.format(score_times[4].mean() + fit_times[4].mean()))

Dummy Regressor:

acc: [1.06003701 1.05974909 1.06116931 1.06282544 1.06288088]
train times: 0.0017901897430419923
fit times: 0.005360889434814453
total times: 0.007151079177856446




## Decision Tree

In [8]:
print('DecisionTreeRegressor:')
param = [{'min_samples_leaf':[50, 75, 100]}]
dtr = DecisionTreeRegressor()
# dtr = DecisionTreeRegressor(min_samples_leaf = 10)
decTree_tuned = model_selection.GridSearchCV(dtr, param, scoring = 'neg_root_mean_squared_error', cv = 4)
decTree_tuned.fit(df_ready, target)
#dtr.fit(battingData, target)

bestMinLeaf = decTree_tuned.best_params_['min_samples_leaf']
train_sizes, train_scores, test_scores, fit_times, score_times = learning_curve(decTree_tuned, df_ready, target, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], cv=10,return_times=True,scoring='neg_root_mean_squared_error',shuffle=True,random_state=0)
decTree_tuned_rmse = 0-test_scores
print('min_samples_leaf - {}'.format(bestMinLeaf))
print('rmse: {}'.format(decTree_tuned_rmse.mean(axis=1)))
print('train times: {}'.format(score_times[4].mean()))
print('fit times: {}'.format(fit_times[4].mean()))
print('total times: {}\n\n'.format(score_times[4].mean() + fit_times[4].mean()))

filename = './models/pitchMovementDecisionTree.sav'
pickle.dump(decTree_tuned, open(filename, 'wb'))

DecisionTreeRegressor:
min_samples_leaf - 100
rmse: [0.98583451 0.97866217 0.97101676 0.96828702 0.96271312]
train times: 0.007701992988586426
fit times: 1.0179325103759767
total times: 1.025634503364563




## Random Forest

In [12]:
print('RandomForrestRegressor:')
param = [{'min_samples_leaf':[20, 25, 50, 300]}]
randF = RandomForestRegressor(n_estimators = 401)
decTree_tuned = model_selection.GridSearchCV(randF, param, scoring = 'neg_root_mean_squared_error', cv = 4)
decTree_tuned.fit(df_ready, target)
#randF.fit(df, target)

bestMinLeaf = decTree_tuned.best_params_['min_samples_leaf']
train_sizes, train_scores, test_scores, fit_times, score_times = learning_curve(decTree_tuned, df_ready, target, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], cv=10,return_times=True,scoring='neg_root_mean_squared_error',shuffle=True,random_state=0)
decTree_tuned_rmse = 0-test_scores
print('min_samples_leaf - {}'.format(bestMinLeaf))
print('rmse: {}'.format(decTree_tuned_rmse.mean(axis=1)))
print('train times: {}'.format(score_times[4].mean()))
print('fit times: {}'.format(fit_times[4].mean()))
print('total times: {}\n\n'.format(score_times[4].mean() + fit_times[4].mean()))

filename = './models/pitchMovementRandomForrest.sav.sav'
pickle.dump(decTree_tuned, open(filename, 'wb'))

RandomForrestRegressor:
min_samples_leaf - 25
rmse: [0.96237395 0.95503561 0.94812893 0.94528787 0.94397649]
train times: 0.10228278636932372
fit times: 170.3491261959076
total times: 170.45140898227692




## K-Nearest-Neighbor


In [9]:
# KNearestNeighborsRegressor 
# Normalize
df_norm=(df_ready-df_ready.min())/(df_ready.max()-df_ready.min())
print('Nearest Neighbors:')

param = [{'n_neighbors':[20,30, 35, 40, 45, 50, 55, 60, 65, 70, 100, 125, 150, 175, 200, 225, 250, 275, 300]}]
knnr = KNeighborsRegressor()
knnr_tuned = model_selection.GridSearchCV(knnr, param, scoring = 'neg_root_mean_squared_error', cv = 4)

knnr_tuned.fit(df_norm, target)
bestNNeighbors = knnr_tuned.best_params_['n_neighbors']
train_sizes, train_scores, test_scores, fit_times, score_times = learning_curve(knnr, df_norm, target, train_sizes=[0.2, 0.4, 0.6, 0.8, 1], cv=10,return_times=True,scoring='neg_root_mean_squared_error',shuffle=True,random_state=0)
knnr_tuned_rmse = 0-test_scores
print('n_neighbors - {}'.format(bestNNeighbors))
print('rmse: {}'.format(knnr_tuned_rmse.mean(axis=1)))
print('train times: {}'.format(score_times[4].mean()))
print('fit times: {}'.format(fit_times[4].mean()))
print('total times: {}\n\n'.format(score_times[4].mean() + fit_times[4].mean()))

filename = './models/pitchMovementKNN.sav'
pickle.dump(knnr_tuned, open(filename, 'wb'))


Nearest Neighbors:
n_neighbors - 100
rmse: [1.01495156 1.0049761  1.00954212 1.00549657 0.99891382]
train times: 0.07433168888092041
fit times: 0.007747292518615723
total times: 0.08207898139953614




## Neural Network REMEMBER ENABLE GPU!!

In [10]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=0)
root_mean_squared_error = []
error = 1000000000

for train, test in kfolds.split(df_ready, target):
    nn = models.Sequential()
    '''layers.LeakyReLU(alpha=0.001)'''
    nn.add(layers.Dense(256, activation='relu', input_dim=20,kernel_regularizer=keras.regularizers.l2(0.01)))
    # nn.add(layers.Dropout(0.2))
    nn.add(layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)))
    # nn.add(layers.Dropout(0.2))
    nn.add(layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)))
    # nn.add(layers.Dropout(0.2))
    # nn.add(layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)))
    # nn.add(layers.Dropout(.5))
    # nn.add(layers.Dense(256, activation=layers.LeakyReLU(alpha=0.001)))
    nn.add(layers.Dense(3, activation='linear'))
    nn.compile(
        optimizer=tf.keras.optimizers.Adadelta(learning_rate=0.01,
                                                rho=0.95,
                                                epsilon=1e-07), 
        loss='mean_squared_error', 
        metrics=[tf.keras.metrics.RootMeanSquaredError()])
    n_epochs = 200

    early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
    history = nn.fit(df_ready.iloc[train], target.iloc[train], epochs=n_epochs, callbacks=[early_stop_callback], verbose = 0)#, batch_size = 64)
    s = nn.evaluate(df_ready.iloc[test], target.iloc[test])
    root_mean_squared_error.append(s[1])
    
    #save model with least error 
    if(s[1]<error):
      error = s[1]
      nn.save('models/pitchMovementNN.h5')




In [11]:
# Root mean squaared error for Neural Network
sum(root_mean_squared_error)/len(root_mean_squared_error)

2.343558692932129

# Download models

In [13]:
!zip -r models.zip /content/models

from google.colab import files
files.download("/content/models.zip")

  adding: content/models/ (stored 0%)
  adding: content/models/pitchMovementKNN.sav (deflated 75%)
  adding: content/models/pitchMovementNN.h5 (deflated 12%)
  adding: content/models/pitchMovementDecisionTree.sav (deflated 47%)
  adding: content/models/pitchMovementRandomForrest.sav.sav (deflated 49%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>