# First Neural Network Experiement


## Pipeline Preparation

In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions
import sklearn

Using TensorFlow backend.


In [2]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

from keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
each_pitch_clean = pd.read_csv('raw_data/final_pitches.csv')

In [31]:
pitch_clean = each_pitch_clean.dropna().copy()

In [32]:
pitch_clean = pitch_clean[:1000]

In [33]:
target = pitch_clean['pitch_type']

In [34]:
predictors = pitch_clean.drop(['pitch_type'], axis=1)

In [35]:
cat_features = list(predictors.select_dtypes(include='object'))
cat_features.extend(['about.inning', 'pitchData.zone', 'count'])

In [36]:
cat_features

['pitcher',
 'hitter',
 'about.halfInning',
 'details.call.description',
 'details.description',
 'matchup.batSide.code',
 'matchup.pitchHand.code',
 'matchup.splits.menOnBase',
 'prior_pitch_type',
 'count',
 'about.inning',
 'pitchData.zone',
 'count']

## Data Prep for Neural Network 
- Game Plan for Today
- Engineer the count column with a string of what the count is - maybe turn each ball and strike into a string and then combine them.
- One Hot Encode the final data frame with all categorical columns 
- Create Final Dataframe for computer work today, then look into creating a neural network on the cloud. 

In [37]:
pitches_dict = {'Fastball': 0, 'Breaking_Ball': 1, 'Changeup': 2}

In [38]:
pitches_numerical = target.map(pitches_dict)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(predictors, 
                                                    pitches_numerical, 
                                                    random_state=10)

In [40]:
print(len(X_train)) 
print(len(y_train))

750
750


In [41]:
print(len(X_test)) 
print(len(y_test))

250
250


In [42]:
encoder = OneHotEncoder(handle_unknown='ignore', categories='auto')

In [43]:
X_train.columns

Index(['pitcher', 'WAR_x', 'WHIP', 'ERA', 'SO', 'hitter', 'SLG', 'OPS',
       'WAR_y', 'about.atBatIndex', 'about.halfInning', 'about.inning',
       'details.call.description', 'details.description',
       'matchup.batSide.code', 'matchup.pitchHand.code', 'matchup.pitcher.id',
       'matchup.splits.menOnBase', 'pitchData.nastyFactor', 'pitchData.zone',
       'pitchNumber', 'prior_pitch_type', 'count'],
      dtype='object')

In [44]:
X_train.head(2)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,about.halfInning,about.inning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
265,Daniel Robertson,0.0,0.0,0.0,0,Brock Holt,0.411,0.774,1.3,65,bottom,8,Hit Into Play - Out(s),"In play, out(s)",L,R,621002,Empty,19.54,4.0,3.0,Breaking_Ball,2.0-0.0
1051,Blake Treinen,4.3,0.83,0.78,100,Mookie Betts,0.64,1.078,10.9,74,bottom,9,Strike - Swinging,Foul,R,R,595014,RISP,35.34,14.0,2.0,Breaking_Ball,0.0-2.0


- New Dataframe with columns from all categories that need to be one-hot-encoded.
- Use the data frame that was created then 

In [45]:
X_train_c = X_train[cat_features]
X_train_c.head()


Unnamed: 0,pitcher,hitter,about.halfInning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,prior_pitch_type,count,about.inning,pitchData.zone,count.1
265,Daniel Robertson,Brock Holt,bottom,Hit Into Play - Out(s),"In play, out(s)",L,R,Empty,Breaking_Ball,2.0-0.0,8,4.0,2.0-0.0
1051,Blake Treinen,Mookie Betts,bottom,Strike - Swinging,Foul,R,R,RISP,Breaking_Ball,0.0-2.0,9,14.0,0.0-2.0
1043,J.B. Wendelken,Darnell Sweeney,top,Ball - Called,Ball,L,R,RISP,Fastball,4.0-2.0,7,11.0,4.0-2.0
422,Julio Urias,Francisco Mejia,top,Strike - Swinging,Foul,R,L,Men_On,Fastball,1.0-1.0,9,5.0,1.0-1.0
896,J.B. Wendelken,Ehire Adrianza,top,Strike - Swinging,Foul,L,R,Men_On,Fastball,0.0-2.0,9,8.0,0.0-2.0


In [46]:
X_test_c = X_test[cat_features]

In [47]:
y_train_c = to_categorical(y_train)

In [48]:
y_test_c = to_categorical(y_test)

In [49]:
encoder.fit(X_train_c)

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [51]:
X_train_1 = pd.DataFrame(encoder.transform(X_train_c).todense(),
                   columns=encoder.get_feature_names())

In [52]:
X_train_1.head(2)

Unnamed: 0,x0_Aaron Brooks,x0_Alec Asher,x0_Alex Avila,x0_Alex Blandino,x0_Alex Reyes,x0_Anthony Rizzo,x0_Blake Treinen,x0_Brandon Dixon,x0_Brandon Guyer,x0_Carlos Tocci,x0_Chase d'Arnaud,x0_Daniel Robertson,x0_Danny Valencia,x0_Greg Garcia,x0_Ian Happ,x0_Ian Krol,x0_J.B. Wendelken,x0_James Hoyt,x0_Joey Krehbiel,x0_Johnny Field,x0_Julio Urias,x0_Kendrys Morales,x0_Mark Reynolds,x0_Matt Davidson,x0_Mitch Garver,x0_Osmer Morales,x0_Pablo Sandoval,x0_Phillip Ervin,x0_Rex Brothers,x0_Rob Whalen,x0_Ryan Rua,x0_Scott Copeland,x0_Zack Weiss,x1_A.J. Pollock,x1_Aaron Hicks,x1_Aaron Judge,x1_Abiatal Avelino,x1_Adrian Beltre,x1_Albert Almora Jr.,x1_Aledmys Diaz,x1_Alex Bregman,x1_Andrelton Simmons,x1_Andrew Benintendi,x1_Andrew McCutchen,x1_Austin Meadows,x1_Austin Wynns,x1_Ben Gamel,x1_Ben Zobrist,x1_Blake Swihart,x1_Brandon Drury,x1_Brandon Guyer,x1_Brett Gardner,x1_Breyvic Valera,x1_Brock Holt,x1_Bryan Holaday,x1_Buster Posey,x1_Byron Buxton,x1_Carlos Tocci,x1_Carson Kelly,x1_Cedric Mullins,x1_Chance Sisco,x1_Charlie Blackmon,x1_Charlie Tilson,x1_Chris Gimenez,x1_Chris Iannetta,x1_Chris Taylor,x1_Christian Yelich,x1_Cody Bellinger,x1_Corey Dickerson,x1_Cory Spangenberg,x1_Craig Gentry,x1_DJ Stewart,x1_Darnell Sweeney,x1_David Fletcher,x1_Dee Gordon,x1_Denard Span,x1_Devin Mesoraco,x1_Didi Gregorius,x1_Dilson Herrera,x1_Domingo Santana,x1_Dwight Smith Jr.,x1_Eduardo Nunez,x1_Ehire Adrianza,x1_Elias Diaz,x1_Elvis Andrus,x1_Eric Sogard,x1_Eric Young Jr.,x1_Erik Gonzalez,x1_Erik Kratz,x1_Eugenio Suarez,x1_Francisco Arcia,x1_Francisco Lindor,x1_Francisco Mejia,x1_Francisco Pena,x1_Franklin Barreto,x1_Franmil Reyes,x1_Garrett Hampson,x1_George Springer,x1_Gerardo Parra,x1_Giancarlo Stanton,x1_Gleyber Torres,x1_Greg Allen,x1_Greg Bird,x1_Guillermo Heredia,x1_Harrison Bader,x1_Hunter Pence,x1_Hunter Renfroe,x1_Ian Desmond,x1_Ian Happ,x1_Isiah Kiner-Falefa,x1_J.D. Martinez,x1_J.P. Crawford,x1_Jace Peterson,x1_Jackie Bradley Jr.,x1_Jake Cave,x1_Jason Castro,x1_Javier Baez,x1_Jean Segura,x1_Jedd Gyorko,x1_Jesus Aguilar,x1_Joe Mauer,x1_Joe Panik,x1_Joey Gallo,x1_Joey Votto,x1_Joey Wendle,x1_John Hicks,x1_Jonathan Lucroy,x1_Jonathan Schoop,x1_Jonathan Villar,x1_Jorge Polanco,x1_Jorge Soler,x1_Jose Abreu,x1_Jose Altuve,x1_Jose Iglesias,x1_Jose Martinez,x1_Jose Pirela,x1_Jose Ramirez,x1_Justin Upton,x1_Kaleb Cowart,x1_Kazuhisa Makita,x1_Kendrys Morales,x1_Khris Davis,x1_Kolten Wong,x1_Kristopher Negron,x1_Kyle Freeland,x1_Kyle Seager,x1_Leury Garcia,x1_Lorenzo Cain,x1_Luke Voit,x1_Maikel Franco,x1_Mallex Smith,x1_Marcell Ozuna,x1_Marcus Semien,x1_Matt Chapman,x1_Matt Duffy,x1_Matt Olson,x1_Max Kepler,x1_Max Muncy,x1_Michael Brantley,x1_Michael Hermosillo,x1_Miguel Andujar,x1_Mike Trout,x1_Mike Zunino,x1_Mitch Haniger,x1_Mookie Betts,x1_Neil Walker,x1_Nelson Cruz,x1_Nicholas Castellanos,x1_Nick Hundley,x1_Noel Cuevas,x1_Nolan Arenado,x1_Omar Narvaez,x1_Patrick Wisdom,x1_Paul DeJong,x1_Phillip Ervin,x1_Preston Tucker,x1_Rafael Devers,x1_Raimel Tapia,x1_Randal Grichuk,x1_Robbie Grossman,x1_Roberto Perez,x1_Robinson Cano,x1_Ronald Guzman,x1_Rougned Odor,x1_Ryan Rua,x1_Ryon Healy,x1_Sandy Leon,x1_Sherman Johnson,x1_Shin-Soo Choo,x1_Steven Duggar,x1_Taylor Motter,x1_Teoscar Hernandez,x1_Tommy La Stella,x1_Tommy Pham,x1_Tony Kemp,x1_Travis Jankowski,x1_Travis Shaw,x1_Trevor Story,x1_Trey Mancini,x1_Tyler White,x1_Welington Castillo,x1_Whit Merrifield,x1_Wil Myers,x1_Willians Astudillo,x1_Willie Calhoun,x1_Xander Bogaerts,x1_Yadier Molina,x1_Yairo Munoz,x1_Yan Gomes,x1_Yasiel Puig,x1_Yasmani Grandal,x1_Yuli Gurriel,x2_bottom,x2_top,x3_Ball - Called,x3_Hit Into Play - Out(s),x3_Strike - Swinging,x4_Ball,x4_Ball In Dirt,x4_Called Strike,x4_Foul,x4_Foul Tip,"x4_In play, no out","x4_In play, out(s)","x4_In play, run(s)",x4_Swinging Strike,x4_Swinging Strike (Blocked),x5_L,x5_R,x6_L,x6_R,x7_Empty,x7_Loaded,x7_Men_On,x7_RISP,x8_Breaking_Ball,x8_Changeup,x8_Fastball,x9_0.0-1.0,x9_0.0-2.0,x9_0.0-3.0,x9_1.0-0.0,x9_1.0-1.0,x9_1.0-2.0,x9_1.0-3.0,x9_2.0-0.0,x9_2.0-1.0,x9_2.0-2.0,x9_2.0-3.0,x9_3.0-0.0,x9_3.0-1.0,x9_3.0-2.0,x9_3.0-3.0,x9_4.0-0.0,x9_4.0-1.0,x9_4.0-2.0,x10_1,x10_2,x10_3,x10_4,x10_5,x10_6,x10_7,x10_8,x10_9,x10_10,x10_11,x11_1.0,x11_2.0,x11_3.0,x11_4.0,x11_5.0,x11_6.0,x11_7.0,x11_8.0,x11_9.0,x11_11.0,x11_12.0,x11_13.0,x11_14.0,x12_0.0-1.0,x12_0.0-2.0,x12_0.0-3.0,x12_1.0-0.0,x12_1.0-1.0,x12_1.0-2.0,x12_1.0-3.0,x12_2.0-0.0,x12_2.0-1.0,x12_2.0-2.0,x12_2.0-3.0,x12_3.0-0.0,x12_3.0-1.0,x12_3.0-2.0,x12_3.0-3.0,x12_4.0-0.0,x12_4.0-1.0,x12_4.0-2.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
X_test_1 = pd.DataFrame(encoder.transform(X_test_c).todense(),
                   columns=encoder.get_feature_names())

In [54]:
len(X_train_1)

750

In [55]:
X_train_1.shape

(750, 298)

In [56]:
X_test_1.shape

(250, 298)

In [57]:
len(X_test_1)

250

In [58]:
# List of Normalish Distributions
#OPS
#WAR_y
#Nasty Factor
#Drop matchup.pitcher.id or drop all players in general

In [59]:
from keras.layers import Bidirectional, Activation
import numpy as np

In [60]:
X_train_r = np.reshape(np.asarray(X_train_1), (X_train_1.shape[0], 1, X_train_1.shape[1]))
X_test_r = np.reshape(np.asarray(X_test_1), (X_test_1.shape[0], 1, X_test_1.shape[1]))
y_train_r = np.reshape(np.asarray(y_train_c), (y_train_c.shape[0], 1, y_train_c.shape[1]))
y_test_r = np.reshape(np.asarray(y_test_c), (y_test_c.shape[0], 1, y_test_c.shape[1]))

In [61]:
X_train_r.shape

(750, 1, 298)

In [62]:
model = Sequential()
model.add(Bidirectional(LSTM(10, return_sequences=True),
                        input_shape=(X_train_r.shape[1], X_train_r.shape[2] )))
model.add(Bidirectional(LSTM(10)))
model.add(Dense(5))
# model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

Instructions for updating:
Colocations handled automatically by placer.


In [63]:
# model = Sequential()
# inputs = X_train_1.shape[1]

# model.add(Dense(inputs, activation='relu'))
# model.add(Dense(100, activation='relu'))

# model.add(Dense(3, activation='softmax'))

In [64]:
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [65]:
rnn = Sequential()

inputs = X_train_r.shape[2]

rnn.add(LSTM(inputs, input_shape=(1, inputs), return_sequences=True))
rnn.add(LSTM(200, return_sequences=True))
rnn.add(LSTM(30, return_sequences=True))
rnn.add(Dense(3, activation='softmax'))

In [66]:
rnn.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [68]:
logg= rnn.fit(X_train_r, y_train_r,
          validation_data = (X_test_r, y_test_r), epochs=30)

Train on 750 samples, validate on 250 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
