# First Neural Network Experiement


## Pipeline Preparation

In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions
import sklearn

In [2]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

from keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [4]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [5]:
pd.set_option('display.max_columns', None)

In [3]:
each_pitch_clean = pd.read_csv('raw_data/master.csv')

In [6]:
pitch_clean = each_pitch_clean.dropna().copy()

In [7]:
pitch_clean = pitch_clean

In [8]:
pitch_clean['count.balls'] = pitch_clean['count.balls'].astype(str)

In [9]:
pitch_clean['count.strikes'] = pitch_clean['count.strikes'].astype(str)

In [10]:
pitch_clean['count'] = pitch_clean['count.balls'] + '-' + pitch_clean['count.strikes'] 

In [11]:
pitch_clean = pitch_clean.drop([ 'previous_pitch_code', 'details.call.code', 'count.balls', 'count.strikes'], axis=1)

In [13]:
pitch_clean.shape

(516273, 24)

In [14]:
pitch_clean.to_csv(r'raw_data/cleaned_pitches.csv', index=False, sep=',', encoding='utf-8')

In [302]:
cat_features = list(predictors.select_dtypes(include='object'))
cat_features.extend(['about.inning', 'pitchData.zone', 'count'])

In [303]:
cat_features

['pitcher',
 'hitter',
 'about.halfInning',
 'details.call.description',
 'details.description',
 'matchup.batSide.code',
 'matchup.pitchHand.code',
 'matchup.splits.menOnBase',
 'prior_pitch_type',
 'count',
 'about.inning',
 'pitchData.zone',
 'count']

In [304]:
target = pitch_clean['pitch_type']

In [305]:
predictors = pitch_clean.drop(['pitch_type'], axis=1)

## Data Prep for Neural Network 
- Game Plan for Today
- Engineer the count column with a string of what the count is - maybe turn each ball and strike into a string and then combine them.
- One Hot Encode the final data frame with all categorical columns 
- Create Final Dataframe for computer work today, then look into creating a neural network on the cloud. 

In [306]:
pitches_dict = {'Fastball': 0, 'Breaking_Ball': 1, 'Changeup': 2}

In [307]:
pitches_numerical = target.map(pitches_dict)

In [308]:
X_train, X_test, y_train, y_test = train_test_split(predictors, 
                                                    pitches_numerical, 
                                                    random_state=10)

In [309]:
print(len(X_train)) 
print(len(y_train))

75
75


In [310]:
print(len(X_test)) 
print(len(y_test))

25
25


In [311]:
encoder = OneHotEncoder(handle_unknown='ignore', categories='auto')

In [312]:
X_train.columns

Index(['pitcher', 'WAR_x', 'WHIP', 'ERA', 'SO', 'hitter', 'SLG', 'OPS',
       'WAR_y', 'about.atBatIndex', 'about.halfInning', 'about.inning',
       'details.call.description', 'details.description',
       'matchup.batSide.code', 'matchup.pitchHand.code', 'matchup.pitcher.id',
       'matchup.splits.menOnBase', 'pitchData.nastyFactor', 'pitchData.zone',
       'pitchNumber', 'prior_pitch_type', 'count'],
      dtype='object')

In [313]:
X_train.head(2)

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,about.halfInning,about.inning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
102,Rex Brothers,0.0,0.0,0.0,0,Maikel Franco,0.467,0.78,0.2,42,top,6,Ball - Called,Ball In Dirt,R,L,571521,Loaded,27.87,13.0,5.0,Fastball,4.0-1.0
37,Pablo Sandoval,0.0,0.0,0.0,0,Yasmani Grandal,0.466,0.815,3.3,91,top,9,Hit Into Play - Out(s),"In play, out(s)",L,R,467055,Empty,59.34,13.0,5.0,Changeup,2.0-2.0


- New Dataframe with columns from all categories that need to be one-hot-encoded.
- Use the data frame that was created then 

In [314]:
X_train_c = X_train[cat_features]
X_train_c.head()


Unnamed: 0,pitcher,hitter,about.halfInning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,prior_pitch_type,count,about.inning,pitchData.zone,count.1
102,Rex Brothers,Maikel Franco,top,Ball - Called,Ball In Dirt,R,L,Loaded,Fastball,4.0-1.0,6,13.0,4.0-1.0
37,Pablo Sandoval,Yasmani Grandal,top,Hit Into Play - Out(s),"In play, out(s)",L,R,Empty,Changeup,2.0-2.0,9,13.0,2.0-2.0
111,Rex Brothers,J.P. Crawford,top,Ball - Called,Ball,L,L,Loaded,Fastball,4.0-2.0,6,12.0,4.0-2.0
10,Kendrys Morales,Matt Olson,top,Ball - Called,Ball,L,R,Empty,Changeup,3.0-0.0,9,11.0,3.0-0.0
115,Matt Davidson,Gleyber Torres,top,Hit Into Play - Out(s),"In play, out(s)",R,R,Empty,Fastball,2.0-0.0,9,14.0,2.0-0.0


In [315]:
X_test_c = X_test[cat_features]

In [220]:
y_train_c = to_categorical(y_train)

In [206]:
y_test_c = to_categorical(y_test)

In [316]:
encoder.fit(X_train_c)

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [317]:
encoder.get_feature_names()

array(['x0_Alex Avila', 'x0_Anthony Rizzo', 'x0_Brandon Guyer',
       "x0_Chase d'Arnaud", 'x0_Danny Valencia', 'x0_Ian Krol',
       'x0_Kendrys Morales', 'x0_Mark Reynolds', 'x0_Matt Davidson',
       'x0_Pablo Sandoval', 'x0_Rex Brothers', 'x1_A.J. Pollock',
       'x1_Aaron Hicks', 'x1_Brett Gardner', 'x1_Bryan Holaday',
       'x1_Carlos Tocci', 'x1_Chris Taylor', 'x1_Didi Gregorius',
       'x1_Dilson Herrera', 'x1_Eugenio Suarez', 'x1_Giancarlo Stanton',
       'x1_Gleyber Torres', 'x1_Ian Desmond', 'x1_J.P. Crawford',
       'x1_Joey Gallo', 'x1_Jonathan Lucroy', 'x1_Kendrys Morales',
       'x1_Maikel Franco', 'x1_Matt Chapman', 'x1_Matt Olson',
       'x1_Max Kepler', 'x1_Miguel Andujar', 'x1_Noel Cuevas',
       'x1_Phillip Ervin', 'x1_Preston Tucker', 'x1_Raimel Tapia',
       'x1_Rougned Odor', 'x1_Ryan Rua', 'x1_Taylor Motter',
       'x1_Teoscar Hernandez', 'x1_Yangervis Solarte',
       'x1_Yasmani Grandal', 'x2_bottom', 'x2_top', 'x3_Ball - Called',
       'x3_Hit Int

In [319]:
X_train_1 = pd.DataFrame(encoder.transform(X_train_c).todense(),
                   columns=encoder.get_feature_names())

In [320]:
X_train_1.head(2)

Unnamed: 0,x0_Alex Avila,x0_Anthony Rizzo,x0_Brandon Guyer,x0_Chase d'Arnaud,x0_Danny Valencia,x0_Ian Krol,x0_Kendrys Morales,x0_Mark Reynolds,x0_Matt Davidson,x0_Pablo Sandoval,x0_Rex Brothers,x1_A.J. Pollock,x1_Aaron Hicks,x1_Brett Gardner,x1_Bryan Holaday,x1_Carlos Tocci,x1_Chris Taylor,x1_Didi Gregorius,x1_Dilson Herrera,x1_Eugenio Suarez,x1_Giancarlo Stanton,x1_Gleyber Torres,x1_Ian Desmond,x1_J.P. Crawford,x1_Joey Gallo,x1_Jonathan Lucroy,x1_Kendrys Morales,x1_Maikel Franco,x1_Matt Chapman,x1_Matt Olson,x1_Max Kepler,x1_Miguel Andujar,x1_Noel Cuevas,x1_Phillip Ervin,x1_Preston Tucker,x1_Raimel Tapia,x1_Rougned Odor,x1_Ryan Rua,x1_Taylor Motter,x1_Teoscar Hernandez,x1_Yangervis Solarte,x1_Yasmani Grandal,x2_bottom,x2_top,x3_Ball - Called,x3_Hit Into Play - Out(s),x3_Strike - Swinging,x4_Ball,x4_Ball In Dirt,x4_Called Strike,x4_Foul,x4_Foul Tip,"x4_In play, no out","x4_In play, out(s)",x4_Swinging Strike,x4_Swinging Strike (Blocked),x5_L,x5_R,x6_L,x6_R,x7_Empty,x7_Loaded,x7_Men_On,x7_RISP,x8_Breaking_Ball,x8_Changeup,x8_Fastball,x9_0.0-1.0,x9_0.0-2.0,x9_1.0-0.0,x9_1.0-1.0,x9_1.0-2.0,x9_1.0-3.0,x9_2.0-0.0,x9_2.0-1.0,x9_2.0-2.0,x9_3.0-0.0,x9_3.0-1.0,x9_3.0-2.0,x9_3.0-3.0,x9_4.0-1.0,x9_4.0-2.0,x10_6,x10_7,x10_8,x10_9,x11_1.0,x11_2.0,x11_3.0,x11_4.0,x11_5.0,x11_6.0,x11_7.0,x11_8.0,x11_9.0,x11_11.0,x11_12.0,x11_13.0,x11_14.0,x12_0.0-1.0,x12_0.0-2.0,x12_1.0-0.0,x12_1.0-1.0,x12_1.0-2.0,x12_1.0-3.0,x12_2.0-0.0,x12_2.0-1.0,x12_2.0-2.0,x12_3.0-0.0,x12_3.0-1.0,x12_3.0-2.0,x12_3.0-3.0,x12_4.0-1.0,x12_4.0-2.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [321]:
X_test_1 = pd.DataFrame(encoder.transform(X_test_c).todense(),
                   columns=encoder.get_feature_names())

In [322]:
len(X_train_1)

75

In [323]:
X_train_1.shape

(75, 114)

In [324]:
X_test_1.shape

(25, 114)

In [325]:
len(X_test_1)

25

In [95]:
# List of Normalish Distributions
#OPS
#WAR_y
#Nasty Factor
#Drop matchup.pitcher.id or drop all players in general

In [96]:
from keras.layers import Bidirectional, Activation
import numpy as np

In [97]:
X_train_r = np.reshape(np.asarray(X_train_1), (X_train_1.shape[0], 1, X_train_1.shape[1]))
X_test_r = np.reshape(np.asarray(X_test_1), (X_test_1.shape[0], 1, X_test_1.shape[1]))
y_train_r = np.reshape(np.asarray(y_train_c), (y_train_c.shape[0], 1, y_train_c.shape[1]))
y_test_r = np.reshape(np.asarray(y_test_c), (y_test_c.shape[0], 1, y_test_c.shape[1]))

In [98]:
X_train_r.shape

(7500, 1, 4882)

In [99]:
model = Sequential()
model.add(Bidirectional(LSTM(10, return_sequences=True),
                        input_shape=(X_train_r.shape[1], X_train_r.shape[2] )))
model.add(Bidirectional(LSTM(10)))
model.add(Dense(5))
# model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [100]:
# model = Sequential()
# inputs = X_train_1.shape[1]

# model.add(Dense(inputs, activation='relu'))
# model.add(Dense(100, activation='relu'))

# model.add(Dense(3, activation='softmax'))

In [101]:
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [102]:
rnn = Sequential()

inputs = X_train_r.shape[2]

rnn.add(LSTM(inputs, input_shape=(1, inputs), return_sequences=True))
rnn.add(LSTM(200, return_sequences=True))
rnn.add(LSTM(30, return_sequences=True))
rnn.add(Dense(3, activation='softmax'))

In [103]:
rnn.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [104]:
logg= rnn.fit(X_train_r, y_train_r,
          validation_data = (X_test_r, y_test_r), epochs=5)

Train on 7500 samples, validate on 2500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
