# First Neural Network Experiement


## Pipeline Preparation

In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper, FunctionTransformer, gen_features, pipeline
from sklearn_pandas.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
import os
import pitch_functions
import sklearn

Using TensorFlow backend.


In [2]:
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

from keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [4]:
pd.set_option('display.max_columns', None)

In [9]:
each_pitch_clean = pd.read_csv('../raw_data/cleaned_pitches.csv')

In [10]:
pitch_clean = each_pitch_clean.dropna().copy()

In [11]:
pitch_clean = pitch_clean[:10000]

In [12]:
target = pitch_clean['pitch_type']

In [13]:
predictors = pitch_clean.drop(['pitch_type'], axis=1)

In [9]:
cat_features = list(predictors.select_dtypes(include='object'))
cat_features.extend(['about.inning', 'pitchData.zone', 'count'])

In [99]:
cat_features

['about.halfInning',
 'details.call.description',
 'details.description',
 'matchup.batSide.code',
 'matchup.pitchHand.code',
 'matchup.splits.menOnBase',
 'count',
 'about.inning',
 'pitchData.zone',
 'count']

## Data Prep for Neural Network 
- Game Plan for Today
- Engineer the count column with a string of what the count is - maybe turn each ball and strike into a string and then combine them.
- One Hot Encode the final data frame with all categorical columns 
- Create Final Dataframe for computer work today, then look into creating a neural network on the cloud. 

In [100]:
pitches_dict = {'Fastball': 0, 'Breaking_Ball': 1, 'Changeup': 2}

In [101]:
pitches_numerical = target.map(pitches_dict)

In [5]:
final_df_test = pd.read_csv('../raw_data/all_2018_pitches.csv')

In [6]:
final_df_test = final_df_test.drop(['Unnamed: 0'], axis=1)

In [7]:
target = final_df_test['pitch_type']


In [8]:
predictors = final_df_test.drop(['pitch_type', 'hitter', 'pitcher'], axis=1).copy()

In [10]:
X_train, X_test, y_train, y_test = train_test_split(predictors, 
                                                    target, 
                                                    random_state=10)

In [11]:
print(len(X_train)) 
print(len(y_train))

415206
415206


In [12]:
print(len(X_test)) 
print(len(y_test))

138403
138403


In [13]:
encoder = OneHotEncoder(handle_unknown='ignore', categories='auto')

In [14]:
X_train.columns

Index(['WAR_x', 'WHIP', 'ERA', 'SO', 'SLG', 'OPS', 'WAR_y', 'about.atBatIndex',
       'about.halfInning', 'about.inning', 'details.call.description',
       'details.description', 'matchup.batSide.code', 'matchup.pitchHand.code',
       'matchup.pitcher.id', 'matchup.splits.menOnBase',
       'pitchData.nastyFactor', 'pitchData.zone', 'pitchNumber',
       'prior_pitch_type', 'count'],
      dtype='object')

In [15]:
X_train.head(2)

Unnamed: 0,WAR_x,WHIP,ERA,SO,SLG,OPS,WAR_y,about.atBatIndex,about.halfInning,about.inning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
396000,-0.1,1.5,4.73,66,0.425,0.747,1.9,67,top,8,Ball - Called,Ball,L,L,518617,Empty,62.78,13.0,2.0,1.0,1.0-1.0
106585,0.9,1.34,3.1,53,0.409,0.733,2.5,47,top,6,Ball - Called,Ball In Dirt,R,L,605177,RISP,8.44,13.0,7.0,1.0,4.0-2.0


- New Dataframe with columns from all categories that need to be one-hot-encoded.
- Use the data frame that was created then 

In [16]:
X_train_c = X_train[cat_features]
X_train_c.head()


Unnamed: 0,about.halfInning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.splits.menOnBase,count,about.inning,pitchData.zone,count.1
396000,top,Ball - Called,Ball,L,L,Empty,1.0-1.0,8,13.0,1.0-1.0
106585,top,Ball - Called,Ball In Dirt,R,L,RISP,4.0-2.0,6,13.0,4.0-2.0
43872,top,Strike - Swinging,Swinging Strike,L,R,Empty,0.0-2.0,3,5.0,0.0-2.0
300476,top,Strike - Swinging,Foul,L,L,Empty,2.0-2.0,6,4.0,2.0-2.0
508866,bottom,Ball - Called,Ball,R,R,Empty,2.0-1.0,7,14.0,2.0-1.0


In [17]:
X_test_c = X_test[cat_features]

In [18]:
y_train_c = to_categorical(y_train)

In [19]:
y_test_c = to_categorical(y_test)

In [20]:
encoder.fit(X_train_c)

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [21]:
X_train_1 = pd.DataFrame(encoder.transform(X_train_c).todense(),
                   columns=encoder.get_feature_names())

In [22]:
X_train_1.head(2)

Unnamed: 0,x0_bottom,x0_top,x1_Ball - Called,x1_Hit Into Play - Out(s),x1_Strike - Swinging,x2_Ball,x2_Ball In Dirt,x2_Called Strike,x2_Foul,x2_Foul Bunt,x2_Foul Tip,x2_Hit By Pitch,"x2_In play, no out","x2_In play, out(s)","x2_In play, run(s)",x2_Missed Bunt,x2_Swinging Strike,x2_Swinging Strike (Blocked),x3_L,x3_R,x4_L,x4_R,x5_Empty,x5_Loaded,x5_Men_On,x5_RISP,x6_0.0-1.0,x6_0.0-2.0,x6_0.0-3.0,x6_1.0-0.0,x6_1.0-1.0,x6_1.0-2.0,x6_1.0-3.0,x6_2.0-0.0,x6_2.0-1.0,x6_2.0-2.0,x6_2.0-3.0,x6_3.0-0.0,x6_3.0-1.0,x6_3.0-2.0,x6_3.0-3.0,x6_4.0-0.0,x6_4.0-1.0,x6_4.0-2.0,x6_4.0-3.0,x7_1,x7_2,x7_3,x7_4,x7_5,x7_6,x7_7,x7_8,x7_9,x7_10,x7_11,x7_12,x7_13,x7_14,x7_15,x7_16,x7_17,x8_1.0,x8_2.0,x8_3.0,x8_4.0,x8_5.0,x8_6.0,x8_7.0,x8_8.0,x8_9.0,x8_11.0,x8_12.0,x8_13.0,x8_14.0,x9_0.0-1.0,x9_0.0-2.0,x9_0.0-3.0,x9_1.0-0.0,x9_1.0-1.0,x9_1.0-2.0,x9_1.0-3.0,x9_2.0-0.0,x9_2.0-1.0,x9_2.0-2.0,x9_2.0-3.0,x9_3.0-0.0,x9_3.0-1.0,x9_3.0-2.0,x9_3.0-3.0,x9_4.0-0.0,x9_4.0-1.0,x9_4.0-2.0,x9_4.0-3.0
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
X_test_1 = pd.DataFrame(encoder.transform(X_test_c).todense(),
                   columns=encoder.get_feature_names())

In [24]:
len(X_train_1)

415206

In [25]:
X_train_1.shape

(415206, 94)

In [26]:
X_test_1.shape

(138403, 94)

In [27]:
len(X_test_1)

138403

In [28]:
# List of Normalish Distributions
#OPS
#WAR_y
#Nasty Factor
#Drop matchup.pitcher.id or drop all players in general

In [29]:
from keras.layers import Bidirectional, Activation
import numpy as np

In [30]:
X_train_r = np.reshape(np.asarray(X_train_1), (X_train_1.shape[0], 1, X_train_1.shape[1]))
X_test_r = np.reshape(np.asarray(X_test_1), (X_test_1.shape[0], 1, X_test_1.shape[1]))
y_train_r = np.reshape(np.asarray(y_train_c), (y_train_c.shape[0], 1, y_train_c.shape[1]))
y_test_r = np.reshape(np.asarray(y_test_c), (y_test_c.shape[0], 1, y_test_c.shape[1]))

In [31]:
X_train_r.shape

(415206, 1, 94)

In [32]:
y_train_r.shape

(415206, 1, 2)

In [33]:
X_test_r.shape

(138403, 1, 94)

In [34]:
y_test_r.shape

(138403, 1, 2)

In [35]:
# model = Sequential()
# model.add(Bidirectional(LSTM(10, return_sequences=True),
#                         input_shape=(X_train_r.shape[1], X_train_r.shape[2] )))
# model.add(Bidirectional(LSTM(10)))
# model.add(Dense(5))
# # model.add(Activation('softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [36]:
# model = Sequential()
# inputs = X_train_1.shape[1]

# model.add(Dense(inputs, activation='relu'))
# model.add(Dense(100, activation='relu'))

# model.add(Dense(3, activation='softmax'))

In [37]:
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [47]:
rnn = Sequential()

inputs = X_train_r.shape[2]

rnn.add(LSTM(inputs, input_shape=(1, inputs), return_sequences=True))
rnn.add(LSTM(200, return_sequences=True))
rnn.add(LSTM(30, return_sequences=True))
rnn.add(Dense(2, activation='softmax'))

In [48]:
rnn.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [49]:
logg= rnn.fit(X_train_r, y_train_r,
          validation_data = (X_test_r, y_test_r), epochs=10)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 415206 samples, validate on 138403 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [55]:
test_data1 = pd.read_csv('../raw_data/2019_test_pitches.csv')


In [56]:
test_data = test_data1.drop(['Unnamed: 0'], axis=1)

In [57]:
test_predictors = test_data.drop(['pitch_type'], axis=1).copy()

In [58]:
testing = test_predictors[-442:-441]
testing

Unnamed: 0,pitcher,WAR_x,WHIP,ERA,SO,hitter,SLG,OPS,WAR_y,about.atBatIndex,about.halfInning,about.inning,details.call.description,details.description,matchup.batSide.code,matchup.pitchHand.code,matchup.pitcher.id,matchup.splits.menOnBase,pitchData.nastyFactor,pitchData.zone,pitchNumber,prior_pitch_type,count
147932,John Means,-0.2,1.8,13.5,4,Jeff Mathis,0.272,0.544,0.2,40,bottom,5,Strike - Swinging,Called Strike,R,L,607644,Men_On,35.326612,6.0,3.0,0.0,2.0-1.0


In [60]:
testy = np.reshape(np.asarray(testing), (testing.shape[0], 1,testing.shape[1]))

In [62]:
logg.model.predict

ValueError: Error when checking input: expected lstm_10_input to have shape (1, 94) but got array with shape (1, 23)