In [44]:
import pandas as pd
import numpy as np
import pickle
from keras.preprocessing.sequence import pad_sequences, TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [45]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that this project's own pipeline produced.

# Main data frame used by the rest of the notebook. The context manager
# closes the file handle even if unpickling raises.
with open('../data/processed/scherzer_with_batters.pickle', 'rb') as infile:
    pb = pickle.load(infile)

# NOTE(review): X_train / y_train are not referenced again in the cells
# visible here -- confirm they are still needed.
with open('../data/processed/X_train_trans.pickle', 'rb') as infile:
    X_train = pickle.load(infile)

with open('../data/processed/y_train_trans.pickle', 'rb') as infile:
    y_train = pickle.load(infile)

In [46]:
# Show the column labels of the loaded frame.
pb.columns

Index(['pitch_type', 'game_date_x', 'sv_id', 'batter_id', 'pitch_number',
       'release_speed', 'zone', 'stand', 'home_team', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'inning', 'release_spin_rate', 'opp_score',
       'nats_score', 'if_fielding_alignment', 'of_fielding_alignment',
       'nats_home1_away0', 'balls_strikes', 'all_runners', 'pitch_season',
       'pitch_game', 'pitch_bat_gm', 'game_date_y', 'shift_date',
       'player_name', 'total_pitches', 'hits', 'abs', 'whiffs', 'swings',
       'takes', 'k', 'walk', 'single', 'double', 'triple', 'hr', 'line_drive',
       'ground_ball', 'fly_ball', 'popup', 'rbi', 'sac', 'ba', 'slg', 'iso',
       'babip'],
      dtype='object')

In [71]:
# List the distinct pitch-type codes present in the data.
pb.pitch_type.unique().tolist()

['FF', 'SL', 'FC', 'CH', 'CU']

In [72]:
# Give every distinct pitch type a 1-based integer code, numbered in the
# order the types are returned by unique().
pitch_vals = {
    pitch: code
    for code, pitch in enumerate(pb.pitch_type.unique().tolist(), start=1)
}

In [74]:
# Flag at-bat starts: rows whose pitch_number is 0 get 'yes', all others 'no'.
pb['new_ab'] = (pb.pitch_number == 0).map({True: 'yes', False: 'no'})

# Shift the 1-based pitch_vals codes down to 0-based integers.
pb['pitch_val'] = pb.pitch_type.map(lambda pitch: pitch_vals[pitch] - 1)

pb

Unnamed: 0,pitch_type,game_date_x,sv_id,batter_id,pitch_number,release_speed,zone,stand,home_team,on_3b,...,fly_ball,popup,rbi,sac,ba,slg,iso,babip,new_ab,pitch_val
0,FF,2019-03-28,190328_170717,607043,0,93.7,6.0,L,WSH,0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,yes,0
1,FF,2019-03-28,190328_170732,607043,1,94.2,5.0,L,WSH,0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,no,0
2,FF,2019-03-28,190328_170752,607043,2,96.3,5.0,L,WSH,0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,no,0
3,SL,2019-03-28,190328_170825,624413,0,85.6,6.0,R,WSH,0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,yes,1
4,FF,2019-03-28,190328_170842,624413,1,95.5,12.0,R,WSH,0,...,0,0,0,0,0.000000,0.000000,0.000000,0.000000,no,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2765,CU,2019-09-24,190925_005705,664068,1,79.1,4.0,R,WSH,0,...,76,27,36,3,0.259009,0.481982,0.222973,0.334495,no,4
2766,SL,2019-09-24,190925_005726,664068,2,84.8,14.0,R,WSH,0,...,76,27,36,3,0.259009,0.481982,0.222973,0.334495,no,1
2767,FF,2019-09-24,190925_005750,664068,3,97.4,8.0,R,WSH,0,...,76,27,36,3,0.259009,0.481982,0.222973,0.334495,no,0
2768,CH,2019-09-24,190925_005823,664068,4,84.4,9.0,R,WSH,0,...,76,27,36,3,0.259009,0.481982,0.222973,0.334495,no,3


In [75]:
def group_pitches_by_at_bat(new_ab_flags, pitch_values):
    """Split a chronological pitch stream into one list of pitches per at bat.

    Parameters
    ----------
    new_ab_flags : iterable of str
        'yes' marks the first pitch of an at bat; any other value marks a
        continuation of the current at bat.
    pitch_values : iterable
        Encoded pitch for each row, aligned element-wise with
        ``new_ab_flags``.

    Returns
    -------
    list of list
        One inner list per at bat, in input order. A one-pitch at bat yields
        a single-element list. Empty input yields an empty list.
    """
    at_bats = []
    for flag, pitch in zip(new_ab_flags, pitch_values):
        # Every 'yes' opens a new at bat, so one-pitch at bats and the final
        # row need no look-ahead at the following row. The `not at_bats`
        # guard keeps leading rows that lack a 'yes' marker.
        if flag == 'yes' or not at_bats:
            at_bats.append([])
        at_bats[-1].append(pitch)
    return at_bats


# Walk the two columns positionally so the grouping does not depend on pb
# carrying a 0..n-1 index.
ab_list = group_pitches_by_at_bat(pb['new_ab'].tolist(), pb['pitch_val'].tolist())

ab_list

[[0, 0, 0],
 [1, 0, 0, 2, 0, 3, 2],
 [0, 3, 3],
 [4, 0, 0, 3],
 [0, 1, 0, 3, 4],
 [1, 0, 1, 0],
 [0, 0, 0, 3],
 [3, 0, 0],
 [0, 4, 0, 0, 3, 2],
 [1, 0, 0, 0, 3],
 [0, 0, 2],
 [3, 0, 4, 0, 0, 3, 0],
 [0, 1],
 [0, 0],
 [1, 3, 1, 1, 3, 1],
 [0, 3, 0],
 [3, 3, 0],
 [0, 1, 0],
 [0, 4, 2],
 [3, 3, 3, 0, 4, 2, 0],
 [1, 3],
 [0, 0, 3, 3, 0, 4],
 [0, 3],
 [0, 1, 1, 1, 0],
 [4, 0, 2, 3, 0, 0, 3],
 [0, 4, 3, 3],
 [0, 0],
 [1, 0],
 [0, 2, 0, 2, 3, 3],
 [0, 1, 0, 0, 0, 0, 1],
 [1, 1, 1],
 [0, 2, 0, 3, 0],
 [0, 0, 4, 0, 0],
 [1, 0, 0, 4, 0, 3],
 [0, 1, 1],
 [1, 0, 4],
 [4, 1, 1],
 [2, 0, 0, 0, 0, 3, 2],
 [0, 3, 1, 0, 3, 0],
 [0, 0, 3, 0],
 [0, 4, 0, 4, 4],
 [0, 3],
 [0, 1, 0, 0, 1, 1, 0],
 [0, 1, 0, 3],
 [1, 0, 3],
 [3, 2, 0, 0, 4],
 [3, 1, 3],
 [4, 1, 1, 3, 0],
 [0, 0, 0, 2],
 [0, 0, 0, 3, 3, 0],
 [0, 0, 3, 2, 0, 0],
 [0, 0, 0, 3, 3],
 [0, 0, 1, 0, 1, 1],
 [0, 1, 1, 1],
 [0, 0, 1, 0],
 [0, 0],
 [0, 4, 3, 2, 3, 0],
 [1, 0, 0, 0],
 [1, 0, 0],
 [4, 4, 2, 3, 0, 0, 3],
 [0, 0, 2],
 [1, 1, 0, 0],
 [0, 1,

In [76]:
# Flatten the integer-encoded pitch column into a plain Python list.
data = pb.pitch_val.tolist()

In [77]:
# Index of the first pitch held out for validation; the slice runs to the
# end of the data instead of repeating its length as a literal.
VAL_START = 2000
val_data = data[VAL_START:]

In [78]:
# Total number of encoded pitches.
len(data)

2770

In [79]:
text = data

# Number of consecutive pitches that make up one model input window.
# TODO ideas: make pitches OHE, add five seasons, try gated recurrent
# networks, mind the input layer value; add innings as OHE columns or use
# ordinal values; add the number of pitches for the batter.
SEQUENCE_LENGTH = 3
step = 1

# Slide a window over the pitch stream: each run of SEQUENCE_LENGTH codes is
# an input, and the code immediately after it is the target.
window_starts = range(0, len(text) - SEQUENCE_LENGTH, step)
sentences = [text[start: start + SEQUENCE_LENGTH] for start in window_starts]
next_chars = [text[start + SEQUENCE_LENGTH] for start in window_starts]
print(f'num training examples: {len(sentences)}')

num training examples: 2767


In [80]:
# One-hot encode the windows and their targets.
#   X[i, t, c] is True when pitch t of window i has code c.
#   y[i, c]    is True when the pitch following window i has code c.
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
# NOTE(review): X has 6 feature slots while y has 5 classes; the width is
# kept at 6 because the LSTM input_shape in this notebook expects it --
# confirm whether the extra slot is intentional.
X = np.zeros((len(sentences), SEQUENCE_LENGTH, 6), dtype=bool)
y = np.zeros((len(sentences), 5), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char] = True
    y[i, next_chars[i]] = True

In [81]:
# Display the one-hot encoded input windows.
X

array([[[ True, False, False, False, False, False],
        [ True, False, False, False, False, False],
        [ True, False, False, False, False, False]],

       [[ True, False, False, False, False, False],
        [ True, False, False, False, False, False],
        [False,  True, False, False, False, False]],

       [[ True, False, False, False, False, False],
        [False,  True, False, False, False, False],
        [ True, False, False, False, False, False]],

       ...,

       [[ True, False, False, False, False, False],
        [False, False, False, False,  True, False],
        [False,  True, False, False, False, False]],

       [[False, False, False, False,  True, False],
        [False,  True, False, False, False, False],
        [ True, False, False, False, False, False]],

       [[False,  True, False, False, False, False],
        [ True, False, False, False, False, False],
        [False, False, False,  True, False, False]]])

In [82]:
# Dimensions of the input array: (windows, SEQUENCE_LENGTH, feature slots).
X.shape

(2767, 3, 6)

In [83]:
# Spot-check the one-hot rows of the third window.
X[2]

array([[ True, False, False, False, False, False],
       [False,  True, False, False, False, False],
       [ True, False, False, False, False, False]])

In [84]:
# Spot-check the one-hot target that belongs to the third window.
y[2]

array([ True, False, False, False, False])

In [85]:
# Class balance of the targets: the distinct pitch codes and how often each occurs.
np.unique(next_chars, return_counts=True)

(array([0, 1, 2, 3, 4]), array([1338,  573,  214,  401,  241]))

In [86]:
# Majority-class baseline: the share of targets taken by the most frequent
# pitch code, computed from next_chars rather than from a hand-copied count.
max(next_chars.count(code) for code in set(next_chars)) / len(next_chars)

0.4835561980484279

In [89]:
# A single 128-unit LSTM over (SEQUENCE_LENGTH, 6) one-hot windows, followed
# by a 5-way softmax output layer.
model = Sequential([
    LSTM(128, input_shape=(SEQUENCE_LENGTH, 6)),
    Dense(5, activation='softmax'),
])

In [90]:
# Train with Adam on categorical cross-entropy and track accuracy;
# validation_split=0.1 sets aside a tenth of the samples for validation.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
training_run = model.fit(
    X,
    y,
    validation_split=0.1,
    batch_size=32,
    epochs=20,
    shuffle=True,
)
# Keep only the `history` attribute of the object returned by fit().
history = training_run.history

Train on 2490 samples, validate on 277 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [91]:
# Predicted class probabilities for windows 1 through 4.
model.predict(X[1:5])

array([[0.52008075, 0.32783696, 0.02825764, 0.08109622, 0.04272845],
       [0.4693363 , 0.2640844 , 0.03185396, 0.13211425, 0.10261115],
       [0.4288286 , 0.29250246, 0.04429341, 0.10796937, 0.12640615],
       [0.48071355, 0.1017143 , 0.10608588, 0.2325374 , 0.07894894]],
      dtype=float32)

In [23]:
# True one-hot targets for windows 1 through 4.
# NOTE(review): the saved output of this cell shows 6 columns and carries an
# older execution count (In [23]), while y is built with 5 columns in this
# notebook -- the output looks stale; re-run the cell to refresh it.
y[1:5]

array([[False,  True, False, False, False, False],
       [False,  True, False, False, False, False],
       [False, False, False,  True, False, False],
       [False,  True, False, False, False, False]])