# Swing Outcome Model: data cleaning, model training, and testing

In [1]:
# Import appropriate libraries

import pandas, json, requests, urllib, io
import tensorflow as tf
from tensorflow import keras        
import numpy as np        
from tensorflow.keras import models, layers, Input, optimizers, callbacks
import matplotlib.pyplot as plt

In [2]:
# Gets the pitch data csv
def get_total_pitch_data():

  pitchData = pandas.read_csv("all_pitch.csv", error_bad_lines=False)
  pitchData = pitchData.rename({"res":"result"}, axis = 1)
  del pitchData["zone"]
  pitchData["zone"] = pitchData['zones']
  pitchData["zones"] = pitchData['zones'].map(lambda x: x.rstrip('aAbB')).astype(int)
  
  return pitchData


total_pitch_data = get_total_pitch_data()



  pitchData = pandas.read_csv("all_pitch.csv", error_bad_lines=False)


In [3]:
# Gets at-bat data csv 

def get_at_bat_data():
    at_bats = pandas.read_csv("all_atbat.csv")
    #get rid of Sac Bunt, Intentional walk, Runner Out, Batter Interference, Catcher Interference because not included when computing OBP
    at_bat_remove_events = ['Sac Bunt', 'Intent Walk', 'Runner Out', 'Batter Interference', 'Catcher Interference']

    at_bats = at_bats.loc[~at_bats['event'].isin(at_bat_remove_events)]
    
    
    return at_bats
at_bats = get_at_bat_data()

In [4]:
# Creates a map from pitcher ID to their tensor (without normalized values)
def get_pitcher_tensors(df_pitch):
  pitcher_tensors = {}
  pitch_types = ["FF", "FT", "CU", "CH", "FC", "SL"]

  pitchers = df_pitch['pitcher_id'].unique()
  
  zones = df_pitch['zones'].unique()
  zones.sort()
  
  for pitcher in pitchers:
    pitcher_tensor = [0] * len(pitch_types)

    df_pitcher = df_pitch.loc[df_pitch['pitcher_id'] == pitcher]

    num_pitches = df_pitcher.shape[0]

    for i, pitch in enumerate(pitch_types):
      pitcher_tensor[i] = [0] * len(zones)
      df_pitch_type = df_pitcher.loc[df_pitcher['pitch_type'] == pitch]

      for zone in zones:
        pitcher_tensor[i][zone] = [0] * 2

        if df_pitch_type.shape[0] != 0: 
          df_pitch_type_zone = df_pitch_type.loc[df_pitch_type['zones'] == zone]
          num_pitch_type_in_zone = df_pitch_type_zone.shape[0]

          if num_pitch_type_in_zone != 0: 
            proportion_in_zone = num_pitch_type_in_zone / num_pitches 
            avg_speed = df_pitch_type_zone['start_speed'].mean()

            pitcher_tensor[i][zone][0] = proportion_in_zone
            pitcher_tensor[i][zone][1] = avg_speed

    pitcher_tensors[pitcher] = pitcher_tensor

  return pitcher_tensors

pitcher_tensors = get_pitcher_tensors(total_pitch_data)

In [5]:
# Computes the average and standard deviation in speed for each pitch type 
def get_mean_std_for_pitches(df_pitch): 
  pitch_types = ["FF", "FT", "CU", "CH", "FC", "SL"]
  pitcher_means = []  
  pitchers = df_pitch['pitcher_id'].unique()

  for pitcher in pitchers:
    means = [0] * len(pitch_types) 
    df_pitcher = df_pitch.loc[df_pitch['pitcher_id'] == pitcher]
    for i, pitch in enumerate(pitch_types):
      pitches = df_pitcher.loc[df_pitcher['pitch_type'] == pitch]
      mean_speed = 0
      if pitches.shape[0] != 0: 
        mean_speed = pitches['start_speed'].mean()
      means[i] = mean_speed 
    pitcher_means.append(means)

  mean_speeds = np.mean(pitcher_means, axis=(0), keepdims=True)
  std_speeds = np.std(pitcher_means, axis=(0), keepdims=True)

  return mean_speeds, std_speeds

mean_speeds, std_speeds = get_mean_std_for_pitches(total_pitch_data)

In [6]:
# Normalizes pitch speed in the pitcher tensors by (speed-mean)/std
def normalize_pitch_speed(pitcher_tensors_dict, mean_speeds_array, std_speeds_array):
  return_dict = pitcher_tensors_dict.copy()
  pitch_types = ["FF", "FT", "CU", "CH", "FC", "SL"]
  zones = list(range(17))
  for pitcher, tensor in return_dict.items():
    for i, pitch in enumerate(pitch_types):
      mean_speed = mean_speeds_array[0][i]
      std_speed = std_speeds_array[0][i]
      for zone in zones: 
        normalized_value = (tensor[i][zone][1] - mean_speed) / std_speed
        tensor[i][zone][1] = normalized_value
    return_dict[pitcher] = tensor
  
  return return_dict

pitcher_tensors = normalize_pitch_speed(pitcher_tensors, mean_speeds, std_speeds)

In [7]:
# Reshapes pitcher tensors from (6,17,2) to (5,5,12)
def reshape_pitcher_tensors(pitcher_tensors):
    new_pitcher_tensors = {}
    zone_index_map = {
        0:(1,1), #(x,y)
        1:(2,1),
        2:(3,1),
        3:(1,2),
        4:(2,2),
        5:(3,2),
        6:(1,3),
        7:(2,3),
        8:(3,3),
        9:(0,0),
        10:(np.s_[1:4],0),
        11:(4,0),
        12:(0,np.s_[1:4]),
        13:(4,np.s_[1:4]),
        14:(0,4),
        15:(np.s_[1:4],4),
        16:(4,4)
    }

    for pitcher_key in pitcher_tensors.keys():
        pitcher = np.array(pitcher_tensors[pitcher_key])
    
        new_tensor = np.zeros((0,5,5))
        for pitch in pitcher:
            #print(pitch.shape)
            zone_matrix = np.zeros((2,5,5))
            
            for i, zone in enumerate(pitch):
                index = zone_index_map[i]
                #print(index)
                #print(index[0],index[1])
                #print(zone[0],zone[1])
                zone_matrix[0,index[1],index[0]]= zone[0] #follows (...,row,column)
                zone_matrix[1,index[1],index[0]]= zone[1]
            #print(zone_matrix.shape)
            new_tensor = np.concatenate((new_tensor,zone_matrix),axis=0)
            
      
        new_tensor = new_tensor.reshape((5,5,12))
    
        new_pitcher_tensors[pitcher_key]=new_tensor
            
        
 


    return new_pitcher_tensors

pitcher_tensors = reshape_pitcher_tensors(pitcher_tensors)

In [8]:
# Creates a map from batter ID to batter tensor 
def get_batter_tensors(df_pitch):
  batter_tensors = {}

  pitch_types = ["FF", "FT", "CU", "CH", "FC", "SL"]
  batters = df_pitch['batter_id'].unique()

  zones = df_pitch['zones'].unique()
  zones.sort()

  for batter in batters: 
    batter_tensor = [0] * len(pitch_types)

    df_batter = df_pitch.loc[df_pitch['batter_id'] == batter]
    
    for i, pitch in enumerate(pitch_types):
      batter_tensor[i] = [0] * len(zones)

      df_pitch_type = df_batter.loc[df_batter['pitch_type'] == pitch]
      for zone in zones: 
        batter_tensor[i][zone] = [0] * 2

        if df_pitch_type.shape[0] != 0: 
          df_pitch_type_zone = df_pitch_type.loc[df_pitch_type['zones'] == zone]
          num_pitch_type_in_zone = df_pitch_type_zone.shape[0]

          if num_pitch_type_in_zone != 0: 
            df_swing = df_pitch_type_zone.loc[df_pitch_type_zone['swing'] == 1]
            num_swing = df_swing.shape[0]
            proportion_swing = num_swing / num_pitch_type_in_zone

            num_hit = df_swing.loc[df_swing['result'] == 'Hit'].shape[0]
            proportion_hit = num_hit / num_pitch_type_in_zone

            batter_tensor[i][zone][0] = proportion_swing
            batter_tensor[i][zone][1] = proportion_hit
    batter_tensors[batter] = batter_tensor
  return batter_tensors

batter_tensors = get_batter_tensors(total_pitch_data)

In [9]:
# Reshapes batter tensors from (6,17,2) to (5,5,12)
def reshape_batter_tensors(batter_tensors):
    new_batter_tensors = {}
    zone_index_map = {
        0:(1,1), #(x,y)
        1:(2,1),
        2:(3,1),
        3:(1,2),
        4:(2,2),
        5:(3,2),
        6:(1,3),
        7:(2,3),
        8:(3,3),
        9:(0,0),
        10:(np.s_[1:4],0),
        11:(4,0),
        12:(0,np.s_[1:4]),
        13:(4,np.s_[1:4]),
        14:(0,4),
        15:(np.s_[1:4],4),
        16:(4,4)
    }

    for batter_key in batter_tensors.keys():
        batter = np.array(batter_tensors[batter_key])
    
        new_tensor = np.zeros((0,5,5))
        for pitch in batter:
            #print(pitch.shape)
            zone_matrix = np.zeros((2,5,5))
            
            for i, zone in enumerate(pitch):
                index = zone_index_map[i]
                #print(index)
                #print(index[0],index[1])
                #print(zone[0],zone[1])
                zone_matrix[0,index[1],index[0]]= zone[0]
                zone_matrix[1,index[1],index[0]]= zone[1]
            #print(zone_matrix.shape)
            new_tensor = np.concatenate((new_tensor,zone_matrix),axis=0)
            
        new_tensor = new_tensor.reshape((5,5,12))
        new_batter_tensors[batter_key]=new_tensor
            
        
    
    return new_batter_tensors

batter_tensors = reshape_batter_tensors(batter_tensors)


In [10]:
# Saves tensors to json files
def save_tensors(pitcher_tensors, batter_tensors):
    #enumrate over keys in pitcher tensors, making them into strings
    new_pitcher_tensors = {}
    for key in pitcher_tensors.keys():
        new_pitcher_tensors[str(key)] = pitcher_tensors[key].tolist()
        
        
    #enumrate over keys in batter tensors, making them into strings
    new_batter_tensors = {}
    for key in batter_tensors.keys():
        new_batter_tensors[str(key)] = batter_tensors[key].tolist()
    #WORRIED THAT SERIALIZING MAY NOT PRESERVE LIST ORDER
    with open("pitcher_tensors.json", "w") as write_file:
        json.dump(new_pitcher_tensors, write_file)
    with open("batter_tensors.json", "w") as write_file:
        json.dump(new_batter_tensors, write_file)

save_tensors(pitcher_tensors, batter_tensors)

In [11]:
# Returns tensor for pitch of shape (5,5,6)
def get_pitch_tensor(zone, pitch_type):
    pitch_types = ["FF", "FT", "CU", "CH", "FC", "SL"]
    pitch_tensor = np.zeros((5,5,6))
    p_ind = pitch_types.index(pitch_type)
    zone_index_map = {
        0:(1,1), #(x,y)
        1:(2,1),
        2:(3,1),
        3:(1,2),
        4:(2,2),
        5:(3,2),
        6:(1,3),
        7:(2,3),
        8:(3,3),
        9:(0,0),
        10:(np.s_[1:4],0),
        11:(4,0),
        12:(0,np.s_[1:4]),
        13:(4,np.s_[1:4]),
        14:(0,4),
        15:(np.s_[1:4],4),
        16:(4,4)
    }
    pitch_tensor[zone_index_map[zone][0],zone_index_map[zone][1], p_ind] = 1
   
    return pitch_tensor
    

In [12]:
# Creates dataframe of pitche filtered by batter swinging and pitch not landing in obvious zone
swing_df = total_pitch_data[(total_pitch_data["swing"]==1) & (total_pitch_data["zone"][:-1]!="b")]

In [None]:
# Iterates through all at bats and creates 3 lists of the pitcher tensor, batter tensor, and outcome
def create_input_output_tensors(pitcher_tensors, batter_tensors, at_bats, pitches):

  outcome_list = ['Strike', 'Foul', 'Strikeout', 'Groundout','Single', 'Double', 'Triple', 'Home Run']

  pitcher = []
  batter = []
  outcome = []
  s_count = []
  b_count = []
  ids = []
  pitch = []

  for i, row in pitches.iterrows():
    print(i)
    res = np.zeros(8)
    if row.pitcher_id in pitcher_tensors.keys() and row.batter_id in batter_tensors.keys():
        pitcher.append(pitcher_tensors[row.pitcher_id])
        batter.append(batter_tensors[row.batter_id])
        ids.append((row.pitcher_id, row.batter_id, i))
        res[outcome_list.index(row.result)]=1
        outcome.append(res)
        s_count.append(row.s_count)
        b_count.append(row.b_count)
        pitch.append(get_pitch_tensor(row.zones,row.pitch_type))

  data = {
      'Pitcher' : pitcher,
      'Batter' : batter,
      'Outcome' : outcome,
      'IDs' : ids,
      's_count': s_count,
      'b_count': b_count,
      'Pitch': pitch
      
  }

  return data

input_output_tensors = create_input_output_tensors(pitcher_tensors, batter_tensors, at_bats, swing_df)

1
2
5
7
10
11
12
13
15
16
17
18
20
21
23
24
26
32
33
35
36
37
38
42
43
45
46
47
48
49
50
51
52
54
58
60
64
65
66
67
69
71
74
76
77
78
80
85
89
91
92
95
98
99
103
105
111
115
118
119
120
121
122
124
125
129
130
132
137
138
144
147
149
150
151
153
154
155
156
157
158
160
161
162
163
166
168
174
177
178
180
181
182
183
184
185
190
191
192
197
198
199
202
203
205
207
208
210
211
213
214
215
217
218
220
221
225
227
229
232
234
239
240
241
242
243
246
247
249
250
255
258
259
261
264
265
268
273
274
275
277
281
284
285
286
288
292
293
302
303
307
309
311
313
316
317
318
320
325
326
327
330
332
338
341
343
348
349
351
353
356
360
363
365
366
370
372
375
376
378
380
383
385
386
389
392
393
396
397
399
400
403
406
407
409
410
413
414
415
416
418
419
421
422
426
427
431
432
434
435
438
439
440
443
448
451
454
455
457
458
459
463
467
469
470
471
475
476
479
483
484
485
486
488
489
491
492
497
500
503
504
505
508
509
513
514
515
516
518
520
522
525
526
527
535
538
539
540
541
551
553
554
556
558
56

In [None]:
# Convert lists to np arrays 
pitcher_inputs = np.array(input_output_tensors['Pitcher'])
batter_inputs = np.array(input_output_tensors['Batter'])
outcome = np.array(input_output_tensors['Outcome'])
s_count = np.array(input_output_tensors['s_count'])
b_count = np.array(input_output_tensors['b_count'])
ids = input_output_tensors['IDs']
pitch_inputs = np.array(input_output_tensors['Pitch'])

In [None]:
# Create a dictionary of tensors
input_output_tensors = {
    'Pitcher' : pitcher_inputs,
    'Batter' : batter_inputs,
    'Outcome' : outcome,
    'IDs' : ids,
    's_count' : s_count,
    'b_count' : b_count,
    'Pitch' : pitch_inputs
}

In [16]:
# Returns lists of training, validation, and test indicies within swing_df
def get_train_val_test_indices(swing_df):
    unique_matchup_df = swing_df[["pitcher_id","batter_id"]].drop_duplicates()
    unique_matchups = unique_matchup_df.values
    matchups = {}
    num_matchups = len(unique_matchups)
    print("%s matchups" %num_matchups)
    for i, matchup in enumerate(unique_matchups):
        hashed = (matchup[0],matchup[1])
        if i < int(num_matchups*.7):
            matchups[hashed] = "train"
        elif int(i < num_matchups*.85):
            matchups[hashed] = "test"
 
        else:
            matchups[hashed] = "val"
   
   
    train_val_test_indices = {
        'train':[],
        'validation':[],
        'test':[]
    }
    row_count = 0
    for i, row in swing_df.iterrows():
      
            pitcher = row.pitcher_id
            batter = row.batter_id
            hashed = (pitcher, batter)
            
            if matchups[hashed] == "train":
  
                train_val_test_indices["train"].append(row_count)
            elif matchups[hashed] == "val":
  
                train_val_test_indices["validation"].append(row_count)
            elif matchups[hashed] == "test":
   
                train_val_test_indices["test"].append(row_count)
     
            row_count += 1
                
            
    
    
    return train_val_test_indices
    
train_val_test_indices = get_train_val_test_indices(swing_df)

train_indices = train_val_test_indices['train']
val_indices = train_val_test_indices['validation']
test_indices = train_val_test_indices['test']

In [17]:
# Splits the data into the three sets based on the gotten indices
def get_train_val_test_sets(train_indices, val_indices, test_indices, tensors):
  p_train = []
  p_test = []
  p_val = []

  b_train = []
  b_test = []
  b_val = []

  o_train = []
  o_test = []
  o_val = []

  ids_train = []
  ids_val = []
  ids_test = []
    
  sc_train = []
  sc_val = []
  sc_test = []

  bc_train = []
  bc_val = []
  bc_test = []
    
  pitch_train = []
  pitch_val = []
  pitch_test = []

  
  for i in train_indices:
    if i < len(tensors['Pitcher']):
        p_train.append(tensors['Pitcher'][i])
        b_train.append(tensors['Batter'][i])
        o_train.append(tensors['Outcome'][i])
        ids_train.append(tensors['IDs'][i])
        sc_train.append(tensors['s_count'][i])
        bc_train.append(tensors['b_count'][i])
        pitch_train.append(tensors['Pitch'][i])

  for j in val_indices:
    if j < len(tensors['Pitcher']):
        p_val.append(tensors['Pitcher'][j])
        b_val.append(tensors['Batter'][j])
        o_val.append(tensors['Outcome'][j])
        ids_val.append(tensors['IDs'][j])
        sc_val.append(tensors['s_count'][j])
        bc_val.append(tensors['b_count'][j])
        pitch_val.append(tensors['Pitch'][j])

  for k in test_indices:
    if k < len(tensors['Pitcher']):
        p_test.append(tensors['Pitcher'][k])
        b_test.append(tensors['Batter'][k])
        o_test.append(tensors['Outcome'][k])
        ids_test.append(tensors['IDs'][k])
        sc_test.append(tensors['s_count'][k])
        bc_test.append(tensors['b_count'][k])
        pitch_test.append(tensors['Pitch'][k])

  sets = {
      'Training' : {
          'Pitcher' : p_train,
          'Batter' : b_train,
          'Outcome' : o_train,
          'IDs' : ids_train,
          's_count' : sc_train,
          'b_count' : bc_train,
          'Pitch' : pitch_train
      },
      'Validation' : {
          'Pitcher' : p_val,
          'Batter' : b_val,
          'Outcome' : o_val,
          'IDs' : ids_val,
          's_count' : sc_val,
          'b_count' : bc_val,
          'Pitch' : pitch_val
      },
      'Test' : {
          'Pitcher' : p_test,
          'Batter' : b_test,
          'Outcome' : o_test,
          'IDs' : ids_test,
          's_count' : sc_test,
          'b_count' : bc_test,
          'Pitch' : pitch_test
      }
  }
  return sets

input_output_tensors = get_train_val_test_sets(train_indices, val_indices, test_indices, input_output_tensors)

In [18]:
# Create list objects from dictionary of tensors

pitcher_train = input_output_tensors['Training']['Pitcher']
batter_train = input_output_tensors['Training']['Batter']
outcome_train = input_output_tensors['Training']['Outcome']
ids_train = input_output_tensors['Training']['IDs']
sc_train = input_output_tensors['Training']['s_count']
bc_train = input_output_tensors['Training']['b_count']
pitch_set_train = input_output_tensors['Training']['Pitch']

pitcher_val = input_output_tensors['Validation']['Pitcher']
batter_val = input_output_tensors['Validation']['Batter']
outcome_val = input_output_tensors['Validation']['Outcome']
ids_val = input_output_tensors['Validation']['IDs']
sc_val = input_output_tensors['Validation']['s_count']
bc_val = input_output_tensors['Validation']['b_count']
pitch_set_val = input_output_tensors['Validation']['Pitch']

pitcher_test = input_output_tensors['Test']['Pitcher']
batter_test = input_output_tensors['Test']['Batter']
outcome_test = input_output_tensors['Test']['Outcome']
ids_test = input_output_tensors['Test']['IDs']
sc_test = input_output_tensors['Test']['s_count']
bc_test = input_output_tensors['Test']['b_count']
pitch_set_test = input_output_tensors['Test']['Pitch']

In [19]:
# Shuffle order of training data each time before training the model 
def randomize_order(p_tens, b_tens, out_tens, ids, sc_tens, bc_tens, pitch_tens):
  together = list(zip(list(p_tens), list(b_tens), list(out_tens), list(ids), list(sc_tens), list(bc_tens), list(pitch_tens)))
  import random
  random.shuffle(together)
  p_tens, b_tens, out_tens, ids, sc_tens, bc_tens, pitch_tens = zip(*together)
  return list(p_tens), list(b_tens), list(out_tens), list(ids), list(sc_tens), list(bc_tens), list(pitch_tens)

pitcher_train, batter_train, outcome_train, ids_train, sc_train, bc_train, pitch_set_train = randomize_order(pitcher_train, batter_train, outcome_train, ids_train, sc_train, bc_train, pitch_set_train)


In [20]:
pitcher_train, batter_train, outcome_train, sc_train, bc_train, pitch_set_train = np.array(pitcher_train), np.array(batter_train), np.array(outcome_train), np.array(sc_train), np.array(bc_train), np.array(pitch_set_train)

pitcher_val, batter_val, outcome_val, sc_val, bc_val, pitch_set_val = np.array(pitcher_val), np.array(batter_val), np.array(outcome_val), np.array(sc_val), np.array(bc_val), np.array(pitch_set_val)

pitcher_test, batter_test, outcome_test, sc_test, bc_test, pitch_set_test = np.array(pitcher_test), np.array(batter_test), np.array(outcome_test), np.array(sc_test), np.array(bc_test), np.array(pitch_set_test)

In [21]:
## DEFINE THE MODEL

#Pitcher layers
p_input = Input(shape=(5, 5, 12), dtype='float32', name='pitcher')
p_conv_1 = layers.Conv2D(32, (4, 4), activation='relu', padding='same', name='p_conv_1')(p_input)
p_conv_2 = layers.Conv2D(32, (3, 3), activation='relu', padding='same', name='p_conv_2')(p_conv_1)
p_conv_3 = layers.Conv2D(32, (2, 2), activation='relu', padding='same', name='p_conv_3')(p_conv_2)
p_maxpool_1 = layers.MaxPooling2D((2,2), padding='same', name='p_maxpool_1')(p_conv_1)
p_conv_4 = layers.Conv2D(64, (2,5), activation='relu', padding='same', name='p_conv_4')(p_maxpool_1)
p_maxpool_2 = layers.MaxPooling2D((2,2), padding='same', name='p_maxpool_2')(p_conv_4)
p_conv_5 = layers.Conv2D(64, (2,4), activation='relu', padding='same', name='p_conv_5')(p_maxpool_2)
p_maxpool_3 = layers.MaxPooling2D((2,2), padding='same', name='p_maxpool_3')(p_conv_5)
p_conv_6 = layers.Conv2D(64, (2,3), activation='relu', padding='same', name='p_conv_6')(p_maxpool_3)
p_maxpool_4 = layers.MaxPooling2D((2,2), padding='same', name='p_maxpool_4')(p_conv_6)
p_conv_7 = layers.Conv2D(64, (2,2), activation='relu', padding='same', name='p_conv_7')(p_maxpool_4)
p_maxpool_5 = layers.MaxPooling2D((2,2), padding='same', name='p_maxpool_5')(p_conv_7)
flat_pitcher = layers.Flatten(name='flatten_pitcher')(p_maxpool_5)
 
 
#Batter layers
b_input = Input(shape=(5, 5, 12), dtype='float32', name='batter')
b_conv_1 = layers.Conv2D(32, (4, 4), activation='relu', padding='same', name='b_conv_1')(b_input)
b_conv_2 = layers.Conv2D(32, (3, 3), activation='relu', padding='same', name='b_conv_2')(b_conv_1)
b_conv_3 = layers.Conv2D(32, (2, 2), activation='relu', padding='same', name='b_conv_3')(b_conv_2)
b_maxpool_1 = layers.MaxPooling2D((2,2), padding='same', name='b_maxpool_1')(b_conv_1)
b_conv_4 = layers.Conv2D(64, (2,3), activation='relu', padding='same', name='b_conv_4')(b_maxpool_1)
b_maxpool_2 = layers.MaxPooling2D((2,2), padding='same', name='b_maxpool_2')(b_conv_4)
b_conv_5 = layers.Conv2D(64, (3,3), activation='relu', padding='same', name='b_conv_5')(b_maxpool_2)
b_maxpool_3 = layers.MaxPooling2D((2,2), padding='same', name='b_maxpool_3')(b_conv_5)
b_conv_6 = layers.Conv2D(64, (2,3), activation='relu', padding='same', name='b_conv_6')(b_maxpool_3)
b_maxpool_4 = layers.MaxPooling2D((2,2), padding='same', name='b_maxpool_4')(b_conv_6)
b_conv_7 = layers.Conv2D(64, (2,2), activation='relu', padding='same', name='b_conv_7')(b_maxpool_4)
b_maxpool_5 = layers.MaxPooling2D((2,2), padding='same', name='b_maxpool_5')(b_conv_7)
flat_batter = layers.Flatten(name='flatten_batter')(b_maxpool_5)

#Strike Count Layer
strike_count = Input(shape=(1,), dtype='float32', name = "strike_count")
#Ball Count Layer
ball_count = Input(shape=(1,), dtype='float32', name = "ball_count")

#Pitch Count Layer
pitch_input = Input(shape=(5,5,6), dtype='float32', name = 'pitch_input')
pitch_conv_1 = layers.Conv2D(32, (3, 3), activation='relu', padding='same', name='pitch_conv_1')(pitch_input)
pitch_maxpool_1 = layers.MaxPooling2D((2,2), padding='same', name='pitch_maxpool_1')(pitch_conv_1)
flat_pitch = layers.Flatten(name = "flatten_pitch")(pitch_maxpool_1)
#Concatenate layer
concatenated = layers.concatenate([flat_pitcher, flat_batter, strike_count, ball_count, flat_pitch], name='concat')

#Dense layer
dense_1 = layers.Dense(128, activation='sigmoid')(concatenated)
dense_2 = layers.Dense(64, activation='sigmoid')(dense_1)
dense_3 = layers.Dense(32, activation='sigmoid')(dense_2)


 
#Output layer
output = layers.Dense(8, activation='softmax')(dense_3)
 
network = models.Model([p_input, b_input, strike_count, ball_count, pitch_input], output, name='Pred_Transitions')
network.summary()

Model: "Pred_Transitions"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 pitcher (InputLayer)           [(None, 5, 5, 12)]   0           []                               
                                                                                                  
 batter (InputLayer)            [(None, 5, 5, 12)]   0           []                               
                                                                                                  
 p_conv_1 (Conv2D)              (None, 5, 5, 32)     6176        ['pitcher[0][0]']                
                                                                                                  
 b_conv_1 (Conv2D)              (None, 5, 5, 32)     6176        ['batter[0][0]']                 
                                                                                   

In [22]:

# Set up optimizer and callback functions 
opt = optimizers.Adam(learning_rate=0.001)
# Compile and train the network 
network.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

history = network.fit([pitcher_train, batter_train, sc_train, bc_train, pitch_set_train], outcome_train, epochs=40, batch_size=512, validation_data=([pitcher_val, batter_val, sc_val, bc_val, pitch_set_val], outcome_val))


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [23]:
# Save model
network.save("transition_model_2015-2018.h5")

In [24]:
# Load model and generate output for test set

network_test = keras.models.load_model("transition_model_2015-2018.h5")



pred_results = []    

pred_res = network_test.predict([pitcher_test,batter_test,sc_test,bc_test,pitch_set_test])

    
transitions_dict={
    "pitcher_test":pitcher_test.tolist(),
    "batter_test":batter_test.tolist(),
    "ids_test":ids_test,
    "outcome_test":outcome_test.tolist(),
    "sc_test":sc_test.tolist(),
    "bc_test":bc_test.tolist(),
    "pitch_set_test":pitch_set_test.tolist(),
    "pred_test":pred_res.tolist()
}
    
with open("transition_test_predictions.json", "w") as outfile: 
    json.dump(transitions_dict, outfile)

In [25]:
# Print mean of outcomes (stike, foul, out, hit)

np.mean(outcome_test, axis = 0)

array([0.25734274, 0.3784464 , 0.13411457, 0.10733496, 0.0784985 ,
       0.02457391, 0.00273344, 0.01695546])

In [26]:
# Open test set and aggregate outcomes across player skill matchups

with open('transition_test_predictions.json') as json_file:
    test_set = json.load(json_file)

with open('thirds.json') as json_file:
    thirds = json.load(json_file)
    
pitcher_test=np.array(test_set["pitcher_test"])
batter_test=np.array(test_set["batter_test"])
ids_test=np.array(test_set["ids_test"])
outcome_test=np.array(test_set["outcome_test"])
sc_test=np.array(test_set["sc_test"])
bc_test=np.array(test_set["bc_test"])
pitch_set_test=np.array(test_set["pitch_set_test"])
pred_test=np.array(test_set["pred_test"])
#Aggregate between thirds
results = {}    #indexed by pitcher, batter
for i in range(len(pitcher_test)):
    if i % 10000 == 0: print(i)
    pitcher = pitcher_test[i]
    pitcher_id = str(ids_test[i][0])
    batter_id = str(ids_test[i][1])
    batter = batter_test[i]
    pitch = pitch_set_test[i]
    sc = sc_test[i]
    bc = bc_test[i]
    res = outcome_test[i]
    pred_res = pred_test[i]
    #print(pred_res)
    if pitcher_id in thirds["pitchers"].keys() and batter_id in thirds["batters"].keys():
        p_perc = thirds["pitchers"][pitcher_id]
        b_perc = thirds["batters"][batter_id]
        if p_perc not in results.keys():
            results[p_perc] = {}
        if b_perc not in results[p_perc].keys():
            results[p_perc][b_perc] = {"emp":[],"pred":[]}
        results[p_perc][b_perc]["emp"].append(res.tolist())
        results[p_perc][b_perc]["pred"].append(pred_res.tolist())
        
for p_perc in results.keys():
    for b_perc in results[p_perc].keys():
        #print(results[p_perc][b_perc]["pred"])
        pred_mean = np.mean(results[p_perc][b_perc]["pred"],axis=0)
        emp_mean = np.mean(results[p_perc][b_perc]["emp"],axis=0)
        
        print("Pitcher percentile %s, Batter percentile %s" %(p_perc, b_perc))
        print("PRED")
        print(pred_mean)
        print("EMP")
        print(emp_mean)
        print("-----")
        

FileNotFoundError: [Errno 2] No such file or directory: 'thirds.json'

In [None]:
# Evaluate model accuracy

network_test = keras.models.load_model("transition_model_2015-2018.h5")
    
network_test.evaluate([pitcher_test, batter_test, sc_test, bc_test, pitch_set_test], outcome_test, batch_size = 1)

In [None]:
with open("transition_test_results.json", "w") as outfile: 
    json.dump(results, outfile)

In [None]:
# Import plotting libraries and set format for Latex

import matplotlib
import matplotlib.pyplot as plt
matplotlib.use("pdf")
matplotlib.rcParams.update({
    'font.family': 'serif',
})

In [None]:
# Map group number to "weak"/"average"/"strong"

group_map = {
    '0':'Weak',
    '1':'Average',
    '2':'Strong'
}

In [None]:
# Plot outcomes by player skill matchup

with open("transition_thirds_test.json") as json_file:
    results = json.load(json_file)

# Quality: string "AB", where A = pitcher quality and B = batter quality; A,B is from {"0","1","2"}
def get_outcomes_by_quality(quality, results):
    p_perc = quality[0]
    b_perc = quality[1]
    
    labels = ['Strike', 'Foul', 'Out', 'Single', 'Double', 'Triple', 'Home Run']
    
    pred_mean = np.mean(results[p_perc][b_perc]["pred"],axis=0)
    emp_mean = np.mean(results[p_perc][b_perc]["emp"],axis=0)
    
    emp_yerr = []
    pred_yerr = []
    
    emp_means = []
    pred_means = []
    
    
    for label in labels:
        label_ind = labels.index(label)
        
        print(label)
        emp = np.array(results[p_perc][b_perc]["emp"])[:,label_ind]
        
        emp_mean = np.mean(emp)
        emp_std = np.std(emp)
        emp_len = emp.shape[0]
        emp_err = (1.96*emp_std/np.sqrt(emp_len))
        emp_yerr.append(emp_err)
        emp_means.append(emp_mean)
        
        
        pred = np.array(results[p_perc][b_perc]["pred"])[:,label_ind]
        
        pred_mean = np.mean(pred)
        pred_std = np.std(pred)
        pred_len = pred.shape[0]
        pred_err = (1.96*pred_std/np.sqrt(pred_len))
        pred_yerr.append(pred_err)
        pred_means.append(pred_mean)  
        
    data = np.arange(len(labels))  
    width = 0.3 
    dist = .08

    fig, ax = plt.subplots()
    fig.set_size_inches(w=4.2, h=3.2)
    rect1 = ax.bar(data - width/2-dist/2, emp_means, width, yerr= emp_yerr, label='Empirical', color = "tomato")
    rect2 = ax.bar(data + width/2+dist/2, pred_means, width, yerr= pred_yerr, label='Predicted', color = "dodgerblue")
    
    def autolabel(bars):
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., 1.2*height,
                    '%.3f' % float(height), rotation=90,
                    ha='center', va='bottom')
    autolabel(rect1)
    autolabel(rect2)
    ax.set_ylim([0,1])
    ax.set_ylabel('Outcome Probability') 
    ax.set_xlabel('Outcome')
    ax.set_title("%s Pitcher vs. %s Batter Outcomes" %(group_map[p_perc], group_map[b_perc]))
    ax.set_xticks(data)
    ax.set_xticklabels(labels)
    ax.legend()
    
    file_name = '%s_p_%s_b_trans_probs.pdf'%(group_map[p_perc], group_map[b_perc])
    print(file_name)



    plt.show()
    plt.savefig(file_name,dpi=120)

In [None]:
# Call outcome printing function
get_outcomes_by_quality("22", results)

In [None]:
# Open test set and aggregate outcomes between count

with open('transition_test_predictions.json') as json_file:
    test_set = json.load(json_file)

pitcher_test=np.array(test_set["pitcher_test"])
batter_test=np.array(test_set["batter_test"])
ids_test=np.array(test_set["ids_test"])
outcome_test=np.array(test_set["outcome_test"])
sc_test=np.array(test_set["sc_test"])
bc_test=np.array(test_set["bc_test"])
pitch_set_test=np.array(test_set["pitch_set_test"])
pred_test=np.array(test_set["pred_test"])
#Aggregate between thirds
zone_results = {"ball":{"pred":[],"emp":[]},"strike":{"pred":[],"emp":[]}}
count_results = {}#indexed by pitcher, batter
for i in range(len(pitcher_test)):
    #if i % 10000 == 0: print(i)
    pitcher = pitcher_test[i]
    pitcher_id = str(ids_test[i][0])
    batter_id = str(ids_test[i][1])
    row_id = ids_test[i][2]
    batter = batter_test[i]
    pitch = pitch_set_test[i]
    sc = sc_test[i]
    bc = bc_test[i]
    count = str(int(bc))+str(int(sc))
    res = outcome_test[i]
    pred_res = pred_test[i]

    row = swing_df.loc[ids_test[i][2]]
     
    zone = row.zones
    ball_zones = [9,10,11,12,13,14,15,16]

    if zone in ball_zones:
        zone_type = "ball"
    else:
        zone_type = "strike"
    zone_results[zone_type]["pred"].append(pred_res)
    zone_results[zone_type]["emp"].append(res)
    
    if count not in count_results.keys():
        count_results[count]={"pred":[],"emp":[]}
    count_results[count]["pred"].append(pred_res)
    count_results[count]["emp"].append(res)


In [None]:
# prints results across keys in a results dictionary

def print_results(results):
    for key in results.keys():
        emp_mean = np.mean(np.array(results[key]["emp"]), axis = 0)
        pred_mean = np.mean(np.array(results[key]["pred"]), axis = 0)
        
        print(key)
        
        print("EMP")
        print(emp_mean)
        print("PRED")
        print(pred_mean)
        print("_______")

In [None]:
# graphs results between two keys in a results dictionary
        
def graph_results(results,key1, key2):

    
    labels = ['Strike', 'Foul', 'Out', 'Hit']
    
    pred1_mean = np.mean(results[key1]["pred"],axis=0)
    emp1_mean = np.mean(results[key1]["emp"],axis=0)
    
    emp1_yerr = []
    pred1_yerr = []
    
    emp1_means = []
    pred1_means = []
    
    pred2_mean = np.mean(results[key2]["pred"],axis=0)
    emp2_mean = np.mean(results[key2]["emp"],axis=0)
    
    emp2_yerr = []
    pred2_yerr = []
    
    emp2_means = []
    pred2_means = []
    
    for label in labels:
        label_ind = labels.index(label)
        
        print(label)
        emp1 = np.array(results[key1]["emp"])[:,label_ind]
        
        emp1_mean = np.mean(emp1)
        emp1_std = np.std(emp1)
        emp1_len = emp1.shape[0]
        emp1_err = (1.96*emp1_std/np.sqrt(emp1_len))
        emp1_yerr.append(emp1_err)
        emp1_means.append(emp1_mean)
        
        
        pred1 = np.array(results[key1]["pred"])[:,label_ind]
        
        pred1_mean = np.mean(pred1)
        pred1_std = np.std(pred1)
        pred1_len = pred1.shape[0]
        pred1_err = (1.96*pred1_std/np.sqrt(pred1_len))
        pred1_yerr.append(pred1_err)
        pred1_means.append(pred1_mean)  
        
                
        

        emp2 = np.array(results[key2]["emp"])[:,label_ind]
        
        emp2_mean = np.mean(emp2)
        emp2_std = np.std(emp2)
        emp2_len = emp1.shape[0]
        emp2_err = (1.96*emp2_std/np.sqrt(emp2_len))
        emp2_yerr.append(emp2_err)
        emp2_means.append(emp2_mean)
        
        
        pred2 = np.array(results[key2]["pred"])[:,label_ind]
        
        pred2_mean = np.mean(pred2)
        pred2_std = np.std(pred2)
        pred2_len = pred2.shape[0]
        pred2_err = (1.96*pred2_std/np.sqrt(pred2_len))
        pred2_yerr.append(pred2_err)
        pred2_means.append(pred2_mean)  
        
        
    x = np.arange(len(labels))  
    width = 0.15 
    dist = .08

    fig, ax = plt.subplots()
    fig.set_size_inches(w=4.2, h=3.2)
    rect1 = ax.bar(x - width-width*1/2-dist-dist*1/2, emp1_means, width, yerr = emp1_yerr, label='%s Emp'%(key1.capitalize()), edgecolor= "dodgerblue",hatch="///", color="white", alpha = .99, linewidth=2)
    rect2 = ax.bar(x-width*1/2 -dist*1/2, pred1_means,  width, yerr = pred1_yerr, label='%s Pred'%(key1.capitalize()), edgecolor= "dodgerblue",color="white" , linewidth=2)
    rect3 = ax.bar(x + width-width*1/2+dist*1/2, emp2_means, width, yerr = emp2_yerr, label='%s Emp'%(key2.capitalize()), edgecolor= "tomato",hatch="///", color="white", alpha = .99, linewidth=2)
    rect4 = ax.bar(x + width*2-width*1/2 +dist+dist*1/2, pred2_means, width, yerr = pred2_yerr, label='%s Pred'%(key2.capitalize()), edgecolor= "tomato", color="white", linewidth=2)

    
    def autolabel(bars):
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., 1.2*height,
                    '%.3f' % float(height), rotation=90,
                    ha='center', va='bottom')
    autolabel(rect1)
    autolabel(rect2)
    autolabel(rect3)
    autolabel(rect4)
    ax.set_ylim([0,1])
    ax.set_ylabel('Outcome Probability') 
    ax.set_xlabel('Outcome')
    ax.set_title('%s vs. %s Transition Probabilities'%(key1.capitalize(),key2.capitalize()))
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()
    
    file_name = '%s_vs_%s_transition_probabilities.pdf'%(key1,key2)


    plt.show()
    plt.savefig(file_name,dpi=120)

In [None]:
graph_results(count_results, "30","02")

In [None]:
# Graphs results between inside (strike) and outside (ball) zones

def graph_results_zone(results,key1, key2):

    
    labels = ['Strike', 'Foul', 'Out', 'Hit']
    
    pred1_mean = np.mean(results[key1]["pred"],axis=0)
    emp1_mean = np.mean(results[key1]["emp"],axis=0)
    
    emp1_yerr = []
    pred1_yerr = []
    
    emp1_means = []
    pred1_means = []
    
    pred2_mean = np.mean(results[key2]["pred"],axis=0)
    emp2_mean = np.mean(results[key2]["emp"],axis=0)
    
    emp2_yerr = []
    pred2_yerr = []
    
    emp2_means = []
    pred2_means = []
    
    for label in labels:
        label_ind = labels.index(label)
        
        print(label)
        emp1 = np.array(results[key1]["emp"])[:,label_ind]
        
        emp1_mean = np.mean(emp1)
        emp1_std = np.std(emp1)
        emp1_len = emp1.shape[0]
        emp1_err = (1.96*emp1_std/np.sqrt(emp1_len))
        emp1_yerr.append(emp1_err)
        emp1_means.append(emp1_mean)
        
        
        pred1 = np.array(results[key1]["pred"])[:,label_ind]
        
        pred1_mean = np.mean(pred1)
        pred1_std = np.std(pred1)
        pred1_len = pred1.shape[0]
        pred1_err = (1.96*pred1_std/np.sqrt(pred1_len))
        pred1_yerr.append(pred1_err)
        pred1_means.append(pred1_mean)  
        
                
        

        emp2 = np.array(results[key2]["emp"])[:,label_ind]
        
        emp2_mean = np.mean(emp2)
        emp2_std = np.std(emp2)
        emp2_len = emp1.shape[0]
        emp2_err = (1.96*emp2_std/np.sqrt(emp2_len))
        emp2_yerr.append(emp2_err)
        emp2_means.append(emp2_mean)
        
        
        pred2 = np.array(results[key2]["pred"])[:,label_ind]
        
        pred2_mean = np.mean(pred2)
        pred2_std = np.std(pred2)
        pred2_len = pred2.shape[0]
        pred2_err = (1.96*pred2_std/np.sqrt(pred2_len))
        pred2_yerr.append(pred2_err)
        pred2_means.append(pred2_mean)  
        
        
    x = np.arange(len(labels))  
    width = 0.15 
    dist = .08

    fig, ax = plt.subplots()
    fig.set_size_inches(w=4.2, h=3.2)
    rect1 = ax.bar(x - width-width*1/2-dist-dist*1/2, emp1_means, width, yerr = emp1_yerr, label='%s Emp'%(key1.capitalize()), edgecolor= "dodgerblue",hatch="///", color="white", alpha = .99, linewidth=2)
    rect2 = ax.bar(x-width*1/2 -dist*1/2, pred1_means,  width, yerr = pred1_yerr, label='%s Pred'%(key1.capitalize()), edgecolor= "dodgerblue",color="white" , linewidth=2)
    rect3 = ax.bar(x + width-width*1/2+dist*1/2, emp2_means, width, yerr = emp2_yerr, label='%s Emp'%(key2.capitalize()), edgecolor= "tomato",hatch="///", color="white", alpha = .99, linewidth=2)
    rect4 = ax.bar(x + width*2-width*1/2 +dist+dist*1/2, pred2_means, width, yerr = pred2_yerr, label='%s Pred'%(key2.capitalize()), edgecolor= "tomato", color="white", linewidth=2)

    
    def autolabel(rects):

        for rect in rects:
            height = rect.get_height()
            print(height)
            ax.text(rect.get_x() + rect.get_width()/2., 1.2*height,
                    '%.3f' % float(height), rotation=90,
                    ha='center', va='bottom')
    autolabel(rect1)
    autolabel(rect2)
    autolabel(rect3)
    autolabel(rect4)
    ax.set_ylim([0,1])
    ax.set_ylabel('Outcome Probability') 
    ax.set_xlabel('Outcome')
    ax.set_title('Ball vs. Strike Zone Transition Probabilities')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()
    
    file_name = 'ball_vs_strike_zone_transition_probabilities.pdf'
    #print(file_name)

    #fig.tight_layout()

    plt.show()
    plt.savefig(file_name,dpi=120)
graph_results_zone(zone_results, "ball","strike")