In [1]:
import numpy as np
import os
import pandas as pd
import pickle
import psycopg2
import xgboost as xgb #v1.6.0
import zipfile

from dotenv import load_dotenv
from sklearn.neighbors import KNeighborsClassifier
from pathlib import Path
from xgboost import XGBClassifier

### PLA

# Load data from database
dotenv_path = Path('pitcherlist_datascience.env')
load_dotenv(dotenv_path=dotenv_path)

conn = psycopg2.connect(f"dbname='{os.environ.get('PL_DB_DATABASE')}' user='{os.environ.get('PL_DB_USER')}' host=plain-banana.db.elephantsql.com password='{os.environ.get('PL_DB_PASSWORD')}'")
# try/finally guarantees the cursor and connection are released even when the
# query fails, so a failed cell run does not leave a session open in the kernel.
try:
    cursor = conn.cursor()
    try:
        # PLA table
        cursor.execute("Select * FROM plv_by_player where year_played >= '2020'")
        colnames = [desc[0] for desc in cursor.description]
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    conn.close()

# Passing the column names to the constructor keeps an empty result set valid
# (assigning names afterwards fails on a zero-column frame).
db_data = pd.DataFrame(data, columns=colnames)

# Keep only per-year / per-handedness / known-pitch-type rows that have pitches
db_data = (
    db_data
    .drop(columns=['num_plv'])
    .loc[(db_data['year_played']!='ALL') &
         (db_data['opponent_handedness']!='ALL') &
         (db_data['pitchtype']!='UN')]
    .astype({'year_played':'int'})
    .query('num_plv_pitch > 0')
    .reset_index(drop=True)
    .sort_values(['year_played','full_name','opponent_handedness','pitchtype'])
)

# Whole part of `ip` plus (last character of its string form) / 3
# NOTE(review): presumably converts baseball "x.1 / x.2" innings notation to
# fractional innings - confirm against the table definition.
db_data['ip'] = db_data['ip'].astype('int').add(db_data['ip'].astype('str').str[-1].astype('int').div(3)).astype('float')

# Share of each pitcher-season's pitches in each row, and that share of `ip`
db_data['total_pitches'] = db_data['num_plv_pitch'].groupby([db_data['year_played'],
                                                             db_data['mlb_player_id']]).transform('sum')
db_data['usage'] = db_data['num_plv_pitch'].div(db_data['total_pitches'])
db_data['subset_ip'] = db_data['usage'].mul(db_data['ip'])

# Export the selected columns under their output names
(db_data
 [['year_played','full_name','mlb_player_id','pitchtype',
   'pitcher_handedness','opponent_handedness',
   'num_plv_pitch','subset_ip','avg_plv','num_pla_runs']]
 .rename(columns={
     'mlb_player_id':'pitcher_mlb_id',
     'full_name':'pitchername',
     'pitcher_handedness':'p_hand',
     'opponent_handedness':'b_hand',
     'num_plv_pitch':'num_pitches',
     'avg_plv':'plv',
     'num_pla_runs':'pitch_runs'
 })
 .to_csv('pla_data.csv',encoding='latin1',index=False)
)

print('PLA done')

### Hitting/Movement/General PLV

### Load data from database
dotenv_path = Path('pitcherlist_datascience.env')
load_dotenv(dotenv_path=dotenv_path)

conn = psycopg2.connect(f"dbname='{os.environ.get('PL_DB_DATABASE')}' user='{os.environ.get('PL_DB_USER')}' host=plain-banana.db.elephantsql.com password='{os.environ.get('PL_DB_PASSWORD')}'")
# try/finally guarantees the cursor and connection are released even when the
# query fails, so a failed cell run does not leave a session open in the kernel.
try:
    cursor = conn.cursor()
    try:
        cursor.execute("Select * FROM plv_inputs where year_played >= '2023'")
        colnames = [desc[0] for desc in cursor.description]
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    conn.close()

# Passing the column names to the constructor keeps an empty result set valid
# (assigning names afterwards fails on a zero-column frame).
db_data = pd.DataFrame(data, columns=colnames)

# Limit to post-2020, since that's what we have the most complete data for
# db_data = db_data.loc[db_data['year_played']>=2020].copy()

# Consolidate descriptions
# Raw pitch descriptions grouped by the consolidated label they collapse to.
# Descriptions that are not listed are left unchanged by `replace`.
description_groups = {
    'ball': ['Ball', 'Dirt Ball', 'Pitchout', 'Enforced Ball', 'Intentional Walk'],
    'called_strike': ['Strike Looking', 'Enforced Strike'],
    'swinging_strike': ['Strike Swinging', 'Foul Tip',
                        'Strike Swinging - Adv 1st', 'Strike Swinging - Adv 2nd'],
    'foul_strike': ['Foul Ball'],
    'hit_by_pitch': ['Hit By Pitch'],
    'single': ['Single', 'Single - Adv 2nd', 'Single - Out at 2nd',
               'Single - Adv 3rd', 'Single - Out at 3rd', 'Single - Adv Home',
               'Single - Out at Home', 'Single - Tagged out at 1st'],
    'double': ['Double', 'Double - Out at 3rd', 'Double - Adv 3rd',
               'Double - Adv Home', 'Double - Out at Home'],
    'triple': ['Triple - Out at Home', 'Triple', 'Triple - Adv Home'],
    'home_run': ['Homerun'],
    'out': ['Ground Out', 'Fielders Choice', 'Pop Out', 'Fly Out', 'Line Out',
            'Reached On Error', 'Fielders Choice - Adv 2nd', 'Sacrifice Fly',
            'Reached On Error - Adv 2nd', 'Sacrifice Bunt',
            'Reached On Error - Out at 2nd', 'Sacrifice Bunt - Adv 1st',
            'Reached On Error - Adv 3rd', 'Reached On Error - Adv Home',
            'Fielders Choice - Out at 2nd', 'Sacrifice Fly - Adv 1st',
            'Sacrifice Bunt - Adv 2nd', 'Fielders Choice - Adv 3rd',
            'Sacrifice Bunt - Adv 3rd', 'Fielders Choice - Out at 3rd',
            'Sacrifice Bunt - Out at 2nd', 'Sacrifice Fly - Adv 2nd',
            'Reached On Error - Out at 3rd'],
}

# Flatten to {raw description: consolidated label}
description_lookup = {
    raw_description: cleaned_label
    for cleaned_label, raw_descriptions in description_groups.items()
    for raw_description in raw_descriptions
}

db_data['cleaned_description'] = db_data['pitch_description'].replace(description_lookup)

### Clean Pitch Extension, per appearance
db_data['pitch_extension'] = db_data['pitch_extension'].astype('float')

# 1st - by pitch type, within appearance
median_by_pitch_type = (
    db_data['pitch_extension']
    .groupby([db_data['mlb_game_id'],
              db_data['pitcher_mlb_id'],
              db_data['pitchtype']])
    .transform('median')
)
db_data['pitch_extension_cleaned'] = (
    db_data['pitch_extension'].fillna(median_by_pitch_type).astype('float')
)

# 2nd - by pitcher, within appearance (covers anything still missing)
median_by_appearance = (
    db_data['pitch_extension_cleaned']
    .groupby([db_data['mlb_game_id'],
              db_data['pitcher_mlb_id']])
    .transform('median')
)
db_data['pitch_extension_cleaned'] = (
    db_data['pitch_extension_cleaned'].fillna(median_by_appearance).astype('float')
)

### Clean Strikezone measurements
# Missing values are filled with the median for that hitter within that game
for pole in ['top','bottom']:
    raw_column = 'strike_zone_'+pole
    db_data[raw_column] = db_data[raw_column].astype('float')
    hitter_game_median = (
        db_data[raw_column]
        .groupby([db_data['mlb_game_id'],
                  db_data['hitter_mlb_id']])
        .transform('median')
    )
    db_data[raw_column+'_cleaned'] = db_data[raw_column].fillna(hitter_game_median).astype('float')
                                                                   
### Standardized Strikezone (z-location, in 'strikezones')
def strikezone_z(dataframe,top_column,bottom_column):
    """Express pitch height as a number of strike zones from the zone's midpoint.

    Casts 'p_z' and the two boundary columns to float, and writes the helper
    columns 'sz_mid' and 'sz_height' onto `dataframe` as a side effect.

    Parameters
    ----------
    dataframe : DataFrame containing 'p_z', `top_column` and `bottom_column`.
    top_column, bottom_column : names of the strike zone top/bottom columns.

    Returns
    -------
    Series of (p_z - sz_mid) / sz_height.
    """
    measured = ['p_z',top_column,bottom_column]
    dataframe[measured] = dataframe[measured].astype('float')

    # Ratio of 'strikezones' above/below midpoint of strikezone
    boundaries = dataframe[[top_column,bottom_column]]
    dataframe['sz_mid'] = boundaries.mean(axis=1)
    dataframe['sz_height'] = dataframe[top_column].sub(dataframe[bottom_column])

    offset_from_mid = dataframe['p_z'].sub(dataframe['sz_mid'])
    return offset_from_mid.div(dataframe['sz_height'])

### VAA (Vertical Approach/Attack Angle)
def adjusted_vaa(dataframe):
    """Compute raw and height-adjusted vertical approach angle (in degrees).

    Reads 'v_y0', 'v_z0', 'a_y', 'a_z' and 'p_z'. Writes the intermediate
    columns 'v_yf', 'pitch_time_50ft', 'v_zf', 'raw_vaa', 'vaa_z_adj' and
    'adj_vaa' onto `dataframe` as a side effect.

    Returns
    -------
    DataFrame with the columns ['raw_vaa', 'adj_vaa'].
    """
    ## Physical characteristics of pitch
    plate_distance = 50-17/12  # distance covered from y=50 to y=17/12

    # Pitch velocity (to plate) at plate
    speed_squared = dataframe['v_y0']**2 - (2 * dataframe['a_y']*plate_distance)
    dataframe['v_yf'] = -1 * speed_squared**0.5
    # Pitch time in air (50ft to home plate)
    velocity_change = dataframe['v_yf'] - dataframe['v_y0']
    dataframe['pitch_time_50ft'] = velocity_change/dataframe['a_y']
    # Pitch velocity (vertical) at plate
    dataframe['v_zf'] = dataframe['v_z0'] + dataframe['a_z'] * dataframe['pitch_time_50ft']

    ## raw and height-adjusted VAA
    # Raw VAA
    approach_radians = np.arctan(dataframe['v_zf']/dataframe['v_yf'])
    dataframe['raw_vaa'] = -1 * approach_radians * (180/np.pi)
    # VAA of all pitches at that height (rows sharing the same p_z value)
    same_height = dataframe['raw_vaa'].groupby(dataframe['p_z'])
    dataframe['vaa_z_adj'] = same_height.transform('mean')
    # Adjusted VAA, based on height
    dataframe['adj_vaa'] = dataframe['raw_vaa'].sub(dataframe['vaa_z_adj'])

    return dataframe[['raw_vaa','adj_vaa']]

### Calculate the differences between each pitch and their avg fastball
def fastball_differences(dataframe,stat):
    """Difference between each pitch's `stat` and the pitcher's average fastball.

    The reference value is the mean of `stat` over the rows where 'pitchtype'
    equals 'fastball_type', computed per
    ('pitcher_mlb_id', 'game_played', 'fastball_type') and broadcast to every
    row of that group.

    The result keeps `dataframe`'s index and length, so it can be assigned
    straight back as a column. Rows whose group has no fastball reference get
    NaN (the caller can then fill them). The previous merge-based version used
    an inner join, which dropped those rows and renumbered the rest, so values
    landed on the wrong rows when assigned back by index.

    Parameters
    ----------
    dataframe : DataFrame with 'pitcher_mlb_id', 'game_played', 'pitchtype',
        'fastball_type' and `stat`. `stat` is cast to float in place.
    stat : name of the column to compare.

    Returns
    -------
    Series aligned to `dataframe.index`: stat - fastball mean.
    """
    dataframe[stat] = dataframe[stat].astype('float')

    # Only the pitcher's designated fastball contributes to the reference mean
    is_reference_fastball = dataframe['pitchtype']==dataframe['fastball_type']
    fastball_mean = (
        dataframe[stat]
        .where(is_reference_fastball)
        .groupby([dataframe['pitcher_mlb_id'],
                  dataframe['game_played'],
                  dataframe['fastball_type']])
        .transform('mean')
    )
    return dataframe[stat].sub(fastball_mean)

def spin_calcs(data):
    """Compute the 'IHB' and 'IVB' movement columns from pitch kinematics.

    Requires the columns listed in `needed_cols`; they are cast to float and
    every intermediate quantity is written onto `data` as a new column.

    Returns
    -------
    DataFrame with the columns ['IHB', 'IVB'], or None (after printing the
    missing column names) when a required column is absent.
    """
    needed_cols = ['velo','p_x','p_z','v_x0','v_y0','v_z0',
                   'a_x','a_y','a_z','pitch_extension_cleaned']

    absent = [x for x in needed_cols if x not in data.columns]
    if absent:
        print('Dataframe is missing the following columns:')
        for x in absent:
            print('-',x)
        return None

    data[needed_cols] = data[needed_cols].astype('float')

    def _elapsed(v_start, accel, distance):
        # Root of the constant-acceleration equation used for both time columns
        return (-v_start-(v_start**2 - 2*accel*distance)**0.5)/accel

    ## Formulas
    # Release location
    data['y_R'] = 60.5 - data['pitch_extension_cleaned']

    # Time since release
    data['t_R'] = _elapsed(data['v_y0'], data['a_y'], 50-data['y_R'])

    # Release velo
    for axis in ['x','y','z']:
        data['v_'+axis+'R'] = data['v_'+axis+'0']+data['a_'+axis]*data['t_R']

    # Delta release speed
    release_speed = (data['v_xR']**2 + data['v_yR']**2 + data['v_zR']**2)**0.5
    data['d_v0'] = data['velo'] - release_speed/1.467

    # pitch flight time
    data['t_f'] = _elapsed(data['v_yR'], data['a_y'], data['y_R']-17/12)

    # Average velocity
    for axis in ['x','y','z']:
        data['v_'+axis+'bar'] = (2*data['v_'+axis+'R']+data['a_'+axis]*data['t_f'])/2
    data['v_bar'] = (data['v_xbar']**2 + data['v_ybar']**2 + data['v_zbar']**2)**0.5

    # Drag Acceleration
    along_path = data['a_x']*data['v_xbar'] + data['a_y']*data['v_ybar'] + (data['a_z']+32.174)*data['v_zbar']
    data['a_drag'] = -along_path/data['v_bar']

    # Magnus Accelerations
    for axis in ['x','y']:
        data['a_mag'+axis] = data['a_'+axis] + data['a_drag']*data['v_'+axis+'bar']/data['v_bar']
    data['a_magz'] = data['a_z'] + data['a_drag']*data['v_zbar']/data['v_bar'] + 32.174
    data['a_mag'] = (data['a_magx']**2 + data['a_magy']**2 + data['a_magz']**2)**0.5

    # Movement over the flight time; the trailing *12 is kept from the original
    # NOTE(review): presumably a feet-to-inches conversion - confirm.
    data['IHB'] = 0.5*data['a_magx']*data['t_f']**2*12
    data['IVB'] = 0.5*data['a_magz']*data['t_f']**2*12

    return data[['IHB','IVB']]

### Create model features from inputs
def feature_engineer(dataframe):
    """Derive the model feature columns from the raw pitch inputs.

    Adds total movement, standardized strike-zone height, raw/adjusted VAA,
    each pitcher-game's most common fastball type, the differences from that
    fastball for several stats, and the IHB/IVB/total_IB movement columns.

    The first few columns are written onto the frame that was passed in; the
    returned frame is a new object produced by the fastball-type merge.

    Returns
    -------
    DataFrame with all original columns plus the derived features.
    """
    # Pythagorean movement
    horizontal_move = dataframe['pfx_x'].astype('float')
    vertical_move = dataframe['pfx_z'].astype('float')
    dataframe['total_move'] = (horizontal_move**2+vertical_move**2)**0.5

    # Z-location, in # of strikezones from center of strikezone
    dataframe['sz_z'] = strikezone_z(dataframe,'strike_zone_top_cleaned','strike_zone_bottom_cleaned')

    # Raw and Z-location adjusted VAA (computed on a float copy of the inputs)
    vaa_inputs = dataframe[['p_z','v_y0','v_z0','a_y','a_z']].astype('float')
    dataframe[['raw_vaa','adj_vaa']] = adjusted_vaa(vaa_inputs)

    # df of most common fastballs for each pitcher, in each appearance
    fastballs = ['FF','FC','FT','SI']
    thrown_fastballs = dataframe.loc[dataframe['pitchtype'].isin(fastballs)]
    fastball_df = (
        thrown_fastballs
        .groupby(['pitcher_mlb_id','game_played'], as_index=False)
        ['pitchtype']
        .agg(pd.Series.mode)
        .rename(columns={'pitchtype':'fastball_type'})
        .copy()
    )

    # Filler values for fastball differences, keyed by stat and then pitch type
    fb_diff_dict = {
        'pfx_x':{'CH':0.296,'CU':0.293,'FC':0.304,'FF':0.283,
                 'FS':0.316,'SI':0.299,'SL':0.300},
        'pfx_z':{'CH':-1.185,'CU':-1.194,'FC':-1.212,'FF':-1.159,
                 'FS':-1.157,'SI':-1.177,'SL':-1.180},
        'total_move':{'CH':-0.602,'CU':-0.604,'FC':-0.608,'FF':-0.588,
                      'FS':-0.551,'SI':-0.602,'SL':-0.607},
        'velo':{'CH':-1.438,'CU':-1.470,'FC':-1.475,'FF':-1.388,
                'FS':-1.371,'SI':-1.423,'SL':-1.433}
    }

    # Add most common Fastball type
    dataframe = dataframe.merge(fastball_df,on=['pitcher_mlb_id','game_played'], how='left')

    def _first_mode(x):
        # A plain label has a single-character first element and is kept as is;
        # a multi-valued mode result is reduced to its first entry.
        return x if len(x[0])==1 else x[0]

    dataframe['fastball_type'] = dataframe['fastball_type'].fillna('NA').apply(_first_mode)

    # Add comparison stats to fastball
    for stat, filler_by_pitch in fb_diff_dict.items():
        diff_column = stat+'_diff'
        dataframe[diff_column] = fastball_differences(dataframe,stat)
        dataframe[diff_column] = dataframe[diff_column].fillna(dataframe['pitchtype'].map(filler_by_pitch))

    dataframe[['IHB','IVB']] = spin_calcs(dataframe)
    induced_horizontal = dataframe['IHB'].astype('float')
    induced_vertical = dataframe['IVB'].astype('float')
    dataframe['total_IB'] = (induced_horizontal**2+induced_vertical**2)**0.5

    return dataframe
                
db_data = feature_engineer(db_data)
print('Data loaded')

### Pitch Type Grouping
# Masks are applied in the same order as before: Fastball, Breaking Ball, Offspeed
fastball_rows = ((db_data['pitchtype']==db_data['fastball_type']) |
                 db_data['pitchtype'].isin(['FF','FT','SI']))
# A SL/CU/FC only counts as a breaking ball when it is not the pitcher's primary fastball
breaking_rows = ((db_data['pitchtype']!=db_data['fastball_type']) &
                 db_data['pitchtype'].isin(['SL','CU', 'FC']))
offspeed_rows = db_data['pitchtype'].isin(['CH', 'FS','KN','SC'])

db_data['pitch_type_bucket'] = 'Other'
for bucket_rows, bucket_label in [(fastball_rows,'Fastball'),
                                  (breaking_rows,'Breaking Ball'),
                                  (offspeed_rows,'Offspeed')]:
    db_data.loc[bucket_rows,'pitch_type_bucket'] = bucket_label

### Arm Angle Estimation
# Vertical leg: release height minus 0.7 * (pitcher_height / 12)
reference_height = db_data['pitcher_height'].astype('float').div(12).mul(0.7)
db_data['arm_triangle_z'] = db_data['z0'].astype('float').sub(reference_height)
# Horizontal leg: absolute horizontal release position
db_data['arm_triangle_x'] = db_data['x0'].astype('float').abs()
# Hypotenuse of the two legs
leg_x = db_data['arm_triangle_x'].astype('float')
leg_z = db_data['arm_triangle_z'].astype('float')
db_data['arm_triangle_arm'] = (leg_x**2+leg_z**2)**0.5

# Law of cosines for the angle between the vertical leg and the hypotenuse
cosine_numerator = db_data['arm_triangle_z']**2 + db_data['arm_triangle_arm']**2 - db_data['arm_triangle_x']**2
cosine_denominator = 2*db_data['arm_triangle_z']*db_data['arm_triangle_arm']
db_data['arm_angle'] = np.rad2deg(np.arccos(cosine_numerator/cosine_denominator))
db_data['arm_slot'] = pd.cut(db_data['arm_angle'],
                             bins=[0,30,70,90,180],
                             labels=['Overhand','Three-Quarters','Sidearm','Submarine'])

# # Adjust arm length towards being ~3/8 of height
# # Shoulder width = ~1/4 of height
# # Arm length = (1 - 1/4) / 2 = ~3/8
# db_data['est_arm_length'] = db_data['pitcher_height'].astype('float').div(12).mul(3/8)
# db_data['adj_arm_length'] = db_data[['arm_triangle_arm','est_arm_length']].mean(axis=1)

# # Assuming the difference in arm length 
# # is due to being on a side of the rubber,
# # adjust arm_x for the new arm length
# db_data['adj_arm_x'] = (db_data['adj_arm_length'].astype('float')**2-db_data['arm_triangle_z'].astype('float')**2)**0.5
# db_data['adj_arm_angle'] = np.rad2deg(np.arccos((db_data['arm_triangle_z']**2 + db_data['adj_arm_length']**2 - db_data['adj_arm_x']**2)/(2*db_data['arm_triangle_z']*db_data['adj_arm_length'])))
# db_data['adj_arm_slot'] = pd.cut(db_data['adj_arm_angle'],bins=[0,30,70,90,180],labels=['Overhand','Three-Quarters','Sidearm','Submarine'])

# One row per pitcher-season: pitch count, mean height and median release traits
arm_float_columns = {'pitcher_height':'float',
                     'x0':'float',
                     'z0':'float'}
arm_aggregations = {'pitch_id':'count',
                    'pitcher_height':'mean',
                    'x0':'median',
                    'z0':'median',
                    'arm_angle':'median',
                    'pitch_extension_cleaned':'median'}

pitcher_seasons = (
    db_data
    .astype(arm_float_columns)
    .groupby(['pitcher_mlb_id','pitchername','year_played'])
    [['pitch_id','pitcher_height','x0','z0','arm_angle','pitch_extension_cleaned']]
)
arm_df = (
    pitcher_seasons
    .agg(arm_aggregations)
    .astype('float')
    .rename(columns={'pitch_id':'num_pitches'})
    .reset_index()
    .copy()
)

# NOTE(review): these bin edges (30/60/90) differ from the per-pitch arm_slot
# bins computed earlier in this file (30/70/90) - confirm which is intended.
arm_df['arm_slot'] = pd.cut(arm_df['arm_angle'],
                            bins=[0,30,60,90,180],
                            labels=['Overhand','Three-Quarters','Sidearm','Submarine'])
arm_df.to_csv('arm_slots.csv',encoding='latin1',index=False)
print('Arm Angle Done')

# Model Features
# Identifier / context columns carried through alongside the features
id_cols = ['year_played', 'game_played', 'pitcher_mlb_id', 'pitchername',
           'plate_appearance_id', 'pitch_id','hitter_mlb_id', 'hittername',
           'pitcherteam', 'hitterteam','hometeam', 'awayteam', 'mlb_game_id',
           'pitchtype','pitch_type_bucket','cleaned_description',
           'launch_angle', 'launch_speed','arm_angle']

stuff_feats = ['velo', 'pfx_x', 'pfx_z', 'total_move', 'z0',
               'pfx_x_diff', 'pfx_z_diff', 'total_move_diff',
               'velo_diff', 'pitch_extension_cleaned',
               'raw_vaa','adj_vaa','IHB','IVB','total_IB']

location_feats = ['p_x', 'p_z', 'sz_z','strike_zone_top_cleaned',
                  'strike_zone_bottom_cleaned']

category_feats = ['pitcherside','hitterside','balls_before_pitch',
                  'strikes_before_pitch']

# Columns that are explicitly cast to float before modelling
float_feature_columns = ['velo', 'pfx_x', 'pfx_z', 'pitch_extension_cleaned',
                         'total_move', 'pfx_x_diff', 'pfx_z_diff',
                         'total_move_diff', 'velo_diff', 'raw_vaa', 'adj_vaa',
                         'z0', 'p_x', 'p_z', 'sz_z', 'IHB', 'IVB', 'total_IB']

# The "_cleaned" suffix is dropped from the names used downstream
cleaned_renames = {'pitch_extension_cleaned':'pitch_extension',
                   'strike_zone_top_cleaned':'strike_zone_top',
                   'strike_zone_bottom_cleaned':'strike_zone_bottom'}

selected_columns = id_cols+stuff_feats+location_feats+category_feats
model_df = (
    db_data[selected_columns]
    .astype({column:'float' for column in float_feature_columns})
    .rename(columns=cleaned_renames)
    .copy()
)

# The full input frame is no longer needed
del db_data

# Cap the count at 3 balls / 2 strikes
for count_column, count_cap in [('balls_before_pitch',3),('strikes_before_pitch',2)]:
    model_df[count_column] = np.clip(model_df[count_column], a_min=0, a_max=count_cap)

# Keep plain copies of the columns that are about to be one-hot encoded
model_df['balls'] = model_df['balls_before_pitch'].copy()
model_df['strikes'] = model_df['strikes_before_pitch'].copy()

model_df['p_hand'] = model_df['pitcherside'].copy()
model_df['b_hand'] = model_df['hitterside'].copy()

# One-hot encode category columns
model_df = pd.get_dummies(model_df, columns=category_feats)

## downcasting loop
for column in model_df.columns:
    column_dtype = model_df[column].dtype
    if column_dtype == 'float64':
        model_df[column] = pd.to_numeric(model_df[column], downcast='float')
    elif column_dtype == 'int64':
        model_df[column] = pd.to_numeric(model_df[column], downcast='integer')

# Redefine category list
category_feats = (
    ['pitcherside_L','pitcherside_R','hitterside_L','hitterside_R']
    + ['balls_before_pitch_'+str(balls) for balls in range(4)]
    + ['strikes_before_pitch_'+str(strikes) for strikes in range(3)]
)

# Add empty columns for model predictions
prediction_columns = [
    'take_input','swing_input','called_strike_raw','ball_raw',
    'hit_by_pitch_raw','swinging_strike_raw','contact_raw',
    'foul_strike_raw','in_play_raw','10deg_raw','10-20deg_raw',
    '20-30deg_raw','30-40deg_raw','40-50deg_raw','50+deg_raw',
    'called_strike_pred','ball_pred','hit_by_pitch_pred','contact_input',
    'swinging_strike_pred','foul_strike_pred','in_play_input','50+deg_pred',
    'out_pred', 'single_pred', 'double_pred', 'triple_pred', 'home_run_pred',
]
model_df[prediction_columns] = None

# Per launch-angle bucket: one input column, then raw and pred columns per speed bin
speed_bins = ['<90mph','90-95mph','95-100mph','100-105mph','105+mph']
for launch_angle in ['10deg','10-20deg','20-30deg','30-40deg','40-50deg']:
    angle_columns = (
        [launch_angle+'_input']
        + [launch_angle+': '+speed_bin+'_raw' for speed_bin in speed_bins]
        + [launch_angle+': '+speed_bin+'_pred' for speed_bin in speed_bins]
    )
    model_df[angle_columns] = None

### Apply models
# Each stage's class probabilities ("*_raw") are multiplied by the probability
# of reaching that stage ("*_input") to give "*_pred" values or the next
# stage's "*_input".
# NOTE: pickle.load executes arbitrary code - only load trusted model files.
launch_angle_buckets = ['10deg','10-20deg','20-30deg','30-40deg','40-50deg']
launch_velo_buckets = ['<90mph','90-95mph','95-100mph','100-105mph','105+mph']

for pitch_type in ['Fastball','Breaking Ball','Offspeed']:
    # Rows belonging to this pitch-type bucket (constant for the whole iteration)
    in_bucket = model_df['pitch_type_bucket']==pitch_type

    # Unzip model files
#     with zipfile.ZipFile('pl_{}_model_files.zip'.format(pitch_type), 'r') as zip_ref:
#         zip_ref.extractall()

    # Swing Decision
    with open('pl_swing_model_{}.pkl'.format(pitch_type), 'rb') as f:
        decision_model = pickle.load(f)

    decision_features = model_df.loc[in_bucket,decision_model.feature_names_in_]
    model_df.loc[in_bucket,['take_input','swing_input']] = decision_model.predict_proba(decision_features)
    print(pitch_type+' Swing model done')

    # Take Result
    with open('pl_take_result_model_{}.pkl'.format(pitch_type), 'rb') as f:
        take_model = pickle.load(f)

    take_features = model_df.loc[in_bucket,take_model.feature_names_in_]
    model_df.loc[in_bucket,['called_strike_raw','ball_raw','hit_by_pitch_raw']] = take_model.predict_proba(take_features)
    for take_outcome in ['called_strike','ball','hit_by_pitch']:
        model_df.loc[in_bucket,take_outcome+'_pred'] = model_df.loc[in_bucket,take_outcome+'_raw'].mul(model_df.loc[in_bucket,'take_input'])
    print(pitch_type+' Take model done')

    # Swing Result
    with open('pl_contact_model_{}.pkl'.format(pitch_type), 'rb') as f:
        swing_result_model = pickle.load(f)

    swing_result_features = model_df.loc[in_bucket,swing_result_model.feature_names_in_]
    model_df.loc[in_bucket,['swinging_strike_raw','contact_raw']] = swing_result_model.predict_proba(swing_result_features)
    model_df.loc[in_bucket,'contact_input'] = model_df.loc[in_bucket,'contact_raw'].mul(model_df.loc[in_bucket,'swing_input'])
    model_df.loc[in_bucket,'swinging_strike_pred'] = model_df.loc[in_bucket,'swinging_strike_raw'].mul(model_df.loc[in_bucket,'swing_input'])
    print(pitch_type+' Swing Result model done')

    # Contact Result
    with open('pl_in_play_model_{}.pkl'.format(pitch_type), 'rb') as f:
        contact_model = pickle.load(f)

    contact_features = model_df.loc[in_bucket,contact_model.feature_names_in_]
    model_df.loc[in_bucket,['foul_strike_raw','in_play_raw']] = contact_model.predict_proba(contact_features)
    model_df.loc[in_bucket,'foul_strike_pred'] = model_df.loc[in_bucket,'foul_strike_raw'].mul(model_df.loc[in_bucket,'contact_input'])
    model_df.loc[in_bucket,'in_play_input'] = model_df.loc[in_bucket,'in_play_raw'].mul(model_df.loc[in_bucket,'contact_input'])
    print(pitch_type+' Contact model done')

    # Launch Angle Result
    with open('pl_launch_angle_result_model_{}.pkl'.format(pitch_type), 'rb') as f:
        launch_angle_model = pickle.load(f)

    launch_angle_features = model_df.loc[in_bucket,launch_angle_model.feature_names_in_]
    model_df.loc[in_bucket,['10deg_raw','10-20deg_raw','20-30deg_raw','30-40deg_raw','40-50deg_raw','50+deg_raw']] = launch_angle_model.predict_proba(launch_angle_features)
    for launch_angle in launch_angle_buckets:
        model_df.loc[in_bucket,launch_angle+'_input'] = model_df.loc[in_bucket,launch_angle+'_raw'].mul(model_df.loc[in_bucket,'in_play_input'])
    model_df.loc[in_bucket,'50+deg_pred'] = model_df.loc[in_bucket,'50+deg_raw'].mul(model_df.loc[in_bucket,'in_play_input'])
    print(pitch_type+' Launch Angle model done')

    # Launch Velo Result
    for launch_angle in launch_angle_buckets:
        with open('pl_{}_model_{}.pkl'.format(launch_angle,pitch_type), 'rb') as f:
            launch_velo_model = pickle.load(f)

        velo_buckets = [launch_angle+': '+x for x in launch_velo_buckets]
        launch_velo_features = model_df.loc[in_bucket,launch_velo_model.feature_names_in_]
        model_df.loc[in_bucket,[bucket+'_raw' for bucket in velo_buckets]] = launch_velo_model.predict_proba(launch_velo_features)
        for bucket in velo_buckets:
            model_df.loc[in_bucket,bucket+'_pred'] = model_df.loc[in_bucket,bucket+'_raw'].mul(model_df.loc[in_bucket,launch_angle+'_input'])
    print(pitch_type+' Launch Velo model done')

del decision_model, take_model, swing_result_model, contact_model, launch_angle_model, launch_velo_model

### Apply distribution of outcomes to velo-angle pairs
# Load distributions for each grouping, keyed by (year_played, bb_bucket)
bip_result_dict = (
    pd.read_csv('bip_result_dict.csv')
    .set_index(['year_played','bb_bucket'])
    .to_dict(orient='index')
)

# Every "<launch angle>: <exit velo>" bucket, angle-major
velo_angle_buckets = [
    angle_bucket+': '+velo_bucket
    for angle_bucket in ['10deg','10-20deg','20-30deg','30-40deg','40-50deg']
    for velo_bucket in ['<90mph','90-95mph','95-100mph','100-105mph','105+mph']
]

# Apply averages to each predicted grouping
for year in list(model_df['year_played'].unique()):
    season_rows = model_df['year_played']==year
    for outcome in ['out', 'single', 'double', 'triple', 'home_run']:
        outcome_column = outcome+'_pred'

        # Start with 50+ degrees (popups)
        model_df.loc[season_rows,outcome_column] = model_df.loc[season_rows,'50+deg_pred']*bip_result_dict[(year,'50+deg')][outcome]

        # Then add every velo-angle bucket's contribution
        for bucket in velo_angle_buckets:
            model_df.loc[season_rows,outcome_column] += model_df.loc[season_rows,bucket+'_pred']*bip_result_dict[(year,bucket)][outcome]

# Predicted hits (excluding home runs) over predicted outs plus those hits
predicted_hits = model_df[['single_pred', 'double_pred','triple_pred']].sum(axis=1)
predicted_balls_in_play = model_df[['out_pred','single_pred', 'double_pred','triple_pred']].sum(axis=1)
model_df['babip_pred'] = predicted_hits.div(predicted_balls_in_play)

### Create wOBA estimates for each count
# {(game_year, balls, strikes): pa_wOBA}
wOBA_expectancies = (
    pd.read_csv('wOBA_expectancy_matrix.csv')
    .set_index(['game_year','balls','strikes'])
    .to_dict()['pa_wOBA']
)

model_df['count_wOBA'] = None
for year in model_df['year_played'].unique():
    season_rows = model_df['year_played']==year
    # Row-wise lookup of the expectancy for that season and count
    model_df.loc[season_rows,'count_wOBA'] = model_df.loc[season_rows].apply(
        lambda pitch: wOBA_expectancies[(year,pitch['balls'],pitch['strikes'])],
        axis=1
    )

### Find the estimated change in wOBA/runs for each pitch
# wOBA value of an outcome, based on the count that it came in
outcome_wOBAs = pd.read_csv('outcome_wOBA_values.csv').set_index(['year_played','balls','strikes'])

# Seasonal constants for calculating/scaling PLV
seasonal_constants = pd.read_csv('plv_seasonal_constants.csv').set_index('year')

model_df = model_df.merge(outcome_wOBAs,
                          how='left',
                          on=['year_played','balls','strikes'])

# wOBA_effect is how the pitch is expected to affect wOBA
# (either by moving the count, or by ending the PA)
model_df['wOBA_effect'] = 0
# Pitch Runs (wOBA on a run scale)
model_df['pitch_runs'] = None

# Sum of probability * value over every outcome; missing values use the column median
for stat in [x[:-5] for x in list(outcome_wOBAs.columns)]:
    outcome_probability = model_df[stat+'_pred'].fillna(model_df[stat+'_pred'].median())
    outcome_value = model_df[stat+'_wOBA'].fillna(model_df[stat+'_wOBA'].median())
    model_df['wOBA_effect'] = model_df['wOBA_effect'].add(outcome_probability.mul(outcome_value))

for year in model_df['year_played'].unique():
    season_rows = model_df['year_played']==year
    season_constants = seasonal_constants.loc[year]

    # Default run value is average runs per pitch
    model_df.loc[season_rows,'pitch_runs'] = season_constants['er_per_pitch']
    wOBA_as_runs = model_df.loc[season_rows,'wOBA_effect'].div(season_constants['run_constant'])
    model_df.loc[season_rows,'pitch_runs'] = model_df.loc[season_rows,'pitch_runs'].add(wOBA_as_runs).mul(0.915) # 0.915 is the rough translation of runs to earned runs
model_df['pitch_runs'] = model_df['pitch_runs'].astype('float')

# Calculate PLV by scaling pitch_runs of each pitch to (generally) be 0-10
model_df['PLV'] = None
for year in model_df['year_played'].unique():
    season_rows = model_df['year_played']==year
    season_constants = seasonal_constants.loc[year]
    centered_runs = model_df.loc[season_rows,'pitch_runs'].sub(season_constants['run_plv_constant'])
    model_df.loc[season_rows,'PLV'] = centered_runs.div(season_constants['run_plv_coef'])
model_df['PLV'] = model_df['PLV'].astype('float')
print('PLV done')
    
# Hitting
### Hitter Stats

# A pitch counts as a swing when its cleaned description is one of these
swing_descriptions = ['single','double','triple','home_run','out','foul_strike','swinging_strike']
model_df['swing'] = 0
model_df.loc[model_df['cleaned_description'].isin(swing_descriptions),'swing'] = 1

# Swing % compared to expected
swing_pred_cols = ['foul_strike_pred','swinging_strike_pred',
                   'out_pred', 'single_pred', 'double_pred',
                   'triple_pred', 'home_run_pred']

# Actual swing (0/1) minus the summed probability of every swing outcome
expected_swing = model_df[swing_pred_cols].sum(axis=1)
model_df['swing_agg'] = model_df['swing'].sub(expected_swing)

# compare probabilities for called strike/hit-by-pitch/ball only
sz_judge_cols = ['called_strike_pred', 'ball_pred', 'hit_by_pitch_pred']
model_df['take_probs'] = model_df[sz_judge_cols].sum(axis=1).astype('float')

# Swings are called strike probability / total
# Takes are Ball + HBP probability / total
took_pitch = model_df['swing']==0
swung_at_pitch = model_df['swing']==1

model_df['strike_zone_judgement'] = None
ball_or_hbp = model_df.loc[took_pitch,['ball_pred', 'hit_by_pitch_pred']].sum(axis=1).fillna(0)
model_df.loc[took_pitch,'strike_zone_judgement'] = ball_or_hbp.div(model_df.loc[took_pitch,'take_probs'])
called_strike = model_df.loc[swung_at_pitch,'called_strike_pred'].fillna(0)
model_df.loc[swung_at_pitch,'strike_zone_judgement'] = called_strike.div(model_df.loc[swung_at_pitch,'take_probs'])

# Weighted value columns for swing/take
take_wOBA_cols = [take_outcome+'_wOBA_w'
                  for take_outcome in ['ball','hit_by_pitch','called_strike']]
swing_wOBA_cols = [swing_outcome+'_wOBA_w'
                   for swing_outcome in ['foul_strike','swinging_strike',
                                         'out','single','double',
                                         'triple','home_run']]

def decision_cols(data,take_cols,swing_cols):
    """Compute the expected wOBA value of taking and of swinging for each row.

    For every column name ``<outcome>_wOBA_w`` in ``take_cols`` and
    ``swing_cols`` the weighted column is written to ``data`` as
    ``<outcome>_wOBA * <outcome>_pred``. The weighted columns are then summed
    per row into ``take_value`` and ``swing_value``.

    Parameters
    ----------
    data : DataFrame holding the ``<outcome>_wOBA`` and ``<outcome>_pred``
        columns. It is modified in place: the weighted columns plus
        ``take_value`` and ``swing_value`` are added.
    take_cols : list of ``<outcome>_wOBA_w`` names for take outcomes.
    swing_cols : list of ``<outcome>_wOBA_w`` names for swing outcomes.

    Returns
    -------
    DataFrame with the two columns ``take_value`` and ``swing_value``.
    """
    take_cols = list(take_cols)
    swing_cols = list(swing_cols)

    # Use the arguments that were passed in rather than same-named globals
    for stat in take_cols+swing_cols:
        # 'x_wOBA_w'[:-2] -> 'x_wOBA' ; 'x_wOBA_w'[:-6]+'pred' -> 'x_pred'
        data[stat] = data[stat[:-2]] * data[stat[:-6]+'pred']

    data['take_value'] = data[take_cols].sum(axis=1)
    data['swing_value'] = data[swing_cols].sum(axis=1)
    
    return data[['take_value','swing_value']]

model_df[['take_value','swing_value']] = decision_cols(model_df,take_wOBA_cols,swing_wOBA_cols)

# Decision Value is the opportunity cost of the decision that was made:
#   takes  -> take value - swing value
#   swings -> swing value - take value
model_df['decision_value'] = None
took_pitch = model_df['swing']==0
swung_at_pitch = model_df['swing']==1
model_df.loc[took_pitch,'decision_value'] = model_df.loc[took_pitch,'take_value'] - model_df.loc[took_pitch,'swing_value']
model_df.loc[swung_at_pitch,'decision_value'] = model_df.loc[swung_at_pitch,'swing_value'] - model_df.loc[swung_at_pitch,'take_value']

swung_at_pitch = model_df['swing']==1

# Contact flag: 1 for every swing, then 0 for swings that were swinging strikes
# (rows without a swing keep the None placeholder)
model_df['contact'] = None
model_df.loc[swung_at_pitch,'contact'] = 1
model_df.loc[swung_at_pitch &
             (model_df['cleaned_description']=='swinging_strike'),'contact'] = 0

# Contact probability on swings: summed contact-outcome probabilities
# (clipped to [0.0001, 1]) as a share of contact + swinging-strike probability
contact_outcome_preds = ['single_pred','double_pred','triple_pred','home_run_pred','out_pred','foul_strike_pred']
model_df['contact_pred'] = None
model_df.loc[swung_at_pitch,'contact_pred'] = np.clip(
    model_df.loc[swung_at_pitch,contact_outcome_preds].sum(axis=1),
    a_min=0.0001, a_max=1)
model_df.loc[swung_at_pitch,'contact_pred'] = (
    model_df.loc[swung_at_pitch,'contact_pred']
    .div(model_df.loc[swung_at_pitch,['contact_pred','swinging_strike_pred']].sum(axis=1))
)

# Contact ability (contact outcome - contact probability), swings only
model_df['contact_over_expected'] = None
model_df.loc[swung_at_pitch,'contact_over_expected'] = model_df.loc[swung_at_pitch,'contact'] - model_df.loc[swung_at_pitch,'contact_pred']

# bbe flag: 1 for swings, then 0 for non-swings, swinging strikes and fouls
no_ball_in_play = ((model_df['swing']==0) |
                   (model_df['cleaned_description'].isin(['swinging_strike','foul_strike'])))
model_df['bbe'] = None
model_df.loc[swung_at_pitch,'bbe'] = 1
model_df.loc[no_ball_in_play,'bbe'] = 0

# ISO Dict
# ISO weight assigned to each ball-in-play outcome
iso_dict = {
    'out':0,
    'single':0,
    'double':1,
    'triple':2,
    'home_run':3
}

# ISO on Contact: probability-weighted ISO of the pitch ...
model_df['pitch_ISO'] = 0

for outcome, iso_weight in iso_dict.items():
    model_df['pitch_ISO'] += model_df[outcome+'_pred'] * iso_weight

# ... divided by in_play_input (missing values replaced by 0.00001)
model_df['pitch_ISO'] = model_df['pitch_ISO'] / model_df['in_play_input'].fillna(0.00001)
# Blank out rows flagged bbe == 0
model_df.loc[model_df['bbe']==0,'pitch_ISO'] = None

# Observed ISO weight of the actual outcome, only for rows flagged bbe == 1
batted_ball = model_df['bbe']==1
model_df['outcome_ISO'] = None
model_df.loc[batted_ball,'outcome_ISO'] = model_df.loc[batted_ball,'cleaned_description'].map(iso_dict)

# Rows with both launch measurements and an outcome ISO
iso_df = (
    model_df[['launch_speed', 'launch_angle','cleaned_description','outcome_ISO']]
    .dropna()
    .astype({'outcome_ISO':'int'})
    .copy()
)

### KNN xISO Model
# The commented-out lines below fit a 100-neighbor KNN classifier on
# (launch_angle, launch_speed) -> outcome_ISO and pickle it to xISO_model.pkl.
# knn_clf = KNeighborsClassifier(n_neighbors=100)
# knn_clf.fit(iso_df[['launch_angle','launch_speed']],iso_df['outcome_ISO'])
# with open('xISO_model.pkl', 'wb') as f:
#     pickle.dump(knn_clf, f)

# Load the classifier from xISO_model.pkl in the working directory.
# NOTE(review): pickle.load can execute arbitrary code from the file;
# only load an xISO_model.pkl that comes from a trusted source.
with open('xISO_model.pkl', 'rb') as f:
    knn_clf = pickle.load(f)

def adj_power(data, model=None):
    """Predict ISO from launch angle and launch speed for each row of ``data``.

    Rows that have both ``launch_angle`` and ``launch_speed`` are scored with
    the classifier's ``predict_proba``. The four class probabilities are stored
    in ``0_pred`` .. ``3_pred`` and combined as
    ``1_pred + 2*2_pred + 3*3_pred``. Rows missing either measurement get NaN.

    Parameters
    ----------
    data : DataFrame with ``launch_angle`` and ``launch_speed`` columns.
        Modified in place: ``0_pred``..``3_pred`` and ``pred_ISO`` are added.
    model : optional classifier exposing ``predict_proba`` that returns four
        probability columns. Defaults to the notebook-level ``knn_clf``.

    Returns
    -------
    float Series ``pred_ISO`` aligned to ``data``.
    """
    clf = model if model is not None else knn_clf
    prob_cols = ['0_pred','1_pred','2_pred','3_pred']

    # Evaluate the "has launch data" mask once instead of at every use
    has_launch_data = data['launch_angle'].notna() & data['launch_speed'].notna()

    # Float NaN placeholders keep the columns numeric (None would make them object dtype)
    for col in prob_cols:
        data[col] = np.nan

    # Skip the model call when nothing can be scored
    if has_launch_data.any():
        data.loc[has_launch_data, prob_cols] = clf.predict_proba(
            data.loc[has_launch_data, ['launch_angle','launch_speed']])

    # NaN probabilities propagate, so unscored rows stay NaN
    data['pred_ISO'] = (data['1_pred'] + data['2_pred']*2 + data['3_pred']*3).astype('float')
    return data['pred_ISO']

# ISO predicted from the launch angle/speed of each row
model_df['pred_ISO'] = adj_power(model_df)

# Adjusted power: launch-based ISO minus the pitch's ISO on contact
model_df['adj_power'] = model_df['pred_ISO'].sub(model_df['pitch_ISO'])

# Calculate batter_wOBA_added, based on a pitch's outcome and quality
model_df['batter_wOBA'] = None
pitch_outcomes = ['ball', 'foul_strike', 'swinging_strike', 'called_strike',
                  'home_run', 'single', 'out', 'double', 'hit_by_pitch', 'triple']

# Only rows whose outcome has a matching '<outcome>_wOBA' column are scored
known_outcome = model_df['cleaned_description'].isin(pitch_outcomes)

for year in model_df['year_played'].unique():
    season_rows = (model_df['year_played']==year) & known_outcome
    # wOBA of the outcome that actually happened, looked up row by row
    realized_wOBA = model_df.loc[season_rows].apply(lambda x: x[x['cleaned_description']+'_wOBA'], axis=1)
    # Subtract the wOBA the pitch was expected to produce
    model_df.loc[season_rows,'batter_wOBA'] = realized_wOBA.sub(model_df.loc[season_rows,'wOBA_effect'])
    
model_df['batter_wOBA'] = model_df['batter_wOBA'].astype('float')
print('Hitters done')

# Segmenting: calendar fields parsed once from the game date
game_dates = pd.to_datetime(model_df['game_played'])
model_df['day_played'] = game_dates.dt.day
model_df['month_played'] = game_dates.dt.month

# groundball_pred: summed probability of every launch-velo bucket in the '10deg' group
groundball_cols = ['10deg: '+velo+'_pred'
                   for velo in ['<90mph','90-95mph','95-100mph','100-105mph','105+mph']]
model_df['groundball_pred'] = model_df[groundball_cols].sum(axis=1)

print('App raw data complete')

PLA done
Data loaded
Arm Angle Done


  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mph_raw',
  model_df[[launch_angle+'_input',launch_angle+': <90mp

Fastball Swing model done
Fastball Take model done
Fastball Swing Result model done
Fastball Contact model done
Fastball Launch Angle model done
Fastball Launch Velo model done
Breaking Ball Swing model done
Breaking Ball Take model done
Breaking Ball Swing Result model done
Breaking Ball Contact model done
Breaking Ball Launch Angle model done
Breaking Ball Launch Velo model done
Offspeed Swing model done
Offspeed Take model done
Offspeed Swing Result model done
Offspeed Contact model done
Offspeed Launch Angle model done
Offspeed Launch Velo model done
PLV done




Hitters done
App raw data complete


In [None]:
### Location Model
# Pre-create every location-model output column with a None placeholder
for col in ['take_input','swing_input','called_strike_raw','ball_raw',
            'hit_by_pitch_raw','swinging_strike_raw','contact_raw',
            'foul_strike_raw','in_play_raw','10deg_raw','10-20deg_raw',
            '20-30deg_raw','30-40deg_raw','40-50deg_raw','50+deg_raw',
            'called_strike_pred','ball_pred','hit_by_pitch_pred','contact_input',
            'swinging_strike_pred','foul_strike_pred','in_play_input','50+deg_pred',
            'out_pred', 'single_pred', 'double_pred', 'triple_pred', 'home_run_pred']:
    model_df[col+'_loc'] = None

loc_angle_buckets = ['10deg','10-20deg','20-30deg','30-40deg','40-50deg']
loc_velo_buckets = ['<90mph','90-95mph','95-100mph','100-105mph','105+mph']

# Per launch-angle bucket: one input column, then raw and pred columns per velo bucket
for launch_angle in loc_angle_buckets:
    model_df[launch_angle+'_input_loc'] = None
    for stage in ['_raw_loc','_pred_loc']:
        for velo in loc_velo_buckets:
            model_df[launch_angle+': '+velo+stage] = None

### Apply models
for pitch_type in ['Fastball','Breaking Ball','Offspeed']:
    # Unzip model files
#     with zipfile.ZipFile('pl_{}_loc_model_files.zip'.format(pitch_type), 'r') as zip_ref:
#         zip_ref.extractall()

    # Rows of this pitch-type bucket (the column is not modified inside the loop)
    bucket_rows = model_df['pitch_type_bucket']==pitch_type

    # Swing Decision
    with open('pl_swing_model_{}_loc.pkl'.format(pitch_type), 'rb') as f:
        decision_model = pickle.load(f)

    model_df.loc[bucket_rows,['take_input_loc','swing_input_loc']] = decision_model.predict_proba(model_df.loc[bucket_rows,decision_model.feature_names_in_])
    print(pitch_type+' Swing model done')

    # Take Result: conditional take outcomes scaled by the take probability
    with open('pl_take_result_model_{}_loc.pkl'.format(pitch_type), 'rb') as f:
        take_model = pickle.load(f)

    model_df.loc[bucket_rows,['called_strike_raw_loc','ball_raw_loc','hit_by_pitch_raw_loc']] = take_model.predict_proba(model_df.loc[bucket_rows,take_model.feature_names_in_])
    take_share = model_df.loc[bucket_rows,'take_input_loc']
    for take_outcome in ['called_strike','ball','hit_by_pitch']:
        model_df.loc[bucket_rows,take_outcome+'_pred_loc'] = model_df.loc[bucket_rows,take_outcome+'_raw_loc'].mul(take_share)
    print(pitch_type+' Take model done')

    # Swing Result: conditional swing outcomes scaled by the swing probability
    with open('pl_contact_model_{}_loc.pkl'.format(pitch_type), 'rb') as f:
        swing_result_model = pickle.load(f)

    model_df.loc[bucket_rows,['swinging_strike_raw_loc','contact_raw_loc']] = swing_result_model.predict_proba(model_df.loc[bucket_rows,swing_result_model.feature_names_in_])
    swing_share = model_df.loc[bucket_rows,'swing_input_loc']
    model_df.loc[bucket_rows,'contact_input_loc'] = model_df.loc[bucket_rows,'contact_raw_loc'].mul(swing_share)
    model_df.loc[bucket_rows,'swinging_strike_pred_loc'] = model_df.loc[bucket_rows,'swinging_strike_raw_loc'].mul(swing_share)
    print(pitch_type+' Swing Result model done')

    # Contact Result: foul vs in play, scaled by the contact probability
    with open('pl_in_play_model_{}_loc.pkl'.format(pitch_type), 'rb') as f:
        contact_model = pickle.load(f)

    model_df.loc[bucket_rows,['foul_strike_raw_loc','in_play_raw_loc']] = contact_model.predict_proba(model_df.loc[bucket_rows,contact_model.feature_names_in_])
    contact_share = model_df.loc[bucket_rows,'contact_input_loc']
    model_df.loc[bucket_rows,'foul_strike_pred_loc'] = model_df.loc[bucket_rows,'foul_strike_raw_loc'].mul(contact_share)
    model_df.loc[bucket_rows,'in_play_input_loc'] = model_df.loc[bucket_rows,'in_play_raw_loc'].mul(contact_share)
    print(pitch_type+' Contact model done')

    # Launch Angle Result: angle buckets scaled by the in-play probability
    with open('pl_launch_angle_result_model_{}_loc.pkl'.format(pitch_type), 'rb') as f:
        launch_angle_model = pickle.load(f)

    model_df.loc[bucket_rows,['10deg_raw_loc','10-20deg_raw_loc','20-30deg_raw_loc','30-40deg_raw_loc','40-50deg_raw_loc','50+deg_raw_loc']] = launch_angle_model.predict_proba(model_df.loc[bucket_rows,launch_angle_model.feature_names_in_])
    in_play_share = model_df.loc[bucket_rows,'in_play_input_loc']
    for launch_angle in loc_angle_buckets:
        model_df.loc[bucket_rows,launch_angle+'_input_loc'] = model_df.loc[bucket_rows,launch_angle+'_raw_loc'].mul(in_play_share)
    model_df.loc[bucket_rows,'50+deg_pred_loc'] = model_df.loc[bucket_rows,'50+deg_raw_loc'].mul(in_play_share)
    print(pitch_type+' Launch Angle model done')

    # Launch Velo Result: one model per launch-angle bucket, scaled by that bucket's probability
    for launch_angle in loc_angle_buckets:
        with open('pl_{}_model_{}_loc.pkl'.format(launch_angle,pitch_type), 'rb') as f:
            launch_velo_model = pickle.load(f)

        angle_velo_buckets = [launch_angle+': '+x for x in loc_velo_buckets]
        model_df.loc[bucket_rows,[name+'_raw_loc' for name in angle_velo_buckets]] = launch_velo_model.predict_proba(model_df.loc[bucket_rows,launch_velo_model.feature_names_in_])
        angle_share = model_df.loc[bucket_rows,launch_angle+'_input_loc']
        for bucket in angle_velo_buckets:
            model_df.loc[bucket_rows,bucket+'_pred_loc'] = model_df.loc[bucket_rows,bucket+'_raw_loc'].mul(angle_share)
    print(pitch_type+' Launch Velo model done')
    
    
del decision_model, take_model, swing_result_model, contact_model, launch_angle_model, launch_velo_model

# Apply averages to each predicted grouping
for year in list(model_df['year_played'].unique()):
    season_rows = model_df['year_played']==year
    for outcome in ['out', 'single', 'double', 'triple', 'home_run']:
        # Start with 50+ degrees (popups)
        model_df.loc[season_rows,outcome+'_pred_loc'] = model_df.loc[season_rows,'50+deg_pred_loc']*bip_result_dict[(year,'50+deg')][outcome]
    
        # Then add every launch-angle / launch-velo bucket
        for launch_angle in loc_angle_buckets:
            for bucket in [launch_angle+': '+x for x in loc_velo_buckets]:
                model_df.loc[season_rows,outcome+'_pred_loc'] += model_df.loc[season_rows,bucket+'_pred_loc']*bip_result_dict[(year,bucket)][outcome]
# wOBA_effect is how the pitch is expected to affect wOBA
# (either by moving the count, or by ending the PA)
model_df['wOBA_effect_loc'] = 0
# Pitch Runs (wOBA on a run scale)
model_df['pitch_runs_loc'] = None

for stat in [x[:-5] for x in list(outcome_wOBAs.columns)]:
    # Missing probabilities / wOBA values are filled with the column median
    stat_prob = model_df[stat+'_pred_loc']
    stat_wOBA = model_df[stat+'_wOBA']
    model_df['wOBA_effect_loc'] = model_df['wOBA_effect_loc'].add(stat_prob.fillna(stat_prob.median()).mul(stat_wOBA.fillna(stat_wOBA.median())))

for year in model_df['year_played'].unique():
    season_rows = model_df['year_played']==year
    season_consts = seasonal_constants.loc[year]
    # Default run value is average runs per pitch
    model_df.loc[season_rows,'pitch_runs_loc'] = season_consts['er_per_pitch']
    model_df.loc[season_rows,'pitch_runs_loc'] = (
        model_df.loc[season_rows,'pitch_runs_loc']
        .add(model_df.loc[season_rows,'wOBA_effect_loc'].div(season_consts['run_constant']))
        .mul(0.915) # 0.915 is the rough translation of runs to earned runs
    )
model_df['pitch_runs_loc'] = model_df['pitch_runs_loc'].astype('float')

# Calculate PLV by scaling pitch_runs of each pitch to (generally) be 0-10:
# per season, centered at 5 with half a standard deviation per point, sign flipped
model_df['PLV_loc'] = None
for year in model_df['year_played'].unique():
    season_rows = (model_df['year_played']==year)
    season_runs = model_df.loc[season_rows,'pitch_runs_loc']
    model_df.loc[season_rows,'PLV_loc'] = -(season_runs - season_runs.mean()) / (season_runs.std()/2) + 5
model_df['PLV_loc'] = model_df['PLV_loc'].astype('float')

Fastball Swing model done
Fastball Take model done


In [None]:
### Stuff Model
# Pre-create every stuff-model output column with a None placeholder
for col in ['foul_strike_raw','in_play_raw','10deg_raw','10-20deg_raw',
            '20-30deg_raw','30-40deg_raw','40-50deg_raw','50+deg_raw','contact_input',
            'swinging_strike_pred','foul_strike_pred','in_play_input','50+deg_pred',
            'out_pred', 'single_pred', 'double_pred', 'triple_pred', 'home_run_pred']:
    model_df[col+'_stuff'] = None

stuff_angle_buckets = ['10deg','10-20deg','20-30deg','30-40deg','40-50deg']
stuff_velo_buckets = ['<90mph','90-95mph','95-100mph','100-105mph','105+mph']

# Per launch-angle bucket: one input column, then raw and pred columns per velo bucket
for launch_angle in stuff_angle_buckets:
    model_df[launch_angle+'_input_stuff'] = None
    for stage in ['_raw_stuff','_pred_stuff']:
        for velo in stuff_velo_buckets:
            model_df[launch_angle+': '+velo+stage] = None

### Apply models
for pitch_type in ['Fastball','Breaking Ball','Offspeed']:
    # Rows of this pitch-type bucket (the column is not modified inside the loop)
    bucket_rows = model_df['pitch_type_bucket']==pitch_type

    # Swing Result: the model output is stored directly as the pred/input columns
    with open('pl_contact_model_{}_stuff.pkl'.format(pitch_type), 'rb') as f:
        swing_result_model = pickle.load(f)

    model_df.loc[bucket_rows,['swinging_strike_pred_stuff','contact_input_stuff']] = swing_result_model.predict_proba(model_df.loc[bucket_rows,swing_result_model.feature_names_in_])
    print(pitch_type+' Swing Result model done')

    # Contact Result: foul vs in play, scaled by the contact probability
    with open('pl_in_play_model_{}_stuff.pkl'.format(pitch_type), 'rb') as f:
        contact_model = pickle.load(f)

    model_df.loc[bucket_rows,['foul_strike_raw_stuff','in_play_raw_stuff']] = contact_model.predict_proba(model_df.loc[bucket_rows,contact_model.feature_names_in_])
    contact_share = model_df.loc[bucket_rows,'contact_input_stuff']
    model_df.loc[bucket_rows,'foul_strike_pred_stuff'] = model_df.loc[bucket_rows,'foul_strike_raw_stuff'].mul(contact_share)
    model_df.loc[bucket_rows,'in_play_input_stuff'] = model_df.loc[bucket_rows,'in_play_raw_stuff'].mul(contact_share)
    print(pitch_type+' Contact model done')

    # Launch Angle Result: angle buckets scaled by the in-play probability
    with open('pl_launch_angle_result_model_{}_stuff.pkl'.format(pitch_type), 'rb') as f:
        launch_angle_model = pickle.load(f)

    model_df.loc[bucket_rows,['10deg_raw_stuff','10-20deg_raw_stuff','20-30deg_raw_stuff','30-40deg_raw_stuff','40-50deg_raw_stuff','50+deg_raw_stuff']] = launch_angle_model.predict_proba(model_df.loc[bucket_rows,launch_angle_model.feature_names_in_])
    in_play_share = model_df.loc[bucket_rows,'in_play_input_stuff']
    for launch_angle in stuff_angle_buckets:
        model_df.loc[bucket_rows,launch_angle+'_input_stuff'] = model_df.loc[bucket_rows,launch_angle+'_raw_stuff'].mul(in_play_share)
    model_df.loc[bucket_rows,'50+deg_pred_stuff'] = model_df.loc[bucket_rows,'50+deg_raw_stuff'].mul(in_play_share)
    print(pitch_type+' Launch Angle model done')

    # Launch Velo Result: one model per launch-angle bucket, scaled by that bucket's probability
    for launch_angle in stuff_angle_buckets:
        with open('pl_{}_model_{}_stuff.pkl'.format(launch_angle,pitch_type), 'rb') as f:
            launch_velo_model = pickle.load(f)

        angle_velo_buckets = [launch_angle+': '+x for x in stuff_velo_buckets]
        model_df.loc[bucket_rows,[name+'_raw_stuff' for name in angle_velo_buckets]] = launch_velo_model.predict_proba(model_df.loc[bucket_rows,launch_velo_model.feature_names_in_])
        angle_share = model_df.loc[bucket_rows,launch_angle+'_input_stuff']
        for bucket in angle_velo_buckets:
            model_df.loc[bucket_rows,bucket+'_pred_stuff'] = model_df.loc[bucket_rows,bucket+'_raw_stuff'].mul(angle_share)
    print(pitch_type+' Launch Velo model done')
    
del swing_result_model, contact_model, launch_angle_model, launch_velo_model

# Apply averages to each predicted grouping
for year in list(model_df['year_played'].unique()):
    season_rows = model_df['year_played']==year
    for outcome in ['out', 'single', 'double', 'triple', 'home_run']:
        # Start with 50+ degrees (popups)
        model_df.loc[season_rows,outcome+'_pred_stuff'] = model_df.loc[season_rows,'50+deg_pred_stuff']*bip_result_dict[(year,'50+deg')][outcome]
    
        # Then add every launch-angle / launch-velo bucket
        for launch_angle in stuff_angle_buckets:
            for bucket in [launch_angle+': '+x for x in stuff_velo_buckets]:
                model_df.loc[season_rows,outcome+'_pred_stuff'] += model_df.loc[season_rows,bucket+'_pred_stuff']*bip_result_dict[(year,bucket)][outcome]
# wOBA_effect is how the pitch is expected to affect wOBA
# (either by moving the count, or by ending the PA)
model_df['wOBA_effect_stuff'] = 0
# Pitch Runs (wOBA on a run scale)
model_df['pitch_runs_stuff'] = None

# The three take outcomes are left out of the stuff wOBA effect
take_only_wOBAs = ['ball_wOBA', 'called_strike_wOBA','hit_by_pitch_wOBA']
for stat in [x[:-5] for x in outcome_wOBAs.columns.values if x not in take_only_wOBAs]:
    # Missing probabilities / wOBA values are filled with the column median
    stat_prob = model_df[stat+'_pred_stuff']
    stat_wOBA = model_df[stat+'_wOBA']
    model_df['wOBA_effect_stuff'] = model_df['wOBA_effect_stuff'].add(stat_prob.fillna(stat_prob.median()).mul(stat_wOBA.fillna(stat_wOBA.median())))

for year in model_df['year_played'].unique():
    season_rows = model_df['year_played']==year
    season_consts = seasonal_constants.loc[year]
    # Default run value is average runs per pitch
    model_df.loc[season_rows,'pitch_runs_stuff'] = season_consts['er_per_pitch']
    model_df.loc[season_rows,'pitch_runs_stuff'] = (
        model_df.loc[season_rows,'pitch_runs_stuff']
        .add(model_df.loc[season_rows,'wOBA_effect_stuff'].div(season_consts['run_constant']))
        .mul(0.915) # 0.915 is the rough translation of runs to earned runs
    )
model_df['pitch_runs_stuff'] = model_df['pitch_runs_stuff'].astype('float')

# Calculate Stuff PLV using St Dev for each pitch group, centered at 5. Approx. 0-10 scale
model_df['PLV_stuff'] = None
for year in model_df['year_played'].unique():
    for pitch_group in ['Fastball','Breaking Ball','Offspeed']:
        group_rows = ((model_df['year_played']==year) &
                      (model_df['pitch_type_bucket']==pitch_group))
        group_runs = model_df.loc[group_rows,'pitch_runs_stuff']
        model_df.loc[group_rows,'PLV_stuff'] = -(group_runs - group_runs.mean()) / (group_runs.std()/2) + 5
model_df['PLV_stuff'] = model_df['PLV_stuff'].astype('float')

In [None]:
# Stuff PLV per (season, pitch group): centered at 5, half a standard deviation
# per point, sign flipped. This repeats the PLV_stuff calculation that closes
# the Stuff Model cell.
model_df['PLV_stuff'] = None
for year in model_df['year_played'].unique():
    for pitch_group in ['Fastball','Breaking Ball','Offspeed']:
        group_rows = ((model_df['year_played']==year) &
                      (model_df['pitch_type_bucket']==pitch_group))
        group_runs = model_df.loc[group_rows,'pitch_runs_stuff']
        model_df.loc[group_rows,'PLV_stuff'] = -(group_runs - group_runs.mean()) / (group_runs.std()/2) + 5
model_df['PLV_stuff'] = model_df['PLV_stuff'].astype('float')

In [None]:
# Preview the first five rows of the scored pitch table
model_df.head(5)

In [None]:
# 2023 Stuff PLV leaderboard: pitch count and mean PLV_stuff per pitcher,
# keeping pitchers with at least 300 pitches
(
    model_df
    .loc[model_df['year_played']==2023]
    .groupby('pitchername')
    .agg(pitch_id=('pitch_id','count'),
         PLV_stuff=('PLV_stuff','mean'))
    .query('pitch_id >=300')
    .sort_values('PLV_stuff', ascending=False)
)

In [None]:
# Display the scored pitch table
model_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot Style
pl_white = '#FEFEFE'
pl_background = '#162B50'
pl_text = '#72a3f7'
pl_line_color = '#293a6b'

sns.set_theme(
    style={
        'axes.edgecolor': pl_background,
        'axes.facecolor': pl_background,
        'axes.labelcolor': pl_white,
        'xtick.color': pl_white,
        'ytick.color': pl_white,
        'figure.facecolor':pl_background,
        'grid.color': pl_background,
        'grid.linestyle': '-',
        'legend.facecolor':pl_background,
        'text.color': pl_white
     }
    )

# Pitch Names
pitch_names = {
    'FF':'Four-Seamer', 
    'SI':'Sinker',
    'FS':'Splitter',  
    'FC':'Cutter', 
    'SL':'Slider', 
    'CU':'Curveball',
    'CH':'Changeup', 
    'KN':'Knuckleball',
    'SC':'Screwball', 
    'UN':'Unknown', 
}

# Marker Style
marker_colors = {
    'FF':'#d22d49', 
    'SI':'#c57a02',
    'FS':'#00a1c5',  
    'FC':'#933f2c', 
    'SL':'#9300c7', 
    'CU':'#3c44cd',
    'CH':'#07b526', 
    'KN':'#999999',
    'SC':'#999999', 
    'UN':'#999999', 
}

# Strike zone bottom/top and the vertical extent of the chart
sz_bot = 1.5
sz_top = 3.5
y_bot = 1
y_lim = 4

# Zone geometry derived from the constants above, so the inner grid and the
# axis limits cannot drift apart from sz_bot/sz_top/y_bot/y_lim
zone_half_width = 10/12
zone_third_width = 10/36
zone_third_height = (sz_top - sz_bot)/3
# axvline takes ymin/ymax as fractions of the axes height, which is only
# correct when ylim is exactly (y_bot, y_lim)
zone_frac_bot = (sz_bot-y_bot)/(y_lim-y_bot)
zone_frac_top = (sz_top-y_bot)/(y_lim-y_bot)

# Right-handed pitchers vs left-handed batters only
chart_df = model_df.loc[(model_df['p_hand']=='R') &
                        (model_df['b_hand']=='L')].copy()

for pitch_type in ['Fastball','Breaking Ball','Offspeed']:
    bucket_df = chart_df.loc[chart_df['pitch_type_bucket']==pitch_type]

    fig, ax = plt.subplots(figsize=(8,8))
    # Strike zone outline
    ax.plot([-zone_half_width,zone_half_width], [sz_bot,sz_bot], color='w', linewidth=2)
    ax.plot([-zone_half_width,zone_half_width], [sz_top,sz_top], color='w', linewidth=2)
    ax.plot([-zone_half_width,-zone_half_width], [sz_bot,sz_top], color='w', linewidth=2)
    ax.plot([zone_half_width,zone_half_width], [sz_bot,sz_top], color='w', linewidth=2)
    
    # Inner Strike zone (thirds)
    for third in (1,2):
        third_height = sz_bot + third*zone_third_height
        ax.plot([-zone_half_width,zone_half_width], [third_height,third_height], color='w', linewidth=1)
    ax.axvline(zone_third_width, ymin=zone_frac_bot, ymax=zone_frac_top, color='w', linewidth=1)
    ax.axvline(-zone_third_width, ymin=zone_frac_bot, ymax=zone_frac_top, color='w', linewidth=1)
    
    # PLV_loc per hexagonal location bin
    ax.hexbin(bucket_df['p_x'],
              bucket_df['p_z'],
              C=bucket_df['PLV_loc'],
              extent=(-1.5,1.5,y_bot,y_lim),
              gridsize=(18,12),
#               linewidths=0.25,
#               edgecolors='#dddddd',
              cmap='vlag',
              facecolor=pl_background,
              vmin=3,
              vmax=7,
              mincnt=0,
              alpha=0.5)
    
    # x-axis runs from 1.5 to -1.5 (reversed); y limits match the axvline fractions
    ax.set(xlim=(1.5,-1.5),
           ylim=(y_bot,y_lim))
    fig.suptitle(pitch_type+' Location Values',y=0.925)
    sns.despine()

In [None]:
# List every column currently in model_df
model_df.columns.values

In [None]:
# Columns shipped to the app
app_cols = ['pitch_id','game_played','mlb_game_id','year_played',
            'pitchername','month_played','pitcher_mlb_id','p_hand','pitch_extension',
            'hittername','hitter_mlb_id','b_hand','pitchtype','pitch_type_bucket',
            'p_x','p_z','strike_zone_top','strike_zone_bottom','IVB','IHB','adj_vaa',
            'balls','strikes','velo','PLV','PLV_loc','PLV_stuff','swinging_strike_pred',
            'called_strike_pred','in_play_input','ball_pred','babip_pred',
            'home_run_pred','cleaned_description','swing_agg','strike_zone_judgement',
            'decision_value','contact_over_expected','adj_power','batter_wOBA',
            'arm_angle']

# Only rows with a PLV value are exported
app_df = model_df[app_cols].dropna(subset=['PLV']).copy()
for col in ['strike_zone_judgement','contact_over_expected','swinging_strike_pred',
            'called_strike_pred','in_play_input','decision_value','adj_power',
            'batter_wOBA','ball_pred','babip_pred','home_run_pred','p_x','p_z',
            'arm_angle']:
    app_df[col] = app_df[col].astype('float')
    
for col in ['pitchername','p_hand','hittername','b_hand',
            'pitchtype','pitch_type_bucket','cleaned_description']:
    app_df[col] = app_df[col].astype('category')
    
app_df['game_played'] = pd.to_datetime(app_df['game_played'])
app_df['month_played'] = app_df['month_played'].astype('int8')

## downcasting loop: shrink 64-bit numeric columns to the smallest dtype that fits
for column in app_df.columns:
    if app_df[column].dtype == 'float64':
        app_df[column]=pd.to_numeric(app_df[column], downcast='float')
    if app_df[column].dtype == 'int64':
        app_df[column]=pd.to_numeric(app_df[column], downcast='integer')

# One parquet file per (year, month)
for year in [#2020,2021,2022,
             2023]:
    season_df = app_df.loc[app_df['year_played']==year]
    # Months 3-10 are always written, as before. Any other month that has data
    # (e.g. games played in November) also gets a file, so those rows are no
    # longer silently left out of the export.
    export_months = sorted(set(range(3,11)) |
                           {int(m) for m in season_df['month_played'].unique()})
    for month in export_months:
        (season_df
         .loc[season_df['month_played']==month]
         .drop(columns=['month_played'])
         .to_parquet(f'{year}_PLV_App_Data-{month}.parquet',
                     index=False)
        )
print('App data update complete')

In [None]:
# Most recent game date present in the exported app data
app_df['game_played'].max()

In [None]:
# Cast once so both lines work on floats
# (NOTE(review): IVB/IHB are not in the float-cast list of the export cell, so
# they may still be a non-float dtype here; verify)
ivb_float = app_df['IVB'].astype('float')
ihb_float = app_df['IHB'].astype('float')

# Length of the (IHB, IVB) movement vector
app_df['total_IB'] = (ihb_float**2 + ivb_float**2)**0.5

# Angle between the movement vector and the +IVB axis, in degrees [0, 180].
# With total_IB**2 = IVB**2 + IHB**2 the law-of-cosines expression
# (IVB**2 + total_IB**2 - IHB**2) / (2*IVB*total_IB) reduces to IVB/total_IB.
# The reduced form avoids 0/0 (NaN) when IVB is exactly 0, where the angle is 90.
# Zero-length movement stays NaN, and the clip guards arccos against ratios
# pushed just outside [-1, 1] by rounding.
cos_move_angle = ivb_float.div(app_df['total_IB'].where(app_df['total_IB'] > 0)).clip(-1, 1)
app_df['move_angle'] = np.rad2deg(np.arccos(cos_move_angle))

In [None]:
# app_df['move_angle'] = np.rad2deg(np.arctan(app_df['IVB'].div(app_df['IHB'])))
# Mirror the angle (360 - angle) for negative IHB, then mirror again for
# left-handed pitchers. The second step reads the result of the first, so the
# order matters, and re-running this cell flips the angles again.
negative_ihb = (app_df['IHB']<0)
app_df.loc[negative_ihb,'move_angle'] = app_df.loc[negative_ihb,'move_angle'].rsub(360)
lefty_pitcher = (app_df['p_hand']=='L')
app_df.loc[lefty_pitcher,'move_angle'] = app_df.loc[lefty_pitcher,'move_angle'].rsub(360)

In [None]:
# Density of movement angles across all exported pitches
fig, ax = plt.subplots(figsize=(6,6))
sns.kdeplot(app_df['move_angle'], cut=0, ax=ax)
sns.despine()

In [None]:
# Movement profile of right-handed pitchers, colored by movement angle
righty_pitches = app_df.loc[app_df['p_hand']=='R']
sns.scatterplot(data=righty_pitches, x='IHB', y='IVB', hue='move_angle')
sns.despine()

In [None]:
# Bucket arm angles into named slots: (0,30], (30,60], (60,90], (90,180]
arm_df['arm_slot'] = pd.cut(arm_df['arm_angle'],bins=[0,30,60,90,180],labels=['Overhand','Three-Quarters','Sidearm','Submarine'])

# Pitchers with at least 500 pitches, ordered by arm angle, plus comparisons of
# pitcher_height against extension and release height (each multiplied by 12)
arm_test = (
    arm_df
    .query('num_pitches >=500')
    .sort_values('arm_angle')
    .assign(
        angle_percentile=lambda d: 1 - d['arm_angle'].rank(pct=True),
        height_ext_ratio=lambda d: d['pitcher_height'].div(d['pitch_extension_cleaned'].mul(12)),
        height_ext_diff=lambda d: d['pitcher_height'].sub(d['pitch_extension_cleaned'].mul(12)),
        height_release_ratio=lambda d: d['pitcher_height'].div(d['z0'].mul(12)),
        height_release_diff=lambda d: d['pitcher_height'].sub(d['z0'].mul(12)),
        hrr_percentile=lambda d: 1 - d['height_release_ratio'].rank(pct=True),
        percentile_diff=lambda d: d['hrr_percentile'].sub(d['angle_percentile']),
    )
)
arm_test.sort_values('height_release_ratio').round(2)

In [None]:
# Release height (z0) vs pitcher height, colored by their difference
sns.scatterplot(data=arm_test, x='z0', y='pitcher_height',
                hue='height_release_diff', legend=False)
sns.despine()

In [None]:
# Arm angle vs height-minus-release-height, colored by height-minus-extension
sns.scatterplot(data=arm_test, x='arm_angle', y='height_release_diff',
                hue='height_ext_diff', legend=False)
sns.despine()

In [None]:
# Look up the arm_test rows for two specific pitchers
arm_test.loc[arm_test['pitchername'].isin(['Justin Verlander','Bryce Miller'])]

In [None]:
# Arm-angle summary columns, ordered by height/extension ratio
arm_summary_cols = ['pitchername','arm_angle','angle_percentile',
                    'height_release_ratio','hrr_percentile',
                    'percentile_diff','height_ext_ratio']
arm_test[arm_summary_cols].sort_values('height_ext_ratio').round(2)#.head(5)

In [None]:
# Arm-angle percentile vs height/release-ratio percentile
sns.scatterplot(data=arm_test, x='angle_percentile', y='hrr_percentile')

In [None]:
# NOTE(review): plt and sns are already used by earlier cells; these imports
# belong in the notebook's top import cell so a fresh "Run All" works.
import matplotlib.pyplot as plt
import seaborn as sns

# Release point (x0, z0) for pitchers with at least 100 pitches, colored by
# the arm_slot label.
fig, ax = plt.subplots(figsize=(8,8))
sns.scatterplot(
    data=arm_df.query('num_pitches >=100'),
    x='x0',
    y='z0',
    hue='arm_slot',
    ax=ax,
)
# The vertical axis plots z0, so it is labeled "Release Z" to match the
# plotted column (it was previously labeled "Release Y").
ax.set(xlim=(-4.5,4.5),
       xlabel='Release X (ft)',
       ylim=(1,7.5),
       ylabel='Release Z (ft)',
       aspect=1
      )
ax.legend(loc='lower center')
# Figure-level title with the axes title positioned beneath it as a subtitle.
fig.suptitle('Estimated Pitcher Arm Slots',x=0.5,y=0.835,ha='center',fontsize=14)
ax.set_title('(Based on Release Location and Pitcher Height)',x=0.485,ha='center',fontsize=8)
sns.despine()

In [None]:
# Predicted-outcome columns that are summed into take_probs.
take_cols = [
    'called_strike_pred',
    'ball_pred',
    'hit_by_pitch_pred',
]

# take_probs: row-wise total of the take-outcome predictions.
model_df['take_probs'] = model_df.loc[:, take_cols].sum(axis=1).astype('float')

# zone_prob: called-strike share of take_probs (missing called-strike
# predictions count as 0).
model_df['zone_prob'] = model_df['called_strike_pred'].fillna(0) / model_df['take_probs']

In [None]:
# Mean zone_prob per pitcher and pitch type, limited to groups with at least
# 300 pitches, highest first.
(
    model_df
    .groupby(['pitchername', 'pitchtype'])
    .agg(
        pitch_id=('pitch_id', 'count'),
        zone_prob=('zone_prob', 'mean'),
    )
    .query('pitch_id >=300')
    .sort_values('zone_prob', ascending=False)
)

In [None]:
# Pitch count and mean zone_prob for two specific pitchers.
(
    model_df[model_df['pitchername'].isin(['Bryce Miller', 'Tanner Bibee'])]
    .groupby(['pitchername'])
    .agg(
        pitch_id=('pitch_id', 'count'),
        zone_prob=('zone_prob', 'mean'),
    )
)

In [None]:
# Mean zone_prob per pitcher / pitch type / bucket, keeping groups with at
# least 300 pitches.
chart_data = (
    model_df
    .groupby(['pitchername', 'pitchtype', 'pitch_type_bucket'])
    .agg(
        pitch_id=('pitch_id', 'count'),
        zone_prob=('zone_prob', 'mean'),
    )
    .query('pitch_id >=300')
    .sort_values('zone_prob', ascending=False)
    .reset_index()
)

# One density curve per bucket; common_norm=False normalizes each curve on
# its own rather than across all buckets together.
fig, ax = plt.subplots(figsize=(7, 7))
sns.kdeplot(
    data=chart_data,
    x='zone_prob',
    hue='pitch_type_bucket',
    common_norm=False,
    cut=0,
    ax=ax,
)

# Dashed marker at each bucket's median zone_prob.
for bucket in ('Fastball', 'Breaking Ball', 'Offspeed'):
    ax.axvline(
        chart_data.loc[chart_data['pitch_type_bucket'] == bucket, 'zone_prob'].median(),
        color='w',
        linestyle='--',
        alpha=0.4,
    )
# ax.set(xlim=(0.2,2/3))
sns.despine()

In [None]:
# Show the column labels of chart_df.
# NOTE(review): chart_df is not defined in the nearby cells (the KDE cell above
# builds chart_data) — confirm it exists when the notebook is run top to bottom.
chart_df.columns.values

In [None]:
# NOTE(review): this import belongs in the notebook's top import cell.
import matplotlib as mpl

# Hexbin heat map of `target` by pitch location (p_x, p_z) for one hitter,
# drawn over a 3x3 strike-zone grid. sz_bot, sz_top and pl_background are
# expected to be defined by earlier cells.
fig, ax = plt.subplots(figsize=(8,8))

# Strike zone outline
ax.plot([-10/12,10/12], [sz_bot,sz_bot], color='w', linewidth=2)
ax.plot([-10/12,10/12], [sz_top,sz_top], color='w', linewidth=2)
ax.plot([-10/12,-10/12], [sz_bot,sz_top], color='w', linewidth=2)
ax.plot([10/12,10/12], [sz_bot,sz_top], color='w', linewidth=2)

# Inner Strike zone
# NOTE(review): the horizontal thirds hard-code 1.5 as the zone bottom —
# presumably sz_bot == 1.5 and sz_top == 3.5; confirm.
ax.plot([-10/12,10/12], [1.5+2/3,1.5+2/3], color='w', linewidth=1)
ax.plot([-10/12,10/12], [1.5+4/3,1.5+4/3], color='w', linewidth=1)
# Vertical thirds are drawn in data coordinates (like the outline) so they
# always span sz_bot..sz_top. axvline's ymin/ymax are axes fractions, which
# only line up when the final ylim equals the limits used to compute them.
ax.plot([10/36,10/36], [sz_bot,sz_top], color='w', linewidth=1)
ax.plot([-10/36,-10/36], [sz_bot,sz_top], color='w', linewidth=1)

target='batter_wOBA'
player = 'Aaron Judge'

# Filter once: rows with a non-null target for the selected hitter.
hexbin_df = model_df.dropna(subset=[target])
hexbin_df = hexbin_df.loc[hexbin_df['hittername']==player]

hb = ax.hexbin(hexbin_df['p_x'],
               hexbin_df['p_z'],
               C=hexbin_df[target],
               extent=(-1.5,1.5,1,4),
               gridsize=(12,8),
#               linewidths=0.25,
#               edgecolors='#dddddd',
               cmap='vlag',
               facecolor=pl_background,
               # Diverging color scale: center 0, min -0.02, max 0.05.
               norm=mpl.colors.TwoSlopeNorm(0,-0.02,0.05),
#           vmin=-0.02,
#           vmax=0.05,
               mincnt=0,
               alpha=0.75)

cb = fig.colorbar(hb, ax=ax)

# The x-axis is intentionally reversed (1.5 -> -1.5).
ax.set(xlim=(1.5,-1.5),
       ylim=(1,4))
# NOTE(review): the title says 'Decision Value' while target is 'batter_wOBA' —
# confirm the title and color range match the plotted target.
fig.suptitle('Decision Value',y=0.925)
sns.despine()