In [6]:
import pandas as pd 
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import datetime
import sqlite3 
import pybaseball as pyb #Pybaseball is where we will be pulling data from

# This is to grab data from statcast at the pitch level

In [7]:

# Get today's date as the latest day to pull data ( pybaseball will see tomorrow's date and only pull up to the most recent data)
today = datetime.date.today() + datetime.timedelta(days=1) 
#turn the output of the date to a string to be able to pass through the statcast function
today_str = today.strftime('%Y-%m-%d')  
#This start_dt will be the starting date of spring training for the 2023 season 
start_dt_od = '2023-03-30'


In [8]:

#Get all data leading from Opening Day to the current date from pybaseball's statcast function 
data = pyb.statcast(start_dt=start_dt_od, end_dt=today_str, team=None, verbose=True, parallel=True)


This is a large query, it may take a moment to complete


100%|█████████████████████████████████████████| 137/137 [00:42<00:00,  3.21it/s]


# Function to add barrel to a batted ball event

In [9]:
def code_barrel(df):
    # Fill missing values in 'launch_angle' and 'exit_velo' columns with 0
    df['launch_angle'] = df['launch_angle'].fillna(0)
    df['launch_speed'] = df['launch_speed'].fillna(0)

    # Create the 'barrel' column based on the specified conditions
    df['barrel'] = (
        (df['launch_angle'] <= 50) &
        (df['launch_speed'] >= 98) &
        (df['launch_speed'] * 1.5 - df['launch_angle'] >= 117) &
        (df['launch_speed'] + df['launch_angle'] >= 124)
    )

    # Convert boolean values to integers (0 for False, 1 for True)
    df['barrel'] = df['barrel'].astype(int)

    return df


# Function to map release height and side buckets

In [10]:
def map_height_bucket(value):
    if pd.notna(value):
        if value < 1:
            return '<1'
        elif 1 <= value <= 1.5:
            return '1.0-1.5'
        elif 1.5 < value <= 2:
            return '1.5-2.0'
        elif 2 < value <= 2.5:
            return '2.0-2.5'
        elif 2.5 < value <= 3:
            return '2.5-3.0'
        elif 3 < value <= 3.5:
            return '3.0-3.5'
        elif 3.5 < value <= 4:
            return '3.5-4.0'
        elif 4 < value <= 4.5:
            return '4.0-4.5'
        elif 4.5 < value <= 5:
            return '4.5-5.0'
        elif 5 < value <= 5.5:
            return '5.0-5.5'
        elif 5.5 < value <= 6:
            return '5.5-6.0'
        elif 6 < value <= 6.5:
            return '6.0-6.5'
        elif 6.5 < value <= 7:
            return '6.5-7.0'
        elif 7 <= value <= 7.5:
            return '7.0-7.5'
        elif 7.5 <= value <= 8.0:
            return '7.5-8.0'
        elif 8.0 <= value <= 8.5:
            return '8.0-8.5'
        else:
            return '8.5+'
    else:
        return 'NA'  # or any other label you want to assign to missing values


def map_side_bucket(value):
    if pd.notna(value):
        if value < -5:
            return '<-5.0'
        elif -5 <= value <= -4.5:
            return '-5.0 - -4.5'
        elif -4.5 < value <= -4:
            return '-4.5 - -4.0'
        elif -4 < value <= -3.5:
            return '-3.5 - -3.0'
        elif -3.5 < value <= -3:
            return '-3.0 - -2.5'
        elif -2.5 < value <= -2:
            return '-2.5 - -2.0'
        elif -2 < value <= -1.5:
            return '-2.0 - -1.5'
        elif -1.5 < value <= -1:
            return '-1.5 - -1.0'
        elif -1 < value <= -0.5:
            return '-1.0 - -0.5'
        elif -0.5 < value <= 0:
            return '-0.5 - 0.0'
        elif 0 < value <= 0.5:
            return '0.0 - 0.5'
        elif 0.5 < value <= 1:
            return '0.5 - 1.0'
        elif 1 < value <= 1.5:
            return '1.0 - 1.5'
        elif 1.5 <= value <= 2.0:
            return '1.5 - 2.0'
        elif 2.0 <= value <= 2.5:
            return '2.0 - 2.5'
        elif 2.5 <= value <= 3.0:
            return '2.5 - 3.0'
        elif 3.0 <= value <= 3.5:
            return '3.0 - 3.5'
        elif 3.5 <= value <= 4.0:
            return '3.5-4.0'
        elif 4.0 <= value <= 4.5:
            return '4.0-4.5'
        elif 4.5 <= value <= 5.0:
            return '4.5-5.0'
        else:
            return '5.0+'
    else:
        return 'NA'  # or any other label you want to assign to missing values


# This is a function to add new columns and other created functions before this section to the pitch by pich data

In [11]:
def add_to_savant(df):
    # Modify 'pfx_x' and 'pfx_z' columns to get them in inches 
    df['pfx_x'] = df['pfx_x'].apply(lambda x: x * -12)
    df['pfx_z'] = df['pfx_z'].apply(lambda x: x * 12)
    
    # Create 'hard_hit' column
    df['hard_hit'] = 0  # Initialize all values to 0
    mask = df['launch_speed'].notna()  # Get a boolean mask of non-missing values in 'launch_speed'
    df.loc[mask & (df['launch_speed'] > 95), 'hard_hit'] = 1  # Set values to 1 where condition is met
    
    # Create 'sweet_spot' column
    df['sweet_spot'] = 0  # Initialize all values to 0
    mask = df['launch_angle'].notna()  # Get a boolean mask of non-missing values in 'launch_angle'
    df.loc[mask & ((df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)), 'sweet_spot'] = 1  # Set values to 1 where condition is met

    # Add 'VAA' column
    vy_f = -1 * np.sqrt(df['vx0']**2 + df['vy0']**2) 
    t = (vy_f - df['vy0']) / df['ay']
    vz_f = df['vz0'] + df['az'] * t
    df['VAA'] = np.round(-1 * np.arctan(vz_f / vy_f) * (180 / np.pi), 2)
    
    # Add 'Count' column
    df['count'] = df['balls'].astype(str) + '-' + df['strikes'].astype(str)

    # Add 'count_type' column  
    df['count_type'] = ''
    df.loc[df['count'].isin(['1-0', '2-0', '3-0', '2-1', '3-1', '3-2']), 'count_type'] = 'hitter'
    df.loc[df['count'].isin(['0-1', '0-2', '1-2']), 'count_type'] = 'pitcher'
    df.loc[df['count'].isin(['0-0', '1-1', '2-2']), 'count_type'] = 'even'  

    # Add a column, 'swing' to determine if the pitch was swung a or not (1=swing) 
    #this classification includes all bunt attempts as swings so you will need to filter those out in later analysis
    #define the strings to look for
    target_strings = ['hit_into_play', 'foul', 'swinging_strike', 'swinging_strike_blocked', 'foul_bunt', 'foul_tip', 'foul_pitchout', 'missed_bunt', 'bunt_foul_tip', 'swinging_pitchout']
    # create a new column with 1s and 0s based on whether the target strings are present in column1
    df['swing'] = np.where(df['description'].isin(target_strings), 1, 0)  

    # Create a new column 'swing_type'
    swing_types = df['description'].copy()
    
    # Determine if a pitch resulted in: contact, foul, whiff, take_ball, take_strike, or undef (undefined)
    swing_types.loc[swing_types.isin(['hit_into_play'])] = 'contact'
    swing_types.loc[swing_types.isin(['foul', 'foul_bunt', 'foul_tip', 'bunt_foul_tip', 'foul_pitchout'])] = 'foul'
    swing_types.loc[swing_types.isin(['swinging_strike', 'swinging_strike_blocked', 'missed_bunt', 'swinging_pitchout'])] = 'whiff'
    swing_types.loc[swing_types.isin(['ball', 'blocked_ball', 'hit_by_pitch', 'pitchout'])] = 'take_ball'
    swing_types.loc[swing_types.isin(['called_strike'])] = 'take_strike'
    swing_types.loc[~swing_types.isin(['contact', 'foul', 'whiff', 'take_ball', 'take_strike'])] = 'undef'

    # Add the new column to the dataframe
    df['swing_type'] = swing_types  

    # Create a column called attack_zone (THANKS TO NICK WAN FOR THIS, his KAGGLE competition notebook helped out here)
    df['attack_zone'] = 'waste'
    df.loc[(df['plate_x'].between(-0.558, 0.558)) & (df['plate_z'].between(1.833,3.166)), 'attack_zone'] = 'heart'
    df.loc[(df['plate_x'].between(-1.108, 1.108)) & (df['plate_z'].between(1.166,3.833)) & (~df['attack_zone'].isin(['heart'])), 'attack_zone'] = 'shadow'
    df.loc[(df['plate_x'].between(-1.666, 1.666)) & (df['plate_z'].between(0.5,4.5)) & (~df['attack_zone'].isin(['heart', 'shadow'])), 'attack_zone'] = 'chase'
    
    # Calculate the average delta_run_exp for each player and pitch type
    df['RV'] = df.groupby(['player_name', 'pitch_type'])['delta_run_exp'].transform('mean')
    df['RV/100'] = df['RV'] * 100

    # Add 'Rel_height_bucket' and 'Rel_side_bucket' column
    df['Rel_height_bucket'] = df['release_pos_z'].apply(map_height_bucket)
    df['Rel_side_bucket'] = df['release_pos_x'].apply(map_side_bucket)
    
    #Add 'barrel' designation to a batted ball
    df = code_barrel(df)
    
    
    
    
    return df

# These are the column names I will be changing to make querying quicker for me

In [12]:
# Define a dictionary with the column name changes
column_names_dict = {
    'release_speed': 'velo',
    'release_pos_x': 'release_side',
    'release_pos_z': 'release_height',
    'game_date': 'date',
    'pfx_z': 'ind_vert_break',
    'pfx_x': 'horizontal_break',
    'launch_speed': 'exit_velo',
    
    # Add more column names as needed
}

# Function to apply column name changes to the data

In [13]:
def rename_statcast_columns(data, column_names_dict):
    """
    Rename columns from the Statcast data that was pulled from Savant

    Parameters:
        data (pd.DataFrame): Statcast data from Savant
        column_names_dict (dict): a dictionary containing the current column names as keys
                                  and the new column names as values

    Returns:
        pd.DataFrame: the modified DataFrame with the renamed columns
    """
    data = data.rename(columns=column_names_dict)
    return data

In [14]:
# Run the downloaded data from pybaseball for the day through the 'add_to_savant' function
modified_data = add_to_savant(data)

In [15]:
# Run the new dataframe (modified_data) that contains the new columns from the 'add_to_savant' function through 
# the function that renames the columns
data_cleaned = rename_statcast_columns(modified_data, column_names_dict)  
data_cleaned.head()

Unnamed: 0,pitch_type,date,velo,release_side,release_height,player_name,batter,pitcher,events,description,spin_dir,spin_rate_deprecated,break_angle_deprecated,break_length_deprecated,zone,des,game_type,stand,p_throws,home_team,away_team,type,hit_location,bb_type,balls,strikes,game_year,horizontal_break,ind_vert_break,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,tfs_deprecated,tfs_zulu_deprecated,fielder_2,umpire,sv_id,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,exit_velo,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,pitcher.1,fielder_2.1,fielder_3,fielder_4,fielder_5,fielder_6,fielder_7,fielder_8,fielder_9,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,pitch_name,home_score,away_score,bat_score,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,hard_hit,sweet_spot,VAA,count,count_type,swing,swing_type,attack_zone,RV,RV/100,Rel_height_bucket,Rel_side_bucket,barrel
3117,SI,2023-08-11,96.6,-0.62,6.52,"Holmes, Clay",669394,605280,field_out,hit_into_play,,,,,5,Jake Burger lines out sharply to center fielde...,R,R,R,MIA,NYY,X,8.0,line_drive,3,2,2023,18.36,4.08,-0.04,2.24,,,,2,9,Bot,112.02,47.53,,,543309,,,4.981269,-140.501579,-6.521272,-21.308878,34.584603,-26.357171,3.21,1.51,375.0,107.3,17,95.7,2186,6.0,717047,605280,543309,641343,650402,643396,683011,656185,664056,665828,54.54,0.569,0.68,0.0,1.0,0.0,0.0,6.0,79,6,Sinker,4,9,4,9,9,4,4,9,Standard,Standard,217,0.0,-0.131,1,1,-2.63,3-2,hitter,1,contact,heart,-0.006973,-0.697314,6.5-7.0,-1.0 - -0.5,1
3141,SI,2023-08-11,96.4,-0.63,6.56,"Holmes, Clay",669394,605280,,called_strike,,,,,6,Jake Burger lines out sharply to center fielde...,R,R,R,MIA,NYY,S,,,3,1,2023,17.16,6.0,0.39,2.46,,,,2,9,Bot,,,,,543309,,,5.913953,-140.135953,-6.392567,-20.167295,33.093851,-24.240935,3.29,1.61,,0.0,0,95.7,2147,6.0,717047,605280,543309,641343,650402,643396,683011,656185,664056,665828,54.46,,,,,,,,79,5,Sinker,4,9,4,9,9,4,4,9,Standard,Standard,216,0.0,-0.04,0,0,-2.57,3-1,hitter,0,take_strike,heart,-0.006973,-0.697314,6.5-7.0,-1.0 - -0.5,0
3281,SI,2023-08-11,94.7,-0.6,6.54,"Holmes, Clay",669394,605280,,ball,,,,,14,Jake Burger lines out sharply to center fielde...,R,R,R,MIA,NYY,B,,,2,1,2023,13.08,2.4,1.42,1.68,,,,2,9,Bot,,,,,543309,,,7.656504,-137.579299,-7.409505,-15.514876,30.796001,-28.130044,3.27,1.53,,0.0,0,94.1,2098,6.1,717047,605280,543309,641343,650402,643396,683011,656185,664056,665828,54.45,,,,,,,,79,4,Sinker,4,9,4,9,9,4,4,9,Standard,Standard,230,0.0,0.049,0,0,-3.0,2-1,hitter,0,take_ball,chase,-0.006973,-0.697314,6.5-7.0,-1.0 - -0.5,0
3396,SL,2023-08-11,87.2,-0.64,6.62,"Holmes, Clay",669394,605280,,ball,,,,,1,Jake Burger lines out sharply to center fielde...,R,R,R,MIA,NYY,B,,,1,1,2023,-0.48,-5.28,-0.79,2.94,,,,2,9,Bot,,,,,543309,,,-0.432708,-126.971535,-1.736329,0.554466,28.267388,-36.705424,3.35,1.63,,0.0,0,86.2,2436,5.8,717047,605280,543309,641343,650402,643396,683011,656185,664056,665828,54.68,,,,,,,,79,3,Slider,4,9,4,9,9,4,4,9,Standard,Standard,57,0.0,0.022,0,0,-0.78,1-1,even,0,take_ball,shadow,-0.046362,-4.636158,6.5-7.0,-1.0 - -0.5,0
3563,SL,2023-08-11,87.3,-0.55,6.55,"Holmes, Clay",669394,605280,,ball,,,,,14,Jake Burger lines out sharply to center fielde...,R,R,R,MIA,NYY,B,,,0,1,2023,-1.68,-4.2,0.61,0.23,,,,2,9,Bot,,,,,543309,,,2.505285,-127.007983,-8.306748,0.960499,25.945324,-34.449815,3.33,1.61,,0.0,0,86.9,2458,6.0,717047,605280,543309,641343,650402,643396,683011,656185,664056,665828,54.45,,,,,,,,79,2,Slider,4,9,4,9,9,4,4,9,Standard,Standard,46,0.0,0.014,0,0,-3.73,0-1,pitcher,0,take_ball,waste,-0.046362,-4.636158,6.5-7.0,-1.0 - -0.5,0


## Function to return CSW% (by pitch) for a player

In [30]:
def calculate_csw(data, player_name):
    csw_stats = data[data['player_name'] == player_name].groupby(['player_name', 'pitch_name']).apply(
        lambda x: pd.Series({
            'Whiffs': (x['swing_type'] == 'whiff').sum(),
            'Take Strikes': (x['swing_type'] == 'take_strike').sum(),
            'Pitches Thrown': len(x),
            'CSW%': round(100 * ((x['swing_type'] == 'whiff').sum() + (x['swing_type'] == 'take_strike').sum()) / len(x), 1)
        })
    ).reset_index()
    
    # Sort the DataFrame by 'ratio' column in descending order
    csw_stats = csw_stats.sort_values(by='CSW%', ascending=False)
    
     # Transform the player name format
    csw_stats['player_name'] = csw_stats['player_name'].apply(lambda name: name.split(', ')[1] + ' ' + name.split(', ')[0])
    
    return csw_stats


## Calculate RV/100 for leaderboard

In [33]:
'''
# Calculate the average delta_run_exp for each player and pitch type if not done through the function
data_cleaned['RV'] = data_cleaned.groupby(['player_name', 'pitch_type'])['delta_run_exp'].transform('mean')
data_cleaned['RV/100'] = data_cleaned['RV'] * 100
'''

'''
# Calculate counts and ratio for each player and pitch type
rv100 = data_cleaned.groupby(['player_name', 'pitch_name']).apply(
    lambda x: pd.Series({
        'count': len(x),
        'Velocity': (x['velo']).mean(),
        'IVB': (x['ind_vert_break']).mean(),
        'HB': (x['horizontal_break']).mean(),
        'Extension': (x['release_extension']).mean(),
        'RV/100': 100* x['RV'].mean() 
    })
).reset_index()
rv100 = rv100.loc[:,['player_name','pitch_name','count','Velocity','IVB','HB','Extension','RV/100']]
rv100 = rv100.rename(columns = {'total_pitches_count':'Pitches Thrown',
                         'ratio':'CSW%', 
                        'player_name':'Pitcher', 
                        'pitch_name':'Pitch'})
'''

## Leaderboard RV/100 (pitch level)

In [34]:
'''
from IPython.display import display

#Filter the minimum amount pf pitches thrown
_df = rv100[rv100['count'] > 100]
leaderboardrv100 = _df.sort_values('RV/100', ascending=True).head(20)

# Apply background gradient to the leaderboard DataFrame
leaderboardrv100_styles = leaderboardrv100.style.background_gradient(subset=['RV/100'],cmap='Reds_r')

# Format 'CSW%' column with specific decimal format
leaderboardrv100_styles = leaderboardrv100_styles.format({'count': '{:.0f}',
                                                'RV/100': '{:.1f}',
                                                'Velocity': '{:.1f}',
                                                'IVB': '{:.1f}',
                                                'HB': '{:.1f}',
                                                'Extension': '{:.1f}'})

# Display the styled leaderboard in Jupyter Notebook
display(leaderboardrv100_styles) 
'''


Unnamed: 0,Pitcher,Pitch,count,Velocity,IVB,HB,Extension,RV/100
1506,"Koch, Matt",4-Seam Fastball,113,94.3,13.5,9.1,6.3,-5.9
2115,"Ottavino, Adam",Changeup,101,87.6,1.5,13.2,7.0,-5.6
2193,"Peterson, David",Curveball,102,78.8,-8.8,5.2,6.8,-5.2
1955,"Moore, Matt",Changeup,184,84.2,8.0,-17.2,6.4,-5.2
500,"Cimber, Adam",Sinker,110,85.1,-8.8,12.3,6.6,-5.1
1255,"Holmes, Clay",Slider,177,87.5,-3.9,-2.3,5.9,-4.6
1881,"Mikolas, Miles",Changeup,122,84.8,8.4,14.6,6.6,-4.6
1327,"Jackson, Jay",Slider,185,84.6,3.0,-5.6,6.3,-4.6
2607,"Smith, Will",Slider,302,81.3,0.8,7.2,6.2,-4.5
2409,"Romero, JoJo",Changeup,108,86.0,4.7,-15.1,5.6,-4.4


## Calculate CSW% to create leaderboard

In [27]:
'''
# Calculate counts and ratio for each player and pitch type
CSW = data_cleaned.groupby(['player_name', 'pitch_name']).apply(
    lambda x: pd.Series({
        'whiff_count': (x['swing_type'] == 'whiff').sum(),
        'take_strike_count': (x['swing_type'] == 'take_strike').sum(),
        'total_pitches_count': len(x),
        'ratio': round(100 * ((x['swing_type'] == 'whiff').sum() + (x['swing_type'] == 'take_strike').sum()) / len(x), 1)
    })
).reset_index()
CSW = CSW.loc[:,['player_name','pitch_name','total_pitches_count','ratio']]
CSW = CSW.rename(columns = {'total_pitches_count':'Pitches Thrown',
                         'ratio':'CSW%', 
                        'player_name':'Pitcher', 
                        'pitch_name':'Pitch'}) 
'''

## Leaderboard CSW% (pitch level)

In [None]:
'''
from IPython.display import display

selected_rows = CSW[CSW['Pitches Thrown'] > 150]
leaderboard = selected_rows.sort_values('CSW%', ascending=False).head(20)

# Apply background gradient to the leaderboard DataFrame
leaderboard_styled = leaderboard.style.background_gradient(cmap='Reds')

# Format 'CSW%' column with specific decimal format
leaderboard_styled = leaderboard_styled.format({'Pitches Thrown': '{:.0f}',
                                                'CSW%': '{:.1f}'})

# Display the styled leaderboard in Jupyter Notebook
display(leaderboard_styled) 
'''