In [1]:
import pandas as pd 
import numpy as np 
import datetime
import sqlite3 
import pybaseball as pyb #Pybaseball is where we will be pulling data from

In [2]:
# Get today's date 
today = datetime.date.today()
#turn the output of the date to a string to be able to pass through the statcast function
today_str = today.strftime('%Y-%m-%d')  
#This start_dt will be the starting date of spring training for the 2023 season 
start_dt_od = '2023-03-30'


In [3]:
#Get all data leading from Opening Day to the current date from pybaseball's statcast function 
data = pyb.statcast(start_dt=start_dt_od, end_dt=today_str, team=None, verbose=True, parallel=True)

This is a large query, it may take a moment to complete


100%|██████████| 3/3 [00:12<00:00,  4.27s/it]


#Using an arbitray set of dates to test to make sure the functions work and data can be written to the databse - this line is only included to test the code out

In [4]:
#data = pyb.statcast(start_dt = '2022-05-17', end_dt = '2022-05-18', team = None, verbose = True, parallel = True) 

In [5]:
print(data.tail(1))

     pitch_type  game_date  release_speed  release_pos_x  release_pos_z  \
4393         FF 2023-03-30           95.1          -3.31           5.27   

         player_name  batter  pitcher events description  ...  fld_score  \
4393  Castillo, Luis  680757   622491    NaN        ball  ...          0   

      post_away_score  post_home_score  post_bat_score  post_fld_score  \
4393                0                0               0               0   

     if_fielding_alignment of_fielding_alignment spin_axis delta_home_win_exp  \
4393              Standard              Standard       237                0.0   

     delta_run_exp  
4393         0.069  

[1 rows x 92 columns]


#This is a function to add some aditional things I want in the pitch by pich data

In [28]:
def add_to_savant(df):
    # Modify 'pfx_x' and 'pfx_z' columns to get them in inches 
    df['pfx_x'] = df['pfx_x'].apply(lambda x: x * -12)
    df['pfx_z'] = df['pfx_z'].apply(lambda x: x * 12)
    
    # Create 'hard_hit' column
    df['launch_speed'] = df['launch_speed'].fillna(False)
    df['hard_hit'] = df['launch_speed'].apply(lambda x: 1 if x > 95 else 0)

    # Create 'sweet_spot' column
    df['launch_angle'] = df['launch_angle'].fillna(False)
    df['sweet_spot'] = df['launch_angle'].apply(lambda x: 1 if (x >= 8) and (x <= 32) else 0)

    # Add 'VAA' column
    vy_f = -1 * np.sqrt(df['vx0']**2 + df['vy0']**2) 
    t = (vy_f - df['vy0']) / df['ay']
    vz_f = df['vz0'] + df['az'] * t
    df['VAA'] = np.round(-1 * np.arctan(vz_f / vy_f) * (180 / np.pi), 2)
    
    # Add 'Count' column
    df['count'] = df['balls'].astype(str) + '-' + df['strikes'].astype(str)

    # Add 'count_type' column  
    df['count_type'] = ''
    df.loc[df['count'].isin(['1-0', '2-0', '3-0', '2-1', '3-1', '3-2']), 'count_type'] = 'hitter'
    df.loc[df['count'].isin(['0-1', '0-2', '1-2']), 'count_type'] = 'pitcher'
    df.loc[df['count'].isin(['0-0', '1-1', '2-2']), 'count_type'] = 'even'  

    # Add a column, 'swing' to determine if the pitch was swung a or not (1=swing) 
    #this classification includes all bunt attempts as swings so you will need to filter those out in later analysis
    #define the strings to look for
    target_strings = ['hit_into_play', 'foul', 'swinging_strike', 'swinging_strike_blocked', 'foul_bunt', 'foul_tip', 'foul_pitchout', 'missed_bunt', 'bunt_foul_tip', 'swinging_pitchout']
    # create a new column with 1s and 0s based on whether the target strings are present in column1
    df['swing'] = np.where(df['description'].isin(target_strings), 1, 0)  

    # Add column 'swing_type' to determine if a pitch resulted in: contact, foul, whiff, take_ball, take_strike, or undef (undefined)
    for i in range(len(df)):
        if df.loc[i, 'description'] in ['hit_into_play']:
            df.loc[i, 'swing_type'] = 'contact'
        elif df.loc[i, 'description'] in ['foul', 'foul_bunt', 'foul_tip', 'bunt_foul_tip', 'foul_pitchout']:
            df.loc[i, 'swing_type'] = 'foul'
        elif df.loc[i, 'description'] in ['swinging_strike', 'swinging_strike_blocked', 'missed_bunt', 'swinging_pitchout']:
            df.loc[i, 'swing_type'] = 'whiff'
        elif df.loc[i, 'description'] in ['ball', 'blocked_ball', 'hit_by_pitch', 'pitchout']:
            df.loc[i, 'swing_type'] = 'take_ball'
        elif df.loc[i, 'description'] in ['called_strike']:
            df.loc[i, 'swing_type'] = 'take_strike'
        else:
            df.loc[i, 'swing_type'] = 'undef' 

    return df

 
    

In [6]:
def add_to_savant(df):
    # Modify 'pfx_x' and 'pfx_z' columns to get them in inches 
    df['pfx_x'] = df['pfx_x'].apply(lambda x: x * -12)
    df['pfx_z'] = df['pfx_z'].apply(lambda x: x * 12)
    
    # Create 'hard_hit' column
    df['hard_hit'] = 0  # Initialize all values to 0
    mask = df['launch_speed'].notna()  # Get a boolean mask of non-missing values in 'launch_speed'
    df.loc[mask & (df['launch_speed'] > 95), 'hard_hit'] = 1  # Set values to 1 where condition is met
    
    # Create 'sweet_spot' column
    df['sweet_spot'] = 0  # Initialize all values to 0
    mask = df['launch_angle'].notna()  # Get a boolean mask of non-missing values in 'launch_angle'
    df.loc[mask & ((df['launch_angle'] >= 8) & (df['launch_angle'] <= 32)), 'sweet_spot'] = 1  # Set values to 1 where condition is met

    # Add 'VAA' column
    vy_f = -1 * np.sqrt(df['vx0']**2 + df['vy0']**2) 
    t = (vy_f - df['vy0']) / df['ay']
    vz_f = df['vz0'] + df['az'] * t
    df['VAA'] = np.round(-1 * np.arctan(vz_f / vy_f) * (180 / np.pi), 2)
    
    # Add 'Count' column
    df['count'] = df['balls'].astype(str) + '-' + df['strikes'].astype(str)

    # Add 'count_type' column  
    df['count_type'] = ''
    df.loc[df['count'].isin(['1-0', '2-0', '3-0', '2-1', '3-1', '3-2']), 'count_type'] = 'hitter'
    df.loc[df['count'].isin(['0-1', '0-2', '1-2']), 'count_type'] = 'pitcher'
    df.loc[df['count'].isin(['0-0', '1-1', '2-2']), 'count_type'] = 'even'  

    # Add a column, 'swing' to determine if the pitch was swung a or not (1=swing) 
    #this classification includes all bunt attempts as swings so you will need to filter those out in later analysis
    #define the strings to look for
    target_strings = ['hit_into_play', 'foul', 'swinging_strike', 'swinging_strike_blocked', 'foul_bunt', 'foul_tip', 'foul_pitchout', 'missed_bunt', 'bunt_foul_tip', 'swinging_pitchout']
    # create a new column with 1s and 0s based on whether the target strings are present in column1
    df['swing'] = np.where(df['description'].isin(target_strings), 1, 0)  

    # Create a new column 'swing_type'
    swing_types = df['description'].copy()
    
    # Determine if a pitch resulted in: contact, foul, whiff, take_ball, take_strike, or undef (undefined)
    swing_types.loc[swing_types.isin(['hit_into_play'])] = 'contact'
    swing_types.loc[swing_types.isin(['foul', 'foul_bunt', 'foul_tip', 'bunt_foul_tip', 'foul_pitchout'])] = 'foul'
    swing_types.loc[swing_types.isin(['swinging_strike', 'swinging_strike_blocked', 'missed_bunt', 'swinging_pitchout'])] = 'whiff'
    swing_types.loc[swing_types.isin(['ball', 'blocked_ball', 'hit_by_pitch', 'pitchout'])] = 'take_ball'
    swing_types.loc[swing_types.isin(['called_strike'])] = 'take_strike'
    swing_types.loc[~swing_types.isin(['contact', 'foul', 'whiff', 'take_ball', 'take_strike'])] = 'undef'

    # Add the new column to the dataframe
    df['swing_type'] = swing_types 

    return df

#These are the column names I will be changing to make querying quicker for me

In [7]:
# Define a dictionary with the column name changes
column_names_dict = {
    'release_speed': 'velo',
    'release_pos_x': 'release_side',
    'release_pos_z': 'release_height',
    'game_date': 'date',
    'pf_x_z': 'ind_vert_break',
    'pf_x_x': 'horizontal_break',
    'launch_speed': 'exit_velo',
    
    # Add more column names as needed
}

#Function to apply column name changes to the data

In [8]:
def rename_statcast_columns(data, column_names_dict):
    """
    Rename columns from the Statcast data that was pulled from Savant

    Parameters:
        data (pd.DataFrame): Statcast data from Savant
        column_names_dict (dict): a dictionary containing the current column names as keys
                                  and the new column names as values

    Returns:
        pd.DataFrame: the modified DataFrame with the renamed columns
    """
    data = data.rename(columns=column_names_dict)
    return data

In [9]:
# Run the downloaded data from pybaseball for the day through the 'add_to_savant' function
modified_data = add_to_savant(data)

In [10]:
# Run the new dataframe (modified_data) that contains the new columns from the 'add_to_savant' function through 
# the function that renames the columns
data_cleaned = rename_statcast_columns(modified_data, column_names_dict) 
print(data_cleaned.head())

     pitch_type       date  velo  release_side  release_height player_name  \
1047         SL 2023-03-31  84.5          3.41            5.29   Puk, A.J.   
1068         SL 2023-03-31  84.1           3.4            5.38   Puk, A.J.   
1139         SL 2023-03-31  83.7          3.39            5.32   Puk, A.J.   
1158         FF 2023-03-31  96.3           3.3            5.46   Puk, A.J.   
1185         SL 2023-03-31  84.0           3.4            5.38   Puk, A.J.   

      batter  pitcher     events      description  ...  spin_axis  \
1047  643446   640462  field_out    hit_into_play  ...        286   
1068  643446   640462        NaN             foul  ...        280   
1139  643446   640462        NaN             foul  ...        241   
1158  592192   640462  strikeout  swinging_strike  ...        129   
1185  592192   640462        NaN  swinging_strike  ...        279   

      delta_home_win_exp  delta_run_exp  hard_hit  sweet_spot   VAA count  \
1047               0.035         -0.013

#We want to clear out any data in the database table before we load anything new. This is to ensure that if Statcast makes any changes we will always have the most up to date information - thanks to Jeremy Maschino for this suggestion

In [11]:
# Connect to the database
conn = sqlite3.connect('2023Statcast.db')

# Create a cursor object to execute SQL statements
cur = conn.cursor()

# Execute the DELETE statement to remove all data from the table
cur.execute('DELETE FROM statcast_data_2023')

# Commit the changes to the database
conn.commit()

# Close the cursor and database connections
cur.close()
conn.close()

In [14]:
# Connect to the database
conn = sqlite3.connect('2023Statcast.db')

# Create a cursor object to execute SQL statements
cur = conn.cursor()

# Execute the DROP TABLE statement to remove the entire table
cur.execute('DROP TABLE statcast_data_2023')

# Commit the changes to the database
conn.commit()

# Close the cursor and database connections
cur.close()
conn.close()

OperationalError: no such table: statcast_data_2023

#We will now put all of our updated data into the table 'statcast_data_2023' into the '2023Statcast' databas

In [15]:
#Connect to the SQLLite database 2023Statcast
conn = sqlite3.connect('2023Statcast.db')

#define the table name
table_name = 'statcast_data_2023'
#if the table name exists then append the data on it
data_cleaned.to_sql(table_name, conn, if_exists='append', index=False)

# Close the connection
conn.close()


find events description

In [17]:
x = pd.read_csv('/Users/emiliomartinez/Desktop/code/Data/Savant/2022savant.csv') 
y = pd.read_csv('/Users/emiliomartinez/Desktop/code/Data/Savant/2021savant.csv')