In [2]:
import pandas as pd 
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
import datetime
import sqlite3 
import pybaseball as pyb #Pybaseball is where we will be pulling data from
#!pip install nbconvert
#!pip install pyppeteer

In [3]:
# First day of the pull window: Opening Day of the 2023 season (hence the
# "_od" suffix), already written in the YYYY-MM-DD form handed to statcast().
start_dt_od = '2023-03-30'

# Last day of the pull window. Despite its name, `today` holds TOMORROW's
# date, so the requested range always reaches past the most recent data.
one_day = datetime.timedelta(days=1)
today = datetime.date.today() + one_day

# The end date is handed to statcast() as a YYYY-MM-DD string as well.
today_str = format(today, '%Y-%m-%d')


In [26]:
# Pull every pitch from Opening Day (start_dt_od) through the end date
# (today_str) with pybaseball's statcast function.
data = pyb.statcast(
    start_dt=start_dt_od,
    end_dt=today_str,
    team=None,
    verbose=True,
    parallel=True,
)

This is a large query, it may take a moment to complete


100%|█████████████████████████████████████████| 133/133 [00:30<00:00,  4.42it/s]


# This function adds a barrel flag to each batted-ball event

In [27]:
def code_barrel(df):
    """Add a 0/1 'barrel' column flagging batted balls that meet the barrel cutoffs.

    A row is flagged (1) only when all four conditions hold:
        * launch angle <= 50
        * exit velocity >= 98
        * exit velocity * 1.5 - launch angle >= 117
        * exit velocity + launch angle >= 124
    Rows with a missing launch angle or exit velocity are never flagged.

    Parameters:
        df (pd.DataFrame): must contain 'launch_angle' plus an exit-velocity
            column named either 'exit_velo' (the renamed column) or
            'launch_speed' (the name this notebook renames to 'exit_velo').
            When both exist, 'exit_velo' is used.

    Returns:
        pd.DataFrame: the same DataFrame object with the 'barrel' column added.
            The 'launch_angle' and exit-velocity columns are left unchanged.

    Raises:
        KeyError: if neither 'exit_velo' nor 'launch_speed' is present.
    """
    # Accept either column name so the function works both before and after
    # the columns are renamed (it used to fail with a KeyError beforehand).
    for velo_col in ('exit_velo', 'launch_speed'):
        if velo_col in df.columns:
            break
    else:
        raise KeyError("code_barrel needs an 'exit_velo' or 'launch_speed' column")

    # Missing readings are treated as 0 so every comparison below yields a plain
    # True/False. The fill is done on local Series only: writing the zeros back
    # into the frame would make "no batted ball" look like a real 0 reading.
    angle = df['launch_angle'].fillna(0)
    velo = df[velo_col].fillna(0)

    is_barrel = (
        (angle <= 50)
        & (velo >= 98)
        & (velo * 1.5 - angle >= 117)
        & (velo + angle >= 124)
    )

    # Store the flag as integers (0 for False, 1 for True)
    df['barrel'] = is_barrel.astype(int)
    return df


# These functions map release height and release side into labelled buckets

In [28]:
def map_height_bucket(value):
    """Translate a numeric release height into its 0.5-wide bucket label.

    Buckets are closed on the right, so a value sitting exactly on an edge
    belongs to the lower bucket (1.5 -> '1.0-1.5'); the first bucket also
    includes its left edge (1 -> '1.0-1.5'). Anything below 1 gives '<1',
    anything above 8.5 gives '8.5+', and a missing value gives 'NA'.
    """
    if not pd.notna(value):
        return 'NA'  # label assigned to missing values
    if value < 1:
        return '<1'

    # (inclusive upper edge, label) in ascending order; the first edge that is
    # not below the value decides the bucket.
    upper_edges = (
        (1.5, '1.0-1.5'),
        (2, '1.5-2.0'),
        (2.5, '2.0-2.5'),
        (3, '2.5-3.0'),
        (3.5, '3.0-3.5'),
        (4, '3.5-4.0'),
        (4.5, '4.0-4.5'),
        (5, '4.5-5.0'),
        (5.5, '5.0-5.5'),
        (6, '5.5-6.0'),
        (6.5, '6.0-6.5'),
        (7, '6.5-7.0'),
        (7.5, '7.0-7.5'),
        (8.0, '7.5-8.0'),
        (8.5, '8.0-8.5'),
    )
    for edge, label in upper_edges:
        if value <= edge:
            return label
    return '8.5+'


def map_side_bucket(value):
    """Translate a numeric release side into its 0.5-wide bucket label.

    Buckets are closed on the right, so a value sitting exactly on an edge
    belongs to the lower bucket (-4.5 -> '-5.0 - -4.5'); the first bucket also
    includes its left edge (-5 -> '-5.0 - -4.5'). Anything below -5 gives
    '<-5.0', anything above 5 gives '5.0+', and a missing value gives 'NA'.

    Every label names the bucket it covers. Earlier, (-4, -3.5] and (-3.5, -3]
    carried the label of the next bucket up, and (-3, -2.5] had no branch at
    all and ended up as '5.0+'.

    Parameters:
        value: a number, or a missing value (None / NaN / pd.NA).

    Returns:
        str: the bucket label.
    """
    if not pd.notna(value):
        return 'NA'  # label assigned to missing values
    if value < -5:
        return '<-5.0'

    # (inclusive upper edge, label) in ascending order; the first edge that is
    # not below the value decides the bucket. The last three labels have no
    # spaces around the dash; they are kept exactly as they were so existing
    # queries against those labels keep matching.
    upper_edges = (
        (-4.5, '-5.0 - -4.5'),
        (-4.0, '-4.5 - -4.0'),
        (-3.5, '-4.0 - -3.5'),
        (-3.0, '-3.5 - -3.0'),
        (-2.5, '-3.0 - -2.5'),
        (-2.0, '-2.5 - -2.0'),
        (-1.5, '-2.0 - -1.5'),
        (-1.0, '-1.5 - -1.0'),
        (-0.5, '-1.0 - -0.5'),
        (0.0, '-0.5 - 0.0'),
        (0.5, '0.0 - 0.5'),
        (1.0, '0.5 - 1.0'),
        (1.5, '1.0 - 1.5'),
        (2.0, '1.5 - 2.0'),
        (2.5, '2.0 - 2.5'),
        (3.0, '2.5 - 3.0'),
        (3.5, '3.0 - 3.5'),
        (4.0, '3.5-4.0'),
        (4.5, '4.0-4.5'),
        (5.0, '4.5-5.0'),
    )
    for edge, label in upper_edges:
        if value <= edge:
            return label
    return '5.0+'


# This function applies all of the functions defined above to the pitch-by-pitch data

In [29]:
def add_to_savant(df):
    """Return a copy of the Statcast pitch-by-pitch data with derived columns added.

    The input must still use the original Statcast column names, so call this
    BEFORE renaming columns. The caller's DataFrame is not modified, which
    makes it safe to re-run the cell that calls this function.

    Columns changed:
        pfx_x, pfx_z: multiplied by -12 and 12 respectively (converted to
            inches, with the sign of the horizontal value flipped).

    Columns added:
        hard_hit: 1 when launch_speed >= 95, else 0.
        sweet_spot: 1 when 8 <= launch_angle <= 32, else 0.
        VAA: vertical approach angle in degrees, rounded to 2 decimals.
        count: 'balls-strikes' string, e.g. '3-2'.
        count_type: 'hitter', 'pitcher' or 'even' ('' for any other count).
        swing: 1 when the pitch description is a swing, else 0. Bunt attempts
            count as swings, so filter them out in later analysis if needed.
        swing_type: 'contact', 'foul', 'whiff', 'take_ball', 'take_strike',
            or 'undef' for any other description.
        attack_zone: 'heart', 'shadow', 'chase' or 'waste', from plate_x / plate_z.
        RV, RV/100: mean delta_run_exp per (player_name, pitch_type), and
            that mean multiplied by 100.
        Rel_height_bucket, Rel_side_bucket: bucket labels for release_pos_z
            and release_pos_x.
        barrel: 0/1 flag computed by code_barrel.

    Parameters:
        df (pd.DataFrame): raw Statcast data as downloaded with pybaseball.

    Returns:
        pd.DataFrame: a new DataFrame holding the original columns plus the
            columns listed above.
    """
    # Work on a copy. The break columns are rescaled below, so changing the
    # caller's frame in place would rescale them again on every re-run.
    df = df.copy()

    # Modify 'pfx_x' and 'pfx_z' columns to get them in inches
    df['pfx_x'] = df['pfx_x'] * -12
    df['pfx_z'] = df['pfx_z'] * 12

    # Create 'hard_hit' column. Statcast counts a ball as hard-hit at 95 mph
    # OR HIGHER, so the comparison is inclusive. The notna() mask keeps missing
    # readings out of the row selector.
    df['hard_hit'] = 0
    has_speed = df['launch_speed'].notna()
    df.loc[has_speed & (df['launch_speed'] >= 95), 'hard_hit'] = 1

    # Create 'sweet_spot' column (launch angle from 8 to 32, both inclusive)
    df['sweet_spot'] = 0
    has_angle = df['launch_angle'].notna()
    df.loc[has_angle & df['launch_angle'].between(8, 32), 'sweet_spot'] = 1

    # Add 'VAA' column.
    # Statcast reports vx0/vy0/vz0 and ax/ay/az at y = 50 ft; the front of home
    # plate is at y = 17/12 ft. Constant-acceleration kinematics gives the
    # y-velocity at the plate (negative: the ball travels toward the plate),
    # then the flight time, then the z-velocity at the plate.
    y_start = 50
    y_plate = 17 / 12
    vy_f = -np.sqrt(df['vy0'] ** 2 - 2 * df['ay'] * (y_start - y_plate))
    t = (vy_f - df['vy0']) / df['ay']
    vz_f = df['vz0'] + df['az'] * t
    df['VAA'] = np.round(-np.arctan(vz_f / vy_f) * (180 / np.pi), 2)

    # Add 'count' column
    df['count'] = df['balls'].astype(str) + '-' + df['strikes'].astype(str)

    # Add 'count_type' column
    df['count_type'] = ''
    df.loc[df['count'].isin(['1-0', '2-0', '3-0', '2-1', '3-1', '3-2']), 'count_type'] = 'hitter'
    df.loc[df['count'].isin(['0-1', '0-2', '1-2']), 'count_type'] = 'pitcher'
    df.loc[df['count'].isin(['0-0', '1-1', '2-2']), 'count_type'] = 'even'

    # Add 'swing' column (1 = swing). Bunt attempts are included on purpose.
    swing_descriptions = [
        'hit_into_play', 'foul', 'swinging_strike', 'swinging_strike_blocked',
        'foul_bunt', 'foul_tip', 'foul_pitchout', 'missed_bunt',
        'bunt_foul_tip', 'swinging_pitchout',
    ]
    df['swing'] = np.where(df['description'].isin(swing_descriptions), 1, 0)

    # Add 'swing_type' column: collapse each pitch description into one of
    # contact / foul / whiff / take_ball / take_strike; anything not listed
    # (including a missing description) becomes 'undef'.
    swing_type_by_description = {
        'hit_into_play': 'contact',
        'foul': 'foul',
        'foul_bunt': 'foul',
        'foul_tip': 'foul',
        'bunt_foul_tip': 'foul',
        'foul_pitchout': 'foul',
        'swinging_strike': 'whiff',
        'swinging_strike_blocked': 'whiff',
        'missed_bunt': 'whiff',
        'swinging_pitchout': 'whiff',
        'ball': 'take_ball',
        'blocked_ball': 'take_ball',
        'hit_by_pitch': 'take_ball',
        'pitchout': 'take_ball',
        'called_strike': 'take_strike',
    }
    df['swing_type'] = df['description'].map(swing_type_by_description).fillna('undef')

    # Create a column called attack_zone (THANKS TO NICK WAN FOR THIS, his KAGGLE competition notebook helped out here)
    # Zones are nested rectangles assigned from the inside out; each outer zone
    # only claims rows that no inner zone has claimed. Everything else,
    # including rows with a missing location, stays 'waste'.
    df['attack_zone'] = 'waste'
    df.loc[(df['plate_x'].between(-0.558, 0.558)) & (df['plate_z'].between(1.833, 3.166)), 'attack_zone'] = 'heart'
    df.loc[(df['plate_x'].between(-1.108, 1.108)) & (df['plate_z'].between(1.166, 3.833)) & (~df['attack_zone'].isin(['heart'])), 'attack_zone'] = 'shadow'
    df.loc[(df['plate_x'].between(-1.666, 1.666)) & (df['plate_z'].between(0.5, 4.5)) & (~df['attack_zone'].isin(['heart', 'shadow'])), 'attack_zone'] = 'chase'

    # Calculate the average delta_run_exp for each player and pitch type
    df['RV'] = df.groupby(['player_name', 'pitch_type'])['delta_run_exp'].transform('mean')
    df['RV/100'] = df['RV'] * 100

    # Add 'Rel_height_bucket' and 'Rel_side_bucket' columns
    df['Rel_height_bucket'] = df['release_pos_z'].apply(map_height_bucket)
    df['Rel_side_bucket'] = df['release_pos_x'].apply(map_side_bucket)

    # Add 'barrel' designation to a batted ball.
    # code_barrel looks for the exit velocity under the name 'exit_velo', but at
    # this point the column is still called 'launch_speed'. Hand it a small
    # two-column frame under the expected name and copy the flag back by
    # position; this also keeps any fill-in of missing values away from the
    # launch_angle / launch_speed columns of the main frame.
    barrel_input = df[['launch_angle', 'launch_speed']].rename(columns={'launch_speed': 'exit_velo'})
    df['barrel'] = code_barrel(barrel_input)['barrel'].to_numpy()

    return df

# These are the column names I will be changing to make querying quicker for me

In [30]:
# Mapping of original Statcast column name -> shorter name used for querying
column_names_dict = dict(
    release_speed='velo',
    release_pos_x='release_side',
    release_pos_z='release_height',
    game_date='date',
    pfx_z='ind_vert_break',
    pfx_x='horizontal_break',
    launch_speed='exit_velo',
    # Add more column names as needed
)

# Function to apply column name changes to the data

In [31]:
def rename_statcast_columns(data, column_names_dict):
    """
    Give the Statcast columns their new names.

    Parameters:
        data (pd.DataFrame): Statcast data pulled from Savant
        column_names_dict (dict): maps each current column name (key) to the
                                  name it should have afterwards (value)

    Returns:
        pd.DataFrame: a DataFrame with the columns renamed; the DataFrame that
                      was passed in keeps its original column names
    """
    return data.rename(columns=column_names_dict)

In [32]:
# Pass the data downloaded from pybaseball through 'add_to_savant' to add the derived columns
modified_data = add_to_savant(data)

In [33]:
# Pass 'modified_data' (the output of 'add_to_savant') through the renaming
# function, using the mapping defined in 'column_names_dict'
data_cleaned = rename_statcast_columns(modified_data, column_names_dict) 
# Optional sanity check: display data_cleaned.head()

# We want to clear out any data in the database table before we load anything new. This is to ensure that if Statcast makes any changes we will always have the most up to date information - thanks to Jeremy Maschino for this suggestion

In [18]:
# Connect to the database (sqlite3 creates the file if it does not exist yet)
conn = sqlite3.connect('2023Statcast.db')
try:
    # Create a cursor object to execute SQL statements
    cur = conn.cursor()
    try:
        # Remove the entire table so the reload starts from scratch.
        # IF EXISTS keeps this cell from failing on a fresh database, where
        # the table has not been created yet.
        cur.execute('DROP TABLE IF EXISTS statcast_data_2023')

        # Commit the changes to the database
        conn.commit()
    finally:
        cur.close()
finally:
    # Always release the connection, even when the statement above fails
    conn.close()

# We will now load all of our updated data into the table 'statcast_data_2023' in the '2023Statcast' database

In [19]:
#Connect to the SQLite database 2023Statcast
conn = sqlite3.connect('2023Statcast.db')

#define the table name
table_name = 'statcast_data_2023'

try:
    # Replace any existing table instead of appending to it. The table is
    # meant to be rebuilt from scratch on every load, and with 'append' a
    # re-run of this cell alone would store every pitch a second time.
    data_cleaned.to_sql(table_name, conn, if_exists='replace', index=False)
finally:
    # Close the connection even if the write fails
    conn.close()
