Import all needed packages

In [1]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from matplotlib.gridspec import GridSpec
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import os
import shutil
from tkinter import filedialog
from tkinter import Tk
import tkinter as tk

Methods from original code

In [2]:
def select_file():
    """Ask the user to pick a file through a native "Open" dialog.

    A hidden Tk root window is created only to host the dialog and is always
    destroyed afterwards, even if showing the dialog raises, so the cell can
    be re-run safely.

    Returns:
        The value returned by ``filedialog.askopenfilename()`` (the selected path).
    """
    # Create the root window; it is withdrawn below so no empty GUI window appears.
    root = tk.Tk()
    try:
        root.withdraw()

        # Print a message before showing the file dialog
        print("Opening file dialog...")

        # Show an "Open" dialog box and capture the selected path
        filepath = filedialog.askopenfilename()
    finally:
        # Guarantee the Tkinter instance is torn down, including on errors
        root.destroy()

    # Print the selected file path to confirm
    print(f"Selected file: {filepath}")

    return filepath

#edited to return boolean
def two_three_success(group):
    """Decide whether one plate appearance counts as a "2 of 3" success.

    Args:
        group: DataFrame holding the pitches of a single plate appearance, in
            order, with 'PitchCall', 'Balls' and 'Strikes' columns.

    Returns:
        A truthy/falsy flag:
        * True when any of the first three pitches is a 'HitByPitch'.
        * For four or more pitches: whether 'Strikes' on the fourth row equals 2.
        * For exactly three pitches: False only when the third row has
          'Balls' == 2 and 'PitchCall' == 'InPlay'.
        * True for anything shorter.
    """
    opening_pitches = group.iloc[:3]
    if any(opening_pitches['PitchCall'] == 'HitByPitch'):
        return True

    pitch_total = len(group)

    # Longer plate appearances are judged by the strike count on the 4th row.
    if pitch_total >= 4:
        return group.iloc[3]['Strikes'] == 2

    if pitch_total == 3:
        third_pitch = group.iloc[2]
        two_balls_in_play = third_pitch['Balls'] == 2 and third_pitch['PitchCall'] == 'InPlay'
        return not two_balls_in_play

    return True
   
    
#edited to return boolean
def ab_eff_success(group):
    """Return True when the plate appearance took four pitches or fewer.

    Args:
        group: Sized collection (e.g. a DataFrame) with one entry per pitch.
    """
    pitches_thrown = len(group)
    return pitches_thrown <= 4
    

def home_plate_drawing(ax4):
    """Draw a home-plate outline on ``ax4`` as five thin black line segments.

    Args:
        ax4: Object exposing a matplotlib-style ``plot(xs, ys, **kwargs)`` method.
    """
    # Each entry is one edge of the pentagon: (x endpoints, y endpoints).
    plate_edges = (
        ([-0.708, 0.708], [0.15, 0.15]),
        ([-0.708, -0.708], [0.15, 0.3]),
        ([-0.708, 0], [0.3, 0.5]),
        ([0, 0.708], [0.5, 0.3]),
        ([0.708, 0.708], [0.3, 0.15]),
    )
    for edge_x, edge_y in plate_edges:
        ax4.plot(edge_x, edge_y, color='black', linewidth=1)
    
def define_zone(height):
    """Classify a height into a vertical band.

    Returns:
        'low' for 0.0 <= height < 1.9, 'middle' for 1.9 <= height <= 2.2,
        'upper' for height > 2.2, and None otherwise (e.g. negative or NaN).
    """
    if 0.0 <= height < 1.9:
        return 'low'
    if 1.9 <= height <= 2.2:
        return 'middle'
    if height > 2.2:
        return 'upper'
    # Nothing matched: negative heights and NaN fall through.
    return None

Constants: pitch-type colors, strike-zone bounds, and the pitch-type grouping map

In [3]:
# Hex colour per pitch-type label; labels that share a colour are grouped together.
PITCH_COLORS = {
    **dict.fromkeys(('Fastball', 'Fastballs', 'FourSeamFastBall'), '#d22d49'),
    **dict.fromkeys(('TwoSeamFastBall', 'Two-Seam', 'Sinker'), '#de6a04'),
    'Cutter': '#933f2c',
    'Slider': '#eee716',
    **dict.fromkeys(('Split-Finger', 'Splitter'), '#3bacac'),
    'ChangeUp': '#1dbe3a',
    'Sweeper': '#ddb33a',
    'Curveball': '#00d1ed',
    'Other': '#888888',
}

# Inclusive plate-location limits used for the in-zone test.
ZONE_BOUNDS = dict(
    min_plate_x=-0.81,
    max_plate_x=0.81,
    min_plate_z=1.69,
    max_plate_z=3.14,
)

# Collapses tagged pitch types into broader groups; unlisted types are not remapped here.
PITCH_TYPE_MAPPING = {
    'Splitter': 'ChangeUp',
    **dict.fromkeys(
        ('Fastball', 'FourSeamFastBall', 'TwoSeamFastBall', 'Sinker', 'Cutter'),
        'Fastball',
    ),
}

Method for checking if pitch was in zone

In [4]:
def is_in_zone(df):
    """Test whether pitch location(s) fall inside ``ZONE_BOUNDS`` (bounds inclusive).

    Works both row-wise (``df`` is a single row, e.g. via
    ``DataFrame.apply(is_in_zone, axis=1)``) and vectorised (``df`` is a whole
    DataFrame). The previous chained comparison ``lo <= x <= hi`` only worked
    for scalars and raised on Series input.

    Args:
        df: Row or DataFrame with 'PlateLocSide' and 'PlateLocHeight' entries.

    Returns:
        A single boolean for a row, or a boolean Series for a DataFrame.
        NaN locations compare as not in zone.
    """
    side = df['PlateLocSide']
    height = df['PlateLocHeight']
    # Explicit pairwise comparisons joined with & broadcast over Series and
    # still reduce to a plain boolean for scalar inputs.
    return (
        ((side >= ZONE_BOUNDS['min_plate_x']) & (side <= ZONE_BOUNDS['max_plate_x'])) &
        ((height >= ZONE_BOUNDS['min_plate_z']) & (height <= ZONE_BOUNDS['max_plate_z']))
    )

Function for loading and preprocessing the data

In [7]:
def load_and_preprocess_data(dtype_dict, filepath=None):
    """Read the TrackMan CSV and add the derived per-pitch columns.

    Processing steps:
      1. Read the first 92 columns of the CSV, parsing 'Date'.
      2. Keep rows whose 'PitcherTeam' is one of the Auburn codes.
      3. Rename several columns to short names (Velo, HB, Spin, IVB, ...).
      4. Add 'in_zone', 'GeneralPitchType', 'PlateAppearanceID',
         'two_three_success', 'ab_eff_success', 'nVAA', 'PitchCount', 'Swing'.

    Args:
        dtype_dict: Column -> dtype mapping forwarded to ``pd.read_csv``.
        filepath: Optional CSV path. When omitted, the previously hard-coded
            location is used so existing callers behave as before.

    Returns:
        pandas.DataFrame with one row per retained pitch. The two merges give
        it a fresh 0..n-1 index.
    """
    if filepath is None:
        #filepath = select_file()
        filepath = r'C:\Users\gavin\Au_Baseball\TrackMan_SMML_Master_CSV.csv'
    df = pd.read_csv(filepath, usecols=range(92), dtype=dtype_dict, parse_dates=['Date'])

    # Restrict the frame to Auburn pitchers
    df = df.loc[df['PitcherTeam'].isin(['AUB_TIG', 'AUB_PRC', 'AUB'])]

    # Rename columns (names that are absent from the frame are ignored by rename)
    df = df.rename(columns={'Top/Bottom': 'Top.Bottom',
                            'RelSpeed': 'Velo',
                            'HorzBreak': 'HB',
                            'SpinRate': 'Spin',
                            'RelSpeed_mean': 'Velo_mean',
                            'RelSpeed_max': 'Velo_max',
                            'RelSpeed_min': 'Velo_min',
                            'HorzBreak_mean': 'HB_mean',
                            'HorzBreak_max': 'HB_max',
                            'HorzBreak_min': 'HB_min',
                            'SpinRate_mean': 'Spin_mean',
                            'SpinRate_max': 'Spin_max',
                            'SpinRate_min': 'Spin_min',
                            'InducedVertBreak': 'IVB'})

    # Vectorised form of the inclusive bounds test performed by is_in_zone;
    # avoids a Python-level call per row. NaN locations evaluate to False.
    df['in_zone'] = (
        df['PlateLocSide'].between(ZONE_BOUNDS['min_plate_x'], ZONE_BOUNDS['max_plate_x'])
        & df['PlateLocHeight'].between(ZONE_BOUNDS['min_plate_z'], ZONE_BOUNDS['max_plate_z'])
    )

    # Map 'TaggedPitchType' through the pitch mapping; unmapped types keep their tag
    df['GeneralPitchType'] = df['TaggedPitchType'].map(PITCH_TYPE_MAPPING).fillna(df['TaggedPitchType'])

    # Create a unique identifier for each plate appearance. The date is
    # stringified only for the key, so the 'Date' column itself is not mutated.
    df['PlateAppearanceID'] = (
        df['Date'].astype(str) + "_" + df['Pitcher'] + "_" + df['Top.Bottom']
        + "_" + df['Inning'].astype(str) + "_" + df['PAofInning'].astype(str)
    )

    # Hand the success helpers only the columns they read. This keeps datetime
    # columns out of groupby.apply (the reason 'Date' used to be cast to str)
    # and keeps the grouping column out of the applied frames.
    pa_pitches = df.groupby('PlateAppearanceID')[['PitchCall', 'Balls', 'Strikes']]
    tts = pa_pitches.apply(two_three_success).reset_index().rename(columns={0: 'two_three_success'})
    aes = pa_pitches.apply(ab_eff_success).reset_index().rename(columns={0: 'ab_eff_success'})
    df = pd.merge(df, tts, on='PlateAppearanceID', how='left')
    df = pd.merge(df, aes, on='PlateAppearanceID', how='left')

    # Ensure 'Date' is datetime for later use (unparseable values become NaT)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

    # Vertical approach angle recalculated (from original file)
    df['nVAA'] = df['VertApprAngle'] - (-13.73 + (df['Velo'] * 0.06312) + ((df['PlateLocHeight'] * 1.067)))

    # Running count, in row order, within each (Pitcher, TaggedPitchType) pair
    df['PitchCount'] = df.groupby(['Pitcher', 'TaggedPitchType']).cumcount() + 1

    # Define the pitch calls that indicate a swing
    swing_calls = ['StrikeSwinging', 'InPlay', 'FoulBallNotFieldable', 'FoulBall', 'FoulBallFieldable']

    # Create a new 'Swing' column
    df['Swing'] = df['PitchCall'].isin(swing_calls)

    return df

Function to calculate the first set of per-pitcher metrics

In [10]:
def metrics1(df):
    """Build a per-pitcher summary table of command metrics.

    Args:
        df: Pitch-level DataFrame with columns 'Pitcher', 'PlateAppearanceID',
            'PitchofPA', 'Strikes', 'Balls', 'KorBB', 'in_zone',
            'GeneralPitchType' and the boolean per-plate-appearance flags
            'two_three_success' / 'ab_eff_success'.

    Returns:
        DataFrame indexed by pitcher (sorted) with columns 'FPS', '2/3',
        'AB Efficiency', 'BB%', 'Zone%' and one '<pitch type>_Zone%' column per
        pitch type. Values are percentages rounded to 2 decimals; missing
        values are shown as '-'.
    """
    metrics_list = []

    # Iterate over each unique pitcher
    for pitcher in df['Pitcher'].unique():
        df_pitcher = df[df['Pitcher'] == pitcher]
        by_pa = df_pitcher.groupby('PlateAppearanceID')

        # NOTE(review): this averages over every pitch of the pitcher, not over
        # plate appearances - confirm that is the intended denominator.
        FPS = ((df_pitcher['PitchofPA'] == 2) & (df_pitcher['Strikes'] == 1)).mean() * 100

        # Number of plate appearances whose final pitch row is tagged as a walk
        walks = by_pa['KorBB'].apply(lambda calls: calls.iloc[-1] == 'Walk').sum()

        # Total number of plate appearances
        total_plate_appearances = df_pitcher['PlateAppearanceID'].nunique()

        # BB%
        BB_percentage = (walks / total_plate_appearances) * 100 if total_plate_appearances != 0 else 0

        # The success flags are booleans; comparing them with the string
        # "success" always yielded 0%. Count True values instead.
        two_thirds = by_pa['two_three_success'].first().eq(True).mean() * 100
        AB_Efficiency = by_pa['ab_eff_success'].first().eq(True).mean() * 100

        # Zone% ignores pitches thrown with two strikes and at most one ball
        counts_for_zone = ~((df_pitcher['Strikes'] == 2) & (df_pitcher['Balls'] <= 1))
        Zone_percentage = df_pitcher.loc[counts_for_zone, 'in_zone'].mean() * 100

        # Zone% for each pitch type for the current pitcher
        pitch_type_metrics = {}
        for pitch_type in df_pitcher['GeneralPitchType'].unique():
            of_type = df_pitcher['GeneralPitchType'] == pitch_type
            pitch_type_metrics[f'{pitch_type}_Zone%'] = round(
                df_pitcher.loc[of_type & counts_for_zone, 'in_zone'].mean() * 100, 2)

        # Append the metrics for the current pitcher to the list
        metrics_list.append(pd.Series({'FPS': FPS, '2/3': two_thirds, 'AB Efficiency': AB_Efficiency,
                                       'BB%': BB_percentage, 'Zone%': Zone_percentage,
                                       **pitch_type_metrics}, name=pitcher))

    # Convert the list of metrics to a DataFrame
    metrics = pd.DataFrame(metrics_list)

    # Round BEFORE inserting the '-' placeholder: once a column holds '-', it
    # becomes object dtype and DataFrame.round leaves it untouched.
    metrics = metrics.round(2)

    # Replace NaN values with '-'
    metrics = metrics.fillna('-')

    # Sort the DataFrame by index
    metrics = metrics.sort_index()

    return metrics

In [None]:
# Load and preprocess the pitch data; the 'Notes' column is read with the 'string' dtype.
df = load_and_preprocess_data(dtype_dict = {'Notes': 'string'})
# Build the per-pitcher metrics table from the preprocessed pitches and print it.
metrics = metrics1(df)
print(metrics)