Importing everything that needs to be imported

In [142]:
import pandas as pd
import os
import numpy as np
from matplotlib.gridspec import GridSpec
import datetime
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.graph_objects as go

Constants

In [187]:
PITCH_COLORS = {
    'Fastball': '#d22d49',
    'Fastballs': '#d22d49',
    'FourSeamFastBall': '#d22d49',
    'TwoSeamFastBall': '#de6a04',
    'Two-Seam': '#de6a04',
    'Sinker': '#de6a04',
    'Cutter': '#933f2c',
    'Slider': '#eee716',
    'Split-Finger': '#3bacac',
    'Splitter': '#3bacac',
    'ChangeUp': '#1dbe3a',
    'Sweeper': '#ddb33a',
    'Curveball': '#00d1ed',
    'Other': '#888888',
    'Undefined': '#888888',
    'OneSeamFastBall': '#de6a04'
}
ZONE_BOUNDS = {
    'min_plate_x': -0.81,
    'max_plate_x': 0.81,
    'min_plate_z': 1.69,
    'max_plate_z': 3.14
}
PITCH_TYPE_MAPPING = {
    'Splitter': 'ChangeUp', 
    'Fastball': 'Fastball', 
    'FourSeamFastBall': 'Fastball', 
    'TwoSeamFastBall': 'Fastball', 
    'Sinker': 'Fastball', 
    'Cutter': 'Fastball'
}

methods defined in original code

In [3]:
def two_three_success(group):
    if any(group.iloc[:3]['PitchCall'] == 'HitByPitch'):
        return 0
    if len(group) < 4:
        if len(group) >= 3 and group.iloc[2]['Balls'] == 2 and group.iloc[2]['PitchCall'] == 'InPlay':
            return 0
        else:
            return 1
    elif len(group) >= 4 and group.iloc[3]['Strikes'] == 2:
        return 1
    else:
        return 0
    
    
def ab_eff_success(group):
    if (len(group) <= 4): return 1
    return 0

def home_plate_drawing(ax4):
    ax4.plot([-0.708, 0.708], [0.15, 0.15], color='black', linewidth=1)
    ax4.plot([-0.708, -0.708], [0.15, 0.3], color='black', linewidth=1)
    ax4.plot([-0.708, 0], [0.3, 0.5], color='black', linewidth=1)
    ax4.plot([0, 0.708], [0.5, 0.3], color='black', linewidth=1)
    ax4.plot([0.708, 0.708], [0.3, 0.15], color='black', linewidth=1)
    
def define_zone(height):
    if height > 2.2:
        return 'upper'
    elif 1.9 <= height <= 2.2:
        return 'middle'
    elif 0.0 <= height < 1.9:
        return 'low'

Method to check if pitch was thrown in the strikezone

In [4]:
def is_in_zone(df):
    return (
        (ZONE_BOUNDS['min_plate_x'] <= df['PlateLocSide'] <= ZONE_BOUNDS['max_plate_x']) &
        (ZONE_BOUNDS['min_plate_z'] <= df['PlateLocHeight'] <= ZONE_BOUNDS['max_plate_z'])
    )

Method to load and preprocess data from csv

In [5]:
def load_and_preprocess_data(file_path, dtype_dict):
    df = pd.read_csv(file_path, usecols = range(92), dtype=dtype_dict, parse_dates=['Date'])
    
    #sets dataframe to only take from auburn pitchers
    mask = ((df['PitcherTeam'].isin(['AUB_TIG', 'AUB_PRC', 'AUB'])))
    df = df.loc[mask]
    
     #rename columns
    df = df.rename(columns={'Top/Bottom': 'Top.Bottom', 
                            'RelSpeed': 'Velo', 
                            'HorzBreak': 'HB', 
                            'SpinRate': 'Spin',
                           'RelSpeed_mean': 'Velo_mean', 
                              'RelSpeed_max': 'Velo_max',
                              'RelSpeed_min': 'Velo_min',
                              'HorzBreak_mean': 'HB_mean',
                              'HorzBreak_max': 'HB_max',
                              'HorzBreak_min': 'HB_min',
                              'SpinRate_mean': 'Spin_mean',
                              'SpinRate_max': 'Spin_max',
                              'SpinRate_min': 'Spin_min',
                           'InducedVertBreak': 'IVB'}
                  )
    
    df['in_zone'] = df.apply(is_in_zone, axis = 1)
   
    # Ensure 'Date' column is in string format to avoid datetime reduction errors
    df['Date'] = df['Date'].astype(str)
    
    # Create a unique identifier for each plate appearance
    df['PlateAppearanceID'] = df['Date'] + "_" + df['Pitcher'] + "_" + df['Top.Bottom'] + "_" + df['Inning'].astype(str) + "_" + df['PAofInning'].astype(str)
    
    plate_appearance_grouped = df.groupby('PlateAppearanceID')
    tts = plate_appearance_grouped.apply(two_three_success).reset_index().rename(columns={0: 'two_three_success'})
    aes = plate_appearance_grouped.apply(ab_eff_success).reset_index().rename(columns={0: 'ab_eff_success'})
    df = pd.merge(df, tts, on='PlateAppearanceID', how='left')
    df = pd.merge(df, aes, on='PlateAppearanceID', how='left')
      
    #vertical approach angle recalculated (from original file)
    df['nVAA'] = df['VertApprAngle'] - (-13.73 + (df['Velo'] * 0.06312) + ((df['PlateLocHeight'] * 1.067)))
    
    # Map the 'TaggedPitchType' column using the pitch mapping dictionary
    df['GeneralPitchType'] = df['TaggedPitchType'].map(PITCH_TYPE_MAPPING).fillna(df['TaggedPitchType'])


    # Create a new column 'PitchCount' that represents the pitch count for each pitcher
    df['PitchCount'] = df.groupby(['Pitcher','TaggedPitchType']).cumcount() + 1

    # Group 'Fastball' and 'FourSeamFastball' together
    df['TaggedPitchType'] = df['TaggedPitchType'].replace({'Fastball': 'Fastball', 'FourSeamFastBall': 'Fastball'})
    
    # Define the pitch calls that indicate a swing
    swing_calls = ['StrikeSwinging', 'InPlay', 'FoulBallNotFieldable', 'FoulBall', 'FoulBallFieldable']

    # Create a new 'Swing' column
    df['Swing'] = df['PitchCall'].isin(swing_calls)
  
    # Create a new column 'PitchCount' that represents the pitch count for each pitcher
    df['PitchCount'] = df.groupby(['Pitcher','TaggedPitchType']).cumcount() + 1

    return df

Method to calculate pitcher metrics from a dataframe

In [6]:
def calculate_pitcher_metrics(df):
    metrics = pd.DataFrame()
    metrics_list = []
    for pitcher in df['Pitcher'].unique():
        
        group = df[df['Pitcher'] == pitcher]
        # FPS (First Pitch Strike): Mean where after first pitch (PitchofPA==2), Strikes==1
        fps = ((group['PitchofPA'] == 2) & (group['Strikes'] == 1)).mean() * 100
        
        # BB%: Walks / total plate appearances
        walks_table = group.groupby('PlateAppearanceID').apply(lambda g: 1 if g['KorBB'].iloc[-1] == 'Walk' else 0)
        walks = sum(k for j, k in walks_table.items())

        total_pa = group['PlateAppearanceID'].nunique()
        bb_perc = (walks / total_pa) * 100 if total_pa > 0 else 0
        
        # 2/3 Success Rate: Mean of 'success' per PA
        two_thirds = (group.groupby('PlateAppearanceID')['two_three_success'].first() == 1).mean() * 100
        
        # AB Efficiency: Mean of 'success' per PA
        ab_eff = (group.groupby('PlateAppearanceID')['ab_eff_success'].first() == 1).mean() * 100
        
        # Zone%: In-zone rate excluding counts where Strikes==2 and Balls<=1
        non_excluded = group[~((group['Strikes'] == 2) & (group['Balls'] <= 1))]
        zone_perc = non_excluded['in_zone'].mean() * 100 if not non_excluded.empty else 0
        
        # Per-pitch-type Zone%: Same exclusion logic, rounded to 2 decimals
        pitch_type_zones = {}
        for pitch_type in group['GeneralPitchType'].unique():
            sub = group[(group['GeneralPitchType'] == pitch_type) & ~((group['Strikes'] == 2) & (group['Balls'] <= 1))]
            pitch_type_zones[f'{pitch_type}_Zone%'] = round(sub['in_zone'].mean() * 100, 2) if not sub.empty else '-'
        
        # Combine into a Series
        series = pd.Series({
            'FPS': round(fps, 2),
            'BB%': round(bb_perc, 2),
            'success_rate': round(two_thirds, 2),
            'ab_efficiency': round(ab_eff, 2),
            'Zone%': round(zone_perc, 2),
            **pitch_type_zones
        })
    
        metrics[pitcher] = series
        
        
        
    # Post-processing: Fill NaNs with '-', sort by index
    metrics = metrics.fillna('-').sort_index()
    
    metrics = metrics.round(2)
    # Sort the DataFrame by index
    metrics = metrics.sort_index()

    return metrics.transpose()

Method for creating metrics for table and helper method for grouping dataframe

In [7]:
#helper method for grouping dataframe by pitcher and TaggedPitchType
def TPS(df):
    return df.groupby(['Pitcher', 'TaggedPitchType'])

def table_metrics(df):
   
    #create dataframe group so we don't have to redo it each time
    main_TPS = TPS(df)
    # Calculate averages and round to 3 decimal places
    grouped = main_TPS.mean(numeric_only=True)[['Velo', 'IVB', 'HB', 'Spin']].round(1)
    
    # Calculate in_zone percentage for each pitch type for each pitcher
    #done with different exclusion logic than earlier; is this on purpose or a redundancy?
    grouped_in_zone = (main_TPS['in_zone'].mean() * 100).round(0)
    
     # Calculate averages for 'RelHeight' and 'Extension'
    grouped_avg = main_TPS.mean(numeric_only=True)[['RelHeight', 'Extension']].round(1)
    
    TPS_no_zone = TPS(df.drop(columns='in_zone'))
    
    # Calculate min, max for each metric excluding in_zone
    grouped_min = TPS_no_zone.min(numeric_only=True)[['Velo', 'IVB', 'HB', 'Spin']].round(1)
    grouped_max =TPS_no_zone.max(numeric_only=True)[['Velo', 'IVB', 'HB', 'Spin']].round(1)

    # Filter dataframe for only fastballs
    df_fastballs = df[df['TaggedPitchType'].isin(['Fastball', 'FourSeamFastBall', 'Sinker', 'TwoSeamFastBall'])].copy()
    df_fastballs['zone'] = df_fastballs['PlateLocHeight'].apply(define_zone)
    
    # Calculate average 'VertApprAngle' for each zone without considering the pitch type
    grouped_vaa = df_fastballs.groupby(['Pitcher', 'zone']).mean(numeric_only=True)['VertApprAngle'].round(1)

    # Unstack the multi-index dataframe to get each zone as a separate column
    grouped_vaa = grouped_vaa.unstack(level=-1)

    # Rename the columns
    grouped_vaa.columns = ['VAA' + col for col in grouped_vaa.columns]
    

    # Combine all metrics
    grouped_final = pd.concat([
        grouped,
        grouped_min.add_suffix('_min'),
        grouped_max.add_suffix('_max'),
        grouped_avg,
        grouped_in_zone.rename('in_zone%'),
        grouped_vaa
    ], axis=1)
        
    # Define the order of the columns
    cols_order = [
        'Velo',
        'Velo_max',
        'Velo_min',
        'IVB',
        'IVB_max',
        'IVB_min',
        'HB',
        'HB_max',
        'HB_min',
        'Spin',
        'Spin_max',
        'Spin_min',
        'in_zone%',
        'RelHeight',
        'Extension',
        'VAAupper',
        'VAAmiddle',
        'VAAlow'
    ]
    # Ensure all expected columns are present
    for col in cols_order:
        if col not in grouped_final.columns:
            grouped_final[col] = float('nan')

    # Reorder the columns
    grouped_final = grouped_final[cols_order]

    # Calculate the count of each TaggedPitchType for each Pitcher
    pitch_type_counts = df.groupby(['Pitcher', 'TaggedPitchType']).size().reset_index(name='count')

    # Reset the index of grouped_final to split the combined index into separate columns
    grouped_final = grouped_final.reset_index()
    
    # Split the combined index into separate 'Pitcher' and 'TaggedPitchType' columns
    grouped_final[['Pitcher', 'TaggedPitchType']] = grouped_final['index'].apply(pd.Series)

    # Drop the old combined index column
    grouped_final.drop(columns=['index'], inplace=True)

    # Merge pitch_type_counts with grouped_final on 'Pitcher' and 'TaggedPitchType'
    grouped_final = pd.merge(pitch_type_counts, grouped_final, on=['Pitcher', 'TaggedPitchType'], how='left')

    # Set the index of grouped_final to be 'Pitcher' and 'TaggedPitchType'
    grouped_final.set_index(['Pitcher', 'TaggedPitchType'], inplace=True)

    # Reorder the columns to put 'count' first
    cols_order = ['count'] + [col for col in grouped_final.columns if col != 'count']
    grouped_final = grouped_final[cols_order]

    # Convert 'count' to integer
    grouped_final['count'] = grouped_final['count'].round(0).astype(int)

    return grouped_final,grouped_vaa

Method to take samples for each pitch type to reduce clutter

In [31]:
def sample_pitches(data, sample_frac):
    if data.empty:
        return data
    sampled = []
    for pitch_type in data['TaggedPitchType'].unique():
        pitch_data = data[data['TaggedPitchType'] == pitch_type]
        # Sample a fraction of pitches
        sampled.append(pitch_data.sample(frac=sample_frac, random_state=42))
    return pd.concat(sampled)

Methods for to plotting and creating the pitcher report

In [189]:
def plot_pitch_movement(ax, data, centroid):
    sns.scatterplot(data=data, x='HB', y='IVB', hue='TaggedPitchType',
                    palette=PITCH_COLORS, s=120, ax=ax)
    ax.set_title('Pitch Movement Plot')
    ax.set_xlabel('Horizontal Break (in)')
    ax.set_ylabel('Induced Vertical Break (in)')
    ax.set_xlim(-25, 25)
    ax.set_ylim(-25, 25)
    # Add axes lines at the origin 
    ax.axhline(0, color='black', linewidth=0.5)
    ax.axvline(0, color='black', linewidth=0.5)
    
     # Add centroid points to the second subplot
    for i in range(len(centroid)):
        ax.scatter(centroid.iloc[i, 0], centroid.iloc[i, 1], color=PITCH_COLORS[centroid.index[i]], s=150, marker='x')

    
def create_pitcher_report(df, pitcher, output_path):
    current_pitcher_data = current_pitcher_data = df[df['Pitcher'] == pitcher]
    # Calculate centroid and round to 3 decimal places
    centroid = current_pitcher_data.groupby('TaggedPitchType')[['HB', 'IVB']].mean().round(3)
    
    # Convert 'Date' column to datetime if not already
    current_pitcher_data['Date'] = pd.to_datetime(current_pitcher_data['Date'], errors='coerce')
        
    # Get date
    #pitch_date = current_pitcher_data['Date'].iloc[0].date()
    
    fig = plt.figure(figsize=(24, 18))
    gs = GridSpec(4, 6, figure=fig, height_ratios=[0.05, 0.75, 1, 3.25])
    # Adjust the spacing between subplots
    plt.subplots_adjust(hspace=0.3)
    # Add the pitcher's name and date at the top left of the page
    ax0 = fig.add_subplot(gs[0, 0])
    ax0.text(0.5, 0.5, f"{pitcher}", ha='center', va='center', fontsize=20)
    ax0.axis('off')
    
    metrics = calculate_pitcher_metrics(df)
    
    success_rate = round(metrics.loc[pitcher]['success_rate'], 1)
    ab_efficiency = round(metrics.loc[pitcher]['ab_efficiency'], 1)
    
    pretabledata, grouped_vaa = table_metrics(df)

    # Check if 'VAAupper', 'VAAmiddle', and 'VAAlow' exist before trying to access them
    if pitcher in grouped_vaa.index:
        vaa_upper = grouped_vaa.loc[pitcher]['VAAupper'] if 'VAAupper' in grouped_vaa.columns else 'N/A'
        vaa_middle = grouped_vaa.loc[pitcher]['VAAmiddle'] if 'VAAmiddle' in grouped_vaa.columns else 'N/A'
        vaa_low = grouped_vaa.loc[pitcher]['VAAlow'] if 'VAAlow' in grouped_vaa.columns else 'N/A'
    else:
        vaa_upper = 'N/A'
        vaa_middle = 'N/A'
        vaa_low = 'N/A'
    
    table_data = pretabledata.drop(columns=[ 'VAAupper', 'VAAmiddle', 'VAAlow']).loc[pitcher]
    ax2 = fig.add_subplot(gs[1, :])
    table = ax2.table(cellText=table_data.values, colLabels=table_data.columns, rowLabels=table_data.index, cellLoc='center', loc='center')
        
    # Set the font size of the table
    table.auto_set_font_size(False)
    table.set_fontsize(10)

    # Scale the table to take up the whole width of the page
    table.scale(1, 1.5)

     # Turn off axis for table subplot
    ax2.axis('off')

    sampled_data = sample_pitches(current_pitcher_data, 0.5)
    
    ax1 = fig.add_subplot(gs[3, 0:2])
    plot_pitch_movement(ax1, current_pitcher_data, centroid)
    ax1.axis('off')
    
    # Filter data for left-handed and right-handed batters
    left_batters = current_pitcher_data[current_pitcher_data['BatterSide'] == 'Left']
    right_batters = current_pitcher_data[current_pitcher_data['BatterSide'] == 'Right']
    left_sampled = sample_pitches(left_batters, 0.5)
    right_sampled = sample_pitches(right_batters, 0.5)
    
    
    #create subplots for pitch location, separated by batter
    markers = {"StrikeCalled": "o", "StrikeSwinging": "v", "BallCalled": "^", "HitByPitch": "<", "FoulBallNotFieldable": ">", "FoulBallFieldable": ">", "FoulBall": ">", "InPlay": "s", "BallinDirt": "D", "BallInDirt": "D"}
    
    ax3 = fig.add_subplot(gs[3, 2:4])
    sns.scatterplot(data=right_sampled, x='PlateLocSide', y='PlateLocHeight', s=120, style='PitchCall', markers=markers, hue='TaggedPitchType', palette=PITCH_COLORS, ax=ax3)

    ax3.set_title('Pitch Location (Pitcher View)(Righty batters)')
    ax3.set_xlabel('Plate Side (ft)')
    ax3.set_ylabel('Plate Height (ft)')
    ax3.set_xlim(-3, 3)
    ax3.set_ylim(0, 6)
    home_plate_drawing(ax3)

    # Add a rectangle to the third subplot
    rectangle = Rectangle((ZONE_BOUNDS['min_plate_x'], ZONE_BOUNDS['min_plate_z']), ZONE_BOUNDS['max_plate_x'] - ZONE_BOUNDS['min_plate_x'], ZONE_BOUNDS['max_plate_z'] - ZONE_BOUNDS['min_plate_z'], fill=False)
    ax3.add_patch(rectangle)
    ax3.axis('off')
    
    
    
   
    ax4 = fig.add_subplot(gs[3, 4:6])
    sns.scatterplot(data=left_sampled, x='PlateLocSide', y='PlateLocHeight', s=120, style='PitchCall', markers=markers, hue='TaggedPitchType', palette=PITCH_COLORS, ax=ax4)
    ax4.set_title('Pitch Location (Pitcher View)(Lefty batters)')
    ax4.set_xlabel('Plate Side (ft)')
    ax4.set_ylabel('Plate Height (ft)')
    ax4.set_xlim(-3, 3)
    ax4.set_ylim(0, 6)
    home_plate_drawing(ax4)

    # Add a rectangle to the fourth subplot
    rectangle = Rectangle((ZONE_BOUNDS['min_plate_x'], ZONE_BOUNDS['min_plate_z']), ZONE_BOUNDS['max_plate_x'] - ZONE_BOUNDS['min_plate_x'], ZONE_BOUNDS['max_plate_z'] - ZONE_BOUNDS['min_plate_z'], fill=False)
    ax4.add_patch(rectangle)
    ax4.axis('off')
    
    
    
    # Create the fourth subplot (Velocity by Pitch Type)
    ax5 = fig.add_subplot(gs[2, 0:4])
    sns.lineplot(data=current_pitcher_data, x='PitchCount', y='Velo', hue='TaggedPitchType', palette=PITCH_COLORS, ax=ax5)
    ax5.set_title('Velocity by Pitch Type')
    ax5.set_xlabel('Pitch Number')
    ax5.set_ylabel('Velo')

    

    # Create the sixth subplot (Pitch Type Pie Chart for Right-Handed Batters)
    ax7 = fig.add_subplot(gs[2,4])
    # Get colors for each pitch type
    right_pitch_usage = right_batters['TaggedPitchType'].value_counts()
    right_colors = [PITCH_COLORS[pitch_type] for pitch_type in right_pitch_usage.index]
    ax7.pie(right_pitch_usage, labels=right_pitch_usage.index, colors=right_colors, autopct='%1.1f%%', startangle=140)
    ax7.set_title('Pitch Usage Percentage (Right-Handed Batters)')

    
    # Create the fifth subplot (Pitch Type Pie Chart for Left-Handed Batters)
    ax6 = fig.add_subplot(gs[2, 5])
    # Get colors for each pitch type
    left_pitch_usage = left_batters['TaggedPitchType'].value_counts()
    left_colors = [PITCH_COLORS[pitch_type] for pitch_type in left_pitch_usage.index]
    ax6.pie(left_pitch_usage, labels=left_pitch_usage.index, colors=left_colors, autopct='%1.1f%%', startangle=140)
    ax6.set_title('(Left-Handed Batters)')


    # Calculate the values for the new table
    total_pitches = len(current_pitcher_data)
    swing_pitches = len(current_pitcher_data[current_pitcher_data['Swing'] == 1])
    first_pitches = current_pitcher_data[current_pitcher_data['PitchofPA'] == 1]
    all_first_pitches = len(first_pitches)

    # Define the condition for a successful first pitch
    conditions = (
        # If the PitchCall is not 'BallCalled' or 'InPlay'
        ((first_pitches['PitchCall'] != 'BallCalled') & (first_pitches['PitchCall'] != 'InPlay')) |
        # Or if the PitchCall is 'InPlay' and the PlayResult is 'Out'
        ((first_pitches['PitchCall'] == 'InPlay') & (first_pitches['PlayResult'] == 'Out'))
    )



    # Apply the conditions to the dataframe
    successful_first_pitches = first_pitches[conditions]

    # Get the number of successful first pitches
    first_pitch_strikes = len(successful_first_pitches)


    fps_percent = round((first_pitch_strikes/all_first_pitches)*100,1)


    iz_pitches = len(current_pitcher_data[current_pitcher_data['in_zone'] == 1])
    oz_pitches = len(current_pitcher_data[current_pitcher_data['in_zone'] == 0])
    whiff = len(current_pitcher_data[current_pitcher_data['PitchCall'] == 'StrikeSwinging'])
    called_strikes = len(current_pitcher_data[current_pitcher_data['PitchCall'] == 'StrikeCalled'])
    iz_swings = len(current_pitcher_data[(current_pitcher_data['in_zone'] == 1) & (current_pitcher_data['Swing'] == 1)])
    iz_whiffs = len(current_pitcher_data[(current_pitcher_data['in_zone'] == 1) & (current_pitcher_data['PitchCall'] == 'StrikeSwinging')])
    #whiff_percent = round(len(current_pitcher_data[current_pitcher_data['PitchCall'] == 'StrikeSwinging']) / swing_pitches * 100, 1)

    csw_percent = round((called_strikes + whiff) / total_pitches * 100,1)

    # Correct calculation for In-Zone Whiff%
    if iz_swings > 0:
        iz_whiff_percent = round(iz_whiffs / iz_swings * 100, 1)
    else:
        iz_whiff_percent = 'NA'
    # Correct calculation for In-Zone Whiff%
    if swing_pitches > 0:
        whiff_percent = round(whiff / swing_pitches * 100, 1)
    else:
        whiff_percent = 'NA'

    if oz_pitches > 0:
        chase_percent = round(len(current_pitcher_data[(current_pitcher_data['Swing'] == 1) & (current_pitcher_data['in_zone'] == 0)]) / oz_pitches * 100, 1)
    else:
        chase_percent = 'NA'

    pitch_types = ['Fastball', 'FourSeamFastBall', 'TwoSeamFastBall', 'Sinker']
    avg_nVAA = round(current_pitcher_data[current_pitcher_data['TaggedPitchType'].isin(pitch_types)]['nVAA'].mean(),3)

    # Create a DataFrame for the new table
    new_table_data = pd.DataFrame({'Total Pitches': [total_pitches], 'CSW%':[csw_percent], 'AB Efficiency': [ab_efficiency], 'FPS%': [fps_percent],  '2 of 3 Success': [success_rate] ,
    'Whiff%': [whiff_percent], 'IZ Whiff%': [iz_whiff_percent], 'Chase%': [chase_percent],'FB nVAA': [avg_nVAA],'FB VAA Upper': [vaa_upper], 'FB VAA Mid': [vaa_middle], 'FB VAA Lower': [vaa_low],})

    # Create a new table in the third row spanning both columns
    
    
    ax8 = fig.add_subplot(gs[0, 1:])
    new_table = ax8.table(cellText=new_table_data.values, colLabels=new_table_data.columns, cellLoc='center', loc='center')

    # Set the font size of the new table
    new_table.auto_set_font_size(False)
    new_table.set_fontsize(10)

    # Scale the new table to take up the whole width of the page
    new_table.scale(1, 1.5)

    # Turn off axis for new table subplot
    ax8.axis('off')

   
    
    plt.tight_layout()
    with PdfPages(output_path) as pdf:
        pdf.savefig(fig, bbox_inches='tight')
    plt.close(fig)

Interactive version of report for plots

In [156]:
def create_interactive_report(df, pitcher, output_path_html):
    
    sample_frac = 0.5
    
    plt.rcParams['font.family'] = 'Arial'
    
    current_pitcher_data = df[df['Pitcher'] == pitcher]
    fig_html = make_subplots(rows=2, cols=3, 
        subplot_titles=[
        'Pitch Movement', 
        'pitch percentage (Righty)', 
        'Pitch Percentage (Lefty)',
        'Velocity', 
        'Pitch Location (Righty) ' + str(sample_frac*100) + '% sample', 
        'Pitch Location (Lefty) ' + str(sample_frac*100) + '% sample'     
    ],
        specs=[[{'type': 'scatter'}, {'type': 'pie'}, {'type': 'pie'}],
                   [{'type': 'xy'}, {'type': 'xy'}, {'type': 'xy'}]]
    )
    # Pitch Movement (all pitch types, color-coded)
    scatter_fig = px.scatter(
        current_pitcher_data,
        x='HB',
        y='IVB',
        color='TaggedPitchType',
        color_discrete_map=PITCH_COLORS,  # Use same colors as matplotlib
        labels={'HB': 'Horizontal Break (in)', 'IVB': 'Induced Vertical Break (in)'},
        hover_data=['TaggedPitchType', 'Velo']
    )

    # Add all traces from px.scatter (one per pitch type)
    for trace in scatter_fig.data:
        fig_html.add_trace(trace, row=1, col=1)
    fig_html.add_shape(
        type='line', x0=0, y0=-25, x1=0, y1=25,
        line=dict(color='black', width=1),
        row=1, col=1
    )
    fig_html.add_shape(
        type='line', x0=-25, y0=0, x1=25, y1=0,
        line=dict(color='black', width=1),
        row=1, col=1
    )

    # Update layout for pitch movement plot
    fig_html.update_xaxes(
        title_text='Horizontal Break (in)', range=[-25, 25], row=1, col=1
    )
    fig_html.update_yaxes(
        title_text='Induced Vertical Break (in)', range=[-25, 25], row=1, col=1
    )
    
    left_batters = current_pitcher_data[current_pitcher_data['BatterSide'] == 'Left']
    right_batters = current_pitcher_data[current_pitcher_data['BatterSide'] == 'Right']
    
    sampled_left = sample_pitches(left_batters, sample_frac)
    sampled_right = sample_pitches(right_batters, sample_frac)
    
    left_usage = left_batters['TaggedPitchType'].value_counts(normalize=True) * 100
    right_usage = right_batters['TaggedPitchType'].value_counts(normalize=True) * 100
    
    fig_html.add_trace(
        go.Pie(
            labels=right_usage.index,
            values=right_usage.values,
            marker_colors=[PITCH_COLORS[pitch_type] for pitch_type in right_usage.index],
            textinfo='label',
            hoverinfo='label+percent',
            hole=0.4,  # Donut style for modern look     
        ),
        row=1, col=2
    )
    
    fig_html.add_trace(
        go.Pie(
            labels=left_usage.index,
            values=left_usage.values,
            marker_colors=[PITCH_COLORS[pitch_type] for pitch_type in left_usage.index],
            textinfo='label',
            hoverinfo='label+percent',
            hole=0.4,  # Donut style for modern look     
        ),
        row=1, col=3
    )

    # Pitch Location (Righty)
    scatter_fig_right = px.scatter(
    sampled_right,
        x='PlateLocSide',
        y='PlateLocHeight',
        color='TaggedPitchType',
        color_discrete_map=PITCH_COLORS,
        labels={'PlateLocSide': 'Plate Side (ft)', 'PlateLocHeight': 'Plate Height (ft)'},
        hover_data=['TaggedPitchType', 'PitchCall', 'Velo'],
    )
    for trace in scatter_fig_right.data:
        fig_html.add_trace(trace, row=2, col=2)
    fig_html.add_shape(
        type='rect',
        x0=ZONE_BOUNDS['min_plate_x'], y0=ZONE_BOUNDS['min_plate_z'],
        x1=ZONE_BOUNDS['max_plate_x'], y1=ZONE_BOUNDS['max_plate_z'],
        line=dict(color='gray', width=1),
        fillcolor='gray', opacity=0.4,
        row=2, col=2
    )
    fig_html.update_xaxes(title_text='Plate Side (ft)', range=[-3, 3], row=2, col=2)
    fig_html.update_yaxes(title_text='Plate Height (ft)', range=[0, 6], row=2, col=2)
    
    #Pitch Location (Lefty)
    scatter_fig_left = px.scatter(
    sampled_left,
        x='PlateLocSide',
        y='PlateLocHeight',
        color='TaggedPitchType',
        color_discrete_map=PITCH_COLORS,
        labels={'PlateLocSide': 'Plate Side (ft)', 'PlateLocHeight': 'Plate Height (ft)'},
        hover_data=['TaggedPitchType', 'PitchCall', 'Velo'],
    )
    for trace in scatter_fig_left.data:
        fig_html.add_trace(trace, row=2, col=3)
    fig_html.add_shape(
        type='rect',
        x0=ZONE_BOUNDS['min_plate_x'], y0=ZONE_BOUNDS['min_plate_z'],
        x1=ZONE_BOUNDS['max_plate_x'], y1=ZONE_BOUNDS['max_plate_z'],
        line=dict(color='gray', width=1),
        fillcolor='gray', opacity=0.4,
        row=2, col=3
    )
    fig_html.update_xaxes(title_text='Plate Side (ft)', range=[-3, 3], row=2, col=3)
    fig_html.update_yaxes(title_text='Plate Height (ft)', range=[0, 6], row=2, col=3)
        
    
    # Velocity by Pitch Type (Smoothed)
    velocity_data = current_pitcher_data.copy()
    for pitch_type in velocity_data['TaggedPitchType'].unique():
        pitch_subset = velocity_data[velocity_data['TaggedPitchType'] == pitch_type]
        pitch_subset = pitch_subset.sort_values('PitchCount')  # Ensure chronological order
        pitch_subset['Velo_smooth'] = pitch_subset['Velo'].rolling(window=5, min_periods=1).mean()
        fig_html.add_trace(
            go.Scatter(
                x=pitch_subset['PitchCount'],
                y=pitch_subset['Velo_smooth'],
                mode='lines',
                name=pitch_type,
                line=dict(color=PITCH_COLORS[pitch_type]),
                hovertemplate='Pitch %{x}<br>Velocity: %{y:.1f} mph<br>Type: %{text}',
                text=[pitch_type] * len(pitch_subset)
            ),
            row=2, col=1
        )
    fig_html.update_xaxes(title_text='Pitch Number', row=2, col=1)
    fig_html.update_yaxes(title_text='Velo (mph)', row=2, col=1)
    
    
    fig_html.update_traces(showlegend=False, row=2, col=2)
    fig_html.update_traces(showlegend=False, row=2, col=3)
    
    fig_html.add_annotation(
    text=f'{pitcher}',
    x=0, y=1.1,
    xanchor='left', yanchor='top',
    xref='paper', yref='paper',
    showarrow=False,
    font=dict(size=16)
    )
    
    pio.write_html(fig_html, output_path_html)
 

Creating the plots

In [93]:
input_file = r'C:\Users\gavin\Au_Baseball\TrackMan_SMML_Master_CSV.csv'
df = load_and_preprocess_data(input_file, dtype_dict = {'Notes': 'string'})

In [191]:
pitcher = 'Dutton, Samuel'
output_file = r'C:\Users\gavin\Au_baseball\pitcher_report_' + pitcher + '.pdf'
output_html = r'C:\Users\gavin\Au_baseball\interactive_report_' + pitcher + '.html'
create_pitcher_report(df, pitcher, output_file)
create_interactive_report(df, pitcher, output_html)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

