# Predicting a Synthetic Pitcher-Batter Matchup

This notebook demonstrates how to predict the result of a hypothetical at-bat between Dominic Puccetti (pitcher) and Michael Sandle (batter) using their historical data and a trained RandomForestClassifier.

Steps:
1. Aggregate historical stats for each player.
2. Construct synthetic input data for the matchup.
3. Preprocess and encode features.
4. Predict the at-bat result using the trained model.

In [3]:
# Import libraries and load the dataset
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

df = pd.read_csv('fl_data_25.csv')
df.head()

  df = pd.read_csv('fl_data_25.csv')


Unnamed: 0.1,Unnamed: 0,PitchNo,Date,Time,PAofInning,PitchofPA,Pitcher,PitcherId,PitcherThrows,PitcherTeam,...,ThrowTrajectoryZc1,ThrowTrajectoryZc2,PitchReleaseConfidence,PitchLocationConfidence,PitchMovementConfidence,HitLaunchConfidence,HitLandingConfidence,CatcherThrowCatchConfidence,CatcherThrowReleaseConfidence,CatcherThrowLocationConfidence
0,1,1,2025-05-08,18:28:45.34,1,1,"Cook, Cole",1000161000.0,Left,SCH_BOO,...,,,High,High,High,High,High,,,
1,2,2,2025-05-08,18:29:02.44,1,2,"Cook, Cole",1000161000.0,Left,SCH_BOO,...,,,High,High,High,,,,,
2,3,3,2025-05-08,18:29:16.24,1,3,"Cook, Cole",1000161000.0,Left,SCH_BOO,...,,,High,High,High,High,High,,,
3,4,4,2025-05-08,18:29:35.38,1,4,"Cook, Cole",1000161000.0,Left,SCH_BOO,...,,,High,High,High,,,,,
4,5,5,2025-05-08,18:29:51.19,1,5,"Cook, Cole",1000161000.0,Left,SCH_BOO,...,-67.32693,8.75671,High,High,High,,,High,Medium,High


In [None]:
# Aggregate historical stats for pitcher and batter
pitcher_name = ...
batter_name = ...

pitcher_df = df[df['Pitcher'] == pitcher_name]
batter_df = df[df['Batter'] == batter_name]

# Use only if tagging software is not accurate
def retag_pitch(row):
    if row['InducedVertBreak'] < 0:
        return 'Sweeper'
    elif row['InducedVertBreak'] < 7 and row['SpinRate'] > 1900 and row['HorzBreak'] < -3:
        return 'Slider'
    elif row['RelSpeed'] > 85:
        return 'Fastball'
    else:
        return 'Changeup'

pitcher_df['AutoPitchType'] = pitcher_df.apply(retag_pitch, axis=1)

synthetic_row = {}
for col in df.columns:
    if col == 'Pitcher':
        synthetic_row[col] = pitcher_name
    elif col == 'Batter':
        synthetic_row[col] = batter_name
    elif col in ['PitcherThrows', 'PitcherTeam', 'PitcherSet'] and not pitcher_df.empty:
        synthetic_row[col] = pitcher_df[col].mode().iloc[0]
    elif col in ['BatterSide', 'BatterTeam'] and not batter_df.empty:
        synthetic_row[col] = batter_df[col].mode().iloc[0]
    elif col in ['RelSpeed','SpinRate','SpinAxis','Tilt','RelHeight','RelSide','Extension','VertBreak','InducedVertBreak','HorzBreak','PlateLocHeight','PlateLocSide','ZoneSpeed','VertApprAngle','HorzApprAngle','ZoneTime','EffectiveVelo','MaxHeight','MeasuredDuration','SpeedDrop','PitchLastMeasuredX','PitchLastMeasuredY','PitchLastMeasuredZ'] and not pitcher_df.empty and pd.api.types.is_numeric_dtype(pitcher_df[col]):
        synthetic_row[col] = pitcher_df[col].mean()
    elif col in ['ExitSpeed','Angle','Direction','HitSpinRate','Distance','LastTrackedDistance','Bearing','HangTime'] and not batter_df.empty and pd.api.types.is_numeric_dtype(batter_df[col]):
        synthetic_row[col] = batter_df[col].mean()
    elif col in ['Balls','Strikes','Outs','Inning','Top.Bottom','PAofInning','PitchofPA'] and not batter_df.empty:
        synthetic_row[col] = batter_df[col].mode().iloc[0]
    elif col == 'AutoPitchType' and not pitcher_df.empty:
        synthetic_row[col] = pitcher_df[col].mode().iloc[0]
    elif col == 'TaggedPitchType' and not pitcher_df.empty:
        synthetic_row[col] = pitcher_df[col].mode().iloc[0]
    elif col == 'PitchCall' and not pitcher_df.empty:
        synthetic_row[col] = pitcher_df[col].mode().iloc[0]
    elif col == 'TaggedHitType' and not batter_df.empty:
        synthetic_row[col] = batter_df[col].mode().iloc[0]
    elif col == 'PlayResult':
        continue
    else:
        if pd.api.types.is_numeric_dtype(df[col]):
            synthetic_row[col] = df[col].mean()
        else:
            synthetic_row[col] = df[col].mode().iloc[0]

synthetic_input = pd.DataFrame([synthetic_row])
synthetic_input.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pitcher_df['AutoPitchType'] = pitcher_df.apply(retag_pitch, axis=1)


Unnamed: 0.1,Unnamed: 0,PitchNo,Date,Time,PAofInning,PitchofPA,Pitcher,PitcherId,PitcherThrows,PitcherTeam,...,ThrowTrajectoryZc1,ThrowTrajectoryZc2,PitchReleaseConfidence,PitchLocationConfidence,PitchMovementConfidence,HitLaunchConfidence,HitLandingConfidence,CatcherThrowCatchConfidence,CatcherThrowReleaseConfidence,CatcherThrowLocationConfidence
0,78010.0,153.153398,2025-05-15,11:24:35.50,1,1,"Webster, Evan",387989300.0,Left,FLO_Y'A,...,-10.451792,2.353149,High,High,High,High,High,High,High,High


In [43]:
# Preprocess, encode, and impute the synthetic input row
features = [col for col in df.columns if col != 'PlayResult']
categorical_cols = synthetic_input.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    combined = pd.concat([df[col], synthetic_input[col]], axis=0).astype(str).fillna('missing')
    le = LabelEncoder()
    le.fit(combined)
    synthetic_input[col] = le.transform(synthetic_input[col].astype(str).fillna('missing'))

# Drop columns with all missing values before imputation
numeric_cols = synthetic_input.select_dtypes(include=['float64', 'int64']).columns
cat_cols = synthetic_input.select_dtypes(exclude=['float64', 'int64']).columns
for col in numeric_cols:
    if synthetic_input[col].isnull().all() and df[col].isnull().all():
        synthetic_input = synthetic_input.drop(columns=[col])
for col in cat_cols:
    if synthetic_input[col].isnull().all() and df[col].isnull().all():
        synthetic_input = synthetic_input.drop(columns=[col])

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

if len(numeric_cols) > 0:
    valid_numeric = [col for col in numeric_cols if col in synthetic_input.columns]
    synthetic_input[valid_numeric] = num_imputer.fit_transform(pd.concat([df[valid_numeric], synthetic_input[valid_numeric]]))[-1:]
if len(cat_cols) > 0:
    valid_cat = [col for col in cat_cols if col in synthetic_input.columns]
    synthetic_input[valid_cat] = cat_imputer.fit_transform(pd.concat([df[valid_cat], synthetic_input[valid_cat]]))[-1:]

synthetic_input = synthetic_input.dropna(axis=1).dropna(axis=0)
synthetic_input = synthetic_input[[col for col in df[features].columns if col in synthetic_input.columns]]

In [44]:
# Load trained model and predict the result for the synthetic matchup
from joblib import load
try:
    # Drop columns with all missing values in both training and synthetic input
    valid_features = [col for col in features if not (df[col].isnull().all() or synthetic_input[col].isnull().all())]
    X_encoded = df[valid_features].copy()
    synth_input = synthetic_input[valid_features].copy()
    # Encode categorical features
    for col in synth_input.select_dtypes(include=['object', 'category']).columns:
        combined = pd.concat([df[col], synth_input[col]], axis=0).astype(str).fillna('missing')
        le = LabelEncoder()
        le.fit(combined)
        X_encoded[col] = le.transform(X_encoded[col].astype(str).fillna('missing'))
        synth_input[col] = le.transform(synth_input[col].astype(str).fillna('missing'))
    # Impute missing values
    num_cols = synth_input.select_dtypes(include=[np.number]).columns
    cat_cols = synth_input.select_dtypes(exclude=[np.number]).columns
    num_imputer = SimpleImputer(strategy='median')
    cat_imputer = SimpleImputer(strategy='most_frequent')
    if len(num_cols) > 0:
        X_encoded[num_cols] = num_imputer.fit_transform(X_encoded[num_cols])
        synth_input[num_cols] = num_imputer.transform(synth_input[num_cols])
    if len(cat_cols) > 0:
        X_encoded[cat_cols] = cat_imputer.fit_transform(X_encoded[cat_cols])
        synth_input[cat_cols] = cat_imputer.transform(synth_input[cat_cols])
    # Drop columns and rows that still contain NaNs
    X_encoded = X_encoded.dropna(axis=1).dropna(axis=0)
    synth_input = synth_input[X_encoded.columns].dropna(axis=1).dropna(axis=0)
    y = df['PlayResult'].loc[X_encoded.index]
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_encoded, y)
    # Predict for synthetic input
    y_pred_synth = clf.predict(synth_input)
    #print('Predicted result for Dominic Puccetti vs Michael Sandle:')
    #print(y_pred_synth)
except Exception as e:
    print('Error during prediction:', e)

In [45]:
# Improved pitch-by-pitch simulation: realistic location, swing zone, and in-play outs
import random

# Strike zone boundaries
xmin, xmax = -0.9, 0.9
ymin, ymax = 1.6, 3.5

# Stadium dimensions (feet)
LF_RF = 330
GAPS = 375
CF = 390
FENCE_HEIGHT = 8

def get_fence_distance(angle):
    # Angle: 0 = center, -45 = left field line, +45 = right field line
    # Use simple linear interpolation between lines/gaps/center
    angle = float(angle)
    if abs(angle) <= 15:
        return CF
    elif abs(angle) <= 30:
        return GAPS
    else:
        return LF_RF

def classify_hit(distance, angle):
    fence = get_fence_distance(angle)
    # Home Run if over fence
    if distance >= fence:
        return 'HomeRun'
    # Double/Triple if deep outfield but not over fence
    elif distance >= 250:
        # 70% Double, 30% Triple
        return 'Triple' if random.random() < 0.3 else 'Double'
    # Single if past the diamond (roughly > 150 ft)
    elif distance >= 150:
        # Probabilistic assignment for outfield singles vs fly outs and outs
        fly_prob = min(0.5, max(0.1, (distance-150)/100))
        if random.random() < fly_prob:
            # 40% Fly Out, 30% GroundOut, 30% Single
            r = random.random()
            if r < 0.4:
                return 'FlyOut'
            elif r < 0.7:
                return 'GroundOut'
            else:
                return 'Single'
        else:
            # 60% Single, 40% GroundOut
            return 'Single' if random.random() < 0.6 else 'GroundOut'
    else:
        # Infield: ground out, pop out, or infield hit
        # Use angle to help: shallow angle = grounder, steep = pop
        if abs(angle) < 20:
            # 70% GroundOut, 20% InfieldHit, 10% PopOut
            r = random.random()
            if r < 0.7:
                return 'GroundOut'
            elif r < 0.9:
                return 'InfieldHit'
            else:
                return 'PopOut'
        else:
            # 60% PopOut, 30% InfieldHit, 10% GroundOut
            r = random.random()
            if r < 0.6:
                return 'PopOut'
            elif r < 0.9:
                return 'InfieldHit'
            else:
                return 'GroundOut'

max_pitches = random.randint(4, 7)
pitch_features = ['AutoPitchType', 'PlateLocHeight', 'PlateLocSide', 'Balls', 'Strikes', 'PitchCall']
sequence = []
balls = 0
strikes = 0
atbat_end = False
final_result = None

# Get pitcher's historical pitch types and location stats
pitch_types = pitcher_df['AutoPitchType'].dropna().unique() if not pitcher_df.empty else df['AutoPitchType'].dropna().unique()
loc_height_mean = pitcher_df['PlateLocHeight'].mean() if not pitcher_df.empty else df['PlateLocHeight'].mean()
loc_height_std = pitcher_df['PlateLocHeight'].std() if not pitcher_df.empty else df['PlateLocHeight'].std()
loc_side_mean = pitcher_df['PlateLocSide'].mean() if not pitcher_df.empty else df['PlateLocSide'].mean()
loc_side_std = pitcher_df['PlateLocSide'].std() if not pitcher_df.empty else df['PlateLocSide'].std()

# Get batter's swing zone stats
batter_height_mean = batter_df['PlateLocHeight'].mean() if not batter_df.empty else df['PlateLocHeight'].mean()
batter_height_std = batter_df['PlateLocHeight'].std() if not batter_df.empty else df['PlateLocHeight'].std()
batter_side_mean = batter_df['PlateLocSide'].mean() if not batter_df.empty else df['PlateLocSide'].mean()
batter_side_std = batter_df['PlateLocSide'].std() if not batter_df.empty else df['PlateLocSide'].std()

for i in range(max_pitches):
    if atbat_end:
        break
    pitch = synthetic_row.copy()
    # Sample pitch type from pitcher's historical types
    pitch['AutoPitchType'] = random.choice(pitch_types) if len(pitch_types) > 0 else df['AutoPitchType'].mode().iloc[0]
    # Sample location from pitcher's mean ± stddev
    pitch['PlateLocHeight'] = float(random.gauss(loc_height_mean, loc_height_std))
    pitch['PlateLocSide'] = float(random.gauss(loc_side_mean, loc_side_std))
    # Clamp location to reasonable field values
    pitch['PlateLocHeight'] = max(min(pitch['PlateLocHeight'], 5.0), 0.0)
    pitch['PlateLocSide'] = max(min(pitch['PlateLocSide'], 2.0), -2.0)

    pitch['Balls'] = balls
    pitch['Strikes'] = strikes

    # Determine if pitch is in the strike zone
    in_zone = (xmin <= pitch['PlateLocSide'] <= xmax) and (ymin <= pitch['PlateLocHeight'] <= ymax)

    # Simulate batter's swing decision using batter's swing zone
    swing_zone = (abs(pitch['PlateLocHeight'] - batter_height_mean) <= batter_height_std) and (abs(pitch['PlateLocSide'] - batter_side_mean) <= batter_side_std)
    if in_zone and swing_zone:
        swing_prob = 0.85
    elif in_zone:
        swing_prob = 0.7
    elif swing_zone:
        swing_prob = 0.4
    else:
        swing_prob = 0.15
    swing = random.random() < swing_prob

    # Determine pitch call
    if in_zone and not swing:
        pitch_call = 'StrikeCalled'
    elif in_zone and swing:
        r = random.random()
        if r < 0.1:
            pitch_call = 'InPlay'
        elif r < 0.7:
            pitch_call = 'StrikeSwinging'
        else:
            pitch_call = 'FoulBall'
    elif not in_zone and not swing:
        pitch_call = 'BallCalled'
    elif not in_zone and swing:
        r = random.random()
        if r < 0.05:
            pitch_call = 'InPlay'
        elif r < 0.65:
            pitch_call = 'StrikeSwinging'
        else:
            pitch_call = 'FoulBall'

    pitch['PitchCall'] = pitch_call

    # Update balls/strikes based on pitch call and set final_result if at-bat ends
    if pitch_call == 'BallCalled':
        balls += 1
        if balls == 4:
            atbat_end = True
            final_result = 'Walk'
    elif pitch_call in ['StrikeCalled', 'StrikeSwinging']:
        if strikes < 2:
            strikes += 1
        else:
            strikes = 3
            atbat_end = True
            final_result = 'Strikeout'
    elif pitch_call == 'FoulBall':
        if strikes < 2:
            strikes += 1
        # Foul with 2 strikes does not add strike
    elif pitch_call == 'InPlay':
        atbat_end = True
        # final_result will be set after prediction

    sequence.append(pitch)

# Convert sequence to DataFrame
sequence_df = pd.DataFrame(sequence)

# Preprocess sequence for prediction
for col in sequence_df.select_dtypes(include=['object', 'category']).columns:
    combined = pd.concat([df[col].astype(str), sequence_df[col].astype(str)], axis=0).fillna('missing')
    le = LabelEncoder()
    le.fit(combined)
    sequence_df[col] = le.transform(sequence_df[col].astype(str).fillna('missing'))

# Drop columns with all missing values
valid_features = [col for col in sequence_df.columns if not (df[col].isnull().all() or sequence_df[col].isnull().all())]
sequence_df = sequence_df[valid_features]

# Impute missing values
numeric_cols = sequence_df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = sequence_df.select_dtypes(exclude=['float64', 'int64']).columns
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
if len(numeric_cols) > 0:
    valid_numeric = [col for col in numeric_cols if col in sequence_df.columns]
    df_numeric = df[valid_numeric].astype(float)
    seq_numeric = sequence_df[valid_numeric].astype(float)
    sequence_df[valid_numeric] = num_imputer.fit_transform(pd.concat([df_numeric, seq_numeric]))[-len(sequence_df):]
if len(cat_cols) > 0:
    valid_cat = [col for col in cat_cols if col in sequence_df.columns]
    df_cat = df[valid_cat].astype(str)
    seq_cat = sequence_df[valid_cat].astype(str)
    sequence_df[valid_cat] = cat_imputer.fit_transform(pd.concat([df_cat, seq_cat]))[-len(sequence_df):]

# Predict pitch-by-pitch outcomes
try:
    y_pred_sequence = clf.predict(sequence_df[X_encoded.columns.intersection(sequence_df.columns)])
    # Dynamic matchup title
    matchup_title = f"Pitch-by-pitch predictions for {pitcher_name} vs {batter_name}:"
    print(matchup_title)
    for i, pitch in enumerate(sequence):
        pitch_call = pitch['PitchCall']
        balls = pitch['Balls']
        strikes = pitch['Strikes']
        print(f"Pitch {i+1}: Type={pitch['AutoPitchType']}, Location=({pitch['PlateLocSide']:.2f}, {pitch['PlateLocHeight']:.2f}), Call={pitch_call}, Balls={balls}, Strikes={strikes}")
        if pitch_call == 'InPlay':
            final_result = y_pred_sequence[i]
            # Stadium-based hit classification if available
            try:
                hit_idx = i
                hit_distance = sequence_df.iloc[hit_idx].get('Distance', None)
                hit_angle = sequence_df.iloc[hit_idx].get('Angle', None)
                if hit_distance is not None and hit_angle is not None:
                    hit_type = classify_hit(hit_distance, hit_angle)
                    print(f"Stadium-based hit classification: {hit_type} (Distance={hit_distance:.1f} ft, Angle={hit_angle:.1f} deg)")
                    final_result = hit_type
            except Exception as e:
                print(f"Error in stadium hit classification: {e}")
            break
    # If final_result is None or InfieldHit, assign a realistic out
    if final_result in [None, 'None', 'InfieldHit']:
        # Use last pitch context if available
        if 'hit_angle' in locals() and 'hit_distance' in locals():
            # Use hit classification to assign out type
            angle = hit_angle if hit_angle is not None else 0
            distance = hit_distance if hit_distance is not None else 0
            # If infield, assign ground out or pop out
            if distance < 150:
                final_result = 'GroundOut' if abs(angle) < 20 else 'PopOut'
            else:
                final_result = 'FlyOut'
        else:
            # Fallback: assign GroundOut
            final_result = 'GroundOut'
    print(f'Final predicted at-bat result: {final_result}')
except Exception as e:
    print('Error during pitch-by-pitch prediction:', e)

Pitch-by-pitch predictions for Webster, Evan vs Thoroman, Grant:
Pitch 1: Type=Changeup, Location=(1.25, 2.55), Call=FoulBall, Balls=0, Strikes=0
Pitch 2: Type=Changeup, Location=(-1.13, 1.34), Call=FoulBall, Balls=0, Strikes=1
Pitch 3: Type=Fastball, Location=(-0.77, 2.11), Call=InPlay, Balls=0, Strikes=2
Stadium-based hit classification: InfieldHit (Distance=112.3 ft, Angle=14.2 deg)
Final predicted at-bat result: GroundOut


In [46]:
# --- Helper functions for zone and wOBA/xwOBA ---
def get_zone(plate_x, plate_y):
    # 3x3 grid: 1=top left, 2=top mid, 3=top right, 4=mid left, ..., 9=bottom right
    x_bins = np.linspace(xmin, xmax, 4)
    y_bins = np.linspace(ymin, ymax, 4)
    col = np.digitize([plate_x], x_bins)[0]
    row = np.digitize([plate_y], y_bins)[0]
    col = min(max(col, 1), 3)
    row = min(max(row, 1), 3)
    zone = (row - 1) * 3 + col
    return zone

def estimate_xwoba(launch_angle, exit_velo):
    # Simple xwOBA estimator based on Statcast research
    # Barrel: LA 25-35, EV >=98; solid: LA 10-25, EV >=95; weak: low EV or extreme LA
    if exit_velo is None or launch_angle is None:
        return 0.200
    if exit_velo >= 98 and 25 <= launch_angle <= 35:
        return 0.900  # Barrel
    elif exit_velo >= 95 and 10 <= launch_angle <= 25:
        return 0.650  # Solid
    elif exit_velo >= 90 and 0 <= launch_angle <= 40:
        return 0.400  # Decent
    elif exit_velo >= 80 and -10 <= launch_angle <= 50:
        return 0.250  # Weak
    else:
        return 0.150  # Poor

# --- Simulation loop ---
sim_results = []
pitch_zone_summary = []

num_sims = 100000  # More sims for better probability
for sim in range(num_sims):
    balls = 0
    strikes = 0
    atbat_end = False
    sequence = []
    for i in range(random.randint(4, 7)):
        if atbat_end:
            break
        pitch = synthetic_row.copy()
        pitch['AutoPitchType'] = random.choice(pitch_types) if len(pitch_types) > 0 else df['AutoPitchType'].mode().iloc[0]
        pitch['PlateLocHeight'] = float(random.gauss(loc_height_mean, loc_height_std))
        pitch['PlateLocSide'] = float(random.gauss(loc_side_mean, loc_side_std))
        pitch['PlateLocHeight'] = max(min(pitch['PlateLocHeight'], 5.0), 0.0)
        pitch['PlateLocSide'] = max(min(pitch['PlateLocSide'], 2.0), -2.0)
        pitch['Balls'] = balls
        pitch['Strikes'] = strikes
        in_zone = (xmin <= pitch['PlateLocSide'] <= xmax) and (ymin <= pitch['PlateLocHeight'] <= ymax)
        swing_zone = (abs(pitch['PlateLocHeight'] - batter_height_mean) <= batter_height_std) and (abs(pitch['PlateLocSide'] - batter_side_mean) <= batter_side_std)
        if in_zone and swing_zone:
            swing_prob = 0.85
        elif in_zone:
            swing_prob = 0.7
        elif swing_zone:
            swing_prob = 0.4
        else:
            swing_prob = 0.15
        swing = random.random() < swing_prob
        if in_zone and not swing:
            pitch_call = 'StrikeCalled'
        elif in_zone and swing:
            r = random.random()
            if r < 0.1:
                pitch_call = 'InPlay'
            elif r < 0.7:
                pitch_call = 'StrikeSwinging'
            else:
                pitch_call = 'FoulBall'
        elif not in_zone and not swing:
            pitch_call = 'BallCalled'
        elif not in_zone and swing:
            r = random.random()
            if r < 0.05:
                pitch_call = 'InPlay'
            elif r < 0.65:
                pitch_call = 'StrikeSwinging'
            else:
                pitch_call = 'FoulBall'
        pitch['PitchCall'] = pitch_call
        pitch['PitchCount'] = f"{balls}-{strikes}"
        pitch['Zone'] = get_zone(pitch['PlateLocSide'], pitch['PlateLocHeight']) if in_zone else None

        # Simulate batted ball for InPlay
        if pitch_call == 'InPlay':
            # Simulate launch angle and exit velo based on batter's historical mean ± std
            la_mean = batter_df['Angle'].mean() if not batter_df.empty else 15
            la_std = batter_df['Angle'].std() if not batter_df.empty else 10
            ev_mean = batter_df['ExitSpeed'].mean() if not batter_df.empty else 85
            ev_std = batter_df['ExitSpeed'].std() if not batter_df.empty else 7
            launch_angle = float(np.clip(np.random.normal(la_mean, la_std), -20, 50))
            exit_velo = float(np.clip(np.random.normal(ev_mean, ev_std), 60, 110))
            pitch['LaunchAngle'] = launch_angle
            pitch['ExitVelo'] = exit_velo
            pitch['xwOBA'] = estimate_xwoba(launch_angle, exit_velo)
        else:
            pitch['LaunchAngle'] = None
            pitch['ExitVelo'] = None
            pitch['xwOBA'] = None

        pitch_zone_summary.append({
            'PitchType': pitch['AutoPitchType'],
            'Zone': pitch['Zone'],
            'PitchCount': pitch['PitchCount'],
            'PitchCall': pitch_call,
            'LaunchAngle': pitch.get('LaunchAngle', None),
            'ExitVelo': pitch.get('ExitVelo', None),
            'xwOBA': pitch.get('xwOBA', None)
        })

        # Update count
        if pitch_call == 'BallCalled':
            balls += 1
            if balls == 4:
                atbat_end = True
        elif pitch_call in ['StrikeCalled', 'StrikeSwinging']:
            if strikes < 2:
                strikes += 1
            else:
                atbat_end = True
        elif pitch_call == 'FoulBall':
            if strikes < 2:
                strikes += 1
        elif pitch_call == 'InPlay':
            atbat_end = True

# --- Summary Report with Pitch Type Grouping ---

# Map pitch types to simplified categories for this pitcher
def map_pitch_type(ptype):
    if ptype in ['Sinker', 'Two-Seam', 'Four-Seam', 'Other', 'Fastball']:
        return 'Four-Seam'
    elif ptype in ['Splitter', 'Changeup', 'Cutter']:
        return 'Changeup'
    else:
        return str(ptype)

summary_df = pd.DataFrame(pitch_zone_summary)
if 'PitchType' in summary_df.columns:
    summary_df['PitchTypeGroup'] = summary_df['PitchType'].apply(map_pitch_type)
else:
    summary_df['PitchTypeGroup'] = 'Unknown'

inplay = summary_df[summary_df['PitchCall'] == 'InPlay'].copy()

if not inplay.empty:
    # Group by zone, pitch count, and grouped pitch type, aggregate xwOBA and count
    zone_summary = (
        inplay.groupby(['Zone', 'PitchCount', 'PitchTypeGroup'])
        .agg(
            n=('xwOBA', 'count'),
            mean_xwOBA=('xwOBA', 'mean'),
            std_xwOBA=('xwOBA', 'std')
        )
        .reset_index()
        .sort_values(['mean_xwOBA', 'n'], ascending=[False, False])
    )

    print("\n--- Matchup Attack Summary ---")
    print("Zone legend (3x3 grid):")
    print("""
    1 | 2 | 3   <-- Top of zone (high)
    4 | 5 | 6   <-- Middle of zone
    7 | 8 | 9   <-- Bottom of zone (low)

    1 = top left, 2 = top middle, 3 = top right
    4 = middle left, 5 = middle, 6 = middle right
    7 = bottom left, 8 = bottom middle, 9 = bottom right

    Extended Zone Legend:
    - Zone 1: Top left (high and inside to RHH)
    - Zone 2: Top middle (high, middle)
    - Zone 3: Top right (high and away to RHH)
    - Zone 4: Middle left (belt high, inside)
    - Zone 5: Middle middle (belt high, middle)
    - Zone 6: Middle right (belt high, away)
    - Zone 7: Bottom left (low and inside)
    - Zone 8: Bottom middle (low, middle)
    - Zone 9: Bottom right (low and away)
    """)

    print(zone_summary.head(10))
    print("\nTop 3 Recommendations:")
    for idx, row in zone_summary.head(3).iterrows():
        print(
            f"Attack on pitch count {row['PitchCount']} in zone {int(row['Zone'])} "
            f"against pitch type {row['PitchTypeGroup']} "
            f"(mean xwOBA={row['mean_xwOBA']:.3f}, sample size={int(row['n'])})"
        )
else:
    print("No balls in play in these simulations. Try increasing the number of simulations or check data quality.")


--- Matchup Attack Summary ---
Zone legend (3x3 grid):

    1 | 2 | 3   <-- Top of zone (high)
    4 | 5 | 6   <-- Middle of zone
    7 | 8 | 9   <-- Bottom of zone (low)

    1 = top left, 2 = top middle, 3 = top right
    4 = middle left, 5 = middle, 6 = middle right
    7 = bottom left, 8 = bottom middle, 9 = bottom right

    Extended Zone Legend:
    - Zone 1: Top left (high and inside to RHH)
    - Zone 2: Top middle (high, middle)
    - Zone 3: Top right (high and away to RHH)
    - Zone 4: Middle left (belt high, inside)
    - Zone 5: Middle middle (belt high, middle)
    - Zone 6: Middle right (belt high, away)
    - Zone 7: Bottom left (low and inside)
    - Zone 8: Bottom middle (low, middle)
    - Zone 9: Bottom right (low and away)
    
     Zone PitchCount PitchTypeGroup   n  mean_xwOBA  std_xwOBA
420   9.0        3-0       Changeup   3    0.266667   0.125831
121   3.0        2-0      Four-Seam  10    0.250000   0.174801
133   3.0        3-0      Four-Seam   1    0.25000