In [168]:
# Player Similarity Program
import pandas as pd
import numpy as np
import polars as pl
from api_scraper import MLB_Scrape

# Configure pandas so printed frames are not truncated: the column-count,
# row-count and display-width options are each set to None.
for _display_option in ("display.max_columns", "display.max_rows", "display.width"):
    pd.set_option(_display_option, None)

In [None]:
# Event classification tables.
# Each dict maps an event description to 1 (counts toward the statistic) or
# 0 (does not). The matching ``*_events`` list holds the names flagged 1 and is
# what the filters further down pass to ``is_in``.

# At-bats: plate appearances excluding walks, hit-by-pitch, sacrifices and
# catcher interference. A bunt that ends in an ordinary out (groundout, pop out,
# lineout) is an at-bat; only an actual sacrifice ('Sac Bunt') is exempt.
ab_flag_dict = {'Walk': 0, 'Field Error': 1, 'Caught Stealing Home': 0, 'Strikeout Double Play': 1,
                'Sac Fly Double Play': 0, 'Hit By Pitch': 0, 'Runner Out': 0, 'Triple Play': 1,
                'Field Out': 1, 'Flyout': 1, 'Bunt Lineout': 1, 'Catcher Interference': 0, 'Fielders Choice': 1,
                'Sac Bunt': 0, 'Pop Out': 1, 'Caught Stealing 2B': 0, 'Fielders Choice Out': 1, 'Stolen Base 2B': 0,
                'Sac Fly': 0, 'Groundout': 1, 'Home Run': 1, 'Caught Stealing 3B': 0, 'Intent Walk': 0, 'Double Play': 1,
                'Grounded Into DP': 1, 'Forceout': 1, 'Lineout': 1, 'Single': 1, 'Triple': 1, 'Wild Pitch': 0, 'Double': 1,
                'Strikeout': 1, 'Bunt Groundout': 1, 'Bunt Pop Out': 1}

ab_events = [event for event, flag in ab_flag_dict.items() if flag == 1]

# Plate appearances: every event that ends the batter's turn. Pure baserunning
# events (steals, caught stealing, runner outs, wild pitches) are flagged 0.
pa_flag_dict = {'Walk': 1, 'Field Error': 1, 'Caught Stealing Home': 0, 'Strikeout Double Play': 1,
                'Sac Fly Double Play': 1, 'Hit By Pitch': 1, 'Runner Out': 0, 'Triple Play': 1,
                'Field Out': 1, 'Flyout': 1, 'Bunt Lineout': 1, 'Catcher Interference': 1, 'Fielders Choice': 1,
                'Sac Bunt': 1, 'Pop Out': 1, 'Caught Stealing 2B': 0, 'Fielders Choice Out': 1, 'Stolen Base 2B': 0,
                'Sac Fly': 1, 'Groundout': 1, 'Home Run': 1, 'Caught Stealing 3B': 0, 'Intent Walk': 1, 'Double Play': 1,
                'Grounded Into DP': 1, 'Forceout': 1, 'Lineout': 1, 'Single': 1, 'Triple': 1, 'Wild Pitch': 0, 'Double': 1,
                'Strikeout': 1, 'Bunt Groundout': 1, 'Bunt Pop Out': 1}

pa_events = [event for event, flag in pa_flag_dict.items() if flag == 1]

# Hits: single, double, triple and home run only.
hit_flag_dict = {'Walk': 0, 'Field Error': 0, 'Caught Stealing Home': 0, 'Strikeout Double Play': 0,
                 'Sac Fly Double Play': 0, 'Hit By Pitch': 0, 'Runner Out': 0, 'Triple Play': 0,
                 'Field Out': 0, 'Flyout': 0, 'Bunt Lineout': 0, 'Catcher Interference': 0, 'Fielders Choice': 0,
                 'Sac Bunt': 0, 'Pop Out': 0, 'Caught Stealing 2B': 0, 'Fielders Choice Out': 0, 'Stolen Base 2B': 0,
                 'Sac Fly': 0, 'Groundout': 0, 'Home Run': 1, 'Caught Stealing 3B': 0, 'Intent Walk': 0, 'Double Play': 0,
                 'Grounded Into DP': 0, 'Forceout': 0, 'Lineout': 0, 'Single': 1, 'Triple': 1, 'Wild Pitch': 0, 'Double': 1,
                 'Strikeout': 0, 'Bunt Groundout': 0, 'Bunt Pop Out': 0}

hit_events = [event for event, flag in hit_flag_dict.items() if flag == 1]

# On-base percentage denominator: AB + BB + HBP + SF.
# Start from every at-bat event ...
obp_denom_dict = {event: 1 for event, flag in ab_flag_dict.items() if flag == 1}

# ... then add walks, hit-by-pitch and sacrifice flies. A sacrifice fly that
# turns into a double play is still a sacrifice fly, so it is included too.
# Sacrifice bunts are deliberately absent: they are neither an at-bat nor part
# of the OBP denominator.
for event in ["Walk", "Intent Walk", "Hit By Pitch", "Sac Fly", "Sac Fly Double Play"]:
    obp_denom_dict[event] = 1

# Create a list of events to use in counting
obp_denom_events = [event for event, flag in obp_denom_dict.items() if flag == 1]

In [170]:
# Read both CSV inputs with polars, in order: the 2025 dataset, then the batter ID table.
data, batters = (
    pl.read_csv(csv_path) for csv_path in ("2025_data.csv", "batter_id.csv")
)

In [171]:
# Batter under analysis; the frames below are all derived from this name.
player_name = "Wyatt Langford"

# Every row of the dataset recorded for this batter.
player_data = data.filter(pl.col("batter_name") == player_name)

# Subsets of the batter's rows by event category: at-bats, plate appearances, hits.
player_ab_data = player_data.filter(pl.col("event").is_in(ab_events))
player_pa_data = player_data.filter(pl.col("event").is_in(pa_events))
player_hit_data = player_data.filter(pl.col("event").is_in(hit_events))

In [172]:
# Restrict the dataset to the selected batter's rows
player_data = data.filter(pl.col("batter_name") == player_name)


def count_player_events(events):
    """Return the number of the batter's rows whose ``event`` is listed in ``events``.

    Reads the module-level ``player_data`` frame at call time.
    """
    matching_rows = player_data.filter(pl.col("event").is_in(events))
    return matching_rows.height

# Raw totals: at-bats, hits, and the OBP denominator
num_ab = count_player_events(ab_events)
num_hits = count_player_events(hit_events)
obp_denom = count_player_events(obp_denom_events)

# Batting average = H / AB (0 when there are no at-bats)
if num_ab > 0:
    batting_average = round(num_hits / num_ab, 3)
else:
    batting_average = 0

# On-base percentage = (H + BB + IBB + HBP) / OBP denominator (0 when the denominator is empty)
if obp_denom > 0:
    on_base_percentage = round(
        (num_hits + count_player_events(["Walk", "Intent Walk", "Hit By Pitch"])) / obp_denom,
        3,
    )
else:
    on_base_percentage = 0

# Slugging percentage = total bases / AB, weighting each hit type by bases earned
if num_ab > 0:
    slugging_percentage = round(
        sum(
            bases * count_player_events([hit_type])
            for bases, hit_type in ((1, "Single"), (2, "Double"), (3, "Triple"), (4, "Home Run"))
        ) / num_ab,
        3,
    )
else:
    slugging_percentage = 0

# OPS is the sum of the two (already rounded) rates above
ops = round(on_base_percentage + slugging_percentage, 3)

# Report the slash line
print(f"{player_name} - AVG: {batting_average}, OBP: {on_base_percentage}, SLG: {slugging_percentage}, OPS: {ops}")


Wyatt Langford - AVG: 0.241, OBP: 0.344, SLG: 0.431, OPS: 0.775


In [173]:
# Pitch-type groupings. Each dict maps a pitch description to the flag 1; the
# keys are the pitch names that belong to the group.
fastball_dict = dict.fromkeys(['Four-Seam Fastball', 'Sinker', 'Cutter'], 1)

offspeed_dict = dict.fromkeys(['Splitter', 'Changeup', 'Forkball', 'Screwball'], 1)

breaking_dict = dict.fromkeys(
    ['Curveball', 'Knuckle Ball', 'Knuckle Curve', 'Slider', 'Sweeper', 'Slurve', 'Slow Curve'],
    1,
)

In [177]:
# Helper function to calculate pitch group stats
def calculate_pitch_group_stats(pitch_dict, player_df=None):
    """Compute AVG / OBP / SLG / OPS for one group of pitch types.

    Parameters
    ----------
    pitch_dict : dict or iterable of str
        Pitch descriptions that make up the group. When a dict is given only
        its keys are used.
    player_df : polars.DataFrame, optional
        Rows to evaluate; must provide the ``pitch_description`` and ``event``
        columns. Defaults to the module-level ``player_data`` frame, which
        keeps existing one-argument calls working unchanged.

    Returns
    -------
    dict
        ``{"AVG": ..., "OBP": ..., "SLG": ..., "OPS": ...}``, each rounded to
        three decimals. A rate whose denominator is zero is reported as 0.
    """
    if player_df is None:
        player_df = player_data

    # Hand polars an explicit list of names rather than relying on how it
    # happens to iterate a dict.
    pitch_names = list(pitch_dict)
    if not pitch_names:
        # An empty group matches no rows, so every rate is 0.
        return {"AVG": 0, "OBP": 0, "SLG": 0, "OPS": 0}

    # Filter to the pitch group once; each event count below reuses this frame.
    group_df = player_df.filter(pl.col("pitch_description").is_in(pitch_names))

    def count_events(events):
        """Count rows in the pitch group whose ``event`` is one of ``events``."""
        return group_df.filter(pl.col("event").is_in(events)).height

    # ABs, Hits, and OBP denominator (exclude sac bunts)
    num_ab = count_events(ab_events)
    num_hits = count_events(hit_events)
    obp_denom = count_events(obp_denom_events)

    # Batting Average
    avg = round(num_hits / num_ab if num_ab > 0 else 0, 3)

    # On-Base Percentage
    obp = round(
        (num_hits + count_events(["Walk", "Intent Walk", "Hit By Pitch"])) / obp_denom if obp_denom > 0 else 0,
        3
    )

    # Slugging Percentage: total bases per at-bat
    slg = round(
        (1*count_events(["Single"]) + 2*count_events(["Double"]) +
         3*count_events(["Triple"]) + 4*count_events(["Home Run"])) / num_ab if num_ab > 0 else 0,
        3
    )

    # OPS is the sum of the rounded OBP and SLG values above
    ops = round(obp + slg, 3)

    return {"AVG": avg, "OBP": obp, "SLG": slg, "OPS": ops}

# Compute the slash line for each pitch group (fastball, breaking, offspeed — in that order)
fastball_stats, breaking_stats, offspeed_stats = (
    calculate_pitch_group_stats(pitch_group)
    for pitch_group in (fastball_dict, breaking_dict, offspeed_dict)
)

# Report: overall line first, then one line per pitch group
print(
    f"{player_name} Overall Stats: AVG: {batting_average}, OBP: {on_base_percentage}, SLG: {slugging_percentage}, OPS: {ops}",
    f"Fastball Stats: AVG: {fastball_stats['AVG']}, OBP: {fastball_stats['OBP']}, SLG: {fastball_stats['SLG']}, OPS: {fastball_stats['OPS']}",
    f"Breaking Ball Stats: AVG: {breaking_stats['AVG']}, OBP: {breaking_stats['OBP']}, SLG: {breaking_stats['SLG']}, OPS: {breaking_stats['OPS']}",
    f"Offspeed Stats: AVG: {offspeed_stats['AVG']}, OBP: {offspeed_stats['OBP']}, SLG: {offspeed_stats['SLG']}, OPS: {offspeed_stats['OPS']}",
    sep="\n",
)



Wyatt Langford Overall Stats: AVG: 0.241, OBP: 0.344, SLG: 0.431, OPS: 0.775
Fastball Stats: AVG: 0.248, OBP: 0.359, SLG: 0.483, OPS: 0.842
Breaking Ball Stats: AVG: 0.252, OBP: 0.321, SLG: 0.366, OPS: 0.687
Offspeed Stats: AVG: 0.178, OBP: 0.286, SLG: 0.288, OPS: 0.574
