In [3]:
%pip install pybaseball




In [None]:
import os
import time
import pandas as pd
from pybaseball import statcast
import pybaseball

pybaseball.cache.enable()

# Folders
MONTHLY_DIR = "statcast_monthly"
YEARLY_DIR = "statcast_yearly"
os.makedirs(MONTHLY_DIR, exist_ok=True)
os.makedirs(YEARLY_DIR, exist_ok=True)

# Columns you actually need
USE_COLS = [
    "pitch_type", "pitch_name",
    "batter", "pitcher",
    "stand", "p_throws",
    "balls", "strikes",
    "outs_when_up", "inning", "inning_topbot",
    "game_pk", "game_date",
    "at_bat_number", "pitch_number",
    "release_speed",
    "pfx_x", "pfx_z","zone",
    "plate_x", "plate_z",
    "home_score", "away_score",
    "on_1b", "on_2b", "on_3b"
]

def safe_statcast(start_dt, end_dt, retries=5, wait=5):
    """Statcast with retry logic."""
    for attempt in range(1, retries + 1):
        try:
            df = statcast(start_dt=start_dt, end_dt=end_dt)
            if df is not None and not df.empty:
                return df
        except Exception as e:
            print(f"Attempt {attempt} failed for {start_dt} → {end_dt}: {e}")

        print(f"Retrying in {wait} seconds...")
        time.sleep(wait)

    print(f"FAILED permanently: {start_dt} → {end_dt}")
    return None


# Download monthly files
log = []
for year in range(2015, 2025):
    for month in range(4, 12):  # April–November
        start = f"{year}-{month:02d}-01"
        end = f"{year}-{month:02d}-28"
        filename = f"{MONTHLY_DIR}/statcast_{year}_{month:02d}.csv"

        if os.path.exists(filename):
            continue

        print(f"Pulling {start} → {end}...")
        df_month = safe_statcast(start, end)

        if df_month is not None and not df_month.empty:
            df_month = df_month[USE_COLS]  # keep only needed columns
            df_month.to_csv(filename, index=False)
            log.append((year, month, "SUCCESS", df_month.shape[0]))
        else:
            log.append((year, month, "FAILED", 0))


# Combine into yearly files
import glob

monthly_files = glob.glob(f"{MONTHLY_DIR}/*.csv")
files_by_year = {}

for f in monthly_files:
    base = os.path.basename(f)
    year = int(base.split("_")[1])
    files_by_year.setdefault(year, []).append(f)

for year, files in sorted(files_by_year.items()):
    print(f"Combining year {year} ({len(files)} months)...")
    df_year = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
    out_path = f"{YEARLY_DIR}/statcast_{year}.csv"
    df_year.to_csv(out_path, index=False)


# Save log
log_df = pd.DataFrame(log, columns=["year", "month", "status", "rows"])
log_df.to_csv("statcast_download_log.csv", index=False)


In [8]:
import glob
df = []
files = glob.glob("statcast_yearly/*.csv")
df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

print(df.shape)

(6233268, 26)


In [9]:
#['SL' 'FF' 'SI' 'CH' 'FC' 'CU' 'FS' 'KC' 'EP' 'ST' 'SV' 'KN' 'FO' 'FA' 'SC' 'CS']
#['Slider' '4-Seam Fastball' 'Sinker' 'Changeup' 'Cutter' 'Curveball''Split-Finger'
# 'Knuckle Curve' 'Eephus''Sweeper' 'Slurve' 'Knuckleball' 'Forkball' 'Other' 'Screwball' 'Slow Curve']

df = df.dropna(subset=["pitch_type", "zone"])
valid_pitches = ['SL','FF','SI','CH','FC','CU','FS','KC','EP','ST','SV','KN','FO','FA','SC','CS']
df = df[df["pitch_type"].isin(valid_pitches)]


In [25]:
from pybaseball import batting_stats
batter_stats = []
for year in range(2015, 2025):
    print(f"Pulling {year} data...")
    df_year = batting_stats(year)
    batter_stats.append(df_year)
bs = pd.concat(batter_stats,ignore_index= True)


Pulling 2015 data...
Pulling 2016 data...
Pulling 2017 data...
Pulling 2018 data...
Pulling 2019 data...
Pulling 2020 data...
Pulling 2021 data...
Pulling 2022 data...
Pulling 2023 data...
Pulling 2024 data...


In [27]:
bs

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,11579,2015,Bryce Harper,WSN,22,153,521,654,172,91,...,116.0,188,0.477,394,0.118,0.226,0.293,0.543,0.418,9.3
1,10155,2015,Mike Trout,LAA,23,159,575,682,172,93,...,117.7,205,0.486,422,0.207,0.282,0.297,0.588,0.422,9.3
2,5038,2015,Josh Donaldson,TOR,29,158,620,711,184,100,...,113.6,233,0.467,499,0.145,0.255,0.279,0.542,0.388,8.7
3,4314,2015,Joey Votto,CIN,31,158,545,695,171,107,...,109.3,175,0.425,412,0.177,0.254,0.289,0.542,0.423,7.3
4,9218,2015,Paul Goldschmidt,ARI,27,159,567,695,182,109,...,114.0,197,0.465,424,0.181,0.266,0.279,0.558,0.403,7.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1370,17901,2024,Andrew Benintendi,CHW,29,135,477,522,109,69,...,109.5,132,0.344,384,0.159,0.245,0.243,0.389,0.303,0.0
1371,19287,2024,Adolis Garcia,TEX,31,154,580,637,130,78,...,116.1,196,0.479,409,0.127,0.294,0.227,0.413,0.306,0.1
1372,17982,2024,Ty France,- - -,29,140,479,535,112,75,...,111.1,148,0.400,370,0.128,0.237,0.228,0.387,0.303,-0.9
1373,21897,2024,Christopher Morel,- - -,25,152,535,611,105,70,...,111.4,154,0.404,381,0.149,0.298,0.225,0.399,0.316,-1.1


In [34]:
from pybaseball import playerid_reverse_lookup  


batter_ids = df['batter'].dropna().unique()
id_map = playerid_reverse_lookup(batter_ids, key_type='mlbam')
bstats = bs.merge(
    id_map[['key_mlbam', 'key_fangraphs']],
    left_on='IDfg',
    right_on='key_fangraphs',
    how='inner'
)
bstats = bstats[['key_mlbam', 'AVG', 'OBP', 'SLG', 'OPS', 'ISO', 'BB%', 'K%']]
bstats = bstats.rename(columns = {'key_mlbam' : 'batter'})
bstats['batter'] = bstats['batter'].astype('int32', errors='ignore')
df['batter'] = df['batter'].astype('int32', errors='ignore')

# 3. Remove duplicate MLBAM IDs
bstats = bstats.drop_duplicates(subset='batter')

# 4. Set index to MLBAM ID
bstats = bstats.set_index('batter')

# 5. Now mapping works
df['batter_avg'] = df['batter'].map(bstats['AVG'])
df['batter_obp'] = df['batter'].map(bstats['OBP'])
df['batter_slg'] = df['batter'].map(bstats['SLG'])
df['batter_ops'] = df['batter'].map(bstats['OPS'])
df['batter_iso'] = df['batter'].map(bstats['ISO'])
df['batter_bb_rate'] = df['batter'].map(bstats['BB%'])
df['batter_k_rate'] = df['batter'].map(bstats['K%'])



In [43]:
for col in ['batter_avg','batter_obp','batter_slg','batter_ops','batter_iso','batter_bb_rate','batter_k_rate']:
    df[col] = df[col].fillna(df[col].mean())


In [None]:
# sort by game ID, at bat #, and pitch #. 
# Then find the ordering of pitch type and location and create next pitch/zone for all vlaues.
# Then deal with rows where next pitch is not defined, walks or strikeouts 
stats = df.sort_values(
    ["game_pk", "at_bat_number", "pitch_number"]
).reset_index(drop=True)

stats["prev_pitch_type"] = stats.groupby(
    ["game_pk", "at_bat_number"]
)["pitch_type"].shift(0)

stats["prev_zone"] = stats.groupby(
    ["game_pk", "at_bat_number"]
)["zone"].shift(0)

stats["prev2_pitch_type"] = stats.groupby(
    ["game_pk", "at_bat_number"]
)["pitch_type"].shift(1)

stats["prev2_zone"] = stats.groupby(
    ["game_pk", "at_bat_number"]
)["zone"].shift(1)

stats = stats.dropna(
    subset=["prev_pitch_type", "prev2_pitch_type"]
).copy()


In [33]:
df = df.sort_values(["game_pk", "at_bat_number", "pitch_number"])
pitcher_repertoire = df.groupby("pitcher")["pitch_type"].unique().to_dict()

In [35]:
test = df.sort_values(["AVG"])

In [5]:
stats["next_pitch_type"] = stats.groupby(
    ["game_pk", "at_bat_number"]
)["pitch_type"].shift(-1)

stats["next_zone"] = stats.groupby(
    ["game_pk", "at_bat_number"]
)["zone"].shift(-1)

stats = stats.dropna(
    subset=["next_pitch_type", "next_zone"]
).copy()

stats["target"] = (
    stats["next_pitch_type"] + "_" +
    stats["next_zone"].astype(int).astype(str)
)


In [6]:
FEATURES = [
    "balls",
    "strikes",
    "outs_when_up",
    "inning",
    "inning_topbot",
    "p_throws",
    "stand",
    "prev_pitch_type",
    "prev_zone",
    "prev2_pitch_type",
    "prev2_zone"
]


CATEGORICAL_FEATURES = [
    "p_throws",
    "stand",
    "inning_topbot",
    "prev_pitch_type",
    "prev_zone",
    "prev2_pitch_type",
    "prev2_zone"
]

for col in CATEGORICAL_FEATURES:
    stats[col] = stats[col].astype("category")
    
stats[FEATURES].head()


Unnamed: 0,balls,strikes,outs_when_up,inning,inning_topbot,p_throws,stand,prev_pitch_type,prev_zone,prev2_pitch_type,prev2_zone
1,0,1,0,1,Top,R,L,CU,8,CH,8
4,0,1,1,1,Top,R,R,FF,1,SI,13
5,0,2,1,1,Top,R,R,SI,11,FF,1
6,1,2,1,1,Top,R,R,SI,6,SI,11
7,1,2,1,1,Top,R,R,SI,4,SI,6


In [7]:
min_count = 500

valid_targets = (
    stats["target"]
    .value_counts()
    .loc[lambda x: x >= min_count]
    .index
)

stats = stats[stats["target"].isin(valid_targets)].copy()

In [12]:
train = stats[stats["game_year"] <= 2022].copy()
valid = stats[stats["game_year"] > 2022].copy()




In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

train["y"] = le.fit_transform(train["target"])
valid["y"] = le.transform(valid["target"])

num_classes = len(le.classes_)


In [15]:

%pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 21.3 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [16]:
import lightgbm as lgb

lgb_train = lgb.Dataset(
    train[FEATURES],
    train["y"],
    categorical_feature=CATEGORICAL_FEATURES
)

lgb_valid = lgb.Dataset(
    valid[FEATURES],
    valid["y"],
    categorical_feature=CATEGORICAL_FEATURES,
    reference=lgb_train
)


In [18]:
params = {
    "objective": "multiclass",
    "num_class": num_classes,
    "metric": "multi_logloss",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "min_data_in_leaf": 100,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbosity": -1
}

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1200,
    valid_sets=[lgb_valid],
)
