In [3]:
%pip install pybaseball




In [None]:
import os
import time
import pandas as pd
from pybaseball import statcast
import pybaseball

pybaseball.cache.enable()

# Folders
MONTHLY_DIR = "statcast_monthly"
YEARLY_DIR = "statcast_yearly"
os.makedirs(MONTHLY_DIR, exist_ok=True)
os.makedirs(YEARLY_DIR, exist_ok=True)

# Columns you actually need
USE_COLS = [
    "pitch_type", "pitch_name",
    "batter", "pitcher",
    "stand", "p_throws",
    "balls", "strikes",
    "outs_when_up", "inning", "inning_topbot",
    "game_pk", "game_date",
    "at_bat_number", "pitch_number",
    "release_speed",
    "pfx_x", "pfx_z","zone",
    "plate_x", "plate_z",
    "home_score", "away_score",
    "on_1b", "on_2b", "on_3b"
]

def safe_statcast(start_dt, end_dt, retries=5, wait=5):
    """Statcast with retry logic."""
    for attempt in range(1, retries + 1):
        try:
            df = statcast(start_dt=start_dt, end_dt=end_dt)
            if df is not None and not df.empty:
                return df
        except Exception as e:
            print(f"Attempt {attempt} failed for {start_dt} → {end_dt}: {e}")

        print(f"Retrying in {wait} seconds...")
        time.sleep(wait)

    print(f"FAILED permanently: {start_dt} → {end_dt}")
    return None


# Download monthly files
log = []
for year in range(2015, 2025):
    for month in range(4, 12):  # April–November
        start = f"{year}-{month:02d}-01"
        end = f"{year}-{month:02d}-28"
        filename = f"{MONTHLY_DIR}/statcast_{year}_{month:02d}.csv"

        if os.path.exists(filename):
            continue

        print(f"Pulling {start} → {end}...")
        df_month = safe_statcast(start, end)

        if df_month is not None and not df_month.empty:
            df_month = df_month[USE_COLS]  # keep only needed columns
            df_month.to_csv(filename, index=False)
            log.append((year, month, "SUCCESS", df_month.shape[0]))
        else:
            log.append((year, month, "FAILED", 0))


# Combine into yearly files
import glob

monthly_files = glob.glob(f"{MONTHLY_DIR}/*.csv")
files_by_year = {}

for f in monthly_files:
    base = os.path.basename(f)
    year = int(base.split("_")[1])
    files_by_year.setdefault(year, []).append(f)

for year, files in sorted(files_by_year.items()):
    print(f"Combining year {year} ({len(files)} months)...")
    df_year = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
    out_path = f"{YEARLY_DIR}/statcast_{year}.csv"
    df_year.to_csv(out_path, index=False)


# Save log
log_df = pd.DataFrame(log, columns=["year", "month", "status", "rows"])
log_df.to_csv("statcast_download_log.csv", index=False)


In [2]:
import glob
import os
import time
import pandas as pd
from pybaseball import statcast
import pybaseball
df = []
files = glob.glob("statcast_yearly/*.csv")
df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

print(df.shape)

(6233268, 26)


In [3]:
#['SL' 'FF' 'SI' 'CH' 'FC' 'CU' 'FS' 'KC' 'EP' 'ST' 'SV' 'KN' 'FO' 'FA' 'SC' 'CS']
#['Slider' '4-Seam Fastball' 'Sinker' 'Changeup' 'Cutter' 'Curveball''Split-Finger'
# 'Knuckle Curve' 'Eephus''Sweeper' 'Slurve' 'Knuckleball' 'Forkball' 'Other' 'Screwball' 'Slow Curve']

df = df.dropna(subset=["pitch_type", "zone"])
valid_pitches = ['SL','FF','SI','CH','FC','CU','FS','KC','EP','ST','SV','KN','FO','FA','SC','CS']
df = df[df["pitch_type"].isin(valid_pitches)]


In [4]:
from pybaseball import batting_stats
batter_stats = []
for year in range(2015, 2025):
    print(f"Pulling {year} data...")
    df_year = batting_stats(year)
    batter_stats.append(df_year)
bs = pd.concat(batter_stats,ignore_index= True)


Pulling 2015 data...
Pulling 2016 data...
Pulling 2017 data...
Pulling 2018 data...
Pulling 2019 data...
Pulling 2020 data...
Pulling 2021 data...
Pulling 2022 data...
Pulling 2023 data...
Pulling 2024 data...


In [27]:
bs

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,11579,2015,Bryce Harper,WSN,22,153,521,654,172,91,...,116.0,188,0.477,394,0.118,0.226,0.293,0.543,0.418,9.3
1,10155,2015,Mike Trout,LAA,23,159,575,682,172,93,...,117.7,205,0.486,422,0.207,0.282,0.297,0.588,0.422,9.3
2,5038,2015,Josh Donaldson,TOR,29,158,620,711,184,100,...,113.6,233,0.467,499,0.145,0.255,0.279,0.542,0.388,8.7
3,4314,2015,Joey Votto,CIN,31,158,545,695,171,107,...,109.3,175,0.425,412,0.177,0.254,0.289,0.542,0.423,7.3
4,9218,2015,Paul Goldschmidt,ARI,27,159,567,695,182,109,...,114.0,197,0.465,424,0.181,0.266,0.279,0.558,0.403,7.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1370,17901,2024,Andrew Benintendi,CHW,29,135,477,522,109,69,...,109.5,132,0.344,384,0.159,0.245,0.243,0.389,0.303,0.0
1371,19287,2024,Adolis Garcia,TEX,31,154,580,637,130,78,...,116.1,196,0.479,409,0.127,0.294,0.227,0.413,0.306,0.1
1372,17982,2024,Ty France,- - -,29,140,479,535,112,75,...,111.1,148,0.400,370,0.128,0.237,0.228,0.387,0.303,-0.9
1373,21897,2024,Christopher Morel,- - -,25,152,535,611,105,70,...,111.4,154,0.404,381,0.149,0.298,0.225,0.399,0.316,-1.1


In [5]:
from pybaseball import playerid_reverse_lookup  


batter_ids = df['batter'].dropna().unique()
id_map = playerid_reverse_lookup(batter_ids, key_type='mlbam')
bstats = bs.merge(
    id_map[['key_mlbam', 'key_fangraphs']],
    left_on='IDfg',
    right_on='key_fangraphs',
    how='inner'
)
bstats = bstats[['key_mlbam', 'AVG', 'OBP', 'SLG', 'OPS', 'ISO', 'BB%', 'K%']]
bstats = bstats.rename(columns = {'key_mlbam' : 'batter'})
bstats['batter'] = bstats['batter'].astype('int32', errors='ignore')
df['batter'] = df['batter'].astype('int32', errors='ignore')

# 3. Remove duplicate MLBAM IDs
bstats = bstats.drop_duplicates(subset='batter')

# 4. Set index to MLBAM ID
bstats = bstats.set_index('batter')

# 5. Now mapping works
df['batter_avg'] = df['batter'].map(bstats['AVG'])
df['batter_obp'] = df['batter'].map(bstats['OBP'])
df['batter_slg'] = df['batter'].map(bstats['SLG'])
df['batter_ops'] = df['batter'].map(bstats['OPS'])
df['batter_iso'] = df['batter'].map(bstats['ISO'])
df['batter_bb_rate'] = df['batter'].map(bstats['BB%'])
df['batter_k_rate'] = df['batter'].map(bstats['K%'])



In [None]:
for col in ['batter_avg','batter_obp','batter_slg','batter_ops','batter_iso','batter_bb_rate','batter_k_rate']:
    df[col] = df[col].fillna(df[col].mean())



In [8]:
df = df.sort_values(
    ["pitcher", "game_pk", "at_bat_number", "pitch_number"]
).reset_index(drop=True)
window = 5

for i in range(1, window + 1):
    df[f"prev_pitch_type_{i}"] = df.groupby("pitcher")["pitch_type"].shift(i)
    df[f"prev_speed_{i}"] = df.groupby("pitcher")["release_speed"].shift(i)
    df[f"prev_px_{i}"] = df.groupby("pitcher")["plate_x"].shift(i)
    df[f"prev_pz_{i}"] = df.groupby("pitcher")["plate_z"].shift(i)
    
df = df.dropna(subset=[f"prev_pitch_type_{window}"])


In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


for col in ["pitch_type", "stand", "p_throws", "inning_topbot"]:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

num_cols = [
    "release_speed", "pfx_x", "pfx_z", "plate_x", "plate_z",
    "batter_avg", "batter_obp", "batter_slg", "batter_ops",
    "batter_iso", "batter_bb_rate", "batter_k_rate"
]

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [10]:
import numpy as np

sequence_cols = [
    f"prev_pitch_type_{i}" for i in range(1, window+1)
] + [
    f"prev_speed_{i}" for i in range(1, window+1)
] + [
    f"prev_px_{i}" for i in range(1, window+1)
] + [
    f"prev_pz_{i}" for i in range(1, window+1)
]

X = df[sequence_cols].values.reshape(-1, window, len(sequence_cols) // window)
y = df["pitch_type"].values

In [11]:
df = df.sort_values(
    ["pitcher", "game_pk", "at_bat_number", "pitch_number"]
).reset_index(drop=True)

# Score differential (home - away)
df["score_diff"] = df["home_score"] - df["away_score"]


In [12]:
from sklearn.preprocessing import LabelEncoder

# Make a copy to avoid modifying encoders later if you want
df = df.copy()

# Encode pitch_type (target + sequence)
pitch_le = LabelEncoder()
df["pitch_type"] = pitch_le.fit_transform(df["pitch_type"].astype(str))

# Encode stand, p_throws, inning_topbot for context
stand_le = LabelEncoder()
throws_le = LabelEncoder()
topbot_le = LabelEncoder()

df["stand"] = stand_le.fit_transform(df["stand"].astype(str))
df["p_throws"] = throws_le.fit_transform(df["p_throws"].astype(str))
df["inning_topbot"] = topbot_le.fit_transform(df["inning_topbot"].astype(str))

In [13]:
N = 10  # sequence length

seq_cols = [
    "pitch_type",
    "release_speed",
    "pfx_x",
    "pfx_z",
    "plate_x",
    "plate_z",
    "zone",
]

# Create shifted features for each of the previous N pitches, grouped by pitcher
for i in range(1, N + 1):
    g = df.groupby("pitcher", group_keys=False)
    for col in seq_cols:
        df[f"{col}_prev_{i}"] = g[col].shift(i)
        
df = df.dropna(subset=[f"pitch_type_prev_{N}"]).reset_index(drop=True)

In [58]:
df["on_1b"] = df["on_1b"].notnull().astype(int)
df["on_2b"] = df["on_2b"].notnull().astype(int)
df["on_3b"] = df["on_3b"].notnull().astype(int)

In [59]:
context_cols = [
    "balls", "strikes", "outs_when_up",
    "inning", "inning_topbot",
    "on_1b", "on_2b", "on_3b",
    "score_diff",
    "stand", "p_throws",
    "batter_avg", "batter_obp", "batter_slg",
    "batter_ops", "batter_iso",
    "batter_bb_rate", "batter_k_rate",
]
df[context_cols] = df[context_cols].fillna(0)

In [60]:
# Collect the names of the shifted sequence columns in order
seq_feature_names = []
for i in range(1, N + 1):
    for col in seq_cols:
        seq_feature_names.append(f"{col}_prev_{i}")

# Convert to numpy and reshape: [num_samples, N, num_seq_features]
X_seq = df[seq_feature_names].values
num_seq_features = len(seq_cols)
X_seq = X_seq.reshape(-1, N, num_seq_features)

print("X_seq shape:", X_seq.shape)  # (num_samples, N, 7)

X_seq shape: (6153750, 10, 7)


In [63]:
X_ctx = df[context_cols].values
print("X_ctx shape:", X_ctx.shape)  # (num_samples, num_context_features)

y = df["pitch_type"].values  # already label-encoded above
print("y shape:", y.shape)   # (num_samples,)

X_ctx shape: (6153750, 18)
y shape: (6153750,)


In [64]:
from sklearn.model_selection import train_test_split

X_seq_train, X_seq_val, X_ctx_train, X_ctx_val, y_train, y_val = train_test_split(
    X_seq, X_ctx, y, test_size=0.2, random_state=42, shuffle=True
)

print("Train:", X_seq_train.shape, X_ctx_train.shape, y_train.shape)
print("Val:  ", X_seq_val.shape, X_ctx_val.shape, y_val.shape)

Train: (4923000, 10, 7) (4923000, 18) (4923000,)
Val:   (1230750, 10, 7) (1230750, 18) (1230750,)


In [18]:
%pip install torch

Collecting torch
  Downloading torch-2.9.1-cp313-cp313-win_amd64.whl.metadata (30 kB)
Downloading torch-2.9.1-cp313-cp313-win_amd64.whl (110.9 MB)
   ---------------------------------------- 0.0/110.9 MB ? eta -:--:--
   - -------------------------------------- 5.0/110.9 MB 26.9 MB/s eta 0:00:04
   ---- ----------------------------------- 11.5/110.9 MB 29.7 MB/s eta 0:00:04
   ------ --------------------------------- 18.4/110.9 MB 30.4 MB/s eta 0:00:04
   --------- ------------------------------ 26.5/110.9 MB 32.7 MB/s eta 0:00:03
   ------------ --------------------------- 35.7/110.9 MB 34.6 MB/s eta 0:00:03
   --------------- ------------------------ 43.5/110.9 MB 35.1 MB/s eta 0:00:02
   ------------------ --------------------- 51.4/110.9 MB 35.4 MB/s eta 0:00:02
   --------------------- ------------------ 59.0/110.9 MB 35.6 MB/s eta 0:00:02
   ------------------------ --------------- 67.1/110.9 MB 36.0 MB/s eta 0:00:02
   --------------------------- ------------ 75.0/110.9 MB 36.3 

In [65]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PitchTransformer(nn.Module):
    def __init__(
        self,
        seq_len=10,
        seq_feat_dim=7,
        context_dim=18,   # depends on your context feature count
        num_pitch_types=10,  # adjust based on your label encoder
        d_model=128,
        n_heads=4,
        num_layers=3,
        ff_dim=256,
        dropout=0.1
    ):
        super().__init__()

        self.seq_len = seq_len
        self.seq_feat_dim = seq_feat_dim

        # Project raw numeric sequence features into d_model
        self.seq_proj = nn.Linear(seq_feat_dim, d_model)

        # Positional encoding (learned)
        self.pos_emb = nn.Parameter(torch.randn(1, seq_len, d_model))

        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Context MLP
        self.context_mlp = nn.Sequential(
            nn.Linear(context_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        # Final classifier
        self.classifier = nn.Sequential(
            nn.Linear(d_model + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_pitch_types)
        )

    def forward(self, X_seq, X_ctx):
        """
        X_seq: [batch, seq_len, seq_feat_dim]
        X_ctx: [batch, context_dim]
        """

        # Project sequence features
        seq_emb = self.seq_proj(X_seq)  # [batch, seq_len, d_model]

        # Add positional encoding
        seq_emb = seq_emb + self.pos_emb

        # Transformer encoder
        enc_out = self.encoder(seq_emb)  # [batch, seq_len, d_model]

        # Mean pool over sequence
        seq_repr = enc_out.mean(dim=1)  # [batch, d_model]

        # Context branch
        ctx_repr = self.context_mlp(X_ctx)  # [batch, 64]

        # Fuse
        fused = torch.cat([seq_repr, ctx_repr], dim=1)

        # Classify
        logits = self.classifier(fused)

        return logits

In [66]:
from torch.utils.data import DataLoader, TensorDataset

# Convert numpy arrays to torch tensors
X_seq_train_t = torch.tensor(X_seq_train, dtype=torch.float32)
X_ctx_train_t = torch.tensor(X_ctx_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)

X_seq_val_t = torch.tensor(X_seq_val, dtype=torch.float32)
X_ctx_val_t = torch.tensor(X_ctx_val, dtype=torch.float32)
y_val_t = torch.tensor(y_val, dtype=torch.long)

train_ds = TensorDataset(X_seq_train_t, X_ctx_train_t, y_train_t)
val_ds = TensorDataset(X_seq_val_t, X_ctx_val_t, y_val_t)

train_loader = DataLoader(train_ds, batch_size=512, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=512, shuffle=False)

In [72]:
model = PitchTransformer(
    seq_len=10,
    seq_feat_dim=7,
    context_dim=X_ctx_train.shape[1],
    num_pitch_types=len(pitch_le.classes_)
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [69]:
X_seq = df[seq_feature_names].values.reshape(-1, N, len(seq_cols))
X_ctx = df[context_cols].values
y = df["pitch_type"].values

In [70]:
for Xs, Xc, yb in train_loader:
    print("Batch Xs max:", Xs.max().item())
    print("Batch Xs min:", Xs.min().item())
    print("Batch Xc max:", Xc.max().item())
    print("Batch Xc min:", Xc.min().item())
    print("Batch y max:", yb.max().item())
    print("Batch y min:", yb.min().item())
    break

Batch Xs max: 15.0
Batch Xs min: -5.76625394821167
Batch Xc max: 17.0
Batch Xc min: -10.0
Batch y max: 15
Batch y min: 0


In [46]:
for epoch in range(10):
    model.train()
    total_loss = 0

    for Xs, Xc, yb in train_loader:
        optimizer.zero_grad()
        logits = model(Xs, Xc)
        loss = criterion(logits, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for Xs, Xc, yb in val_loader:
            logits = model(Xs, Xc)
            loss = criterion(logits, yb)
            val_loss += loss.item()

            preds = logits.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss:.3f} | "
          f"Val Loss: {val_loss:.3f} | Val Acc: {correct/total:.3f}")

Epoch 1 | Train Loss: nan | Val Loss: nan | Val Acc: 0.038


KeyboardInterrupt: 

In [None]:
def predict_next_pitch(model, seq_window, ctx_vector):
    """
    seq_window: numpy array shape [N, 7]
    ctx_vector: numpy array shape [context_dim]
    """

    model.eval()

    Xs = torch.tensor(seq_window, dtype=torch.float32).unsqueeze(0)
    Xc = torch.tensor(ctx_vector, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        logits = model(Xs, Xc)
        probs = F.softmax(logits, dim=1).cpu().numpy()[0]

    pred_idx = probs.argmax()
    pred_pitch = pitch_le.inverse_transform([pred_idx])[0]

    return pred_pitch, probs