# Constrained PA-Level dataset curation

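We add constraints that keep the pitcher from being pulled too early, such as a minimum number of pitches thrown, a minimum number of innings played, and/or a minimum number of times through the order (TTO). Only plate appearances (PAs) that satisfy these constraints are kept in the curated dataset.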

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

DATA_DIR = Path("../data")
PROC_DIR = DATA_DIR / "processed"

# Must match the tag used when the PA table and RL tensors were written.
YEAR_TAG = "2022_2023"

# Eligibility thresholds: a PA is kept only if it meets ALL of them.
MIN_INNING = 5
MIN_PITCH_COUNT = 75
MIN_TTO = 2

# 1. Load PA table and RL tensors
pa = pd.read_parquet(PROC_DIR / f"pa_decisions_{YEAR_TAG}.parquet")
rl = np.load(PROC_DIR / f"rl_tensors_{YEAR_TAG}.npz")

# One entry per RL sample; used below as positional row numbers into `pa`.
# NOTE(review): assumes these are 0-based row positions (not index labels) — confirm
# against the code that wrote the tensors.
pa_index = rl["pa_index"]  # shape [B]
B = pa_index.shape[0]

# Negative values would silently wrap around under `.iloc` and select the wrong
# PA rows, so reject anything outside [0, len(pa)) up front.
if B and (pa_index.min() < 0 or pa_index.max() >= len(pa)):
    raise ValueError(
        f"pa_index values must lie in [0, {len(pa)}); "
        f"got range [{pa_index.min()}, {pa_index.max()}]"
    )

# 2. Define the constraints on PA-level features
mask_pa = (
    (pa["inning"] >= MIN_INNING)
    & (pa["pitch_count"] >= MIN_PITCH_COUNT)
    & (pa["tto"] >= MIN_TTO)
)

# 3. Map that mask to RL samples via pa_index.
# Force a plain boolean array: with nullable dtypes the comparison can yield <NA>,
# which is not a valid boolean indexer. Missing values count as "constraint not met".
mask_samples = mask_pa.iloc[pa_index].to_numpy(dtype=bool, na_value=False)

# 4. Build a new dict with filtered tensors
constrained = {}
for key in rl.files:
    arr = rl[key]
    # Only subset along axis 0 if this array is per-sample (leading dimension B).
    # The ndim check matters: 0-d arrays (scalar constants/metadata) have an empty
    # shape tuple, so `arr.shape[0]` alone would raise IndexError on them.
    if isinstance(arr, np.ndarray) and arr.ndim >= 1 and arr.shape[0] == B:
        constrained[key] = arr[mask_samples]
    else:
        constrained[key] = arr  # keep as is (e.g., constants, metadata)

out_path = PROC_DIR / f"rl_tensors_{YEAR_TAG}_constrained.npz"
np.savez_compressed(out_path, **constrained)
print("Saved constrained tensors to:", out_path)
print("Constrained B:", constrained["state_vec"].shape[0])

Saved constrained tensors to: ../data/processed/rl_tensors_2022_2023_constrained.npz
Constrained B: 29673
