In [5]:
import pandas as pd
import numpy as np

# --- 1) load ---
# Read the raw game table; the path is relative to the notebook's working directory.
csv_path = "mlb_game_data_2025v2_with_rolling40_filtered.csv"   # adjust if needed
df = pd.read_csv(csv_path)

# --- 2) columns to drop (identifiers / high-cardinality categoricals) ---
# Only names that are actually present in the file end up in drop_cols.
drop_exact = {"Date", "Home Team", "Away Team", "Home SP", "Away SP"}
drop_cols = df.columns[df.columns.isin(drop_exact)].tolist()
df_wo_meta = df.drop(columns=drop_cols, errors="ignore")

# --- 3) target y ---
# The target is looked up by this exact column name; edit it if the file differs.
y_col = "Home Team Won"
if y_col in df_wo_meta.columns:
    # Cast to int so the label is stored as integers (expected {0,1});
    # a {-1,+1} encoding can be derived later if needed.
    y = df_wo_meta[y_col].astype(int)
    # Remove the target so it cannot be picked up as a feature below.
    df_wo_meta = df_wo_meta.drop(columns=y_col)
else:
    raise KeyError(f"Expected target column '{y_col}' not found.")

# --- 4) detect Home/Away pairs and build (home - away) ---
home_prefix, away_prefix = "Home ", "Away "

# Map "<stat>" -> full column name for each side, keyed by the text after the prefix.
home_cols = {
    name.partition(home_prefix)[2]: name
    for name in df_wo_meta.columns
    if name.startswith(home_prefix)
}
away_cols = {
    name.partition(away_prefix)[2]: name
    for name in df_wo_meta.columns
    if name.startswith(away_prefix)
}

# A stat is "paired" only when both a Home and an Away column exist for it;
# sorting makes the resulting feature order deterministic.
paired_keys = sorted(home_cols.keys() & away_cols.keys())

# Numeric coercion helper: entries that cannot be parsed become NaN.
def _to_numeric(series):
    """Return ``series`` converted via ``pd.to_numeric`` with ``errors="coerce"``."""
    coerced = pd.to_numeric(series, errors="coerce")
    return coerced

# Build one "<stat> (Home-Away)" feature per paired statistic.
# Both sides are coerced to numeric first, so unparseable entries become NaN.
diff_data = {}
for key in paired_keys:
    h_col = home_cols[key]
    a_col = away_cols[key]
    # compute (Home - Away) for numeric columns
    h = _to_numeric(df_wo_meta[h_col])
    a = _to_numeric(df_wo_meta[a_col])
    diff = h - a
    # Skip pairs with no usable numeric difference (e.g. text-valued columns).
    # An all-NaN column has a NaN median, so the later median imputation could
    # not fill it and the NaNs would end up in the saved feature matrix. This
    # matches the all-NaN filter already applied to the neutral columns.
    if not diff.notna().any():
        continue
    diff_data[f"{key} (Home-Away)"] = diff

# Passing the index explicitly keeps the row index even when no pair survives.
X_diff = pd.DataFrame(diff_data, index=df_wo_meta.index)

# --- 5) keep neutral numeric features that are not Home/Away and not dropped ---
# "Neutral" means the column name carries neither side prefix.
neutral_cols = [
    name for name in df_wo_meta.columns
    if not name.startswith((home_prefix, away_prefix))
]
# Coerce every neutral column to numeric (unparseable -> NaN), then discard
# columns that contain no numeric value at all.
neutral_numeric = (
    df_wo_meta.loc[:, neutral_cols]
    .apply(pd.to_numeric, errors="coerce")
    .dropna(axis="columns", how="all")
)

# --- 6) combine features and reorder so SP ERA (Home-Away) is last ---
# Difference features come first, neutral features after them.
X = pd.concat([X_diff, neutral_numeric], axis="columns")

# Move "SP ERA (Home-Away)" to the end if it exists
sp_era_col = "SP ERA (Home-Away)"
if sp_era_col in X.columns:
    cols = [*(name for name in X.columns if name != sp_era_col), sp_era_col]
    X = X.loc[:, cols]

# --- 7) clean up NaNs (optional: simple impute with median) ---
# Infinities are turned into NaN first so they are imputed like any other gap.
X = X.replace([np.inf, -np.inf], np.nan)
# Each column's remaining NaNs are filled with that column's median.
X = X.fillna(value=X.median(numeric_only=True))

# --- 8) final dataframe for VQC step 0 ---
# Copy of the feature matrix with the label appended as the last column "y".
# Labels stay {0,1} here; y_pm1 = 2*y - 1 can be added later for Z-readout training.
df_vqc = X.assign(y=y.values)

# (optional) Save for reproducibility
out_path = "mlb_vqc_features.csv"
df_vqc.to_csv(out_path, index=False)

# Short summary of what was built and where it was written.
print(
    f"Built features: {X.shape[1]} columns (diff + neutral).",
    f"Rows: {len(df_vqc)}. Saved to: {out_path}",
    sep="\n",
)
df_vqc


Built features: 9 columns (diff + neutral).
Rows: 2084. Saved to: mlb_vqc_features.csv


Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away),y
0,1.033333,0.200000,0.066667,0.032683,0.052300,1.600000,0.006667,-0.209833,-1.522333,1
1,-1.500000,-1.100000,-3.200000,0.023800,0.054200,3.100000,-0.007000,-0.446000,-1.853000,1
2,1.748252,0.412587,0.706294,0.031210,0.052874,1.846154,0.014965,-0.215804,-2.147972,0
3,-1.272727,-0.818182,-2.545455,0.022273,0.048818,3.090909,-0.006364,-0.435455,-1.702727,0
4,1.700000,1.400000,-0.400000,0.044400,0.152200,5.200000,0.012000,-0.205000,-1.978000,1
...,...,...,...,...,...,...,...,...,...,...
2079,-0.125000,-0.300000,-0.750000,-0.009750,-0.027350,-0.175000,0.013750,0.008750,-0.396500,1
2080,1.025000,0.400000,1.200000,0.010750,0.002850,1.800000,-0.005750,0.005500,-1.444750,1
2081,-1.975000,-0.075000,-2.150000,-0.017175,0.002075,0.600000,0.015000,0.232250,1.386500,0
2082,2.000000,0.575000,0.500000,0.016975,0.021550,2.125000,0.005750,-0.038000,-1.851000,1
