In [4]:
import pandas as pd
import numpy as np

# --- 1) load ---
# Raw table read from disk; the later steps in this cell derive from `df`.
csv_path = "mlb_game_data_2025v2.csv"   # adjust if needed
df = pd.read_csv(csv_path)

# --- 2) columns to drop (identifiers / high-cardinality categoricals) ---
drop_exact = {"Date", "Home Team", "Away Team", "Home SP", "Away SP"}
# Restrict the drop list to names that actually occur in the file, in file order.
drop_cols = df.columns[df.columns.isin(drop_exact)].tolist()
# drop() returns a new frame, so `df` keeps the metadata columns.
df_wo_meta = df.drop(columns=drop_cols, errors="ignore")

# --- 3) target y ---
# The label column must be present under exactly this name; edit y_col if your file differs.
y_col = "Home Team Won"
target_present = y_col in df_wo_meta.columns
if not target_present:
    raise KeyError(f"Expected target column '{y_col}' not found.")
# Cast the label to integers ({0,1} for a boolean column); a {-1,+1} mapping can be applied later.
y = df_wo_meta.loc[:, y_col].astype(int)
# Remove the label from the working frame so it cannot be picked up as a feature below.
df_wo_meta = df_wo_meta.drop(y_col, axis="columns")

# --- 4) detect Home/Away pairs and build (home - away) ---
home_prefix = "Home "
away_prefix = "Away "

# Map each stat name (column name with its prefix removed) to the full column name.
home_cols = {}
away_cols = {}
for col_name in df_wo_meta.columns:
    if col_name.startswith(home_prefix):
        home_cols[col_name[len(home_prefix):]] = col_name
    if col_name.startswith(away_prefix):
        away_cols[col_name[len(away_prefix):]] = col_name

# Only stats present on both sides can be differenced; sorted for a stable column order.
paired_keys = sorted(home_cols.keys() & away_cols.keys())

# helper to ensure numeric
def _to_numeric(series):
    """Convert ``series`` to a numeric dtype; entries that cannot be parsed become NaN."""
    coerced = pd.to_numeric(series, errors="coerce")
    return coerced

# One "<stat> (Home-Away)" column per paired stat. Both sides are coerced to
# numeric first, so an unparseable value on either side yields NaN in the difference.
diff_data = {
    f"{key} (Home-Away)": (
        _to_numeric(df_wo_meta[home_cols[key]]) - _to_numeric(df_wo_meta[away_cols[key]])
    )
    for key in paired_keys
}

# Reuse the source frame's index so the rows line up with the other feature frames.
X_diff = pd.DataFrame(diff_data, index=df_wo_meta.index)

# --- 5) keep neutral numeric features that are not Home/Away and not dropped ---
neutral_cols = [
    name for name in df_wo_meta.columns
    if not name.startswith((home_prefix, away_prefix))
]
# Coerce column by column; anything that is not a number turns into NaN.
neutral_numeric = df_wo_meta[neutral_cols].apply(pd.to_numeric, errors="coerce")
# Discard columns that hold no numeric value at all after coercion.
has_numeric = neutral_numeric.notna().any(axis=0)
neutral_numeric = neutral_numeric.loc[:, has_numeric]

# --- 6) combine features and reorder so SP ERA (Home-Away) is last ---
X = pd.concat([X_diff, neutral_numeric], axis="columns")

# When the SP ERA difference is present, rebuild the column order with it at the end.
sp_era_col = "SP ERA (Home-Away)"
if sp_era_col in X.columns:
    cols = [name for name in X.columns if name != sp_era_col] + [sp_era_col]
    X = X.loc[:, cols]

# --- 7) clean up NaNs (optional: simple impute with median) ---
# Treat +/-inf as missing so they are imputed like any other gap.
X = X.replace([np.inf, -np.inf], np.nan)
# A column with no usable value has a NaN median, so fillna() below would leave it
# entirely NaN and it would end up in the saved feature file. Drop such columns
# (and report them) before imputing. The len() guard keeps a zero-row frame intact,
# since every column of an empty frame is vacuously "all NaN".
all_missing = X.columns[X.isna().all(axis=0)].tolist() if len(X) else []
if all_missing:
    print(f"Dropping {len(all_missing)} column(s) with no usable numeric values: {all_missing}")
    X = X.drop(columns=all_missing)
# Fill the remaining gaps with each column's median.
X = X.fillna(X.median(numeric_only=True))

# --- 8) final dataframe for VQC step 0 ---
# assign() returns a new frame, so X is not modified; the label is attached by
# position through its raw values rather than by index alignment.
df_vqc = X.assign(y=y.values)  # {0,1}; later we can add y_pm1 = 2*y - 1 for Z-readout training

# (optional) Save for reproducibility
out_path = "mlb_vqc_features.csv"
df_vqc.to_csv(out_path, index=False)

# Short report, then the first rows as the cell's displayed value.
print(
    f"Built features: {X.shape[1]} columns (diff + neutral).",
    f"Rows: {len(df_vqc)}. Saved to: {out_path}",
    sep="\n",
)
df_vqc.head()


Built features: 9 columns (diff + neutral).
Rows: 2430. Saved to: mlb_vqc_features.csv


Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away),y
0,-4.0,0.0,-13.0,-0.201,-0.132,0.0,-0.03,1.23,-1.8,0
1,1.0,-3.0,7.0,-0.089,-0.219,-3.0,-0.02,0.55,8.25,0
2,0.0,1.0,-3.0,0.035,0.15,3.0,0.05,-0.16,-1.91,1
3,-10.0,-5.0,-8.0,-0.171,-0.596,5.0,0.04,1.11,7.8,0
4,1.0,-2.0,5.0,0.007,-0.122,6.0,-0.01,0.0,-0.6,0


In [5]:
# Show the assembled feature table (feature columns plus the "y" target) built in the previous cell.
df_vqc

Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away),y
0,-4.0,0.0,-13.0,-0.201,-0.132,0.0,-0.03,1.23,-1.80,0
1,1.0,-3.0,7.0,-0.089,-0.219,-3.0,-0.02,0.55,8.25,0
2,0.0,1.0,-3.0,0.035,0.150,3.0,0.05,-0.16,-1.91,1
3,-10.0,-5.0,-8.0,-0.171,-0.596,5.0,0.04,1.11,7.80,0
4,1.0,-2.0,5.0,0.007,-0.122,6.0,-0.01,0.00,-0.60,0
...,...,...,...,...,...,...,...,...,...,...
2425,4.0,1.0,0.0,-0.006,-0.008,-1.0,0.02,0.08,1.56,1
2426,4.0,0.0,-5.0,-0.012,-0.034,5.0,-0.12,-0.02,0.00,1
2427,2.0,-1.0,15.0,0.017,0.012,-2.0,0.03,0.01,-8.10,1
2428,0.0,1.0,-2.0,0.015,0.049,4.0,0.00,0.09,-2.01,1
