In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the engineered feature table; the target lives in column "y".
df_vqc = pd.read_csv("mlb_vqc_features.csv")

n = 9  # <-- number of leading feature columns to keep (one qubit per feature)

# Feature frame: everything except the label, cut down to the first n columns.
X_df = df_vqc.drop(columns=["y"]).iloc[:, :n]
print("New shape:", X_df.shape)

# Binary labels, plus the {-1, +1} encoding compared against the ⟨Z⟩ output later on.
y01 = df_vqc["y"].astype(int).to_numpy()
y_pm1 = 2 * y01 - 1

# Standardize on the full table (no train/validation split in this notebook).
_train_matrix = X_df.values
scaler = StandardScaler().fit(_train_matrix)
Z = scaler.transform(_train_matrix)

# Squash z-scores into rotation angles: φ = π · tanh(z); tanh output is cast to float32.
phi = np.pi * np.tanh(Z).astype(np.float32)

# Names consumed by the Qiskit cells that follow.
X_angles = phi
y_labels = y_pm1

n_qubits = X_angles.shape[1]
print(f"✅ Data prepared for VQC: {len(X_angles)} samples, {n_qubits} features/qubits.")


New shape: (2430, 9)
✅ Data prepared for VQC: 2430 samples, 9 features/qubits.


In [7]:
# Display the selected feature frame (pandas elides the middle rows in the rendered output).
X_df

Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away)
0,-4.0,0.0,-13.0,-0.201,-0.132,0.0,-0.03,1.23,-1.80
1,1.0,-3.0,7.0,-0.089,-0.219,-3.0,-0.02,0.55,8.25
2,0.0,1.0,-3.0,0.035,0.150,3.0,0.05,-0.16,-1.91
3,-10.0,-5.0,-8.0,-0.171,-0.596,5.0,0.04,1.11,7.80
4,1.0,-2.0,5.0,0.007,-0.122,6.0,-0.01,0.00,-0.60
...,...,...,...,...,...,...,...,...,...
2425,4.0,1.0,0.0,-0.006,-0.008,-1.0,0.02,0.08,1.56
2426,4.0,0.0,-5.0,-0.012,-0.034,5.0,-0.12,-0.02,0.00
2427,2.0,-1.0,15.0,0.017,0.012,-2.0,0.03,0.01,-8.10
2428,0.0,1.0,-2.0,0.015,0.049,4.0,0.00,0.09,-2.01


In [9]:
from qiskit.circuit.library import ZZFeatureMap
from qiskit.circuit import ParameterVector

# Data-encoding circuit: a ZZFeatureMap sized to n_qubits features,
# with two repetitions and linear entanglement.
feature_map = ZZFeatureMap(feature_dimension=n_qubits, reps=2, entanglement='linear')

def bind_feature_map(sample_angles):
    """Bind one sample's angles to the parameters of the module-level ``feature_map``.

    Args:
        sample_angles: Sized sequence of angles, one per feature-map
            parameter, given in the order of ``feature_map.parameters``.

    Returns:
        Whatever ``feature_map.assign_parameters(..., inplace=False)`` returns,
        i.e. a bound circuit; ``inplace=False`` is passed so the shared
        ``feature_map`` is not modified.

    Raises:
        ValueError: If the number of angles differs from the number of
            parameters in ``feature_map``.
    """
    # The ZZFeatureMap creates its own parameters, so read them back from the circuit.
    params_in_circ = list(feature_map.parameters)

    # Size the binding from the circuit itself rather than from a separate global,
    # and refuse mismatched input instead of silently ignoring extra angles.
    if len(sample_angles) != len(params_in_circ):
        raise ValueError(
            f"Expected {len(params_in_circ)} angles, got {len(sample_angles)}."
        )

    # Pair parameters and angles by position.
    binding_by_position = {
        param: float(angle) for param, angle in zip(params_in_circ, sample_angles)
    }

    return feature_map.assign_parameters(binding_by_position, inplace=False)

# Smoke test: bind the first sample's angles; the bound circuit is discarded.
_ = bind_feature_map(X_angles[0])


In [10]:
from qiskit.circuit.library import RealAmplitudes

In [11]:
# Trainable part of the model: a RealAmplitudes ansatz over the same n_qubits as the feature map.
# choose a small number of repetitions first; you can increase later if underfitting
ansatz_reps = 2
ansatz = RealAmplitudes(
    num_qubits=n_qubits,
    reps=ansatz_reps,
    entanglement="linear",   # 'linear' is a safe default; 'full' is more expressive but heavier
    insert_barriers=False
)

# peek at how many trainable parameters we have
# (the recorded run, with 9 qubits and reps=2, reports 27)
theta_params = list(ansatz.parameters)   # ordered list of Parameter objects
n_thetas = len(theta_params)
print(f"Ansatz reps={ansatz_reps} → trainable parameters = {n_thetas}")

Ansatz reps=2 → trainable parameters = 27


In [12]:
from qiskit import QuantumCircuit
from qiskit.quantum_info import SparsePauliOp

# combine encoding + trainable layers: feature map first, then the ansatz,
# both appended in place to one n_qubits-wide circuit
vqc = QuantumCircuit(n_qubits)
vqc.compose(feature_map, inplace=True)
vqc.compose(ansatz, inplace=True)

# define observable: a single Pauli-Z, identity everywhere else
# this gives an output in [-1, +1]
# NOTE(review): Qiskit Pauli labels are little-endian (the rightmost character acts
# on qubit 0), so "I...IZ" measures Z on qubit 0 — the FIRST qubit, not the last.
# If the highest-index qubit was intended the label would be "Z" + "I" * (n_qubits - 1);
# confirm which was meant before changing it, since it alters the model output.
z_string = "I" * (n_qubits - 1) + "Z"
observable = SparsePauliOp.from_list([(z_string, 1.0)])

print("✅ VQC circuit and observable ready")
print(f"Observable: {z_string}")


✅ VQC circuit and observable ready
Observable: IIIIIIIIZ


In [11]:
import numpy as np
from qiskit.primitives import Estimator

# estimator primitive
# NOTE(review): the recorded output of this cell echoes this line, which looks like the
# tail of a warning raised on construction — presumably a deprecation notice; confirm.
estimator = Estimator()

# the list of trainable parameter objects in the ansatz
trainable_params = list(ansatz.parameters)
# number of trainable parameters (length of the list above)
n_params = len(trainable_params)

# --- loss function: mean-squared error between ⟨Z⟩ and labels in {-1,+1} ---
def vqc_loss(theta_values, X_batch, y_batch):
    """Mean-squared error between the estimator's expectation values and the labels.

    Every sample is evaluated on the module-level ``vqc`` circuit against the
    module-level ``observable`` using the module-level ``estimator``.

    Args:
        theta_values: Trainable parameter values, shared by every sample.
        X_batch: Indexable batch of per-sample feature angles.
        y_batch: Targets the predictions are compared with (the notebook
            uses labels in {-1, +1}).

    Returns:
        Tuple ``(loss, preds)``: the scalar MSE and the float array of
        per-sample expectation values.
    """
    shared_thetas = list(theta_values)
    batch_size = len(X_batch)

    # One flat value list per sample: feature angles first, then the shared thetas.
    # NOTE(review): this assumes the estimator binds values in that same order — confirm.
    bound_values = []
    for idx in range(batch_size):
        bound_values.append(list(X_batch[idx]) + shared_thetas)

    job = estimator.run(
        circuits=[vqc] * batch_size,
        parameter_values=bound_values,
        observables=[observable] * batch_size,
    )

    preds = np.array(job.result().values, dtype=float)
    residuals = preds - y_batch
    return np.mean(residuals ** 2), preds


  estimator = Estimator()


In [14]:
import numpy as np
from qiskit.primitives import Estimator
from qiskit_algorithms.optimizers import SPSA

# (re)instantiate estimator to avoid stale sessions
estimator = Estimator()

# ordered parameter lists (assumes we composed: feature_map → ansatz)
# Each lookup prefers an `ordered_parameters` attribute and falls back to `.parameters`.
# NOTE(review): the value lists built below are positional ([features] + [thetas]);
# Qiskit orders a circuit's parameters by name rather than by composition order, so
# this presumably works because the feature-map names sort before the ansatz names — verify.
feat_params = getattr(feature_map, "ordered_parameters", list(feature_map.parameters))
theta_params = getattr(ansatz, "ordered_parameters", list(ansatz.parameters))
# Total parameter count of the combined circuit (not referenced again in the visible cells).
vqc_param_count = len(feat_params) + len(theta_params)
# Guard: one feature-map parameter per column of X_angles.
assert len(feat_params) == X_angles.shape[1], "Feature-map param count must match n_features."

# objective for optimizer: full-batch MSE on training set
def objective(theta_vec):
    """Full-batch MSE of the circuit's expectation values against ``y_labels``.

    Evaluates every row of the module-level ``X_angles`` on ``vqc`` with the
    candidate parameters ``theta_vec`` and returns the mean squared difference
    to ``y_labels`` — the scalar an optimizer minimizes.
    """
    thetas = list(theta_vec)
    n_rows = len(X_angles)

    # Per-sample value list: that sample's feature angles followed by the shared thetas.
    bound_values = [list(X_angles[row]) + thetas for row in range(n_rows)]

    job = estimator.run(
        circuits=[vqc] * n_rows,
        parameter_values=bound_values,
        observables=[observable] * n_rows,
    )
    expectations = np.array(job.result().values, dtype=float)   # ⟨Z⟩ per sample
    squared_error = (expectations - y_labels) ** 2
    return np.mean(squared_error)

# initialize weights
n_params = len(theta_params)
# Draw the starting point from a seeded generator so Restart & Run All begins from
# the same theta0 every time (the original used the unseeded global NumPy state).
init_rng = np.random.default_rng(42)
theta0 = init_rng.uniform(-0.1, 0.1, size=n_params)

# choose optimizer (SPSA is robust for VQCs)
# NOTE(review): SPSA presumably draws its own random perturbations, which this seed
# does not control — seed the optimizer's random source too if exact repeatability matters.
opt = SPSA(maxiter=200)

# train
# Every objective evaluation runs the full training set through the estimator, so this
# call is slow; the recorded run of this cell ended in a KeyboardInterrupt.
result = opt.minimize(fun=objective, x0=theta0)
theta_opt = result.x
print("✅ training done. final loss:", result.fun)

# simple train-set predictions and accuracy
def predict_pm1(theta_vec, X_phi):
    """Return the estimator's expectation value for each row of ``X_phi``.

    Args:
        theta_vec: Trainable parameter values applied to every sample.
        X_phi: Iterable, sized collection of per-sample feature angles.

    Returns:
        Float NumPy array with one expectation value per sample, evaluated
        on the module-level ``vqc`` / ``observable`` pair.
    """
    sample_count = len(X_phi)
    thetas = list(theta_vec)

    # Feature angles first, shared thetas appended — same layout as in training.
    bound_values = []
    for features in X_phi:
        bound_values.append(list(features) + thetas)

    job = estimator.run(
        circuits=[vqc] * sample_count,
        parameter_values=bound_values,
        observables=[observable] * sample_count,
    )
    return np.array(job.result().values, float)

# Expectation values for every training sample under the optimized parameters.
preds_pm1 = predict_pm1(theta_opt, X_angles)
y_hat01 = (preds_pm1 >= 0.0).astype(int)   # threshold at 0 on ⟨Z⟩
# Map the {-1, +1} labels back to {0, 1} before comparing with the thresholded predictions.
train_acc = (y_hat01 == ((y_labels+1)//2)).mean()
print(f"Train accuracy: {train_acc:.3f}")


  estimator = Estimator()


KeyboardInterrupt: 

In [15]:
from qiskit_machine_learning.algorithms.classifiers.vqc import VQC

In [14]:
# --- VQC: quick, high-level classifier ---

from qiskit.circuit.library import ZZFeatureMap, RealAmplitudes
from qiskit_machine_learning.algorithms.classifiers.vqc import VQC
from qiskit_algorithms.optimizers import COBYLA

# backend (statevector is fine for training on angles)
# Best-effort setup: try to build a QuantumInstance on Aer's statevector simulator and
# pass it to VQC; if anything in the attempt fails, pass no backend keyword at all.
try:
    from qiskit.utils import QuantumInstance
    from qiskit_aer import Aer
    qi = QuantumInstance(
        backend=Aer.get_backend("statevector_simulator"),
        shots=None,                 # analytic expectation (noise-free)
        seed_simulator=42,
        seed_transpiler=42,
    )
    quantum_instance_kw = dict(quantum_instance=qi)
except Exception:
    # Any failure above (import, backend lookup, constructor) lands here, and VQC is
    # then built without a `quantum_instance` argument.
    # NOTE(review): the broad `except Exception` also hides unrelated errors — consider
    # narrowing it once the supported library versions are pinned.
    quantum_instance_kw = {}

# feature map & ansatz
# NOTE(review): these rebind the module-level `feature_map` and `ansatz` from earlier
# cells (same feature-map arguments; ansatz again with reps=2 and linear entanglement).
feature_map = ZZFeatureMap(feature_dimension=n_qubits, reps=2, entanglement='linear')
ansatz = RealAmplitudes(num_qubits=n_qubits, reps=2, entanglement='linear')

# optimizer
optimizer = COBYLA(maxiter=300, tol=1e-4, rhobeg=0.2)

# VQC classifier (expects labels in {0,1})
# NOTE(review): this rebinds `vqc`, which an earlier cell bound to the composed
# QuantumCircuit that vqc_loss / objective / predict_pm1 read as a global. Re-running
# those functions after this cell would hand them the classifier object instead.
vqc = VQC(
    feature_map=feature_map,
    ansatz=ansatz,
    optimizer=optimizer,
    **quantum_instance_kw
)

# train
# Fit on the {0, 1} labels (y01), not the {-1, +1} encoding.
vqc.fit(X_angles, y01)

# predict on training set (since we only have train data)
y_pred = vqc.predict(X_angles)
# Fraction of training samples predicted correctly (0.545 in the recorded run).
train_acc = (y_pred == y01).mean()
print(f"Train accuracy (VQC): {train_acc:.3f}")


Train accuracy (VQC): 0.545


In [20]:
# Inspect the encoded training angles (float32 in the recorded output, magnitudes up to about π).
X_angles

array([[-2.0728345 ,  0.0719734 , -2.9517357 , ..., -1.7442071 ,
         3.1415863 , -0.64478374],
       [ 0.77886736, -2.9849193 ,  2.4326193 , ..., -1.3091928 ,
         3.1284027 ,  2.7863467 ],
       [ 0.13834204,  1.7780777 , -1.0613425 , ...,  2.139866  ,
        -2.2279317 , -0.69779605],
       ...,
       [ 1.3571073 , -1.6776913 ,  3.0553286 , ...,  1.368924  ,
         0.20222254, -2.6386895 ],
       [ 0.13834204,  1.7780777 , -0.660356  , ..., -0.2490792 ,
         1.4800605 , -0.74562997],
       [ 0.13834204,  1.7780777 , -1.4267552 , ...,  1.368924  ,
        -2.039233  ,  0.25437236]], dtype=float32)

In [17]:
# Process postseason test data
import pandas as pd
import numpy as np

# Read the raw postseason table.
test_df = pd.read_csv("postseason_test_data.csv")

# Identifier columns are not features; drop whichever of them are present.
drop_cols = ["Series_Game", "Home Team", "Away Team", "Home SP", "Away SP"]
test_df_clean = test_df.drop(columns=drop_cols, errors="ignore")

# Prefixes that mark a column as belonging to one side of the matchup.
home_prefix = "Home "
away_prefix = "Away "

# Map each stat name (prefix stripped) to its prefixed column, one mapping per side.
home_cols = {}
away_cols = {}
for column in test_df_clean.columns:
    if column.startswith(home_prefix):
        home_cols[column[len(home_prefix):]] = column
    if column.startswith(away_prefix):
        away_cols[column[len(away_prefix):]] = column

# Stats reported for both sides, in alphabetical order.
paired_keys = sorted(home_cols.keys() & away_cols.keys())

# Helper to ensure numeric
def _to_numeric(series):
    """Coerce ``series`` to numeric values; entries that cannot be parsed become NaN."""
    coerced = pd.to_numeric(series, errors="coerce")
    return coerced

# Home-minus-away difference for every stat that exists for both sides.
diff_data = {
    f"{key} (Home-Away)": (
        _to_numeric(test_df_clean[home_cols[key]])
        - _to_numeric(test_df_clean[away_cols[key]])
    )
    for key in paired_keys
}

X_test_diff = pd.DataFrame(diff_data, index=test_df_clean.index)

# Columns without a Home/Away prefix: coerce to numeric and keep only those
# that contain at least one non-missing value afterwards.
side_prefixes = (home_prefix, away_prefix)
neutral_cols = [c for c in test_df_clean.columns if not c.startswith(side_prefixes)]
neutral_numeric = test_df_clean[neutral_cols].apply(pd.to_numeric, errors="coerce")
has_any_value = neutral_numeric.notna().any(axis=0)
neutral_numeric = neutral_numeric.loc[:, has_any_value]

# Differences first, neutral features after.
X_test = pd.concat([X_test_diff, neutral_numeric], axis=1)

# Training data keeps "SP ERA (Home-Away)" as its final column, so move it last here too.
sp_era_col = "SP ERA (Home-Away)"
if sp_era_col in X_test.columns:
    cols = [c for c in X_test.columns if c != sp_era_col] + [sp_era_col]
    X_test = X_test[cols]

# Treat infinities as missing, then fill every gap with that column's median.
X_test = X_test.replace([np.inf, -np.inf], np.nan)
column_medians = X_test.median(numeric_only=True)
X_test = X_test.fillna(column_medians)

# Persist the processed features for the evaluation cells.
X_test.to_csv("postseason_test_processed.csv", index=False)

print(f"✅ Processed postseason test data: {X_test.shape[0]} games, {X_test.shape[1]} features")
print(f"Saved to: postseason_test_processed.csv")
X_test.head()


✅ Processed postseason test data: 47 games, 9 features
Saved to: postseason_test_processed.csv


Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away)
0,0.6,0.05,-2.3,0.004225,-0.02165,-0.25,0.004,-0.03525,-4.3
1,0.6,0.05,-2.3,0.004225,-0.02165,-0.25,0.004,-0.03525,1.73
2,-1.8,0.3,-1.25,-0.0038,0.02325,1.8,0.0035,0.0395,-3.14
3,-1.8,0.3,-1.25,-0.0038,0.02325,1.8,0.0035,0.0395,-2.22
4,-1.8,0.3,-1.25,-0.0038,0.02325,1.8,0.0035,0.0395,-3.83


In [18]:
# Postseason outcomes in file order: 1 = home team won, 0 = away team won.
# Entries 2-4 previously read 0, 1, 1, which disagreed with the label array this notebook
# actually scores against (1, 0, 0 at those positions); they are corrected here so the
# two copies of the labels agree.
y01_playoffs = [1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0,0,0,1,0,1,0,0,1,1,1,1,1,0,1]

In [19]:
# Sanity check: one label per postseason game (47 in the recorded run).
len(y01_playoffs)

47

In [31]:
# Load and normalize postseason test data (using training scaler)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load processed postseason test data
test_df_vqc = pd.read_csv("postseason_test_processed.csv")

# Separate X and y (the processed file may or may not carry a "y" column)
X_test_df = test_df_vqc.drop(columns=["y"], errors="ignore")

# If "y" column exists in the CSV, extract it
if "y" in test_df_vqc.columns:
    y_test_01 = test_df_vqc["y"].astype(int).to_numpy()
    y_test_pm1 = 2 * y_test_01 - 1
    print(f"Loaded {len(y_test_01)} test labels")
else:
    # Use the manually provided labels (1 = home team won, 0 = away team won)
    y_test_01 = np.array([1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0,0,0,1,0,1,0,0,1,1,1,1,1,0,1])
    y_test_pm1 = 2 * y_test_01 - 1
    print(f"Using manually provided {len(y_test_01)} test labels")

# Use the same number of features as training, taken from the training frame itself
# so the two cells cannot drift apart.
n = X_df.shape[1]
X_test_df = X_test_df.iloc[:, :n]  # keep first n columns to match training
print(f"Test data shape: {X_test_df.shape}")

# The scaler was fitted on a bare array, so features are matched purely by position.
# Check the names explicitly: a reordered or renamed column would otherwise be scaled
# with another feature's mean/std without any error.
if list(X_test_df.columns) != list(X_df.columns):
    raise ValueError(
        "Test feature columns do not match the training columns: "
        f"{list(X_test_df.columns)} vs {list(X_df.columns)}"
    )

# One label per game, otherwise the accuracy comparison downstream is meaningless.
if len(y_test_01) != len(X_test_df):
    raise ValueError(
        f"Got {len(y_test_01)} test labels for {len(X_test_df)} test rows."
    )

# IMPORTANT: Transform test data using the TRAINING scaler (already fitted on X_df)
# DO NOT fit a new scaler on test data!
Z_test = scaler.transform(X_test_df.values)

# Map to angles: φ = π * tanh(z), the same encoding applied to the training features
phi_test = np.pi * np.tanh(Z_test).astype(np.float32)

# Store for Qiskit predictions
X_test_angles = phi_test
y_test_labels = y_test_pm1

print(f"✅ Test data prepared for VQC: {len(X_test_angles)} samples, {X_test_angles.shape[1]} features/qubits.")
print(f"Test labels range: [{y_test_labels.min()}, {y_test_labels.max()}]")


Using manually provided 47 test labels
Test data shape: (47, 9)
✅ Test data prepared for VQC: 47 samples, 9 features/qubits.
Test labels range: [-1, 1]


In [32]:
# Inspect the test labels in their {-1, +1} encoding.
y_test_labels

array([ 1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1,
        1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
       -1,  1, -1,  1, -1, -1,  1,  1,  1,  1,  1, -1,  1])

In [33]:
# Score the trained classifier on the held-out postseason games.
y_pred_test = vqc.predict(X_test_angles)
# This is TEST accuracy; the variable keeps its original name so that any later
# reference to it continues to work, but the printed label now says what is measured.
train_acc_test = (y_pred_test == y_test_01).mean()
print(f"Test accuracy (VQC): {train_acc_test:.3f}")

Train accuracy (VQC): 0.511


In [34]:
# Recompute the postseason predictions (same call as the scoring cell) and display them.
y_pred_test = vqc.predict(X_test_angles)
y_pred_test

array([0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0])

In [37]:
# Create predictions comparison table
import pandas as pd
import numpy as np

# Re-read the raw postseason file: it still carries the game and team identifiers
# that were dropped from the feature table.
postseason_original = pd.read_csv("postseason_test_data.csv")

# Translate 0/1 outcomes into team names: 1 means the home team, 0 the away team.
actual_winner = []
predicted_winner = []

matchups = zip(postseason_original['Home Team'], postseason_original['Away Team'])
for game_idx, (home_team, away_team) in enumerate(matchups):
    actual_winner.append(home_team if y_test_01[game_idx] == 1 else away_team)
    predicted_winner.append(home_team if y_pred_test[game_idx] == 1 else away_team)

# One row per game: identifier, both winners, and whether the prediction matched.
results_df = pd.DataFrame({
    'Series_Game': postseason_original['Series_Game'],
    'Actual Winner': actual_winner,
    'Predicted Winner': predicted_winner,
    'Correct': (y_pred_test == y_test_01)
})

# Summary statistics derived from the per-game correctness flags.
correct_flags = results_df['Correct']
accuracy = correct_flags.mean()
correct_count = correct_flags.sum()
total_count = len(results_df)

print(f"VQC Postseason Prediction Results")
print(f"=" * 60)
print(f"Total Games: {total_count}")
print(f"Correct Predictions: {correct_count}")
print(f"Wrong Predictions: {total_count - correct_count}")
print(f"Accuracy: {accuracy:.1%}")
print(f"\n" + "=" * 60)

# Display the full results
results_df


VQC Postseason Prediction Results
Total Games: 47
Correct Predictions: 24
Wrong Predictions: 23
Accuracy: 51.1%



Unnamed: 0,Series_Game,Actual Winner,Predicted Winner,Correct
0,WS Game 1,TOR,LAD,False
1,WS Game 2,LAD,LAD,True
2,WS Game 3,LAD,LAD,True
3,WS Game 4,TOR,LAD,False
4,WS Game 5,TOR,LAD,False
5,WS Game 6,LAD,LAD,True
6,WS Game 7,LAD,LAD,True
7,ALCS Game 1,SEA,TOR,False
8,ALCS Game 2,SEA,TOR,False
9,ALCS Game 3,TOR,SEA,False
