In [1]:
import pandas as pd
import time
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler

from qiskit.circuit.library import ZZFeatureMap
from qiskit.circuit.library import PauliFeatureMap
from qiskit.primitives import Sampler

from qiskit_algorithms.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC
from sklearn.metrics import accuracy_score

# Make sure the output directory for the result CSV files exists.
# exist_ok=True keeps this cell safe to re-run when the directory is already there.
os.makedirs("QSVM_Results", exist_ok=True)

In [2]:
# Load the training table from a CSV file
train_df = pd.read_csv("mlb_vqc_features.csv")

# Explore the structure: (rows, columns) first, then the first five rows
print(train_df.shape)
print(train_df.head(5))

(2084, 10)
   hits (Home-Away)  homeruns (Home-Away)  leftonbase (Home-Away)  \
0          1.033333              0.200000                0.066667   
1         -1.500000             -1.100000               -3.200000   
2          1.748252              0.412587                0.706294   
3         -1.272727             -0.818182               -2.545455   
4          1.700000              1.400000               -0.400000   

   obp (Home-Away)  slg (Home-Away)  strikeouts (Home-Away)  \
0         0.032683         0.052300                1.600000   
1         0.023800         0.054200                3.100000   
2         0.031210         0.052874                1.846154   
3         0.022273         0.048818                3.090909   
4         0.044400         0.152200                5.200000   

   strikepercentage (Home-Away)  whip (Home-Away)  SP ERA (Home-Away)  y  
0                      0.006667         -0.209833           -1.522333  1  
1                     -0.007000         -0.44

In [3]:
# Show the training DataFrame as this cell's output.
train_df

Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away),y
0,1.033333,0.200000,0.066667,0.032683,0.052300,1.600000,0.006667,-0.209833,-1.522333,1
1,-1.500000,-1.100000,-3.200000,0.023800,0.054200,3.100000,-0.007000,-0.446000,-1.853000,1
2,1.748252,0.412587,0.706294,0.031210,0.052874,1.846154,0.014965,-0.215804,-2.147972,0
3,-1.272727,-0.818182,-2.545455,0.022273,0.048818,3.090909,-0.006364,-0.435455,-1.702727,0
4,1.700000,1.400000,-0.400000,0.044400,0.152200,5.200000,0.012000,-0.205000,-1.978000,1
...,...,...,...,...,...,...,...,...,...,...
2079,-0.125000,-0.300000,-0.750000,-0.009750,-0.027350,-0.175000,0.013750,0.008750,-0.396500,1
2080,1.025000,0.400000,1.200000,0.010750,0.002850,1.800000,-0.005750,0.005500,-1.444750,1
2081,-1.975000,-0.075000,-2.150000,-0.017175,0.002075,0.600000,0.015000,0.232250,1.386500,0
2082,2.000000,0.575000,0.500000,0.016975,0.021550,2.125000,0.005750,-0.038000,-1.851000,1


In [4]:
# Build the feature matrix: every column except the label 'y', converted to a
# NumPy array and standardized by a freshly fitted StandardScaler.
# The fitted scaler is kept in `scaler`; the scaled matrix is echoed as the cell output.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df.drop(columns=['y']).to_numpy())
X_train

array([[ 1.13473596,  0.58918596,  0.27868057, ...,  0.16172011,
        -1.24030429, -0.70032277],
       [-1.26037875, -2.57781591, -1.73719914, ..., -0.78777983,
        -2.63662356, -0.93775077],
       [ 1.81064843,  1.10708191,  0.67339827, ...,  0.73825427,
        -1.27560661, -1.14954902],
       ...,
       [-1.70946276, -0.08075674, -1.08923781, ...,  0.74068349,
         1.37348319,  1.3883017 ],
       [ 2.0486613 ,  1.5027442 ,  0.54609318, ...,  0.09803414,
        -0.22435146, -0.93631472],
       [-0.24403073,  1.38093643, -0.68811888, ...,  1.90439988,
        -0.47415169, -1.45706469]], shape=(2084, 9))

In [5]:
# Label vector: the 'y' column of the training table as a NumPy array
# (echoed as the cell output).
y_train = train_df.loc[:, 'y'].to_numpy()
y_train

array([1, 1, 0, ..., 0, 1, 1], shape=(2084,))

In [6]:
# Reduced training set for testing and faster execution.

# Keep only the last x rows of the full table and renumber them from 0.
x = 1000
df_sample = train_df.tail(x).reset_index(drop=True)

# Rebuild the training arrays from the subset. Note that the scaler is re-fit
# here, so from this point on it holds the statistics of the subset only.
sample_features = df_sample.drop(columns=['y']).to_numpy()
X_train = scaler.fit_transform(sample_features)
y_train = df_sample['y'].to_numpy()

In [7]:
# Show the sampled training subset as this cell's output.
df_sample

Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away),y
0,-0.050,0.350,0.275,0.009850,0.029150,0.325,-0.00725,0.14300,-0.53100,1
1,-1.550,-0.225,-2.075,-0.000750,0.008500,1.100,0.00625,-0.13425,-1.08900,0
2,-0.025,0.325,0.400,0.009725,0.029075,0.350,-0.00650,0.13800,-0.55375,1
3,-1.225,-0.100,0.400,0.008500,0.017900,2.475,-0.02475,-0.08675,-1.96350,0
4,1.225,-0.025,1.650,0.032525,0.030575,2.100,0.01325,-0.40425,-1.94325,1
...,...,...,...,...,...,...,...,...,...,...
995,-0.125,-0.300,-0.750,-0.009750,-0.027350,-0.175,0.01375,0.00875,-0.39650,1
996,1.025,0.400,1.200,0.010750,0.002850,1.800,-0.00575,0.00550,-1.44475,1
997,-1.975,-0.075,-2.150,-0.017175,0.002075,0.600,0.01500,0.23225,1.38650,0
998,2.000,0.575,0.500,0.016975,0.021550,2.125,0.00575,-0.03800,-1.85100,1


In [8]:
# --- Test data ---------------------------------------------------------------
df_test = pd.read_csv("postseason_test_processed.csv")
x_test = df_test.drop(columns=['y']).to_numpy()
# Standardize the test features with the statistics the scaler already learned
# from the training features in an earlier cell. Calling fit_transform here
# would re-fit the scaler on the test set, so train and test would be scaled
# with different means/variances and test information would leak into the
# preprocessing step.
# NOTE(review): assumes the test CSV has the same feature columns, in the same
# order, as the training CSV - confirm.
x_test = scaler.transform(x_test)
y_test = df_test['y'].to_numpy()

# --- Quantum kernel ----------------------------------------------------------
# feature_dimension is the number of feature columns in X_train. No
# `entanglement` argument is passed, so the library default applies.
feature_map = PauliFeatureMap(feature_dimension=len(X_train[0]), reps=2, paulis=['Z', 'Y', 'ZZ'])

sampler = Sampler()

fidelity = ComputeUncompute(sampler=sampler)

kernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=feature_map)

# --- Train -------------------------------------------------------------------
start_time = time.time()

qsvc = QSVC(quantum_kernel=kernel)

qsvc.fit(X_train, y_train)

end_time = time.time()

# --- Score -------------------------------------------------------------------
# end_time1 marks the start of scoring; it is taken right after end_time.
end_time1 = time.time()

qsvc_score = qsvc.score(x_test, y_test)

end_time2 = time.time()

print(f"Training time: {end_time - start_time} seconds")
print(f"Scoring time: {end_time2 - end_time1} seconds")
print(f"QSVC accuracy: {qsvc_score}")

In [None]:
# Compare several feature maps: for each one, train a QSVC on the training
# arrays, predict the postseason games, print timings/accuracy, and write a
# per-game results CSV into QSVM_Results/.
feature_dim = len(X_train[0])
postseason_original = pd.read_csv("postseason_test_data.csv")

# The per-game table is combined row-by-row with y_test / y_pred below, so the
# two must describe the same games. Fail here, before the expensive training,
# rather than at DataFrame construction after a model has already been fitted.
if len(postseason_original) != len(y_test):
    raise ValueError(
        f"postseason_test_data.csv has {len(postseason_original)} rows "
        f"but y_test has {len(y_test)} labels"
    )

feature_maps = {
    "ZZFeatureMap (reps=1)": ZZFeatureMap(
        feature_dimension=feature_dim,
        reps=1,
        entanglement='linear'
    ),

    "ZZFeatureMap (reps=2)": ZZFeatureMap(
        feature_dimension=feature_dim,
        reps=2,
        entanglement='linear'
    ),

    "Pauli Z,Y,ZZ (reps=1)": PauliFeatureMap(
        feature_dimension=feature_dim,
        reps=1,
        paulis=['Z', 'Y', 'ZZ'],
        entanglement='linear'
    ),

    "Pauli Z,Y,ZZ (reps=2)": PauliFeatureMap(
        feature_dimension=feature_dim,
        reps=2,
        paulis=['Z', 'Y', 'ZZ'],
        entanglement='linear'
    ),
}

# Prepare sampler + fidelity (shared by every kernel built in the loop)
sampler = Sampler()
fidelity = ComputeUncompute(sampler=sampler)

# Team columns used to turn 0/1 labels into team names: label 1 selects the
# home team, anything else selects the away team.
home_teams = postseason_original['Home Team']
away_teams = postseason_original['Away Team']

# The actual winners do not depend on the feature map, so compute them once.
actual_winner = home_teams.where(y_test == 1, away_teams).tolist()

# -------------------------------------------------
# Loop through maps
# -------------------------------------------------
for name, fmap in feature_maps.items():
    print(f"\n====== Testing {name} ======")

    kernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=fmap)

    # Train
    t0 = time.time()
    qsvc = QSVC(quantum_kernel=kernel)
    qsvc.fit(X_train, y_train)
    t1 = time.time()

    # Predict
    y_pred = qsvc.predict(x_test)

    # Score (the reported scoring time covers prediction plus accuracy)
    accuracy = accuracy_score(y_test, y_pred)
    t2 = time.time()

    print(f"Training time: {t1 - t0:.4f} sec")
    print(f"Scoring time:  {t2 - t1:.4f} sec")
    print(f"Accuracy:      {accuracy:.4f}")

    # Predicted winners as team names, vectorized over all games
    predicted_winner = home_teams.where(y_pred == 1, away_teams).tolist()

    # Create results dataframe
    results_df = pd.DataFrame({
        'Series_Game': postseason_original['Series_Game'],
        'Actual Winner': actual_winner,
        'Predicted Winner': predicted_winner,
        'Correct': (y_pred == y_test)
    })
    # The sample count in the file name follows the actual training-set size
    # instead of a hard-coded 1000.
    results_df.to_csv(f"QSVM_Results/Results_{name}_{len(X_train)}Samples.csv", index=False)




  sampler = Sampler()
