In [None]:
import pandas as pd
import time
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder, StandardScaler

from qiskit.circuit.library import ZZFeatureMap
from qiskit.circuit.library import PauliFeatureMap
from qiskit.primitives import Sampler

from qiskit_algorithms.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC

from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA

In [2]:
# Load Excel file
train_df = pd.read_csv("mlb_vqc_features.csv")

# Explore the structure
print(train_df.shape)
print(train_df.head(5))

(2084, 10)
   hits (Home-Away)  homeruns (Home-Away)  leftonbase (Home-Away)  \
0          1.033333              0.200000                0.066667   
1         -1.500000             -1.100000               -3.200000   
2          1.748252              0.412587                0.706294   
3         -1.272727             -0.818182               -2.545455   
4          1.700000              1.400000               -0.400000   

   obp (Home-Away)  slg (Home-Away)  strikeouts (Home-Away)  \
0         0.032683         0.052300                1.600000   
1         0.023800         0.054200                3.100000   
2         0.031210         0.052874                1.846154   
3         0.022273         0.048818                3.090909   
4         0.044400         0.152200                5.200000   

   strikepercentage (Home-Away)  whip (Home-Away)  SP ERA (Home-Away)  y  
0                      0.006667         -0.209833           -1.522333  1  
1                     -0.007000         -0.44

In [3]:
train_df

Unnamed: 0,hits (Home-Away),homeruns (Home-Away),leftonbase (Home-Away),obp (Home-Away),slg (Home-Away),strikeouts (Home-Away),strikepercentage (Home-Away),whip (Home-Away),SP ERA (Home-Away),y
0,1.033333,0.200000,0.066667,0.032683,0.052300,1.600000,0.006667,-0.209833,-1.522333,1
1,-1.500000,-1.100000,-3.200000,0.023800,0.054200,3.100000,-0.007000,-0.446000,-1.853000,1
2,1.748252,0.412587,0.706294,0.031210,0.052874,1.846154,0.014965,-0.215804,-2.147972,0
3,-1.272727,-0.818182,-2.545455,0.022273,0.048818,3.090909,-0.006364,-0.435455,-1.702727,0
4,1.700000,1.400000,-0.400000,0.044400,0.152200,5.200000,0.012000,-0.205000,-1.978000,1
...,...,...,...,...,...,...,...,...,...,...
2079,-0.125000,-0.300000,-0.750000,-0.009750,-0.027350,-0.175000,0.013750,0.008750,-0.396500,1
2080,1.025000,0.400000,1.200000,0.010750,0.002850,1.800000,-0.005750,0.005500,-1.444750,1
2081,-1.975000,-0.075000,-2.150000,-0.017175,0.002075,0.600000,0.015000,0.232250,1.386500,0
2082,2.000000,0.575000,0.500000,0.016975,0.021550,2.125000,0.005750,-0.038000,-1.851000,1


In [4]:
# creates x for features and y for labels
X_train = train_df.drop(columns=['y']).to_numpy()
X_train = StandardScaler().fit_transform(X_train)
X_train

array([[ 1.13473596,  0.58918596,  0.27868057, ...,  0.16172011,
        -1.24030429, -0.70032277],
       [-1.26037875, -2.57781591, -1.73719914, ..., -0.78777983,
        -2.63662356, -0.93775077],
       [ 1.81064843,  1.10708191,  0.67339827, ...,  0.73825427,
        -1.27560661, -1.14954902],
       ...,
       [-1.70946276, -0.08075674, -1.08923781, ...,  0.74068349,
         1.37348319,  1.3883017 ],
       [ 2.0486613 ,  1.5027442 ,  0.54609318, ...,  0.09803414,
        -0.22435146, -0.93631472],
       [-0.24403073,  1.38093643, -0.68811888, ...,  1.90439988,
        -0.47415169, -1.45706469]], shape=(2084, 9))

In [5]:
y_train = train_df['y'].to_numpy()
y_train

array([1, 1, 0, ..., 0, 1, 1], shape=(2084,))

In [None]:
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)

In [None]:
feature_map = PauliFeatureMap(feature_dimension=len(X_train_pca[0]), reps=2, paulis = ['Z','Y','ZZ'])

sampler = Sampler()

fidelity = ComputeUncompute(sampler=sampler)

kernel = FidelityQuantumKernel(fidelity=fidelity, feature_map=feature_map)


  sampler = Sampler()


In [None]:
start_time=time.time()

qsvc = QSVC(quantum_kernel=kernel)

qsvc.fit(X_train_pca, y_train)

end_time=time.time()

In [None]:
df_test = pd.read_csv("postseason_test_processed.csv")
x_test = df_test.drop(columns=['y']).to_numpy()
x_test = StandardScaler().fit_transform(x_test)
y_test = df_test['y'].to_numpy()

x_test_pca = pca.transform(x_test)

In [None]:
end_time1=time.time()

qsvc_score = qsvc.score(x_test_pca, y_test)

end_time2=time.time()

print(f"Training time: {end_time - start_time} seconds")
print(f"Scoring time: {end_time2 - end_time1} seconds")
print(f"QSVC accuracy: {qsvc_score}")

Training time: 3855.278528213501 seconds
Scoring time: 711.265638589859 seconds
QSVC accuracy: 0.5531914893617021
