# Quantum benchmark

## I - Introduction

## II - Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score, KFold
#Import classical libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

plt.style.use('ggplot')

import functools

from qiskit import BasicAer
from qiskit.circuit.library import ZZFeatureMap
from qiskit.utils import QuantumInstance, algorithm_globals
from qiskit_machine_learning.algorithms import QSVC
from qiskit_machine_learning.kernels import QuantumKernel
from qiskit_machine_learning.datasets import ad_hoc_data
import logging

import pennylane as qml
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer

from qiskit.algorithms.optimizers import COBYLA
from qiskit.circuit.library import TwoLocal, ZZFeatureMap
import qiskit

In [2]:
import warnings
warnings.filterwarnings('ignore')

## III - Data 

In [3]:
# Read out CSV

df = pd.read_csv('fraud_detection_bank_dataset.csv', sep=',')
df = df.sample(1400)
df = df.drop(['Unnamed: 0'], axis = 1)
df_labels = df['targets']
df.drop(['targets'],axis = 1,inplace = True)
X_train, X_test, y_train, y_test = train_test_split(df, df_labels, test_size=0.2, random_state=42)

cols = ['col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_18', 'col_19','col_20', 'col_21', 'col_35', 
        'col_51', 'col_52', 'col_53', 'col_70','col_71','col_7', 'col_22', 'col_54', 'col_56']

X_train = X_train.drop(cols, axis=1)
X_test = X_test.drop(cols, axis=1)

In [4]:
df.shape

(1400, 112)

In [5]:
%%script false --no-raise-error

import sweetviz as sv

#EDA using Autoviz
sweet_report = sv.analyze(df)

#Saving results to HTML file
sweet_report.show_html('sweet_report.html')

## IV - Modelisation

### Classical

## Quantum Approaches

In [8]:
np.unique(df_labels)

array([0., 1.])

In [5]:
lda_1 = LDA(n_components=1)
lda_2 = LDA(n_components=1)

In [6]:
lda_1.fit(X_train.iloc[:, :46], y_train)
feature_1 = lda_1.transform(X_train.iloc[:, :46])
lda_2.fit(X_train.iloc[:, 46:], y_train)
feature_2 = lda_2.transform(X_train.iloc[:, 46:])

In [7]:
test_1 = lda_1.transform(X_test.iloc[:, :46])
test_2 = lda_2.transform(X_test.iloc[:, 46:])

In [8]:

features_lda_1 = pd.DataFrame(feature_1)
features_lda_2 = pd.DataFrame(feature_2)
features_lda = features_lda_1.join(features_lda_2, lsuffix="_left", rsuffix="_right")

In [9]:
test_lda_1 =pd.DataFrame(test_1)
test_lda_2 =pd.DataFrame(test_2)
test_lda = test_lda_1.join(test_lda_2, lsuffix="_left", rsuffix="_right")

In [10]:
n_dim = len(features_lda.columns)

## Split train test 

In [11]:
# Split dataset into train and test

#sample_train, sample_test, label_train, label_test = train_test_split(
#     features_lda, y_train, test_size=0.2, random_state=22)

sample_train = features_lda.to_numpy()
sample_test = test_lda.to_numpy()
label_train = y_train
label_test = y_test
# Normalize

std_scale = StandardScaler().fit(sample_train)
sample_train = std_scale.transform(sample_train)
sample_test = std_scale.transform(sample_test)

# Scale for better fit within the feature map

#samples = np.append(sample_train, sample_test, axis=0)
minmax_scale = MinMaxScaler((-1, 1)).fit(sample_train)
sample_train = minmax_scale.transform(sample_train)
sample_test = minmax_scale.transform(sample_test)

# Select a sample for a better control of the research and wall time


#test_size = 200 #40
#sample_test = sample_test[:test_size]
#label_test = label_test[:test_size]

In [12]:
# Basic parameters for hybrid model

seed = 8500
feature_dim = n_dim
num_reps = 2
num_shots =256 


## Hybrid

In [13]:
# Define feature_map

feature_map = ZZFeatureMap(feature_dimension=feature_dim, reps=num_reps)

# Define the backend
backend = QuantumInstance(
    BasicAer.get_backend("qasm_simulator"), shots=num_shots, seed_simulator=seed, seed_transpiler=seed
)

# Define the kernel

kernel = QuantumKernel(feature_map=feature_map, quantum_instance=backend)

# Model run
svc = SVC(kernel=kernel.evaluate)
#svc.fit(sample_train, label_train)
#score = svc.score(sample_test, label_test)

#print(f"Callable kernel classification test score: {score}")

In [19]:
#result_predict = svc.predict(sample_test)

In [20]:
#print(metrics.classification_report(label_test,result_predict))

In [14]:
from sklearn.model_selection import cross_validate
from tqdm import tqdm
def evaluate_ml_model(_models, X, y, n_fold=10, metric='precision'):
    ''' Function to evaluate a ML and QML model with a list of metrics
    
    
    '''
    results = pd.DataFrame()
    kfold = KFold(n_splits=n_fold)
    columns = []
    for name, model in tqdm(_models):
        # -------------------
        # Variables initialization 
        _df = pd.DataFrame()
        names = []
        means = []
        stds = []
        
        # -------------------
        # k-fold Cross validation
        cv_results = cross_validate(model, X, y, cv=kfold, scoring=metric)
        
        # -------------------
        # Compute the mean and standard deviation 
        for _name, _array in cv_results.items():
            names.append(_name)
            means.append(round(100*_array.mean(), 2))
            stds.append(round(100*_array.std(), 2))
        # -------------------
        # Save the results in a dataframe 
        _df =  pd.DataFrame([means, stds], columns=names)
        columns.extend([name+' mean (%)', name+' std (%)'])
        #results = results.join(_df, on=_df.index)
        results = results.append(_df)
    results.index = columns
    print(results)
    return results

In [34]:
models = []
#models.append(('LR', LogisticRegression(max_iter=1000)))
#models.append(('KNN', KNeighborsClassifier()))
#models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
#models.append(('SVM', SVC()))
models.append(('qsvc', svc))
_metrics = ['precision', 'recall', 'f1', 'accuracy',  'matthews_corrcoef','balanced_accuracy']

In [35]:
df_results = pd.DataFrame()
df_results = evaluate_ml_model(models, sample_train, label_train, n_fold=10, metric=_metrics)
df_results 

100%|█████████████████████████████████████████| 1/1 [1:44:00<00:00, 6240.61s/it]

               fit_time  score_time  test_precision  test_recall  test_f1  \
qsvc mean (%)  51786.78    10619.15           82.35        65.92    72.93   
qsvc std (%)    2383.02      412.76           10.29         8.79     8.14   

               test_accuracy  test_matthews_corrcoef  test_balanced_accuracy  
qsvc mean (%)          88.12                   66.35                   80.67  
qsvc std (%)            3.02                    9.90                    4.94  





Unnamed: 0,fit_time,score_time,test_precision,test_recall,test_f1,test_accuracy,test_matthews_corrcoef,test_balanced_accuracy
qsvc mean (%),51786.78,10619.15,82.35,65.92,72.93,88.12,66.35,80.67
qsvc std (%),2383.02,412.76,10.29,8.79,8.14,3.02,9.9,4.94


In [38]:
j = 0
for i in range(int(len(df_results.index)/2)):

    print(f'{df_results.iloc[j].name.split()[0]} & {df_results.iloc[j][2]} ({df_results.iloc[j+1][2]}) & {df_results.iloc[j][3]} ({df_results.iloc[j+1][3]}) &  {df_results.iloc[j][4]} ({df_results.iloc[j+1][4]}) & {df_results.iloc[j][6]} ({df_results.iloc[j+1][6]}) & {df_results.iloc[j][7]} ({df_results.iloc[j+1][7]}) \\')
    
    j+=2

qsvc & 82.35 (10.29) & 65.92 (8.79) &  72.93 (8.14) & 66.35 (9.9) & 80.67 (4.94) \


In [23]:
df_results.to_csv('LDA_fraud_dataset.csv')

## Pennylane

In [18]:
from pennylane import numpy as np

In [19]:
# Angle Encoding

num_qubits = n_dim

dev = qml.device('default.qubit', wires = num_qubits)

@qml.qnode(dev)
def circuit(parameters, data):
    for i in range(num_qubits):
        qml.Hadamard(wires = i)
    
    AngleEmbedding(features = data, wires = range(num_qubits), rotation = 'Y')
    
    qml.StronglyEntanglingLayers(weights = parameters, wires = range(num_qubits))
    
    return qml.expval(qml.PauliZ(0))

In [20]:
num_layers = 5
weights_init = 0.01 * np.random.randn(num_layers, num_qubits, 3, requires_grad=True)
bias_init = np.array(0.0, requires_grad=True)

#print(weights_init, bias_init)

In [21]:
circuit(weights_init, sample_train[0])

tensor(0.08669908, requires_grad=True)

In [22]:
def variational_classifier(weights, bias, x):
    return circuit(weights, x) + bias

In [23]:
def square_loss(labels, predictions):
    loss = 0
    for l, p in zip(labels, predictions):
        loss = loss + (l - p) ** 2

    loss = loss / len(labels)
    return loss

In [24]:
def accuracy(labels, predictions):

    loss = 0
    for l, p in zip(labels, predictions):
        if abs(l - p) < 1e-5:
            loss = loss + 1
    loss = loss / len(labels)

    return loss

In [25]:
def cost(weights, bias, X, Y):
    predictions = [variational_classifier(weights, bias, x) for x in X]
    return square_loss(Y, predictions)

In [26]:
Y = np.array(label_train * 2 - np.ones(len(label_train)),requires_grad=True)  # shift label from {0, 1} to {-1, 1}
X = np.array(sample_train, requires_grad=True)

for i in range(5):
    print("X = {}, Y = {: d}".format(list(X[i]), int(Y[i])))

X = [tensor(0.25351439, requires_grad=True), tensor(0.30771751, requires_grad=True)], Y =  1
X = [tensor(-0.14112155, requires_grad=True), tensor(-0.59942875, requires_grad=True)], Y = -1
X = [tensor(0.01882312, requires_grad=True), tensor(-0.26478299, requires_grad=True)], Y = -1
X = [tensor(0.20482344, requires_grad=True), tensor(-0.36680751, requires_grad=True)], Y =  1
X = [tensor(-0.11257209, requires_grad=True), tensor(-0.59058478, requires_grad=True)], Y = -1


In [27]:
opt = AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08)
batch_size = 10

In [41]:
weights = weights_init
bias = bias_init

wbest = 0
bbest = 0
abest = 0
ccost = 0 
for it in range(150):

    # weights update by one optimizer step

    batch_index = np.random.randint(0, len(X), (batch_size,))
    X_batch = X[batch_index]
    Y_batch = Y[batch_index]
    weights, bias, _, _ = opt.step(cost, weights, bias, X_batch, Y_batch)

    # Compute the accuracy
    predictions = [np.sign(variational_classifier(weights, bias, x)) for x in X]
    
    '''if accuracy(Y, predictions) > abest:
        wbest = weights
        bbest = bias
        abest = accuracy(Y, predictions)
        print('New best')

    acc = accuracy(Y, predictions)

    print(
        "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), acc
        )
    )'''
    prec = metrics.accuracy_score(Y, predictions)
    if  prec > abest or ((prec == abest) and (cost(weights, bias, X, Y) < ccost)):
        wbest = weights
        bbest = bias
        abest = prec
        ccost = cost(weights, bias, X, Y)
        print('New best')
    #prec = metrics.precision_score(Y, predictions, average='binary')
    print(
        "Iter: {:5d} | Cost: {:0.7f} | f1: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), prec
        )
    )
    
    

New best
Iter:     1 | Cost: 0.7344644 | f1: 0.7973214 
Iter:     2 | Cost: 0.6157621 | f1: 0.7651786 
Iter:     3 | Cost: 0.5747764 | f1: 0.7660714 
Iter:     4 | Cost: 0.5470359 | f1: 0.7750000 
New best
Iter:     5 | Cost: 0.4855538 | f1: 0.8410714 
New best
Iter:     6 | Cost: 0.4756260 | f1: 0.8687500 
New best
Iter:     7 | Cost: 0.5005733 | f1: 0.8776786 
Iter:     8 | Cost: 0.4820430 | f1: 0.8741071 
Iter:     9 | Cost: 0.4734429 | f1: 0.8714286 
Iter:    10 | Cost: 0.4764574 | f1: 0.8723214 
Iter:    11 | Cost: 0.4812747 | f1: 0.8741071 
Iter:    12 | Cost: 0.4950626 | f1: 0.8767857 
New best
Iter:    13 | Cost: 0.5177490 | f1: 0.8830357 
Iter:    14 | Cost: 0.5121669 | f1: 0.8785714 
Iter:    15 | Cost: 0.5206429 | f1: 0.8812500 
Iter:    16 | Cost: 0.5131106 | f1: 0.8776786 
Iter:    17 | Cost: 0.4936372 | f1: 0.8723214 
Iter:    18 | Cost: 0.4612667 | f1: 0.8616071 
Iter:    19 | Cost: 0.4721951 | f1: 0.8392857 
Iter:    20 | Cost: 0.4947040 | f1: 0.8178571 
Iter:    21 | C

In [42]:
Yte = np.array(label_test * 2 - np.ones(len(label_test)))
Xte = np.array(normalize(sample_test))

In [43]:
pd.Series(Yte).value_counts()

-1.0    203
 1.0     77
dtype: int64

In [44]:
predictions = [np.sign(variational_classifier(wbest, bbest, x)) for x in Xte]
pred = [np.sign(variational_classifier(wbest, bbest, x)) for x in X]
acc = accuracy(Yte, predictions)

print(f'Cost: {cost(wbest, bbest, Xte, Yte)}, Accuracy: {np.round(acc, 2) * 100}%')

Cost: 0.5158124094433032, Accuracy: 84.0%


In [45]:
print(metrics.classification_report(Yte, predictions))

              precision    recall  f1-score   support

        -1.0       0.83      0.99      0.90       203
         1.0       0.92      0.45      0.61        77

    accuracy                           0.84       280
   macro avg       0.87      0.72      0.75       280
weighted avg       0.85      0.84      0.82       280



In [33]:
print(f'''

    Precision: {round(100*metrics.precision_score(predictions,Yte),2)}%
    Recall: {round(100*metrics.recall_score(predictions,Yte),2)}%
    f1: {round(100*metrics.f1_score(predictions,Yte),2)}%
    Accuracy: {round(100*metrics.accuracy_score(predictions,Yte),2)}%
    Balanced accuracy: {round(100*metrics.balanced_accuracy_score(predictions,Yte),2)}%
    Matthew corcorref: {round(100*metrics.matthews_corrcoef(predictions,Yte),2)}%
    ''')



    Precision: 49.35%
    Recall: 84.44%
    f1: 62.3%
    Accuracy: 83.57%
    Balanced accuracy: 83.92%
    Matthew corcorref: 55.81%
    


In [37]:
print(f'''

    Precision: {round(100*metrics.precision_score(Yte, predictions),2)}%
    Recall: {round(100*metrics.recall_score(Yte, predictions),2)}%
    f1: {round(100*metrics.f1_score(Yte, predictions),2)}%
    Accuracy: {round(100*metrics.accuracy_score(Yte, predictions),2)}%
    Balanced accuracy: {round(100*metrics.balanced_accuracy_score(Yte, predictions),2)}%
    Matthew corcorref: {round(100*metrics.matthews_corrcoef(Yte, predictions),2)}%
    ''')



    Precision: 84.44%
    Recall: 49.35%
    f1: 62.3%
    Accuracy: 83.57%
    Balanced accuracy: 72.95%
    Matthew corcorref: 55.81%
    


In [46]:
weights = weights_init
bias = bias_init

wbest = 0
bbest = 0
abest = 0
ccost = 1 
for it in range(150):

    # weights update by one optimizer step

    batch_index = np.random.randint(0, len(X), (batch_size,))
    X_batch = X[batch_index]
    Y_batch = Y[batch_index]
    weights, bias, _, _ = opt.step(cost, weights, bias, X_batch, Y_batch)

    # Compute the accuracy
    predictions = [np.sign(variational_classifier(weights, bias, x)) for x in X]
    
    '''if accuracy(Y, predictions) > abest:
        wbest = weights
        bbest = bias
        abest = accuracy(Y, predictions)
        print('New best')

    acc = accuracy(Y, predictions)

    print(
        "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), acc
        )
    )'''
    prec = metrics.f1_score(Y, predictions, average='binary', pos_label=1)
    if  prec > abest or ((prec == abest) and (cost(weights, bias, X, Y) < ccost)):
        wbest = weights
        bbest = bias
        abest = prec
        ccost = cost(weights, bias, X, Y)
        print('New best')
    #prec = metrics.precision_score(Y, predictions, average='binary')
    print(
        "Iter: {:5d} | Cost: {:0.7f} | f1: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), prec
        )
    )
    
    

New best
Iter:     1 | Cost: 0.7837847 | f1: 0.3398329 
Iter:     2 | Cost: 0.6468604 | f1: 0.0282686 
Iter:     3 | Cost: 0.6890824 | f1: 0.0071685 
Iter:     4 | Cost: 0.7594924 | f1: 0.0000000 
Iter:     5 | Cost: 0.6646804 | f1: 0.0071685 
Iter:     6 | Cost: 0.5017198 | f1: 0.3243243 
New best
Iter:     7 | Cost: 0.4952593 | f1: 0.6603774 
New best
Iter:     8 | Cost: 0.5725122 | f1: 0.7429806 
Iter:     9 | Cost: 0.5681912 | f1: 0.7363834 
Iter:    10 | Cost: 0.5485545 | f1: 0.7187500 
Iter:    11 | Cost: 0.4900122 | f1: 0.6603774 
Iter:    12 | Cost: 0.4652018 | f1: 0.5909091 
Iter:    13 | Cost: 0.4688597 | f1: 0.5506494 
Iter:    14 | Cost: 0.4787903 | f1: 0.5000000 
Iter:    15 | Cost: 0.4678385 | f1: 0.5506494 
Iter:    16 | Cost: 0.4589827 | f1: 0.6000000 
Iter:    17 | Cost: 0.4677693 | f1: 0.6394231 
Iter:    18 | Cost: 0.5428852 | f1: 0.7174888 
New best
Iter:    19 | Cost: 0.5978691 | f1: 0.7432150 
Iter:    20 | Cost: 0.5118448 | f1: 0.6972477 
Iter:    21 | Cost: 0.46

In [47]:
predictions = [np.sign(variational_classifier(wbest, bbest, x)) for x in Xte]
pred = [np.sign(variational_classifier(wbest, bbest, x)) for x in X]
acc = accuracy(Yte, predictions)

print(f'Cost: {cost(wbest, bbest, Xte, Yte)}, Accuracy: {np.round(acc, 2) * 100}%')

Cost: 0.5009974584580961, Accuracy: 84.0%


In [48]:
print(metrics.classification_report(Yte, predictions))

              precision    recall  f1-score   support

        -1.0       0.83      0.99      0.90       203
         1.0       0.92      0.45      0.61        77

    accuracy                           0.84       280
   macro avg       0.87      0.72      0.75       280
weighted avg       0.85      0.84      0.82       280



In [49]:
print(f'''

    Precision: {round(100*metrics.precision_score(predictions,Yte),2)}%
    Recall: {round(100*metrics.recall_score(predictions,Yte),2)}%
    f1: {round(100*metrics.f1_score(predictions,Yte),2)}%
    Accuracy: {round(100*metrics.accuracy_score(predictions,Yte),2)}%
    Balanced accuracy: {round(100*metrics.balanced_accuracy_score(predictions,Yte),2)}%
    Matthew corcorref: {round(100*metrics.matthews_corrcoef(predictions,Yte),2)}%
    ''')



    Precision: 45.45%
    Recall: 92.11%
    f1: 60.87%
    Accuracy: 83.93%
    Balanced accuracy: 87.37%
    Matthew corcorref: 57.33%
    
