# Quantum benchmark

## I - Introduction

## II - Packages

In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score, KFold
#Import classical libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

plt.style.use('ggplot')

import functools

from qiskit import BasicAer
from qiskit.circuit.library import ZZFeatureMap
from qiskit.utils import QuantumInstance, algorithm_globals
from qiskit_machine_learning.algorithms import QSVC
from qiskit_machine_learning.kernels import QuantumKernel
from qiskit_machine_learning.datasets import ad_hoc_data
import logging

import pennylane as qml
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer

from qiskit.algorithms.optimizers import COBYLA
from qiskit.circuit.library import TwoLocal, ZZFeatureMap
import qiskit

## III - Data 

In [48]:
# Read out CSV

df = pd.read_csv('fraud_detection_bank_dataset.csv', sep=',')

In [49]:
%%script false --no-raise-error

import sweetviz as sv

#EDA using Autoviz
sweet_report = sv.analyze(df)

#Saving results to HTML file
sweet_report.show_html('sweet_report.html')

## IV - Modelisation

### Classical

In [50]:
df = df.drop(['Unnamed: 0'], axis = 1)
df_labels = df['targets']
df.drop(['targets'],axis = 1,inplace = True)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(df, df_labels, test_size=0.2, random_state=42)

---

## Dimensionality reduction

In [14]:
np.unique(df_labels)

array([0, 1])

In [52]:
from sklearn.preprocessing import StandardScaler

In [53]:
# Standardizing the features


In [None]:
cols = ['col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_18', 'col_19','col_20', 'col_21', 'col_35', 
        'col_51', 'col_52', 'col_53', 'col_70','col_71','col_7', 'col_22', 'col_54', 'col_56']

X_train = X_train.drop(cols, axis=1)
X_test = X_test.drop(cols, axis=1)

In [None]:
X_train = StandardScaler().fit_transform(X_train)

In [54]:
#pca = PCA(n_components=2)
pca = PCA(n_components=2)
pca = pca.fit(X_train)

In [56]:
df_pca_train = pca.transform(X_train)
df_pca_test = pca.transform(X_test)

In [57]:
df_pca_train = pd.DataFrame(data = df_pca_train
             , columns = ['pc_1', 'pc_2'])

df_pca_test = pd.DataFrame(data = df_pca_test
             , columns = ['pc_1', 'pc_2'])

In [58]:
df_pca_train.head()

Unnamed: 0,pc_1,pc_2
0,-37252.963487,-134.646498
1,-36614.757077,-134.458136
2,-34974.747159,-133.918582
3,23534.37226,-129.967247
4,-37028.56418,-134.657561


In [None]:
plt.figure(figsize=(10,8))
plt.plot(df_pca_train.iloc[:,0], df_pca_train.iloc[:,1], 'b+')
plt.plot(df_pca_test.iloc[:,0], df_pca_test.iloc[:,1], 'g+')
#plt.plot(df.index, feature_2, 'g+')

In [29]:
features_pca_1 = pd.DataFrame(feature_1)
features_pca_2 = pd.DataFrame(feature_2)
features_pca = features_pca_1.join(features_pca_2, lsuffix="_left", rsuffix="_right")

In [59]:
#n_dim = len(df_pca_train.columns)
n_dim = len(df_pca_train.columns)

## Split train test 

In [60]:
# Split dataset into train and test

sample_train, sample_test, label_train, label_test = train_test_split(
     df_pca_train, y_train, test_size=0.2, random_state=22)

# Normalize

#std_scale = StandardScaler().fit(sample_train)
#sample_train = std_scale.transform(sample_train)
#sample_test = std_scale.transform(sample_test)

# Scale for better fit within the feature map

#samples = np.append(sample_train, sample_test, axis=0)
#minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
#sample_train = minmax_scale.transform(sample_train)
#sample_test = minmax_scale.transform(sample_test)

# Select a sample for a better control of the research and wall time

train_size = 800#160
sample_train = sample_train[:train_size]
label_train = label_train[:train_size]

test_size = 200 #40
sample_test = sample_test[:test_size]
label_test = label_test[:test_size]

In [61]:
# Basic parameters for hybrid model

seed = 8500
feature_dim = n_dim
num_reps = 2
num_shots =256 


## Hybrid

In [68]:
# Define feature_map

feature_map = ZZFeatureMap(feature_dimension=feature_dim, reps=num_reps)

# Define the backend
backend = QuantumInstance(
    BasicAer.get_backend("qasm_simulator"), shots=num_shots, seed_simulator=seed, seed_transpiler=seed
)

# Define the kernel

kernel = QuantumKernel(feature_map=feature_map, quantum_instance=backend)

# Model run
svc = SVC(kernel=kernel.evaluate)
#svc.fit(sample_train, label_train)
#score = svc.score(sample_test, label_test)

#print(f"Callable kernel classification test score: {score}")

Callable kernel classification test score: 0.77


In [71]:
_models = []
_models.append(('qsvc',svc))
_metrics = ['precision', 'recall', 'f1', 'accuracy',  'matthews_corrcoef','balanced_accuracy']
for metric in _metrics:
    df_results= pd.concat([df_results, evaluate_ml_model(_models, sample_train, label_train, n_fold=10, metric=metric)])

In [72]:
df_results

Unnamed: 0,precision mean (%),precision std (%),recall mean (%),recall std (%),f1 mean (%),f1 std (%),accuracy mean (%),accuracy std (%),balanced_accuracy mean (%),balanced_accuracy std (%),...,recall mean (%).1,recall std (%).1,f1 mean (%).1,f1 std (%).1,accuracy mean (%).1,accuracy std (%).1,matthews_corrcoef mean (%),matthews_corrcoef std (%),balanced_accuracy mean (%).1,balanced_accuracy std (%).1
LR,0.0,0.0,0.0,0.0,0.0,0.0,74.63,3.26,50.0,0.0,...,,,,,,,,,,
KNN,37.56,10.92,24.59,8.44,29.42,9.12,70.75,2.86,55.42,3.92,...,,,,,,,,,,
CART,20.17,19.01,7.01,6.95,10.23,10.09,70.5,2.45,49.27,3.88,...,,,,,,,,,,
NB,27.37,4.21,94.51,6.48,42.35,5.66,35.0,3.58,54.52,2.66,...,,,,,,,,,,
SVM,0.0,0.0,0.0,0.0,0.0,0.0,74.63,3.26,50.0,0.0,...,,,,,,,,,,
qsvc,,,,,,,,,,,...,0.0,0.0,0.0,0.0,74.63,3.26,0.0,0.0,50.0,0.0


## Classical Approaches

In [62]:
models = []
models.append(('LR', LogisticRegression(max_iter=1000)))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

In [63]:
df_results = pd.DataFrame()

In [64]:
def evaluate_ml_model(models, X, y, n_fold=10, metric='precision'):
    
    _df = pd.DataFrame()
    #results = []
    names = []
    #scoring = 'accuracy'
    for name, model in models:
        kfold = KFold(n_splits=n_fold)
        cv_results = cross_val_score(model, X, y, cv=kfold, scoring=metric)
        #results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        #print(msg)
        _df =  pd.concat([_df, pd.DataFrame([round(100*cv_results.mean(), 2) , round(100*cv_results.std(), 2) ]).T])
    _df.index = names
    _df.columns=[metric+' mean (%)', metric+' std (%)']
    return _df 
             
        

In [65]:
_metrics = ['precision', 'recall', 'f1', 'accuracy', 'balanced_accuracy', 'matthews_corrcoef']
for metric in _metrics:
    df_results= pd.concat([df_results, evaluate_ml_model(models, sample_train, label_train, n_fold=10, metric=metric)], axis=1)

In [66]:
df_results= pd.concat([df_results, evaluate_ml_model(models, sample_train, label_train, n_fold=10, metric='accuracy')], axis=1)

In [67]:
df_results

Unnamed: 0,precision mean (%),precision std (%),recall mean (%),recall std (%),f1 mean (%),f1 std (%),accuracy mean (%),accuracy std (%),balanced_accuracy mean (%),balanced_accuracy std (%),matthews_corrcoef mean (%),matthews_corrcoef std (%),accuracy mean (%).1,accuracy std (%).1
LR,0.0,0.0,0.0,0.0,0.0,0.0,74.63,3.26,50.0,0.0,0.0,0.0,74.63,3.26
KNN,37.56,10.92,24.59,8.44,29.42,9.12,70.75,2.86,55.42,3.92,12.62,9.11,70.75,2.86
CART,20.17,19.01,7.01,6.95,10.23,10.09,70.5,2.45,49.27,3.88,-1.55,12.46,70.62,2.32
NB,27.37,4.21,94.51,6.48,42.35,5.66,35.0,3.58,54.52,2.66,12.92,7.13,35.0,3.58
SVM,0.0,0.0,0.0,0.0,0.0,0.0,74.63,3.26,50.0,0.0,0.0,0.0,74.63,3.26


## Pennylane

In [28]:
from pennylane import numpy as np

In [29]:
# Angle Encoding

num_qubits = n_dim

dev = qml.device('default.qubit', wires = num_qubits)

@qml.qnode(dev)
def circuit(parameters, data):
    for i in range(num_qubits):
        qml.Hadamard(wires = i)
    
    AngleEmbedding(features = data, wires = range(num_qubits), rotation = 'Y')
    
    qml.StronglyEntanglingLayers(weights = parameters, wires = range(num_qubits))
    
    return qml.expval(qml.PauliZ(0))

In [30]:
num_layers = 5
weights_init = 0.01 * np.random.randn(num_layers, num_qubits, 3, requires_grad=True)
bias_init = np.array(0.0, requires_grad=True)

#print(weights_init, bias_init)

In [31]:
circuit(weights_init, sample_train[0])

tensor(-0.09425205, requires_grad=True)

In [32]:
def variational_classifier(weights, bias, x):
    return circuit(weights, x) + bias

In [33]:
def square_loss(labels, predictions):
    loss = 0
    for l, p in zip(labels, predictions):
        loss = loss + (l - p) ** 2

    loss = loss / len(labels)
    return loss

In [34]:
def accuracy(labels, predictions):

    loss = 0
    for l, p in zip(labels, predictions):
        if abs(l - p) < 1e-5:
            loss = loss + 1
    loss = loss / len(labels)

    return loss

In [35]:
def cost(weights, bias, X, Y):
    predictions = [variational_classifier(weights, bias, x) for x in X]
    return square_loss(Y, predictions)

In [36]:
Y = np.array(label_train * 2 - np.ones(len(label_train)),requires_grad=True)  # shift label from {0, 1} to {-1, 1}
X = np.array(sample_train, requires_grad=True)

for i in range(5):
    print("X = {}, Y = {: d}".format(list(X[i]), int(Y[i])))

X = [tensor(-0.99999959, requires_grad=True), tensor(0.10477704, requires_grad=True)], Y = -1
X = [tensor(-0.99999897, requires_grad=True), tensor(-0.61489866, requires_grad=True)], Y = -1
X = [tensor(-0.99627189, requires_grad=True), tensor(-0.88545579, requires_grad=True)], Y = -1
X = [tensor(-0.99994599, requires_grad=True), tensor(-0.10680072, requires_grad=True)], Y = -1
X = [tensor(-0.99230716, requires_grad=True), tensor(-0.14662807, requires_grad=True)], Y = -1


In [37]:
opt = AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08)
batch_size = 10

In [38]:
import warnings
warnings.filterwarnings('ignore')

In [78]:
weights = weights_init
bias = bias_init

wbest = 0
bbest = 0
abest = 0
ccost = 0 
for it in range(250):

    # weights update by one optimizer step

    batch_index = np.random.randint(0, len(X), (batch_size,))
    X_batch = X[batch_index]
    Y_batch = Y[batch_index]
    weights, bias, _, _ = opt.step(cost, weights, bias, X_batch, Y_batch)

    # Compute the accuracy
    predictions = [np.sign(variational_classifier(weights, bias, x)) for x in X]
    
    '''if accuracy(Y, predictions) > abest:
        wbest = weights
        bbest = bias
        abest = accuracy(Y, predictions)
        print('New best')

    acc = accuracy(Y, predictions)

    print(
        "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), acc
        )
    )'''
    prec = metrics.f1_score(Y, predictions, average='binary', pos_label=1)
    if  prec > abest or ((prec == abest) and (cost(weights, bias, X, Y) < ccost)):
        wbest = weights
        bbest = bias
        abest = prec
        ccost = cost(weights, bias, X, Y)
        print("New Best:")
    print(
        "Iter: {:5d} | Cost: {:0.7f} | f1: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), prec
        )
    )
    
    

New Best:
Iter:     1 | Cost: 1.9191211 | f1: 0.1154299 
New Best:
Iter:     2 | Cost: 1.8256290 | f1: 0.2608696 
Iter:     3 | Cost: 1.2961935 | f1: 0.0000000 
Iter:     4 | Cost: 0.8427779 | f1: 0.0000000 
Iter:     5 | Cost: 0.7063916 | f1: 0.0000000 
Iter:     6 | Cost: 0.6819693 | f1: 0.0000000 
Iter:     7 | Cost: 0.5885872 | f1: 0.0000000 
Iter:     8 | Cost: 0.4792729 | f1: 0.0000000 
New Best:
Iter:     9 | Cost: 0.5767016 | f1: 0.8202020 
Iter:    10 | Cost: 0.7209637 | f1: 0.6558966 
Iter:    11 | Cost: 0.6950332 | f1: 0.6710744 
New Best:
Iter:    12 | Cost: 0.5077988 | f1: 0.8750000 
Iter:    13 | Cost: 0.4284217 | f1: 0.6753247 
Iter:    14 | Cost: 0.4392933 | f1: 0.0000000 
Iter:    15 | Cost: 0.4780805 | f1: 0.0000000 
Iter:    16 | Cost: 0.5272714 | f1: 0.0000000 
Iter:    17 | Cost: 0.5518219 | f1: 0.0000000 
Iter:    18 | Cost: 0.4958795 | f1: 0.0000000 
Iter:    19 | Cost: 0.5037093 | f1: 0.0000000 
Iter:    20 | Cost: 0.5706935 | f1: 0.5902778 
Iter:    21 | Cost: 

In [79]:
Yte = np.array(label_test * 2 - np.ones(len(label_test)))
Xte = np.array(normalize(sample_test))

In [80]:
predictions = [np.sign(variational_classifier(wbest, bbest, x)) for x in Xte]
pred = [np.sign(variational_classifier(wbest, bbest, x)) for x in X]
acc = accuracy(Yte, predictions)

print(f'Cost: {cost(wbest, bbest, Xte, Yte)}, Accuracy: {np.round(acc, 2) * 100}%')

Cost: 0.787210742813839, Accuracy: 77.0%


In [81]:
print(metrics.classification_report(predictions,Yte))

              precision    recall  f1-score   support

        -1.0       1.00      0.77      0.87       200
         1.0       0.00      0.00      0.00         0

    accuracy                           0.77       200
   macro avg       0.50      0.39      0.44       200
weighted avg       1.00      0.77      0.87       200



In [65]:
print(metrics.classification_report(predictions,Yte))

              precision    recall  f1-score   support

        -1.0       0.72      0.79      0.76       141
         1.0       0.36      0.27      0.31        59

    accuracy                           0.64       200
   macro avg       0.54      0.53      0.53       200
weighted avg       0.61      0.64      0.62       200



In [82]:
print(f'''

    Precision: {round(100*metrics.precision_score(predictions,Yte),2)}%
    Recall: {round(100*metrics.recall_score(predictions,Yte),2)}%
    f1: {round(100*metrics.f1_score(predictions,Yte),2)}%
    Accuracy: {round(100*metrics.accuracy_score(predictions,Yte),2)}%
    Balanced accuracy: {round(100*metrics.balanced_accuracy_score(predictions,Yte),2)}%
    Matthew corcorref: {round(100*metrics.matthews_corrcoef(predictions,Yte),2)}%
    ''')



    Precision: 0.0%
    Recall: 0.0%
    f1: 0.0%
    Accuracy: 77.0%
    Balanced accuracy: 77.0%
    Matthew corcorref: 0.0%
    
