In [53]:
import pandas as pd
import numpy as np
from pennylane import numpy as np
import pennylane as qml
from pennylane_qiskit import IBMQDevice
from pennylane_qiskit import BasicAerDevice
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import normalize


import time
start = time.time()

import warnings
warnings.filterwarnings('ignore')

In [54]:
# Dataset CSV load
df = pd.read_csv('fraud_detection_bank_dataset.csv', sep=',')

# Data type definition as float
df = df.astype(float)

# Drop of columns if necessary
df = df.drop(['Unnamed: 0'], axis = 1)

# Sample selection
df_sample = df.sample(2000)

In [55]:
# Review the information related to the dataframe

df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 2635 to 3461
Columns: 113 entries, col_0 to targets
dtypes: float64(113)
memory usage: 1.7 MB


In [56]:
# Table of the description of the dataframe related to fixed parameters

df_sample.describe()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111,targets
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,2.764,271.674,0.389,2.236,0.099,0.857,2.4415,2.764,0.0,0.0,...,0.0055,0.3385,0.0035,0.311,0.203,0.0,0.049,0.0295,43.2675,0.264
std,9.647167,561.9988,3.169969,9.564311,1.013762,3.773549,2.994839,9.647167,0.0,0.0,...,0.080455,0.473318,0.059072,0.463019,0.402333,0.0,0.215922,0.315721,65.151349,0.44091
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
50%,0.0,97.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0
75%,2.0,267.0,0.0,2.0,0.0,1.0,6.0,2.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,62.0,1.0
max,242.0,9076.0,84.0,259.0,40.0,144.0,8.0,242.0,0.0,0.0,...,2.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,747.0,1.0


In [57]:
# Train, test and validation split
train = df_sample.sample(frac = 0.7)
test = df_sample.drop(train.index).sample(frac = 0.5)
validate = df_sample.drop(train.index).drop(test.index)

In [58]:
train.shape

(1400, 113)

In [59]:
test.shape

(300, 113)

In [60]:
validate.shape

(300, 113)

In [61]:
# Separation of labels
x_train = train
y_train = train[['targets']]
x_test = test
y_test = test[['targets']]
x_validate = validate
y_validate = validate[['targets']]

In [62]:
# Hard split on the dataframe for the LDA

# Train split
features_a = x_train.iloc[:,:55]
features_b = x_train.iloc[:,55:]

# Test split
features_a_test = x_test.iloc[:,:55]
features_b_test = x_test.iloc[:,55:]

# Validate split
features_a_validate = x_validate.iloc[:,:55]
features_b_validate = x_validate.iloc[:,55:]

In [63]:
# LDA fit with the separated groups
lda1 = LDA(n_components=1, solver='svd').fit(features_a, y_train)
lda2 = LDA(n_components=1, solver='svd').fit(features_b, y_train)

In [64]:
# LDA train transformation
features_lda_1 = lda1.transform(features_a)
features_lda_2 = lda2.transform(features_b)

In [65]:
# LDA test transformation (using train fit)
features_lda_1_test = lda1.transform(features_a_test)
features_lda_2_test = lda2.transform(features_b_test)

In [66]:
# LDA validate transformation (using train fit)
features_lda_1_validate = lda1.transform(features_a_validate)
features_lda_2_validate = lda2.transform(features_b_validate)

In [67]:
# Arrays to dataframe for join in a single df
features_lda_1 = pd.DataFrame(features_lda_1)
features_lda_2 = pd.DataFrame(features_lda_2)
features_lda_1_test = pd.DataFrame(features_lda_1_test)
features_lda_2_test = pd.DataFrame(features_lda_2_test)
features_lda_1_validate = pd.DataFrame(features_lda_1_validate)
features_lda_2_validate = pd.DataFrame(features_lda_2_validate)

In [68]:
# Join of dataframes
x_train_lda = pd.concat([features_lda_1, features_lda_2], axis=1)
x_test_lda = pd.concat([features_lda_1_test, features_lda_2_test], axis=1)
x_validate_lda = pd.concat([features_lda_1_validate, features_lda_2_validate], axis=1)

In [69]:
# Second standard scaler normalization (using train fit)
std_scale = StandardScaler().fit(x_train_lda)
data = std_scale.transform(x_train_lda)
x_test_lda_n = std_scale.transform(x_test_lda)
x_validate_lda_n = std_scale.transform(x_test_lda)

In [70]:
# Dimensions definition for QML
n_dim = len(x_train_lda.columns)
n_dim

2

In [71]:
# Review the balance of the target variable in train

y_train.value_counts(normalize=True)*100

targets
0.0        74.714286
1.0        25.285714
dtype: float64

In [72]:
# Review the balance of the target variable in test

y_test.value_counts(normalize=True)*100

targets
0.0        73.333333
1.0        26.666667
dtype: float64

In [73]:
# Begin of Pennylane variational classifier

In [74]:
# Angle Encoding
num_qubits = n_dim

# Device seletion
dev = qml.device('default.qubit', wires = num_qubits, shots=1024)
#dev = qml.device('default.qubit.tf', wires = num_qubits, shots=1024)
#dev = qml.device('qiskit.ibmq', wires = num_qubits, backend='ibmq_manila', ibmqx_token="6cc75c58fc80fea56cb8dd391f8fbcfdb676a3dc7005493728bc9da7ea753e31a2110a01e3a0cc83f1a98f5ca79e32956fc66c11b5eea4cae163b3fa996be356", shots=256)
#dev = qml.device('qiskit.basicaer', wires = num_qubits, shots = 256)

@qml.qnode(dev)
def circuit(parameters, data):
    for i in range(num_qubits):
        qml.Hadamard(wires = i)
    
    AngleEmbedding(features = data, wires = range(num_qubits), rotation = 'Y')
    
    qml.StronglyEntanglingLayers(weights = parameters, wires = range(num_qubits))
    
    return qml.expval(qml.PauliZ(0))

In [75]:
num_layers = 5
weights_init = 0.01 * np.random.randn(num_layers, num_qubits, 3, requires_grad=True)
bias_init = np.array(0.0, requires_grad=True)

print(weights_init, bias_init)

[[[ 0.00607978 -0.00408812 -0.00016642]
  [ 0.00249037  0.00173503  0.0012574 ]]

 [[ 0.00223442  0.01169267 -0.0034114 ]
  [-0.01732298 -0.01619    -0.00695151]]

 [[ 0.00088427  0.00552891 -0.00915112]
  [-0.00354648  0.00651411  0.00980091]]

 [[ 0.00591232 -0.01868129 -0.00521649]
  [-0.00268737  0.00131551 -0.00090814]]

 [[-0.01546776 -0.00934685 -0.01062303]
  [-0.00938344  0.00729765  0.00525423]]] 0.0


In [76]:
circuit(weights_init, data[0])

tensor(0.05859375, requires_grad=True)

In [77]:
def variational_classifier(weights, bias, x):
    return circuit(weights, x) + bias

In [78]:
def square_loss(labels, predictions):
    loss = 0
    for l, p in zip(labels, predictions):
        loss = loss + (l - p) ** 2

    loss = loss / len(labels)
    return loss

In [79]:
def accuracy(labels, predictions):

    loss = 0
    for l, p in zip(labels, predictions):
        if abs(l - p) < 1e-5:
            loss = loss + 1
    loss = loss / len(labels)

    return loss

In [80]:
def cost(weights, bias, X, Y):
    predictions = [variational_classifier(weights, bias, x) for x in X]
    return square_loss(Y, predictions)

In [81]:
Y = np.array(y_train.values[:,0] * 2 - np.ones(len(y_train.values[:,0])), requires_grad = False)  # shift label from {0, 1} to {-1, 1}
X = np.array(data, requires_grad=False)

for i in range(5):
    print("X = {}, Y = {: d}".format(list(X[i]), int(Y[i])))

X = [tensor(0.57763142, requires_grad=False), tensor(0.08617211, requires_grad=False)], Y = -1
X = [tensor(0.56883442, requires_grad=False), tensor(-0.14983753, requires_grad=False)], Y = -1
X = [tensor(0.48586538, requires_grad=False), tensor(-0.71935856, requires_grad=False)], Y = -1
X = [tensor(-0.5921235, requires_grad=False), tensor(-0.12414815, requires_grad=False)], Y = -1
X = [tensor(0.86166586, requires_grad=False), tensor(1.67787148, requires_grad=False)], Y =  1


In [82]:
opt = AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08)
batch_size = 10

In [83]:
weights = weights_init
bias = bias_init

wbest = 0
bbest = 0
abest = 0

for it in range(20):

    # weights update by one optimizer step

    batch_index = np.random.randint(0, len(X), (batch_size,))
    X_batch = X[batch_index]
    Y_batch = Y[batch_index]
    weights, bias, _, _ = opt.step(cost, weights, bias, X_batch, Y_batch)

    # Compute the accuracy
    predictions = [np.sign(variational_classifier(weights, bias, x)) for x in X]
    
    if accuracy(Y, predictions) > abest:
        wbest = weights
        bbest = bias
        abest = accuracy(Y, predictions)
        print('New best')

    acc = accuracy(Y, predictions)

    print(
        "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), acc
        )
    )

New best
Iter:     1 | Cost: 0.7271737 | Accuracy: 0.7635714 
New best
Iter:     2 | Cost: 0.5904514 | Accuracy: 0.8085714 
New best
Iter:     3 | Cost: 0.5353814 | Accuracy: 0.8300000 
New best
Iter:     4 | Cost: 0.5059354 | Accuracy: 0.8357143 
New best
Iter:     5 | Cost: 0.4892570 | Accuracy: 0.8421429 
New best
Iter:     6 | Cost: 0.4609409 | Accuracy: 0.8514286 
New best
Iter:     7 | Cost: 0.4394666 | Accuracy: 0.8578571 
New best
Iter:     8 | Cost: 0.4177787 | Accuracy: 0.8657143 
New best
Iter:     9 | Cost: 0.4073585 | Accuracy: 0.8735714 
Iter:    10 | Cost: 0.4073844 | Accuracy: 0.8735714 
Iter:    11 | Cost: 0.4130709 | Accuracy: 0.8728571 
Iter:    12 | Cost: 0.4069243 | Accuracy: 0.8728571 
New best
Iter:    13 | Cost: 0.4076166 | Accuracy: 0.8771429 
New best
Iter:    14 | Cost: 0.4434968 | Accuracy: 0.8821429 
New best
Iter:    15 | Cost: 0.4199666 | Accuracy: 0.8835714 
Iter:    16 | Cost: 0.3899834 | Accuracy: 0.8792857 
Iter:    17 | Cost: 0.3847866 | Accuracy: 0.

In [84]:
# Testing set preparation
Yte = np.array(y_test.values[:,0] * 2 - np.ones(len(y_test.values[:,0])), requires_grad = False)
Xte = np.array(normalize(x_test_lda_n), requires_grad=False)

In [85]:
# Outcome on test set
predictions = [np.sign(variational_classifier(wbest, bbest, x)) for x in Xte]
pred = [np.sign(variational_classifier(wbest, bbest, x)) for x in X]
acc = accuracy(Yte, predictions)

print(f'Cost: {cost(wbest, bbest, Xte, Yte)}, Accuracy: {np.round(acc, 2) * 100}%')

Cost: 0.5593412262119379, Accuracy: 83.0%


In [86]:
# Test and predictions comparison
pd.DataFrame((Yte, predictions), ('Test', 'Predictions')).T

Unnamed: 0,Test,Predictions
0,-1.0,-1.0
1,-1.0,-1.0
2,-1.0,-1.0
3,-1.0,-1.0
4,-1.0,-1.0
...,...,...
295,-1.0,-1.0
296,-1.0,-1.0
297,1.0,1.0
298,1.0,-1.0


In [87]:
# Print the classification report and important metrics
print(metrics.classification_report(Yte, predictions))
print(metrics.precision_score(Yte, predictions))
print(metrics.recall_score(Yte, predictions))
print(metrics.f1_score(Yte, predictions))
print(metrics.balanced_accuracy_score(Yte, predictions))

              precision    recall  f1-score   support

        -1.0       0.82      0.98      0.89       220
         1.0       0.87      0.42      0.57        80

    accuracy                           0.83       300
   macro avg       0.85      0.70      0.73       300
weighted avg       0.84      0.83      0.81       300

0.8717948717948718
0.425
0.5714285714285714
0.7011363636363637


In [88]:
# Validation set preparation
Vte = np.array(y_validate.values[:,0] * 2 - np.ones(len(y_validate.values[:,0])), requires_grad = False)
Xte = np.array(normalize(x_validate_lda_n), requires_grad=False)

In [89]:
# Outcome on validation set
acc = accuracy(Vte, predictions)

print(f'Cost: {cost(wbest, bbest, Xte, Vte)}, Accuracy: {np.round(acc, 2) * 100}%')

Cost: 0.922790157644559, Accuracy: 66.0%


In [90]:
# Test and predictions comparison
pd.DataFrame((Vte, predictions), ('Test', 'Predictions')).T

Unnamed: 0,Test,Predictions
0,-1.0,-1.0
1,-1.0,-1.0
2,1.0,-1.0
3,1.0,-1.0
4,1.0,-1.0
...,...,...
295,1.0,-1.0
296,-1.0,-1.0
297,1.0,1.0
298,-1.0,-1.0


In [91]:
# Print the classification report and important metrics
print(metrics.classification_report(Vte, predictions))
print(metrics.precision_score(Vte, predictions))
print(metrics.recall_score(Vte, predictions))
print(metrics.f1_score(Vte, predictions))
print(metrics.balanced_accuracy_score(Vte, predictions))

              precision    recall  f1-score   support

        -1.0       0.70      0.89      0.78       206
         1.0       0.41      0.17      0.24        94

    accuracy                           0.66       300
   macro avg       0.56      0.53      0.51       300
weighted avg       0.61      0.66      0.61       300

0.41025641025641024
0.1702127659574468
0.24060150375939848
0.5292811402602768
