In [52]:
import pandas as pd
import numpy as np
#from pennylane import numpy as np
import pennylane as qml
from pennylane_qiskit import IBMQDevice
from pennylane_qiskit import BasicAerDevice
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import normalize


import time
start = time.time()

import warnings
warnings.filterwarnings('ignore')

In [53]:
# Dataset CSV load
df = pd.read_csv('fraud_detection_bank_dataset.csv', sep=',')

# Data type definition as float
df = df.astype(float)

# Drop of columns if necessary
df = df.drop(['Unnamed: 0'], axis = 1)

# Sample selection
df_sample = df.sample(1000)

In [54]:
# Review the information related to the dataframe

df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 11973 to 18291
Columns: 113 entries, col_0 to targets
dtypes: float64(113)
memory usage: 890.6 KB


In [55]:
# Table of the description of the dataframe related to fixed parameters

df_sample.describe()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111,targets
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3.121,291.176,0.342,2.56,0.091,0.962,2.44,3.121,0.0,0.0,...,0.003,0.353,0.004,0.315,0.202,0.0,0.05,0.03,45.021,0.286
std,9.66829,550.865203,2.343614,9.316394,0.634919,4.456952,3.027799,9.66829,0.0,0.0,...,0.054717,0.478142,0.063151,0.464748,0.401693,0.0,0.218054,0.383728,63.333606,0.452115
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
50%,0.0,99.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0
75%,2.0,298.75,0.0,2.0,0.0,1.0,6.0,2.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,63.0,1.0
max,143.0,6036.0,47.0,179.0,15.0,125.0,8.0,143.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,11.0,728.0,1.0


In [56]:
# Train, test and validation split
train = df_sample.sample(frac = 0.7)
test = df_sample.drop(train.index).sample(frac = 0.5)
validate = df_sample.drop(train.index).drop(test.index)

In [57]:
train.shape

(700, 113)

In [58]:
test.shape

(150, 113)

In [59]:
validate.shape

(150, 113)

In [60]:
# Separation of labels
x_train = train
y_train = train[['targets']]
x_test = test
y_test = test[['targets']]
x_validate = validate
y_validate = validate[['targets']]

In [61]:
x_train = x_train.drop(['targets'], axis = 1)
x_test = x_test.drop(['targets'], axis = 1)
x_validate = x_validate.drop(['targets'], axis = 1)

In [62]:
x_train == 0

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_102,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111
20053,False,False,True,False,True,True,False,False,True,True,...,False,True,True,True,True,True,True,True,True,False
916,True,False,True,False,True,False,True,True,True,True,...,True,True,False,True,True,True,True,True,True,False
16541,False,False,True,True,True,False,False,False,True,True,...,True,True,True,True,False,False,True,True,False,False
19831,True,False,True,False,True,True,False,True,True,True,...,True,True,False,True,False,False,True,False,True,False
12311,True,False,True,False,True,True,False,True,True,True,...,True,True,True,True,True,True,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16488,True,False,True,False,True,False,False,True,True,True,...,False,True,False,True,True,True,True,True,True,False
14627,True,False,False,True,True,True,False,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3879,True,False,True,True,True,False,False,True,True,True,...,True,True,True,True,True,True,True,True,True,True
552,True,False,True,False,True,False,False,True,True,True,...,False,True,True,True,False,True,True,True,True,False


In [63]:
(x_train == 0).all()

col_0      False
col_1      False
col_2      False
col_3      False
col_4      False
           ...  
col_107    False
col_108     True
col_109    False
col_110    False
col_111    False
Length: 112, dtype: bool

In [64]:
drop_0 = x_train.columns[(x_train == 0).all()]

In [65]:
drop_0

Index(['col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_18', 'col_19',
       'col_20', 'col_21', 'col_35', 'col_40', 'col_51', 'col_52', 'col_53',
       'col_58', 'col_70', 'col_71', 'col_108'],
      dtype='object')

In [66]:
corr_matrix = x_train.corr().abs()

In [67]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [68]:
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [69]:
to_drop

['col_7', 'col_22', 'col_54', 'col_60', 'col_65']

In [70]:
x_train.drop(['col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_18', 'col_19','col_20', 'col_21', 'col_35', 'col_51', 'col_52', 'col_53', 'col_70','col_71','col_7', 'col_22', 'col_54', 'col_56'], axis = 1)
x_test.drop(['col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_18', 'col_19','col_20', 'col_21', 'col_35', 'col_51', 'col_52', 'col_53', 'col_70','col_71','col_7', 'col_22', 'col_54', 'col_56'], axis = 1)
x_validate.drop(['col_8', 'col_9', 'col_10', 'col_11', 'col_12', 'col_18', 'col_19','col_20', 'col_21', 'col_35', 'col_51', 'col_52', 'col_53', 'col_70','col_71','col_7', 'col_22', 'col_54', 'col_56'], axis = 1)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_13,col_14,col_15,...,col_102,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111
4327,0.0,41.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0
2227,0.0,35.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12925,2.0,595.0,0.0,0.0,0.0,15.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,169.0
17338,2.0,3161.0,0.0,81.0,0.0,1.0,6.0,0.0,1.0,0.0,...,5.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,156.0
6799,0.0,149.0,0.0,2.0,0.0,0.0,6.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7985,0.0,96.0,0.0,4.0,0.0,0.0,8.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
5440,4.0,1694.0,0.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
13990,0.0,306.0,0.0,3.0,0.0,0.0,7.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0
3495,8.0,618.0,0.0,1.0,0.0,1.0,7.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,157.0


In [71]:
# Hard split on the dataframe for the LDA

# Train split
features_a = x_train.iloc[:,:50]
features_b = x_train.iloc[:,50:]

# Test split
features_a_test = x_test.iloc[:,:50]
features_b_test = x_test.iloc[:,50:]

# Validate split
features_a_validate = x_validate.iloc[:,:50]
features_b_validate = x_validate.iloc[:,50:]

In [72]:
# LDA fit with the separated groups
lda1 = LDA(n_components=1, solver='svd').fit(features_a, y_train)
lda2 = LDA(n_components=1, solver='svd').fit(features_b, y_train)

In [73]:
# LDA train transformation
features_lda_1 = lda1.transform(features_a)
features_lda_2 = lda2.transform(features_b)

In [74]:
# LDA test transformation (using train fit)
features_lda_1_test = lda1.transform(features_a_test)
features_lda_2_test = lda2.transform(features_b_test)

In [75]:
# LDA validate transformation (using train fit)
features_lda_1_validate = lda1.transform(features_a_validate)
features_lda_2_validate = lda2.transform(features_b_validate)

In [76]:
# Arrays to dataframe for join in a single df
features_lda_1 = pd.DataFrame(features_lda_1)
features_lda_2 = pd.DataFrame(features_lda_2)
features_lda_1_test = pd.DataFrame(features_lda_1_test)
features_lda_2_test = pd.DataFrame(features_lda_2_test)
features_lda_1_validate = pd.DataFrame(features_lda_1_validate)
features_lda_2_validate = pd.DataFrame(features_lda_2_validate)

In [77]:
# Join of dataframes
x_train_lda = pd.concat([features_lda_1, features_lda_2], axis=1)
x_test_lda = pd.concat([features_lda_1_test, features_lda_2_test], axis=1)
x_validate_lda = pd.concat([features_lda_1_validate, features_lda_2_validate], axis=1)

In [78]:
# Second standard scaler normalization (using train fit)
std_scale = StandardScaler().fit(x_train_lda)
data = std_scale.transform(x_train_lda)
x_test_lda_n = std_scale.transform(x_test_lda)
x_validate_lda_n = std_scale.transform(x_test_lda)

In [79]:
# Dimensions definition for QML
n_dim = len(x_train_lda.columns)
n_dim

2

In [80]:
# Review the balance of the target variable in train

y_train.value_counts(normalize=True)*100

targets
0.0        72.0
1.0        28.0
dtype: float64

In [81]:
# Review the balance of the target variable in test

y_test.value_counts(normalize=True)*100

targets
0.0        71.333333
1.0        28.666667
dtype: float64

In [82]:
# Begin of Pennylane variational classifier

In [83]:
from pennylane import numpy as np

In [84]:
# Angle Encoding
num_qubits = n_dim

# Device seletion
dev = qml.device('default.qubit', wires = num_qubits, shots=1024)
#dev = qml.device('lightning.qubit',wires=1)
#dev = qml.device('default.qubit.tf', wires = num_qubits, shots=1024)
#dev = qml.device('qiskit.ibmq', wires = num_qubits, backend='ibmq_manila', ibmqx_token="6cc75c58fc80fea56cb8dd391f8fbcfdb676a3dc7005493728bc9da7ea753e31a2110a01e3a0cc83f1a98f5ca79e32956fc66c11b5eea4cae163b3fa996be356", shots=256)
#dev = qml.device('qiskit.basicaer', wires = num_qubits, shots = 256)

@qml.qnode(dev)
def circuit(parameters, data):
    for i in range(num_qubits):
        qml.Hadamard(wires = i)
    
    AngleEmbedding(features = data, wires = range(num_qubits), rotation = 'Y')
    
    qml.StronglyEntanglingLayers(weights = parameters, wires = range(num_qubits))
    
    return qml.expval(qml.PauliZ(0))

In [85]:
num_layers = 5
weights_init = 0.01 * np.random.randn(num_layers, num_qubits, 3, requires_grad=True)
bias_init = np.array(0.0, requires_grad=True)

print(weights_init, bias_init)

[[[ 0.01513306 -0.00864158 -0.00682896]
  [ 0.00098522 -0.00348892  0.00446006]]

 [[ 0.01209854 -0.01041733  0.00381764]
  [-0.00805414  0.00474816  0.01656622]]

 [[ 0.00368248  0.01745879 -0.0079473 ]
  [ 0.00223455  0.0140396  -0.00360985]]

 [[ 0.00561606 -0.00135279  0.00335324]
  [-0.02749777 -0.0054078   0.00415383]]

 [[-0.02212797 -0.0160376   0.00729891]
  [ 0.010323   -0.00530105 -0.00171464]]] 0.0


In [86]:
circuit(weights_init, data[0])

tensor(0.25195312, requires_grad=True)

In [87]:
def variational_classifier(weights, bias, x):
    return circuit(weights, x) + bias

In [88]:
def square_loss(labels, predictions):
    loss = 0
    for l, p in zip(labels, predictions):
        loss = loss + (l - p) ** 2

    loss = loss / len(labels)
    return loss

In [89]:
def accuracy(labels, predictions):

    loss = 0
    for l, p in zip(labels, predictions):
        if abs(l - p) < 1e-5:
            loss = loss + 1
    loss = loss / len(labels)

    return loss

In [90]:
def cost(weights, bias, X, Y):
    predictions = [variational_classifier(weights, bias, x) for x in X]
    return square_loss(Y, predictions)

In [91]:
Y = np.array(y_train.values[:,0] * 2 - np.ones(len(y_train.values[:,0])), requires_grad = False)  # shift label from {0, 1} to {-1, 1}
X = np.array(data, requires_grad=False)

for i in range(5):
    print("X = {}, Y = {: d}".format(list(X[i]), int(Y[i])))

X = [tensor(-0.42000757, requires_grad=False), tensor(-0.61994393, requires_grad=False)], Y = -1
X = [tensor(0.37409183, requires_grad=False), tensor(0.47564962, requires_grad=False)], Y =  1
X = [tensor(0.0005144, requires_grad=False), tensor(-0.49755209, requires_grad=False)], Y = -1
X = [tensor(1.38170242, requires_grad=False), tensor(1.35099813, requires_grad=False)], Y = -1
X = [tensor(-0.65868933, requires_grad=False), tensor(-0.06605325, requires_grad=False)], Y = -1


In [92]:
opt = AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08)
batch_size = 10

In [93]:
weights = weights_init
bias = bias_init

wbest = 0
bbest = 0
abest = 0

for it in range(20):

    # weights update by one optimizer step

    batch_index = np.random.randint(0, len(X), (batch_size,))
    X_batch = X[batch_index]
    Y_batch = Y[batch_index]
    weights, bias, _, _ = opt.step(cost, weights, bias, X_batch, Y_batch)

    # Compute the accuracy
    predictions = [np.sign(variational_classifier(weights, bias, x)) for x in X]
    
    if accuracy(Y, predictions) > abest:
        wbest = weights
        bbest = bias
        abest = accuracy(Y, predictions)
        print('New best')

    acc = accuracy(Y, predictions)

    print(
        "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), acc
        )
    )

New best
Iter:     1 | Cost: 0.7288561 | Accuracy: 0.7600000 
New best
Iter:     2 | Cost: 0.5752185 | Accuracy: 0.8157143 
New best
Iter:     3 | Cost: 0.5404956 | Accuracy: 0.8357143 
Iter:     4 | Cost: 0.5242447 | Accuracy: 0.8271429 
Iter:     5 | Cost: 0.5118443 | Accuracy: 0.8342857 
New best
Iter:     6 | Cost: 0.4832604 | Accuracy: 0.8428571 
New best
Iter:     7 | Cost: 0.4808237 | Accuracy: 0.8528571 
Iter:     8 | Cost: 0.5247058 | Accuracy: 0.8485714 
Iter:     9 | Cost: 0.6036741 | Accuracy: 0.8414286 
Iter:    10 | Cost: 0.6528697 | Accuracy: 0.8385714 
Iter:    11 | Cost: 0.6612754 | Accuracy: 0.8371429 
Iter:    12 | Cost: 0.7084526 | Accuracy: 0.8257143 
Iter:    13 | Cost: 0.6810647 | Accuracy: 0.8300000 
Iter:    14 | Cost: 0.6105049 | Accuracy: 0.8428571 
Iter:    15 | Cost: 0.5372625 | Accuracy: 0.8514286 
New best
Iter:    16 | Cost: 0.5369332 | Accuracy: 0.8542857 
Iter:    17 | Cost: 0.6440908 | Accuracy: 0.8385714 
Iter:    18 | Cost: 0.7708714 | Accuracy: 0.7

In [94]:
# Testing set preparation
Yte = np.array(y_test.values[:,0] * 2 - np.ones(len(y_test.values[:,0])), requires_grad = False)
Xte = np.array(normalize(x_test_lda_n), requires_grad=False)

In [95]:
# Outcome on test set
predictions = [np.sign(variational_classifier(wbest, bbest, x)) for x in Xte]
pred = [np.sign(variational_classifier(wbest, bbest, x)) for x in X]
acc = accuracy(Yte, predictions)

print(f'Cost: {cost(wbest, bbest, Xte, Yte)}, Accuracy: {np.round(acc, 2) * 100}%')

Cost: 0.6625894539740345, Accuracy: 83.0%


In [96]:
# Test and predictions comparison
pd.DataFrame((Yte, predictions), ('Test', 'Predictions')).T

Unnamed: 0,Test,Predictions
0,1.0,-1.0
1,-1.0,-1.0
2,-1.0,-1.0
3,1.0,1.0
4,-1.0,-1.0
...,...,...
145,-1.0,-1.0
146,-1.0,-1.0
147,-1.0,-1.0
148,-1.0,-1.0


In [97]:
# Print the classification report and important metrics
print(metrics.classification_report(Yte, predictions))
print(metrics.precision_score(Yte, predictions))
print(metrics.recall_score(Yte, predictions))
print(metrics.f1_score(Yte, predictions))
print(metrics.balanced_accuracy_score(Yte, predictions))

              precision    recall  f1-score   support

        -1.0       0.82      0.98      0.89       107
         1.0       0.91      0.47      0.62        43

    accuracy                           0.83       150
   macro avg       0.86      0.72      0.75       150
weighted avg       0.85      0.83      0.81       150

0.9090909090909091
0.46511627906976744
0.6153846153846153
0.7232123451423603


In [98]:
# Get the predictions based on the optimized weights and bias
final_predictions = [variational_classifier(weights, bias, x) for x in Xte]
# Get the predictions within the range 0-1 so that they represent a probability
probability_class0 = (final_predictions-bias + np.ones(len(final_predictions)))/2
#probability_class0 = (predictions + np.ones(len(predictions)))/2

# Print the probability for each sample
print(probability_class0)

[0.32128906 0.390625   0.82128906 0.97949219 0.53710938 0.40722656
 0.97265625 0.35839844 0.42480469 0.98339844 0.31835938 0.97851562
 0.453125   0.46777344 0.98242188 0.27734375 0.41308594 0.6796875
 0.36914062 0.28417969 0.4609375  0.71972656 0.74902344 0.50878906
 0.4609375  0.9453125  0.79101562 0.36132812 0.75683594 0.37402344
 0.3125     0.40722656 0.52539062 0.31640625 0.94433594 0.26269531
 0.51953125 0.24902344 0.33886719 0.37207031 0.53320312 0.42675781
 0.69824219 0.54980469 0.97363281 0.69824219 0.67480469 0.30273438
 0.38085938 0.38378906 0.41992188 0.57617188 0.41894531 0.84472656
 0.93554688 0.4765625  0.31152344 0.98046875 0.7421875  0.69238281
 0.6015625  0.49414062 0.68164062 0.71386719 0.44726562 0.94042969
 0.48046875 0.70996094 0.26269531 0.3125     0.8828125  0.5625
 0.71386719 0.39160156 0.98632812 0.70507812 0.46679688 0.42675781
 0.49902344 0.38867188 0.70996094 0.94238281 0.52246094 0.97949219
 0.52148438 0.55273438 0.69433594 0.53710938 0.50488281 0.52539062


In [99]:
probability_class_0 = pd.DataFrame(probability_class0)

In [100]:
print(probability_class_0)

            0
0    0.321289
1    0.390625
2    0.821289
3    0.979492
4    0.537109
..        ...
145  0.330078
146  0.421875
147  0.818359
148  0.676758
149  0.354492

[150 rows x 1 columns]
