In [138]:
import pandas as pd
import numpy as np
from pennylane import numpy as np
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

import pennylane as qml
from pennylane_qiskit import IBMQDevice
from pennylane_qiskit import BasicAerDevice
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


import time
start = time.time()

In [139]:
# Read out CSV and sets/samples creation

df = pd.read_csv('fraud_detection_bank_dataset.csv', sep=',')
df = df.astype(float)
df = df.drop(['Unnamed: 0'], axis = 1)
df_sample = df.sample(2000)
train,test = train_test_split(df_sample, test_size=0.30, random_state=10)
train_set = train
test_set = test
np.random.seed(42)

In [140]:
# Review the information related to the dataframe

df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 10214 to 13822
Columns: 113 entries, col_0 to targets
dtypes: float64(113)
memory usage: 1.7 MB


In [141]:
# Table of the description of the dataframe related to fixed parameters

df_sample.describe()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111,targets
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.288,292.928,0.3,2.379,0.0855,0.9745,2.29,3.288,0.0,0.0,...,0.004,0.366,0.0015,0.307,0.197,0.0,0.0465,0.033,45.0925,0.2775
std,14.770584,722.829477,2.342166,8.138641,0.741933,3.368757,2.997564,14.770584,0.0,0.0,...,0.063135,0.48183,0.03871,0.461365,0.397832,0.0,0.210618,0.382079,59.864654,0.447877
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,37.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
50%,0.0,93.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0
75%,1.0,270.5,0.0,2.0,0.0,1.0,6.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,62.0,1.0
max,368.0,19687.0,86.0,183.0,26.0,78.0,11.0,368.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,10.0,502.0,1.0


In [142]:
# Separation of labels

x_train = train_set
y_train = train_set[['targets']]

x_test = test_set
y_test = test_set[['targets']]

In [143]:
features_a = x_train.iloc[:,:55]
features_b = x_train.iloc[:,55:]

In [144]:
features_a_test = x_test.iloc[:,:55]
features_b_test = x_test.iloc[:,55:]

In [145]:
lda = LDA(n_components=1)
features_lda_1 = lda.fit_transform(features_a, y_train)
features_lda_2 = lda.fit_transform(features_b, y_train)

In [146]:
features_lda_1_test = lda.fit_transform(features_a_test, y_test)
features_lda_2_test = lda.fit_transform(features_b_test, y_test)

In [147]:
features_lda_1 = pd.DataFrame(features_lda_1)
features_lda_2 = pd.DataFrame(features_lda_2)
features_lda_1_test = pd.DataFrame(features_lda_1_test)
features_lda_2_test = pd.DataFrame(features_lda_2_test)

In [148]:
x_train_lda = features_lda_1.join(features_lda_2, lsuffix="_left", rsuffix="_right")
x_test_lda = features_lda_1_test.join(features_lda_2_test, lsuffix="_left", rsuffix="_right")

In [149]:
x_train_lda

Unnamed: 0,0_left,0_right
0,0.477842,-0.286121
1,-0.876101,-0.603912
2,-1.039679,-0.840358
3,1.789524,3.625557
4,-0.827172,-1.663958
...,...,...
1395,2.047377,-0.173438
1396,-0.684520,-1.561943
1397,-0.367997,0.290284
1398,-0.945484,-0.603912


In [150]:
x_test_lda

Unnamed: 0,0_left,0_right
0,-0.549145,-1.277229
1,-1.386657,0.107849
2,0.833744,0.011079
3,-0.091864,-1.277316
4,-1.353161,0.107849
...,...,...
595,0.895299,0.102784
596,-1.027923,-0.286958
597,-1.057668,-0.806670
598,-1.679386,-0.714452


In [151]:
# Normalize

std_scale = StandardScaler().fit(x_train_lda)
data = std_scale.transform(x_train_lda)

std_scale = StandardScaler().fit(x_test_lda)
x_test_lda_n = std_scale.transform(x_test_lda)

In [152]:
x_test_lda_n

array([[-0.49520369, -0.94852857],
       [-1.25044871,  0.08009347],
       [ 0.75184743,  0.00822801],
       ...,
       [-0.9537753 , -0.59907011],
       [-1.51442363, -0.53058467],
       [ 0.38198046,  0.08009347]])

In [153]:
n_dim = 2

In [154]:
# Review the balance of the target variable in train

y_train.value_counts(normalize=True)*100

targets
0.0        72.714286
1.0        27.285714
dtype: float64

In [155]:
# Review the balance of the target variable in test

y_test.value_counts(normalize=True)*100

targets
0.0        71.166667
1.0        28.833333
dtype: float64

In [156]:
# Angle Encoding

num_qubits = n_dim

dev = qml.device('default.qubit', wires = num_qubits, shots=1024)
#dev = qml.device('default.qubit.tf', wires = num_qubits, shots=1024)
#dev = qml.device('qiskit.ibmq', wires = num_qubits, backend='ibmq_manila', ibmqx_token="6cc75c58fc80fea56cb8dd391f8fbcfdb676a3dc7005493728bc9da7ea753e31a2110a01e3a0cc83f1a98f5ca79e32956fc66c11b5eea4cae163b3fa996be356", shots=256)
#dev = qml.device('qiskit.basicaer', wires = num_qubits, shots = 256)

@qml.qnode(dev)
def circuit(parameters, data):
    for i in range(num_qubits):
        qml.Hadamard(wires = i)
    
    AngleEmbedding(features = data, wires = range(num_qubits), rotation = 'Y')
    
    qml.StronglyEntanglingLayers(weights = parameters, wires = range(num_qubits))
    
    return qml.expval(qml.PauliZ(0))

In [157]:
num_layers = 5
weights_init = 0.01 * np.random.randn(num_layers, num_qubits, 3, requires_grad=True)
bias_init = np.array(0.0, requires_grad=True)

print(weights_init, bias_init)

[[[ 0.00496714 -0.00138264  0.00647689]
  [ 0.0152303  -0.00234153 -0.00234137]]

 [[ 0.01579213  0.00767435 -0.00469474]
  [ 0.0054256  -0.00463418 -0.0046573 ]]

 [[ 0.00241962 -0.0191328  -0.01724918]
  [-0.00562288 -0.01012831  0.00314247]]

 [[-0.00908024 -0.01412304  0.01465649]
  [-0.00225776  0.00067528 -0.01424748]]

 [[-0.00544383  0.00110923 -0.01150994]
  [ 0.00375698 -0.00600639 -0.00291694]]] 0.0


In [158]:
circuit(weights_init, data[0])

tensor(-0.06445312, requires_grad=True)

In [159]:
def variational_classifier(weights, bias, x):
    return circuit(weights, x) + bias

In [160]:
def square_loss(labels, predictions):
    loss = 0
    for l, p in zip(labels, predictions):
        loss = loss + (l - p) ** 2

    loss = loss / len(labels)
    return loss

In [161]:
def accuracy(labels, predictions):

    loss = 0
    for l, p in zip(labels, predictions):
        if abs(l - p) < 1e-5:
            loss = loss + 1
    loss = loss / len(labels)

    return loss

In [162]:
def cost(weights, bias, X, Y):
    predictions = [variational_classifier(weights, bias, x) for x in X]
    return square_loss(Y, predictions)

In [163]:
Y = np.array(y_train.values[:,0] * 2 - np.ones(len(y_train.values[:,0])), requires_grad = False)  # shift label from {0, 1} to {-1, 1}
X = np.array(data, requires_grad=False)

for i in range(5):
    print("X = {}, Y = {: d}".format(list(X[i]), int(Y[i])))

X = [tensor(0.43425169, requires_grad=False), tensor(-0.21131303, requires_grad=False)], Y = -1
X = [tensor(-0.79617925, requires_grad=False), tensor(-0.44601562, requires_grad=False)], Y = -1
X = [tensor(-0.94483513, requires_grad=False), tensor(-0.62064063, requires_grad=False)], Y = -1
X = [tensor(1.62627636, requires_grad=False), tensor(2.67763153, requires_grad=False)], Y =  1
X = [tensor(-0.75171383, requires_grad=False), tensor(-1.22890557, requires_grad=False)], Y = -1


In [164]:
opt = AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08)
batch_size = 10

In [165]:
weights = weights_init
bias = bias_init

wbest = 0
bbest = 0
abest = 0

for it in range(50):

    # weights update by one optimizer step

    batch_index = np.random.randint(0, len(X), (batch_size,))
    X_batch = X[batch_index]
    Y_batch = Y[batch_index]
    weights, bias, _, _ = opt.step(cost, weights, bias, X_batch, Y_batch)

    # Compute the accuracy
    predictions = [np.sign(variational_classifier(weights, bias, x)) for x in X]
    
    if accuracy(Y, predictions) > abest:
        wbest = weights
        bbest = bias
        abest = accuracy(Y, predictions)
        print('New best')

    acc = accuracy(Y, predictions)

    print(
        "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
            it + 1, cost(weights, bias, X, Y), acc
        )
    )

New best
Iter:     1 | Cost: 0.7766051 | Accuracy: 0.7250000 
New best
Iter:     2 | Cost: 0.6020277 | Accuracy: 0.8000000 
New best
Iter:     3 | Cost: 0.5430088 | Accuracy: 0.8200000 
New best
Iter:     4 | Cost: 0.6133099 | Accuracy: 0.8242857 
New best
Iter:     5 | Cost: 0.5682896 | Accuracy: 0.8328571 
New best
Iter:     6 | Cost: 0.5139809 | Accuracy: 0.8385714 
New best
Iter:     7 | Cost: 0.4862393 | Accuracy: 0.8457143 
New best
Iter:     8 | Cost: 0.4849819 | Accuracy: 0.8485714 
New best
Iter:     9 | Cost: 0.4730128 | Accuracy: 0.8492857 
New best
Iter:    10 | Cost: 0.4595886 | Accuracy: 0.8535714 
Iter:    11 | Cost: 0.4523207 | Accuracy: 0.8478571 
Iter:    12 | Cost: 0.4455806 | Accuracy: 0.8507143 
Iter:    13 | Cost: 0.4476530 | Accuracy: 0.8528571 
New best
Iter:    14 | Cost: 0.4501227 | Accuracy: 0.8550000 
New best
Iter:    15 | Cost: 0.4498906 | Accuracy: 0.8571429 
Iter:    16 | Cost: 0.4526023 | Accuracy: 0.8571429 
Iter:    17 | Cost: 0.4800448 | Accuracy: 0.

In [166]:
Yte = np.array(y_test.values[:,0] * 2 - np.ones(len(y_test.values[:,0])), requires_grad = False)
Xte = np.array(normalize(x_test_lda_n), requires_grad=False)

In [167]:
predictions = [np.sign(variational_classifier(wbest, bbest, x)) for x in Xte]
pred = [np.sign(variational_classifier(wbest, bbest, x)) for x in X]
acc = accuracy(Yte, predictions)

print(f'Cost: {cost(wbest, bbest, Xte, Yte)}, Accuracy: {np.round(acc, 2) * 100}%')

Cost: 0.5347601293965113, Accuracy: 83.0%


In [168]:
pd.DataFrame((predictions, Yte), ('Predictions', 'Test')).T

Unnamed: 0,Predictions,Test
0,-1.0,-1.0
1,-1.0,-1.0
2,1.0,-1.0
3,-1.0,-1.0
4,-1.0,-1.0
...,...,...
595,1.0,-1.0
596,-1.0,-1.0
597,-1.0,-1.0
598,-1.0,-1.0


In [169]:
# Print the classification report and important metrics

print(metrics.classification_report(predictions,Yte))
print(metrics.precision_score(predictions,Yte))
print(metrics.recall_score(predictions,Yte))
print(metrics.f1_score(predictions,Yte))
print(metrics.balanced_accuracy_score(predictions,Yte))
print(metrics.confusion_matrix(predictions,Yte))

              precision    recall  f1-score   support

        -1.0       0.79      0.96      0.87       352
         1.0       0.92      0.65      0.76       248

    accuracy                           0.83       600
   macro avg       0.86      0.80      0.82       600
weighted avg       0.85      0.83      0.82       600

0.9248554913294798
0.6451612903225806
0.7600950118764846
0.8041147360703812
[[339  13]
 [ 88 160]]


In [170]:
#Classical ML approach

import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

plt.style.use('ggplot')

In [171]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

In [172]:
# Classic ML results

results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.785000 (0.036485)
KNN: 0.817857 (0.017276)
CART: 1.000000 (0.000000)
NB: 0.406429 (0.032016)
SVM: 0.727143 (0.021381)


In [173]:
# Classic ML results

results = []
names = []
scoring = 'balanced_accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.725856 (0.054155)
KNN: 0.748267 (0.027442)
CART: 1.000000 (0.000000)
NB: 0.574042 (0.019213)
SVM: 0.500000 (0.000000)


In [174]:
# Classic ML results

results = []
names = []
scoring = 'recall'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.596823 (0.120502)
KNN: 0.595380 (0.056029)
CART: 1.000000 (0.000000)
NB: 0.942471 (0.036170)
SVM: 0.000000 (0.000000)


In [175]:
# Classic ML results

results = []
names = []
scoring = 'precision'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10)
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.613604 (0.094372)
KNN: 0.695107 (0.076148)
CART: 1.000000 (0.000000)
NB: 0.308200 (0.027662)
SVM: 0.000000 (0.000000)
