In [2]:
import pandas as pd
import numpy as np
from pennylane import numpy as np
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

import pennylane as qml
#from pennylane_qiskit import IBMQDevice
#from pennylane_qiskit import BasicAerDevice
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

import time

In [74]:
class PrepareData:
    """Split a labelled DataFrame into train/test parts and build classifier inputs.

    The constructor optionally sub-samples ``data``, splits it with
    ``train_test_split`` and separates the ``target`` column from the features.
    One of the ``perform_*`` methods must then be called to populate the
    arrays handed out by ``get_preprocessed``.

    Every fitted transformer (LDA, PCA, StandardScaler) is fitted on the
    training split only and then applied to the test split, so both splits
    live in the same feature space and the test labels never influence the
    preprocessing.
    """

    def __init__(self, data, target, sample_size = 0, test_split = 0.3, seed = 10):
        """Sample, split and separate features from labels.

        Args:
            data: DataFrame holding the feature columns and the label column.
            target: Name of the label column.
            sample_size: Number of rows to draw from ``data``; 0 keeps all rows.
            test_split: Passed to ``train_test_split`` as ``test_size``.
            seed: Random state used for both the row sampling and the split.
        """
        self.data = data
        self.preprocess_done = None

        if sample_size == 0:
            self.data_sample = data
        else:
            # Seed the draw as well; otherwise a fixed split seed is applied
            # to a different random subset on every run.
            self.data_sample = data.sample(sample_size, random_state=seed)

        self.train_set, self.test_set = train_test_split(
            self.data_sample, test_size=test_split, random_state=seed
        )

        # Labels are kept as single-column DataFrames.
        self.y_train = self.train_set[[target]]
        self.y_test = self.test_set[[target]]

        self.x_train = self.train_set.drop(target, axis=1)
        self.x_test = self.test_set.drop(target, axis=1)

    @staticmethod
    def _shift_labels(labels):
        """Map a single-column label frame from {0, 1} to {-1, 1} (float, non-trainable)."""
        return np.array(labels.values[:, 0] * 2.0 - 1.0, requires_grad=False)

    def _store_preprocessed(self, train_features, test_features, method_name):
        """Record the preprocessed features/labels and the method that produced them."""
        self.train_X_preprocessed = np.array(train_features, requires_grad=False)
        self.train_Y_preprocessed = self._shift_labels(self.y_train)

        self.test_X_preprocessed = np.array(test_features, requires_grad=False)
        self.test_Y_preprocessed = self._shift_labels(self.y_test)

        # Only flag success once every array exists, so a failure part-way
        # through never leaves the object claiming to be preprocessed.
        self.preprocess_done = method_name

    def view_info(self):
        """Print the sample's ``info()`` and preprocessing status; return ``describe()``."""
        # DataFrame.info() prints by itself and returns None, so it is not
        # wrapped in print() (that would emit a stray "None" line).
        self.data_sample.info()
        if self.preprocess_done is None:
            print("No preprocessing done yet.")
        else:
            print("Preprocessing done via: ", self.preprocess_done)
        return self.data_sample.describe()

    def get_preprocessed(self, to_show = False):
        """Return ``(train_X, train_Y, test_X, test_Y)`` from the last preprocessing.

        Args:
            to_show: When true, also print the four arrays.

        Returns:
            The four preprocessed arrays, or ``None`` (after printing a hint)
            when no ``perform_*`` method has been run yet.
        """
        if self.preprocess_done is None:
            print("Please do some preprocessing first.")
            return None

        if to_show:
            print("Training Set and Labels: ")
            print(self.train_X_preprocessed)
            print(self.train_Y_preprocessed)

            print("Test Set and Labels: ")
            print(self.test_X_preprocessed)
            print(self.test_Y_preprocessed)

        return (
            self.train_X_preprocessed,
            self.train_Y_preprocessed,
            self.test_X_preprocessed,
            self.test_Y_preprocessed,
        )

    def perform_LDA(self, n_dim = 2):
        """Reduce the features with one LDA per contiguous feature group.

        The feature columns are cut into ``n_dim`` equally wide contiguous
        groups (trailing columns that do not fill a group are ignored), each
        group is projected with its own LDA, and the projections are
        concatenated and standardised.

        Args:
            n_dim: Number of feature groups; each LDA uses ``n_dim - 1``
                components, as in the original implementation.
        """
        print("Performing LDA...")

        group_width = len(self.x_train.columns) // n_dim
        train_labels = self.y_train.values[:, 0]

        train_parts = []
        test_parts = []

        # TODO: the grouping of features is a plain positional split and is
        # expected to be revisited.
        for group in range(n_dim):
            columns = slice(group * group_width, (group + 1) * group_width)

            # NOTE(review): scikit-learn limits n_components to
            # min(n_features, n_classes - 1), so with binary labels any
            # n_dim > 2 will be rejected here -- confirm the intended value.
            lda = LDA(n_components=(n_dim - 1))

            # Fit on the training split only; the test split is projected
            # with the same fitted model and its labels are never used.
            train_parts.append(
                pd.DataFrame(lda.fit_transform(self.x_train.iloc[:, columns], train_labels))
            )
            test_parts.append(
                pd.DataFrame(lda.transform(self.x_test.iloc[:, columns]))
            )

        # Concatenate the per-group projections column-wise; converting to a
        # plain array sidesteps duplicate/mixed column labels.
        x_train_data = pd.concat(train_parts, axis=1).to_numpy()
        x_test_data = pd.concat(test_parts, axis=1).to_numpy()

        # Standardise with statistics taken from the training split.
        scaler = StandardScaler().fit(x_train_data)
        x_train_data = scaler.transform(x_train_data)
        x_test_data = scaler.transform(x_test_data)

        self._store_preprocessed(x_train_data, x_test_data, "LDA")

    def perform_PCA(self, n_dim = 2):
        """Project the features onto ``n_dim`` principal components and normalise each row.

        Args:
            n_dim: Number of principal components to keep.
        """
        print("Performing PCA...")

        pca = PCA(n_components=n_dim, svd_solver='full')
        # The components are learned from the training split and reused
        # unchanged for the test split.
        pca.fit(self.x_train)
        x_train_pca = pca.transform(self.x_train)
        x_test_pca = pca.transform(self.x_test)

        # sklearn's normalize rescales every sample (row) independently, so
        # applying it to each split separately shares no state between them.
        self._store_preprocessed(normalize(x_train_pca), normalize(x_test_pca), "PCA")

    def perform_normalize(self, n_dim = 2):
        """Normalise every sample (row) of the raw features without reducing them.

        Args:
            n_dim: Unused; kept so all ``perform_*`` methods share a signature.
        """
        print("Performing Normalize...")

        self._store_preprocessed(normalize(self.x_train), normalize(self.x_test), "Normalize")
        
        

In [75]:
# Read the fraud-detection CSV, cast every column to float and discard the
# 'Unnamed: 0' column.
df = (
    pd.read_csv('fraud_detection_bank_dataset.csv', sep=',')
    .astype(float)
    .drop(['Unnamed: 0'], axis = 1)
)

In [76]:
# Draw 2000 rows from df and hold out 30% of them as the test split;
# "targets" is the label column.
data = PrepareData(
    data=df,
    target="targets",
    sample_size=2000,
    test_split=0.3,
    seed=10,
)

In [77]:
# Print the sample's info() and preprocessing status; the returned describe()
# table is rendered as this cell's output.
data.view_info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 10272 to 19363
Columns: 113 entries, col_0 to targets
dtypes: float64(113)
memory usage: 1.7 MB
None
No preprocessing done yet.


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111,targets
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.013,309.9045,0.8865,2.378,0.0955,1.066,2.44,3.013,0.0,0.0,...,0.011,0.3875,0.001,0.318,0.1975,0.0,0.0505,0.025,47.0115,0.264
std,14.092556,893.336008,20.937306,8.773574,1.135793,6.593283,3.051894,14.092556,0.0,0.0,...,0.113514,0.487301,0.031615,0.465816,0.398212,0.0,0.219029,0.3007,64.113467,0.44091
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
50%,0.0,97.5,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,0.0
75%,2.0,286.25,0.0,2.0,0.0,1.0,6.0,2.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,62.0,1.0
max,490.0,25927.0,904.0,235.0,40.0,230.0,8.0,490.0,0.0,0.0,...,2.0,1.0,1.0,1.0,1.0,0.0,1.0,11.0,728.0,1.0


In [78]:
# Reduce the features to two dimensions with LDA.
# NOTE(review): the saved output under this cell reads "Performing PCA..." and
# the arrays shown in the next cell have unit-norm rows, so the stored outputs
# appear to come from an earlier perform_PCA() run -- re-run to refresh them.
data.perform_LDA(n_dim=2)

Performing PCA...


In [79]:
# Print the preprocessed train/test arrays and return them as a 4-tuple.
data.get_preprocessed(to_show=True)

Training Set and Labels: 
[[-0.99360793  0.11288612]
 [-0.99992919 -0.01190007]
 [-0.99991751 -0.01284414]
 ...
 [-0.99991742 -0.01285143]
 [-0.49123299  0.87102821]
 [-0.99991403 -0.01311251]]
[-1. -1. -1. ...  1. -1. -1.]
Test Set and Labels: 
[[-0.99992372 -0.01235157]
 [-0.99988422 -0.01521681]
 [-0.99992729 -0.01205916]
 ...
 [ 0.99999839 -0.00179606]
 [-0.9999273  -0.01205789]
 [-0.99992564 -0.01219491]]
[-1. -1.  1. -1.  1. -1.  1.  1. -1.  1.  1. -1. -1. -1. -1.  1.  1. -1.
 -1. -1. -1.  1. -1. -1. -1.  1. -1.  1.  1. -1. -1. -1. -1.  1. -1. -1.
 -1. -1. -1. -1. -1. -1.  1.  1. -1.  1. -1. -1.  1. -1. -1. -1. -1.  1.
 -1. -1. -1. -1.  1.  1. -1. -1. -1. -1. -1. -1.  1. -1. -1.  1. -1. -1.
  1.  1.  1.  1.  1. -1. -1. -1.  1. -1.  1. -1.  1. -1. -1. -1.  1. -1.
  1. -1.  1.  1. -1. -1. -1. -1. -1.  1. -1. -1.  1. -1. -1. -1. -1.  1.
 -1. -1. -1.  1. -1. -1. -1. -1.  1.  1. -1. -1.  1.  1.  1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1.  1. -1. -1. -1.
 -

(tensor([[-0.99360793,  0.11288612],
         [-0.99992919, -0.01190007],
         [-0.99991751, -0.01284414],
         ...,
         [-0.99991742, -0.01285143],
         [-0.49123299,  0.87102821],
         [-0.99991403, -0.01311251]], requires_grad=False),
 tensor([-1., -1., -1., ...,  1., -1., -1.], requires_grad=False),
 tensor([[-0.99992372, -0.01235157],
         [-0.99988422, -0.01521681],
         [-0.99992729, -0.01205916],
         ...,
         [ 0.99999839, -0.00179606],
         [-0.9999273 , -0.01205789],
         [-0.99992564, -0.01219491]], requires_grad=False),
 tensor([-1., -1.,  1., -1.,  1., -1.,  1.,  1., -1.,  1.,  1., -1., -1.,
         -1., -1.,  1.,  1., -1., -1., -1., -1.,  1., -1., -1., -1.,  1.,
         -1.,  1.,  1., -1., -1., -1., -1.,  1., -1., -1., -1., -1., -1.,
         -1., -1., -1.,  1.,  1., -1.,  1., -1., -1.,  1., -1., -1., -1.,
         -1.,  1., -1., -1., -1., -1.,  1.,  1., -1., -1., -1., -1., -1.,
         -1.,  1., -1., -1.,  1., -1., -1.,  

In [111]:
class QBC:
    """Variational quantum binary classifier built on a PennyLane QNode.

    The circuit applies a Hadamard to every wire, angle-embeds the input
    features with Y rotations, applies ``StronglyEntanglingLayers`` and
    returns the expectation value of PauliZ on wire 0. A trainable bias is
    added to that value, and the sign of the sum is the predicted label.
    """

    def __init__(self, data, n_dim, n_layers,
                 optimizer = None,
                 interface_type = "autograd",
                 loss_function = None,  backend = "default.qubit", shots = None):
        """Create the device, the circuit and the initial parameters.

        Args:
            data: Object whose ``get_preprocessed()`` returns
                ``(train_X, train_Y, test_X, test_Y)``.
            n_dim: Number of wires; also the number of embedded features.
            n_layers: Number of strongly-entangling layers.
            optimizer: Object with a PennyLane-style ``step`` method. ``None``
                creates ``AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99,
                eps=1e-08)``.
            interface_type: Interface passed to ``qml.QNode``.
            loss_function: Callable ``(labels, predictions) -> loss``; ``None``
                selects the mean squared error.
            backend: PennyLane device name.
            shots: Shot count passed to ``qml.device``.
        """
        # A default-argument optimizer would be created once at definition
        # time and shared (including its internal state) by every model, so
        # the default is built per instance instead.
        if optimizer is None:
            optimizer = AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08)
        self.opt = optimizer

        self.loss_function = self._square_loss if loss_function is None else loss_function
        self.data = data

        self.n_dim = n_dim
        self.n_layers = n_layers

        dev = qml.device(backend, wires = self.n_dim, shots=shots)
        # Alternative devices:
        #dev = qml.device('default.qubit.tf', wires = num_qubits, shots=1024)
        #dev = qml.device('qiskit.ibmq', wires = num_qubits, backend='ibmq_manila', ibmqx_token=os.environ["IBMQ_TOKEN"], shots=256)
        #dev = qml.device('qiskit.basicaer', wires = num_qubits, shots = 256)
        # SECURITY: an API token used to be written out literally in the
        # comment above. Never commit credentials; load them from the
        # environment, and revoke any token that has already been published.

        def circuit(parameters, data):
            for wire in range(self.n_dim):
                qml.Hadamard(wires = wire)

            AngleEmbedding(features = data, wires = range(self.n_dim), rotation = 'Y')

            qml.StronglyEntanglingLayers(weights = parameters, wires = range(self.n_dim))

            return qml.expval(qml.PauliZ(0))

        # Wrap the plain quantum function exactly once; it is intentionally
        # not decorated with @qml.qnode as well.
        self.qlayer = qml.QNode(circuit, dev, interface=interface_type, diff_method='best')

        self.weights = 0.01 * np.random.randn(self.n_layers, self.n_dim, 3, requires_grad=True)
        self.bias = np.array(0.0, requires_grad=True)

    @staticmethod
    def _square_loss(labels, predictions):
        """Mean of the squared differences between labels and predictions."""
        # Accumulated term by term because predictions arrives as a Python list.
        total = 0
        for label, prediction in zip(labels, predictions):
            total = total + (label - prediction) ** 2
        return total / len(labels)

    def variational_classifier(self, weights, bias, x):
        """Return the circuit output for sample ``x`` plus ``bias``."""
        return self.qlayer(weights, x) + bias

    def train(self, batch_size = 10, n_epochs = 50):
        """Optimise weights and bias on the training split.

        Each epoch takes one optimizer step on a random batch (indices drawn
        with ``np.random.randint``), then evaluates accuracy and cost on the
        whole training split and prints them. After the last epoch the
        parameters with the highest training accuracy seen are kept.

        Args:
            batch_size: Number of samples drawn per optimizer step.
            n_epochs: Number of optimizer steps.

        Raises:
            RuntimeError: If the data object has no preprocessed arrays yet.
        """
        preprocessed = self.data.get_preprocessed()
        if preprocessed is None:
            raise RuntimeError(
                "No preprocessed data available; run a perform_* method on the data first."
            )
        X, Y, _, _ = preprocessed

        def cost(weights, bias, X, Y):
            predictions = [self.variational_classifier(weights, bias, x) for x in X]
            return self.loss_function(Y, predictions)

        def accuracy(labels, predictions):
            hits = sum(1 for l, p in zip(labels, predictions) if abs(l - p) < 1e-5)
            return hits / len(labels)

        # Start from the current parameters so that a run in which accuracy
        # never rises above zero does not replace the model with a bare 0.
        best_weights = self.weights
        best_bias = self.bias
        best_accuracy = 0

        for it in range(n_epochs):

            # weights update by one optimizer step
            batch_index = np.random.randint(0, len(X), (batch_size,))
            X_batch = X[batch_index]
            Y_batch = Y[batch_index]
            self.weights, self.bias, _, _ = self.opt.step(cost, self.weights, self.bias, X_batch, Y_batch)

            # Compute the accuracy once per epoch and reuse it below.
            predictions = [np.sign(self.variational_classifier(self.weights, self.bias, x)) for x in X]
            acc = accuracy(Y, predictions)

            if acc > best_accuracy:
                best_weights = self.weights
                best_bias = self.bias
                best_accuracy = acc
                print('New best')

            print(
                "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
                    it + 1, cost(self.weights, self.bias, X, Y), acc
                )
            )

        self.weights = best_weights
        self.bias = best_bias

    def predict(self, test_data):
        """Return a list with the sign of the classifier output for each sample."""
        return [np.sign(self.variational_classifier(self.weights, self.bias, x)) for x in test_data]

In [112]:
# Two-wire classifier with five entangling layers on the lightning.qubit device.
model = QBC(
    data,
    n_dim=2,
    n_layers=5,
    backend="lightning.qubit",
)

In [113]:
# Train with the default settings, written out explicitly: batches of 10
# samples for 50 optimizer steps.
model.train(batch_size=10, n_epochs=50)

New best
Iter:     1 | Cost: 0.8362042 | Accuracy: 0.7278571 


KeyboardInterrupt: 