In [2]:
import pandas as pd
import numpy as np
from pennylane import numpy as np
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

import pennylane as qml
#from pennylane_qiskit import IBMQDevice
#from pennylane_qiskit import BasicAerDevice
from pennylane.templates.embeddings import AngleEmbedding, AmplitudeEmbedding
from pennylane.optimize import AdamOptimizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


import time

In [54]:
class PrepareData:
    def __init__(self, data, target, sample_size = 0, test_split = 0.3, seed = 10):
        self.data = data
        self.preprocess_done = None
        
        if sample_size == 0:
            self.data_sample = data
        else:
            self.data_sample = data.sample(sample_size)
            
        self.train_set, self.test_set = train_test_split(self.data_sample, test_size=test_split, random_state=seed)
        
        self.y_train = self.train_set[[target]]
        self.y_test = self.test_set[[target]]
        
        self.x_train = self.train_set.drop(target, axis=1)
        self.x_test = self.test_set.drop(target, axis=1)
        
    def view_info(self):
        print(self.data_sample.info())
        if self.preprocess_done == None:
            print("No preprocessing done yet.")
        else:
            print("Preprocessing done via: ", self.preprocess_done)
        return self.data_sample.describe()
    
    def view_preprocessed(self):
        if self.preprocess_done == None:
            print("Please do some preprocessing first.")
        else:
            print("Training Set and Labels: ")
            print(self.train_X_preprocessed)
            print(self.train_Y_preprocessed)

            print("Test Set and Labels: ")
            print(self.test_X_preprocessed)
            print(self.test_Y_preprocessed)
    
    def perform_LDA(self, n_dim = 2):
        
        self.preprocess_done = "LDA"
        print("Performing LDA...")
        
        length = len(self.x_train.columns)
        split_feature = int(length/n_dim)
        features_train = []
        features_test = []
        
        # Split Features (for Yaqi to change)
        for i in range(n_dim):
            new_set_train = self.x_train.iloc[:,(i*split_feature):((i+1)*split_feature)]
            features_train.append(new_set_train)
            
            new_set_test = self.x_test.iloc[:,(i*split_feature):((i+1)*split_feature)]
            features_test.append(new_set_test)
        
        # Run the LDA
        lda = LDA(n_components= (n_dim - 1))
        features_lda_train = []
        features_lda_test = []
        
        for i in range(n_dim):
            features_lda_train_new = lda.fit_transform(features_train[i], self.y_train)
            features_lda_train.append(pd.DataFrame(features_lda_train_new))
            
            features_lda_test_new = lda.fit_transform(features_test[i], self.y_test)
            features_lda_test.append(pd.DataFrame(features_lda_test_new))
        
        x_train_data = features_lda_train[0]
        x_test_data = features_lda_test[0]
        
        # Join the results together
        for i in range(1, n_dim):
            l_suffix = "_" + str(i)
            r_suffix = "_" + str(i+1)
            x_train_data = x_train_data.join(features_lda_train[i], lsuffix=l_suffix, rsuffix=r_suffix)
            x_test_data = x_test_data.join(features_lda_test[i], lsuffix=l_suffix, rsuffix=r_suffix)
        
        # Normalize
        std_scale_train = StandardScaler().fit(x_train_data)
        x_train_data = std_scale_train.transform(x_train_data)
        
        std_scale_test = StandardScaler().fit(x_test_data)
        x_test_data = std_scale_test.transform(x_test_data)
            
        # shift label from {0, 1} to {-1, 1}
        self.train_X_preprocessed = np.array(x_train_data, requires_grad=False)
        self.train_Y_preprocessed = np.array(self.y_train.values[:,0] * 2 - np.ones(len(self.y_train.values[:,0])), requires_grad = False)
        
        self.test_X_preprocessed = np.array(x_test_data, requires_grad=False)
        self.test_Y_preprocessed = np.array(self.y_test.values[:,0] * 2 - np.ones(len(self.y_test.values[:,0])), requires_grad = False)
        
    def perform_PCA(self, n_dim = 2):
        
        self.preprocess_done = "PCA"
        print("Performing PCA...")
        
        self.y_train.value_counts(normalize=True)*100
        self.y_test.value_counts(normalize=True)*100
        
        pca = PCA(n_components=n_dim, svd_solver='full')
        pca.fit(self.x_train)
        x_train_pca = pca.transform(self.x_train)
        pca.fit(self.x_test)
        x_test = pca.transform(self.x_test)
        
        train_X_preprocessed = normalize(self.x_train_pca)
        test_X_preprocessed = normalize(self.x_test)
        
        self.train_Y_preprocessed = np.array(self.y_train.values[:,0] * 2 - np.ones(len(self.y_train.values[:,0])), requires_grad = False)  # shift label from {0, 1} to {-1, 1}
        self.train_X_preprocessed = np.array(train_X_preprocessed, requires_grad=False)
        
        self.test_Y_preprocessed = np.array(self.y_test.values[:,0] * 2 - np.ones(len(self.y_test.values[:,0])), requires_grad = False)  # shift label from {0, 1} to {-1, 1}
        self.test_X_preprocessed = np.array(test_X_preprocessed, requires_grad=False)
        
    def perform_normalize(self, n_dim = 2):
        
        self.preprocess_done = "Normalize"
        print("Performing Normalize...")
        
        self.y_train.value_counts(normalize=True)*100
        self.y_test.value_counts(normalize=True)*100
        
        self.x_train.value_counts(normalize=True)*100
        self.x_test.value_counts(normalize=True)*100
        
        

In [55]:
df = pd.read_csv('fraud_detection_bank_dataset.csv', sep=',')
df = df.astype(float)
df = df.drop(['Unnamed: 0'], axis = 1)

In [56]:
data = PrepareData(data = df,target = "targets", sample_size = 2000, test_split = 0.3, seed = 10)

In [57]:
data.view_info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 9997 to 10438
Columns: 113 entries, col_0 to targets
dtypes: float64(113)
memory usage: 1.7 MB
None
No preprocessing done yet.


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_103,col_104,col_105,col_106,col_107,col_108,col_109,col_110,col_111,targets
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,3.089,284.0315,0.2775,2.346,0.059,0.8855,2.543,3.089,0.0,0.0,...,0.0055,0.346,0.0025,0.306,0.1815,0.0,0.0455,0.0205,42.1915,0.2535
std,10.594584,525.37318,1.345882,8.883453,0.406942,3.193977,3.036406,10.594584,0.0,0.0,...,0.073976,0.475812,0.04995,0.460945,0.385528,0.0,0.20845,0.219326,60.283662,0.435123
min,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
50%,0.0,98.5,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0
75%,2.0,307.25,0.0,2.0,0.0,1.0,6.0,2.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,62.0,1.0
max,182.0,7457.0,25.0,179.0,11.0,79.0,8.0,182.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,5.0,567.0,1.0


In [58]:
data.perform_LDA()

Performing LDA...


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [59]:
data.view_preprocessed()

NameError: name 'preprocessed' is not defined

In [None]:
class QBC:
    def __init__(self, Data, n_dim, n_layers, 
                 optimizer = AdamOptimizer(stepsize=0.1, beta1=0.9, beta2=0.99, eps=1e-08),
                 loss_function = None, batch_size = 10, backend = "default.qubit", shots = 0):
        
        if loss_function == None:
            self.loss_function = square_loss
        else: 
            self.loss_function = loss_function
        self.opt = optimizer
        self.batch_size = batch_size
        
        self.n_dim = n_dim
        
        dev = qml.device(backend, wires = self.n_dim, shots=shots)
        #dev = qml.device('default.qubit.tf', wires = num_qubits, shots=1024)
        #dev = qml.device('qiskit.ibmq', wires = num_qubits, backend='ibmq_manila', ibmqx_token="6cc75c58fc80fea56cb8dd391f8fbcfdb676a3dc7005493728bc9da7ea753e31a2110a01e3a0cc83f1a98f5ca79e32956fc66c11b5eea4cae163b3fa996be356", shots=256)
        #dev = qml.device('qiskit.basicaer', wires = num_qubits, shots = 256)

        @qml.qnode(dev)
        def circuit(parameters, data):
            for i in range(num_qubits):
                qml.Hadamard(wires = i)

            AngleEmbedding(features = data, wires = range(num_qubits), rotation = 'Y')

            qml.StronglyEntanglingLayers(weights = parameters, wires = range(num_qubits))

            return qml.expval(qml.PauliZ(0))
        
        self.n_layers = n_layers
        self.weights = 0.01 * np.random.randn(self.n_layers, self.n_dim, 3, requires_grad=True)
        self.bias = np.array(0.0, requires_grad=True)
        
    def variational_classifier(weights, bias, x):
        return circuit(weights, x) + bias
    
    def square_loss(labels, predictions):
        loss = 0
        for l, p in zip(labels, predictions):
            loss = loss + (l - p) ** 2

        loss = loss / len(labels)
        return loss
    
    def accuracy(labels, predictions):

        loss = 0
        for l, p in zip(labels, predictions):
            if abs(l - p) < 1e-5:
                loss = loss + 1
        loss = loss / len(labels)

        return loss
    
    def cost(weights, bias, X, Y):
        predictions = [variational_classifier(weights, bias, x) for x in X]
        return self.loss_function(Y, predictions)

    def train(self, n_epochs):
        wbest = 0
        bbest = 0
        abest = 0

        for it in range(n_epochs):

            # weights update by one optimizer step

            batch_index = np.random.randint(0, len(X), (batch_size,))
            X_batch = X[batch_index]
            Y_batch = Y[batch_index]
            weights, bias, _, _ = opt.step(cost, weights, bias, X_batch, Y_batch)

            # Compute the accuracy
            predictions = [np.sign(variational_classifier(weights, bias, x)) for x in X]

            if accuracy(Y, predictions) > abest:
                wbest = weights
                bbest = bias
                abest = accuracy(Y, predictions)
                print('New best')

            acc = accuracy(Y, predictions)

            print(
                "Iter: {:5d} | Cost: {:0.7f} | Accuracy: {:0.7f} ".format(
                    it + 1, cost(weights, bias, X, Y), acc
                )
            )