In [15]:
# Imports, as always...
from os import listdir, makedirs, path
import pandas as pd
from tqdm.notebook import tqdm

from sklearn import preprocessing
import numpy as np
import os
import copy
import pickle

In [None]:
# Seed NumPy's global (legacy) random generator so that any stochastic step
# in this notebook is repeatable from a fresh kernel.
np.random.seed(seed=42)

# Fingerprint Classification

This notebook aims to replicate and advance the approaches to classification in [Martina et al. (2021)](https://arxiv.org/pdf/2109.11405), in which an SVM is used to binarily classify whether a given classical measurement was or was not produced by a given quantum circuit.

This is a simple task which, when taken together with their *very* small circuit, yields near-perfect accuracy. Here, we will complicate the task to push the capabilities of the models and investigate more thoroughly what may or may not be possible when classifying the membership of a quantum state to a quantum device by its "noise fingerprint". 

The ideas fitting into the work of this notebook are as follows:
- *Multi-class classification*. Using the data produced by Martina et al. (2021), can we present a multi-class prediction model that is not given any bias towards any particular model -- given a measurement of a quantum state, which device produced it?
- *Larger/deeper circuits*. How does the performance degrade as the number of qubits increases, or as the circuit depth increases?
- *Noise severity analysis*. Under which severities/forms of noise is performance best? Ideally, we can produce a visualisation of performance (e.g. accuracy) vs. noise intensity/severity. We might expect poor performance with little/no noise (not enough distinguishing information between membership classes), good performance with moderate noise, then poor performance again with large amounts of noise (too much randomness).

For clarity, 'membership to a quantum device' in this context refers to 'being produced by that device'.

## Martina et al. (2021)'s Dataset

Only the raw data is given, and the `createDataset.py` and `extractExecuction.py` scripts are abhorrent messes and crimes against humanity, so I'll do my best to re-create the "extracting" and "creating" process for a dataset from the data on their [GitHub](https://github.com/trianam/learningQuantumNoiseFingerprint/tree/main).

In [80]:
# Names of every file found in the raw "walker" data directory.
file_list = listdir('./martina/data/walker')

# Machines under study.
# Note: IBM Bogota is left out -- ALL of its files fail to load (pickle underflow).
machines = [
    'ibmq_athens',
    'ibmq_casablanca',
    'ibmq_lima',
    'ibmq_quito',
    'ibmq_santiago',
    'ibmq_5_yorktown',
]

# Per machine, tally the files whose name contains that machine's name.
counts = {
    machine: sum(machine in file for file in file_list)
    for machine in machines
}

# Any count above 250 includes custom splits -- we're not too interested in those.
display(counts)

{'ibmq_athens': 750,
 'ibmq_casablanca': 500,
 'ibmq_lima': 250,
 'ibmq_quito': 250,
 'ibmq_santiago': 250,
 'ibmq_5_yorktown': 250}

### Extracting

This is the process of reading the stored data and translating it into probability distributions.

In [111]:
# "Window sizes": each entry k is the number of consecutive executions pooled
# into a single probability estimate during extraction.
ks = [1000]

# Input directory holding the raw pickled runs, and output directory for the
# extracted arrays.
base_path = './martina/data/walker'
extracted_path = './martina/data/walkerExtracted'

# Make sure the output directory is present (no-op when it already exists).
makedirs(extracted_path, exist_ok=True)

In [95]:
def filter_to_machine(file_list, machine):
    """Lazily select the plain (non-split) files that belong to one machine.

    Args:
        file_list: Iterable of file names to search through.
        machine: Machine name; a file is kept only if this string occurs in its name.

    Returns:
        A lazy ``filter`` iterator over the entries of ``file_list`` whose name
        contains ``machine`` and none of the excluded marker words. Being an
        iterator, it can only be consumed once.
    """
    # Substrings that mark other kinds of data (e.g. split) which must be skipped.
    excluded_words = ('split', 'bis')

    def is_plain_machine_file(file):
        if machine not in file:
            return False
        return not any(word in file for word in excluded_words)

    return filter(is_plain_machine_file, file_list)

In [96]:
# Extracting executions for each machine.
#
# For every machine this cell:
#   1. reads all of its (non-split) pickled result files,
#   2. builds an integer array `executions` of shape (runs, measurement steps),
#   3. saves that array as text, and
#   4. for every window size k, saves per-window outcome frequencies of shape
#      (runs // k, measurement steps, outcomes).
for machine in tqdm(machines, desc='Extracting'):
    # Materialise the lazy filter so the progress bar is sized from the real
    # number of files instead of an assumed 250 per machine.
    machine_files = list(filter_to_machine(file_list, machine))
    if not machine_files:
        raise FileNotFoundError(f'No files found for machine {machine!r} in {base_path!r}.')

    executions = []

    # For each file belonging to the current machine.
    for file in tqdm(machine_files, desc='Reading files'):
        # Read the contents of the file; the context manager closes the handle
        # as soon as the pickle has been loaded.
        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file -- only run this on data obtained from a trusted source.
        with open(path.join(base_path, file), 'rb') as handle:
            contents = pickle.load(handle)

        results = contents['results']

        # For each run of the circuit (of which there are 8000 in the Martina paper and data),
        # collect the outcome at each measurement step t (of which there are 9).
        # Note: we will not be doing repeated measures, nor will we "read all bits".
        for n in range(len(results[0]['data']['memory'])):
            # Base 0 lets int() infer the radix from the string's prefix.
            executions.append([int(result['data']['memory'][n], 0) for result in results])

    # Cast to numpy array.
    executions = np.array(executions)

    # Save the full executions of this machine.
    # NOTE(review): np.savetxt's defaults write space-delimited floats despite the
    # '.csv' name; left unchanged so any existing reader of this file keeps working.
    np.savetxt(path.join(extracted_path, f'{machine}-executions.csv'), executions)

    n_executions, n_steps = executions.shape

    # Outcome values are used directly as indices into the last axis, so that axis
    # must span 0..max inclusive. (Counting distinct values instead would be too
    # small whenever some intermediate outcome is never observed.)
    n_outcomes = int(executions.max()) + 1

    # Break the executions into windows to be saved.
    for k in ks:
        # The window size must cleanly divide the number of executions.
        if n_executions % k != 0:
            raise ValueError('Indivisible by window size.')

        n_windows = n_executions // k

        # View the runs as consecutive windows of k executions each.
        windows = executions.reshape(n_windows, k, n_steps)

        # Initialise probabilities array.
        probs = np.zeros(shape=(n_windows, n_steps, n_outcomes), dtype=np.float32)

        # Count, per window and measurement step, how often each outcome occurs.
        for outcome in range(n_outcomes):
            probs[:, :, outcome] = (windows == outcome).sum(axis=1)

        # Turn counts into relative frequencies.
        probs = probs / k

        # Save the window.
        np.save(path.join(extracted_path, f'{machine}-probabilities-{k}.npy'), probs)

Extracting:   0%|          | 0/6 [00:00<?, ?it/s]

Reading files:   0%|          | 0/250 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/2000000 [00:00<?, ?it/s]

Reading files:   0%|          | 0/250 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/2000000 [00:00<?, ?it/s]

Reading files:   0%|          | 0/250 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/2000000 [00:00<?, ?it/s]

Reading files:   0%|          | 0/250 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/2000000 [00:00<?, ?it/s]

Reading files:   0%|          | 0/250 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/2000000 [00:00<?, ?it/s]

Reading files:   0%|          | 0/250 [00:00<?, ?it/s]

Calculating probabilities:   0%|          | 0/2000000 [00:00<?, ?it/s]

### Creating

Now we arrange the extracted run statistics into datasets. Unlike the original work, we will do this to work nicely with PyTorch frameworks.

In [112]:
# Window size used for the dataset: the first of the extracted window sizes.
k = ks[0]

# Directory that will hold the assembled dataset (created if it is missing).
dataset_path = './martina/data/walkerDataset'
makedirs(dataset_path, exist_ok=True)

In [147]:
# Pack all the probability distributions (from all machines) into a list,
# one array per machine, in the same order as `machines`.
probs = [np.load(path.join(extracted_path, f'{machine}-probabilities-{k}.npy')) for machine in machines]

# Per-machine ordering of the windows; currently the identity (stored order).
order = [np.arange(prob.shape[0]) for prob in probs]

# Features (x), labels (y) format.
# Samples are interleaved machine-by-machine and truncated to the machine with the
# fewest windows, so every class contributes the same number of samples.
# The label of a sample is the index of its machine in `machines`.
xs, ys = [], []
for i in range(min(map(len, order))):
    for label, (prob, indices) in enumerate(zip(probs, order)):
        xs.append(prob[indices[i]])
        ys.append(label)

# Numpify those arrays.
# NOTE(review): labels are stored as float32; confirm whether downstream training
# code expects integer class indices and casts accordingly.
xs = np.array(xs, dtype=np.float32)
ys = np.array(ys, dtype=np.float32)

# Save in this format. The arrays must be passed explicitly -- without them
# np.savez_compressed writes an archive that contains no data.
# They are stored under the keys 'xs' and 'ys'.
np.savez_compressed(path.join(dataset_path, f'all-dataset-{k}'), xs=xs, ys=ys)

## Multi-class Classification

Given the measurement of a quantum state, what is the probability distribution over the set of devices (for the likelihood of membership), and subsequently which device is most likely to have produced the state?