<a href="https://colab.research.google.com/github/abirharrasse/EMINES---VSA-Project---Grp3/blob/master/Bundling_Projet_VSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
# Clone the Hyperdimensional-Computing repository. Later cells change into this
# checkout and import modules such as `utils` and `model` (presumably shipped in
# this repository — confirm).
!git clone  https://github.com/abirharrasse/Hyperdimensional-Computing

Cloning into 'Hyperdimensional-Computing'...
remote: Enumerating objects: 96, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 96 (delta 14), reused 7 (delta 7), pack-reused 75 (from 1)[K
Receiving objects: 100% (96/96), 48.20 MiB | 14.60 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Updating files: 100% (13/13), done.


In [None]:
# Make the cloned checkout the working directory so that the imports in the
# following cells resolve against it (presumably utils.py / model.py live at the
# repository root — confirm).
%cd /content/Hyperdimensional-Computing

/content/Hyperdimensional-Computing


In [None]:
import torch
import numpy as np
import time
import os
from utils import prepare_data, encode_and_save
from model import BModel, GModel
import argparse

In [None]:
# Mount Google Drive at /content/drive; the encoded-data paths used throughout
# this notebook all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')
# Root folder on Drive for the encoded datasets (same value as the 'data_dir' default).
encoded_data_folder = '/content/drive/MyDrive/encoded_data'


Mounted at /content/drive


In [None]:
class ArgumentParser:
    """Notebook stand-in for a parsed command-line namespace.

    Every entry of the defaults table is exposed as an instance attribute
    (``args.lr``, ``args.seed``, ...), so the object can be handed to code that
    reads its settings by attribute access.

    NOTE(review): the class name is the same as ``argparse.ArgumentParser``;
    importing that class by name into the notebook namespace would shadow
    this one.
    """

    def __init__(self):
        # Default settings; kept on the instance as the construction-time table.
        self.args = {
            'lr': 0.01,            # Learning rate
            'gamma': 0.3,          # Kernel parameter
            'epoch': 1,            # Number of epochs
            'gorder': 8,           # Group order
            'dim': 10000,          # Dimension of hypervectors
            'seed': 43,            # Random seed
            'r': 2,                # Passed as ManhattanEncoder(r=...) below; exact meaning lives in that class
            'resume': False,       # Resume flag
            'data_dir': '/content/drive/MyDrive/encoded_data', # Data directory
            'dataset': 'fmnist',   # Dataset name
            'raw_data_dir': './dataset',  # Raw data directory
            'model': 'rff-gvsa'    # Model type
        }
        # Dynamically set attributes on the instance
        for key, value in self.args.items():
            setattr(self, key, value)

    def get_args(self):
        """Return the settings as a dict that reflects the current attribute values.

        The notebook reassigns attributes after construction (``args.dataset``,
        ``args.data_dir``, ``args.r``, ``args.model``). Returning ``self.args``
        directly would hand back the stale construction-time defaults, so the
        dict is rebuilt from the live attributes. A key whose attribute has been
        deleted falls back to its default.

        Returns:
            dict: one entry per default key, holding the attribute's current value.
        """
        return {key: getattr(self, key, default) for key, default in self.args.items()}

# Create the settings object that the cells below read and update.
args = ArgumentParser()

# Verify that attributes are correctly set
print(f"Seed: {args.seed}")  # Should print "Seed: 43"

# Seed torch and numpy together, mirroring the later cell that re-creates
# `args`, so the same seed governs both random number generators.
torch.manual_seed(args.seed)
np.random.seed(args.seed)
print("Torch manual seed set successfully.")


Seed: 43
Torch manual seed set successfully.


## Encoding with RFE

In [None]:
# List of datasets to iterate over
dataset_choices = ['fmnist', 'mnist', 'isolet', 'ucihar']

# Fixed root for every encoded dataset. The per-dataset folder must be built
# from this constant: deriving it from args.data_dir (which this loop overwrites)
# would nest each dataset's folder inside the previous one and make the cell
# non-idempotent on re-run.
start = '/content/drive/MyDrive/encoded_data'

if 'hdc' in args.model:
    args.gorder = 2
    print("Use binary HDC with random fourier features, ignoring gorder, set to 2.")

# Loop over each dataset
for dataset in dataset_choices:
    # Point the shared settings object at the current dataset and its output folder
    args.dataset = dataset
    args.data_dir = f'{start}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}'

    # Create the directory if it doesn't exist
    try:
        os.makedirs(args.data_dir)
    except FileExistsError:
        print(f'Encoded data folder for {dataset} already exists')

    # Perform encoding and saving if not resuming from existing data
    if not args.resume:
        print(f'Encoding the dataset: {dataset}')
        encode_and_save(args)
        print(f'Finished encoding and saving for {dataset}')


## Evaluating accuracy of RFF with bundling and angular similarity

In [None]:
from encoder import RandomFourierEncoder
import torch
from tqdm import tqdm
import os
# `from argparse import ArgumentParser` used to be imported here. It was unused
# in this cell and rebound the notebook's own ArgumentParser settings class, so a
# later `args = ArgumentParser()` produced an argparse parser without `.seed`,
# `.data_dir`, etc. The `argparse` module itself is still imported at the top.

dataset_choices = ['fmnist', 'mnist', 'isolet', 'ucihar']
results = {}

start = '/content/drive/MyDrive/encoded_data'
# Loop over each dataset
for dataset in dataset_choices:
    args.dataset = dataset
    args.data_dir = f'{start}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}'

    # Load the encoded training data
    X_train = torch.load(f'{args.data_dir}/train_hd.pt')
    y_train = torch.load(f'{args.data_dir}/y_train.pt')

    # Load the encoded test data
    X_test = torch.load(f'{args.data_dir}/test_hd.pt')
    y_test = torch.load(f'{args.data_dir}/y_test.pt')
    # When the tensors are stored as 3-D, drop axis 1 (squeeze only removes it if its size is 1)
    if len(X_train.shape) == 3:
        X_train = X_train.squeeze(1)
    if len(X_test.shape) == 3:
        X_test = X_test.squeeze(1)

    # NOTE(review): input_dim is fixed at 784 for all four datasets; only
    # group_bundle/similarity are used here — confirm input_dim is irrelevant to them.
    rfe = RandomFourierEncoder(input_dim=784, gamma=args.gamma, gorder=args.gorder, output_dim=args.dim)

    # Compute the bundled vectors for each class.
    # Labels are assumed to be 0..num_classes-1 (the predicted list index is compared to the label).
    num_classes = torch.unique(y_train).size(0)
    train_labels = y_train.reshape(-1)
    bundled_vectors = []
    for c in range(num_classes):
        # Boolean-mask selection replaces a per-sample Python loop over the training set
        class_data = X_train[train_labels == c]
        if len(class_data.shape) == 3:
            class_data = class_data.squeeze(1)

        bundled_vector = rfe.group_bundle(class_data)
        bundled_vectors.append(bundled_vector)

    # Evaluate on the test set: predict the class whose bundle is most similar
    correct = 0
    total = 0
    for i in tqdm(range(len(X_test)), desc=f"Evaluating {dataset}"):
        x = X_test[i]

        similarities = [rfe.similarity(x, bundled_vector) for bundled_vector in bundled_vectors]
        predicted_class = similarities.index(max(similarities))
        if predicted_class == y_test[i]:
            correct += 1
        total += 1

    accuracy = correct / total * 100
    print(f'{dataset} Test Accuracy: {accuracy:.2f}%')
    results[dataset] = accuracy

# Print summary of accuracies. The header no longer interpolates the leftover
# loop variable, which labelled the whole table with the last dataset only.
print("Summary of accuracies across all datasets:")
for name, acc in results.items():
    print(f'{name}: {acc:.2f}%')

  X_train = torch.load(f'{args.data_dir}/train_hd.pt')
  y_train = torch.load(f'{args.data_dir}/y_train.pt')
  X_test = torch.load(f'{args.data_dir}/test_hd.pt')
  y_test = torch.load(f'{args.data_dir}/y_test.pt')
Evaluating fmnist: 100%|██████████| 10000/10000 [01:12<00:00, 137.64it/s]


fmnist Test Accuracy: 73.17%


Evaluating mnist: 100%|██████████| 10000/10000 [01:12<00:00, 137.69it/s]


mnist Test Accuracy: 84.46%


Evaluating isolet: 100%|██████████| 1559/1559 [00:32<00:00, 48.60it/s]


isolet Test Accuracy: 87.56%


Evaluating ucihar: 100%|██████████| 2947/2947 [00:13<00:00, 222.44it/s]

ucihar Test Accuracy: 80.42%
Summary of accuracies for ucihar dataset: 
fmnist: 73.17%
mnist: 84.46%
isolet: 87.56%
ucihar: 80.42%





## Encoding with ManhattanEncoder (see the GitHub repo for the implementation)

In [None]:
# NOTE(review): this import rebinds `encode_and_save`, which the setup cell
# imported from `utils`; after this cell runs the name refers to the
# utils_manhattan version.
from utils_manhattan import encode_and_save
dataset_choices = ['isolet', 'ucihar']

# Switch the shared settings object over to the Manhattan encoder.
args.model = 'manhattan-hdc'
args.r = 12
print("Use binary HDC with manhattan distance")
start = '/content/drive/MyDrive/encoded_data'
for dataset in dataset_choices:
    # Target folder for this dataset; `r` takes the "order" slot of the folder name.
    args.dataset = dataset
    args.data_dir = f'{start}/{args.dataset}_{args.model}_order{args.r}_gamma{args.gamma}_dim{args.dim}'

    # Make sure the output folder is there, reporting when it already was.
    try:
        os.makedirs(args.data_dir)
    except FileExistsError:
        print(f'Encoded data folder for {dataset} already exists')

    # When resuming, the existing encoded data is kept and nothing is re-encoded.
    if args.resume:
        continue
    print(f'Encoding the dataset: {dataset}')
    encode_and_save(args)
    print(f'Finished encoding and saving for {dataset}')


Use binary HDC with manhattan distance
Encoded data folder for isolet already exists
Encoding the dataset: isolet
Loading dataset...
# of channels of data 1
# of training samples and test samples 6238 1559
Encoding to HDC with Manhattan distance.
Building item memory...
generating linear item memory...
Encoded pixels to hypervectors with size: torch.Size([256, 10000])




Encoding training data...


Training Data Encoding: 100%|██████████| 6238/6238 [14:11<00:00,  7.33it/s]


Encoding test data...


Test Data Encoding: 100%|██████████| 1559/1559 [03:28<00:00,  7.49it/s]


Finished encoding and saving for isolet
Encoding the dataset: ucihar
Loading dataset...
# of channels of data 1
# of training samples and test samples 7352 2947
Encoding to HDC with Manhattan distance.
Building item memory...
generating linear item memory...
Encoded pixels to hypervectors with size: torch.Size([256, 10000])
Encoding training data...


Training Data Encoding: 100%|██████████| 7352/7352 [14:36<00:00,  8.39it/s]


Encoding test data...


Test Data Encoding: 100%|██████████| 2947/2947 [05:56<00:00,  8.26it/s]


Finished encoding and saving for ucihar


## Evaluating the Accuracy with ManhattanEncoder, bundling and Manhattan Similarity: r = 2

In [None]:
from encoder_manhattan import ManhattanEncoder
import torch
from tqdm import tqdm

dataset_choices = ['isolet', 'ucihar']
results = {}
# Pin the settings this cell depends on so it does not rely on what an earlier
# cell happened to leave in `args`.
args.r = 2
args.model = 'manhattan-hdc'
start = '/content/drive/MyDrive/encoded_data'
# NOTE(review): the recorded accuracies of this cell (3.85% / 15.98%) look close to
# chance level — presumably the Manhattan encoding or bundling needs checking.
# Loop over each dataset
for dataset in dataset_choices:
    args.dataset = dataset
    args.data_dir = f'{start}/{args.dataset}_{args.model}_order{args.r}_gamma{args.gamma}_dim{args.dim}'

    # Load the encoded training data
    X_train = torch.load(f'{args.data_dir}/train_hd.pt')
    y_train = torch.load(f'{args.data_dir}/y_train.pt')

    # Load the encoded test data
    X_test = torch.load(f'{args.data_dir}/test_hd.pt')
    y_test = torch.load(f'{args.data_dir}/y_test.pt')
    # When the tensors are stored as 3-D, drop axis 1 (squeeze only removes it if its size is 1)
    if len(X_train.shape) == 3:
        X_train = X_train.squeeze(1)
    if len(X_test.shape) == 3:
        X_test = X_test.squeeze(1)

    # Same r as the one used to locate the encoded data above
    mht = ManhattanEncoder(num=256, r=args.r)

    # Compute the bundled vectors for each class.
    # Labels are assumed to be 0..num_classes-1 (the predicted list index is compared to the label).
    num_classes = torch.unique(y_train).size(0)
    train_labels = y_train.reshape(-1)
    bundled_vectors = []
    for c in range(num_classes):
        # Boolean-mask selection replaces a per-sample Python loop over the training set
        class_data = X_train[train_labels == c]
        bundled_vector = mht.group_bundle(class_data)  # Use the instance method
        bundled_vectors.append(bundled_vector)

    # Evaluate on the test set: predict the class whose bundle scores highest
    correct = 0
    total = 0
    for i in tqdm(range(len(X_test))):
        x = X_test[i]
        similarities = [mht.similarity(x, bundled_vector) for bundled_vector in bundled_vectors]
        predicted_class = similarities.index(max(similarities))
        if predicted_class == y_test[i]:
            correct += 1
        total += 1
    # Computed once after the loop; an empty test set yields 0.0 instead of
    # silently reusing the previous dataset's value.
    accuracy = correct / total * 100 if total else 0.0
    print(f'Test Accuracy: {accuracy:.2f}%')
    results[dataset] = accuracy

# Print summary of accuracies. The header no longer interpolates the leftover
# loop variable, which labelled the whole table with the last dataset only.
print("Summary of accuracies across all datasets:")
for name, acc in results.items():
    print(f'{name}: {acc:.2f}%')

  X_train = torch.load(f'{args.data_dir}/train_hd.pt')
  y_train = torch.load(f'{args.data_dir}/y_train.pt')
  X_test = torch.load(f'{args.data_dir}/test_hd.pt')
  y_test = torch.load(f'{args.data_dir}/y_test.pt')
100%|██████████| 1559/1559 [00:06<00:00, 226.96it/s]


Test Accuracy: 3.85%


100%|██████████| 2947/2947 [00:03<00:00, 942.10it/s]

Test Accuracy: 15.98%
Summary of accuracies for ucihar dataset: 
isolet: 3.85%
ucihar: 15.98%





## With r = 12

In [None]:
from encoder_manhattan import ManhattanEncoder
import torch
from tqdm import tqdm

dataset_choices = ['isolet', 'ucihar']
results = {}
# This section evaluates r = 12. Set it explicitly: the preceding evaluation cell
# leaves args.r at 2, which would make this cell load the order2 data while
# building an r=12 encoder when the notebook is run top to bottom.
args.r = 12
args.model = 'manhattan-hdc'

start = '/content/drive/MyDrive/encoded_data'
# NOTE(review): the recorded accuracies of this cell (3.91% / 18.32%) look close to
# chance level — presumably the Manhattan encoding or bundling needs checking.
# Loop over each dataset
for dataset in dataset_choices:
    args.dataset = dataset
    args.data_dir = f'{start}/{args.dataset}_{args.model}_order{args.r}_gamma{args.gamma}_dim{args.dim}'

    # Load the encoded training data
    X_train = torch.load(f'{args.data_dir}/train_hd.pt')
    y_train = torch.load(f'{args.data_dir}/y_train.pt')

    # Load the encoded test data
    X_test = torch.load(f'{args.data_dir}/test_hd.pt')
    y_test = torch.load(f'{args.data_dir}/y_test.pt')
    # When the tensors are stored as 3-D, drop axis 1 (squeeze only removes it if its size is 1)
    if len(X_train.shape) == 3:
        X_train = X_train.squeeze(1)
    if len(X_test.shape) == 3:
        X_test = X_test.squeeze(1)

    # Same r as the one used to locate the encoded data above
    mht = ManhattanEncoder(num=256, r=args.r)

    # Compute the bundled vectors for each class.
    # Labels are assumed to be 0..num_classes-1 (the predicted list index is compared to the label).
    num_classes = torch.unique(y_train).size(0)
    train_labels = y_train.reshape(-1)
    bundled_vectors = []
    for c in range(num_classes):
        # Boolean-mask selection replaces a per-sample Python loop over the training set
        class_data = X_train[train_labels == c]
        bundled_vector = mht.group_bundle(class_data)  # Use the instance method
        bundled_vectors.append(bundled_vector)

    # Evaluate on the test set: predict the class whose bundle scores highest
    correct = 0
    total = 0
    for i in tqdm(range(len(X_test))):
        x = X_test[i]
        similarities = [mht.similarity(x, bundled_vector) for bundled_vector in bundled_vectors]
        predicted_class = similarities.index(max(similarities))
        if predicted_class == y_test[i]:
            correct += 1
        total += 1
    # Computed once after the loop; an empty test set yields 0.0 instead of
    # silently reusing the previous dataset's value.
    accuracy = correct / total * 100 if total else 0.0
    print(f'Test Accuracy: {accuracy:.2f}%')
    results[dataset] = accuracy

# Print summary of accuracies. The header no longer interpolates the leftover
# loop variable, which labelled the whole table with the last dataset only.
print("Summary of accuracies across all datasets:")
for name, acc in results.items():
    print(f'{name}: {acc:.2f}%')

  X_train = torch.load(f'{args.data_dir}/train_hd.pt')
  y_train = torch.load(f'{args.data_dir}/y_train.pt')
  X_test = torch.load(f'{args.data_dir}/test_hd.pt')
  y_test = torch.load(f'{args.data_dir}/y_test.pt')
100%|██████████| 1559/1559 [00:08<00:00, 178.59it/s]


Test Accuracy: 3.91%


100%|██████████| 2947/2947 [00:03<00:00, 947.14it/s]

Test Accuracy: 18.32%
Summary of accuracies for ucihar dataset: 
isolet: 3.91%
ucihar: 18.32%





In [None]:
from encoder_manhattan import ManhattanEncoder
import torch
from tqdm import tqdm

# Start again from the default settings (fmnist / rff-gvsa / gorder 8) and reseed.
args = ArgumentParser()
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Folder holding the encoded tensors for the default settings; built once
# instead of repeating the same expression for each of the four files.
encoded_dir = f'{args.data_dir}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}'

# Load the encoded training data
X_train = torch.load(f'{encoded_dir}/train_hd.pt')
y_train = torch.load(f'{encoded_dir}/y_train.pt')

# Load the encoded test data
X_test = torch.load(f'{encoded_dir}/test_hd.pt')
y_test = torch.load(f'{encoded_dir}/y_test.pt')
# Same 3-D handling as the other evaluation cells: drop axis 1 when the tensors
# are stored as 3-D (squeeze only removes it if its size is 1).
if len(X_train.shape) == 3:
    X_train = X_train.squeeze(1)
if len(X_test.shape) == 3:
    X_test = X_test.squeeze(1)

# NOTE(review): num=10 / r=8 differ from the num=256 used by the other cells — confirm intent.
mht = ManhattanEncoder(num=10, r=8)

# Compute the bundled vectors for each class.
# Labels are assumed to be 0..num_classes-1 (the predicted list index is compared to the label).
num_classes = torch.unique(y_train).size(0)
train_labels = y_train.reshape(-1)
bundled_vectors = []
for c in range(num_classes):
    # Boolean-mask selection replaces a per-sample Python loop over the training set
    class_data = X_train[train_labels == c]
    bundled_vector = mht.group_bundle(class_data)  # Use the instance method
    bundled_vectors.append(bundled_vector)

# Evaluate on the test set: predict the class whose bundle scores highest
correct = 0
total = 0
for i in tqdm(range(len(X_test))):
    x = X_test[i]
    similarities = [mht.similarity(x, bundled_vector) for bundled_vector in bundled_vectors]
    predicted_class = similarities.index(max(similarities))
    if predicted_class == y_test[i]:
        correct += 1
    total += 1

print(f'Test Accuracy: {correct / total * 100:.2f}%')

  X_train = torch.load(f'{args.data_dir}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}/train_hd.pt')
  y_train = torch.load(f'{args.data_dir}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}/y_train.pt')
  X_test = torch.load(f'{args.data_dir}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}/test_hd.pt')
  y_test = torch.load(f'{args.data_dir}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}/y_test.pt')
100%|██████████| 10000/10000 [00:27<00:00, 359.96it/s]

Test Accuracy: 11.15%





## Evaluating accuracy after RFF, bundling and Manhattan Similarity

In [None]:
from encoder_manhattan import ManhattanEncoder, RandomFourierEncoder
import torch
from tqdm import tqdm

dataset_choices = ['fmnist', 'mnist', 'isolet', 'ucihar']
results = {}
# NOTE(review): args.r is set to 2 but is not read anywhere in this cell, and the
# encoder below is built with r=8 — confirm which value is intended.
args.r = 2
args.gorder = 8
args.model = 'rff-gvsa'
# This cell scores RFF-encoded data (model 'rff-gvsa', gorder 8) with the Manhattan
# similarity; the earlier "binary HDC" message did not describe this configuration.
print("Use RFF-GVSA encoding with Manhattan similarity")
start = '/content/drive/MyDrive/encoded_data'
# Loop over each dataset
for dataset in dataset_choices:
    args.dataset = dataset
    args.data_dir = f'{start}/{args.dataset}_{args.model}_order{args.gorder}_gamma{args.gamma}_dim{args.dim}'
    print(args.data_dir)
    # Load the encoded training data
    X_train = torch.load(f'{args.data_dir}/train_hd.pt')
    y_train = torch.load(f'{args.data_dir}/y_train.pt')

    # Load the encoded test data
    X_test = torch.load(f'{args.data_dir}/test_hd.pt')
    y_test = torch.load(f'{args.data_dir}/y_test.pt')
    # When the tensors are stored as 3-D, drop axis 1 (squeeze only removes it if its size is 1)
    if len(X_train.shape) == 3:
        X_train = X_train.squeeze(1)
    if len(X_test.shape) == 3:
        X_test = X_test.squeeze(1)

    # Manhattan encoder supplies the similarity; the RFF encoder supplies the bundling
    mht = ManhattanEncoder(num=256, r=8)

    # NOTE(review): input_dim is fixed at 784 for all four datasets; only
    # group_bundle is used from this encoder — confirm input_dim is irrelevant to it.
    rfe = RandomFourierEncoder(input_dim=784, gamma=args.gamma, gorder=args.gorder, output_dim=args.dim)

    # Compute the bundled vectors for each class.
    # Labels are assumed to be 0..num_classes-1 (the predicted list index is compared to the label).
    num_classes = torch.unique(y_train).size(0)
    train_labels = y_train.reshape(-1)
    bundled_vectors = []
    for c in range(num_classes):
        # Boolean-mask selection replaces a per-sample Python loop over the training set
        class_data = X_train[train_labels == c]
        if len(class_data.shape) == 3:
            class_data = class_data.squeeze(1)

        bundled_vector = rfe.group_bundle(class_data)
        bundled_vectors.append(bundled_vector)

    # Evaluate on the test set: predict the class whose bundle scores highest
    correct = 0
    total = 0
    for i in tqdm(range(len(X_test))):
        x = X_test[i]
        similarities = [mht.similarity(x, bundled_vector) for bundled_vector in bundled_vectors]
        predicted_class = similarities.index(max(similarities))
        if predicted_class == y_test[i]:
            correct += 1
        total += 1
    # Computed once after the loop; an empty test set yields 0.0 instead of
    # silently reusing the previous dataset's value.
    accuracy = correct / total * 100 if total else 0.0
    print(f'Test Accuracy: {accuracy:.2f}%')
    results[dataset] = accuracy

# Print summary of accuracies. The header no longer interpolates the leftover
# loop variable, which labelled the whole table with the last dataset only.
print("Summary of accuracies across all datasets:")
for name, acc in results.items():
    print(f'{name}: {acc:.2f}%')

Use binary HDC with manhattan distance
/content/drive/MyDrive/encoded_data/fmnist_rff-gvsa_order8_gamma0.3_dim10000


  X_train = torch.load(f'{args.data_dir}/train_hd.pt')
  y_train = torch.load(f'{args.data_dir}/y_train.pt')
  X_test = torch.load(f'{args.data_dir}/test_hd.pt')
  y_test = torch.load(f'{args.data_dir}/y_test.pt')
100%|██████████| 10000/10000 [00:24<00:00, 402.60it/s]


Test Accuracy: 73.13%
/content/drive/MyDrive/encoded_data/mnist_rff-gvsa_order8_gamma0.3_dim10000


100%|██████████| 10000/10000 [00:24<00:00, 408.46it/s]


Test Accuracy: 84.44%
/content/drive/MyDrive/encoded_data/isolet_rff-gvsa_order8_gamma0.3_dim10000


100%|██████████| 1559/1559 [00:10<00:00, 154.30it/s]


Test Accuracy: 87.49%
/content/drive/MyDrive/encoded_data/ucihar_rff-gvsa_order8_gamma0.3_dim10000


100%|██████████| 2947/2947 [00:03<00:00, 746.52it/s]

Test Accuracy: 80.66%
Summary of accuracies for ucihar dataset: 
fmnist: 73.13%
mnist: 84.44%
isolet: 87.49%
ucihar: 80.66%



