In [195]:
import numpy as np
from mnist import MNIST
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from PIL import Image
from matplotlib import pyplot as plt

In [196]:
def shuffle(X, y):
    permutation = np.arange(X.shape[0])
    np.random.shuffle(permutation)
    return X[permutation], y[permutation]

def load_dataset():
    mndata = MNIST('./data/')
    X_train, labels_train = map(np.array, mndata.load_training())
    X_test, labels_test = map(np.array, mndata.load_testing())
    return X_train, labels_train, X_test, labels_test

In [197]:
X_train, labels_train, _, _ = load_dataset()
# X_train, labels_train = shuffle(X_train, labels_train)
X_train, X_test, y_train, y_test = train_test_split(X_train, labels_train, test_size=0.33, random_state=42)

X_train[0].reshape(28, 28)

plt.imshow(X_train[2].reshape(28, 28), interpolation='nearest')

img = Image.fromarray(X_train[2].reshape(28, 28))
img.show()

In [124]:
D = 10000 # dimensions in random space
IMG_LEN = 28
NUM_SAMPLES = X_train.shape[0]

In [125]:
# print("Generating random projection...")
# proj = np.random.rand(D, IMG_LEN * IMG_LEN)
print("Generating random projection...")
seed = 50
np.random.seed(seed)
proj = np.random.rand(D, IMG_LEN * IMG_LEN)
# proj[proj==0] = -1
print(proj.shape)
def get_scene(img, proj):
    return np.dot(proj, img)

# Transform the image vectors into the hypervectors
def get_scenes(images, proj):
    print(images.shape)
    print(proj.shape)
    return np.dot(images[:NUM_SAMPLES, :], proj.T)

print("Projecting images to higher dim space...")
X_train = get_scenes(X_train, proj)

Generating random projection...
(10000, 784)
Projecting images to higher dim space...
(40200, 784)
(10000, 784)


In [126]:
digit_vectors = np.zeros((10, D))

In [127]:
print(digit_vectors.shape)

(10, 10000)


In [128]:
digit_vectors = np.zeros((10, D))
# num_count = {}
for i in range(NUM_SAMPLES):
#     num_count[y_train[i]] =  num_count.get(y_train[i], 0) + 1
    digit_vectors[y_train[i]] += X_train[i]
digit_vectors = np.array(digit_vectors)

In [129]:
# digit_vectors[digit_vectors > 0] = 1
# digit_vectors[digit_vectors <= 0] = -1

In [130]:
digit_vectors.shape

(10, 10000)

for i in num_count:
    digit_vectors[i] /= num_count[i] 

In [131]:
def classify(images, digit_vectors):
    similarities = cosine_similarity(images, digit_vectors)
    classifications = np.argmax(similarities, axis=1)
    return classifications

In [132]:
print("Train accuracy:")
predictions = classify(X_train, digit_vectors)
acc = accuracy_score(y_train[:X_train.shape[0]], predictions)
print(acc)

Train accuracy:
0.8131094527363184


In [133]:
# print("Test accuracy:")
# X_test = get_scenes(X_test, proj)
# predictions = classify(X_test, digit_vectors)
# acc = accuracy_score(y_test[:X_test.shape[0]], predictions)
# print(acc)

In [134]:
import pandas as pd
df = pd.DataFrame({f'model_{seed}': predictions,
                   'y': y_train})

In [135]:
df_discrepencies = df[df[f'model_{seed}'] != df["y"]]

In [136]:
df_discrepencies.reset_index(inplace=True)

In [137]:
df_discrepencies.head()

Unnamed: 0,index,model_50,y
0,0,7,8
1,1,8,6
2,2,2,9
3,16,9,5
4,20,3,2


In [138]:
np.random.seed(seed)
for row in df_discrepencies.iterrows():
    idx = row[1]["index"]
    y_false = row[1][f"model_{seed}"]
    y_true = row[1]["y"]
    digit_vectors[y_false] -= X_train[idx]
    digit_vectors[y_true] += X_train[idx]

In [139]:
print("Train accuracy:")
predictions = classify(X_train, digit_vectors)
acc = accuracy_score(y_train[:X_train.shape[0]], predictions)
print(acc)

Train accuracy:
0.8506965174129353


In [141]:
print("Test accuracy:")
X_test = get_scenes(X_test, proj)
predictions = classify(X_test, digit_vectors)
acc = accuracy_score(y_test[:X_test.shape[0]], predictions)
print(acc)

Test accuracy:
(19800, 784)
(10000, 784)
0.8517676767676767


In [80]:
import pandas as pd
df_test = pd.DataFrame({f'model_{seed}': predictions,
                   'y': y_test})

In [142]:
np.save(f'./temp/digit_vectors_seed_{seed}_version_2.npy', digit_vectors)

#### Make sure you sun from top until this point with multiple seeds to run code below this point

In [198]:
model_30 = np.load("./temp/digit_vectors_seed_30_version_2.npy")
model_40 = np.load("./temp/digit_vectors_seed_40_version_2.npy")
model_50 = np.load("./temp/digit_vectors_seed_50_version_2.npy")

In [199]:
X_train, labels_train, _, _ = load_dataset()
# X_train, labels_train = shuffle(X_train, labels_train)
X_train, X_test, y_train, y_test = train_test_split(X_train, labels_train, test_size=0.33, random_state=42)

In [146]:
import copy
seeds = [30, 40, 50]
models = [model_30, model_40, model_50]
results = []

for seed, model in zip(seeds, models):
    np.random.seed(seed)
    proj = np.random.rand(D, IMG_LEN * IMG_LEN)
    X_test_copy = copy.deepcopy(X_test)
    X_test_copy = get_scenes(X_test_copy, proj)
    predictions = classify(X_test_copy, model)
    results.append(predictions)
    print("here")

(19800, 784)
(10000, 784)
here
(19800, 784)
(10000, 784)
here
(19800, 784)
(10000, 784)
here


In [147]:
np.save("./temp/results_30_40_50_version_2.npy", results)

In [200]:
results = np.load("./temp/results_30_40_50_version_2.npy")

In [201]:
import pandas as pd
df = pd.DataFrame({'model_30': list(results[0]),
                   'model_40': list(results[1]),
                   'model_50': list(results[2]),
                   'y': y_test})

In [202]:
df.head()

Unnamed: 0,model_30,model_40,model_50,y
0,7,7,7,7
1,3,3,3,3
2,8,8,8,8
3,9,9,9,9
4,3,3,3,3


In [203]:
df_discrepencies = df[(df["y"] == df["model_30"]) | (df["y"] == df["model_40"]) | (df["y"] == df["model_50"])]

In [204]:
df_discrepencies = df_discrepencies[((df_discrepencies["y"] + df_discrepencies["model_30"]
                                     + df_discrepencies["model_40"] + df_discrepencies["model_50"]) 
                                     != df_discrepencies["y"] * 4)]

In [205]:
df_discrepencies.head()

Unnamed: 0,model_30,model_40,model_50,y
34,5,0,0,5
111,7,4,4,4
166,2,6,6,6
246,9,7,9,9
284,5,9,5,5


In [210]:
sum(df_discrepencies["model_50"] != df_discrepencies["y"])

184

In [161]:
df_discrepencies.to_excel("./temp/test_discrepencies_version_2.xlsx")

In [211]:
test_discrepencies = pd.read_excel("./temp/test_discrepencies_version_2.xlsx")

In [212]:
test_discrepencies.rename(columns={"Unnamed: 0": "idx"}, inplace=True)

In [213]:
test_discrepencies.head()

Unnamed: 0,idx,model_30,model_40,model_50,y
0,34,5,0,0,5
1,111,7,4,4,4
2,166,2,6,6,6
3,246,9,7,9,9
4,284,5,9,5,5


In [214]:
def get_scene(img, proj):
    return np.dot(img, proj.T)

In [174]:
for row in test_discrepencies.iterrows():
    idx = row[1]["idx"]
    y_false = row[1]["model_40"]
    y_true = row[1]["y"]
    hv = get_scene(X_train[idx].reshape((1, -1)), proj)
    model_30[y_false] -= hv[0]
    model_30[y_true] += hv[0]

In [185]:
# print("Generating random projection...")
# proj = np.random.rand(D, IMG_LEN * IMG_LEN)
print("Generating random projection...")
seed = 30
np.random.seed(seed)
proj = np.random.rand(D, IMG_LEN * IMG_LEN)
# proj[proj==0] = -1
print(proj.shape)
def get_scene(img, proj):
    return np.dot(img, proj.T)

# Transform the image vectors into the hypervectors
def get_scenes(images, proj):
    print(images.shape)
    print(proj.shape)
    return np.dot(images[:NUM_SAMPLES, :], proj.T)

print("Projecting images to higher dim space...")
X_train = get_scenes(X_train, proj)

Generating random projection...
(10000, 784)
Projecting images to higher dim space...
(40200, 784)
(10000, 784)


In [186]:
print("Train accuracy:")
predictions = classify(X_train, model_30)
acc = accuracy_score(y_train[:X_train.shape[0]], predictions)
print(acc)

Train accuracy:
0.8514427860696517


In [187]:
print("Test accuracy:")
X_test = get_scenes(X_test, proj)
predictions = classify(X_test, model_30)
acc = accuracy_score(y_test[:X_test.shape[0]], predictions)
print(acc)

Test accuracy:
(19800, 784)
(10000, 784)
0.8522222222222222


In [190]:
_, _, X_test, y_test = load_dataset()

In [191]:
print("Test accuracy:")
X_test = get_scenes(X_test, proj)
predictions = classify(X_test, model_30)
acc = accuracy_score(y_test[:X_test.shape[0]], predictions)
print(acc)

Test accuracy:
(10000, 784)
(10000, 784)
0.8603
