In [14]:
import numpy as np
from mnist import MNIST
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from PIL import Image
from matplotlib import pyplot as plt

In [15]:
def shuffle(X, y):
    permutation = np.arange(X.shape[0])
    np.random.shuffle(permutation)
    return X[permutation], y[permutation]

def load_dataset():
    mndata = MNIST('./data/')
    X_train, labels_train = map(np.array, mndata.load_training())
    X_test, labels_test = map(np.array, mndata.load_testing())
    return X_train, labels_train, X_test, labels_test

In [16]:
X_train, labels_train, _, _ = load_dataset()
# X_train, labels_train = shuffle(X_train, labels_train)
X_train, X_test, y_train, y_test = train_test_split(X_train, labels_train, test_size=0.33, random_state=42)

X_train[0].reshape(28, 28)

plt.imshow(X_train[2].reshape(28, 28), interpolation='nearest')

img = Image.fromarray(X_train[2].reshape(28, 28))
img.show()

In [17]:
D = 10000 # dimensions in random space
IMG_LEN = 28
NUM_SAMPLES = X_train.shape[0]

In [18]:
# print("Generating random projection...")
# proj = np.random.rand(D, IMG_LEN * IMG_LEN)
print("Generating random projection...")
seed = 50
np.random.seed(seed)
proj = np.random.rand(D, IMG_LEN * IMG_LEN)
# proj[proj==0] = -1
print(proj.shape)
def get_scene(img, proj):
    return np.dot(proj, img)

# Transform the image vectors into the hypervectors
def get_scenes(images, proj):
    print(images.shape)
    print(proj.shape)
    return np.dot(images[:NUM_SAMPLES, :], proj.T)

print("Projecting images to higher dim space...")
X_train = get_scenes(X_train, proj)

Generating random projection...
(10000, 784)
Projecting images to higher dim space...
(40200, 784)
(10000, 784)


In [52]:
digit_vectors = np.zeros((10, D))

In [53]:
print(digit_vectors.shape)

(10, 10000)


In [54]:
digit_vectors = np.zeros((10, D))
# num_count = {}
for i in range(NUM_SAMPLES):
#     num_count[y_train[i]] =  num_count.get(y_train[i], 0) + 1
    digit_vectors[y_train[i]] += X_train[i]
digit_vectors = np.array(digit_vectors)

In [55]:
# digit_vectors[digit_vectors > 0] = 1
# digit_vectors[digit_vectors <= 0] = -1

In [56]:
digit_vectors.shape

(10, 10000)

for i in num_count:
    digit_vectors[i] /= num_count[i] 

In [22]:
def classify(images, digit_vectors):
    similarities = cosine_similarity(images, digit_vectors)
    classifications = np.argmax(similarities, axis=1)
    return classifications

In [58]:
print("Train accuracy:")
predictions = classify(X_train, digit_vectors)
acc = accuracy_score(y_train[:X_train.shape[0]], predictions)
print(acc)

Train accuracy:
0.8131094527363184


In [59]:
print("Test accuracy:")
X_test = get_scenes(X_test, proj)
predictions = classify(X_test, digit_vectors)
acc = accuracy_score(y_test[:X_test.shape[0]], predictions)
print(acc)

Test accuracy:
(19800, 784)
(10000, 784)
0.8085858585858586


import pandas as pd
df = pd.DataFrame({f'model_{seed}': predictions,
                   'y': y_train})

In [60]:
np.save(f'./temp/digit_vectors_seed_{seed}_version_2_floating.npy', digit_vectors)

#### Make sure you sun from top until this point with multiple seeds to run code below this point

In [23]:
model_30 = np.load("./temp/digit_vectors_seed_30_version_2_floating.npy")
model_40 = np.load("./temp/digit_vectors_seed_40_version_2_floating.npy")
model_50 = np.load("./temp/digit_vectors_seed_50_version_2_floating.npy")

### First, retraining on Adversarial inputs from Training set

In [24]:
X_train, labels_train, _, _ = load_dataset()
# X_train, labels_train = shuffle(X_train, labels_train)
X_train, X_test, y_train, y_test = train_test_split(X_train, labels_train, test_size=0.33, random_state=42)

In [25]:
import copy
seeds = [30, 40, 50]
models = [model_30, model_40, model_50]
results = []

for seed, model in zip(seeds, models):
    np.random.seed(seed)
    proj = np.random.rand(D, IMG_LEN * IMG_LEN)
    X_train_copy = copy.deepcopy(X_train)
    X_train_copy = get_scenes(X_train_copy, proj)
    predictions = classify(X_train_copy, model)
    results.append(predictions)
    print("here")

(40200, 784)
(10000, 784)
here
(40200, 784)
(10000, 784)
here
(40200, 784)
(10000, 784)
here


In [26]:
len(results[0])

40200

In [27]:
import pandas as pd
df = pd.DataFrame({'model_30': list(results[0]),
                   'model_40': list(results[1]),
                   'model_50': list(results[2]),
                   'y': y_train})

In [28]:
df.head()

Unnamed: 0,model_30,model_40,model_50,y
0,7,9,7,8
1,8,8,8,6
2,4,4,2,9
3,6,6,6,6
4,3,3,3,3


In [32]:
x = df["model_30"] == df["model_40"]

In [33]:
y = df["model_30"] == df["model_50"]

In [62]:
model_names = ["model_30", "model_40", "model_50"]

In [63]:
"Hello".lower()

'hello'

In [65]:
for i in range(len(model_names)):
    x = (df[model_names[i]] != df[model_names[i+1]])
    for j in range(i+2, len(model_names)):
        x = x | (df[model_names[i]] != df[model_names[j]])
    break

In [67]:
df[x]

Unnamed: 0,model_30,model_40,model_50,y
0,7,9,7,8
2,4,4,2,9
51,9,4,9,4
151,0,0,4,7
175,9,7,9,7
200,8,5,5,8
280,3,3,5,5
305,5,0,5,0
306,2,2,3,3
369,3,3,9,9


In [68]:
df_discrepencies = df[(df["model_30"] != df["model_40"]) | (df["model_30"] != df["model_50"]) | (df["model_40"] != df["model_50"])]

In [69]:
df_discrepencies.head()

Unnamed: 0,model_30,model_40,model_50,y
0,7,9,7,8
2,4,4,2,9
51,9,4,9,4
151,0,0,4,7
175,9,7,9,7


In [70]:
len(df_discrepencies)

1201

In [190]:
df_discrepencies.reset_index(inplace=True)

In [191]:
df_discrepencies.head()

Unnamed: 0,index,model_30,model_40,model_50,y
0,0,7,9,7,8
1,2,4,4,2,9
2,51,9,4,9,4
3,151,0,0,4,7
4,175,9,7,9,7


In [192]:
def get_scene(img, proj):
    return np.dot(img, proj.T)

In [193]:
epochs = 10

for epoch in range(epochs):
    for row in df_discrepencies.iterrows():
        idx = row[1]["index"]
        y_false = row[1]["model_50"]
        y_true = row[1]["y"]
        hv = get_scene(X_train[idx].reshape((1, -1)), proj)
        model_50[y_false] -= hv[0]
        model_50[y_true] += hv[0]

In [71]:
for i, j, k in zip([1, 2, 3], ['a', 'b', 'c'], [4, 5, 6]):
    print(i, j, k)

1 a 4
2 b 5
3 c 6


In [194]:
# print("Generating random projection...")
# proj = np.random.rand(D, IMG_LEN * IMG_LEN)
print("Generating random projection...")
seed = 50
np.random.seed(seed)
proj = np.random.rand(D, IMG_LEN * IMG_LEN)
# proj[proj==0] = -1
print(proj.shape)
def get_scene(img, proj):
    return np.dot(img, proj.T)

# Transform the image vectors into the hypervectors
def get_scenes(images, proj):
    print(images.shape)
    print(proj.shape)
    return np.dot(images[:NUM_SAMPLES, :], proj.T)

print("Projecting images to higher dim space...")
X_train = get_scenes(X_train, proj)

Generating random projection...
(10000, 784)
Projecting images to higher dim space...
(40200, 784)
(10000, 784)


In [195]:
print("Train accuracy:")
predictions = classify(X_train, model_50)
acc = accuracy_score(y_train[:X_train.shape[0]], predictions)
print(acc)

Train accuracy:
0.8420398009950248


In [196]:
print("Test accuracy:")
X_test = get_scenes(X_test, proj)
predictions = classify(X_test, model_50)
acc = accuracy_score(y_test[:X_test.shape[0]], predictions)
print(acc)

Test accuracy:
(19800, 784)
(10000, 784)
0.8433838383838383


In [201]:
test = df[(df["model_30"] == df["model_40"]) & (df["model_30"] == df["model_50"]) & (df["model_40"] == df["model_50"]) ]

In [208]:
1 - len(test[test["model_50"] != test["y"]])/len(df)

0.8320398009950248

In [None]:
np.save(f'./temp/digit_vectors_seed_{seed}_version_2_floating_retrained.npy', digit_vectors)

In [None]:
model_30 = np.load("./temp/digit_vectors_seed_30_version_2_floating_retrained.npy")
model_40 = np.load("./temp/digit_vectors_seed_40_version_2_floating_retrained.npy")
model_50 = np.load("./temp/digit_vectors_seed_50_version_2_floating_retrained.npy")

In [None]:
X_train, labels_train, _, _ = load_dataset()
# X_train, labels_train = shuffle(X_train, labels_train)
X_train, X_test, y_train, y_test = train_test_split(X_train, labels_train, test_size=0.33, random_state=42)

In [None]:
import copy
seeds = [30, 40, 50]
models = [model_30, model_40, model_50]
results = []

for seed, model in zip(seeds, models):
    np.random.seed(seed)
    proj = np.random.rand(D, IMG_LEN * IMG_LEN)
    X_train_copy = copy.deepcopy(X_train)
    X_train_copy = get_scenes(X_train_copy, proj)
    predictions = classify(X_train_copy, model)
    results.append(predictions)
    print("here")

In [49]:
X_train, labels_train, _, _ = load_dataset()
# X_train, labels_train = shuffle(X_train, labels_train)
X_train, X_test, y_train, y_test = train_test_split(X_train, labels_train, test_size=0.33, random_state=42)

In [23]:
import copy
seeds = [30, 40, 50]
models = [model_30, model_40, model_50]
results = []

for seed, model in zip(seeds, models):
    np.random.seed(seed)
    proj = np.random.rand(D, IMG_LEN * IMG_LEN)
    X_test_copy = copy.deepcopy(X_test)
    X_test_copy = get_scenes(X_test_copy, proj)
    predictions = classify(X_test_copy, model)
    results.append(predictions)
    print("here")

NameError: name 'get_scenes' is not defined

In [49]:
np.save("./temp/results_30_40_50_version_2.npy", results)

In [4]:
results = np.load("./temp/results_30_40_50_version_2.npy")

In [5]:
import pandas as pd
df = pd.DataFrame({'model_30': list(results[0]),
                   'model_40': list(results[1]),
                   'model_50': list(results[2]),
                   'y': y_test})

In [6]:
df.head()

Unnamed: 0,model_30,model_40,model_50,y
0,7,7,7,7
1,3,3,3,3
2,8,8,8,8
3,9,9,9,9
4,3,3,3,3


In [7]:
len(df)

19800

In [8]:
df_discrepencies = df[(df["model_30"] != df["model_40"]) | (df["model_30"] != df["model_50"]) | (df["model_40"] != df["model_50"])]

In [9]:
df_discrepencies.to_excel("./temp/test_discrepencies_version_2_floating.xlsx")

df_discrepencies = df[(df["y"] == df["model_30"]) | (df["y"] == df["model_40"]) | (df["y"] == df["model_50"])]

df_discrepencies = df_discrepencies[((df_discrepencies["y"] + df_discrepencies["model_30"]
                                     + df_discrepencies["model_40"] + df_discrepencies["model_50"]) 
                                     != df_discrepencies["y"] * 4)]

df_discrepencies.head()

sum(df_discrepencies["model_50"] != df_discrepencies["y"])

len(df_discrepencies)

df_discrepencies.to_excel("./temp/test_discrepencies_version_2.xlsx")

test_discrepencies = pd.read_excel("./temp/test_discrepencies_version_2.xlsx")

In [10]:
len(df_discrepencies)

631

In [11]:
df_discrepencies.head()

Unnamed: 0,model_30,model_40,model_50,y
15,9,7,9,7
29,2,6,2,6
64,5,0,5,0
83,1,1,7,9
102,5,5,8,5


In [60]:
test_discrepencies.rename(columns={"Unnamed: 0": "idx"}, inplace=True)

In [61]:
test_discrepencies.head()

Unnamed: 0,idx,model_30,model_40,model_50,y
0,15,9,7,9,7
1,29,2,6,2,6
2,64,5,0,5,0
3,102,5,5,8,5
4,173,9,7,9,9


In [62]:
def get_scene(img, proj):
    return np.dot(img, proj.T)

In [63]:
for row in test_discrepencies.iterrows():
    idx = row[1]["idx"]
    y_false = row[1]["model_30"]
    y_true = row[1]["y"]
    hv = get_scene(X_train[idx].reshape((1, -1)), proj)
    model_30[y_false] -= hv[0]
    model_30[y_true] += hv[0]

In [64]:
# print("Generating random projection...")
# proj = np.random.rand(D, IMG_LEN * IMG_LEN)
print("Generating random projection...")
seed = 30
np.random.seed(seed)
proj = np.random.rand(D, IMG_LEN * IMG_LEN)
# proj[proj==0] = -1
print(proj.shape)
def get_scene(img, proj):
    return np.dot(img, proj.T)

# Transform the image vectors into the hypervectors
def get_scenes(images, proj):
    print(images.shape)
    print(proj.shape)
    return np.dot(images[:NUM_SAMPLES, :], proj.T)

print("Projecting images to higher dim space...")
X_train = get_scenes(X_train, proj)

Generating random projection...
(10000, 784)
Projecting images to higher dim space...
(40200, 784)
(10000, 784)


In [65]:
print("Train accuracy:")
predictions = classify(X_train, model_30)
acc = accuracy_score(y_train[:X_train.shape[0]], predictions)
print(acc)

Train accuracy:
0.8130597014925374


In [66]:
print("Test accuracy:")
X_test = get_scenes(X_test, proj)
predictions = classify(X_test, model_30)
acc = accuracy_score(y_test[:X_test.shape[0]], predictions)
print(acc)

Test accuracy:
(19800, 784)
(10000, 784)
0.8077272727272727
