In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from mnist import MNIST
from tqdm import tqdm

In [2]:
# Create the MNIST loader pointed at the local data directory.
mnist_loader = MNIST("../data/mnist/")
# Presumably tells the loader to read the gzip-compressed files directly — TODO confirm
# against the loader's documentation.
mnist_loader.gz = True

In [3]:
# Load the training and test splits; each call is unpacked into an (images, labels) pair.
images, labels = mnist_loader.load_training()
timages, tlabels = mnist_loader.load_testing()

In [4]:
# Convert the four loader outputs to NumPy arrays, rebinding the same names so
# that later cells can slice them and do arithmetic on them.
images, labels, timages, tlabels = map(
    np.array, (images, labels, timages, tlabels)
)

In [5]:
# Sanity check: display the (n_samples, n_features) shapes of the train and test arrays.
images.shape, timages.shape

((60000, 784), (10000, 784))

In [6]:
# Row counts of the train and test sets; they size the memmap files created below.
n_rows = images.shape[0]
t_n_rows = timages.shape[0]
# Number of query rows passed to each pairwise call. 2500 divides both 60000 and
# 10000 evenly, so with this dataset every batch is full-sized.
batch_size = 2500

## Initial test: batched writes into a memmap

In [71]:
# Take the first 500 training samples (and their labels) for a quick smoke test.
sub_x, sub_y = images[:500], labels[:500]

In [76]:
# Scratch memmap for the smoke test: one row per sampled image, one column per
# training image. mode='w+' creates the file, overwriting any existing one.
fp = np.memmap('../data/mnist/tests/init.memmap', mode='w+', shape=(500, n_rows), dtype='float32')

In [77]:
# Write the 500 x n_rows distance block in five chunks of 100 rows, printing the
# chunk shape and its row bounds after each write.
chunk_rows = 100
for start in range(0, 5 * chunk_rows, chunk_rows):
    end = start + chunk_rows
    queries = sub_x[start:end]
    fp[start:end] = euclidean_distances(X=queries, Y=images)
    print(queries.shape, start, end)

(100, 784) 0 100
(100, 784) 100 200
(100, 784) 200 300
(100, 784) 300 400
(100, 784) 400 500


In [78]:
# Compare the chunk-written memmap against the same distances computed in a single call.
np.allclose(fp, euclidean_distances(X=sub_x, Y=images))

True

In [79]:
# Release the memmap; NumPy documents deletion as flushing pending changes to disk.
del fp

The initial test passes: the batched memmap writes match the direct computation.

## Cosine similarity

In [9]:
# Backing file for the cosine-similarity matrix.
path = '../data/mnist/cos_sim.memmap'

In [11]:
# Layout: the first n_rows rows hold train-vs-train values and the next t_n_rows
# rows hold test-vs-train values; columns index the training images.
# With the shapes shown earlier this is 70000 x 60000 float32 values, i.e. about
# 16.8 GB once fully written.
fp = np.memmap(path, dtype='float32', mode='w+', shape=(n_rows + t_n_rows, n_rows))

In [None]:
# Fill the train-vs-train rows of the memmap, one batch of query rows at a time.
# Stepping over row offsets (instead of over `n_rows // batch_size` whole batches)
# ensures a trailing partial batch is still written when batch_size does not
# divide n_rows evenly.
for start in tqdm(range(0, n_rows, batch_size)):
    end = min(start + batch_size, n_rows)
    fp[start:end, :] = cosine_similarity(X=images[start:end], Y=images)

A Jupyter Widget

In [17]:
# Fill the test-vs-train rows, which are stored after the n_rows training rows.
# Stepping over row offsets (instead of over `t_n_rows // batch_size` whole
# batches) ensures a trailing partial batch is still written when batch_size
# does not divide t_n_rows evenly.
for start in tqdm(range(0, t_n_rows, batch_size)):
    end = min(start + batch_size, t_n_rows)
    # Shift by n_rows to land in the test section of the memmap.
    fp[n_rows + start:n_rows + end, :] = cosine_similarity(X=timages[start:end], Y=images)

A Jupyter Widget




In [18]:
# Release the cosine-similarity memmap; deleting it flushes pending changes to disk.
del fp

## Euclidean distance

In [9]:
# Backing file for the Euclidean-distance matrix.
# NOTE(review): 'eud_dist' looks like a typo for 'euc_dist'; left unchanged because
# other code may already read this path — confirm before renaming.
path = '../data/mnist/eud_dist.memmap'
# Same layout as the cosine matrix: n_rows train rows followed by t_n_rows test
# rows, one column per training image.
fp = np.memmap(path, dtype='float32', mode='w+', shape=(n_rows + t_n_rows, n_rows))

In [10]:
# Fill the train-vs-train rows with Euclidean distances, one batch at a time.
# Stepping over row offsets (instead of over `n_rows // batch_size` whole batches)
# ensures a trailing partial batch is still written when batch_size does not
# divide n_rows evenly.
for start in tqdm(range(0, n_rows, batch_size)):
    end = min(start + batch_size, n_rows)
    fp[start:end, :] = euclidean_distances(X=images[start:end], Y=images)

A Jupyter Widget




In [11]:
# Fill the test-vs-train rows, which are stored after the n_rows training rows.
# Stepping over row offsets (instead of over `t_n_rows // batch_size` whole
# batches) ensures a trailing partial batch is still written when batch_size
# does not divide t_n_rows evenly.
for start in tqdm(range(0, t_n_rows, batch_size)):
    end = min(start + batch_size, t_n_rows)
    # Shift by n_rows to land in the test section of the memmap.
    fp[n_rows + start:n_rows + end, :] = euclidean_distances(X=timages[start:end], Y=images)

A Jupyter Widget




In [12]:
# Release the Euclidean-distance memmap; deleting it flushes pending changes to disk.
del fp

## Helper functions

In [7]:
def runner_train(fp, nf):
    """Write train-vs-train Euclidean distances into the first ``n_rows`` rows of ``fp``.

    Reads the notebook globals ``images``, ``n_rows`` and ``batch_size``.

    Parameters
    ----------
    fp : 2-D writable array (an ``np.memmap`` in this notebook)
        Destination; rows ``0 .. n_rows-1`` are overwritten in place.
    nf : number
        Normalization factor; every pixel value is divided by it before the
        distances are computed (``1`` leaves the values unchanged).

    Returns
    -------
    None
    """
    X = images / nf
    # Step over row offsets rather than over `n_rows // batch_size` whole batches,
    # so a trailing partial batch is still written when batch_size does not
    # divide n_rows evenly.
    for start in tqdm(range(0, n_rows, batch_size)):
        end = min(start + batch_size, n_rows)
        fp[start:end, :] = euclidean_distances(X=X[start:end], Y=X)

In [8]:
def runner_test(fp, nf):
    """Write test-vs-train Euclidean distances into the rows of ``fp`` after the train rows.

    Reads the notebook globals ``timages``, ``images``, ``n_rows``, ``t_n_rows``
    and ``batch_size``.

    Parameters
    ----------
    fp : 2-D writable array (an ``np.memmap`` in this notebook)
        Destination; rows ``n_rows .. n_rows + t_n_rows - 1`` are overwritten in
        place, earlier rows are left untouched.
    nf : number
        Normalization factor; both the test and the train pixel values are
        divided by it before the distances are computed.

    Returns
    -------
    None
    """
    X = timages / nf
    Y = images / nf
    # Step over row offsets rather than over `t_n_rows // batch_size` whole
    # batches, so a trailing partial batch is still written when batch_size does
    # not divide t_n_rows evenly.
    for start in tqdm(range(0, t_n_rows, batch_size)):
        end = min(start + batch_size, t_n_rows)
        # Shift by n_rows: the test section starts right after the train rows.
        fp[n_rows + start:n_rows + end, :] = euclidean_distances(X=X[start:end], Y=Y)

## Euclidean distance, no normalization

In [9]:
# Output file for distances on unscaled pixel values (n_rows train rows followed
# by t_n_rows test rows, one column per training image).
# NOTE(review): with nf=1 this should hold the same values as the matrix built in
# the "euc dist" section — confirm whether both files are needed.
fp = np.memmap('../data/mnist/dist.memmap', dtype='float32', mode='w+', shape=(n_rows + t_n_rows, n_rows))

In [10]:
runner_train(fp, 1) # train-vs-train rows; nf=1 leaves pixel values unscaled

100%|██████████| 24/24 [05:06<00:00, 12.77s/it]


In [11]:
runner_test(fp, 1) # test-vs-train rows, written after the train rows; nf=1 leaves pixel values unscaled

100%|██████████| 4/4 [00:55<00:00, 13.79s/it]


In [12]:
# Release the unnormalized-distance memmap; deleting it flushes pending changes to disk.
del fp

## Euclidean distance, pixels divided by 255

In [14]:
# Output file for distances computed on pixel values divided by 255
# (n_rows train rows followed by t_n_rows test rows, one column per training image).
fp = np.memmap('../data/mnist/dist_255.memmap', dtype='float32', mode='w+', shape=(n_rows + t_n_rows, n_rows))

In [15]:
runner_train(fp, 255) # train-vs-train rows; pixel values divided by 255

100%|██████████| 24/24 [04:52<00:00, 12.20s/it]


In [16]:
runner_test(fp, 255) # test-vs-train rows, written after the train rows; pixel values divided by 255

100%|██████████| 4/4 [00:40<00:00, 10.20s/it]


In [17]:
# Release the 255-normalized memmap; deleting it flushes pending changes to disk.
del fp

## Euclidean distance, pixels divided by 128

In [19]:
# Output file for distances computed on pixel values divided by 128
# (n_rows train rows followed by t_n_rows test rows, one column per training image).
# NOTE(review): unlike the other outputs this one goes under tests/ and is named
# 'edu_128' (possibly a typo for 'euc_128') — confirm the intended path before renaming.
fp = np.memmap('../data/mnist/tests/edu_128.memmap', dtype='float32', mode='w+', shape=(n_rows + t_n_rows, n_rows))

In [20]:
runner_train(fp, 128) # train-vs-train rows; pixel values divided by 128

100%|██████████| 24/24 [04:28<00:00, 11.17s/it]


In [21]:
runner_test(fp, 128) # test-vs-train rows, written after the train rows; pixel values divided by 128

100%|██████████| 4/4 [00:42<00:00, 10.60s/it]


In [22]:
# Release the 128-normalized memmap; deleting it flushes pending changes to disk.
del fp