In [13]:
import struct
import numpy as np
from sklearn.datasets import fetch_20newsgroups


def parse_mnist_images(file_path):
    """Load an MNIST IDX3 image file as a 3-D uint8 array.

    The file starts with four big-endian unsigned 32-bit integers (magic
    number, image count, rows, columns); every remaining byte is read as
    one uint8 pixel value.

    Args:
        file_path: Path of the IDX3 image file.

    Returns:
        np.ndarray of dtype uint8 with shape (num_images, num_rows, num_cols).

    Raises:
        ValueError: If the magic number is not 2051, or (raised by
            ``reshape``) if the pixel count does not match the header.
    """
    expected_magic = 2051

    with open(file_path, 'rb') as stream:
        header = struct.unpack('>IIII', stream.read(16))

        if header[0] != expected_magic:
            raise ValueError("Invalid magic number in the MNIST image file")

        count, height, width = header[1:]
        # Everything after the 16-byte header is raw pixel data.
        pixels = np.fromfile(stream, dtype=np.uint8)
        images = pixels.reshape(count, height, width)

    return images

def parse_mnist_labels(file_path):
    """Load an MNIST IDX1 label file as a 1-D uint8 array.

    The file starts with two big-endian unsigned 32-bit integers (magic
    number, label count); every remaining byte is one uint8 label.

    Args:
        file_path: Path of the IDX1 label file.

    Returns:
        np.ndarray of dtype uint8 with shape (num_labels,).

    Raises:
        ValueError: If the magic number is not 2049, or if the number of
            label bytes differs from the count declared in the header
            (truncated or padded file).
    """
    with open(file_path, 'rb') as f:
        magic, num_labels = struct.unpack('>II', f.read(8))

        if magic != 2049:
            raise ValueError("Invalid magic number in the MNIST label file")

        labels = np.fromfile(f, dtype=np.uint8)

    # The header count was previously ignored, so a damaged file silently
    # produced labels that no longer line up with the images.
    if labels.size != num_labels:
        raise ValueError(
            f"MNIST label file declares {num_labels} labels but contains {labels.size}"
        )

    return labels

# --- MNIST: first 1000 images and their labels ---
images_path = 'train-images-idx3-ubyte'
labels_path = 'train-labels-idx1-ubyte'

images = parse_mnist_images(images_path)
images = images[:1000]
# Kept under its own name: this cell previously stored the MNIST labels in
# `labels` and then overwrote them with the newsgroup targets below.
image_labels = parse_mnist_labels(labels_path)
image_labels = image_labels[:1000]

# --- 20 Newsgroups: first 1000 training documents and their targets ---
ng = fetch_20newsgroups(subset='train')
documents = ng.data
document_labels = ng.target
documents = documents[:1000]
document_labels = document_labels[:1000]

# Backward compatibility: after this cell `labels` holds the newsgroup
# targets, exactly as it did before.
labels = document_labels



In [17]:

from sklearn.feature_extraction.text import TfidfVectorizer


def shift_scale_normalize(images):
    """Min-max scale an array to the range [0, 1] using its global min and max.

    Computes ``(x - min) / (max - min)`` where min and max are taken over
    the whole array (not per image). The output has the same shape as the
    input.

    Args:
        images: Array-like of pixel values, any shape.

    Returns:
        np.ndarray of floats with the input's shape. Non-float inputs are
        converted to float64 first; float inputs keep their dtype. If every
        value is identical the result is all zeros (instead of NaN from
        0/0). Empty input yields an empty float array.
    """
    images = np.asarray(images)

    # Work in floating point so integer inputs cannot overflow in
    # `max - min` (e.g. int8 spanning -128..127).
    if np.issubdtype(images.dtype, np.floating):
        values = images
    else:
        values = images.astype(np.float64)

    if values.size == 0:
        return values.copy()

    min_val = np.min(values)
    value_range = np.max(values) - min_val

    # Constant input: there is no spread to scale by, so map everything to 0.
    if value_range == 0:
        return np.zeros_like(values)

    return (values - min_val) / value_range


def zero_mean_normalize(images):
    """Standardise an array to zero mean and unit variance (global statistics).

    Computes ``(x - mean) / std_dev`` where mean and standard deviation are
    taken over the whole array (not per image). The output has the same
    shape as the input.

    Args:
        images: Array-like of pixel values, any shape.

    Returns:
        np.ndarray of floats with the input's shape. If the standard
        deviation is zero (all values identical) the result is all zeros
        instead of NaN. Empty input yields an empty float64 array.
    """
    images = np.asarray(images)

    if images.size == 0:
        return images.astype(np.float64)

    centered = images - np.mean(images)
    std_dev = np.std(images)

    # Constant input: every centred value is already 0 and dividing by a
    # zero standard deviation would produce NaN.
    if std_dev == 0:
        return np.zeros_like(centered)

    return centered / std_dev


# Build the term-frequency matrix of a document collection.
def term_frequency_weighting_normalize(documents):
    """Fit a term-frequency vectoriser on ``documents`` and transform them.

    ``TfidfVectorizer`` is configured with ``use_idf=False`` and
    ``norm=None``, i.e. without inverse-document-frequency weighting and
    without row normalisation. No ``stop_words`` argument is passed, so no
    stop-word list is supplied here.

    Args:
        documents: Iterable of raw text documents.

    Returns:
        The matrix returned by ``fit_transform`` (the notebook converts it
        to a dense array with ``.toarray()``), one row per document.
    """
    tf_vectorizer = TfidfVectorizer(use_idf=False, norm=None)
    return tf_vectorizer.fit_transform(documents)

# Term-frequency features for the documents, converted to a dense array.
normalized_documents = term_frequency_weighting_normalize(documents).toarray()
# Min-max scale the images, then flatten each image into one feature row.
# NOTE: `images` is rebound here, so the raw array is no longer available
# under this name after the cell runs.
images = shift_scale_normalize(images)
images = images.reshape(images.shape[0], -1)

# Quick sanity checks: feature-matrix shapes plus a few sample values.
print("images shape:", images.shape)
print("documents shape:", normalized_documents.shape)
print(images[0][10])
print(normalized_documents[0][:100])

images shape: (1000, 784)
documents shape: (1000, 32190)
0.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [20]:
from scipy.spatial.distance import cdist

def compute_euclidean_distances(dataset_1, dataset_2):
    """Return the pairwise Euclidean distance matrix between two datasets.

    Thin wrapper around ``scipy.spatial.distance.cdist``.

    Args:
        dataset_1: 2-D array-like, one sample per row.
        dataset_2: 2-D array-like, one sample per row.

    Returns:
        Array of shape (len(dataset_1), len(dataset_2)) whose entry [i, j]
        is the distance between row i of ``dataset_1`` and row j of
        ``dataset_2``.
    """
    return cdist(dataset_1, dataset_2, metric='euclidean')

# Pairwise distances of each dataset against itself (square matrices).
images_distances = compute_euclidean_distances(images, images)
documents_distances = compute_euclidean_distances(normalized_documents, normalized_documents)

# Show the distances from the first sample to the first ten samples.
print(images_distances[0][:10])
print(documents_distances[0][:10])

[ 0.          9.36122213 10.87509456 10.01890101 10.48026358 10.12663683
  9.5042814   8.50966658  9.31437108 10.07571719]
[ 0.         18.27566688 27.60434748 17.17556404 18.81488772 38.27531842
 16.24807681 46.3788745  15.32970972 25.43619468]


In [21]:
import numpy as np
import numpy as np

def compute_euclidean_distances_custom(dataset_1, dataset_2):
    """Compute pairwise Euclidean distances with explicit loops over row pairs.

    Args:
        dataset_1: 2-D array-like, one sample per row.
        dataset_2: 2-D array-like, one sample per row, with the same number
            of columns as ``dataset_1``.

    Returns:
        float64 array of shape (len(dataset_1), len(dataset_2)) whose entry
        [i, j] is ``sqrt(sum((dataset_1[i] - dataset_2[j]) ** 2))``.
    """
    # Convert to float64 first: with integer inputs (e.g. raw uint8 pixels)
    # both the subtraction and the squaring wrap around and silently give
    # wrong distances. This also lets plain nested lists be passed in.
    dataset_1 = np.asarray(dataset_1, dtype=np.float64)
    dataset_2 = np.asarray(dataset_2, dtype=np.float64)

    num_dataset_1 = dataset_1.shape[0]
    num_dataset_2 = dataset_2.shape[0]

    # Result matrix, filled one pair at a time.
    distances = np.zeros((num_dataset_1, num_dataset_2))

    for i in range(num_dataset_1):
        for j in range(num_dataset_2):
            dist = np.sum((dataset_1[i] - dataset_2[j])**2)
            distances[i, j] = np.sqrt(dist)

    return distances

# Recompute both distance matrices with the loop-based implementation.
# NOTE: this rebinds `images_distances` / `documents_distances`, replacing
# the values produced by the cdist-based cell.
images_distances = compute_euclidean_distances_custom(images, images)
documents_distances = compute_euclidean_distances_custom(normalized_documents, normalized_documents)

# Print the full first row of each matrix (one value per sample).
print(images_distances[0])
print(documents_distances[0])


[ 0.          9.36122213 10.87509456 10.01890101 10.48026358 10.12663683
  9.5042814   8.50966658  9.31437108 10.07571719  8.39150172 11.24493854
 10.45527558 10.30421988  9.32249907 10.3055369   9.61667461  9.0413387
  9.26102655  8.96541177 12.3675979  10.43730187  9.53144011 10.04997891
  9.46622235 10.71066043  9.76730259  9.36504006 11.14420178 10.59621196
 10.15803839 10.30401392  9.3853338   9.30320148  9.76956332 10.22804266
  9.34246711 10.2453838   9.97734458  9.39653425  9.41941193  8.47669714
  9.36168539  8.40605135  8.97166714 10.26535365  9.94027919  9.78974347
  8.7535517   7.52090103  9.45022046 11.54884716 10.38867348  9.64100001
 10.19053167 10.39316902 12.21710123  9.72116602 11.30487641  9.85929383
 12.51790444 10.05384886  9.34889541 11.65269158 10.98016877 11.16688547
  9.74401777  9.54402092 11.51651339 10.78164102  8.21177126 10.36981913
  9.32291312  9.79937463  8.38087936 10.37366464 10.36995631  9.77194157
  9.62988748 10.55049847 10.52527176  9.49199545 10.

In [22]:
def compute_cosine_similarity(dataset_1, dataset_2):
    """Return the pairwise cosine similarity between the rows of two datasets.

    Args:
        dataset_1: 2-D array-like, one sample per row.
        dataset_2: 2-D array-like, one sample per row, with the same number
            of columns as ``dataset_1``.

    Returns:
        float64 array of shape (len(dataset_1), len(dataset_2)) whose entry
        [i, j] is ``dot(a_i, b_j) / (||a_i|| * ||b_j||)``. Pairs involving an
        all-zero row have no defined angle and are reported as 0.0 instead
        of NaN.
    """
    # Work in float64: with integer inputs (e.g. uint8 pixels) np.dot
    # accumulates in the integer type and wraps around.
    dataset_1 = np.asarray(dataset_1, dtype=np.float64)
    dataset_2 = np.asarray(dataset_2, dtype=np.float64)

    # Dot product between every row of dataset_1 and every row of dataset_2.
    dot_product = np.dot(dataset_1, dataset_2.T)

    # Product of the row norms for every (i, j) pair.
    norm_1 = np.linalg.norm(dataset_1, axis=1)
    norm_2 = np.linalg.norm(dataset_2, axis=1)
    norm_product = np.outer(norm_1, norm_2)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial value of 0.0. NaN here would otherwise sort as the
    # "most similar" neighbour after a descending argsort.
    cosine_similarity = np.zeros_like(dot_product)
    np.divide(dot_product, norm_product, out=cosine_similarity, where=norm_product > 0)

    return cosine_similarity

# Pairwise cosine similarity of each dataset against itself.
images_cosine_similarity = compute_cosine_similarity(images, images)
documents_cosine_similarity = compute_cosine_similarity(normalized_documents, normalized_documents)

# Print the full first row of each similarity matrix.
print(images_cosine_similarity[0])
print(documents_cosine_similarity[0])

[1.         0.55210019 0.21982848 0.33799347 0.34979632 0.45425453
 0.41737449 0.66936854 0.35276796 0.37819127 0.62913385 0.0815495
 0.49002335 0.42348687 0.36229556 0.39740302 0.46167426 0.5458097
 0.37837596 0.47165498 0.27120457 0.4896088  0.36817544 0.3237986
 0.45266735 0.51636029 0.30470524 0.68310664 0.49060796 0.22184489
 0.40679709 0.48096135 0.40742778 0.46587538 0.57201935 0.27798679
 0.57349154 0.53044428 0.39191341 0.5130025  0.37430556 0.61202127
 0.35853952 0.54061004 0.46017986 0.38498458 0.39686347 0.43552898
 0.48655909 0.7253875  0.41472717 0.48622822 0.41793238 0.36216206
 0.3839477  0.4619558  0.38377483 0.40071835 0.40149509 0.35564847
 0.18026705 0.32774771 0.55313665 0.48239222 0.32233922 0.14696154
 0.48842493 0.36254216 0.23214579 0.53266748 0.59886212 0.32162138
 0.34681866 0.45324532 0.60892466 0.48746225 0.38928432 0.37089527
 0.42399151 0.34623602 0.36764652 0.59674796 0.49854928 0.42277841
 0.43049984 0.52647262 0.31981258 0.42746059 0.3526574  0.3392891

In [23]:

class KNNClassifier:
    """k-nearest-neighbour classifier with majority voting.

    Training only stores the data. Prediction ranks the stored training
    samples by Euclidean distance or cosine similarity and returns the most
    frequent label among the ``k`` best-ranked ones.

    The notebook-level functions ``compute_euclidean_distances`` and
    ``compute_cosine_similarity`` are looked up when a predict method runs;
    both are called as ``f(X_test, X_train)``.
    """

    def __init__(self, k=3):
        """Store the number of neighbours that take part in each vote.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X_train, y_train):
        """Remember the training samples (rows of ``X_train``) and labels."""
        self.X_train = X_train
        self.y_train = y_train

    @staticmethod
    def _majority_vote(neighbour_labels):
        """Return the most frequent label; ties go to the closest neighbour."""
        counts = {}
        for label in neighbour_labels:
            counts[label] = counts.get(label, 0) + 1
        # dicts keep insertion order and max() returns the first maximal
        # key, so among equally frequent labels the one seen first (i.e.
        # belonging to the best-ranked neighbour) wins. The previous
        # set-based vote depended on set iteration order.
        return max(counts, key=counts.get)

    def _vote_for_each_row(self, ranked_neighbours):
        """Turn per-sample neighbour indices (best first) into predictions."""
        train_labels = np.asarray(self.y_train)
        predictions = [
            self._majority_vote(train_labels[row]) for row in ranked_neighbours
        ]
        return np.array(predictions)

    def predict_euclidean_distances(self, X_test):
        """Predict labels using the k training samples with smallest distance.

        Returns:
            np.ndarray with one predicted label per row of ``X_test``.
        """
        distances = compute_euclidean_distances(X_test, self.X_train)
        # Stable sort: equal distances keep training-set order.
        ranked = np.argsort(distances, axis=1, kind="stable")[:, :self.k]
        return self._vote_for_each_row(ranked)

    def predict_cosine_similarity(self, X_test):
        """Predict labels using the k training samples with highest similarity.

        Returns:
            np.ndarray with one predicted label per row of ``X_test``.
        """
        cosine_similarities = compute_cosine_similarity(X_test, self.X_train)
        # Sort the negated similarities instead of reversing an ascending
        # sort: argsort places NaN last, so reversing would rank undefined
        # similarities as the *nearest* neighbours.
        ranked = np.argsort(-np.asarray(cosine_similarities), axis=1, kind="stable")[:, :self.k]
        return self._vote_for_each_row(ranked)

    def accuracy(self, y_true, y_pred):
        """Return the fraction of positions where ``y_true`` equals ``y_pred``.

        Both arguments may be arrays or plain sequences.

        Raises:
            ValueError: If the two inputs do not have the same shape.
        """
        # Convert first: comparing two plain lists with == yields a single
        # bool rather than an element-wise result.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
            )
        return np.mean(y_true == y_pred)

In [26]:

# Reload MNIST from disk so this cell does not depend on the `images` /
# `labels` values left behind by earlier cells.
images_path = 'train-images-idx3-ubyte'
labels_path = 'train-labels-idx1-ubyte'

images = parse_mnist_images(images_path)
labels = parse_mnist_labels(labels_path)
# Keep only the first 1000 samples.
images = images[:1000]
labels = labels[:1000]

# Flatten each image into a single feature row.
images = images.reshape(images.shape[0], -1)

# Min-max scale the pixel values.
images = shift_scale_normalize(images)

# Positional 80% / 10% / 10% split into train / validation / test
# (the samples are not shuffled here).
split_train = int(len(images) * 0.8)
split_valid = int(len(images) * 0.9)

X_train, y_train = images[:split_train], labels[:split_train]
X_valid, y_valid = images[split_train:split_valid], labels[split_train:split_valid]
X_test, y_test = images[split_valid:], labels[split_valid:]

# 3-nearest-neighbour classifier on the training portion.
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)

# Evaluate with Euclidean-distance neighbours on the validation split...
y_valid_pred = knn.predict_euclidean_distances(X_valid)
validation_accuracy = knn.accuracy(y_valid, y_valid_pred)
print(f'Validation Accuracy: {validation_accuracy:.2f}')

# ...and on the held-out test split.
y_test_pred = knn.predict_euclidean_distances(X_test)
test_accuracy = knn.accuracy(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')


Validation Accuracy: 0.90
Test Accuracy: 0.88
Validation Accuracy: 0.27
Test Accuracy: 0.27


In [None]:

# Reload the 20 Newsgroups training subset; no slicing is applied here.
ng = fetch_20newsgroups(subset='train')
documents = ng.data
labels = ng.target
# NOTE(review): the next two self-assignments are no-ops — presumably
# left over from a removed `[:N]` slice; confirm and delete.
documents = documents
labels = labels

# Term-frequency features, converted to a dense array.
# NOTE(review): densifying the matrix for every loaded document may need
# a lot of memory — confirm this is intended.
normalized_documents = term_frequency_weighting_normalize(documents).toarray()

# Positional 80% / 10% / 10% split into train / validation / test
# (the samples are not shuffled here).
split_train = int(len(normalized_documents) * 0.8)
split_valid = int(len(normalized_documents) * 0.9)

X_train, y_train = normalized_documents[:split_train], labels[:split_train]
X_valid, y_valid = normalized_documents[split_train:split_valid], labels[split_train:split_valid]
X_test, y_test = normalized_documents[split_valid:], labels[split_valid:]

# 3-nearest-neighbour classifier on the training portion.
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)

# Disabled alternative: evaluation with cosine-similarity neighbours.
# y_valid_pred = knn.predict_cosine_similarity(X_valid)
# validation_accuracy = knn.accuracy(y_valid, y_valid_pred)
# print(f'Validation Accuracy: {validation_accuracy:.2f}')

# y_test_pred = knn.predict_cosine_similarity(X_test)
# test_accuracy = knn.accuracy(y_test, y_test_pred)
# print(f'Test Accuracy: {test_accuracy:.2f}')

# Active evaluation: Euclidean-distance neighbours on the validation split...
y_valid_pred = knn.predict_euclidean_distances(X_valid)
validation_accuracy = knn.accuracy(y_valid, y_valid_pred)
print(f'Validation Accuracy: {validation_accuracy:.2f}')

# ...and on the held-out test split.
y_test_pred = knn.predict_euclidean_distances(X_test)
test_accuracy = knn.accuracy(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')
    


Validation Accuracy: 0.59
Test Accuracy: 0.57


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Build the term-frequency matrix of a document collection.
def term_frequency_weighting_normalize(documents):
    """Fit a term-frequency vectoriser on ``documents`` and transform them.

    ``TfidfVectorizer`` is configured with ``use_idf=False`` and
    ``norm=None``, i.e. without inverse-document-frequency weighting and
    without row normalisation. No ``stop_words`` argument is passed, so no
    stop-word list is supplied here.

    Args:
        documents: Iterable of raw text documents.

    Returns:
        The matrix returned by ``fit_transform`` (the notebook converts it
        to a dense array with ``.toarray()``), one row per document.
    """
    tf_vectorizer = TfidfVectorizer(use_idf=False, norm=None)
    return tf_vectorizer.fit_transform(documents)

# Dense term-frequency features for whatever `documents` currently holds
# (defined by an earlier cell).
normalized_documents = term_frequency_weighting_normalize(documents).toarray()

# Keep only the first 100 rows and confirm the count.
normalized_documents = normalized_documents[:100]
print(len(normalized_documents))


# the cat sat on the mat
# 

100


In [7]:
from scipy.spatial.distance import cdist

def euclidean_distances(documents):
    """Return the pairwise Euclidean distances between the rows of ``documents``.

    Thin wrapper around ``scipy.spatial.distance.cdist`` with the same
    dataset on both sides.

    Args:
        documents: 2-D array-like, one sample per row.

    Returns:
        Square array of shape (len(documents), len(documents)) whose entry
        [i, j] is the distance between rows i and j.
    """
    return cdist(documents, documents, metric='euclidean')

# Pairwise distances between the document vectors; print the first row.
distances = euclidean_distances(normalized_documents)
print(distances[0])




[  0.          18.27566688  27.60434748  17.17556404  18.81488772
  38.27531842  16.24807681  46.3788745   15.32970972  25.43619468
  18.41195264  67.6609193   17.97220076  71.330218    22.29349681
  20.0748599   31.22498999  96.76259608  18.27566688  23.8117618
  17.94435844  24.97999199  26.17250466  18.65475811  16.2788206
  18.86796226  34.51086785  16.37070554  37.62977544  17.3781472
  28.56571371  21.54065923  14.86606875  44.92215489  17.91647287
  19.77371993  18.41195264  41.41255848  28.05352028  70.73188814
  23.02172887  15.93737745  16.58312395  33.19638535  23.57965225
  16.          15.84297952  19.89974874  16.03121954  17.97220076
  21.65640783  21.81742423  16.64331698  21.79449472  43.01162634
  20.83266666  33.12099032  16.97056275  17.17556404 131.01526629
  17.69180601  21.9089023   17.91647287  15.90597372  16.40121947
  30.46309242  21.21320344  22.          59.78294071  20.39607805
 319.07679326  24.8394847   28.39013913  19.49358869  23.47338919
  18.33030278

In [9]:
import numpy as np

def euclidean_distances_custom(documents):
    """Compute the symmetric pairwise Euclidean distance matrix by hand.

    Each unordered pair of rows is visited once and the result is mirrored
    across the diagonal; the diagonal itself stays 0.

    Args:
        documents: 2-D array-like, one document vector per row.

    Returns:
        float64 array of shape (num_documents, num_documents) whose entry
        [i, j] is ``sqrt(sum((documents[i] - documents[j]) ** 2))``.
        Empty input yields an array with no non-zero entries (shape (0, 0)
        when there are no rows).

    Raises:
        ValueError: If a non-empty input is not 2-dimensional.
    """
    # float64 avoids the wrap-around that integer inputs (e.g. uint8)
    # suffer in the subtraction and squaring, and accepts nested lists.
    docs = np.asarray(documents, dtype=np.float64)
    num_documents = docs.shape[0] if docs.ndim else 0

    distances = np.zeros((num_documents, num_documents))
    if docs.size == 0:
        return distances
    if docs.ndim != 2:
        raise ValueError("documents must be a 2-D array-like (one row per document)")

    for i in range(num_documents):
        for j in range(i + 1, num_documents):
            # The per-element Python loop is replaced by one vectorised
            # expression over the whole row pair (results may differ from
            # the sequential sum only in the last floating-point bits).
            diff = docs[i] - docs[j]
            dist = np.sqrt(np.sum(diff * diff))

            distances[i, j] = dist
            distances[j, i] = dist

    return distances

# Same distance matrix via the hand-written implementation; print the
# first row so it can be compared with the cdist-based output.
distances_manual = euclidean_distances_custom(normalized_documents)

print(distances_manual[0])


[  0.          18.27566688  27.60434748  17.17556404  18.81488772
  38.27531842  16.24807681  46.3788745   15.32970972  25.43619468
  18.41195264  67.6609193   17.97220076  71.330218    22.29349681
  20.0748599   31.22498999  96.76259608  18.27566688  23.8117618
  17.94435844  24.97999199  26.17250466  18.65475811  16.2788206
  18.86796226  34.51086785  16.37070554  37.62977544  17.3781472
  28.56571371  21.54065923  14.86606875  44.92215489  17.91647287
  19.77371993  18.41195264  41.41255848  28.05352028  70.73188814
  23.02172887  15.93737745  16.58312395  33.19638535  23.57965225
  16.          15.84297952  19.89974874  16.03121954  17.97220076
  21.65640783  21.81742423  16.64331698  21.79449472  43.01162634
  20.83266666  33.12099032  16.97056275  17.17556404 131.01526629
  17.69180601  21.9089023   17.91647287  15.90597372  16.40121947
  30.46309242  21.21320344  22.          59.78294071  20.39607805
 319.07679326  24.8394847   28.39013913  19.49358869  23.47338919
  18.33030278

In [41]:
def compute_cosine_similarity(documents, other_documents=None):
    """Return pairwise cosine similarities between row vectors.

    Called with one argument, every row of ``documents`` is compared with
    every other row of the same matrix. The optional second argument keeps
    this definition usable by callers elsewhere in the notebook that invoke
    ``compute_cosine_similarity(a, b)`` with two datasets.

    Args:
        documents: 2-D array-like, one vector per row.
        other_documents: Optional 2-D array-like with the same number of
            columns; defaults to ``documents`` itself.

    Returns:
        float64 array of shape (len(documents), len(other_documents)) whose
        entry [i, j] is ``dot(d_i, o_j) / (||d_i|| * ||o_j||)``. Pairs
        involving an all-zero row have no defined angle and are reported as
        0.0 instead of NaN.
    """
    # float64 avoids integer wrap-around inside np.dot for integer inputs.
    documents = np.asarray(documents, dtype=np.float64)
    if other_documents is None:
        other_documents = documents
    else:
        other_documents = np.asarray(other_documents, dtype=np.float64)

    # Dot product of every row pair; shape (len(documents), len(other_documents)).
    dot_product = np.dot(documents, other_documents.T)

    # Product of the row norms for every pair.
    norm_product = np.outer(
        np.linalg.norm(documents, axis=1),
        np.linalg.norm(other_documents, axis=1),
    )

    # Cosine similarity; entries with a zero denominator stay at 0.0.
    cosine_similarity = np.zeros_like(dot_product)
    np.divide(dot_product, norm_product, out=cosine_similarity, where=norm_product > 0)

    return cosine_similarity

# Cosine similarity between all document vectors; print the first row.
cosine_similarities_numpy = compute_cosine_similarity(normalized_documents)
print(cosine_similarities_numpy[0])

[1.         0.31275985 0.51513964 0.32130928 0.37335749 0.42218828
 0.27467634 0.30501928 0.28688525 0.40647549]
