# Problem 3 - Parse Data

In [1]:
import struct
import numpy as np
from sklearn.datasets import fetch_20newsgroups


def parse_mnist_images(file_path):
    """Read an MNIST IDX3 image file into a uint8 array.

    The file begins with four big-endian unsigned 32-bit integers (magic
    number, image count, row count, column count) followed by one byte per
    pixel.

    Args:
        file_path: Path to the uncompressed IDX3 image file.

    Returns:
        np.ndarray of dtype uint8 with shape (num_images, num_rows, num_cols).

    Raises:
        ValueError: If the magic number is not 2051, or if the file contains
            fewer pixel bytes than the header announces.
    """
    with open(file_path, 'rb') as f:
        magic, num_images, num_rows, num_cols = struct.unpack('>IIII', f.read(16))

        if magic != 2051:
            raise ValueError("Invalid magic number in the MNIST image file")

        # Read exactly the number of pixels promised by the header instead of
        # "everything until EOF", so a size mismatch is detected explicitly
        # rather than surfacing as an opaque reshape error.
        expected_pixels = num_images * num_rows * num_cols
        pixels = np.fromfile(f, dtype=np.uint8, count=expected_pixels)

    if pixels.size != expected_pixels:
        raise ValueError(
            f"MNIST image file is truncated: expected {expected_pixels} pixel bytes, "
            f"found {pixels.size}"
        )

    return pixels.reshape(num_images, num_rows, num_cols)

def parse_mnist_labels(file_path):
    """Read an MNIST IDX1 label file into a uint8 array.

    The file begins with two big-endian unsigned 32-bit integers (magic
    number, label count) followed by one byte per label.

    Args:
        file_path: Path to the uncompressed IDX1 label file.

    Returns:
        1-D np.ndarray of dtype uint8 holding ``num_labels`` entries.

    Raises:
        ValueError: If the magic number is not 2049, or if the file contains
            fewer labels than the header announces.
    """
    with open(file_path, 'rb') as f:
        magic, num_labels = struct.unpack('>II', f.read(8))

        if magic != 2049:
            raise ValueError("Invalid magic number in the MNIST label file")

        # Honour the label count from the header; previously it was parsed
        # but never checked, so a truncated file went unnoticed.
        labels = np.fromfile(f, dtype=np.uint8, count=num_labels)

    if labels.size != num_labels:
        raise ValueError(
            f"MNIST label file is truncated: expected {num_labels} labels, "
            f"found {labels.size}"
        )

    return labels

# Relative paths to the uncompressed MNIST training files.
images_path = 'train-images-idx3-ubyte'
labels_path = 'train-labels-idx1-ubyte'

# Work on the first 1000 MNIST samples only.
images = parse_mnist_images(images_path)[:1000]
image_labels = parse_mnist_labels(labels_path)[:1000]

# First 1000 entries of the 20 Newsgroups training split.
ng = fetch_20newsgroups(subset='train')
documents = ng.data[:1000]
document_labels = ng.target[:1000]

# The two label sets now have distinct names so the MNIST labels are no
# longer overwritten by the newsgroup targets. `labels` is kept as an alias
# with the same final value it had before (the newsgroup targets).
labels = document_labels



# Problem 3 - Normalize Data

In [2]:

from sklearn.feature_extraction.text import TfidfVectorizer


def shift_scale_normalize(images):
    """Min-max scale an array into the range [0, 1].

    The minimum and maximum are taken over the whole array (not per image),
    so every element is mapped with the same ``(x - min) / (max - min)``.

    Args:
        images: Array-like of numeric values of any shape. Must be non-empty
            (``np.min`` raises ValueError on an empty array).

    Returns:
        Floating-point np.ndarray with the same shape as ``images``. If all
        input values are identical the result is all zeros instead of NaN.
    """
    images = np.asarray(images)

    # Do the arithmetic in floating point so that integer inputs cannot wrap
    # around when computing the range (e.g. int8: 127 - (-128)).
    if not np.issubdtype(images.dtype, np.inexact):
        images = images.astype(np.float64)

    min_val = np.min(images)
    value_range = np.max(images) - min_val

    # A constant array has no spread; avoid 0/0 and return zeros.
    if value_range == 0:
        return np.zeros(images.shape, dtype=images.dtype)

    # The scaling is element-wise with global statistics, so no flattening
    # is required and the input shape is preserved automatically.
    return (images - min_val) / value_range


def zero_mean_normalize(images):
    """Standardise an array to zero mean and unit standard deviation.

    Applies ``(x - mean) / std`` where the mean and (population) standard
    deviation are computed over the whole array, not per image.

    Args:
        images: Array-like of numeric values of any shape.

    Returns:
        Floating-point np.ndarray with the same shape as ``images``. If the
        standard deviation is zero (all values identical) the result is all
        zeros instead of NaN/inf.
    """
    images = np.asarray(images)

    mean = np.mean(images)
    std_dev = np.std(images)
    centered = images - mean

    # A constant array has zero spread; dividing would yield NaN.
    if std_dev == 0:
        return np.zeros_like(centered)

    # Element-wise operation with global statistics keeps the input shape.
    return centered / std_dev


## Build the term-frequency matrix for a collection of documents.
def term_frequency_weighting_normalize(documents):
    """Vectorise documents into a term-frequency matrix.

    Args:
        documents: Iterable of raw text documents.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform`` (one row per
        document, one column per vocabulary term).
    """
    # IDF re-weighting and row normalisation are both switched off, so the
    # entries are unscaled term frequencies.
    # NOTE: no `stop_words` argument is passed, so common English words are
    # NOT removed here.
    tf_vectorizer = TfidfVectorizer(use_idf=False, norm=None)
    return tf_vectorizer.fit_transform(documents)

# Min-max scale the pixel values, then flatten every image into one row.
images = shift_scale_normalize(images)
images = images.reshape(len(images), -1)

# Dense term-frequency matrix for the documents.
normalized_documents = term_frequency_weighting_normalize(documents).toarray()

# Quick look at the resulting shapes and a few sample values.
print("images shape:", images.shape)
print("documents shape:", normalized_documents.shape)
print(images[0][10])
print(normalized_documents[0][:100])

images shape: (1000, 784)
documents shape: (1000, 32190)
0.0
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


# Problem 3 - Euclidean distance using Library

In [3]:
from scipy.spatial.distance import cdist

def compute_euclidean_distances(dataset_1, dataset_2):
    """Pairwise Euclidean distances between the rows of two datasets.

    Thin wrapper around ``scipy.spatial.distance.cdist``.

    Args:
        dataset_1: 2-D array-like, one sample per row.
        dataset_2: 2-D array-like, one sample per row.

    Returns:
        Matrix whose entry ``[i, j]`` is the distance between row ``i`` of
        ``dataset_1`` and row ``j`` of ``dataset_2``.
    """
    return cdist(dataset_1, dataset_2, metric='euclidean')

# Pairwise distances of each dataset against itself.
images_distances = compute_euclidean_distances(images, images)
documents_distances = compute_euclidean_distances(
    normalized_documents, normalized_documents
)

# Distances from the first sample to the first ten samples of each dataset.
print(images_distances[0][:10])
print(documents_distances[0][:10])

[ 0.          9.36122213 10.87509456 10.01890101 10.48026358 10.12663683
  9.5042814   8.50966658  9.31437108 10.07571719]
[ 0.         18.27566688 27.60434748 17.17556404 18.81488772 38.27531842
 16.24807681 46.3788745  15.32970972 25.43619468]


# Problem 3 - Euclidean distance using my own implementation

In [4]:
import numpy as np
import numpy as np

def compute_euclidean_distances_custom(dataset_1, dataset_2):
    """Pairwise Euclidean distances computed without a distance library.

    Args:
        dataset_1: Array-like whose first axis indexes samples.
        dataset_2: Array-like whose first axis indexes samples; each sample
            must have the same shape as the samples in ``dataset_1``.

    Returns:
        np.ndarray of shape (len(dataset_1), len(dataset_2)) where entry
        ``[i, j]`` is the Euclidean distance between sample ``i`` of
        ``dataset_1`` and sample ``j`` of ``dataset_2``.
    """
    # Convert once to float64. With unsigned or narrow integer inputs (e.g.
    # raw uint8 pixels) the subtraction and squaring below would otherwise
    # wrap around and produce wrong distances.
    samples_1 = np.asarray(dataset_1, dtype=np.float64)
    samples_2 = np.asarray(dataset_2, dtype=np.float64)

    num_dataset_1 = samples_1.shape[0]
    num_dataset_2 = samples_2.shape[0]

    # Result matrix, filled one pair at a time.
    distances = np.zeros((num_dataset_1, num_dataset_2))

    for i in range(num_dataset_1):
        for j in range(num_dataset_2):
            diff = samples_1[i] - samples_2[j]
            distances[i, j] = np.sqrt(np.sum(diff * diff))

    return distances

# images_distances = compute_euclidean_distances_custom(images, images)
# documents_distances = compute_euclidean_distances_custom(normalized_documents, normalized_documents)

# print(images_distances[0])
# print(documents_distances[0])


# Problem 3 - Cosine Similarity using my own implementation

In [5]:
def compute_cosine_similarity(dataset_1, dataset_2):
    """Pairwise cosine similarity between the rows of two datasets.

    Args:
        dataset_1: 2-D array-like, one sample per row.
        dataset_2: 2-D array-like, one sample per row, with the same number
            of columns as ``dataset_1``.

    Returns:
        np.ndarray of shape (len(dataset_1), len(dataset_2)) where entry
        ``[i, j]`` is the cosine of the angle between row ``i`` of
        ``dataset_1`` and row ``j`` of ``dataset_2``. Pairs involving an
        all-zero row have no defined angle and are reported as 0.0 instead
        of NaN.
    """
    # Work in float64 so integer inputs cannot overflow in the dot product.
    samples_1 = np.asarray(dataset_1, dtype=np.float64)
    samples_2 = np.asarray(dataset_2, dtype=np.float64)

    # Dot product between every pair of rows.
    dot_product = np.dot(samples_1, samples_2.T)

    # Product of the row norms for every pair.
    norm_1 = np.linalg.norm(samples_1, axis=1)
    norm_2 = np.linalg.norm(samples_2, axis=1)
    norm_product = np.outer(norm_1, norm_2)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep their initial value of 0.0, which avoids 0/0 -> NaN.
    cosine_similarity = np.zeros_like(dot_product)
    np.divide(dot_product, norm_product, out=cosine_similarity, where=norm_product != 0)

    return cosine_similarity

# images_cosine_similarity = compute_cosine_similarity(images, images)
# documents_cosine_similarity = compute_cosine_similarity(normalized_documents, normalized_documents)

# print(images_cosine_similarity[0])
# print(documents_cosine_similarity[0])

# Problem 4 - Define KNN 

In [6]:

class KNNClassifier:
    """k-nearest-neighbours classifier with majority voting.

    Neighbours can be ranked either by Euclidean distance or by cosine
    similarity; both rely on the module-level helpers
    ``compute_euclidean_distances`` and ``compute_cosine_similarity``.
    """

    def __init__(self, k=3):
        """Store the number of neighbours ``k`` used for voting."""
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels.

        Args:
            X_train: Training samples, one per row.
            y_train: Labels, indexable by the row position in ``X_train``.
        """
        self.X_train = X_train
        self.y_train = y_train

    def _vote(self, neighbour_indices):
        """Return the most frequent label among the given training indices.

        ``neighbour_indices`` must be ordered from nearest to farthest. When
        several labels share the highest count, the label of the nearest
        neighbour among them wins (dicts keep insertion order and ``max``
        returns the first maximum), so the result does not depend on set
        iteration order.
        """
        counts = {}
        for index in neighbour_indices:
            label = self.y_train[index]
            counts[label] = counts.get(label, 0) + 1
        return max(counts, key=counts.get)

    def predict_euclidean_distances(self, X_test):
        """Predict labels using the ``k`` smallest Euclidean distances.

        Args:
            X_test: Samples to classify, one per row.

        Returns:
            np.ndarray with one predicted label per row of ``X_test``.
        """
        distances = compute_euclidean_distances(X_test, self.X_train)

        predictions = []
        for dist in distances:
            # Ascending order: smallest distance first. A stable sort keeps
            # equally distant neighbours in training-set order.
            k_nearest_i = np.argsort(dist, kind="stable")[:self.k]
            predictions.append(self._vote(k_nearest_i))

        return np.array(predictions)

    def predict_cosine_similarity(self, X_test):
        """Predict labels using the ``k`` largest cosine similarities.

        Args:
            X_test: Samples to classify, one per row.

        Returns:
            np.ndarray with one predicted label per row of ``X_test``.
        """
        cosine_similarities = compute_cosine_similarity(X_test, self.X_train)

        predictions = []
        for sim in cosine_similarities:
            # Sort the negated similarities so the largest come first. NaN
            # entries (e.g. from zero-norm rows) are placed last by argsort,
            # whereas reversing an ascending sort would rank them first.
            k_nearest_i = np.argsort(-np.asarray(sim), kind="stable")[:self.k]
            predictions.append(self._vote(k_nearest_i))

        return np.array(predictions)

    def accuracy(self, y_true, y_pred):
        """Fraction of predictions that match the true labels.

        Args:
            y_true: Array-like of true labels.
            y_pred: Array-like of predicted labels, same shape as ``y_true``.

        Returns:
            Number of matching positions divided by ``len(y_true)``.

        Raises:
            ValueError: If the two inputs have different shapes.
        """
        # Coerce to arrays so plain lists are compared element-wise instead
        # of as whole objects.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y_true and y_pred must have the same shape, "
                f"got {y_true.shape} and {y_pred.shape}"
            )

        return np.sum(y_true == y_pred) / len(y_true)

# Problem 4 - test data - mnist

In [12]:

# Reload the first 1000 MNIST training samples from disk.
images_path = 'train-images-idx3-ubyte'
labels_path = 'train-labels-idx1-ubyte'

images = parse_mnist_images(images_path)[:1000]
labels = parse_mnist_labels(labels_path)[:1000]

# Flatten each image into one feature row, then min-max scale the pixels.
images = shift_scale_normalize(images.reshape(images.shape[0], -1))

# Positional 80% / 10% / 10% split into train / validation / test.
split_train = int(len(images) * 0.8)
split_valid = int(len(images) * 0.9)

X_train, X_valid, X_test = (
    images[:split_train],
    images[split_train:split_valid],
    images[split_valid:],
)
y_train, y_valid, y_test = (
    labels[:split_train],
    labels[split_train:split_valid],
    labels[split_valid:],
)

# 3-nearest-neighbour classifier ranked by Euclidean distance.
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)

y_valid_pred = knn.predict_euclidean_distances(X_valid)
validation_accuracy = knn.accuracy(y_valid, y_valid_pred)
print(f'Validation Accuracy: {validation_accuracy:.2f}')

y_test_pred = knn.predict_euclidean_distances(X_test)
test_accuracy = knn.accuracy(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')


Validation Accuracy: 0.90
Test Accuracy: 0.88


# Problem 4 - test data - 20 newsgroups

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Full 20 Newsgroups training split (no subsampling in this cell).
ng = fetch_20newsgroups(subset='train')
documents, labels = ng.data, ng.target

# Dense term-frequency features, one row per document.
normalized_documents = term_frequency_weighting_normalize(documents).toarray()

# Positional 80% / 10% / 10% split into train / validation / test.
split_train = int(len(normalized_documents) * 0.8)
split_valid = int(len(normalized_documents) * 0.9)

X_train, X_valid, X_test = (
    normalized_documents[:split_train],
    normalized_documents[split_train:split_valid],
    normalized_documents[split_valid:],
)
y_train, y_valid, y_test = (
    labels[:split_train],
    labels[split_train:split_valid],
    labels[split_valid:],
)

# 3-nearest-neighbour classifier ranked by cosine similarity.
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)

y_valid_pred = knn.predict_cosine_similarity(X_valid)
validation_accuracy = knn.accuracy(y_valid, y_valid_pred)
print(f'Validation Accuracy: {validation_accuracy:.2f}')

y_test_pred = knn.predict_cosine_similarity(X_test)
test_accuracy = knn.accuracy(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')

# y_valid_pred = knn.predict_euclidean_distances(X_valid)
# validation_accuracy = knn.accuracy(y_valid, y_valid_pred)
# print(f'Validation Accuracy: {validation_accuracy:.2f}')

# y_test_pred = knn.predict_euclidean_distances(X_test)
# test_accuracy = knn.accuracy(y_test, y_test_pred)
# print(f'Test Accuracy: {test_accuracy:.2f}')
    


Validation Accuracy: 0.59
Test Accuracy: 0.57
