In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import *
from sklearn.decomposition import *
from sklearn.neighbors import KNeighborsClassifier
from scipy.special import gamma, factorial
from data.FACES.imageLoader import loadImages

In [2]:
class AssignmentUtils:
    """Loader helpers for the assignment datasets stored under ``data_path``."""

    def __init__(self):
        # Root directory that contains the dataset folders, resolved relative
        # to the current working directory.
        self.data_path = "data"

    def load_mnist(self, prefix_type):
        """Load one split of the Reduced Fashion-MNIST dataset.

        Parameters
        ----------
        prefix_type : str
            Split name. It is capitalised to build the file names
            ``<Split>_Data.csv`` and ``<Split>_Labels.csv`` inside the
            ``Reduced Fashion-MNIST`` folder.

        Returns
        -------
        pandas.DataFrame
            One row per sample with two columns: ``"image"`` (the row of the
            data CSV as a 1-D array) and ``"label"`` (the label as ``int``).
            Rows and labels are paired positionally; if the two files differ
            in length the extra entries of the longer one are ignored.
        """
        split = prefix_type.capitalize()
        folder = os.path.join(self.data_path, "Reduced Fashion-MNIST")
        data_loc = os.path.join(folder, f"{split}_Data.csv")
        label_loc = os.path.join(folder, f"{split}_Labels.csv")

        # NOTE(review): read_csv consumes the first row as a header by default,
        # while the labels file is read from its very first line - confirm the
        # data CSV really carries a header row.
        data = pd.read_csv(data_loc, index_col=False).to_numpy()

        # Labels are stored as float text (e.g. "3.0"); blank lines (such as a
        # trailing newline at the end of the file) are skipped instead of
        # crashing in float("").
        with open(label_loc) as lfile:
            labels = [int(float(line.strip())) for line in lfile if line.strip()]

        records = [
            {"image": image, "label": label}
            for image, label in zip(data, labels)
        ]
        return pd.DataFrame(records)

    def load_faces(self, prefix_type):
        """Load the FACES images found in ``<data_path>/FACES/<prefix_type>``.

        Returns whatever ``loadImages`` returns for that folder.
        """
        # The attribute set in __init__ is ``data_path``; the previous
        # ``self.data`` did not exist and raised AttributeError on every call.
        folder = os.path.join(self.data_path, "FACES", prefix_type)
        return loadImages(folder)


# Shared loader instance, used to read both Fashion-MNIST splits.
assignment_util = AssignmentUtils()

# Load the train split first, then the test split.
mnist_train, mnist_test = (
    assignment_util.load_mnist(split_name) for split_name in ("train", "test")
)

# Q8

In [31]:
class DensityEstimator:
    """Base class for non-parametric density estimators.

    Subclasses implement ``_sample_pdf`` to evaluate the estimated density at
    a single point; ``fit`` and ``pdf`` are shared.
    """

    def __init__(self):
        pass

    def fit(self, X):
        """Store the training samples.

        Parameters
        ----------
        X : array with a 2-D ``shape``
            Training samples; ``X.shape[1]`` is recorded as ``n_features``.
        """
        self._data = X
        self.n_features = X.shape[1]

    def _sample_pdf(self, x):
        """Return the estimated density at the single point ``x``.

        Raises
        ------
        NotImplementedError
            Always, in the base class. Previously this silently returned
            ``None``, which made ``pdf`` produce an array of ``None`` values.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement _sample_pdf"
        )

    def pdf(self, X):
        """Evaluate the density at every row of ``X``.

        Returns
        -------
        numpy.ndarray
            One value per element of ``X``, in order.
        """
        return np.array(
            [self._sample_pdf(x) for x in X]
        )

class ParzenDensityEstimator(DensityEstimator):
    """Parzen-window density estimator with a hypercube (box) kernel.

    The estimate at ``x`` is ``k / (n * h**d)``, where ``h`` is
    ``window_size``, ``d`` is ``n_features``, ``n`` is the number of fitted
    samples and ``k`` is the number of fitted samples that fall inside the
    hypercube of edge ``h`` centred on ``x``.
    """

    def __init__(self, window_size):
        """
        Parameters
        ----------
        window_size : float
            Edge length ``h`` of the hypercube window.
        """
        super().__init__()
        self.window_size = window_size

    def _sample_pdf(self, x):
        """Return the Parzen-window density estimate at the single point ``x``."""
        half_edge = self.window_size / 2
        # A sample is inside the window only if EVERY feature is within h/2 of
        # x, and the estimate needs the NUMBER of such samples. The previous
        # ``.any(axis=-1).any()`` produced 1 as soon as a single feature of a
        # single sample was close, so it was an indicator, not a count.
        inside = (np.abs(x - self._data) < half_edge).all(axis=-1)
        n_inside = inside.sum()
        # NOTE(review): for large n_features this power can underflow to 0.0
        # (or overflow), turning the result into inf/nan - consider comparing
        # log-densities if the feature space is high-dimensional.
        volume = self.window_size ** self.n_features
        return n_inside / (volume * self._data.shape[0])
        

class KNNDensityEstimator(DensityEstimator):
    """k-nearest-neighbour density estimator.

    The estimate at ``x`` is ``k / (n * V)``, where ``n`` is the number of
    fitted samples and ``V`` is the volume of the ``n_features``-dimensional
    ball centred on ``x`` whose radius is the Euclidean distance to the k-th
    nearest fitted sample.
    """

    def __init__(self, k):
        """
        Parameters
        ----------
        k : int
            Number of neighbours the ball must contain.
        """
        super().__init__()
        self.k = k

    def _sample_pdf(self, x):
        """Return the k-NN density estimate at the single point ``x``.

        Raises
        ------
        ValueError
            If ``k`` is not between 1 and the number of fitted samples.
        """
        n_samples = self._data.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 1 and the number of fitted samples "
                f"({n_samples}), got {self.k}"
            )

        dists = np.sqrt(np.power((x - self._data), 2).sum(axis=-1))
        # Sorted distances are 0-indexed, so the k-th nearest neighbour sits at
        # position k - 1. The previous index ``k`` picked the (k+1)-th
        # neighbour and raised IndexError when k equalled the sample count.
        kth = self.k - 1
        radius = np.partition(dists, kth)[kth]

        # Volume of a d-dimensional ball: pi^(d/2) / Gamma(d/2 + 1) * r^d.
        unit_ball = (np.pi ** (self.n_features / 2)) / gamma(self.n_features / 2 + 1)
        vol = unit_ball * (radius ** self.n_features)
        # NOTE: a zero radius (k-th neighbour coincides with x) yields a zero
        # volume and therefore an infinite estimate, as before.
        return self.k / (n_samples * vol)
        


class NaiveBayesClassifier:
    """Classifier that fits one density estimator per class.

    ``predict`` evaluates every class estimator at each sample and returns the
    class whose estimated density is highest.

    NOTE(review): the per-class densities are compared directly; no class
    prior is multiplied in, which is equivalent to assuming equal priors -
    confirm this is intended for imbalanced data.
    """

    class PDFEstimatorFactory:
        """Creates a density estimator from its short name."""

        # Supported ``pdf_type`` values and the estimator class each maps to.
        MAPPINGS = {
                "parzen": ParzenDensityEstimator,
                "knn": KNNDensityEstimator,
        }

        @staticmethod
        def create(pdf_type, **params):
            """Instantiate the estimator registered under ``pdf_type``.

            ``params`` are forwarded to the estimator constructor. An unknown
            ``pdf_type`` raises ``KeyError``.
            """
            return NaiveBayesClassifier.PDFEstimatorFactory.MAPPINGS[pdf_type](**params)

    def __init__(self, pdf_type, **pdf_estimator_params):
        """
        Parameters
        ----------
        pdf_type : str
            Key into ``PDFEstimatorFactory.MAPPINGS`` ("parzen" or "knn").
        **pdf_estimator_params
            Keyword arguments passed to every per-class estimator.
        """
        self.pdf_type = pdf_type
        self.pdf_estimator_params = pdf_estimator_params

    def fit(self, X, y):
        """Fit one density estimator per distinct label in ``y``.

        Parameters
        ----------
        X : array-like, one row per sample
        y : array-like of labels, one per row of ``X``
        """
        # Coerce to arrays: with a plain list, ``y == c`` evaluates to the
        # scalar False and ``X[False]`` silently selects an empty, wrongly
        # shaped array instead of the rows of class c.
        X = np.asarray(X)
        y = np.asarray(y)

        self.classes_ = sorted(set(y))
        self.pdf_estimators = []
        for c in self.classes_:
            estimator = NaiveBayesClassifier.PDFEstimatorFactory.create(
                self.pdf_type, **self.pdf_estimator_params
            )
            estimator.fit(X[y == c])
            self.pdf_estimators.append(estimator)

    def predict(self, X):
        """Return, for each row of ``X``, the label with the highest density.

        Returns
        -------
        list
            Predicted labels (elements of ``classes_``), in row order.
        """
        X = np.asarray(X)
        probs = np.zeros((X.shape[0], len(self.classes_)))
        for idx, estimator in enumerate(self.pdf_estimators):
            probs[..., idx] = estimator.pdf(X)
        labels_idx = probs.argmax(axis=-1)
        return [self.classes_[l_idx] for l_idx in labels_idx]

In [12]:
# Labels come straight out of the frames as 1-D arrays.
y_train = mnist_train["label"].to_numpy()
y_test = mnist_test["label"].to_numpy()

# The "image" column holds one feature array per row; pack the rows into a
# single 2-D matrix per split.
X_train = np.array(list(mnist_train["image"].to_numpy()))
X_test = np.array(list(mnist_test["image"].to_numpy()))

# Scaling: statistics are learned on the training split only and then reused
# for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [35]:

# Candidate Parzen window widths; only the first one is used in this cell.
w_sizes = [0.2, 0.5, 0.7]

# Fit one Parzen estimator per class on the training set, then label the
# test set.
clf = NaiveBayesClassifier("parzen", window_size=w_sizes[0])
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

KeyboardInterrupt: 