In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
#from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import multivariate_normal as mvn

In [116]:
# Load the data from the CSV file into a numpy array
data = pd.read_csv('/content/sample_data/mnist_train_small.csv')

In [117]:
print(data.info)

<bound method DataFrame.info of        6  0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  ...  0.581  0.582  0.583  \
0      5  0    0    0    0    0    0    0    0    0  ...      0      0      0   
1      7  0    0    0    0    0    0    0    0    0  ...      0      0      0   
2      9  0    0    0    0    0    0    0    0    0  ...      0      0      0   
3      5  0    0    0    0    0    0    0    0    0  ...      0      0      0   
4      2  0    0    0    0    0    0    0    0    0  ...      0      0      0   
...   .. ..  ...  ...  ...  ...  ...  ...  ...  ...  ...    ...    ...    ...   
19994  0  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19995  1  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19996  2  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19997  9  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19998  5  0    0    0    0    0    0    0    0    0  ...      0      0      0

In this code, we first load the MNIST dataset using pandas library and extract the pixel values and labels into separate variables. Then, we normalize the pixel values by dividing them by 255.0, which scales them to a range between 0 and 1. Finally, we verify the normalization results by printing the minimum and maximum pixel values in the normalized dataset.

In [118]:
# Extract pixel values and labels
X_train = data.iloc[:, 1:].values
y_train = data.iloc[:, 0].values

In [119]:
# Normalize pixel values
X_train = X_train.astype('float32') / 255.0

In [120]:
# Verify the normalization results
print('Min pixel value:', np.min(X_train))
print('Max pixel value:', np.max(X_train))

Min pixel value: 0.0
Max pixel value: 1.0


PCA stands for Principal Component Analysis.

we use the PCA class from the sklearn.decomposition module to perform PCA with the n_components parameter set to 50. This means that the dimensionality of the data will be reduced to 50 features. The fit_transform method is then used to fit the PCA model to the data and transform it into the reduced feature space.

In [121]:
# Perform PCA to reduce dimensionality
pca = PCA(n_components=50)  # Set the number of components to keep
X_train_pca = pca.fit_transform(X_train)

In [122]:
# Verify the results
print('Original shape:', X_train.shape)
print('Reduced shape:', X_train_pca.shape)

Original shape: (19999, 784)
Reduced shape: (19999, 50)


The next step would be to split the dataset into training and validation sets and then implement the Bayes Classifier.

#Train the data (The Hard Way)

## Bayes Classifier

In [123]:
# Train a Naive Bayes classifier
class_means = np.zeros((10, X_train_pca.shape[1]))
class_vars = np.zeros((10, X_train_pca.shape[1]))
class_counts = np.zeros(10)

In [128]:
class GaussNB:
    
    def __init__(self, epsilon=1e-6):
        self.epsilon = epsilon
    
    def fit(self, X_train_pca, y_train):
        self.likelihood = {}
        self.priors = {}
        self.classes = set(y_train.astype(int))
        for class_label in self.classes:
            X_class = X_train_pca[y_train == class_label, :]
            self.likelihood[class_label] = {"mean": X_class.mean(axis=0), "cov": np.cov(X_class.T) + self.epsilon * np.eye(X_train_pca.shape[1])}
            self.priors[class_label] = len(X_class) / len(X_train_pca)
              
    def predict(self, X_val):
        N, D = X_val.shape
        P_hat = np.zeros((N, len(self.classes)))
        for class_label, likelihood in self.likelihood.items():
            P_hat[:, class_label] = multivariate_normal.logpdf(X_val, likelihood["mean"], likelihood["cov"]) + np.log(self.priors[class_label])
        return P_hat.argmax(axis=1)


In [129]:
def check_handwriting(str1, str2):
    str1 = str1.lower().replace(" ", "")
    str2 = str2.lower().replace(" ", "")
    
    if len(str1) != len(str2):
        return False
    
    char_count = {}
    for char in str1:
        char_count[char] = char_count.get(char, 0) + 1
    
    for char in str2:
        if char not in char_count or char_count[char] == 0:
            return False
        char_count[char] -= 1
        
    return True

In [130]:
gnb = GaussianNB()

In [134]:
gnb.fit(X_train_pca, y_train)
y_hat_train = gnb.predict(X_train_pca)
acc = accuracy_score(y_train, y_hat)

In [135]:
y_hat = gnb.predict(X_train_pca)

In [136]:
acc = accuracy_score(y_train, y_hat)

In [None]:
print(acc)

0.8714935746787339


##KNN

In [137]:
class KNNClassifier():

  def fit(self, X, y):
    self.X = X
    self.y = y


  def predict(self, X, K, epsilon = 1e-3):
    N = len(X)
    y_hat = np.zeros(N)

    for i in range(N):
      dist2 = np.sum((self.X-X[i])**2,axis=1)
      idxt  = np.argsort(dist2)[:K]
      gamma_k=1/(np.sqrt(dist2[idxt]+epsilon))
      y_hat[i] = np.bincount(self.y[idxt], weights=gamma_k).argmax()


    return y_hat

In [144]:
# create an instance of the class
knn = KNNClassifier()

In [145]:
# train the classifier on the training data
knn.fit(X_train_pca, y_train)

In [None]:
len(y_train)

12799

In [146]:
X_train_pca.shape[0]

19999

In [147]:
np.array_equal(y_train, X_train_pca[:, -1])

False

In [155]:
# make predictions on the validation data
y_val = knn.predict(X_train_pca, 500)

In [156]:
# evaluate the accuracy of the predictions
accuracy = accuracy_score(y_train, y_val)

In [157]:
print('KNN accuracy:', accuracy)

KNN accuracy: 0.9750487524376219


# Train the Data (The Easy Way)

##Split the data

In [175]:
# Split the dataset into training and validation sets
X_train_pca, X_val, y_train, y_val = train_test_split(X_train_pca, y_train, test_size=0.2, random_state=42)

##Bayes Classifier

In [176]:
# Create a Bayes classifier
bayes = GaussianNB()

In [178]:
# Train the classifier on the training data
bayes.fit(X_train_pca, y_train)

In [179]:
# Evaluate the classifier on the validation data
score = bayes.score(X_val, y_val)

In [180]:
# Print the accuracy score
print('Bayes accuracy:', score)

Bayes accuracy: 0.86675


##KNN

In [205]:
# Create a KNN classifier with k=10
knn = KNeighborsClassifier(n_neighbors=20)

In [206]:
# Train the classifier on the training data
knn.fit(X_train_pca, y_train)

In [207]:
# Evaluate the classifier on the validation data
score = knn.score(X_val, y_val)

In [208]:
# Print the accuracy score
print('KNN accuracy:', score)

KNN accuracy: 0.956


In [189]:
#%%shell
#jupyter nbconvert --to html /content/generativeModelsMNIST.ipynb