In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
#from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# Load the data from the CSV file into a numpy array
data = pd.read_csv('/content/sample_data/mnist_train_small.csv')

In [10]:
print(data.info)

<bound method DataFrame.info of        6  0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  ...  0.581  0.582  0.583  \
0      5  0    0    0    0    0    0    0    0    0  ...      0      0      0   
1      7  0    0    0    0    0    0    0    0    0  ...      0      0      0   
2      9  0    0    0    0    0    0    0    0    0  ...      0      0      0   
3      5  0    0    0    0    0    0    0    0    0  ...      0      0      0   
4      2  0    0    0    0    0    0    0    0    0  ...      0      0      0   
...   .. ..  ...  ...  ...  ...  ...  ...  ...  ...  ...    ...    ...    ...   
19994  0  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19995  1  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19996  2  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19997  9  0    0    0    0    0    0    0    0    0  ...      0      0      0   
19998  5  0    0    0    0    0    0    0    0    0  ...      0      0      0

In this code, we first load the MNIST dataset using pandas library and extract the pixel values and labels into separate variables. Then, we normalize the pixel values by dividing them by 255.0, which scales them to a range between 0 and 1. Finally, we verify the normalization results by printing the minimum and maximum pixel values in the normalized dataset.

In [11]:
# Extract pixel values and labels
X_train = data.iloc[:, 1:].values
y_train = data.iloc[:, 0].values

In [12]:
# Normalize pixel values
X_train = X_train.astype('float32') / 255.0

In [13]:
# Verify the normalization results
print('Min pixel value:', np.min(X_train))
print('Max pixel value:', np.max(X_train))

Min pixel value: 0.0
Max pixel value: 1.0


PCA stands for Principal Component Analysis.

we use the PCA class from the sklearn.decomposition module to perform PCA with the n_components parameter set to 50. This means that the dimensionality of the data will be reduced to 50 features. The fit_transform method is then used to fit the PCA model to the data and transform it into the reduced feature space.

In [14]:
# Perform PCA to reduce dimensionality
pca = PCA(n_components=50)  # Set the number of components to keep
X_train_pca = pca.fit_transform(X_train)

In [15]:
# Verify the results
print('Original shape:', X_train.shape)
print('Reduced shape:', X_train_pca.shape)

Original shape: (19999, 784)
Reduced shape: (19999, 50)


The next step would be to split the dataset into training and validation sets and then implement the Bayes Classifier.

In [16]:
# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

#implement either the Bayes Classifier

In [17]:
# Create a Bayes classifier
bayes = GaussianNB()

In [18]:
# Train the classifier on the training data
bayes.fit(X_train, y_train)

In [19]:
# Evaluate the classifier on the validation data
score = bayes.score(X_val, y_val)

In [20]:
# Print the accuracy score
print('Bayes accuracy:', score)

Bayes accuracy: 0.576


#KNN

In [23]:
# Create a KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

In [24]:
# Train the classifier on the training data
knn.fit(X_train, y_train)

In [25]:
# Evaluate the classifier on the validation data
score = knn.score(X_val, y_val)

In [26]:
# Print the accuracy score
print('KNN accuracy:', score)

KNN accuracy: 0.95925
