# Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical

# The MNIST dataset (Modified National Institute of Standards and Technology) is a widely used dataset in machine learning, consisting of 60,000 training images and 10,000 test images of handwritten digits (0-9) in grayscale format. It serves as a benchmark for various image processing systems and is commonly used for training and testing machine learning models. The dataset is structured as 28x28 pixel images, making it suitable for tasks like digit recognition.

# Loading MNIST Dataset

In [2]:
(X_full, y_full), _ = mnist.load_data()

# Splitting the dataset into 2 sets i.e. 80% in Training set and 20% in Testing set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=42, stratify=y_full)

# Reading Dataset

In [4]:
data = pd.DataFrame(X_train.reshape(X_train.shape[0], -1))
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
data.shape

(48000, 784)

# Dataset contains 60000 rows and 784 columns

# Understanding Data

In [6]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48000 entries, 0 to 47999
Columns: 784 entries, 0 to 783
dtypes: uint8(784)
memory usage: 35.9 MB


# ----------- Descriptive Statistics -----------

In [8]:
print("Descriptive Statistics for MNIST Dataset (Pixel Values):")
df_stats = pd.DataFrame(X_train.reshape(X_train.shape[0], -1))
print(df_stats.describe())

Descriptive Statistics for MNIST Dataset (Pixel Values):
           0        1        2        3        4        5        6        7    \
count  48000.0  48000.0  48000.0  48000.0  48000.0  48000.0  48000.0  48000.0   
mean       0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
std        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
min        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
25%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
50%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
75%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
max        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

           8        9    ...           774           775           776  \
count  48000.0  48000.0  ...  48000.000000  48000.000000  48000.000000   
mean       0.0      0.0  ...      0.184708      0.070688      0.0

# Applying Models

# ----------- Machine Learning Models -----------

***Flatten and normalize images***

In [9]:
X_train_flat = X_train.reshape((X_train.shape[0], -1)) / 255.0
X_test_flat = X_test.reshape((X_test.shape[0], -1)) / 255.0

# 1. Random Forest

In [10]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_flat, y_train)
rf_pred = rf.predict(X_test_flat)
print("\n--- Random Forest Classification Report ---")
print(classification_report(y_test, rf_pred))


--- Random Forest Classification Report ---
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1185
           1       0.97      0.98      0.98      1348
           2       0.95      0.97      0.96      1192
           3       0.96      0.95      0.95      1226
           4       0.97      0.97      0.97      1168
           5       0.96      0.96      0.96      1084
           6       0.98      0.98      0.98      1184
           7       0.98      0.96      0.97      1253
           8       0.95      0.95      0.95      1170
           9       0.95      0.95      0.95      1190

    accuracy                           0.97     12000
   macro avg       0.97      0.97      0.97     12000
weighted avg       0.97      0.97      0.97     12000



# 2. SVM (Use a smaller subset for speed)

In [11]:
svm = SVC(kernel='linear')
svm.fit(X_train_flat[:10000], y_train[:10000])
svm_pred = svm.predict(X_test_flat)
print("\n--- SVM Classification Report ---")
print(classification_report(y_test, svm_pred))


--- SVM Classification Report ---
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1185
           1       0.94      0.98      0.96      1348
           2       0.89      0.90      0.90      1192
           3       0.89      0.88      0.88      1226
           4       0.91      0.93      0.92      1168
           5       0.88      0.87      0.87      1084
           6       0.96      0.95      0.95      1184
           7       0.94      0.93      0.93      1253
           8       0.91      0.85      0.88      1170
           9       0.90      0.90      0.90      1190

    accuracy                           0.92     12000
   macro avg       0.92      0.92      0.92     12000
weighted avg       0.92      0.92      0.92     12000



# ----------- Deep Learning Model (CNN) -----------

***Reshape and normalize images***

In [12]:
X_train_dl = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255.0
X_test_dl = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255.0

***One-hot encode labels***

In [13]:
y_train_dl = to_categorical(y_train, 10)
y_test_dl = to_categorical(y_test, 10)

***Build CNN Model***

In [14]:
cnn = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


***Compile and train***

In [15]:
cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn.fit(X_train_dl, y_train_dl, epochs=5, batch_size=128, validation_split=0.1,verbose=1)

Epoch 1/5
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 81ms/step - accuracy: 0.8234 - loss: 0.6014 - val_accuracy: 0.9737 - val_loss: 0.0887
Epoch 2/5
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.9775 - loss: 0.0720 - val_accuracy: 0.9827 - val_loss: 0.0547
Epoch 3/5
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 76ms/step - accuracy: 0.9840 - loss: 0.0494 - val_accuracy: 0.9821 - val_loss: 0.0505
Epoch 4/5
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 78ms/step - accuracy: 0.9883 - loss: 0.0358 - val_accuracy: 0.9887 - val_loss: 0.0418
Epoch 5/5
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 75ms/step - accuracy: 0.9916 - loss: 0.0261 - val_accuracy: 0.9852 - val_loss: 0.0440


<keras.src.callbacks.history.History at 0x21fd71fbf80>

***Evaluate***

In [16]:
cnn_pred_probs = cnn.predict(X_test_dl)
cnn_pred = np.argmax(cnn_pred_probs, axis=1)
y_test_labels = np.argmax(y_test_dl, axis=1)

print("\n--- CNN Classification Report ---")
print(classification_report(y_test_labels, cnn_pred))

[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step

--- CNN Classification Report ---
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1185
           1       0.99      0.99      0.99      1348
           2       0.99      0.97      0.98      1192
           3       0.99      0.97      0.98      1226
           4       0.99      0.95      0.97      1168
           5       0.98      0.99      0.99      1084
           6       1.00      0.98      0.99      1184
           7       0.99      0.98      0.99      1253
           8       0.96      0.99      0.97      1170
           9       0.94      0.99      0.96      1190

    accuracy                           0.98     12000
   macro avg       0.98      0.98      0.98     12000
weighted avg       0.98      0.98      0.98     12000

