In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree lazily and print the full path of every file found
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Struct Library to Unpack Image Data

In [None]:
import struct

def read_idx(filename):
    """Load an IDX-format file (the container format of the MNIST files) into an array.

    The file starts with a 4-byte magic number: two zero bytes, a data-type
    code and the number of dimensions. It is followed by one big-endian
    unsigned 32-bit size per dimension, and then by the element data.

    Args:
        filename: Path of the IDX file to read.

    Returns:
        A read-only ``numpy.ndarray`` whose shape comes from the header and
        whose dtype is selected by the header's data-type code (``uint8`` for
        code 0x08). Multi-byte element types keep the file's big-endian order.

    Raises:
        ValueError: If the first two bytes are not zero, the data-type code is
            unknown, or the amount of data does not match the header's shape.
        struct.error: If the file ends before the header is complete.
    """
    # IDX data-type codes mapped to NumPy dtypes ('>' = big-endian).
    idx_dtypes = {
        0x08: np.dtype('uint8'),
        0x09: np.dtype('int8'),
        0x0B: np.dtype('>i2'),
        0x0C: np.dtype('>i4'),
        0x0D: np.dtype('>f4'),
        0x0E: np.dtype('>f8'),
    }

    # Open the file in binary mode for reading
    with open(filename, 'rb') as f:
        # '>HBB': big-endian unsigned short (the two zero bytes), then two
        # unsigned bytes (data-type code, number of dimensions)
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))

        if zero != 0:
            raise ValueError(
                f'{filename}: not an IDX file (magic number starts with {zero:#06x}, expected 0)'
            )
        if data_type not in idx_dtypes:
            raise ValueError(f'{filename}: unknown IDX data type code {data_type:#04x}')

        # One big-endian unsigned int (4 bytes) per dimension, read in one go
        shape = struct.unpack(f'>{dims}I', f.read(4 * dims))

        # Interpret the remaining bytes with the dtype named in the header and
        # reshape the flat array into the shape specified by the header
        return np.frombuffer(f.read(), dtype=idx_dtypes[data_type]).reshape(shape)


## Load the Dataset

In [None]:
def load_mnist(image_path, label_path):
    """Read one dataset split from its pair of IDX files.

    Args:
        image_path: Path of the IDX file holding the images.
        label_path: Path of the IDX file holding the labels.

    Returns:
        An ``(images, labels)`` tuple of the arrays returned by ``read_idx``.
    """
    return read_idx(image_path), read_idx(label_path)

# Root folder of the attached dataset. Each IDX file sits in a sub-folder
# that carries the same name as the file itself.
MNIST_DIR = '/kaggle/input/mnist-dataset'


def _mnist_file(name):
    """Return the full path of the IDX file ``name`` under ``MNIST_DIR``."""
    return f'{MNIST_DIR}/{name}/{name}'


train_image_path = _mnist_file('train-images-idx3-ubyte')
train_label_path = _mnist_file('train-labels-idx1-ubyte')
test_image_path = _mnist_file('t10k-images-idx3-ubyte')
test_label_path = _mnist_file('t10k-labels-idx1-ubyte')

In [None]:
# Read both splits from disk
train_images, train_labels = load_mnist(train_image_path, train_label_path)
test_images, test_labels = load_mnist(test_image_path, test_label_path)

# Report the shape of every array, one per line
print(
    f'Train images shape: {train_images.shape}',
    f'Train labels shape: {train_labels.shape}',
    f'Test images shape: {test_images.shape}',
    f'Test labels shape: {test_labels.shape}',
    sep='\n',
)

In [None]:
# Flatten every image into a single row, so each sample becomes one feature vector
train_images_flat = train_images.reshape(len(train_images), -1)
test_images_flat = test_images.reshape(len(test_images), -1)

# Features / target aliases for the training and test splits
X_train, y_train = train_images_flat, train_labels
X_test, y_test = test_images_flat, test_labels

In [None]:
from collections import Counter

# Tally how many training samples carry each label
label_counts = Counter(y_train)

# Report the tallies in ascending label order. A Counter iterates in
# first-seen order, which would simply mirror the sample order of the dataset.
for label, count in sorted(label_counts.items()):
    print(f"Label {label}: Count {count}")


## Distribution of Samples in the Training Set

In [None]:
import matplotlib.pyplot as plt

# Tally the number of samples per distinct label
unique_labels, counts = np.unique(train_labels, return_counts=True)

# Draw the label distribution as a bar chart on an explicit Figure/Axes pair
fig, ax = plt.subplots(figsize=(15, 10))
bars = ax.bar(unique_labels, counts, color='skyblue')
ax.set_xlabel('Labels')
ax.set_ylabel('Number of Samples')
ax.set_title('Distribution of Labels in MNIST Training Set')
ax.set_xticks(unique_labels)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write each bar's label inside the bar, horizontally centred and
# anchored 500 samples below the bar's top edge
for bar, label in zip(bars, unique_labels):
    x_center = bar.get_x() + bar.get_width() / 2
    y_text = bar.get_height() - 500
    ax.text(x_center, y_text, f'{label}',
            ha='center', va='bottom', color='black', fontweight='bold')

# Render the figure
plt.show()

# Training Algorithms on the MNIST Dataset

In [None]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fixed seed so the stochastic models give the same scores on every run
RANDOM_STATE = 42

# Candidate models, keyed by the name shown in the results.
# The scale-sensitive models (SVM, MLP, logistic regression) are wrapped in a
# StandardScaler pipeline; the tree ensembles and k-NN are fitted on the
# pixel values as they are.
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SupportVectorMachine': make_pipeline(StandardScaler(), SVC()),
    'MultiLayerPerceptron': make_pipeline(
        StandardScaler(), MLPClassifier(max_iter=1000, random_state=RANDOM_STATE)
    ),
    # Scaled and given a larger iteration budget: on unscaled pixel values the
    # default budget of 100 iterations tends to end with a ConvergenceWarning
    # instead of a converged model.
    'LogisticRegression': make_pipeline(
        StandardScaler(), LogisticRegression(max_iter=1000)
    ),
    'XgboostClassifier': XGBClassifier(random_state=RANDOM_STATE),
}

# Test-set accuracy of every classifier, keyed by the same names
accuracies = {}

for name, clf in classifiers.items():
    # Fit on the full training split
    clf.fit(X_train, y_train)
    # Predict the held-out test split
    y_pred = clf.predict(X_test)
    # Fraction of test samples classified correctly
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[name] = accuracy
    print(f"{name}: {accuracy:.4f}")

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical


# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling repeat from run to run
tf.keras.utils.set_random_seed(42)

# Build the CNN inputs from the raw arrays rather than from X_train / y_train
# themselves. Re-running this cell then produces the same tensors again,
# instead of dividing by 255 a second time and one-hot encoding labels that
# are already encoded.
# Pixels are scaled to [0, 1] and a trailing channel axis is added for Conv2D.
X_train = (train_images.astype('float32') / 255.0).reshape((train_images.shape[0], 28, 28, 1))
X_test = (test_images.astype('float32') / 255.0).reshape((test_images.shape[0], 28, 28, 1))

# One-hot encode the ten digit classes, as categorical_crossentropy expects
y_train = to_categorical(train_labels, 10)
y_test = to_categorical(test_labels, 10)

# Define the CNN model
model = models.Sequential([
    # First convolutional stage
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D((2, 2)),

    # Second convolutional stage
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # Third convolutional stage
    # NOTE(review): 132 filters is kept from the original; possibly 128 was intended - confirm
    layers.Conv2D(132, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # Fourth convolutional layer. The feature map is down to 1x1 here, so
    # 'same' padding is required for a 3x3 kernel to be applicable at all.
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),

    # Flatten the output from the convolutional layers
    layers.Flatten(),

    # Fully connected layer
    layers.Dense(64, activation='relu'),

    # Output layer with softmax activation for classification
    layers.Dense(10, activation='softmax'),
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model, holding out 20% of the training data for validation
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc:.4f}')


In [None]:
from tabulate import tabulate

def create_table(accuracies, cnn_accuracy=None):
    """Render model accuracies as a grid-formatted text table, in percent.

    Args:
        accuracies: Mapping of model name to accuracy as a fraction (the
            values are multiplied by 100 for display). The mapping itself is
            not modified.
        cnn_accuracy: Accuracy of the CNN as a fraction, shown in a final
            "CNN" row. When omitted, the notebook-level ``test_acc`` is used,
            so ``create_table(accuracies)`` keeps working as before.

    Returns:
        The table as a string produced by ``tabulate``.
    """
    # Backward-compatible fallback to the value left behind by the CNN cell
    if cnn_accuracy is None:
        cnn_accuracy = test_acc

    # Convert every fraction to a percentage in a fresh dict
    percentages = {name: accuracy * 100 for name, accuracy in accuracies.items()}

    # Append the CNN result as the last row
    percentages["CNN"] = cnn_accuracy * 100

    return tabulate(percentages.items(), headers=["Model", "Accuracy (%)"], tablefmt="grid")


table = create_table(accuracies, cnn_accuracy=test_acc)

print(table)