# CLASSIFICATION ANALYSIS


The goal of classification analysis is to predict the category or class to which a new observation belongs, based on a set of input variables.

Given a set of observed data points $X = \{x_0, x_1, \dots, x_n\}$ with corresponding class labels $Y = \{y_0, y_1, \dots, y_n\}$, the objective is to find a function $y = f(x, \theta)$ that maps input variables $x$ to class labels $y$, where $\theta$ are the parameters of the model to be determined.

## Example 1: Classifying MNIST Digits

The [MNIST](https://yann.lecun.com/exdb/mnist/) dataset consists of images of handwritten digits, originally black-and-white, each size-normalized and centered in a 28x28 pixel image. The normalization is anti-aliased, introducing grayscale levels for smoother edges. The dataset includes 60,000 training images and 10,000 testing images, making it a widely used benchmark for image classification tasks.

`Goal: Find a model to classify images of handwritten digits of the MNIST Dataset into their corresponding digit labels.`

### Loading the MNIST dataset

In [1]:
import gzip
import os
import numpy as np
import struct


def load_mnist(path, kind='train'):
    """Read one gzip-compressed MNIST split stored in IDX format.

    Parameters
    ----------
    path : directory containing the ``*-ubyte.gz`` files.
    kind : file-name prefix of the split (``'train'`` or ``'t10k'``).

    Returns
    -------
    images : uint8 array of shape ``(num, rows * cols)``, one flattened
        image per row.
    labels : 1-D uint8 array with every byte that follows the label header.
    """
    labels_path = os.path.join(path, f'{kind}-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, f'{kind}-images-idx3-ubyte.gz')

    # Label file: an 8-byte big-endian header (two uint32 values, both
    # ignored here) followed by one byte per label.
    with gzip.open(labels_path, 'rb') as label_file:
        label_header = label_file.read(8)
        struct.unpack('>II', label_header)
        label_bytes = label_file.read()
    labels = np.frombuffer(label_bytes, dtype=np.uint8)

    # Image file: a 16-byte big-endian header (four uint32 values; the last
    # three give the image count and dimensions) followed by the pixels.
    with gzip.open(images_path, 'rb') as image_file:
        image_header = image_file.read(16)
        _, num, rows, cols = struct.unpack('>IIII', image_header)
        pixel_bytes = image_file.read()
    images = np.frombuffer(pixel_bytes, dtype=np.uint8).reshape(num, rows * cols)

    return images, labels

In [None]:
# Load the dataset
# Directory that load_mnist searches for the '<kind>-labels-idx1-ubyte.gz'
# and '<kind>-images-idx3-ubyte.gz' files.
mnist_path = '../supplemental_material/MNIST'
X_train, y_train = load_mnist(mnist_path, kind='train')
X_test, y_test = load_mnist(mnist_path, kind='t10k')  # 't10k' is the test-split file prefix

# Sanity check: each X is (num_images, rows * cols), each y is 1-D
print(f"Training set shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Test set shape: {X_test.shape}, Test labels shape: {y_test.shape}")

### Classify digits with or without circles using Logistic Regression

In [3]:
from sklearn.preprocessing import StandardScaler


# Digits whose glyph contains a closed loop, and those that do not
circle_digits = [0, 6, 8, 9]
no_circle_digits = [1, 2, 3, 4, 5, 7]

# Keep only the samples whose label belongs to one of the two groups
train_mask = np.isin(y_train, circle_digits + no_circle_digits)
test_mask = np.isin(y_test, circle_digits + no_circle_digits)
X_train, y_train = X_train[train_mask], y_train[train_mask]
X_test, y_test = X_test[test_mask], y_test[test_mask]

# Binary targets: 1 for a circle digit, 0 otherwise.
# y_train / y_test keep the original digit labels.
y_train_binary = np.isin(y_train, circle_digits).astype(int)
y_test_binary = np.isin(y_test, circle_digits).astype(int)

# Standardize every pixel feature (important for gradient descent).
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

The formula for logistic regression is:

$$
P(y = 1 \mid X) = \sigma(w^T X + b) = \frac{1}{1 + e^{-(w^T X + b)}}
$$

where:
- $P(y = 1 \mid X)$ is the probability that the output $y$ is 1 given the input features $X$.
- $\sigma(z) = \frac{1}{1 + e^{-z}}$ is the sigmoid function.
- $w$ is the vector of weights.
- $X$ is the vector of input features.
- $b$ is the bias term.


In [4]:
# Sigmoid function
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) element-wise.

    Only exp(-|z|) is ever evaluated, so large-magnitude inputs cannot
    overflow (the naive form overflows in exp(-z) for very negative z).

    Parameters
    ----------
    z : scalar or array-like of real numbers.

    Returns
    -------
    A NumPy float scalar for scalar input, otherwise a float array with the
    same shape as `z`.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # always in (0, 1]
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- algebraically equal
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]

# Initialize weights and bias
def initialize_parameters(dim):
    """Return the starting point for training: zero weights and zero bias.

    Parameters
    ----------
    dim : number of input features.

    Returns
    -------
    (w, b) where `w` is a ``(dim, 1)`` array of zeros and `b` is the integer 0.
    """
    weights = np.zeros(shape=(dim, 1))
    bias = 0
    return weights, bias

# Forward and backward propagation
def propagate(w, b, X, Y):
    """Compute the cross-entropy cost and its gradients for one full pass.

    Parameters
    ----------
    w : weight column vector; ``w.T`` is multiplied with `X`.
    b : scalar bias.
    X : feature matrix with one sample per column (the sample count is
        taken from ``X.shape[1]``).
    Y : 0/1 targets that broadcast against the activations
        (shape ``(1, m)`` in this notebook).

    Returns
    -------
    grads : dict with keys ``"dw"`` and ``"db"``.
    cost : mean binary cross-entropy over the samples.
    """
    m = X.shape[1]

    # Forward propagation
    Z = np.dot(w.T, X) + b
    A = sigmoid(Z)

    # A saturated sigmoid returns exactly 0.0 or 1.0, and log(0) would turn
    # the cost into inf/NaN. Clip only inside the logarithms; the gradients
    # below keep using the unclipped activations.
    eps = np.finfo(float).eps
    A_safe = np.clip(A, eps, 1 - eps)
    cost = -1/m * np.sum(Y * np.log(A_safe) + (1 - Y) * np.log(1 - A_safe))  # Compute cost

    # Backward propagation
    dw = 1/m * np.dot(X, (A - Y).T)
    db = 1/m * np.sum(A - Y)

    grads = {"dw": dw, "db": db}

    return grads, cost

# Optimization using gradient descent
def optimize(w, b, X, Y, num_iterations, learning_rate):
    """Run batch gradient descent on the logistic-regression parameters.

    Parameters
    ----------
    w : initial weight column vector; it is copied, so the caller's array
        is left untouched.
    b : initial scalar bias.
    X, Y : training data, passed through to `propagate` unchanged.
    num_iterations : number of gradient steps (0 returns the inputs as-is).
    learning_rate : step size applied to both gradients.

    Returns
    -------
    params : ``{"w": ..., "b": ...}`` after the last step.
    grads : ``{"dw": ..., "db": ...}`` from the last step; both values are
        ``None`` when no step was taken.
    costs : the cost recorded at iterations 0, 100, 200, ...
    """
    # Private float copy: avoids mutating the caller's array and avoids a
    # casting error if an integer array is passed in.
    w = np.array(w, dtype=float)
    costs = []
    dw = db = None  # stay None if the loop body never runs

    for i in range(num_iterations):
        # Calculate gradients and cost
        grads, cost = propagate(w, b, X, Y)
        dw, db = grads["dw"], grads["db"]

        # Update parameters (rebinding, not in-place)
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record the cost every 100 iterations
        if i % 100 == 0:
            costs.append(cost)
            print(f"Cost after iteration {i}: {cost}")

    params = {"w": w, "b": b}
    grads = {"dw": dw, "db": db}

    return params, grads, costs



To make predictions, we use the learned weights $w$ and bias $b$ to compute the output $Z$ and apply the sigmoid function to get the probability $A$. We threshold the probability to determine the class label:

$$
\text{predictions} = I(A > 0.5)
$$

where $I$ is the indicator function that outputs 1 if its argument is true and 0 otherwise.


In [5]:
# Predict the labels
def predict(w, b, X):
    """Classify every column of `X` with a logistic-regression model.

    Parameters
    ----------
    w : weight column vector.
    b : scalar bias.
    X : feature matrix with one sample per column.

    Returns
    -------
    Integer array of 0/1 labels: 1 where the sigmoid output is strictly
    greater than 0.5, otherwise 0.
    """
    probabilities = sigmoid(np.dot(w.T, X) + b)
    is_positive = probabilities > 0.5
    return is_positive.astype(int)

In [None]:
# Reshape the data (required for our implementation)
# propagate/predict treat every COLUMN of X as one sample, so the feature
# matrices are transposed and the labels become row vectors.
# NOTE: X_train / X_test are rebound to their transposes, so running this
# cell a second time would transpose them back.
X_train = X_train.T
X_test = X_test.T
y_train_binary = y_train_binary.reshape(1, -1)
y_test_binary = y_test_binary.reshape(1, -1)

# Initialize parameters
w, b = initialize_parameters(X_train.shape[0])  # one weight per feature row

# Train the model
parameters, grads, costs = optimize(w, b, X_train, y_train_binary, num_iterations=2000, learning_rate=0.01)

# Get the optimized parameters
w = parameters["w"]
b = parameters["b"]

# Make predictions on the training and test sets
y_pred_train = predict(w, b, X_train)
y_pred_test = predict(w, b, X_test)

# Evaluate the model
# Predictions and targets are both 0/1, so the mean absolute difference is
# the error rate; 100 minus that percentage is the accuracy.
train_accuracy = 100 - np.mean(np.abs(y_pred_train - y_train_binary)) * 100
test_accuracy = 100 - np.mean(np.abs(y_pred_test - y_test_binary)) * 100

print(f"Train accuracy: {train_accuracy}%")
print(f"Test accuracy: {test_accuracy}%")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


y_test_flat = y_test.flatten()
y_pred_test_flat = y_pred_test.flatten()

# 2 x 10 count table: row = predicted binary class (0 = no circle,
# 1 = circle), column = true digit label.
cm = np.zeros((2, 10), dtype=int)
predicted_rows = np.where(y_pred_test_flat == 1, 1, 0)
# Unbuffered accumulation, so repeated (row, column) pairs are each counted
np.add.at(cm, (predicted_rows, y_test_flat), 1)

# Plot the confusion matrix
plt.figure(figsize=(12, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
    yticklabels=['No Circle', 'Circle'],
)
plt.xlabel('Digit Label')
plt.ylabel('Binary Classification')
plt.title('Confusion Matrix for Binary Classification (Circle vs No-Circle) with Digit Labels')
plt.show()

In [None]:
from skimage import measure, morphology

# X_test holds standardized features here (StandardScaler output, transposed
# to features x samples), but the intensity thresholds used below (200 / 50)
# only make sense on raw 0-255 pixels. Undo the scaling before reshaping.
images = (
    np.rint(scaler.inverse_transform(X_test.T))  # back to pixel intensities
    .clip(0, 255)
    .astype(np.uint8)
    .reshape(-1, 28, 28)  # (10000, 28, 28)
)


# Define function to count white pixels inside circles for specific digits (0, 6, 8, 9)
def count_inner_white_pixels(images, labels, circle_digits=(0, 6, 8, 9)):
    """Sum the area of bright connected regions lying strictly inside each image.

    Only images whose label is in `circle_digits` are analysed; every other
    entry of the result stays 0.

    Parameters
    ----------
    images : iterable of 2-D intensity arrays (28x28 in this notebook). The
        ``>= 200`` threshold assumes raw 0-255 intensities.
    labels : array of labels aligned with `images`.
    circle_digits : labels to analyse. The default is an immutable tuple so
        it cannot be altered between calls.

    Returns
    -------
    Integer array shaped like `labels` with the summed region area per image.
    """
    inner_white_counts = np.zeros_like(labels, dtype=int)

    for i, (image, label) in enumerate(zip(images, labels)):
        if label not in circle_digits:
            continue

        # Bright ("white") pixels.
        # NOTE(review): in MNIST's stored encoding high intensities are
        # presumably the pen strokes rather than the enclosed holes -- confirm
        # that this mask selects what the analysis intends.
        binary_white = image >= 200

        # Identify connected components in the binary white mask
        # (connectivity=2 requests 8-neighbour connectivity for a 2-D image)
        labeled_white = measure.label(binary_white, connectivity=2)

        # Keep only regions whose bounding box (min_row, min_col, max_row,
        # max_col) stays away from the image border, i.e. that are not part
        # of the outer area, and add up their pixel counts.
        inner_white_counts[i] = sum(
            region.area
            for region in measure.regionprops(labeled_white)
            if region.bbox[0] > 3 and region.bbox[1] > 3
            and region.bbox[2] < 25 and region.bbox[3] < 25
        )

    return inner_white_counts

# Per-image inner-region pixel counts (0 for digits outside circle_digits)
inner_white_pixel_counts = count_inner_white_pixels(images, y_test_flat)

# One histogram per circle digit (0, 6, 8, 9), sharing the y axis
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=True)

for ax, digit in zip(axes, circle_digits):
    digit_inner_white_pixels = inner_white_pixel_counts[y_test_flat == digit]

    ax.hist(digit_inner_white_pixels, bins=20, color='lightblue', edgecolor='black')
    ax.set_title(f"Digit {digit}")
    ax.set_xlabel("Inner White Pixels")

# The y axis is shared, so only the left-most panel is labelled
axes[0].set_ylabel("Frequency")
fig.suptitle("Histogram of Inner White Pixels in Circle Digits (0, 6, 8, 9)", fontsize=16)
plt.show()



In [None]:
# X_test holds standardized features here (StandardScaler output, transposed
# to features x samples). The intensity thresholds below are defined on raw
# 0-255 pixels, so undo the scaling before reshaping to (10000, 28, 28).
images = (
    np.rint(scaler.inverse_transform(X_test.T))
    .clip(0, 255)
    .astype(np.uint8)
    .reshape(-1, 28, 28)
)


def count_white_black_pixels(images, white_threshold=200, black_threshold=50):
    """Count bright and dark pixels in every image of a stack.

    Parameters
    ----------
    images : array of shape (n_images, height, width) holding raw intensities.
    white_threshold : a pixel counts as white when its value is >= this.
    black_threshold : a pixel counts as black when its value is <= this.

    Returns
    -------
    (white_counts, black_counts) : two 1-D arrays with one count per image.
    """
    white_counts = np.sum(images >= white_threshold, axis=(1, 2))  # White pixel threshold at 200+
    black_counts = np.sum(images <= black_threshold, axis=(1, 2))  # Black pixel threshold at 50-
    return white_counts, black_counts


# Count white and black pixels in the reshaped images
white_pixel_counts, black_pixel_counts = count_white_black_pixels(images)

# Plot histogram for white pixels in circle digits (0, 6, 8, 9)
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=True)

for idx, digit in enumerate(circle_digits):
    digit_white_pixels = white_pixel_counts[y_test_flat == digit]
    axes[idx].hist(digit_white_pixels, bins=20, color='skyblue', edgecolor='black')
    axes[idx].set_title(f"Digit {digit}")
    axes[idx].set_xlabel("White Pixels")

# The y axis is shared, so only the left-most panel is labelled
axes[0].set_ylabel("Frequency")
fig.suptitle("Histogram of White Pixels in Circle Digits (0, 6, 8, 9)", fontsize=16)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Stand-alone illustration: instead of the real MNIST arrays, this cell builds
# a synthetic stack of random 28x28 grayscale images for digits 0-9, with the
# same number of images per digit, plus matching labels.

# Fixed seed so the random placeholder images are reproducible
np.random.seed(0)
sample_size = 100  # Sample size for each digit
image_size = 28 * 28  # MNIST images are 28x28 pixels

# Uniform random intensities in [0, 255] as placeholders; with real data
# these would be the MNIST images.
images = np.random.randint(low=0, high=256, size=(10 * sample_size, 28, 28))
# Labels 0..9, each repeated sample_size times in ascending order
labels = np.repeat(np.arange(10), sample_size)

# Function to count white and black pixels
def count_white_black_pixels(images):
    """Count bright (>= 200) and dark (<= 50) pixels in every image.

    Parameters
    ----------
    images : array of shape (n_images, height, width).

    Returns
    -------
    (white_counts, black_counts) : two 1-D arrays with one count per image.
    """
    is_white = images >= 200  # White pixel threshold at 200+
    is_black = images <= 50   # Black pixel threshold at 50-
    per_image = (1, 2)        # reduce over the two pixel axes
    return np.sum(is_white, axis=per_image), np.sum(is_black, axis=per_image)

# Per-image white / black pixel counts for the whole stack
white_pixel_counts, black_pixel_counts = count_white_black_pixels(images)

# One white-pixel histogram per circle digit (0, 6, 8, 9), sharing the y axis
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=True)

for ax, digit in zip(axes, circle_digits):
    # Counts belonging to the images labelled with this digit
    digit_white_pixels = white_pixel_counts[labels == digit]

    ax.hist(digit_white_pixels, bins=20, color='skyblue', edgecolor='black')
    ax.set_title(f"Digit {digit}")
    ax.set_xlabel("White Pixels")

# The y axis is shared, so only the left-most panel is labelled
axes[0].set_ylabel("Frequency")
fig.suptitle("Histogram of White Pixels in Circle Digits (0, 6, 8, 9)", fontsize=16)
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Reshape the data back to the format expected by sklearn
X_train_sklearn = X_train.T  # Shape: (num_samples, num_features)
X_test_sklearn = X_test.T    # Shape: (num_samples, num_features)
# This cell is the scikit-learn counterpart of the from-scratch circle /
# no-circle classifier above, so it must train on the binary targets.
# Training on y_train would fit a 10-class digit model, which the
# multiclass section below already covers.
y_train_sklearn = y_train_binary.flatten()  # Shape: (num_samples,)
y_test_sklearn = y_test_binary.flatten()    # Shape: (num_samples,)

# Train the logistic regression model using sklearn
model = LogisticRegression(solver='lbfgs', max_iter=2000, n_jobs=-1)
model.fit(X_train_sklearn, y_train_sklearn)

# Predict on the test set
y_pred = model.predict(X_test_sklearn)

# Evaluate the model
accuracy = accuracy_score(y_test_sklearn, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

### Classify digits using MultiClass Logistic regression 

In multiclass logistic regression, we extend the binary case to handle multiple classes. We use learned weights $W$ and bias $b$ to compute the output $Z$ for each class. The softmax function is then applied to obtain the probabilities for each class:

$$
P(y = k | \mathbf{x}) = \frac{e^{Z_k}}{\sum_{j=1}^{K} e^{Z_j}}
$$

where $Z_k$ is the output for class $k$, $K$ is the total number of classes, and $\mathbf{x}$ is the input feature vector. 


The predicted class label is determined by:

$$
\text{predictions} = \text{argmax}(P(y = k | \mathbf{x}))
$$

where $\text{argmax}$ selects the class with the highest probability.

In [None]:
# Multiclass model: the targets are the digit labels in y_train, not the
# binary circle / no-circle targets.
sklearn_model = LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42)
sklearn_model.fit(X_train.T, y_train.flatten())  # Remember to transpose X_train for scikit-learn compatibility

# Make predictions on the test set
y_pred_test_sklearn = sklearn_model.predict(X_test.T)

# Evaluate the model
test_accuracy_sklearn = accuracy_score(y_test.flatten(), y_pred_test_sklearn)
print(f"Test accuracy: {test_accuracy_sklearn * 100:.2f}%")

### Classify digits with or without circles using Support Vector Machine (SVM)

The goal of SVM is to find a hyperplane that maximizes the margin between two classes. Given a dataset of $N$ samples $(x_i, y_i)$ where $x_i \in \mathbb{R}^d$ is the feature vector and $y_i \in \{-1, 1\}$ is the class label, we aim to solve the following optimization problem:

$$
\text{minimize } J(w, b) = \frac{1}{2} \|w\|^2 + C \sum_{i=1}^{N} \max\left(0, 1 - y_i (w \cdot x_i + b)\right)
$$

### Definitions

- $w$: Weight vector (coefficients of the features).
- $b$: Bias term (intercept of the hyperplane).
- $C$: Regularization parameter that controls the trade-off between maximizing the margin and minimizing the classification error.
- $N$: Number of training samples.
- $d$: Number of features in each sample.

In [None]:
# Load the dataset
# Reloaded from disk so this section starts from raw pixels in
# (num_samples, num_features) layout; earlier cells rebound these names.
mnist_path = '../supplemental_material/MNIST' 
X_train, y_train = load_mnist(mnist_path, kind='train')
X_test, y_test = load_mnist(mnist_path, kind='t10k')

print(f"Training set shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Test set shape: {X_test.shape}, Test labels shape: {y_test.shape}")

# Define the digits with and without circles
circle_digits = [0, 6, 8, 9]
no_circle_digits = [1, 2, 3, 4, 5, 7]

# Filter the training and test sets to include only the relevant digits
train_mask = np.isin(y_train, circle_digits + no_circle_digits)
X_train, y_train = X_train[train_mask], y_train[train_mask]

test_mask = np.isin(y_test, circle_digits + no_circle_digits)
X_test, y_test = X_test[test_mask], y_test[test_mask]

# Relabel the data: 1 for circle digits, 0 for no-circle digits
# (unlike the logistic-regression section, this overwrites y_train / y_test,
# so the original digit labels are no longer available afterwards)
y_train = np.where(np.isin(y_train, circle_digits), 1, 0)
y_test = np.where(np.isin(y_test, circle_digits), 1, 0)

# Normalize the data (important for gradient descent)
# The scaler is fitted on the training split only and reused for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [40]:
class SVM:
    """Linear soft-margin SVM trained with per-sample sub-gradient descent.

    The decision function is ``np.dot(x, w) - b`` (note the minus sign on
    the bias). Training minimises ``lambda_param * ||w||^2`` plus the hinge
    loss, applying one parameter update per training sample.

    Parameters
    ----------
    learning_rate : step size of every update.
    lambda_param : weight of the L2 penalty on `w`.
    num_iterations : number of full passes over the training set.
    """

    def __init__(self, learning_rate=0.001, lambda_param=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.num_iterations = num_iterations
        # Learned parameters; both remain None until fit() has been called
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Learn `w` and `b` from the samples in `X` and the labels in `y`.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features).
        y : array-like of labels; values <= 0 are treated as class -1 and
            everything else as class +1.
        """
        X = np.asarray(X, dtype=float)
        y = np.where(np.asarray(y) <= 0, -1, 1)  # Convert labels to -1 and 1 for SVM
        _, num_features = X.shape

        # Initialize weights and bias
        self.w = np.zeros(num_features)
        self.b = 0.0

        # Gradient descent
        for _ in range(self.num_iterations):
            for x_i, y_i in zip(X, y):
                margin_ok = y_i * (np.dot(x_i, self.w) - self.b) >= 1

                # The regularisation gradient applies to every sample
                dw = 2 * self.lambda_param * self.w
                db = 0
                if not margin_ok:
                    # Inside the margin or misclassified: add the hinge-loss
                    # sub-gradient with respect to w and b
                    dw = dw - y_i * x_i
                    db = y_i

                # Update weights and bias
                self.w -= self.learning_rate * dw
                self.b -= self.learning_rate * db

    def predict(self, X):
        """Return the predicted class (+1.0 or -1.0) for every row of `X`.

        Points lying exactly on the decision boundary are assigned +1, so
        the result never contains 0, which is not a valid class label.

        Raises
        ------
        RuntimeError : if called before `fit`.
        """
        if self.w is None or self.b is None:
            raise RuntimeError("SVM.predict called before fit")
        approx = np.dot(X, self.w) - self.b
        return np.where(approx >= 0, 1.0, -1.0)

In [41]:
# Initialize and train the SVM model
# fit() performs one update per training sample per iteration in a Python
# loop, so the run time grows with num_iterations * number of samples.
svm = SVM(learning_rate=0.001, lambda_param=0.01, num_iterations=1000)
svm.fit(X_train, y_train)

# Predict on the test set
y_pred_test = svm.predict(X_test)

# Convert predictions back to 0 and 1
# Only an exact -1 maps to class 0; every other value maps to class 1.
y_pred_test = np.where(y_pred_test == -1, 0, 1)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate the model
# accuracy_score is not imported in this cell; it comes from the earlier
# `from sklearn.metrics import accuracy_score`.
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Test accuracy (SVM from scratch): {test_accuracy * 100:.2f}%")

# Confusion matrix and classification report
print("Confusion Matrix for Test Set (SVM from scratch):")
conf_matrix = confusion_matrix(y_test, y_pred_test)  # arguments are (true, predicted)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (SVM from scratch)')
plt.show()

print("Classification Report (SVM from scratch):")
print(classification_report(y_test, y_pred_test))

In [43]:
from sklearn.svm import SVC

# Initialize and train the scikit-learn SVM model
# Reference implementation with a linear kernel, trained on the same
# standardized features and 0/1 labels as the from-scratch SVM above.
# NOTE(review): SVC training time presumably scales poorly with the number
# of samples -- confirm that fitting on the full training split is acceptable.
sklearn_svm = SVC(kernel='linear')
sklearn_svm.fit(X_train, y_train)

# Predict on the test set
y_pred_test_sklearn = sklearn_svm.predict(X_test)

# Evaluate the scikit-learn model
test_accuracy_sklearn = accuracy_score(y_test, y_pred_test_sklearn)
print(f"Test accuracy (scikit-learn SVM): {test_accuracy_sklearn * 100:.2f}%")

### Digits classification using Multi Layer Perceptron
A Multi-Layer Perceptron (MLP) is a feedforward neural network with multiple layers of neurons that use weighted connections and activation functions to learn complex mappings from inputs to outputs.

The output of the MLP can be represented as:

$$
Z^{(1)} = W^{(1)} X + b^{(1)}
$$

$$
A^{(1)} = \text{ReLU}(Z^{(1)})
$$

$$
Z^{(2)} = W^{(2)} A^{(1)} + b^{(2)}
$$

$$
A^{(2)} = \sigma(Z^{(2)})
$$

Where:

- $W^{(1)}, W^{(2)}$: Weight matrices for the first and second layers.
- $b^{(1)}, b^{(2)}$: Bias vectors for the first and second layers.
- $X$: Input features.
- $A^{(1)}, A^{(2)}$: Activations for the first and second layers.
- $\text{ReLU}$: Rectified Linear Unit activation function.
- $\sigma$: Sigmoid activation function for binary classification.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Define the MLP model
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Sigmoid.

    Parameters
    ----------
    input_size : number of input features per sample.
    hidden_size : width of the hidden layer.
    output_size : number of output units; each one is squashed to (0, 1).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names determine the state_dict keys of saved weights
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of shape (batch, input_size) to (batch, output_size)."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Prepare the data
# X_train / y_train are the standardized features and 0/1 labels prepared in
# the SVM section. Labels become float column vectors of shape (N, 1) so
# they match the model output expected by BCELoss.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Initialize the model, loss function, and optimizer
input_size = X_train.shape[1]  # 784 for MNIST
hidden_size = 128
output_size = 1  # Binary classification

model = MLP(input_size, hidden_size, output_size)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    # NOTE: `images` and `labels` rebind the notebook-level names of the same
    # spelling used by earlier cells.
    for i, (images, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # `loss` is the loss of the LAST mini-batch of the epoch, not an epoch average
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model weights
# `os` comes from the import at the top of the notebook
save_path = os.path.join("data", "weights", "MLP")
os.makedirs(save_path, exist_ok=True)  
torch.save(model.state_dict(), os.path.join(save_path, "MNIST.pth"))

# Evaluation
model.eval()
with torch.no_grad():
    # Rounding the sigmoid output gives hard 0/1 predictions
    y_pred_train = model(X_train_tensor).round()
    y_pred_test = model(X_test_tensor).round()

# Fraction of predictions equal to the targets
train_accuracy = (y_pred_train.eq(y_train_tensor).sum() / y_train_tensor.shape[0]).item()
test_accuracy = (y_pred_test.eq(y_test_tensor).sum() / y_test_tensor.shape[0]).item()

print(f'Train Accuracy: {train_accuracy * 100:.2f}%')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

# Confusion Matrix
y_pred_test_np = y_pred_test.numpy().flatten()
y_test_np = y_test_tensor.numpy().flatten()

cm = confusion_matrix(y_test_np, y_pred_test_np)  # arguments are (true, predicted)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["No Circle", "Circle"])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for MLP (Circle vs No-Circle)')
plt.show()


### Digit Classification with LeNet
LeNet is a convolutional neural network (CNN) architecture that employs convolutional and subsampling layers to extract features from images, followed by fully connected layers for classification.

The output of the LeNet Architecture can be represented as:

1. **Convolution and Activation:**
   $$
   Z^{(1)} = W^{(1)} * X + b^{(1)}
   $$
   $$
   A^{(1)} = \text{tanh}(Z^{(1)})
   $$

2. **Subsampling (Pooling):**
   $$
   A^{(2)} = \text{AvgPool}(A^{(1)})
   $$

3. **Second Convolution and Activation:**
   $$
   Z^{(3)} = W^{(3)} * A^{(2)} + b^{(3)}
   $$
   $$
   A^{(3)} = \text{tanh}(Z^{(3)})
   $$

4. **Second Subsampling (Pooling):**
   $$
   A^{(4)} = \text{AvgPool}(A^{(3)})
   $$

5. **Flattening:**
   $$
   A^{(5)} = \text{Flatten}(A^{(4)})
   $$

6. **Fully Connected Layer:**
   $$
   Z^{(6)} = W^{(6)} A^{(5)} + b^{(6)}
   $$
   $$
   A^{(6)} = \sigma(Z^{(6)})
   $$

Where:
- $W^{(l)}$: Weight matrices for each layer.
- $b^{(l)}$: Bias vectors for each layer.
- $X$: Input image.
- $\text{tanh}$: Hyperbolic tangent activation function.
- $\text{AvgPool}$: Average pooling operation.
- $\sigma$: Sigmoid activation function for output classification.

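Pooling is a downsampling operation in CNNs that reduces feature map dimensions by summarizing local regions, commonly using the max or average function.  
Flattening transforms a multi-dimensional tensor into a one-dimensional vector, preparing feature maps for input into fully connected layers in neural networks.  
Note: the implementation below departs from the formulas above in three ways: it uses ReLU activations and max pooling instead of tanh and average pooling, it stacks three fully connected layers, and it outputs two raw class scores (logits) that `CrossEntropyLoss` consumes directly instead of a sigmoid.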

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchvision.transforms as transforms
import numpy as np

# Define LeNet-5 architecture
class LeNet5(nn.Module):
    """LeNet-5 style CNN for 1x28x28 inputs with two output classes.

    Layout: conv(5x5, pad 2) -> ReLU -> max-pool(2) -> conv(5x5) -> ReLU ->
    max-pool(2) -> flatten -> 120 -> 84 -> 2. The forward pass returns raw
    class scores; no softmax or sigmoid is applied.
    """

    def __init__(self):
        super().__init__()
        # Attribute names determine the state_dict keys of saved weights
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1)
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 2)  # Output 2 classes: circle and no-circle

    def forward(self, x):
        """Return class scores of shape (batch, 2) for a (batch, 1, 28, 28) input."""
        # Feature extractor: two conv -> ReLU -> 2x2 max-pool stages
        features = torch.max_pool2d(torch.relu(self.conv1(x)), kernel_size=2, stride=2)
        features = torch.max_pool2d(torch.relu(self.conv2(features)), kernel_size=2, stride=2)

        # Flatten the 16 feature maps of size 5x5 into one vector per sample
        flat = features.view(-1, 16*5*5)

        # Classifier head
        hidden = torch.relu(self.fc1(flat))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Define the digits with and without circles
circle_digits = [0, 6, 8, 9]
no_circle_digits = [1, 2, 3, 4, 5, 7]

# Transform the data to tensor and normalize
# NOTE(review): `transform` is defined here but is not referenced anywhere
# else in the visible code -- confirm whether it is still needed.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Function to filter and label the data
def filter_and_label_data(X, y, circle_digits, no_circle_digits):
    """Keep the samples of the listed digits and turn labels into 0/1.

    Parameters
    ----------
    X : sample array indexed along its first axis.
    y : digit labels aligned with `X`.
    circle_digits : list of digits mapped to label 1.
    no_circle_digits : list of digits mapped to label 0.

    Returns
    -------
    (X_filtered, y_filtered) : the retained samples and their binary labels.
        Samples whose digit is in neither list are dropped.
    """
    wanted = np.isin(y, circle_digits + no_circle_digits)
    kept_labels = y[wanted]
    binary_labels = np.isin(kept_labels, circle_digits).astype(int)
    return X[wanted], binary_labels

# Load the dataset
X_train, y_train = load_mnist(mnist_path, kind='train')
X_test, y_test = load_mnist(mnist_path, kind='t10k')

X_train, y_train = filter_and_label_data(X_train, y_train, circle_digits, no_circle_digits)
X_test, y_test = filter_and_label_data(X_test, y_test, circle_digits, no_circle_digits)

# Reshape data to (n_samples, 1, 28, 28) for PyTorch Conv2D and normalize.
# The raw pixels are uint8 values in 0-255; feeding them to the network
# unscaled skips the normalization this section sets out to apply. The
# arithmetic below mirrors ToTensor() followed by Normalize((0.5,), (0.5,)):
# scale to [0, 1], then shift and scale to [-1, 1].
X_train = (X_train.reshape(-1, 1, 28, 28).astype(np.float32) / 255.0 - 0.5) / 0.5
X_test = (X_test.reshape(-1, 1, 28, 28).astype(np.float32) / 255.0 - 0.5) / 0.5

# Convert to PyTorch tensors
# CrossEntropyLoss expects integer (long) class indices as targets
X_train_tensor = torch.tensor(X_train)
y_train_tensor = torch.tensor(y_train).long()
X_test_tensor = torch.tensor(X_test)
y_test_tensor = torch.tensor(y_test).long()

# Create datasets and data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Instantiate the model, define the loss function and the optimizer
model = LeNet5()
criterion = nn.CrossEntropyLoss()  # takes the raw class scores returned by LeNet5
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of the mini-batch losses of this epoch
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean mini-batch loss over the epoch
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

# Save the trained weights
# `os` comes from the import at the top of the notebook
save_path = os.path.join("data", "weights", "Lenet5")
os.makedirs(save_path, exist_ok=True)  
torch.save(model.state_dict(), os.path.join(save_path, "MNIST.pth"))

# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        # The predicted class is the index of the largest score per sample
        _, predicted = torch.max(outputs.data, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f'Accuracy: {100 * correct / total}%')
