# Train A Shape Classifier Model



In [None]:
import json
import os

# Root directories of the training and test image sets, given relative to the
# working directory. Both are later loaded with torchvision's ImageFolder.
train_data_root = "../datasets/train"
test_data_root = "../datasets/test"

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import os

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Preprocessing applied to every image, in order:
#   1. collapse to a single grayscale channel
#   2. resize to 64x64 pixels
#   3. convert to a tensor
#   4. normalize with mean=0.5 and std=0.5
transform = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# One sub-folder per class under each root; the same preprocessing is used
# for both splits
train_dataset = datasets.ImageFolder(train_data_root, transform=transform)
test_dataset = datasets.ImageFolder(test_data_root, transform=transform)

# Only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Show the class names discovered in the training folder (optional)
print(f"Classes: {train_dataset.classes}")


# 2. Define a simple CNN model
class SimpleCNN(nn.Module):
    """Small two-stage convolutional classifier for 1-channel 64x64 images.

    Each stage is a 3x3 convolution (padding 1) followed by ReLU and a 2x2
    max-pool, so a 64x64 input is reduced to 16x16 with 32 channels before
    the two fully connected layers.

    Args:
        num_classes: Number of output logits. Defaults to 3, matching the
            original circle / triangle / rectangle task.
    """

    def __init__(self, num_classes: int = 3):
        super().__init__()
        # conv1's output channel count must equal conv2's input channel
        # count, so both layers read it from this single attribute
        self.conv1_out = 16
        self.conv1 = nn.Conv2d(1, self.conv1_out, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(self.conv1_out, 32, kernel_size=3, padding=1)
        # 32 channels * 16 * 16 spatial positions remain after two 2x2 pools
        # of a 64x64 input
        self.fc1 = nn.Linear(32 * 16 * 16, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw class logits for a batch of images.

        Args:
            x: Batch shaped ``(N, 1, 64, 64)``; any other spatial size does
                not match the input width of ``fc1``.

        Returns:
            Tensor of shape ``(N, num_classes)`` holding unnormalized logits.
        """
        x = F.relu(self.conv1(x))  # First conv layer
        x = F.max_pool2d(x, 2)  # 64x64 -> 32x32
        x = F.relu(self.conv2(x))  # Second conv layer
        x = F.max_pool2d(x, 2)  # 32x32 -> 16x16
        x = x.view(x.size(0), -1)  # Flatten everything but the batch axis
        x = F.relu(self.fc1(x))  # Fully connected layer 1
        x = self.fc2(x)  # Fully connected layer 2 (logits, no softmax)
        return x


# Build the network on the selected device, then pair it with a
# cross-entropy loss and an Adam optimizer (learning rate 0.001)
model = SimpleCNN()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


def train_model(model, train_loader, criterion, optimizer, epochs=10):
    """Train ``model`` in place and print the loss and accuracy of each epoch.

    Args:
        model: Network to optimize. It is switched to training mode, and
            every batch is moved to the device of its first parameter.
        train_loader: Iterable yielding ``(images, labels)`` batches. It is
            iterated once per epoch and does not need to support ``len()``.
        criterion: Callable computing the loss from ``(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If an epoch yields no samples, since no mean loss or
            accuracy can be computed.
    """
    # Follow the model's own device instead of a module-level global, so the
    # inputs always end up where the weights are
    first_param = next(model.parameters(), None)
    target_device = (
        first_param.device if first_param is not None else torch.device("cpu")
    )

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        num_batches = 0
        for images, labels in train_loader:
            images = images.to(target_device)
            labels = labels.to(target_device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Statistics
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            num_batches += 1

        # Fail with a clear message instead of a bare ZeroDivisionError
        if total == 0:
            raise ValueError("train_loader yielded no samples; nothing to train on")

        # Loss is the mean over batches, accuracy the mean over samples
        accuracy = (100 * correct) / total
        print(
            f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / num_batches:.4f}, Accuracy: {accuracy:.2f}%"
        )

# Train

In [None]:
# Fit the classifier for 15 passes over the training set
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=15,
)

# Test

In [None]:
def test(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print per-class metrics.

    A confusion matrix is accumulated over every batch, and the precision,
    recall and F1 score of each class are derived from it and written to
    stdout.
    """
    # Evaluation mode matters as soon as the model contains layers such as
    # dropout or batchnorm, which behave differently outside of training
    model.eval()

    class_names = test_loader.dataset.classes
    num_classes = len(class_names)

    # Rows are indexed by the true class, columns by the predicted class
    counts = torch.zeros((num_classes, num_classes), dtype=torch.int32)

    # No gradients are needed here, so skip their bookkeeping entirely
    with torch.inference_mode():
        for batch, targets in test_loader:
            # Forward pass on the compute device; the winning class index is
            # brought back to the CPU so it can index the matrix
            logits = model(batch.to(device))
            predicted = logits.argmax(1).cpu()
            actual = targets.cpu()

            # Tally each (true, predicted) pair of this batch
            for row, col in zip(actual, predicted):
                counts[row, col] += 1

    # Diagonal entries are the correct predictions of each class
    true_pos = counts.diagonal()
    # Everything else in a class's column was wrongly predicted as that class
    false_pos = counts.sum(0) - true_pos
    # Everything else in a class's row is a missed sample of that class
    false_neg = counts.sum(1) - true_pos

    # Tiny constant keeping the denominators away from zero
    eps = 1e-9
    precision = true_pos / (true_pos + false_pos + eps)
    recall = true_pos / (true_pos + false_neg + eps)
    f1 = (2 * precision * recall) / (precision + recall + eps)

    # Report the metrics class by class
    for name, p, r, f in zip(class_names, precision, recall, f1):
        print(
            f"\nClass: {name}\n"
            f"\tPrecision: {p:.3f}\n"
            f"\tRecall: {r:.3f}\n"
            f"\tF1 Score: {f:.3f}"
        )


# Report per-class precision, recall and F1 on the held-out test set
test(model=model, test_loader=test_loader)

# Show Predictions


In [None]:
from PIL import ImageDraw, ImageFont


def show_prediction(model, image):
    """Classify ``image`` with ``model`` and display it with the result drawn on it.

    The predicted class name and its softmax confidence (as a percentage) are
    written near the top-left corner of an RGB copy of the image, which is
    then displayed with ``Image.show()``.
    """
    # Evaluation mode matters as soon as the model contains layers such as
    # dropout or batchnorm, which behave differently outside of training
    model.eval()

    # Apply the training-time preprocessing and add a batch axis of size 1
    batch = transform(image).unsqueeze(0).to(device)

    # Forward pass: logits -> probabilities -> most likely class
    with torch.inference_mode():
        probabilities = model(batch).softmax(1)
        confidence, class_idx = probabilities.max(1)

    # Translate the winning index into its class name
    class_name = train_dataset.classes[class_idx.item()]
    caption = f"{class_name}\n{confidence.item() * 100:.1f}%"

    # Draw on an RGB copy so the green caption can be rendered
    annotated = image.convert("RGB")
    ImageDraw.Draw(annotated).text((3, 3), caption, fill=(0, 155, 0))

    annotated.show()

In [None]:
import random
from torchvision.transforms.functional import to_pil_image

idx = random.randrange(len(test_dataset))

# test_dataset returns (Tensor, label). The tensor has already been through
# `transform`, so Normalize(0.5, 0.5) has mapped its values into [-1, 1].
sample_tensor, sample_label = test_dataset[idx]
# to_pil_image scales float tensors by 255 and casts them to bytes, so it
# needs values in [0, 1]. Undo the normalization first; otherwise negative
# pixels are corrupted and show_prediction classifies a garbled image.
sample_img = to_pil_image((sample_tensor * 0.5 + 0.5).clamp(0.0, 1.0))
show_prediction(model, sample_img)