# Handwritten Math Symbol Recognition with a Convolutional Neural Network (PyTorch)

This notebook trains a **convolutional neural network (CNN)** to recognize **handwritten digits and basic math operators**
from images. The goal is to demonstrate a clean, end-to-end workflow:

- image preprocessing with `torchvision.transforms`
- train/test split from a folder-based dataset
- a compact **AlexNet-style** CNN adapted for **grayscale 100×100** inputs
- training on GPU (if available)
- evaluation with accuracy, precision, recall, and F1-score


## Setup
If you're running this locally and you don't have PyTorch/torchvision, install them first.


In [None]:
# Uncomment if needed:
# !pip -q install torch torchvision scikit-learn torchsummary


In [None]:
import os
import torch
import torch.nn as nn

from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt


## Dataset
Expected structure (folder-per-class):

```
dataset_digits/
  0/ ...
  1/ ...
  ...
  add/ ...
  sub/ ...
  mul/ ...
  ...
```
If you have `dataset_digits.zip`, this cell extracts it (only if the folder doesn't exist).


In [None]:
import zipfile

path_dir = "dataset_digits"
archive_path = "dataset_digits.zip"

# Extract only when the dataset folder is missing and the archive is present.
# zipfile is used instead of shelling out to `unzip`, so this works on systems
# without that binary and a failed extraction is reported instead of ignored.
if not os.path.exists(path_dir) and os.path.exists(archive_path):
    try:
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall()  # extracts into the current working directory
    except (zipfile.BadZipFile, OSError) as err:
        # Keep the cell best-effort: report the problem and fall through to
        # the status line below rather than aborting the notebook here.
        print("Could not extract", archive_path, "-", err)

print("Dataset folder exists:", os.path.exists(path_dir))


## Data loading + preprocessing
We standardize the input pipeline with these transforms (order matters):

1. **Resize** to 100×100 (fixed input size for the CNN)
2. Convert to **grayscale** (1 channel)
3. Convert to **tensor**
4. **Normalize** with mean 0.5 and std 0.5, i.e. `(x - 0.5) / 0.5`, which rescales pixel values from [0, 1] to [-1, 1]

We also create a train/test split from the folder dataset because no split is provided.


In [None]:
def load_and_split_dataset(data_dir, batch_size=128, train_split=0.8, seed=None):
    """Load a folder-per-class image dataset and split it into train/test loaders.

    Every image is resized to 100x100, converted to one grayscale channel,
    turned into a tensor and normalized with mean 0.5 / std 0.5.

    Args:
        data_dir: Root folder handed to ``datasets.ImageFolder``.
        batch_size: Batch size used by both loaders.
        train_split: Fraction of the samples assigned to the training subset;
            must lie in ``[0, 1]``.
        seed: Optional integer seed for the random split. ``None`` (default)
            uses the global torch RNG, so the split differs between runs.
            The seed only fixes the split; the per-epoch shuffling of the
            training loader still uses the global RNG.

    Returns:
        ``(train_loader, test_loader, transform, dataset)`` where ``dataset``
        is the full, unsplit ``ImageFolder``.

    Raises:
        ValueError: If ``train_split`` is outside ``[0, 1]``.
    """
    # Validate before touching the disk: an out-of-range fraction would
    # otherwise produce a negative subset size that still sums to the dataset
    # length and silently yields a degenerate split.
    if not 0.0 <= train_split <= 1.0:
        raise ValueError(f"train_split must be in [0, 1], got {train_split!r}")

    mean = [0.5]
    std = [0.5]

    transform = transforms.Compose([
        transforms.Resize((100, 100)),
        transforms.Grayscale(num_output_channels=1),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    dataset = datasets.ImageFolder(root=data_dir, transform=transform)
    train_size = int(train_split * len(dataset))
    test_size = len(dataset) - train_size

    # Only pass a generator when a seed was requested, so the default call is
    # exactly the original unseeded split.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)

    train_dataset, test_dataset = random_split(dataset, [train_size, test_size], **split_kwargs)

    # Shuffle only the training data; the test loader keeps a stable order.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, transform, dataset

# Build the loaders once at notebook level; later cells reuse these globals
# (train_loader, test_loader, dataset).
data_dir = "dataset_digits"
train_loader, test_loader, transform, dataset = load_and_split_dataset(data_dir)

# `dataset` is the full ImageFolder, so its class list covers every class
# regardless of how the samples were split.
print("Classes:", dataset.classes)
print("Num classes:", len(dataset.classes))


## CNN model (AlexNet-style, adapted)
We build an AlexNet-inspired model adjusted for:
- **grayscale** inputs (1 channel)
- **100×100** spatial resolution
- a final classifier with `num_classes` outputs

Note: The model outputs raw **logits** (there is no softmax layer). We use `CrossEntropyLoss`, which applies log-softmax internally.


In [None]:
class AlexNetModified(nn.Module):
    """AlexNet-style CNN for single-channel (grayscale) image classification.

    The network is a five-convolution feature extractor followed by a
    three-layer fully connected classifier. It returns raw logits; no softmax
    is applied.

    The layers are sized for 100x100 inputs, where the feature map is
    (N, 128, 7, 7). An adaptive average pool to 7x7 sits between the feature
    extractor and the classifier so that other input sizes also work. For
    100x100 inputs that pool is an identity operation, and it has no
    parameters, so ``state_dict`` keys are unchanged. Inputs must still be
    large enough to survive the strided convolution and the two max-pools.

    Args:
        num_classes: Number of output logits.
        dropout: Dropout probability used in the classifier.
    """

    def __init__(self, num_classes: int, dropout: float = 0.5) -> None:
        super().__init__()

        # Input: (N, 1, H, W). Shape comments below assume H = W = 100.
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=9, stride=3, padding=2),  # -> 32x32
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),  # -> 15x15

            nn.Conv2d(in_channels=32, out_channels=96, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),  # -> 7x7

            nn.Conv2d(in_channels=96, out_channels=192, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=192, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )

        # Pool to a fixed 7x7 grid so the classifier input size does not
        # depend on the image resolution. For 100x100 input the feature map is
        # already (N, 128, 7, 7), so each pooling window holds one element.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        # Flatten size = 128 * 7 * 7 = 6272
        fc_in = 128 * 7 * 7

        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(fc_in, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(512, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (N, num_classes) for a batch of shape (N, 1, H, W)."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        x = self.classifier(x)
        return x

# Quick sanity check
# Push one random 100x100 single-channel image through an untrained model to
# confirm the layer shapes line up.
num_classes = len(dataset.classes)
model = AlexNetModified(num_classes=num_classes)
dummy = torch.randn(1, 1, 100, 100)
out = model(dummy)
out.shape  # bare expression so the notebook displays it; expect (1, num_classes)


## (Optional) Model summary
If you want a quick layer/shape overview, use `torchsummary`.


In [None]:
# Uncomment to see a summary
# !pip -q install torchsummary
# from torchsummary import summary
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# summary(AlexNetModified(num_classes=num_classes).to(device), input_size=(1, 100, 100), device=str(device))


## Training (GPU if available)
We train with:
- `CrossEntropyLoss`
- `Adam` optimizer (lr=1e-3)

You can increase `num_epochs` once you're happy with the pipeline.


In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  # bare expression so the notebook displays the selected device


In [None]:
def train_model(model, train_loader, valid_loader, criterion, optimizer, device, num_epochs=5):
    """Train ``model`` and print the mean train/validation loss after each epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``valid_loader``. Reported losses are the average
    of the per-batch loss values.

    Args:
        model: Network to optimise; it is moved to ``device``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        valid_loader: Iterable of ``(inputs, labels)`` held-out batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer bound to the model's parameters.
        device: Device on which batches and the model are placed.
        num_epochs: Number of passes over the training data.

    Returns:
        The same ``model`` object, left in evaluation mode once at least one
        epoch has run.
    """
    model.to(device)

    for epoch_number in range(1, num_epochs + 1):
        # ---- optimisation pass ----
        model.train()
        running_train = 0.0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()

            running_train += batch_loss.item()

        # max(1, ...) keeps the average defined for an empty loader.
        train_mean = running_train / max(1, len(train_loader))

        # ---- held-out pass (the notebook passes the test loader here) ----
        model.eval()
        running_valid = 0.0
        with torch.no_grad():
            for batch_x, batch_y in valid_loader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)
                running_valid += criterion(model(batch_x), batch_y).item()

        valid_mean = running_valid / max(1, len(valid_loader))

        print(f"Epoch {epoch_number}/{num_epochs} | Train Loss: {train_mean:.4f} | Val Loss: {valid_mean:.4f}")

    return model


In [None]:
# Create a fresh model instance (rebinding `model` from the sanity-check cell),
# along with the loss and optimizer used for training.
model = AlexNetModified(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# NOTE: the test loader doubles as the validation loader here, so the reported
# validation loss is measured on the same split used for the final evaluation.
trained_model = train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=5)


## Evaluation
We compute:
- accuracy
- weighted precision / recall / F1

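Weighted averaging weights each class's score by its number of true samples. Note that weighted recall is always equal to accuracy, so for imbalanced data it is worth also checking macro-averaged or per-class metrics rather than relying on accuracy alone.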


In [None]:
def predict_and_evaluate(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and score its predictions.

    The predicted class for each sample is the index of the largest output
    along dimension 1.

    Args:
        model: Trained network; it is moved to ``device`` and put in eval mode.
        data_loader: Iterable of ``(inputs, true_labels)`` batches.
        device: Device on which inference runs.

    Returns:
        ``(predictions, labels, accuracy, precision, recall, f1)``. The first
        two are flat lists in loader order; precision, recall and F1 are
        weighted averages, with undefined values reported as 0.
    """
    model.to(device)
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for batch_inputs, batch_targets in data_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            logits = model(batch_inputs)
            # torch.max over dim 1 returns (values, indices); keep the indices.
            predicted = torch.max(logits, 1)[1]

            predicted_np = predicted.view(-1).cpu().numpy()
            targets_np = batch_targets.view(-1).cpu().numpy()
            predictions.extend(predicted_np)
            labels.extend(targets_np)

    accuracy = accuracy_score(labels, predictions)
    scores = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    precision, recall, f1 = scores[0], scores[1], scores[2]

    return predictions, labels, accuracy, precision, recall, f1

# Score the trained model on the held-out test split.
predictions, labels, accuracy, precision, recall, f1 = predict_and_evaluate(trained_model, test_loader, device)

# Precision, recall and F1 are the weighted averages from predict_and_evaluate.
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")


## Visualize a few predictions
We plot a handful of test images with their predicted and true labels.


In [None]:
def show_predictions(data_loader, predictions, labels, class_names, num_images=10):
    """Plot the first images of ``data_loader`` with predicted and true class names.

    Only the first batch of ``data_loader`` is drawn. ``predictions[i]`` and
    ``labels[i]`` are assumed to belong to the i-th image of that batch, which
    holds when they were produced by iterating the same loader in the same
    (unshuffled) order.

    Args:
        data_loader: Iterable of ``(inputs, targets)`` batches; ``inputs`` is
            indexed as ``inputs[i][0]`` to get a 2-D image.
        predictions: Predicted class indices, aligned with the loader order.
        labels: True class indices, aligned with the loader order.
        class_names: Sequence mapping a class index to its display name.
        num_images: Maximum number of images to draw.
    """
    plt.figure(figsize=(14, 3))

    inputs, _ = next(iter(data_loader))

    # Never index past the batch or past the prediction/label lists, and size
    # the subplot grid to what is actually drawn so no columns are left empty.
    n_shown = min(num_images, inputs.shape[0], len(predictions), len(labels))

    for i in range(n_shown):
        plt.subplot(1, n_shown, i + 1)
        plt.imshow(inputs[i][0], cmap="gray")  # grayscale channel
        plt.title(f"Pred: {class_names[predictions[i]]}\nTrue: {class_names[labels[i]]}", fontsize=8)
        plt.axis("off")

    plt.tight_layout()
    plt.show()

# test_loader is created with shuffle=False, so its first batch lines up with
# the start of `predictions` / `labels` computed from the same loader.
class_names = dataset.classes
show_predictions(test_loader, predictions, labels, class_names, num_images=10)
