In [21]:
# Install the third-party packages this notebook uses (only needed on a fresh environment).
# As the cell output notes, the kernel may need a restart before updated packages are picked up.
# NOTE(review): versions are not pinned, so installs at different times may resolve to
# different releases - consider pinning once a known-good set is established.
%pip install torch torchvision datasets pandas matplotlib seaborn tqdm --quiet


Note: you may need to restart the kernel to use updated packages.


# Workshop: Handwritten Mathematical Expression Recognition

In this workshop, we will train different types of models to recognize handwritten mathematical expressions and convert them to LaTeX. We will use the [Azu/Handwritten-Mathematical-Expression-Convert-LaTeX](https://huggingface.co/datasets/Azu/Handwritten-Mathematical-Expression-Convert-LaTeX) dataset from Hugging Face.

## Step 1: Load the Dataset
We will start by loading the dataset and exploring its structure.


In [22]:
from datasets import load_dataset

# Fetch the dataset by its Hugging Face identifier; the result holds every available split.
ds = load_dataset(path="Azu/Handwritten-Mathematical-Expression-Convert-LaTeX")

# Bare last expression so the notebook renders the dataset's repr.
ds


DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 12167
    })
})

In [23]:
# Inspect the raw data: available splits, column schema, a few rows, and the label vocabulary.
# NOTE(review): per the output below, `label` is a ClassLabel over
# ['2014', '2016', '2019', 'train'] - these look like data-subset names rather than
# LaTeX strings; confirm this is the intended prediction target.
train_split = ds['train']
label_names = train_split.features['label'].names

print("Dataset keys:", ds.keys())
print("Features:", train_split.features)
print("First 3 samples:", train_split[0:3])
print("\nClass names:", label_names)
print("Number of classes:", len(label_names))


Dataset keys: dict_keys(['train'])
Features: {'image': Image(mode=None, decode=True), 'label': ClassLabel(names=['2014', '2016', '2019', 'train'])}
First 3 samples: {'image': [<PIL.BmpImagePlugin.BmpImageFile image mode=L size=464x85 at 0x1FF615078A0>, <PIL.BmpImagePlugin.BmpImageFile image mode=L size=105x69 at 0x1FF618FC050>, <PIL.BmpImagePlugin.BmpImageFile image mode=L size=76x56 at 0x1FF61E85C50>], 'label': [0, 0, 0]}

Class names: ['2014', '2016', '2019', 'train']
Number of classes: 4


In [24]:
from torch.utils.data import Dataset

class HFDataset(Dataset):
    """Adapter that exposes an indexable collection of rows as a PyTorch ``Dataset``.

    Every row is expected to provide an ``'image'`` entry (an object with a
    ``convert`` method) and a ``'label'`` entry. Items are returned as
    ``(image, label)`` pairs.
    """

    def __init__(self, hf_dataset, transform=None):
        """Store the wrapped dataset and the optional per-image transform."""
        self.transform = transform
        self.hf_dataset = hf_dataset

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``.

        The image is converted to mode ``'L'`` first and then, if a transform
        was supplied, passed through it.
        """
        row = self.hf_dataset[idx]
        target = row['label']
        grayscale = row['image'].convert('L')  # single-channel image
        if not self.transform:
            return grayscale, target
        return self.transform(grayscale), target


## Step 2: Splitting the Dataset

The loaded dataset contains only a `train` split. To evaluate our models on data they were not trained on, we need to hold part of it out. We will randomly split the original training data into a new training set (80%) and a held-out validation set (20%); the code below refers to the held-out portion as `val_ds`.


In [25]:
# Carve a held-out validation set (20% of the rows) out of the single 'train' split.
# A fixed seed is passed so the same split can be requested again.
train_test_split = ds['train'].train_test_split(seed=42, test_size=0.2)
train_ds, val_ds = (train_test_split[part] for part in ('train', 'test'))

# Summarise the split sizes and the label vocabulary.
print(f"Training samples: {len(train_ds)}")
print(f"Validation samples: {len(val_ds)}")
print(f"Number of classes: {len(ds['train'].features['label'].names)}")
print(f"Class names: {ds['train'].features['label'].names}")


Training samples: 9733
Validation samples: 2434
Number of classes: 4
Class names: ['2014', '2016', '2019', 'train']


## Step 3: Building a CNN Model

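Let's build a Convolutional Neural Network to classify the handwritten mathematical expressions. Note that the `label` column loaded above is one of four class names (`2014`, `2016`, `2019`, `train`) rather than a LaTeX string, so this baseline is a 4-way image classifier; producing LaTeX output requires a sequence model (see Next Steps).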


In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
import os
import pandas as pd
from tqdm import tqdm

# Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
EPOCHS = 5  # Reduced for faster experimentation
SEED = 42  # Same value as the seed passed to train_test_split above

# Seed PyTorch's global RNG so that weight initialisation, dropout masks and
# DataLoader shuffling start from the same state on every Restart & Run All.
# (Bit-for-bit repeatability can still depend on the device and library versions.)
torch.manual_seed(SEED)

# Device selection: CUDA (NVIDIA GPU), MPS (Apple Silicon), or CPU as fallback
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('Using CUDA (GPU)')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    # The hasattr guard keeps this working on torch builds without the MPS backend.
    device = torch.device('mps')
    print('Using MPS (Apple Silicon GPU)')
else:
    device = torch.device('cpu')
    print('Using CPU')

# Preprocessing applied to every image before it reaches the model.
preprocessing_steps = [
    transforms.Resize((64, 256)),          # target (height, width)
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),  # one value per channel; images are single-channel
]
transform = transforms.Compose(preprocessing_steps)

# Wrap both splits with the same preprocessing.
train_dataset, val_dataset = (
    HFDataset(split, transform=transform) for split in (train_ds, val_ds)
)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

print(f"Train batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")


Using CPU
Train batches: 305
Validation batches: 77


In [27]:
class SimpleCNN(nn.Module):
    """Small CNN: three conv + ReLU + 2x2 max-pool stages, then a two-layer classifier.

    Args:
        num_classes: Number of output logits produced by the final layer.
        input_size: ``(height, width)`` of the single-channel input images.
            Defaults to ``(64, 256)``, which reproduces the original fixed
            ``128 * 8 * 32`` flattened feature size.

    Raises:
        ValueError: If either input dimension is smaller than 8, i.e. it would
            be reduced to zero by the three pooling stages.
    """

    def __init__(self, num_classes, input_size=(64, 256)):
        super().__init__()
        height, width = input_size
        # Each of the three 2x2 max-pools halves (rounding down) both spatial dims,
        # so the final feature map is (height // 8, width // 8).
        feat_h, feat_w = height // 8, width // 8
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f"input_size must be at least (8, 8) to survive three 2x2 pools, got {input_size!r}"
            )

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)

        # Flattened feature size is derived from input_size instead of being hard-coded.
        self.fc1 = nn.Linear(128 * feat_h * feat_w, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Map a ``(batch, 1, height, width)`` tensor to ``(batch, num_classes)`` logits."""
        # Spatial sizes in the comments are for the default (64, 256) input.
        x = self.pool(F.relu(self.conv1(x)))  # (32, 128)
        x = self.pool(F.relu(self.conv2(x)))  # (16, 64)
        x = self.pool(F.relu(self.conv3(x)))  # (8, 32)
        # flatten (unlike .view) does not require the tensor to be contiguous.
        x = torch.flatten(x, 1)
        x = self.dropout(F.relu(self.fc1(x)))
        # Raw logits are returned; no softmax is applied here.
        x = self.fc2(x)
        return x

# Size the output layer from the dataset's label vocabulary.
num_classes = len(ds['train'].features['label'].names)
print(f"Number of classes: {num_classes}")

# Build the network, then place it on the selected device.
model = SimpleCNN(num_classes)
model = model.to(device)

# Cross-entropy loss on the model's raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Count only parameters that will receive gradient updates.
print(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")


Number of classes: 4
Model parameters: 16872452


## Step 4: Training the CNN Model

Now let's train our CNN model and track its performance.


In [None]:
def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    If ``optimizer`` is given, the model is put in train mode and its weights are
    updated after every batch. Otherwise the model is put in eval mode and
    gradient tracking is disabled.

    ``mean_loss`` is the per-sample average (each batch loss is weighted by the
    batch size) and ``accuracy`` is a fraction in [0, 1].
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is what model.eval() does

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(is_training):
        pbar = tqdm(loader, desc=desc)
        for images, labels in pbar:
            images, labels = images.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the mean.
            loss_sum += loss.item() * images.size(0)
            _, predicted = outputs.max(1)
            n_correct += predicted.eq(labels).sum().item()
            n_seen += labels.size(0)

            # Update progress bar with the current batch loss and running accuracy
            pbar.set_postfix({
                'Loss': f'{loss.item():.4f}',
                'Acc': f'{100.*n_correct/n_seen:.2f}%'
            })

    return loss_sum / n_seen, n_correct / n_seen


# Directory to save trained models
os.makedirs('saved_models', exist_ok=True)

# Training history: one record per epoch. The DataFrame is rebuilt from the list
# rather than grown with pd.concat, which avoids concatenating onto an empty frame
# (object dtypes / pandas FutureWarning) while still leaving an up-to-date
# `history_df` behind if training is interrupted.
history_columns = ['model', 'epoch', 'train_loss', 'val_loss', 'train_acc', 'val_acc']
history_records = []
history_df = pd.DataFrame(columns=history_columns)

# Training loop
for epoch in range(EPOCHS):
    train_loss, train_acc = _run_epoch(
        model, train_loader, criterion, device,
        desc=f'Training Epoch {epoch+1}/{EPOCHS}',
        optimizer=optimizer,
    )
    val_loss, val_acc = _run_epoch(
        model, val_loader, criterion, device,
        desc=f'Validation Epoch {epoch+1}/{EPOCHS}',
    )

    # Print epoch results
    print(f'Epoch {epoch+1}/{EPOCHS}:')
    print(f'  Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    print(f'  Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}')
    print('-' * 50)

    # Record this epoch and refresh the DataFrame view of the history
    history_records.append({
        'model': 'SimpleCNN',
        'epoch': epoch+1,
        'train_loss': train_loss,
        'val_loss': val_loss,
        'train_acc': train_acc,
        'val_acc': val_acc
    })
    history_df = pd.DataFrame(history_records, columns=history_columns)

    # Save model checkpoint
    torch.save(model.state_dict(), f'saved_models/SimpleCNN_epoch{epoch+1}.pt')

# Save training history to CSV
history_df.to_csv('saved_models/training_history.csv', index=False)
print("Training completed!")
# Guarded so that EPOCHS == 0 does not fail on an undefined `val_acc`.
if history_records:
    print(f"Final validation accuracy: {history_records[-1]['val_acc']:.4f}")


Training Epoch 1/5:   1%|▏         | 4/305 [00:14<18:37,  3.71s/it, Loss=1.1082, Acc=55.47%]

## Step 5: Visualize Training Results


In [None]:
import matplotlib.pyplot as plt

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# One row per panel:
# (axis, train column, val column, train legend, val legend, y-axis label, title)
panel_specs = (
    (ax1, 'train_loss', 'val_loss', 'Train Loss', 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    (ax2, 'train_acc', 'val_acc', 'Train Accuracy', 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
)

for panel, train_col, val_col, train_label, val_label, y_label, title in panel_specs:
    panel.plot(history_df['epoch'], history_df[train_col], label=train_label, marker='o')
    panel.plot(history_df['epoch'], history_df[val_col], label=val_label, marker='s')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(title)
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()

# Print the per-epoch numbers behind the curves
print("\nTraining History:")
print(history_df)


## Step 6: Test the Model

Let's test our trained model on a few samples from the validation set.


In [None]:
import numpy as np

# Get class names
class_names = ds['train'].features['label'].names

# Run the model on the first validation batch; gradients are not needed for inference.
model.eval()
with torch.no_grad():
    images, labels = next(iter(val_loader))
    images, labels = images.to(device), labels.to(device)

    # Make predictions
    outputs = model(images)
    _, predicted = outputs.max(1)

# Copy what the plot needs to the CPU once, as plain Python ints / CPU tensors.
images_cpu = images.cpu()
true_ids = labels.cpu().tolist()
pred_ids = predicted.cpu().tolist()

# Show up to 8 samples. The batch can hold fewer than 8 items (e.g. a small
# BATCH_SIZE), in which case the remaining panels are left blank instead of
# raising an IndexError.
n_show = min(8, len(true_ids))
fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.ravel()

for i, ax in enumerate(axes):
    ax.axis('off')
    if i >= n_show:
        continue

    # Drop the channel dimension so imshow receives a 2-D array
    ax.imshow(images_cpu[i].squeeze().numpy(), cmap='gray')
    ax.set_title(f'True: {class_names[true_ids[i]]}, Pred: {class_names[pred_ids[i]]}')

    # Color the title based on correctness
    ax.title.set_color('green' if true_ids[i] == pred_ids[i] else 'red')

plt.tight_layout()
plt.show()

# Calculate accuracy for this batch (over the whole batch, not just the shown samples)
batch_acc = (predicted == labels).float().mean().item()
print(f"Batch accuracy: {batch_acc:.4f}")


## Next Steps

Now you can experiment with:

1. **Different architectures**: Try deeper CNNs, ResNet blocks, or attention mechanisms
2. **Hyperparameters**: Adjust learning rate, batch size, number of epochs
3. **Data augmentation**: Add rotation, scaling, noise to improve generalization
4. **Advanced models**: Implement encoder-decoder architectures for sequence generation
5. **Transfer learning**: Use pretrained models like TrOCR or Vision Transformers

The training history is saved in `saved_models/training_history.csv` and model checkpoints are in the `saved_models/` directory.
