In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18, ResNet18_Weights
import pandas as pd
import os
from torch.utils.data import DataLoader, random_split
from torch.optim import Adam


# Option 1: Simple CNN from scratch
class SimpleCNN(nn.Module):
    """Small three-stage convolutional classifier trained from scratch.

    The network is three ``Conv2d -> ReLU -> MaxPool2d(2, 2)`` stages
    (3 -> 16 -> 32 -> 64 channels) followed by a two-layer fully connected
    head with dropout.

    Args:
        num_classes: Number of output logits.
        input_size: Side length, in pixels, of the square input images the
            model will receive. Defaults to 224, which yields the original
            28x28 feature map after the three pooling stages.

    Raises:
        ValueError: If ``input_size`` is smaller than 8, which would leave
            no spatial features after three 2x2 poolings.
    """

    def __init__(self, num_classes=3, input_size=224):
        super().__init__()
        if input_size < 8:
            raise ValueError(
                f"input_size must be at least 8 to survive three 2x2 poolings, "
                f"got {input_size}"
            )

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # The padded 3x3 convolutions keep the spatial size; each 2x2 max pool
        # halves it (rounding down), so three pools give input_size // 8
        # (224 -> 28).
        feature_side = input_size // 8
        self.fc1 = nn.Linear(64 * feature_side * feature_side, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        # Flatten everything except the batch dimension. Unlike
        # ``view(-1, features)``, this never folds samples into each other when
        # the input size is wrong; ``fc1`` raises a shape error instead.
        x = torch.flatten(x, 1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


# Option 2: Transfer Learning with ResNet
class ResNetTransfer(nn.Module):
    """ResNet18 feature extractor with a freshly initialised classification head.

    The pretrained backbone is frozen; only the replacement ``fc`` layer,
    created after freezing, has trainable parameters.

    Args:
        num_classes: Number of output logits produced by the new head.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Pretrained ResNet18 backbone with the default torchvision weights.
        backbone = resnet18(weights=ResNet18_Weights.DEFAULT)

        # Turn off gradients for every pretrained parameter.
        backbone.requires_grad_(False)

        # Swap in a new head; it is built after the freeze, so it stays trainable.
        backbone.fc = nn.Linear(backbone.fc.in_features, num_classes)

        self.resnet = backbone

    def forward(self, x):
        """Run ``x`` through the ResNet and return its logits."""
        logits = self.resnet(x)
        return logits


# Example usage:
def main():
    """Build both candidate models and print their layer structure."""
    # Instantiate both models up front, then print each under its heading.
    summaries = (
        ("Simple CNN Architecture:", SimpleCNN(num_classes=3)),
        ("\nResNet Transfer Learning Architecture:", ResNetTransfer(num_classes=3)),
    )

    for heading, network in summaries:
        print(heading)
        print(network)


# Print both architectures when this code runs as the top-level module
# (presumably always the case inside a notebook kernel); the guard keeps the
# printout from firing if the code is imported from elsewhere.
if __name__ == "__main__":
    main()

Simple CNN Architecture:
SimpleCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=50176, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=3, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
)

ResNet Transfer Learning Architecture:
ResNetTransfer(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kern

In [15]:
# Fixed seed so the train/test membership is identical on every run; without
# it each re-execution of this cell evaluates on a different held-out set.
SEED = 42

# Define transformations: every image becomes a 3-channel grayscale 224x224
# tensor, then Normalize applies (x - 0.5) / 0.5 per channel.
# NOTE(review): the pretrained ResNet weights were presumably trained with
# ImageNet mean/std normalization rather than 0.5/0.5 -- confirm whether this
# should differ for the transfer-learning model.
transform = transforms.Compose(
    [
        transforms.Grayscale(num_output_channels=3),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

path_to_dataset = "document_classification"

# ImageFolder derives the class labels from the sub-directory names under root.
dataset = torchvision.datasets.ImageFolder(root=path_to_dataset, transform=transform)

# Split the dataset into training and testing sets
train_ratio = 0.8
train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size  # remainder, so the sizes always sum to len(dataset)

train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

batch_size = 4
num_workers = 2

train_loader = DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
)
test_loader = DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
)

print(f"{len(train_dataset)} training samples")
print(f"{len(test_dataset)} testing samples")

# Sanity check: show the tensor shapes of the first training batch only.
for images, labels in train_loader:
    print(images.shape, labels.shape)
    break

132 training samples
33 testing samples
torch.Size([4, 3, 224, 224]) torch.Size([4])


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _evaluate_accuracy(model, loader):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Puts the model in eval mode and disables gradient tracking. Returns
    ``nan`` when the loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total if total else float("nan")


def train_model(model, train_loader, test_loader, num_epoches=10, lr_rate=0.001):
    """Train ``model`` with Adam and cross-entropy loss, printing metrics per epoch.

    After every epoch the mean training loss, training accuracy and test
    accuracy are printed. Training metrics are averaged over samples rather
    than over batches, so a shorter final batch does not skew them.

    Args:
        model: Network to train; it is moved to the module-level ``device``.
        train_loader: Iterable of ``(images, labels)`` training batches.
        test_loader: Iterable of ``(images, labels)`` evaluation batches.
        num_epoches: Number of passes over ``train_loader``.
        lr_rate: Learning rate passed to Adam.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    # Hand only trainable parameters to the optimizer so frozen backbone
    # weights are not tracked by it.
    optimizer = Adam((p for p in model.parameters() if p.requires_grad), lr=lr_rate)

    for epoch in range(num_epoches):
        model.train()
        loss_sum = 0.0
        correct = 0
        seen = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            n_batch = labels.size(0)
            # The criterion returns the batch mean; scale it back to a sum so
            # the epoch average is weighted by sample count.
            loss_sum += loss.item() * n_batch
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += n_batch

        if seen == 0:
            raise ValueError("train_loader yielded no samples")

        train_loss = loss_sum / seen
        train_acc = correct / seen
        print(
            f"Epoch {epoch + 1}/{num_epoches}, "
            f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}"
        )

        # Evaluate the model on the test set
        test_acc = _evaluate_accuracy(model, test_loader)
        print(f"Test Accuracy: {test_acc:.4f}")

In [17]:
# Baseline: build the from-scratch CNN with three output classes and train it
# for 10 epochs at learning rate 1e-3 on the loaders defined above.
simple_model = SimpleCNN(num_classes=3)
train_model(simple_model, train_loader, test_loader, num_epoches=10, lr_rate=0.001)

Epoch 1/10, Train Loss: 1.3791, Train Acc: 0.4091
Test Accuracy: 0.3333
Epoch 2/10, Train Loss: 0.8063, Train Acc: 0.5833
Test Accuracy: 0.7879
Epoch 3/10, Train Loss: 0.6361, Train Acc: 0.7576
Test Accuracy: 0.7879
Epoch 4/10, Train Loss: 0.4763, Train Acc: 0.8485
Test Accuracy: 0.8182
Epoch 5/10, Train Loss: 0.4164, Train Acc: 0.8788
Test Accuracy: 0.9091
Epoch 6/10, Train Loss: 0.3103, Train Acc: 0.9015
Test Accuracy: 0.9394
Epoch 7/10, Train Loss: 0.1848, Train Acc: 0.9470
Test Accuracy: 0.9697
Epoch 8/10, Train Loss: 0.0968, Train Acc: 0.9621
Test Accuracy: 0.8485
Epoch 9/10, Train Loss: 0.0710, Train Acc: 0.9848
Test Accuracy: 0.9091
Epoch 10/10, Train Loss: 0.0224, Train Acc: 1.0000
Test Accuracy: 0.8485


In [20]:
# Transfer learning: build the ResNet18-based model with three output classes
# and train it with the same settings as the simple CNN (10 epochs, lr 1e-3)
# so the two runs are directly comparable.
model = ResNetTransfer(num_classes=3)
train_model(model, train_loader, test_loader, num_epoches=10, lr_rate=0.001)

Epoch 1/10, Train Loss: 1.1016, Train Acc: 0.4394
Test Accuracy: 0.4848
Epoch 2/10, Train Loss: 0.8572, Train Acc: 0.6136
Test Accuracy: 0.8485
Epoch 3/10, Train Loss: 0.7562, Train Acc: 0.6667
Test Accuracy: 0.8485
Epoch 4/10, Train Loss: 0.7123, Train Acc: 0.7348
Test Accuracy: 0.9091
Epoch 5/10, Train Loss: 0.5719, Train Acc: 0.8106
Test Accuracy: 0.8788
Epoch 6/10, Train Loss: 0.6822, Train Acc: 0.7197
Test Accuracy: 0.8788
Epoch 7/10, Train Loss: 0.4942, Train Acc: 0.8561
Test Accuracy: 0.9697
Epoch 8/10, Train Loss: 0.5559, Train Acc: 0.7879
Test Accuracy: 0.9394
Epoch 9/10, Train Loss: 0.5107, Train Acc: 0.8182
Test Accuracy: 0.9697
Epoch 10/10, Train Loss: 0.5832, Train Acc: 0.7500
Test Accuracy: 0.9394


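Both the simple CNN and the ResNet transfer model reach the same peak test accuracy of 0.9697. The simple CNN shows clear signs of overfitting: its training accuracy climbs to 1.0000 while its test accuracy falls back to 0.8485 by epoch 10. The ResNet model's training accuracy stays below its test accuracy (0.7500 vs. 0.9394 at epoch 10), so it does not show the same overfitting pattern within these 10 epochs. Note that the test set contains only 33 samples, so these accuracy figures are noisy.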