1. Develop an image classification model based on the Transformer architecture, without relying on pre-implemented transformer or self-attention modules such as torch.nn.Transformer or torch.nn.MultiheadAttention.


In [1]:
import json
import pathlib
from dataclasses import asdict
from datetime import datetime
from typing import Any

import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from tabulate import tabulate
from torch.utils.data import DataLoader, Subset
from torchvision import datasets
from torchvision.models import resnet152, resnet50  # For comparison

from modules.config import ViTConfig, TrainingConfig, DataConfig
from modules.pipeline import train_and_evaluate_model
from modules.ViT import VisionTransformer

In [2]:
# Use the CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Load the baseline data settings. Later cells read `debug`, `batch_size`,
# `num_workers`, `pin_memory` and `num_classes` from this object.
data_config = DataConfig.base()

# DEBUG: uncomment for a quick smoke test. With `debug` enabled, the data cell
# truncates each split to its first 256 samples.
# data_config.debug = True
# data_config.batch_size = 2

In [4]:
# Per-channel normalisation constants, applied identically by both pipelines.
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.2023, 0.1994, 0.2010)

# Training pipeline: random augmentation first, then tensor conversion and
# normalisation.
train_transform = transforms.Compose(
    [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandAugment(num_ops=2),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

# Evaluation pipeline: no augmentation, only tensor conversion and normalisation.
val_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

In [5]:
# Seed for the train/validation partition. Fixing it keeps the split identical
# across kernel restarts, so validation metrics stay comparable between runs.
SPLIT_SEED = 42

# Two views over the same CIFAR-10 training images: one with the augmenting
# transform (for training) and one with the plain transform (for validation).
trainset = datasets.CIFAR10(root="./data", train=True, download=True, transform=train_transform)
valset = datasets.CIFAR10(root="./data", train=True, download=True, transform=val_transform)

# Calculate split sizes (80% train / 20% validation)
train_size = int(0.8 * len(trainset))
val_size = len(trainset) - train_size

# Generate indices for splitting from a dedicated, seeded generator so the
# result does not depend on the state of NumPy's global RNG.
split_rng = np.random.default_rng(SPLIT_SEED)
indices = split_rng.permutation(len(trainset)).tolist()
train_indices = indices[:train_size]
val_indices = indices[train_size:]
assert len(train_indices) == train_size and len(val_indices) == val_size
assert set(train_indices).isdisjoint(val_indices), "train/val indices overlap"

# Create subset datasets; both index into the same underlying images, so the
# disjoint index lists above are what keeps validation data out of training.
train_data = Subset(trainset, train_indices)
val_data = Subset(valset, val_indices)
test_data = datasets.CIFAR10(root="./data", train=False, download=True, transform=val_transform)
classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")

# Debug mode: keep only the first 256 samples of every split for fast iteration.
if data_config.debug:
    train_data = Subset(train_data, list(range(256)))
    val_data = Subset(val_data, list(range(256)))
    test_data = Subset(test_data, list(range(256)))

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(
    train_data,
    batch_size=data_config.batch_size,
    shuffle=True,
    num_workers=data_config.num_workers,
    pin_memory=data_config.pin_memory,
)
val_loader = DataLoader(
    val_data,
    batch_size=data_config.batch_size,
    num_workers=data_config.num_workers,
    pin_memory=data_config.pin_memory,
)
test_loader = DataLoader(
    test_data,
    batch_size=data_config.batch_size,
    num_workers=data_config.num_workers,
    pin_memory=data_config.pin_memory,
)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [6]:
def compare_models(
    vit_model,
    resnet152_model,
    resnet50_model,
    num_classes,
    train_loader,
    val_loader,
    test_loader,
    vit_train_config: dict[str, Any],
    resnet152_train_config: dict[str, Any],
    resnet50_train_config: dict[str, Any],
    **kwargs,
):
    """Train and evaluate the three models, then save a side-by-side comparison.

    Each model is passed in turn to ``train_and_evaluate_model`` together with
    the shared loaders and its own training config (expanded as keyword
    arguments). The returned metric dictionaries are then written under
    ``results/<timestamp>/``:

    * ``model_comparison.csv`` - one row per metric, one column per model;
      a metric a model did not report is shown as ``"N/A"``.
    * ``model_predictions.csv`` - per-model test predictions plus the targets
      reported by the ViT run.
    * ``<model>_train_config.json`` - the training config used for each model.

    The comparison table is also printed.

    Args:
        vit_model: Model evaluated under the name "ViT".
        resnet152_model: Model evaluated under the name "ResNet152".
        resnet50_model: Model evaluated under the name "ResNet50".
        num_classes: Forwarded unchanged to ``train_and_evaluate_model``.
        train_loader: Forwarded unchanged to ``train_and_evaluate_model``.
        val_loader: Forwarded unchanged to ``train_and_evaluate_model``.
        test_loader: Forwarded unchanged to ``train_and_evaluate_model``.
        vit_train_config: Keyword arguments for the ViT training run.
        resnet152_train_config: Keyword arguments for the ResNet152 run.
        resnet50_train_config: Keyword arguments for the ResNet50 run.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        None. Results are printed and written to disk.
    """
    # (display name, model, training config, file stem for the saved config)
    runs = [
        ("ViT", vit_model, vit_train_config, "vit"),
        ("ResNet152", resnet152_model, resnet152_train_config, "resnet152"),
        ("ResNet50", resnet50_model, resnet50_train_config, "resnet50"),
    ]

    # Train and evaluate in a fixed order; dict insertion order then drives the
    # column order of both output tables.
    results = {}
    for model_name, model, train_config, _ in runs:
        print(f"Evaluating {model_name} Model...")
        results[model_name] = train_and_evaluate_model(
            model, num_classes, train_loader, val_loader, test_loader, **train_config
        )

    # Key in the metrics dict -> label shown in the comparison table.
    metric_labels = {
        "test_accuracy": "Test Accuracy",
        "training_time": "Training Time (s)",
        "model_size": "Model Size",
        "avg_inference_time": "Avg Inference Time (s)",
        "test_f1": "F1 Score",
        "test_auroc": "AUROC",
    }

    comparison = {"Metric": list(metric_labels.values())}
    for model_name, metrics_dict in results.items():
        comparison[model_name] = [metrics_dict.get(key, "N/A") for key in metric_labels]

    # One timestamped directory per call so earlier results are never overwritten.
    now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    out_dir = pathlib.Path("results") / now
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.DataFrame(comparison)
    df.to_csv(out_dir / "model_comparison.csv", index=False)

    print("\nModel Comparison:")
    print(tabulate(df, headers="keys", tablefmt="grid"))

    # Save model predictions. Each column is wrapped in a Series so that columns
    # of different lengths (e.g. a model that reported no predictions and fell
    # back to []) are padded with NaN instead of making the DataFrame
    # constructor raise after all the training work is done.
    prediction_columns = {
        f"{model_name} Predictions": metrics_dict.get("test_predictions", [])
        for model_name, metrics_dict in results.items()
    }
    prediction_columns["Targets"] = results["ViT"].get("test_targets", [])
    predictions_df = pd.DataFrame(
        {column: pd.Series(values) for column, values in prediction_columns.items()}
    )
    predictions_df.to_csv(out_dir / "model_predictions.csv", index=False)

    # Save the training config of every run. `default=str` stringifies any value
    # json cannot encode natively rather than aborting the dump.
    for _, _, train_config, file_stem in runs:
        with open(out_dir / f"{file_stem}_train_config.json", "w") as f:
            json.dump(train_config, f, default=str)

In [7]:
# Build the three models, in the order ViT -> ResNet152 -> ResNet50.
# NOTE(review): both ResNets are created with torchvision's default arguments;
# `num_classes` is only handed to `compare_models` below - confirm the pipeline
# adapts the classifier head to the dataset.
vit_config = ViTConfig.base()
vit_model = VisionTransformer(**asdict(vit_config))
resnet152_model = resnet152()
resnet50_model = resnet50()

# One training recipe per architecture.
# NOTE(review): the previous "TODO: change to resnet50" remarks on the two
# ResNet lines looked stale (a resnet50 recipe is already used) - confirm.
vit_train_config = TrainingConfig.vit_base()
resnet152_train_config = TrainingConfig.resnet152()
resnet50_train_config = TrainingConfig.resnet50()

# DEBUG: uncomment to shorten every run to a single epoch.
# vit_train_config.epochs = 1
# resnet152_train_config.epochs = 1
# resnet50_train_config.epochs = 1

# Configs are converted to plain dicts because `compare_models` expands them as
# keyword arguments and dumps them to JSON.
compare_models(
    vit_model=vit_model,
    resnet152_model=resnet152_model,
    resnet50_model=resnet50_model,
    num_classes=data_config.num_classes,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    vit_train_config=asdict(vit_train_config),
    resnet152_train_config=asdict(resnet152_train_config),
    resnet50_train_config=asdict(resnet50_train_config),
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Evaluating ViT Model...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/work/workdir/vit-assignment/.conda/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (10) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.

  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | model      | VisionTransformer  | 2.7 M  | train
1 | criterion  | CrossEntropyLoss   | 0      | train
2 | train_acc  | MulticlassAccuracy | 0      | train
3 | val_acc    | MulticlassAccuracy | 0      | train
4 | test_acc   | MulticlassAccuracy | 0      | train
5 | test_f1    | MulticlassF1Score  | 0      | train
6 | test_auroc | MulticlassAUROC    | 0      | train
----------------------------------------------------------
2.7 M     Trainable params
0         Non-trai

Epoch 19: 100%|██████████| 10/10 [00:08<00:00,  1.25it/s, v_num=15, train_loss=1.710, train_acc=0.436, val_loss=1.720, val_acc=0.438]



Epoch 30:   0%|          | 0/10 [00:00<?, ?it/s, v_num=15, train_loss=1.540, train_acc=0.520, val_loss=1.470, val_acc=0.553]         


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined