1. Develop an image classification model based on the transformer architecture, without relying on pre-implemented transformer or self-attention modules such as `torch.nn.Transformer` or `torch.nn.MultiheadAttention`.


In [1]:
from dataclasses import asdict
from typing import Any

import numpy as np
import torch
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Subset
from torchvision import datasets
from torchvision.models import resnet152  # For comparison

from modules.config import ViTConfig, TrainingConfig, DataConfig
from modules.pipeline import train_and_evaluate_model
from modules.ViT import VisionTransformer

In [2]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Data settings come from the project's base preset. Later cells read
# img_size, batch_size, num_workers, pin_memory, debug and num_classes from it.
data_config = DataConfig.base()

# DEBUG: uncomment for a quick smoke run. With `debug` set, the loader cell
# truncates the train/val/test splits to their first 256 samples each.
# data_config.debug = True
# data_config.batch_size = 2

In [4]:
# Per-channel statistics shared by the training and evaluation pipelines.
# With mean = std = 0.5, ToTensor's [0, 1] output is mapped onto [-1, 1].
NORM_MEAN = [0.5, 0.5, 0.5]
NORM_STD = [0.5, 0.5, 0.5]

# Strength of the photometric augmentation applied to training images only.
# ColorJitter() called with no arguments keeps every factor at 0 and therefore
# leaves the image untouched, so the strengths have to be spelled out.
JITTER_BRIGHTNESS = 0.2
JITTER_CONTRAST = 0.2
JITTER_SATURATION = 0.2

# Training pipeline: resize, random augmentation, then tensor conversion and
# normalisation.
train_transform = transforms.Compose(
    [
        transforms.Resize(data_config.img_size),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(
            brightness=JITTER_BRIGHTNESS,
            contrast=JITTER_CONTRAST,
            saturation=JITTER_SATURATION,
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

# Evaluation pipeline: same resize and normalisation, but no random
# augmentation, so validation/test inputs are deterministic.
val_transform = transforms.Compose(
    [
        transforms.Resize(data_config.img_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

In [5]:
# The CIFAR-10 training archive is wrapped twice so that the training subset
# gets the augmenting transform while the validation subset (drawn from the
# same archive) gets the deterministic one.
trainset = datasets.CIFAR10(root="./data", train=True, download=True, transform=train_transform)
valset = datasets.CIFAR10(root="./data", train=True, download=True, transform=val_transform)

# Calculate split sizes (80 % train / 20 % validation)
train_size = int(0.8 * len(trainset))
val_size = len(trainset) - train_size

# Generate indices for splitting. A dedicated, seeded generator makes the
# train/validation partition identical on every "Restart & Run All" instead of
# depending on the state of NumPy's global RNG.
SPLIT_SEED = 42
indices = np.random.default_rng(SPLIT_SEED).permutation(len(trainset)).tolist()
train_indices = indices[:train_size]
val_indices = indices[train_size:]

# Sanity checks: the two index sets must have the planned sizes and be disjoint.
assert len(train_indices) == train_size and len(val_indices) == val_size
assert set(train_indices).isdisjoint(val_indices)

# Create subset datasets
train_data = Subset(trainset, train_indices)
val_data = Subset(valset, val_indices)
test_data = datasets.CIFAR10(root="./data", train=False, download=True, transform=val_transform)
classes = ("plane", "car", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")

# Debug mode: keep only the first 256 samples of every split for a fast run.
if data_config.debug:
    train_data = Subset(train_data, list(range(256)))
    val_data = Subset(val_data, list(range(256)))
    test_data = Subset(test_data, list(range(256)))

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(
    batch_size=data_config.batch_size,
    num_workers=data_config.num_workers,
    pin_memory=data_config.pin_memory,
)
train_loader = DataLoader(train_data, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_data, **loader_kwargs)
test_loader = DataLoader(test_data, **loader_kwargs)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [14]:
def compare_models(
    vit_model,
    cnn_model,
    num_classes,
    train_loader,
    val_loader,
    test_loader,
    vit_train_config: dict[str, Any],
    cnn_train_config: dict[str, Any],
    **kwargs
):
    """Train and test a ViT and a CNN on the same loaders and report them side by side.

    Each model is handed in turn (ViT first) to ``train_and_evaluate_model``
    together with ``num_classes``, the three shared loaders and its own
    keyword configuration. Selected entries of the two returned metric
    mappings are printed as a grid table and written to
    ``model_comparison.csv``; the test predictions are written to
    ``model_predictions.csv``. Both files are created relative to the current
    working directory.

    Args:
        vit_model: Model passed to ``train_and_evaluate_model`` for the "ViT" column.
        cnn_model: Model passed to ``train_and_evaluate_model`` for the "CNN" column.
        num_classes: Forwarded unchanged to ``train_and_evaluate_model``.
        train_loader: Training loader shared by both runs.
        val_loader: Validation loader shared by both runs.
        test_loader: Test loader shared by both runs.
        vit_train_config: Keyword arguments unpacked into the ViT run.
        cnn_train_config: Keyword arguments unpacked into the CNN run.
        **kwargs: Accepted for call-site compatibility; currently ignored.

    Returns:
        None. Results are reported via stdout and the two CSV files.
    """
    # Resolve the reporting dependencies up front so that a missing package
    # fails immediately instead of after both models have been trained.
    import pandas as pd
    from tabulate import tabulate

    print("Evaluating ViT Model...")
    vit_metrics = train_and_evaluate_model(
        vit_model, num_classes, train_loader, val_loader, test_loader, **vit_train_config
    )

    print("Evaluating CNN Model...")
    cnn_metrics = train_and_evaluate_model(
        cnn_model, num_classes, train_loader, val_loader, test_loader, **cnn_train_config
    )

    # (display label, key in the metrics mapping) kept as pairs so the table
    # rows and the looked-up keys cannot drift out of step.
    report_fields = [
        ("Test Accuracy", "test_accuracy"),
        ("Training Time (s)", "training_time"),
        ("Model Size", "model_size"),
        ("Avg Inference Time (s)", "avg_inference_time"),
        ("F1 Score", "test_f1"),
        ("AUROC", "test_auroc"),
    ]

    # One column of labels plus one column per model; a metric that a run did
    # not report shows up as "N/A" rather than raising.
    comparison = {"Metric": [label for label, _ in report_fields]}
    for model_name, metrics_dict in (("ViT", vit_metrics), ("CNN", cnn_metrics)):
        comparison[model_name] = [metrics_dict.get(key, "N/A") for _, key in report_fields]

    # Create DataFrame and save to CSV
    print(comparison)
    df = pd.DataFrame(comparison)
    df.to_csv("model_comparison.csv", index=False)

    # Print comparison table
    print("\nModel Comparison:")
    print(tabulate(df, headers="keys", tablefmt="grid"))

    # Save model predictions. The targets are taken from the ViT run only;
    # this assumes both runs iterate the test loader in the same order
    # (TODO confirm against train_and_evaluate_model).
    predictions_df = pd.DataFrame(
        {
            "ViT Predictions": vit_metrics.get("test_predictions", []),
            "CNN Predictions": cnn_metrics.get("test_predictions", []),
            "Targets": vit_metrics.get("test_targets", []),
        }
    )
    predictions_df.to_csv("model_predictions.csv", index=False)

In [15]:
# Build both models: the ViT from its base config preset, the CNN baseline
# from torchvision's resnet152 constructor called with default arguments.
vit_config = ViTConfig.base()
vit_model = VisionTransformer(**asdict(vit_config))
cnn_model = resnet152()

# Per-model training presets.
vit_train_config = TrainingConfig.vit_base()
cnn_train_config = TrainingConfig.resnet152()

# DEBUG
# vit_train_config.epochs = 1
# cnn_train_config.epochs = 1

# Both models see the same loaders; each training preset is flattened to a
# plain dict because compare_models unpacks it as keyword arguments.
compare_models(
    vit_model=vit_model,
    cnn_model=cnn_model,
    num_classes=data_config.num_classes,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    vit_train_config=asdict(vit_train_config),
    cnn_train_config=asdict(cnn_train_config),
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | model      | VisionTransformer  | 85.8 M | train
1 | train_acc  | MulticlassAccuracy | 0      | train
2 | val_acc    | MulticlassAccuracy | 0      | train
3 | test_acc   | MulticlassAccuracy | 0      | train
4 | test_f1    | MulticlassF1Score  | 0      | train
5 | test_auroc | MulticlassAUROC    | 0      | train
----------------------------------------------------------
85.8 M    Trainable params
0         Non-trainable params
85.8 M    Total params
343.225   Total estimated model params size (MB)
156       Modules in train mode
0         Modules in eval mode


Evaluating ViT Model...
Epoch 0: 100%|██████████| 128/128 [00:06<00:00, 19.27it/s, v_num=19, train_loss=5.300, train_acc=0.000, val_loss=3.380, val_acc=0.160]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 128/128 [00:09<00:00, 13.93it/s, v_num=19, train_loss=5.300, train_acc=0.000, val_loss=3.380, val_acc=0.160]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 128/128 [00:01<00:00, 100.58it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   avg_inference_time      0.006689317524433136
       model_size               85806344.0
        test_acc                0.08984375
       test_auroc            0.628031849861145
         test_f1            0.03496216982603073
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Evaluating CNN Model...



  | Name       | Type               | Params | Mode 
----------------------------------------------------------
0 | model      | ResNet             | 58.2 M | train
1 | train_acc  | MulticlassAccuracy | 0      | train
2 | val_acc    | MulticlassAccuracy | 0      | train
3 | test_acc   | MulticlassAccuracy | 0      | train
4 | test_f1    | MulticlassF1Score  | 0      | train
5 | test_auroc | MulticlassAUROC    | 0      | train
----------------------------------------------------------
58.2 M    Trainable params
0         Non-trainable params
58.2 M    Total params
232.657   Total estimated model params size (MB)
428       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 128/128 [00:13<00:00,  9.54it/s, v_num=10, train_loss=2.610, train_acc=0.000, val_loss=11.60, val_acc=0.117]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 128/128 [00:15<00:00,  8.39it/s, v_num=10, train_loss=2.610, train_acc=0.000, val_loss=11.60, val_acc=0.117]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 128/128 [00:02<00:00, 44.91it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   avg_inference_time      0.018930813297629356
       model_size               58164296.0
        test_acc                0.12109375
       test_auroc           0.5550526976585388
         test_f1            0.0470992848277092
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
{'Metric': ['Test Accuracy', 'Training Time (s)', 'Model Size', 'Avg Inference Time (s)', 'F1 Score', 'AUROC'], 'ViT': [0.08984375, 9.192267768085003, 85806346, 0.006689317524433136, 0.03496216982603073, 0.628031849861145], 'CNN': [0.12109375, 15.261773918289691, 58164298, 0.01893081329762935