In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader, ConcatDataset
import torchvision
#from vit_pytorch.t2t import T2TViT
from timm import create_model

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def create_datasets_and_loaders():
    # Base transform
    base_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Augmentations
    augmentations = transforms.Compose([
        transforms.RandomResizedCrop(size=224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        transforms.RandomErasing(p=0.2)
    ])

    # Load the original dataset
    original_dataset = torchvision.datasets.ImageFolder(root=r'C:\Users\CoolA\Code\Medicinal Leaves\Dataset 1\train', transform=base_transform)

    # Create an augmented dataset
    augmented_dataset = torchvision.datasets.ImageFolder(root=r'C:\Users\CoolA\Code\Medicinal Leaves\Dataset 1\train', transform=augmentations)

    # Combine the original and augmented datasets
    trainset = ConcatDataset([original_dataset, augmented_dataset])

    # Create data loaders
    trainloader = DataLoader(trainset, batch_size=32, shuffle=True, num_workers=2)

    valset = torchvision.datasets.ImageFolder(root=r'C:\Users\CoolA\Code\Medicinal Leaves\Dataset 1\val', transform=base_transform)
    valloader = DataLoader(valset, batch_size=32, shuffle=False, num_workers=2)

    return trainloader, valloader, original_dataset.classes

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
trainloader, valloader, classes = create_datasets_and_loaders()
num_classes = len(classes)
results_log = []
num_classes

30

In [5]:
def train_model(model, model_name, trainloader, valloader, criterion, optimizer, num_epochs=20):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    results = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            # print(f"Inputs shape: {inputs.shape}")
            # print(f"Labels shape: {labels.shape}")
            
            optimizer.zero_grad()
            outputs = model(inputs)
            # print(f"Outputs shape: {outputs.shape}")
            
            assert outputs.shape == (inputs.shape[0], num_classes), f"Expected output shape {(inputs.shape[0], num_classes)}, but got {outputs.shape}"
            
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if isinstance(outputs, tuple):
                _, predicted = outputs[0].max(1)
            else:
                _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            
        train_loss = running_loss / len(trainloader)
        train_acc = 100. * correct / total

        # Validation
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                if isinstance(outputs, tuple):
                    _, predicted = outputs[0].max(1)
                else:
                    _, predicted = outputs.max(1)
                val_total += labels.size(0)
                val_correct += predicted.eq(labels).sum().item()
        val_loss = val_loss / len(valloader)
        val_acc = 100. * val_correct / val_total

        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')
        print()
        
        results.append([epoch+1, train_loss, train_acc, val_loss, val_acc])
        
    return [model_name, results]

# Large ViTs

## Simple Vision Transformer (ViT)

In [6]:
# ViT (Vision Transformer)
model = create_model('vit_base_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, num_classes)
print(f"ViT parameters: {count_parameters(model)}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

results_log.append(train_model(model, "ViT", trainloader, valloader, criterion, optimizer))

ViT parameters: 85821726
Epoch 1/20:
Train Loss: 3.4262, Train Acc: 8.45%
Val Loss: 2.7533, Val Acc: 18.95%

Epoch 2/20:
Train Loss: 2.7908, Train Acc: 18.59%
Val Loss: 2.1373, Val Acc: 30.26%

Epoch 3/20:
Train Loss: 2.3352, Train Acc: 30.72%
Val Loss: 1.8782, Val Acc: 41.84%

Epoch 4/20:
Train Loss: 2.1502, Train Acc: 35.84%
Val Loss: 1.7751, Val Acc: 44.74%

Epoch 5/20:
Train Loss: 1.9907, Train Acc: 39.04%
Val Loss: 1.5256, Val Acc: 52.11%

Epoch 6/20:
Train Loss: 1.8434, Train Acc: 43.95%
Val Loss: 1.4803, Val Acc: 53.16%

Epoch 7/20:
Train Loss: 1.7927, Train Acc: 46.22%
Val Loss: 1.4137, Val Acc: 54.47%

Epoch 8/20:
Train Loss: 1.6264, Train Acc: 51.13%
Val Loss: 1.5081, Val Acc: 53.95%

Epoch 9/20:
Train Loss: 1.6489, Train Acc: 50.62%
Val Loss: 1.2181, Val Acc: 63.42%

Epoch 10/20:
Train Loss: 1.6048, Train Acc: 52.16%
Val Loss: 1.1054, Val Acc: 66.58%

Epoch 11/20:
Train Loss: 1.6427, Train Acc: 50.58%
Val Loss: 1.3287, Val Acc: 56.84%

Epoch 12/20:
Train Loss: 1.5728, Train 

## 2. DeiT (Data-efficient image Transformers)

In [7]:
class DistillationLoss(torch.nn.Module):
    def __init__(self, base_criterion: torch.nn.Module, alpha: float = 0.5):
        super().__init__()
        self.base_criterion = base_criterion
        self.alpha = alpha

    def forward(self, outputs, labels):
        if isinstance(outputs, tuple):
            outputs, outputs_kd = outputs
            loss = self.base_criterion(outputs, labels)
            loss_kd = self.base_criterion(outputs_kd, labels)
            loss = self.alpha * loss + (1 - self.alpha) * loss_kd
        else:
            loss = self.base_criterion(outputs, labels)
        return loss

In [8]:
# 2. DeiT (Data-efficient image Transformers)
model = create_model('deit_base_distilled_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, num_classes)
num_ftrs = model.head_dist.in_features
model.head_dist = torch.nn.Linear(num_ftrs, num_classes)

In [9]:
print(f"DeiT parameters: {count_parameters(model)}")

base_criterion = nn.CrossEntropyLoss()
criterion = DistillationLoss(base_criterion, alpha=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001)

results_log.append(train_model(model, "DeiT", trainloader, valloader, criterion, optimizer))

DeiT parameters: 85846332
Epoch 1/20:
Train Loss: 2.3864, Train Acc: 29.18%
Val Loss: 1.5010, Val Acc: 51.84%

Epoch 2/20:
Train Loss: 1.4306, Train Acc: 56.98%
Val Loss: 0.9763, Val Acc: 68.42%

Epoch 3/20:
Train Loss: 1.0156, Train Acc: 68.35%
Val Loss: 0.6840, Val Acc: 76.84%

Epoch 4/20:
Train Loss: 0.7993, Train Acc: 75.88%
Val Loss: 0.5709, Val Acc: 82.89%

Epoch 5/20:
Train Loss: 0.6460, Train Acc: 80.38%
Val Loss: 0.4616, Val Acc: 86.58%

Epoch 6/20:
Train Loss: 0.6135, Train Acc: 81.27%
Val Loss: 0.4543, Val Acc: 85.53%

Epoch 7/20:
Train Loss: 0.5646, Train Acc: 82.82%
Val Loss: 0.3697, Val Acc: 89.21%

Epoch 8/20:
Train Loss: 0.4886, Train Acc: 84.67%
Val Loss: 0.2753, Val Acc: 91.58%

Epoch 9/20:
Train Loss: 0.5024, Train Acc: 84.33%
Val Loss: 0.3525, Val Acc: 90.26%

Epoch 10/20:
Train Loss: 0.4741, Train Acc: 84.74%
Val Loss: 0.4697, Val Acc: 85.00%

Epoch 11/20:
Train Loss: 0.4527, Train Acc: 85.60%
Val Loss: 0.3365, Val Acc: 90.79%

Epoch 12/20:
Train Loss: 0.4359, Trai

### Swin Transformer

In [10]:
class SwinTransformerForClassification(nn.Module):
    def __init__(self, base_model, num_classes):
        super().__init__()
        self.base_model = base_model
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(base_model.num_features, num_classes)

    def forward(self, x):
        x = self.base_model.forward_features(x)
        #print(f"After base_model: {x.shape}")
        
        # Global average pooling
        x = x.mean(dim=[1, 2])
        #print(f"After global average pooling: {x.shape}")
        
        # Final classification layer
        x = self.fc(x)
        #print(f"After fc: {x.shape}")
        
        return x

# Create the base model
base_model = create_model('swin_base_patch4_window7_224', pretrained=True)

# Print the number of features
print(f"Number of features: {base_model.num_features}")

# Create our custom model
model = SwinTransformerForClassification(base_model, num_classes)

print(f"Swin Transformer parameters: {count_parameters(model)}")

Number of features: 1024
Swin Transformer parameters: 87798974


In [11]:
model

SwinTransformerForClassification(
  (base_model): SwinTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (layers): Sequential(
      (0): SwinTransformerStage(
        (downsample): Identity()
        (blocks): Sequential(
          (0): SwinTransformerBlock(
            (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (attn): WindowAttention(
              (qkv): Linear(in_features=128, out_features=384, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
              (proj): Linear(in_features=128, out_features=128, bias=True)
              (proj_drop): Dropout(p=0.0, inplace=False)
              (softmax): Softmax(dim=-1)
            )
            (drop_path1): Identity()
            (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (mlp): Mlp(
              (fc1): Linear(in_features

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

results_log.append(train_model(model, "SwinT", trainloader, valloader, criterion, optimizer))

Epoch 1/20:
Train Loss: 3.4307, Train Acc: 5.19%
Val Loss: 3.3874, Val Acc: 6.58%

Epoch 2/20:
Train Loss: 3.2405, Train Acc: 8.01%
Val Loss: 3.0361, Val Acc: 12.11%

Epoch 3/20:
Train Loss: 3.1039, Train Acc: 9.83%
Val Loss: 3.0052, Val Acc: 11.32%

Epoch 4/20:
Train Loss: 3.0788, Train Acc: 11.31%
Val Loss: 2.9316, Val Acc: 14.47%

Epoch 5/20:
Train Loss: 3.2557, Train Acc: 7.56%
Val Loss: 3.3973, Val Acc: 3.95%

Epoch 6/20:
Train Loss: 3.3926, Train Acc: 5.91%
Val Loss: 3.3783, Val Acc: 5.26%

Epoch 7/20:
Train Loss: 3.3905, Train Acc: 5.46%
Val Loss: 3.3782, Val Acc: 4.21%

Epoch 8/20:
Train Loss: 3.3790, Train Acc: 6.25%
Val Loss: 3.3768, Val Acc: 6.58%

Epoch 9/20:
Train Loss: 3.3796, Train Acc: 6.12%
Val Loss: 3.3809, Val Acc: 6.58%

Epoch 10/20:
Train Loss: 3.3834, Train Acc: 6.94%
Val Loss: 3.3716, Val Acc: 5.53%

Epoch 11/20:
Train Loss: 3.3752, Train Acc: 5.74%
Val Loss: 3.3693, Val Acc: 6.58%

Epoch 12/20:
Train Loss: 3.3751, Train Acc: 5.91%
Val Loss: 3.3658, Val Acc: 6.58

## 4. Mobile ViT

In [13]:
class MobileViTForClassification(nn.Module):
    def __init__(self, base_model, num_classes):
        super().__init__()
        self.base_model = base_model
        self.base_model.head = nn.Identity()  # Remove the original classification head
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(base_model.num_features, num_classes)

    def forward(self, x):
        x = self.base_model(x)
        x = self.global_pool(x).flatten(1)
        return self.classifier(x)

In [14]:
# Create the base model
base_model = create_model('mobilevit_s', pretrained=True)
# Create the custom model
model = MobileViTForClassification(base_model, num_classes)

print(f"MobileViT parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

results_log.append(train_model(model, "MobileViT", trainloader, valloader, criterion, optimizer))

MobileViT parameters: 4956862
Epoch 1/20:
Train Loss: 1.1408, Train Acc: 78.59%
Val Loss: 0.2079, Val Acc: 93.68%

Epoch 2/20:
Train Loss: 0.2824, Train Acc: 92.51%
Val Loss: 0.0504, Val Acc: 99.21%

Epoch 3/20:
Train Loss: 0.1980, Train Acc: 94.36%
Val Loss: 0.0230, Val Acc: 99.74%

Epoch 4/20:
Train Loss: 0.1226, Train Acc: 96.60%
Val Loss: 0.0267, Val Acc: 99.21%

Epoch 5/20:
Train Loss: 0.1249, Train Acc: 96.19%
Val Loss: 0.0492, Val Acc: 98.42%

Epoch 6/20:
Train Loss: 0.1407, Train Acc: 96.19%
Val Loss: 0.0106, Val Acc: 100.00%

Epoch 7/20:
Train Loss: 0.1147, Train Acc: 96.49%
Val Loss: 0.2454, Val Acc: 92.63%

Epoch 8/20:
Train Loss: 0.1029, Train Acc: 97.18%
Val Loss: 0.1065, Val Acc: 95.53%

Epoch 9/20:
Train Loss: 0.0878, Train Acc: 97.42%
Val Loss: 0.0054, Val Acc: 100.00%

Epoch 10/20:
Train Loss: 0.0830, Train Acc: 97.84%
Val Loss: 0.0232, Val Acc: 99.74%

Epoch 11/20:
Train Loss: 0.0706, Train Acc: 98.25%
Val Loss: 0.0099, Val Acc: 99.74%

Epoch 12/20:
Train Loss: 0.0741

## Compact Vision Transformers (CVT)

In [8]:
# 5. Compact Vision Transformers (CVT)
model = create_model('cvt_13_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, num_classes)
print(f"CVT parameters: {count_parameters(model)}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

results_log.append(train_model(model, "CVT", trainloader, valloader, criterion, optimizer))

RuntimeError: Unknown model (cvt_13_224)

## 6. Pyramid Vision Transformer (PVT)

In [6]:
# 6. PVT (Pyramid Vision Transformer)
model = create_model('pvt_v2_b2', pretrained=True, num_classes=num_classes)
model.head = nn.Linear(model.head.in_features, num_classes)
print(f"PVT parameters: {count_parameters(model)}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

results_log.append(train_model(model, "PVT", trainloader, valloader, criterion, optimizer))

PVT parameters: 24865246
Epoch 1/20:
Train Loss: 3.4051, Train Acc: 5.43%
Val Loss: 3.2279, Val Acc: 6.32%

Epoch 2/20:
Train Loss: 3.1732, Train Acc: 9.48%
Val Loss: 2.8935, Val Acc: 13.68%

Epoch 3/20:
Train Loss: 3.2431, Train Acc: 9.07%
Val Loss: 3.3816, Val Acc: 6.58%

Epoch 4/20:
Train Loss: 3.3812, Train Acc: 6.19%
Val Loss: 3.3782, Val Acc: 6.58%

Epoch 5/20:
Train Loss: 3.3786, Train Acc: 5.84%
Val Loss: 3.3779, Val Acc: 6.58%

Epoch 6/20:
Train Loss: 3.3803, Train Acc: 5.95%
Val Loss: 3.3773, Val Acc: 6.58%

Epoch 7/20:
Train Loss: 3.3748, Train Acc: 5.91%
Val Loss: 3.3708, Val Acc: 6.58%

Epoch 8/20:
Train Loss: 3.3750, Train Acc: 6.67%
Val Loss: 3.3707, Val Acc: 6.58%

Epoch 9/20:
Train Loss: 3.3725, Train Acc: 6.63%
Val Loss: 3.3699, Val Acc: 6.58%

Epoch 10/20:
Train Loss: 3.3711, Train Acc: 6.43%
Val Loss: 3.3668, Val Acc: 5.53%

Epoch 11/20:
Train Loss: 3.3729, Train Acc: 6.12%
Val Loss: 3.3702, Val Acc: 5.53%

Epoch 12/20:
Train Loss: 3.3677, Train Acc: 6.60%
Val Loss:

## 7. Token-to-Token ViT (T2T-ViT)

In [12]:
# 7. T2T-ViT (Tokens-to-Token ViT)
model = create_model('t2t_vit_14', pretrained=False, num_classes=num_classes)
print(f"T2T-ViT parameters: {count_parameters(model)}")

RuntimeError: Unknown model (t2t_vit_14)

In [10]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

results_log.append(train_model(model, "T2TViT", trainloader, valloader, criterion, optimizer))

T2T-ViT parameters: 22968376
Epoch 1/10:
Train Loss: 3.2855, Train Acc: 8.80%
Val Loss: 3.4034, Val Acc: 11.32%

Epoch 2/10:
Train Loss: 3.2154, Train Acc: 9.83%
Val Loss: 2.9995, Val Acc: 8.95%

Epoch 3/10:
Train Loss: 3.2525, Train Acc: 8.69%
Val Loss: 3.1465, Val Acc: 12.11%

Epoch 4/10:
Train Loss: 3.1436, Train Acc: 10.45%
Val Loss: 3.4469, Val Acc: 9.21%

Epoch 5/10:
Train Loss: 3.1009, Train Acc: 12.78%
Val Loss: 2.9321, Val Acc: 18.42%

Epoch 6/10:
Train Loss: 3.1960, Train Acc: 11.27%
Val Loss: 3.0214, Val Acc: 13.16%

Epoch 7/10:
Train Loss: 3.1755, Train Acc: 11.55%
Val Loss: 3.0381, Val Acc: 15.53%

Epoch 8/10:
Train Loss: 3.1190, Train Acc: 12.44%
Val Loss: 2.9607, Val Acc: 15.00%

Epoch 9/10:
Train Loss: 3.2908, Train Acc: 8.25%
Val Loss: 3.4020, Val Acc: 6.32%

Epoch 10/10:
Train Loss: 3.3653, Train Acc: 7.18%
Val Loss: 3.3360, Val Acc: 6.84%



In [19]:
import pprint
pprint.pp(results_log)

[['MobileViT',
  [[1,
    1.1322349488407701,
    78.48797250859107,
    0.38796663253257674,
    88.15789473684211],
   [2,
    0.27744451312573404,
    92.92096219931271,
    0.3815717758843675,
    90.52631578947368],
   [3,
    0.1903810644379029,
    94.81099656357388,
    0.06363874473997082,
    98.6842105263158],
   [4,
    0.15827544677265726,
    95.63573883161511,
    0.1497575838099389,
    96.84210526315789],
   [5,
    0.13721451036386437,
    96.49484536082474,
    0.11183129770021576,
    95.78947368421052],
   [6, 0.10646251993315232, 97.18213058419244, 0.006984180969690594, 100.0],
   [7,
    0.08577700416240228,
    97.80068728522336,
    0.1039104358642362,
    96.57894736842105],
   [8,
    0.1368462966964836,
    96.08247422680412,
    0.15292733601139238,
    95.26315789473684],
   [9,
    0.10655207334311453,
    96.90721649484536,
    0.035926568671129644,
    98.6842105263158],
   [10,
    0.09183610201516486,
    97.38831615120274,
    0.03306130847583214,
  

# Smaller models

### DeiT Tiny (5M)

In [11]:
class DistillationLoss(torch.nn.Module):
    def __init__(self, base_criterion: torch.nn.Module, alpha: float = 0.5):
        super().__init__()
        self.base_criterion = base_criterion
        self.alpha = alpha

    def forward(self, outputs, labels):
        if isinstance(outputs, tuple):
            outputs, outputs_kd = outputs
            loss = self.base_criterion(outputs, labels)
            loss_kd = self.base_criterion(outputs_kd, labels)
            loss = self.alpha * loss + (1 - self.alpha) * loss_kd
        else:
            loss = self.base_criterion(outputs, labels)
        return loss

In [12]:
# Setup and check model
model = create_model('deit_tiny_distilled_patch16_224', pretrained=True)
model.head = nn.Linear(model.head.in_features, num_classes)
model.head_dist = nn.Linear(model.head_dist.in_features, num_classes)

print(f"DeiT-Tiny parameters: {count_parameters(model)}")
model

model.safetensors:   0%|          | 0.00/23.7M [00:00<?, ?B/s]

DeiT-Tiny parameters: 5536380


VisionTransformerDistilled(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Id

In [13]:
# Train model
base_criterion = nn.CrossEntropyLoss()
criterion = DistillationLoss(base_criterion, alpha=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001)
results_log.append(train_model(model, "DeiT-Tiny", trainloader, valloader, criterion, optimizer))

Epoch 1/10:
Train Loss: 2.8294, Train Acc: 19.59%
Val Loss: 2.1529, Val Acc: 34.47%

Epoch 2/10:
Train Loss: 2.0003, Train Acc: 40.72%
Val Loss: 1.3115, Val Acc: 57.63%

Epoch 3/10:
Train Loss: 1.6246, Train Acc: 49.62%
Val Loss: 1.0609, Val Acc: 70.79%

Epoch 4/10:
Train Loss: 1.1917, Train Acc: 64.02%
Val Loss: 0.6873, Val Acc: 75.26%

Epoch 5/10:
Train Loss: 1.0654, Train Acc: 67.53%
Val Loss: 0.6049, Val Acc: 80.53%

Epoch 6/10:
Train Loss: 0.8593, Train Acc: 73.54%
Val Loss: 0.6116, Val Acc: 81.58%

Epoch 7/10:
Train Loss: 0.8161, Train Acc: 74.74%
Val Loss: 0.5900, Val Acc: 82.11%

Epoch 8/10:
Train Loss: 0.7108, Train Acc: 78.97%
Val Loss: 0.4963, Val Acc: 83.68%

Epoch 9/10:
Train Loss: 0.6522, Train Acc: 79.45%
Val Loss: 0.4102, Val Acc: 87.63%

Epoch 10/10:
Train Loss: 0.5833, Train Acc: 82.44%
Val Loss: 0.5006, Val Acc: 83.42%



### MobileVIT XXS (0.96M)

In [25]:
# Setup and check model
model = create_model('mobilevit_xxs', pretrained=True)
in_features = model.head.fc.in_features
model.head.fc = nn.Linear(in_features, num_classes)
#model.head = nn.Linear(model.head.in_features, num_classes)

print(f"MobileViT-XXS parameters: {count_parameters(model)}")
model

MobileViT-XXS parameters: 960654


ByobNet(
  (stem): ConvNormAct(
    (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNormAct2d(
      16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): SiLU(inplace=True)
    )
  )
  (stages): Sequential(
    (0): Sequential(
      (0): BottleneckBlock(
        (shortcut): Identity()
        (conv1_1x1): ConvNormAct(
          (conv): Conv2d(16, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNormAct2d(
            32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): SiLU(inplace=True)
          )
        )
        (conv2_kxk): ConvNormAct(
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn): BatchNormAct2d(
            32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
          

In [26]:
# Train model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
results_log.append(train_model(model, "MobileViT-XXS", trainloader, valloader, criterion, optimizer))

Epoch 1/10:
Train Loss: 1.6459, Train Acc: 65.26%
Val Loss: 0.9001, Val Acc: 73.16%

Epoch 2/10:
Train Loss: 0.4847, Train Acc: 88.56%
Val Loss: 0.6422, Val Acc: 80.00%

Epoch 3/10:
Train Loss: 0.4080, Train Acc: 88.73%
Val Loss: 0.4413, Val Acc: 85.26%

Epoch 4/10:
Train Loss: 0.2647, Train Acc: 93.16%
Val Loss: 0.6951, Val Acc: 79.74%

Epoch 5/10:
Train Loss: 0.2469, Train Acc: 93.09%
Val Loss: 0.2771, Val Acc: 91.32%

Epoch 6/10:
Train Loss: 0.2028, Train Acc: 94.05%
Val Loss: 0.1000, Val Acc: 97.11%

Epoch 7/10:
Train Loss: 0.1921, Train Acc: 94.60%
Val Loss: 0.1740, Val Acc: 93.68%

Epoch 8/10:
Train Loss: 0.1621, Train Acc: 95.36%
Val Loss: 0.2821, Val Acc: 90.26%

Epoch 9/10:
Train Loss: 0.1432, Train Acc: 95.91%
Val Loss: 0.5019, Val Acc: 87.37%

Epoch 10/10:
Train Loss: 0.1244, Train Acc: 96.43%
Val Loss: 0.1181, Val Acc: 97.37%



### EfficientFormer-L1 (12M)

In [30]:
# Setup and check model
model = create_model('efficientformer_l1', pretrained=True)
model.head = nn.Linear(model.head.in_features, num_classes)
model.head_dist = nn.Linear(model.head_dist.in_features, num_classes)

print(f"EfficientFormer-L1 parameters: {count_parameters(model)}")
model

EfficientFormer-L1 parameters: 11418868


EfficientFormer(
  (stem): Stem4(
    (conv1): Conv2d(3, 24, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (norm1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU()
    (conv2): Conv2d(24, 48, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (norm2): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act2): ReLU()
  )
  (stages): Sequential(
    (0): EfficientFormerStage(
      (downsample): Identity()
      (blocks): Sequential(
        (0): MetaBlock2d(
          (token_mixer): Pooling(
            (pool): AvgPool2d(kernel_size=3, stride=1, padding=1)
          )
          (ls1): LayerScale2d()
          (drop_path1): Identity()
          (mlp): ConvMlpWithNorm(
            (fc1): Conv2d(48, 192, kernel_size=(1, 1), stride=(1, 1))
            (norm1): BatchNorm2d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act): GELU(approximate='none')
            

In [31]:
# Train model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
results_log.append(train_model(model, "EfficientFormer-L1", trainloader, valloader, criterion, optimizer))

Epoch 1/10:
Train Loss: 0.7096, Train Acc: 80.34%
Val Loss: 0.3074, Val Acc: 91.32%

Epoch 2/10:
Train Loss: 0.2955, Train Acc: 92.37%
Val Loss: 1.4206, Val Acc: 60.79%

Epoch 3/10:
Train Loss: 0.3120, Train Acc: 90.96%
Val Loss: 0.2710, Val Acc: 93.42%

Epoch 4/10:
Train Loss: 0.1717, Train Acc: 95.26%
Val Loss: 0.1001, Val Acc: 97.63%

Epoch 5/10:
Train Loss: 0.1894, Train Acc: 94.54%
Val Loss: 0.1351, Val Acc: 94.74%

Epoch 6/10:
Train Loss: 0.1503, Train Acc: 95.81%
Val Loss: 0.0277, Val Acc: 98.42%

Epoch 7/10:
Train Loss: 0.1474, Train Acc: 96.05%
Val Loss: 0.0231, Val Acc: 99.21%

Epoch 8/10:
Train Loss: 0.1321, Train Acc: 96.32%
Val Loss: 0.0928, Val Acc: 97.11%

Epoch 9/10:
Train Loss: 0.1389, Train Acc: 96.25%
Val Loss: 0.0834, Val Acc: 97.11%

Epoch 10/10:
Train Loss: 0.1320, Train Acc: 96.36%
Val Loss: 0.0304, Val Acc: 98.68%



### ConvNeXt-Atto (3.38M)

In [32]:
# Setup and check model
model = create_model('convnext_atto', pretrained=True)
in_features = model.head.fc.in_features
model.head.fc = nn.Linear(in_features, num_classes)

print(f"ConvNeXt-Atto parameters: {count_parameters(model)}")
model

model.safetensors:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

ConvNeXt-Atto parameters: 3384150


ConvNeXt(
  (stem): Sequential(
    (0): Conv2d(3, 40, kernel_size=(4, 4), stride=(4, 4))
    (1): LayerNorm2d((40,), eps=1e-06, elementwise_affine=True)
  )
  (stages): Sequential(
    (0): ConvNeXtStage(
      (downsample): Identity()
      (blocks): Sequential(
        (0): ConvNeXtBlock(
          (conv_dw): Conv2d(40, 40, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=40)
          (norm): LayerNorm2d((40,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Conv2d(40, 160, kernel_size=(1, 1), stride=(1, 1))
            (act): GELU()
            (drop1): Dropout(p=0.0, inplace=False)
            (norm): Identity()
            (fc2): Conv2d(160, 40, kernel_size=(1, 1), stride=(1, 1))
            (drop2): Dropout(p=0.0, inplace=False)
          )
          (shortcut): Identity()
          (drop_path): Identity()
        )
        (1): ConvNeXtBlock(
          (conv_dw): Conv2d(40, 40, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=40)


In [33]:
# Train model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
results_log.append(train_model(model, "ConvNeXt-Atto", trainloader, valloader, criterion, optimizer))

Epoch 1/10:
Train Loss: 3.0961, Train Acc: 11.37%
Val Loss: 2.4374, Val Acc: 26.32%

Epoch 2/10:
Train Loss: 2.3744, Train Acc: 27.08%
Val Loss: 1.7738, Val Acc: 44.47%

Epoch 3/10:
Train Loss: 1.9699, Train Acc: 40.17%
Val Loss: 1.2855, Val Acc: 60.53%

Epoch 4/10:
Train Loss: 1.6648, Train Acc: 50.07%
Val Loss: 0.9701, Val Acc: 70.79%

Epoch 5/10:
Train Loss: 1.4105, Train Acc: 58.45%
Val Loss: 0.8148, Val Acc: 76.32%

Epoch 6/10:
Train Loss: 1.1872, Train Acc: 63.71%
Val Loss: 0.8988, Val Acc: 76.05%

Epoch 7/10:
Train Loss: 1.0575, Train Acc: 67.73%
Val Loss: 0.7074, Val Acc: 75.53%

Epoch 8/10:
Train Loss: 0.9705, Train Acc: 69.69%
Val Loss: 0.5877, Val Acc: 79.74%

Epoch 9/10:
Train Loss: 0.8148, Train Acc: 76.01%
Val Loss: 0.4161, Val Acc: 87.37%

Epoch 10/10:
Train Loss: 0.7188, Train Acc: 78.01%
Val Loss: 0.3541, Val Acc: 88.95%



### PVT-Tiny

In [6]:
# Setup and check model
model = create_model("pvt_v2_b0", pretrained=True, num_classes=30)
print(f"PVT-Tiny-B0 parameters: {count_parameters(model)}")
model

model.safetensors:   0%|          | 0.00/14.7M [00:00<?, ?B/s]

PVT-Tiny-B0 parameters: 3417470


PyramidVisionTransformerV2(
  (patch_embed): OverlapPatchEmbed(
    (proj): Conv2d(3, 32, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
    (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  )
  (stages): Sequential(
    (0): PyramidVisionTransformerStage(
      (blocks): ModuleList(
        (0-1): 2 x Block(
          (norm1): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (q): Linear(in_features=32, out_features=32, bias=True)
            (kv): Linear(in_features=32, out_features=64, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=32, out_features=32, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (sr): Conv2d(32, 32, kernel_size=(8, 8), stride=(8, 8))
            (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          )
          (drop_path1): Identity()
          (norm2): LayerNorm((32,), eps=1e-06, elementwise_affine

In [7]:
# Train model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
results_log.append(train_model(model, "pvt_tiny_b0", trainloader, valloader, criterion, optimizer))

Epoch 1/10:
Train Loss: 2.4762, Train Acc: 25.81%
Val Loss: 1.1276, Val Acc: 62.11%

Epoch 2/10:
Train Loss: 1.2685, Train Acc: 59.04%
Val Loss: 0.6382, Val Acc: 79.21%

Epoch 3/10:
Train Loss: 0.9597, Train Acc: 68.35%
Val Loss: 0.5809, Val Acc: 81.32%

Epoch 4/10:
Train Loss: 0.7084, Train Acc: 77.32%
Val Loss: 0.3626, Val Acc: 90.26%

Epoch 5/10:
Train Loss: 0.5436, Train Acc: 82.03%
Val Loss: 0.4340, Val Acc: 86.05%

Epoch 6/10:
Train Loss: 0.4251, Train Acc: 86.80%
Val Loss: 0.4169, Val Acc: 85.53%

Epoch 7/10:
Train Loss: 0.4276, Train Acc: 86.25%
Val Loss: 0.1834, Val Acc: 94.74%

Epoch 8/10:
Train Loss: 0.3250, Train Acc: 89.76%
Val Loss: 0.4282, Val Acc: 85.53%

Epoch 9/10:
Train Loss: 0.3990, Train Acc: 87.29%
Val Loss: 0.2019, Val Acc: 92.89%

Epoch 10/10:
Train Loss: 0.3055, Train Acc: 90.27%
Val Loss: 0.1254, Val Acc: 94.74%



## Tiny VIT 11M

In [11]:
import timm
timm.list_models()

['bat_resnext26ts',
 'beit_base_patch16_224',
 'beit_base_patch16_384',
 'beit_large_patch16_224',
 'beit_large_patch16_384',
 'beit_large_patch16_512',
 'beitv2_base_patch16_224',
 'beitv2_large_patch16_224',
 'botnet26t_256',
 'botnet50ts_256',
 'caformer_b36',
 'caformer_m36',
 'caformer_s18',
 'caformer_s36',
 'cait_m36_384',
 'cait_m48_448',
 'cait_s24_224',
 'cait_s24_384',
 'cait_s36_384',
 'cait_xs24_384',
 'cait_xxs24_224',
 'cait_xxs24_384',
 'cait_xxs36_224',
 'cait_xxs36_384',
 'coat_lite_medium',
 'coat_lite_medium_384',
 'coat_lite_mini',
 'coat_lite_small',
 'coat_lite_tiny',
 'coat_mini',
 'coat_small',
 'coat_tiny',
 'coatnet_0_224',
 'coatnet_0_rw_224',
 'coatnet_1_224',
 'coatnet_1_rw_224',
 'coatnet_2_224',
 'coatnet_2_rw_224',
 'coatnet_3_224',
 'coatnet_3_rw_224',
 'coatnet_4_224',
 'coatnet_5_224',
 'coatnet_bn_0_rw_224',
 'coatnet_nano_cc_224',
 'coatnet_nano_rw_224',
 'coatnet_pico_rw_224',
 'coatnet_rmlp_0_rw_224',
 'coatnet_rmlp_1_rw2_224',
 'coatnet_rmlp_1_r

In [11]:
# Setup and check model
model = create_model("tiny_vit_11m_224", pretrained=True, num_classes=30)

print(f"Tiny-ViT 11M parameters: {count_parameters(model)}")
model

Tiny-ViT 11M parameters: 10561442


TinyVit(
  (patch_embed): PatchEmbed(
    (conv1): ConvNorm(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (act): GELU(approximate='none')
    (conv2): ConvNorm(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (stages): Sequential(
    (0): ConvLayer(
      (blocks): Sequential(
        (0): MBConv(
          (conv1): ConvNorm(
            (conv): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
          (act1): GELU(approximate='none')
          (conv2): ConvNorm(
            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=256, bias=Fals

In [12]:
# Train model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
results_log.append(train_model(model, "tiny_vit_11m", trainloader, valloader, criterion, optimizer))

Epoch 1/10:
Train Loss: 2.0382, Train Acc: 51.31%
Val Loss: 0.3851, Val Acc: 90.00%

Epoch 2/10:
Train Loss: 0.6656, Train Acc: 84.12%
Val Loss: 0.4687, Val Acc: 86.84%

Epoch 3/10:
Train Loss: 0.4660, Train Acc: 88.80%
Val Loss: 0.1387, Val Acc: 96.32%

Epoch 4/10:
Train Loss: 0.3566, Train Acc: 90.82%
Val Loss: 0.0577, Val Acc: 97.89%

Epoch 5/10:
Train Loss: 0.3098, Train Acc: 92.27%
Val Loss: 0.1048, Val Acc: 97.37%

Epoch 6/10:
Train Loss: 0.2477, Train Acc: 93.40%
Val Loss: 0.0590, Val Acc: 97.63%

Epoch 7/10:
Train Loss: 0.2417, Train Acc: 93.64%
Val Loss: 0.0786, Val Acc: 98.16%

Epoch 8/10:
Train Loss: 0.2156, Train Acc: 94.54%
Val Loss: 0.0732, Val Acc: 97.89%

Epoch 9/10:
Train Loss: 0.1974, Train Acc: 94.54%
Val Loss: 0.0081, Val Acc: 100.00%

Epoch 10/10:
Train Loss: 0.1720, Train Acc: 95.53%
Val Loss: 0.0060, Val Acc: 99.74%



## Swin-Tiny

In [8]:
# Setup and check model
model = create_model('swin_tiny_patch4_window7_224', pretrained=True)
in_features = model.head.fc.in_features
model.head.fc = nn.Linear(in_features, num_classes)

print(f"Swin-Tiny parameters: {count_parameters(model)}")
model

model.safetensors:   0%|          | 0.00/114M [00:00<?, ?B/s]

Swin-Tiny parameters: 27542424


SwinTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
    (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
  )
  (layers): Sequential(
    (0): SwinTransformerStage(
      (downsample): Identity()
      (blocks): Sequential(
        (0): SwinTransformerBlock(
          (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv): Linear(in_features=96, out_features=288, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=96, out_features=96, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
          (drop_path1): Identity()
          (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=96, out_features=384, bias=True)
            (act): GELU(approximate='none')
            (drop1): 

In [9]:
# Train model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
results_log.append(train_model(model, "Swin-Tiny", trainloader, valloader, criterion, optimizer))

Epoch 1/10:
Train Loss: 3.2384, Train Acc: 7.87%
Val Loss: 2.9876, Val Acc: 11.84%

Epoch 2/10:
Train Loss: 3.3490, Train Acc: 7.25%
Val Loss: 3.2785, Val Acc: 8.95%

Epoch 3/10:
Train Loss: 3.2122, Train Acc: 8.80%
Val Loss: 3.0631, Val Acc: 13.42%

Epoch 4/10:
Train Loss: 3.1661, Train Acc: 8.52%
Val Loss: 2.9981, Val Acc: 10.26%

Epoch 5/10:
Train Loss: 3.1675, Train Acc: 8.87%
Val Loss: 3.4485, Val Acc: 4.21%

Epoch 6/10:
Train Loss: 3.2146, Train Acc: 9.86%
Val Loss: 3.1088, Val Acc: 10.26%

Epoch 7/10:
Train Loss: 3.1785, Train Acc: 9.21%
Val Loss: 3.0609, Val Acc: 13.95%

Epoch 8/10:
Train Loss: 3.1492, Train Acc: 10.52%
Val Loss: 3.1975, Val Acc: 11.32%

Epoch 9/10:
Train Loss: 3.3096, Train Acc: 7.66%
Val Loss: 3.1608, Val Acc: 10.00%

Epoch 10/10:
Train Loss: 3.2006, Train Acc: 9.55%
Val Loss: 3.0651, Val Acc: 11.32%

