In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import numpy as np

In [None]:
def set_seeds(seed: int=42):
    """Sets random seeds for torch operations.

    Args:
        seed (int, optional): Value handed to both the general and the CUDA
            seeding functions. Defaults to 42.
    """
    # The first entry seeds general torch operations, the second seeds CUDA
    # torch operations (ones that happen on the GPU).
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

1. Get Data

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
# Download Food101 into /content and build one dataset object per split.
# transform=None: no transform is needed because the images are only sampled.
train_dataset, test_dataset = (
    datasets.Food101(root='/content',
                     split=split_name,
                     download=True,
                     transform=None)
    for split_name in ('train', 'test')
)


100%|██████████| 5.00G/5.00G [02:40<00:00, 31.0MB/s]


In [None]:
import os
import random
from pathlib import Path
import shutil
from torchvision import datasets

# Step 4: Create the sampled directory structure
sampled_root = Path('/content/food101_sampled')
sampled_train = sampled_root / 'train'
sampled_test = sampled_root / 'test'
# Start from an empty directory so that re-running this cell (for example with
# a different fraction) cannot leave images from an earlier sample behind.
if sampled_root.exists():
    shutil.rmtree(sampled_root)
sampled_train.mkdir(parents=True, exist_ok=True)
sampled_test.mkdir(parents=True, exist_ok=True)

# Step 5: Sample a fixed fraction of images per category using metadata
sample_fraction = 0.5  # keep 50% of the images in every category
sample_seed = 42  # fixed seed so the same images are selected on every run
base_image_path = Path('/content/food-101/images')
meta_path = Path('/content/food-101/meta')


def _read_split_metadata(split_file, image_root):
    """Group the image paths listed in a Food101 split file by category.

    Each non-empty line is expected to start with ``<class_name>/<image_id>``
    (e.g. ``apple_pie/12345``). Lines without both parts are reported and
    skipped.

    Args:
        split_file: Path of the metadata file to read (train.txt / test.txt).
        image_root: Directory the ``<class_name>/<image_id>.jpg`` files live in.

    Returns:
        dict mapping each class name to the list of image paths (as strings),
        in the order they appear in the file.
    """
    images_by_class = {}
    with open(split_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:  # Skip empty lines
                continue
            img_path = line.split(' ')[0]  # e.g., apple_pie/12345
            class_name, separator, image_id = img_path.partition('/')
            if not (separator and class_name and image_id):  # Skip malformed lines
                print(f"Warning: Skipping malformed line in {Path(split_file).name}: {line}")
                continue
            full_path = image_root / f"{img_path}.jpg"
            images_by_class.setdefault(class_name, []).append(str(full_path))
    return images_by_class


# Read train and test metadata
train_images = _read_split_metadata(meta_path / 'train.txt', base_image_path)
test_images = _read_split_metadata(meta_path / 'test.txt', base_image_path)

# Sample and copy images. A dedicated generator keeps the selection
# reproducible without touching the global `random` state.
rng = random.Random(sample_seed)
for split, image_dict, sampled_dir in [
    ('train', train_images, sampled_train),
    ('test', test_images, sampled_test)
]:
    print(f"\nSampling {split} split...")
    for class_name, img_paths in image_dict.items():
        num_images = len(img_paths)
        # At least 1 per category, and never more than the category contains
        num_sample = min(num_images, max(1, int(num_images * sample_fraction)))
        sampled_paths = rng.sample(img_paths, num_sample)

        # Create category subdir
        class_dir = sampled_dir / class_name
        class_dir.mkdir(exist_ok=True)

        # Copy images
        for src_path in sampled_paths:
            src = Path(src_path)
            if src.exists():
                shutil.copy2(src, class_dir / src.name)
            else:
                print(f"Warning: {src_path} not found")

        print(f"  {class_name}: {num_sample}/{num_images} images sampled")

# Step 6: Verify the sampled directory
print(f"\nSampled dataset created at {sampled_root}")
# Show first 5 categories in train
for category in sorted(entry.name for entry in sampled_train.iterdir())[:5]:
    print(category)
print("\nTotal sampled images:")
print(sum(1 for entry in sampled_root.rglob('*') if entry.is_file()))


Sampling train split...
  apple_pie: 375/750 images sampled
  baby_back_ribs: 375/750 images sampled
  baklava: 375/750 images sampled
  beef_carpaccio: 375/750 images sampled
  beef_tartare: 375/750 images sampled
  beet_salad: 375/750 images sampled
  beignets: 375/750 images sampled
  bibimbap: 375/750 images sampled
  bread_pudding: 375/750 images sampled
  breakfast_burrito: 375/750 images sampled
  bruschetta: 375/750 images sampled
  caesar_salad: 375/750 images sampled
  cannoli: 375/750 images sampled
  caprese_salad: 375/750 images sampled
  carrot_cake: 375/750 images sampled
  ceviche: 375/750 images sampled
  cheesecake: 375/750 images sampled
  cheese_plate: 375/750 images sampled
  chicken_curry: 375/750 images sampled
  chicken_quesadilla: 375/750 images sampled
  chicken_wings: 375/750 images sampled
  chocolate_cake: 375/750 images sampled
  chocolate_mousse: 375/750 images sampled
  churros: 375/750 images sampled
  clam_chowder: 375/750 images sampled
  club_sandwi

In [None]:
# Image and patch dimensions
height, width = 224, 224
color_channels, patch_size = 3, 16

Equation 1: Split data into patches and create the class, position and patch embeddings

In [None]:
class PatchEmbedding(nn.Module):
  """Turns a batch of images into a sequence of patch embeddings.

  A convolution whose kernel size and stride both equal the patch size maps
  every non-overlapping patch to one embedding vector; the resulting spatial
  grid is then flattened into a sequence.

  Args:
      in_channels (int): Number of channels of the input images. Defaults to 3.
      patch_size (int): Side length of each square patch. Defaults to 16.
      embedding_dim (int): Size of the embedding produced per patch. Defaults to 768.
  """
  def __init__(self,
              in_channels:int=3,
              patch_size:int=16,
              embedding_dim:int=768):
    super().__init__()

    # Keep the patch size on the instance so forward() validates inputs against
    # the value this layer was built with rather than a notebook-level global.
    self.patch_size = patch_size

    # Use the constructor's in_channels (not a global) so non-RGB inputs work
    self.patcher = nn.Conv2d(in_channels=in_channels,
                             out_channels=embedding_dim,
                             kernel_size=patch_size,
                             stride=patch_size,
                             padding=0)
    # Merge the two spatial dimensions of the conv output into one
    self.flatten = nn.Flatten(start_dim=2, end_dim=3)

  def forward(self, x):
    """Embeds `x` and returns a tensor of shape [batch_size, num_patches, embedding_dim].

    Raises:
        AssertionError: If the last dimension of `x` is not divisible by the patch size.
    """
    # Create assertion to check that inputs are the correct shape
    image_resolution = x.shape[-1]
    assert image_resolution % self.patch_size == 0, f"Input image size must be divisible by patch size, image shape: {image_resolution}, patch size: {self.patch_size}"

    # Perform the forward pass
    x_patched = self.patcher(x)
    x_flattened = self.flatten(x_patched)
    # Put the embedding on the final dimension:
    # [batch_size, embedding_dim, num_patches] -> [batch_size, num_patches, embedding_dim]
    return x_flattened.permute(0, 2, 1)

In [None]:
# Create the class token embedding as a learnable parameter that shares the same size as the embedding dimension (D)
embedding_dimension = 768  # Hidden size D for ViT-Base
# A single token is learned and shared by every image, so the leading dimension
# is 1 (not the batch size); expand it along dim 0 when prepending it to a batch.
class_token = nn.Parameter(torch.ones(1, 1, embedding_dimension), # [1, number_of_tokens, embedding_dimension]
                           requires_grad=True)

NameError: name 'batch_size' is not defined

In [None]:
# Create the learnable 1D position embedding
# Number of patches N = (height * width) / patch_size^2, using the image constants defined earlier
number_of_patches = (height * width) // patch_size**2
embedding_dimension = 768  # Hidden size D for ViT-Base
position_embedding = nn.Parameter(torch.ones(1,
                                             number_of_patches+1, # +1 for the class token
                                             embedding_dimension),
                                  requires_grad=True) # make sure it's learnable

Equation 2: Multi-Head Self-Attention (MSA)

In [None]:
class MultiHeadSelfAttentionBlock(nn.Module):
  """Layer-normalizes the input, then applies multi-head self-attention to it.

  The normalized tensor is used as query, key and value. No residual
  connection is added inside this block.

  Args:
      embedding_dim (int): Size of the last dimension of the input. Defaults to 768.
      num_heads (int): Number of attention heads. Defaults to 12.
      attn_dropout (float): Dropout passed to nn.MultiheadAttention. Defaults to 0.
  """
  def __init__(self,
              embedding_dim:int=768, # Hidden size D from Table 1 for ViT-Base
              num_heads:int=12, # Heads from Table 1 for ViT-Base
              attn_dropout:float=0):
    super().__init__()

    self.Layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)

    # batch_first=True: inputs are [batch, sequence, embedding]
    self.MultiHeadAttention = nn.MultiheadAttention(embed_dim=embedding_dim,
                                                    num_heads=num_heads,
                                                    dropout=attn_dropout,
                                                    batch_first=True)

  def forward(self, x):
    """Returns the attention output for `x`, with the same shape as `x`."""
    x = self.Layer_norm(x)
    # Only the attention output is used, so skip computing the averaged
    # attention weights that would otherwise be returned and discarded.
    attn_output, _ = self.MultiHeadAttention(query=x,
                                             key=x,
                                             value=x,
                                             need_weights=False)
    return attn_output


 Equation 3: Multilayer Perceptron (MLP)

In [None]:
class MLPBlock(nn.Module):
  """Layer-normalizes the input, then runs it through a two-layer GELU perceptron.

  Dropout follows each of the two linear layers. No residual connection is
  added inside this block.

  Args:
      embedding_dim (int): Input and output feature size. Defaults to 768.
      mlp_size (int): Width of the hidden layer. Defaults to 3072.
      dropout (float): Dropout probability used after both linear layers. Defaults to 0.1.
  """
  def __init__(self,embedding_dim:int=768, # Hidden Size D from Table 1 for ViT-Base
                 mlp_size:int=3072, # MLP size from Table 1 for ViT-Base
                 dropout:float=0.1):

    super().__init__()

    self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)

    # Expand to the hidden width, then project back to the embedding size
    expand = nn.Linear(in_features=embedding_dim, out_features=mlp_size)
    project = nn.Linear(in_features=mlp_size, out_features=embedding_dim)

    layers = [
        expand,
        nn.GELU(), # "The MLP contains two layers with a GELU non-linearity (section 3.1)."
        nn.Dropout(p=dropout),
        project,
        nn.Dropout(p=dropout),
    ]
    self.mlp = nn.Sequential(*layers)

  def forward(self,x):
    normalized = self.layer_norm(x)
    return self.mlp(normalized)


Create the Transformer Encoder

In [None]:
class TransformerEncoderBlock(nn.Module):
  """One encoder block: a self-attention sub-block then an MLP sub-block, each wrapped in a residual connection.

  Args:
      embedding_dim (int): Feature size of the tokens. Defaults to 768.
      num_heads (int): Number of attention heads. Defaults to 12.
      mlp_size (int): Hidden width of the MLP sub-block. Defaults to 3072.
      mlp_dropout (float): Dropout used inside the MLP sub-block. Defaults to 0.1.
      attn_dropout (float): Dropout used inside the attention sub-block. Defaults to 0.
  """
  def __init__(self,
              embedding_dim:int=768, # Hidden size D from Table 1 for ViT-Base
              num_heads:int=12, # Heads from Table 1 for ViT-Base
              mlp_size:int=3072, # MLP size from Table 1 for ViT-Base
              mlp_dropout:float=0.1, # Amount of dropout for dense layers from Table 3 for ViT-Base
              attn_dropout:float=0):
    super().__init__()

    # Attention sub-block (equation 2)
    self.MultiHeadSelfAttentionBlock = MultiHeadSelfAttentionBlock(
        embedding_dim=embedding_dim,
        num_heads=num_heads,
        attn_dropout=attn_dropout,
    )
    # MLP sub-block (equation 3)
    self.MLPBlock = MLPBlock(
        embedding_dim=embedding_dim,
        mlp_size=mlp_size,
        dropout=mlp_dropout,
    )

  def forward(self,x):
    # Each sub-block's output is added back onto its own input (skip connection)
    after_attention = self.MultiHeadSelfAttentionBlock(x) + x
    return self.MLPBlock(after_attention) + after_attention

In [None]:
# Create the same as above with torch.nn.TransformerEncoderLayer()
encoder_layer_config = {
    "d_model": 768,           # Hidden size D from Table 1 for ViT-Base
    "nhead": 12,              # Heads from Table 1 for ViT-Base
    "dim_feedforward": 3072,  # MLP size from Table 1 for ViT-Base
    "dropout": 0.1,           # Amount of dropout for dense layers from Table 3 for ViT-Base
    "activation": "gelu",     # GELU non-linear activation
    "batch_first": True,      # Do our batches come first?
    "norm_first": True,       # Normalize first or after MSA/MLP layers?
}
torch_transformer_encoder_layer = nn.TransformerEncoderLayer(**encoder_layer_config)

torch_transformer_encoder_layer

TransformerEncoderLayer(
  (self_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (linear1): Linear(in_features=768, out_features=3072, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (linear2): Linear(in_features=3072, out_features=768, bias=True)
  (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.1, inplace=False)
  (dropout2): Dropout(p=0.1, inplace=False)
)

**Putting it all together to create ViT**

In [None]:
class ViT(nn.Module):
  """Vision Transformer assembled from PatchEmbedding and TransformerEncoderBlock.

  Images are split into patch embeddings, a learnable class token is
  prepended, learnable position embeddings are added, the sequence passes
  through a stack of encoder blocks, and the class-token output is fed to a
  LayerNorm + Linear classifier head.

  Args:
      img_size (int): Side length of the (square) input images. Defaults to 224.
      in_channels (int): Number of channels in the input images. Defaults to 3.
      patch_size (int): Side length of each patch. Defaults to 16.
      num_transformer_layers (int): Number of encoder blocks. Defaults to 12.
      embedding_dim (int): Token feature size. Defaults to 768.
      mlp_size (int): Hidden width of each MLP block. Defaults to 3072.
      num_heads (int): Attention heads per block. Defaults to 12.
      attn_dropout (float): Dropout inside the attention blocks. Defaults to 0.
      mlp_dropout (float): Dropout inside the MLP blocks. Defaults to 0.1.
      embedding_dropout (float): Dropout applied to the embedded sequence. Defaults to 0.1.
      num_classes (int): Number of outputs of the classifier head. Defaults to 1000.

  Raises:
      AssertionError: If img_size is not divisible by patch_size.
  """
  def __init__(self,
                img_size:int=224, # Training resolution from Table 3 in ViT paper
                in_channels:int=3, # Number of channels in input image
                patch_size:int=16, # Patch size
                num_transformer_layers:int=12, # Layers from Table 1 for ViT-Base
                embedding_dim:int=768, # Hidden size D from Table 1 for ViT-Base
                mlp_size:int=3072, # MLP size from Table 1 for ViT-Base
                num_heads:int=12, # Heads from Table 1 for ViT-Base
                attn_dropout:float=0, # Dropout for attention projection
                mlp_dropout:float=0.1, # Dropout for dense/MLP layers
                embedding_dropout:float=0.1, # Dropout for patch and position embeddings
                num_classes:int=1000):
    super().__init__()

    # The image must tile exactly into patches
    assert img_size % patch_size == 0, f"Image size must be divisible by patch size, image size: {img_size}, patch size: {patch_size}."

    # Number of patches = (height * width) / patch^2
    self.num_patches = (img_size * img_size) // patch_size**2

    # Learnable class token and position embeddings; the position table has
    # one extra slot for the class token. Created in this order so seeded
    # initialisation stays the same.
    class_token_init = torch.randn(1, 1, embedding_dim)
    self.class_embedding = nn.Parameter(data=class_token_init, requires_grad=True)

    position_init = torch.randn(1, self.num_patches+1, embedding_dim)
    self.position_embedding = nn.Parameter(data=position_init, requires_grad=True)

    self.embedding_dropout = nn.Dropout(p=embedding_dropout)
    self.patch_embedding = PatchEmbedding(in_channels=in_channels,
                                          patch_size=patch_size,
                                          embedding_dim=embedding_dim)

    # Stack of identical encoder blocks applied one after another
    encoder_blocks = [
        TransformerEncoderBlock(embedding_dim=embedding_dim,
                                num_heads=num_heads,
                                mlp_size=mlp_size,
                                mlp_dropout=mlp_dropout,
                                attn_dropout=attn_dropout)
        for _ in range(num_transformer_layers)
    ]
    self.TransformerEncoder = nn.Sequential(*encoder_blocks)

    # Classifier head
    head_norm = nn.LayerNorm(normalized_shape=embedding_dim)
    head_linear = nn.Linear(in_features=embedding_dim, out_features=num_classes)
    self.classifier = nn.Sequential(head_norm, head_linear)

  def forward(self, x):
    """Returns the classifier output for each image in the batch `x`."""
    # Patch embedding (equation 1)
    patches = self.patch_embedding(x)

    # One copy of the class token per sample; "-1" keeps a dimension's size as is
    class_tokens = self.class_embedding.expand(x.shape[0], -1, -1)

    # Prepend the class token, then add position embeddings (equation 1)
    tokens = torch.cat((class_tokens, patches), dim=1)
    tokens = self.position_embedding + tokens

    # Embedding dropout (Appendix B.1)
    tokens = self.embedding_dropout(tokens)

    # Transformer encoder layers (equations 2 & 3)
    encoded = self.TransformerEncoder(tokens)

    # Classify from the token at index 0 of every sample (equation 4)
    return self.classifier(encoded[:, 0])

In [None]:
# Seed torch before building the model so its randomly initialised parameters are reproducible
set_seeds(seed=42)
NUM_CLASSES = 101  # presumably the number of Food101 categories — confirm against the loaded class names
vit = ViT(num_classes=NUM_CLASSES)

In [None]:
import data_setup
# Folders holding the sampled train / test images
train_dir = '/content/food101_sampled/train'
test_dir = '/content/food101_sampled/test'

# Create image size (from Table 3 in the ViT paper)
IMG_SIZE = 224
BATCH_SIZE = 32 # this is lower than the ViT paper but it's because we're starting small

# Create transform pipeline manually: resize to a square image, then convert to a tensor
transform_steps = [
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
]
manual_transforms = transforms.Compose(transform_steps)

# Create data loaders
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=manual_transforms, # use manually created transforms
    batch_size=BATCH_SIZE,
)

train_dataloader, test_dataloader, class_names

(<torch.utils.data.dataloader.DataLoader at 0x7c6ba3b9d0a0>,
 <torch.utils.data.dataloader.DataLoader at 0x7c6ba4656c00>,
 ['apple_pie',
  'baby_back_ribs',
  'baklava',
  'beef_carpaccio',
  'beef_tartare',
  'beet_salad',
  'beignets',
  'bibimbap',
  'bread_pudding',
  'breakfast_burrito',
  'bruschetta',
  'caesar_salad',
  'cannoli',
  'caprese_salad',
  'carrot_cake',
  'ceviche',
  'cheese_plate',
  'cheesecake',
  'chicken_curry',
  'chicken_quesadilla',
  'chicken_wings',
  'chocolate_cake',
  'chocolate_mousse',
  'churros',
  'clam_chowder',
  'club_sandwich',
  'crab_cakes',
  'creme_brulee',
  'croque_madame',
  'cup_cakes',
  'deviled_eggs',
  'donuts',
  'dumplings',
  'edamame',
  'eggs_benedict',
  'escargots',
  'falafel',
  'filet_mignon',
  'fish_and_chips',
  'foie_gras',
  'french_fries',
  'french_onion_soup',
  'french_toast',
  'fried_calamari',
  'fried_rice',
  'frozen_yogurt',
  'garlic_bread',
  'gnocchi',
  'greek_salad',
  'grilled_cheese_sandwich',
  'gr

In [None]:
import engine

# Setup the loss function for multi-class classification
loss_fn = torch.nn.CrossEntropyLoss()

# Setup the optimizer to optimize our ViT model parameters using hyperparameters from the ViT paper
adam_hyperparameters = dict(
    lr=3e-3, # Base LR from Table 3 for ViT-* ImageNet-1k
    betas=(0.9, 0.999), # default values but also mentioned in ViT paper section 4.1 (Training & Fine-tuning)
    weight_decay=0.3, # from the ViT paper section 4.1 (Training & Fine-tuning) and Table 3 for ViT-* ImageNet-1k
)
optimizer = torch.optim.Adam(params=vit.parameters(), **adam_hyperparameters)

# Set the seeds right before training starts
set_seeds()

# NOTE(review): presumably engine.train moves the model and batches to `device` — confirm in engine.py
training_arguments = dict(
    model=vit,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10,
    device=device,
)
results = engine.train(**training_arguments)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 4.6839 | train_acc: 0.0100 | test_loss: 4.6152 | test_acc: 0.0099


In [None]:
from utils import plot_loss_curves

# Plot the training history returned by engine.train().
# NOTE(review): presumably draws loss/accuracy curves per epoch — confirm against utils.plot_loss_curves
plot_loss_curves(results)