In [1]:
import os
import requests
import zipfile
from pathlib import Path

# Setup path to data folder
data_path = Path("data/")
image_path = data_path / "pizza_steak_sushi"
zip_path = data_path / "pizza_steak_sushi.zip"
DATA_URL = "https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip"

# Only download when the image folder is missing (or empty, e.g. after a
# previously interrupted run) so that re-running this cell is cheap and idempotent.
if image_path.is_dir() and any(image_path.iterdir()):
    print(f"{image_path} directory exists.")
else:
    print(f"Did not find {image_path} directory, creating one...")
    image_path.mkdir(parents=True, exist_ok=True)

    # Download pizza, steak, sushi data
    print("Downloading pizza, steak, sushi data...")
    response = requests.get(DATA_URL, timeout=60)
    # Fail loudly on an HTTP error instead of writing an error page to disk
    # and crashing later inside zipfile with a confusing message.
    response.raise_for_status()

    try:
        zip_path.write_bytes(response.content)

        # Unzip pizza, steak, sushi data
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            print("Unzipping pizza, steak, sushi data...")
            zip_ref.extractall(image_path)
    finally:
        # Remove zip file, even when extraction failed
        if zip_path.exists():
            os.remove(zip_path)

Did not find data/pizza_steak_sushi directory, creating one...
Downloading pizza, steak, sushi data...
Unzipping pizza, steak, sushi data...


In [2]:
# Create the package folder that the %%writefile cells below write into
# (no error if it is already there).
Path("going_modular").mkdir(parents=True, exist_ok=True)

In [3]:
%%writefile going_modular/data_setup.py
"""
Contains functionality for creating PyTorch DataLoaders for
image classification data.
"""
import os
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to 0 (load data in the main process) so the default passed to
# DataLoader is always a valid integer.
NUM_WORKERS = os.cpu_count() or 0

def create_dataloaders(
    train_dir: str,
    test_dir: str,
    train_transform: transforms.Compose,
    test_transform: transforms.Compose,
    batch_size: int,
    num_workers: int=NUM_WORKERS
):
  """Builds the training and testing DataLoaders from two image folders.

  Args:
    train_dir: Path to the training directory (read with ImageFolder).
    test_dir: Path to the testing directory (read with ImageFolder).
    train_transform: Transform applied to every training image.
    test_transform: Transform applied to every testing image.
    batch_size: Number of samples per batch in both DataLoaders.
    num_workers: Number of worker processes per DataLoader.

  Returns:
    A tuple (train_dataloader, test_dataloader, class_names) where
    class_names is the `classes` attribute of the training ImageFolder.
  """
  # Settings common to both loaders
  shared_loader_args = dict(
      batch_size=batch_size,
      num_workers=num_workers,
      pin_memory=True,
  )

  # One ImageFolder dataset per split
  train_data = datasets.ImageFolder(train_dir, transform=train_transform)
  test_data = datasets.ImageFolder(test_dir, transform=test_transform)

  # Only the training split is shuffled; test order stays fixed
  train_dataloader = DataLoader(train_data, shuffle=True, **shared_loader_args)
  test_dataloader = DataLoader(test_data, shuffle=False, **shared_loader_args)

  return train_dataloader, test_dataloader, train_data.classes

Writing going_modular/data_setup.py


In [None]:
%%writefile going_modular/model_builder.py
"""
Contains PyTorch model code to instantiate a TinyVGG model.
"""
import os
import torch
from torch import nn

class TinyVGG(nn.Module):
  """Creates the TinyVGG architecture.

  Two convolutional blocks (conv -> ReLU -> conv -> ReLU -> max-pool) followed
  by a flatten + linear classifier.

  Args:
    input_shape: Number of input channels.
    hidden_units: Number of channels produced by every convolution.
    output_shape: Number of output units (classes).
    image_size: Side length of the (square) input images. Defaults to 64,
      which yields the 13x13 feature map the classifier was originally
      hard-coded for.

  Raises:
    ValueError: If image_size is too small to survive both conv blocks.
  """
  def __init__(self, input_shape: int, hidden_units: int, output_shape: int,
               image_size: int = 64) -> None:
      super().__init__()
      self.conv_block_1 = nn.Sequential(
          nn.Conv2d(in_channels=input_shape,
                    out_channels=hidden_units,
                    kernel_size=3,
                    stride=1,
                    padding=0),
          nn.ReLU(),
          nn.Conv2d(in_channels=hidden_units,
                    out_channels=hidden_units,
                    kernel_size=3,
                    stride=1,
                    padding=0),
          nn.ReLU(),
          nn.MaxPool2d(kernel_size=2,
                        stride=2)
      )
      self.conv_block_2 = nn.Sequential(
          nn.Conv2d(hidden_units, hidden_units, kernel_size=3, padding=0),
          nn.ReLU(),
          nn.Conv2d(hidden_units, hidden_units, kernel_size=3, padding=0),
          nn.ReLU(),
          nn.MaxPool2d(2)
      )

      # The classifier input size depends on how much the two blocks shrink
      # the image, so derive it from image_size instead of hard-coding 13*13.
      feature_size = self._feature_map_size(image_size)
      if feature_size < 1:
          raise ValueError(
              f"image_size={image_size} is too small for TinyVGG; "
              "the feature map would be empty after the two conv blocks."
          )
      self.classifier = nn.Sequential(
          nn.Flatten(),
          nn.Linear(in_features=hidden_units*feature_size*feature_size,
                    out_features=output_shape)
      )

  @staticmethod
  def _feature_map_size(image_size: int) -> int:
      """Returns the spatial side length left after both conv blocks."""
      size = image_size
      for _ in range(2):
          # Two unpadded 3x3 convs remove 2 pixels each, then the 2x2 max-pool
          # halves the result (rounding down).
          size = (size - 2 - 2) // 2
      return size

  def forward(self, x: torch.Tensor):
      x = self.conv_block_1(x)
      x = self.conv_block_2(x)
      x = self.classifier(x)
      return x

In [17]:
%%writefile going_modular/engine.py
"""
Contains functions for training and testing a PyTorch model.
"""
import torch

from tqdm.auto import tqdm
from typing import Dict, List, Tuple

def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               device: torch.device) -> Tuple[float, float]:
  """Trains a PyTorch model for a single epoch.

  Puts the model in train mode and, for every batch, runs the forward pass,
  computes the loss, back-propagates and takes one optimizer step.

  Args:
    model: Model to train.
    dataloader: Iterable yielding (inputs, targets) batches.
    loss_fn: Loss function called as loss_fn(predictions, targets).
    optimizer: Optimizer updating the model's parameters.
    device: Device the batches are moved to before the forward pass.

  Returns:
    A tuple (train_loss, train_acc): loss and accuracy averaged per batch.

  Raises:
    ValueError: If the dataloader yields no batches.
  """
  # Put model in train mode
  model.train()

  # Running totals; batches are counted while iterating so that iterables
  # without __len__ are supported as well.
  train_loss, train_acc = 0.0, 0.0
  num_batches = 0

  for X, y in dataloader:
      # Send data to target device
      X, y = X.to(device), y.to(device)

      # 1. Forward pass
      y_pred = model(X)

      # 2. Calculate and accumulate loss
      loss = loss_fn(y_pred, y)
      train_loss += loss.item()

      # 3. Optimizer zero grad
      optimizer.zero_grad()

      # 4. Loss backward
      loss.backward()

      # 5. Optimizer step
      optimizer.step()

      # Predicted class = largest logit. Softmax is monotonic, so applying it
      # first is unnecessary; this also matches how test_step computes labels.
      y_pred_class = y_pred.argmax(dim=1)
      train_acc += (y_pred_class == y).sum().item()/len(y_pred)
      num_batches += 1

  if num_batches == 0:
      raise ValueError("dataloader yielded no batches; cannot compute training metrics.")

  # Adjust metrics to get average loss and accuracy per batch
  return train_loss / num_batches, train_acc / num_batches

def test_step(model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module,
              device: torch.device) -> Tuple[float, float]:
  """Tests a PyTorch model for a single epoch.
  """
  # Put model in eval mode
  model.eval()

  # Setup test loss and test accuracy values
  test_loss, test_acc = 0, 0

  # Turn on inference context manager
  with torch.inference_mode():
      # Loop through DataLoader batches
      for batch, (X, y) in enumerate(dataloader):
          # Send data to target device
          X, y = X.to(device), y.to(device)

          # 1. Forward pass
          test_pred_logits = model(X)

          # 2. Calculate and accumulate loss
          loss = loss_fn(test_pred_logits, y)
          test_loss += loss.item()

          # Calculate and accumulate accuracy
          test_pred_labels = test_pred_logits.argmax(dim=1)
          test_acc += ((test_pred_labels == y).sum().item()/len(test_pred_labels))

  # Adjust metrics to get average loss and accuracy per batch
  test_loss = test_loss / len(dataloader)
  test_acc = test_acc / len(dataloader)
  return test_loss, test_acc

def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device,
          patience: int = 7,
          name=None,
          scheduler=None) -> Dict[str, List]:
  """Trains and tests a PyTorch model with early stopping support.

  Runs train_step and test_step once per epoch, records the metrics, steps
  the optional scheduler, and saves the model's state_dict whenever the test
  loss improves on the best value seen so far.

  Args:
    model: Model to train and evaluate.
    train_dataloader: DataLoader passed to train_step.
    test_dataloader: DataLoader passed to test_step.
    optimizer: Optimizer passed to train_step.
    loss_fn: Loss function used for both steps.
    epochs: Maximum number of epochs to run.
    device: Target device passed to both steps.
    patience: Number of consecutive epochs without test-loss improvement
      after which training stops early.
    name: Tag used in checkpoint file names ("best_<name>_<epoch>.pth", with a
      zero-based epoch index). When None, the model's class name is used.
    scheduler: Optional learning-rate scheduler, stepped once per epoch.

  Returns:
    A dict with the keys "train_loss", "train_acc", "test_loss" and
    "test_acc", each holding one value per completed epoch.
  """

  results = {
      "train_loss": [],
      "train_acc": [],
      "test_loss": [],
      "test_acc": []
  }

  # The default name=None used to crash on string concatenation at the first
  # checkpoint; fall back to the model's class name instead.
  checkpoint_tag = name if name is not None else model.__class__.__name__

  best_loss = float("inf")
  patience_counter = 0

  for epoch in tqdm(range(epochs)):
      train_loss, train_acc = train_step(
          model=model,
          dataloader=train_dataloader,
          loss_fn=loss_fn,
          optimizer=optimizer,
          device=device
      )

      test_loss, test_acc = test_step(
          model=model,
          dataloader=test_dataloader,
          loss_fn=loss_fn,
          device=device
      )

      # Print epoch results
      print(
          f"Epoch: {epoch+1}/{epochs} | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {test_loss:.4f} | Val Acc: {test_acc:.4f}"
      )

      # Save metrics
      results["train_loss"].append(train_loss)
      results["train_acc"].append(train_acc)
      results["test_loss"].append(test_loss)
      results["test_acc"].append(test_acc)

      if scheduler is not None:
          # Only ReduceLROnPlateau consumes the monitored metric; every other
          # scheduler is stepped without arguments.
          if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
              scheduler.step(test_loss)
          else:
              scheduler.step()

      # --- Early Stopping Logic ---
      if test_loss < best_loss:
          best_loss = test_loss
          patience_counter = 0
          torch.save(model.state_dict(), f"best_{checkpoint_tag}_{epoch}.pth")
          print("✅ Validation loss improved — best model saved.")
      else:
          patience_counter += 1
          print(f"⚠️ No improvement for {patience_counter} epoch(s).")

      if patience_counter >= patience:
          print(f"\n⏹️ Early stopping triggered at epoch {epoch+1}.")
          break

  print("\nTraining complete.")
  return results


Overwriting going_modular/engine.py


In [None]:
%%writefile going_modular/utils.py
"""
Contains various utility functions for PyTorch model training and saving.
"""
import torch
from pathlib import Path

def save_model(model: torch.nn.Module,
               target_dir: str,
               model_name: str):
  """Writes a model's state_dict to `target_dir / model_name`.

  The target directory (including missing parents) is created first. The
  file name must end in ".pt" or ".pth"; this is enforced with an assert.

  Args:
    model: Model whose state_dict() is saved.
    target_dir: Directory the checkpoint is written into.
    model_name: File name of the checkpoint, ending in ".pt" or ".pth".
  """
  # Make sure the destination folder exists
  destination_dir = Path(target_dir)
  destination_dir.mkdir(parents=True, exist_ok=True)

  # Validate the extension, then build the full checkpoint path
  has_valid_suffix = model_name.endswith((".pth", ".pt"))
  assert has_valid_suffix, "model_name should end with '.pt' or '.pth'"
  checkpoint_file = destination_dir / model_name

  # Persist only the learned parameters (state_dict), not the whole module
  print(f"[INFO] Saving model to: {checkpoint_file}")
  torch.save(obj=model.state_dict(), f=checkpoint_file)

In [7]:
%%writefile going_modular/train.py
"""
Trains a PyTorch image classification model using device-agnostic code.
"""

import os
import torch
import data_setup, engine, model_builder, utils

from torchvision import transforms

# Setup hyperparameters
NUM_EPOCHS = 20
BATCH_SIZE = 32
HIDDEN_UNITS = 10
LEARNING_RATE = 5e-4
WEIGHT_DECAY=1e-4
# Setup directories
train_dir = "data/pizza_steak_sushi/train"
test_dir = "data/pizza_steak_sushi/test"

# Setup target device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create transforms (training adds random augmentation, testing does not)
train_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomAffine(0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])


def main():
    """Builds the dataloaders, the TinyVGG model and the optimizer, then trains."""
    # Create DataLoaders with help from data_setup.py
    train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
        train_dir=train_dir,
        test_dir=test_dir,
        train_transform=train_transform,
        test_transform=test_transform,
        batch_size=BATCH_SIZE
    )

    # Create model with help from model_builder.py
    model = model_builder.TinyVGG(
        input_shape=3,
        hidden_units=HIDDEN_UNITS,
        output_shape=len(class_names)
    ).to(device)

    # Set loss and optimizer
    loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',      # because we want to minimize validation loss
        patience=2,      # wait 2 epochs before reducing LR
        factor=0.5,      # multiply LR by 0.5 each time it plateaus
    )

    # Start training with help from engine.py
    engine.train(model=model,
                 train_dataloader=train_dataloader,
                 test_dataloader=test_dataloader,
                 loss_fn=loss_fn,
                 optimizer=optimizer,
                 epochs=NUM_EPOCHS,
                 device=device,
                 patience=7,
                 name="DuplicatedTinyVgg003",
                 scheduler=scheduler)

    # Save the model with help from utils.py
    # utils.save_model(model=model,
    #                  target_dir="models",
    #                  model_name="DuplicatedTinyVgg003.pth")


# The DataLoaders use worker processes. On platforms that start workers with
# "spawn" (Windows, macOS) each worker re-imports this file, so the training
# code must only run when the file is executed as the main script.
if __name__ == "__main__":
    main()


Writing going_modular/train.py


In [8]:
image_path

PosixPath('data/pizza_steak_sushi')

In [9]:
!python3 going_modular/train.py

  0% 0/20 [00:00<?, ?it/s]Epoch: 1/20 | Train Loss: 1.0991 | Train Acc: 0.3125 | Val Loss: 1.1029 | Val Acc: 0.1979
✅ Validation loss improved — best model saved.
  5% 1/20 [00:02<00:43,  2.29s/it]Epoch: 2/20 | Train Loss: 1.0770 | Train Acc: 0.4258 | Val Loss: 1.1211 | Val Acc: 0.1979
⚠️ No improvement for 1 epoch(s).
 10% 2/20 [00:03<00:27,  1.54s/it]Epoch: 3/20 | Train Loss: 1.0998 | Train Acc: 0.2969 | Val Loss: 1.1251 | Val Acc: 0.1979
⚠️ No improvement for 2 epoch(s).
 15% 3/20 [00:04<00:21,  1.29s/it]Epoch: 4/20 | Train Loss: 1.0691 | Train Acc: 0.3398 | Val Loss: 1.0929 | Val Acc: 0.3116
✅ Validation loss improved — best model saved.
 20% 4/20 [00:05<00:18,  1.17s/it]Epoch: 5/20 | Train Loss: 1.0109 | Train Acc: 0.5898 | Val Loss: 1.0707 | Val Acc: 0.3513
✅ Validation loss improved — best model saved.
 25% 5/20 [00:06<00:16,  1.10s/it]Epoch: 6/20 | Train Loss: 1.0176 | Train Acc: 0.4414 | Val Loss: 1.0623 | Val Acc: 0.3617
✅ Validation loss improved — best model saved.
 30% 6/2

In [10]:
#now with transfer learning
%%writefile going_modular/googlenet003.py




Writing going_modular/googlenet003.py


In [18]:
%%writefile going_modular/transfer_train.py
"""
Trains a PyTorch image classification model using device-agnostic code.
"""

import os
import torch
import data_setup, engine

from torchvision import transforms,models

from torch import nn

# Setup hyperparameters
NUM_EPOCHS = 20
BATCH_SIZE = 32
HIDDEN_UNITS = 10
LEARNING_RATE = 1e-3
WEIGHT_DECAY=1e-4
# Setup directories
train_dir = "data/pizza_steak_sushi/train"
test_dir = "data/pizza_steak_sushi/test"

# Setup target device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create transforms (`transforms` is already imported at the top of the file)
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomAffine(0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])


def main():
    """Fine-tunes a pretrained GoogLeNet on the image folders above."""
    # Create DataLoaders with help from data_setup.py
    train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
        train_dir=train_dir,
        test_dir=test_dir,
        train_transform=train_transform,
        test_transform=test_transform,
        batch_size=BATCH_SIZE
    )

    # use googlenet pretrained model
    googlenet_model = models.googlenet(weights='DEFAULT')
    # Every layer is left trainable (full fine-tuning); this loop only makes
    # that choice explicit.
    for param in googlenet_model.parameters():
        param.requires_grad = True

    # Replace the classifier head; size it from the dataset instead of a
    # hard-coded 3 so the script keeps working if the classes change.
    num_features = googlenet_model.fc.in_features
    googlenet_model.fc = nn.Linear(num_features, len(class_names))
    googlenet_model.to(device)

    # Set loss and optimizer
    loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.Adam(googlenet_model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',      # because we want to minimize validation loss
        patience=2,      # wait 2 epochs before reducing LR
        factor=0.5,      # multiply LR by 0.5 each time it plateaus
    )
    # Start training with help from engine.py
    engine.train(model=googlenet_model,
                 train_dataloader=train_dataloader,
                 test_dataloader=test_dataloader,
                 loss_fn=loss_fn,
                 optimizer=optimizer,
                 epochs=NUM_EPOCHS,
                 device=device,
                 patience=7,
                 name="googlenet",
                 scheduler=scheduler)

    # Save the model with help from utils.py
    # utils.save_model(model=model,
    #                  target_dir="models",
    #                  model_name="googlenet_003.pth")


# The DataLoaders use worker processes. On platforms that start workers with
# "spawn" (Windows, macOS) each worker re-imports this file, so the training
# code must only run when the file is executed as the main script.
if __name__ == "__main__":
    main()


Overwriting going_modular/transfer_train.py


In [19]:
!python3 going_modular/transfer_train.py


  0% 0/20 [00:00<?, ?it/s]Epoch: 1/20 | Train Loss: 0.8082 | Train Acc: 0.6328 | Val Loss: 0.6054 | Val Acc: 0.8551
✅ Validation loss improved — best model saved.
  5% 1/20 [00:02<00:56,  2.99s/it]Epoch: 2/20 | Train Loss: 0.6350 | Train Acc: 0.7812 | Val Loss: 0.6746 | Val Acc: 0.9062
⚠️ No improvement for 1 epoch(s).
 10% 2/20 [00:06<00:56,  3.14s/it]Epoch: 3/20 | Train Loss: 0.4395 | Train Acc: 0.9492 | Val Loss: 0.5795 | Val Acc: 0.8750
✅ Validation loss improved — best model saved.
 15% 3/20 [00:08<00:49,  2.94s/it]Epoch: 4/20 | Train Loss: 0.4208 | Train Acc: 0.9727 | Val Loss: 0.4536 | Val Acc: 0.9280
✅ Validation loss improved — best model saved.
 20% 4/20 [00:11<00:42,  2.67s/it]Epoch: 5/20 | Train Loss: 0.6004 | Train Acc: 0.8477 | Val Loss: 0.4900 | Val Acc: 0.9271
⚠️ No improvement for 1 epoch(s).
 25% 5/20 [00:13<00:37,  2.52s/it]Epoch: 6/20 | Train Loss: 0.5160 | Train Acc: 0.8477 | Val Loss: 0.5020 | Val Acc: 0.9072
⚠️ No improvement for 2 epoch(s).
 30% 6/20 [00:15<00:3

In [9]:
%%writefile going_modular/vision_transformer.py
import torch
from torch import nn
class PatchEmbeding(nn.Module):
    """Turns a batch of images into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal `patch_size`
    projects each patch to `embed_dim` channels; the two spatial dimensions
    are then flattened and moved in front of the channel dimension.

    Args:
        in_channels: Number of channels of the input images.
        patch_size: Side length of each square patch.
        embed_dim: Size of the embedding produced for every patch.
    """
    def __init__(self,
                 in_channels:int=3,
                 patch_size:int=16,
                 embed_dim:int=768):
        super().__init__()
        self.patch_size = patch_size
        self.proj = nn.Conv2d(in_channels=in_channels,
                              out_channels=embed_dim,
                              kernel_size=patch_size,
                              stride=patch_size,
                              padding=0)
        self.flatten = nn.Flatten(start_dim=2, end_dim=3)

    def forward(self, x):
        # Check BOTH spatial dimensions: a strided convolution silently drops
        # the remainder of a dimension that is not a multiple of the patch size.
        height, width = x.shape[-2], x.shape[-1]
        assert height % self.patch_size == 0 and width % self.patch_size == 0, "image size must be divisible by patch size"
        x = self.proj(x)      # (batch, embed_dim, H/patch, W/patch)
        x = self.flatten(x)   # (batch, embed_dim, num_patches)
        return x.permute(0, 2, 1)  # (batch, num_patches, embed_dim)

class MultiHeadAttention(nn.Module):
    """Layer-norm followed by multi-head self-attention.

    Args:
        embed_dim: Size of the token embeddings (768 for ViT-Base).
        num_heads: Number of attention heads (12 for ViT-Base).
        attn_dropout: Dropout probability passed to nn.MultiheadAttention.
    """
    def __init__(self,
                 embed_dim:int=768,
                 num_heads:int=12,
                 attn_dropout:float=0):
        super().__init__()
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.multi_head_attention = nn.MultiheadAttention(
            embed_dim,
            num_heads,
            dropout=attn_dropout,
            batch_first=True,
        )

    def forward(self, x):
        # Self-attention: the normalized input is query, key and value at once.
        normed = self.layer_norm(x)
        attended = self.multi_head_attention(normed, normed, normed,
                                             need_weights=False)
        # The module returns (output, weights); only the output is used.
        return attended[0]
class MLP(nn.Module):
    """Layer-norm followed by a two-layer feed-forward network.

    The network expands `embed_dim` to `mlp_size`, applies GELU and dropout,
    projects back to `embed_dim` and applies dropout again.

    Args:
        embed_dim: Size of the token embeddings (768 for ViT-Base).
        mlp_size: Width of the hidden layer (3072 for ViT-Base).
        dropout: Dropout probability used after both linear layers.
    """
    def __init__(self,
                 embed_dim:int=768,
                 mlp_size:int=3072,
                 dropout:float=0.1):
        super().__init__()
        self.layer_norm = nn.LayerNorm(embed_dim)
        feed_forward_layers = [
            nn.Linear(embed_dim, mlp_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_size, embed_dim),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*feed_forward_layers)

    def forward(self, x):
        return self.mlp(self.layer_norm(x))
class TransformerBlock(nn.Module):
    """One encoder block: attention and MLP, each with a residual connection.

    Args:
        embed_dim: Size of the token embeddings (768 for ViT-Base).
        num_heads: Number of attention heads (12 for ViT-Base).
        mlp_size: Hidden width of the MLP (3072 for ViT-Base).
        mlp_dropout: Dropout probability inside the MLP block.
        attn_dropout: Dropout probability inside the attention block.
    """
    def __init__(self,
                 embed_dim:int=768,
                 num_heads:int=12,
                 mlp_size:int=3072,
                 mlp_dropout:float=0.1,
                 attn_dropout:float=0):
        super().__init__()
        self.msa_block = MultiHeadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            attn_dropout=attn_dropout,
        )
        self.mlp_block = MLP(
            embed_dim=embed_dim,
            mlp_size=mlp_size,
            dropout=mlp_dropout,
        )

    def forward(self, x):
        # Residual connection around the attention sub-block...
        after_attention = self.msa_block(x) + x
        # ...and another one around the MLP sub-block.
        return self.mlp_block(after_attention) + after_attention

class Vit(nn.Module):
    """Vision Transformer classifier assembled from the blocks in this file.

    Args:
        img_size: Side length of the input images (must be divisible by patch_size).
        in_channels: Number of channels of the input images.
        patch_size: Side length of each square patch.
        num_transformer_layers: Number of stacked TransformerBlocks.
        embed_dim: Size of the token embeddings.
        mlp_size: Hidden width of the MLP inside each block.
        num_heads: Number of attention heads inside each block.
        attn_dropout: Dropout probability for the attention layers.
        mlp_dropout: Dropout probability for the MLP layers.
        embed_dropout: Dropout applied to the embedded token sequence.
        num_classes: Number of output classes.
    """
    def __init__(self,
                 img_size:int=224,
                 in_channels:int=3,
                 patch_size:int=16,
                 num_transformer_layers:int=12,
                 embed_dim:int=768,
                 mlp_size:int=3072,
                 num_heads:int=12,
                 attn_dropout:float=0,
                 mlp_dropout:float=0.1,
                 embed_dropout:float=0.1,
                 num_classes:int=1000):
        super().__init__()
        assert img_size % patch_size==0, "image size must be divisible by patch size"
        patches_per_side = img_size // patch_size
        num_patches = patches_per_side ** 2

        self.patch_embeding = PatchEmbeding(in_channels=in_channels,
                                            patch_size=patch_size,
                                            embed_dim=embed_dim)
        # Learnable class token and position embedding (one slot per patch
        # plus one for the class token).
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.position_embed = nn.Parameter(torch.randn(1, 1 + num_patches, embed_dim))
        self.embed_dropout = nn.Dropout(p=embed_dropout)

        encoder_blocks = [
            TransformerBlock(embed_dim=embed_dim,
                             num_heads=num_heads,
                             mlp_size=mlp_size,
                             mlp_dropout=mlp_dropout,
                             attn_dropout=attn_dropout)
            for _ in range(num_transformer_layers)
        ]
        self.transform_encoder = nn.Sequential(*encoder_blocks)

        self.classifier = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, num_classes),
        )

    def forward(self, x):
        tokens = self.patch_embeding(x)
        # Prepend one copy of the class token to every sequence in the batch.
        cls_tokens = self.cls_token.expand(tokens.shape[0], -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1) + self.position_embed
        tokens = self.transform_encoder(self.embed_dropout(tokens))
        # Classify from the final state of the class token (index 0).
        return self.classifier(tokens[:, 0])




Writing going_modular/vision_transformer.py


In [13]:
%%writefile going_modular/vit_train.py
"""
Trains a PyTorch image classification model using device-agnostic code.
"""

import os
import torch
import data_setup, engine, vision_transformer

from torchvision import transforms,models

from torch import nn

# Setup hyperparameters
NUM_EPOCHS = 20
BATCH_SIZE = 32
HIDDEN_UNITS = 10
LEARNING_RATE = 1e-3
WEIGHT_DECAY=1e-4
# Setup directories
train_dir = "data/pizza_steak_sushi/train"
test_dir = "data/pizza_steak_sushi/test"

# Setup target device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Create transforms (`transforms` is already imported at the top of the file)
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomAffine(0, translate=(0.1, 0.1)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])


def main():
    """Trains the Vision Transformer from vision_transformer.py from scratch."""
    # Create DataLoaders with help from data_setup.py
    train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
        train_dir=train_dir,
        test_dir=test_dir,
        train_transform=train_transform,
        test_transform=test_transform,
        batch_size=BATCH_SIZE
    )

    # use vit model
    model = vision_transformer.Vit(num_classes=len(class_names)).to(device)

    # Set loss and optimizer
    loss_fn = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',      # because we want to minimize validation loss
        patience=2,      # wait 2 epochs before reducing LR
        factor=0.5,      # multiply LR by 0.5 each time it plateaus
    )
    # Start training with help from engine.py.
    # engine.train builds the checkpoint file name as "best_<name>_<epoch>.pth",
    # so the name must not carry its own ".pth" extension.
    engine.train(model=model,
                 train_dataloader=train_dataloader,
                 test_dataloader=test_dataloader,
                 loss_fn=loss_fn,
                 optimizer=optimizer,
                 epochs=NUM_EPOCHS,
                 device=device,
                 patience=7,
                 name="vit_003",
                 scheduler=scheduler)

    # Save the model with help from utils.py
    # utils.save_model(model=model,
    #                  target_dir="models",
    #                  model_name="vit_003.pth")


# The DataLoaders use worker processes. On platforms that start workers with
# "spawn" (Windows, macOS) each worker re-imports this file, so the training
# code must only run when the file is executed as the main script.
if __name__ == "__main__":
    main()


Overwriting going_modular/vit_train.py


In [14]:
!python3 going_modular/vit_train.py

  0% 0/20 [00:00<?, ?it/s]Epoch: 1/20 | Train Loss: 3.8934 | Train Acc: 0.2852 | Val Loss: 1.3373 | Val Acc: 0.5417
✅ Validation loss improved — best model saved.
  5% 1/20 [00:10<03:22, 10.64s/it]Epoch: 2/20 | Train Loss: 1.5096 | Train Acc: 0.2852 | Val Loss: 1.1722 | Val Acc: 0.2604
✅ Validation loss improved — best model saved.
 10% 2/20 [00:20<02:59,  9.98s/it]Epoch: 3/20 | Train Loss: 1.5162 | Train Acc: 0.3008 | Val Loss: 1.1274 | Val Acc: 0.2604
✅ Validation loss improved — best model saved.
 15% 3/20 [00:29<02:43,  9.60s/it]Epoch: 4/20 | Train Loss: 1.4799 | Train Acc: 0.3047 | Val Loss: 1.2845 | Val Acc: 0.1979
⚠️ No improvement for 1 epoch(s).
 20% 4/20 [00:37<02:26,  9.19s/it]Epoch: 5/20 | Train Loss: 1.3653 | Train Acc: 0.2578 | Val Loss: 1.0514 | Val Acc: 0.5417
✅ Validation loss improved — best model saved.
 25% 5/20 [00:46<02:16,  9.12s/it]Epoch: 6/20 | Train Loss: 1.2040 | Train Acc: 0.3125 | Val Loss: 1.4341 | Val Acc: 0.1979
⚠️ No improvement for 1 epoch(s).
 30% 6/2

In [19]:
import torch
torch.cuda.is_available()

True

In [20]:
torch.cuda.get_device_name()

'Tesla T4'

In [21]:
!nvidia-smi


Fri Oct 24 19:07:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                