### VGG-A Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets, models  # For datasets and transformations
import math  # For Xavier initialization

# --- 1. Model Definition (VGG Configuration A) ---

class VGG_A(nn.Module):
    """VGG configuration "A": eight 3x3 conv layers and a three-layer classifier.

    The convolutional stack is generated from a compact layout list rather
    than written out layer by layer. An adaptive average pool brings the
    feature map to 7x7 before it is flattened for the fully connected head.

    Args:
        num_classes: Number of outputs of the last linear layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # Integers are conv output widths; "M" is a 2x2 / stride-2 max-pool.
        layout = [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"]
        stack = []
        width = 3  # RGB input
        for entry in layout:
            if entry == "M":
                stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                stack.append(nn.Conv2d(width, entry, kernel_size=3, padding=1))
                stack.append(nn.ReLU(inplace=True))
                width = entry
        self.features = nn.Sequential(*stack)

        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        hidden = 4096
        self.classifier = nn.Sequential(
            nn.Linear(width * 7 * 7, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden, num_classes),
        )

        self._initialize_weights()

    def forward(self, x):
        """Return class scores for the image batch ``x``."""
        pooled = self.avgpool(self.features(x))
        return self.classifier(torch.flatten(pooled, 1))

    def _initialize_weights(self):
        """Xavier-uniform every conv/linear weight and zero the biases."""
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.Linear)):
                continue
            nn.init.xavier_uniform_(layer.weight)
            # Conv layers may be bias-free; linear biases are always reset.
            if isinstance(layer, nn.Linear) or layer.bias is not None:
                nn.init.constant_(layer.bias, 0)



# --- 2. Data Loading and Augmentation ---

def get_data_loaders(data_dir, batch_size=256, train_scale=256, val_scale=256, use_multiscale=False):
    """
    Creates training and validation data loaders.

    Args:
      data_dir: Path to the dataset directory (expects ``train`` and ``val``
                sub-folders in ImageFolder layout).
      batch_size: Batch size for training and validation.
      train_scale:  The smaller side of the training images, S (single-scale)
                    or a tuple/list (Smin, Smax) for multi-scale.
      val_scale:    The smaller side of the validation images (fixed).
      use_multiscale: True for multi-scale training. When ``train_scale`` is a
                    (Smin, Smax) pair, every sample is resized so that its
                    smaller side is a random integer S in [Smin, Smax] and a
                    224x224 patch is then cropped. When ``train_scale`` is a
                    single number, the previous behaviour (RandomResizedCrop
                    with an area fraction in [0.5, 1.0]) is kept.

    Returns:
        train_loader, val_loader:  Data loaders for training and validation.

    Raises:
        ValueError: If a multi-scale range is given with Smin > Smax.
    """
    def tail():
        # Steps shared by both training pipelines after the spatial crop.
        return [
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet stats
        ]

    if use_multiscale and isinstance(train_scale, (tuple, list)):
        # Scale jittering over the requested range; previously the range was
        # accepted but silently ignored.
        s_min, s_max = (int(s) for s in train_scale)
        if s_min > s_max:
            raise ValueError(f"train_scale must satisfy Smin <= Smax, got ({s_min}, {s_max})")
        spatial = [
            # RandomChoice picks one Resize per sample, i.e. a random S.
            transforms.RandomChoice([transforms.Resize(s) for s in range(s_min, s_max + 1)]),
            transforms.RandomCrop(224),
        ]
    elif use_multiscale:
        # No explicit range supplied: keep the original approximation.
        spatial = [transforms.RandomResizedCrop(224, scale=(0.5, 1.0))]
    else:
        # Single-scale training: fixed smaller side, random 224x224 crop.
        spatial = [transforms.Resize(train_scale), transforms.RandomCrop(224)]

    train_transform = transforms.Compose(spatial + tail())

    val_transform = transforms.Compose([
        transforms.Resize(val_scale),  # Resize so smaller side matches val_scale
        transforms.CenterCrop(224),    # Center crop to 224x224
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Load datasets
    train_dataset = datasets.ImageFolder(root=f'{data_dir}/train', transform=train_transform)
    val_dataset = datasets.ImageFolder(root=f'{data_dir}/val', transform=val_transform)

    # Create data loaders
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    return train_loader, val_loader


# --- 3. Training Loop ---

def train_model(model, train_loader, val_loader, num_epochs=74, initial_lr=0.01, device="cuda"):
    """Trains the VGG model.

    Uses SGD (momentum 0.9, weight decay 5e-4) with cross-entropy loss and a
    ReduceLROnPlateau scheduler that divides the learning rate by 10 when the
    validation accuracy has not improved for more than 5 epochs. Whenever the
    validation accuracy improves, the weights are written to
    ``best_vgg_a.pth`` in the current working directory.

    Args:
        model: The VGG model to train.
        train_loader: Training data loader.
        val_loader: Validation data loader.
        num_epochs: Number of training epochs.
        initial_lr: Initial learning rate.
        device:  Device to train on ("cuda" or "cpu").

    Returns:
        model: The trained model (weights of the last epoch, not the best one).
    """

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=initial_lr, momentum=0.9, weight_decay=5e-4)
    # `verbose=True` is deliberately not passed: the argument is deprecated and
    # rejected by recent PyTorch releases. LR changes are logged below instead.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5)

    best_val_acc = 0.0  # Best validation accuracy seen so far (percent)

    for epoch in range(num_epochs):
        # --- Training phase ---
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean, so weight it by the batch size.
            running_loss += loss.item() * inputs.size(0)

            predicted = outputs.detach().argmax(dim=1)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            if (i + 1) % 100 == 0:  # Print every 100 mini-batches
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}')

        epoch_loss = running_loss / len(train_loader.dataset)
        train_acc = 100 * correct_train / total_train
        print(f'Epoch [{epoch + 1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Train Acc: {train_acc:.2f}%')

        # --- Validation phase ---
        model.eval()
        correct_val = 0
        total_val = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                predicted = model(inputs).argmax(dim=1)
                total_val += labels.size(0)
                correct_val += (predicted == labels).sum().item()

        val_acc = 100 * correct_val / total_val
        print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Accuracy: {val_acc:.2f}%')

        # The plateau scheduler needs the monitored metric; report any LR change.
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_acc)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after != lr_before:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Learning rate reduced: {lr_before:.2e} -> {lr_after:.2e}')

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_vgg_a.pth')

    print('Finished Training')
    return model



# --- 4. Main Execution ---

# Script entry point: builds the loaders, creates VGG_A and trains it.
if __name__ == '__main__':
    # --- Hyperparameters & Settings ---
    data_dir = 'path/to/your/dataset'  # Placeholder; must contain train/ and val/ sub-folders (ImageFolder layout)
    num_classes = 1000  # Size of the classifier output (ImageNet = 1000)
    batch_size = 256
    num_epochs = 74 # Epoch count used by this script (value taken from the VGG paper)
    initial_lr = 0.01 # Starting learning rate for SGD
    device = "cuda" if torch.cuda.is_available() else "cpu" # Fall back to CPU when no GPU is visible
    print(f"Using device: {device}")


    # --- Data Loaders ---
    # Single-scale training: smaller image side resized to 256, then a random 224x224 crop
    train_loader, val_loader = get_data_loaders(data_dir, batch_size=batch_size, train_scale=256, val_scale=256)

    # Alternative (disabled): multi-scale training with S in [256, 512]
    # train_loader, val_loader = get_data_loaders(data_dir, batch_size=batch_size, train_scale=(256, 512), val_scale = 256, use_multiscale=True)


    # --- Create and Train the Model ---
    model = VGG_A(num_classes=num_classes)
    # train_model moves the model to `device` itself and writes the best weights to best_vgg_a.pth
    trained_model = train_model(model, train_loader, val_loader, num_epochs=num_epochs, initial_lr=initial_lr, device=device)
    print("Training complete!")

### VGG-A LRN Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
import time

# --- 1. Model Definition (VGG A-LRN) ---

class VGGA_LRN(nn.Module):
    """VGG configuration "A-LRN": configuration A plus one LRN layer.

    A LocalResponseNorm layer follows the first conv/ReLU pair; the rest of
    the convolutional stack matches configuration A. An adaptive average pool
    brings the feature map to 7x7 before the classifier, so inputs other than
    224x224 no longer fail with a shape mismatch. For 224x224 inputs the
    feature map is already 7x7 and the pooling is an identity. The pooling
    layer has no parameters, so existing checkpoints still load.

    Args:
        num_classes: Number of outputs of the last linear layer.
    """

    def __init__(self, num_classes=1000):
        super(VGGA_LRN, self).__init__()

        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2),  # LRN Layer
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Guarantees the 7x7 spatial size the first linear layer expects.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

        self._initialize_weights()  # Custom weight initialization

    def forward(self, x):
        """Return class scores for the image batch ``x``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # Flatten before FC layers
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Kaiming-normal conv weights, N(0, 0.01) linear weights, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)


# --- 2. Data Loading and Augmentation ---

def get_data_loaders(data_dir, batch_size, train_scale=256, val_scale=256,
                    multi_scale_training=False, s_min=256, s_max=512):
    """Create the training and validation loaders.

    Args:
        data_dir: Dataset root containing ``train`` and ``val`` sub-folders in
            ImageFolder layout.
        batch_size: Batch size for both loaders.
        train_scale: Smaller image side used for single-scale training.
        val_scale: Smaller image side used for validation.
        multi_scale_training: If True, each training sample is resized so its
            smaller side is a random integer S in [s_min, s_max] before a
            224x224 crop is taken (scale jittering).
        s_min: Lower bound of S; must be at least 224 so the crop fits.
        s_max: Upper bound of S.

    Returns:
        ``(train_loader, val_loader)``.

    Raises:
        ValueError: If multi-scale training is requested with an invalid
            ``s_min``/``s_max`` range.
    """
    # Common transformations for both training and validation
    common_transforms = [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet mean/std
    ]

    if multi_scale_training:
        s_min, s_max = int(s_min), int(s_max)
        if not 224 <= s_min <= s_max:
            raise ValueError(f"expected 224 <= s_min <= s_max, got s_min={s_min}, s_max={s_max}")
        # The previous RandomResizedCrop(scale=(s_min/val_scale, s_max/val_scale))
        # asked for crop areas of 1x-2x the image, which is not a valid area
        # fraction. Pick a random target side per sample instead.
        resize = transforms.RandomChoice(
            [transforms.Resize(side) for side in range(s_min, s_max + 1)]
        )
    else:
        # Single Scale Training (Fixed S)
        resize = transforms.Resize(train_scale)

    train_transform = transforms.Compose([
        resize,
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        *common_transforms
    ])

    val_transform = transforms.Compose([
        transforms.Resize(val_scale),
        transforms.CenterCrop(224),
        *common_transforms
    ])

    # Load datasets
    train_dataset = datasets.ImageFolder(root=data_dir + '/train', transform=train_transform)
    val_dataset = datasets.ImageFolder(root=data_dir + '/val', transform=val_transform)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    return train_loader, val_loader

# --- 3. Training Function ---

def train(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device):
    """Run the train/validate loop and checkpoint the best weights.

    After every epoch the validation accuracy (a fraction in [0, 1]) is passed
    to ``scheduler.step``, so the scheduler must accept a metric, as
    ``ReduceLROnPlateau`` does. Whenever the validation accuracy improves, the
    weights are saved to ``best_model_vgg_a_lrn.pth`` in the working directory.

    Args:
        model: Network to train; expected to already live on ``device``.
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss returning the batch-mean loss.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: Metric-driven LR scheduler.
        num_epochs: Number of epochs to run.
        device: Device the batches are moved to.

    Returns:
        The trained model (last-epoch weights), matching the other training
        helpers in this notebook. Earlier versions returned ``None``.
    """
    best_val_acc = 0.0

    for epoch in range(num_epochs):
        start_time = time.time()

        # --- Training Phase ---
        model.train()
        running_loss = 0.0
        running_corrects = 0  # plain int, so it also works for an empty loader

        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            preds = outputs.argmax(dim=1)
            # The loss is a batch mean; weight by batch size for a dataset mean.
            running_loss += loss.item() * inputs.size(0)
            running_corrects += (preds == labels).sum().item()

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = running_corrects / len(train_loader.dataset)

        # --- Validation Phase ---
        model.eval()
        val_running_loss = 0.0
        val_running_corrects = 0

        with torch.no_grad():
            for val_inputs, val_labels in val_loader:
                val_inputs = val_inputs.to(device)
                val_labels = val_labels.to(device)

                val_outputs = model(val_inputs)
                val_loss = criterion(val_outputs, val_labels)

                val_preds = val_outputs.argmax(dim=1)
                val_running_loss += val_loss.item() * val_inputs.size(0)
                val_running_corrects += (val_preds == val_labels).sum().item()

        val_epoch_loss = val_running_loss / len(val_loader.dataset)
        val_epoch_acc = val_running_corrects / len(val_loader.dataset)

        # --- Learning Rate Scheduler ---
        # Metric-driven step: the scheduler receives the validation accuracy.
        scheduler.step(val_epoch_acc)

        epoch_duration = time.time() - start_time
        print(f"Epoch {epoch+1}/{num_epochs} - Time: {epoch_duration:.0f}s")
        print(f"  Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")
        print(f"  Val Loss: {val_epoch_loss:.4f} Acc: {val_epoch_acc:.4f}")

        # --- Save Best Model ---
        if val_epoch_acc > best_val_acc:
            best_val_acc = val_epoch_acc
            torch.save(model.state_dict(), 'best_model_vgg_a_lrn.pth')

    return model


# --- 4. Main Training Loop ---

# Script entry point: trains VGGA_LRN with the settings below.
if __name__ == '__main__':
    # --- Hyperparameters ---
    num_classes = 1000  # ImageNet has 1000 classes
    batch_size = 256   # VGG used 256
    learning_rate = 0.01 # VGG used 0.01 initially
    num_epochs = 74  # VGG trained for about 74 epochs (370K iterations)
    momentum = 0.9 # Momentum value
    weight_decay = 5e-4  # L2 regularization weight decay

    # --- Device Configuration ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Data Loaders ---
    data_dir = 'path/to/imagenet/data' # Replace with your ImageNet data path!
    train_loader, val_loader = get_data_loaders(data_dir, batch_size)

    # --- Model Instance ---
    model = VGGA_LRN(num_classes=num_classes).to(device)

    # --- Loss Function ---
    criterion = nn.CrossEntropyLoss()

    # --- Optimizer ---
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

    # --- Learning Rate Scheduler (ReduceLROnPlateau) ---
    # VGG decreased the learning rate by a factor of 10 when validation accuracy
    # stopped improving. mode='max' because the monitored metric (accuracy)
    # should grow; patience=5 tolerates 5 epochs without improvement.
    # `verbose=True` is not passed: it is deprecated and rejected by recent
    # PyTorch releases.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5)

    # --- Train the Model ---
    train(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device)

    print("Training completed.")

### VGG-B Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import time
import copy

# --- 1. Model Definition (VGG-B Configuration) ---

class VGG_B(nn.Module):
    """VGG configuration "B": five blocks of two 3x3 convs, then three FC layers.

    The convolutional stack is assembled from a layout list. An adaptive
    average pool brings the feature map to 7x7 before the classifier.

    Args:
        num_classes: Number of outputs of the last linear layer.
    """

    def __init__(self, num_classes=1000):
        super().__init__()

        # Integers are conv output widths; "M" closes a block with a max-pool.
        layout = [64, 64, "M", 128, 128, "M", 256, 256, "M",
                  512, 512, "M", 512, 512, "M"]
        stack = []
        width = 3  # RGB input
        for entry in layout:
            if entry == "M":
                stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                stack.append(nn.Conv2d(width, entry, kernel_size=3, padding=1))
                stack.append(nn.ReLU(inplace=True))
                width = entry
        self.features = nn.Sequential(*stack)

        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        hidden = 4096
        self.classifier = nn.Sequential(
            nn.Linear(width * 7 * 7, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden, hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(hidden, num_classes),
        )

        self._initialize_weights()

    def forward(self, x):
        """Return class scores for the image batch ``x``."""
        pooled = self.avgpool(self.features(x))
        return self.classifier(torch.flatten(pooled, 1))

    def _initialize_weights(self):
        """Kaiming-normal conv weights, Xavier-normal linear weights, zero biases."""
        for layer in self.modules():
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight)
                nn.init.constant_(layer.bias, 0)
                continue
            if not isinstance(layer, nn.Conv2d):
                continue
            nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)


# --- 2. Data Loading and Augmentation ---
# ImageNet mean and std are standard, even if not strictly from *your* training set.
# The goal of normalization is to center your data similarly to how ImageNet was centered.

def get_data_loaders(data_dir, batch_size=256, train_scale=256, val_scale=256, num_workers=4):
    """Build two training loaders (single- and multi-scale) and a validation loader.

    Args:
        data_dir: Dataset root containing ``train`` and ``val`` sub-folders in
            ImageFolder layout.
        batch_size: Batch size used by all three loaders.
        train_scale: Smaller image side for the single-scale training pipeline.
        val_scale: Smaller image side for the validation pipeline.
        num_workers: Worker processes per loader.

    Returns:
        ``(train_loader_single, train_loader_multi, val_loader)``. Both
        training loaders read the same ``train`` folder and differ only in
        their augmentation.
    """
    def pipeline(*image_ops):
        # Every pipeline ends with tensor conversion + ImageNet normalization.
        return transforms.Compose([
            *image_ops,
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def folder(split, transform):
        return datasets.ImageFolder(root=data_dir + '/' + split, transform=transform)

    def loader(dataset, shuffle):
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                          num_workers=num_workers, pin_memory=True)

    # Single-scale: fixed smaller side, random 224x224 patch, random flip.
    single_tf = pipeline(
        transforms.Resize(train_scale),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
    )
    # Multi-scale: random-area crop resized to 224x224, random flip.
    multi_tf = pipeline(
        transforms.RandomResizedCrop(224, scale=(0.25, 1.0)),
        transforms.RandomHorizontalFlip(),
    )
    # Validation: deterministic resize + center crop.
    val_tf = pipeline(
        transforms.Resize(val_scale),
        transforms.CenterCrop(224),
    )

    single_ds = folder('train', single_tf)
    multi_ds = folder('train', multi_tf)
    val_ds = folder('val', val_tf)

    return loader(single_ds, True), loader(multi_ds, True), loader(val_ds, False)

# --- 3. Training Loop ---

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=74, device='cuda'):
    """Train ``model`` and return it with the best validation weights loaded.

    Each epoch runs a training pass followed by a validation pass. The
    scheduler is stepped once per epoch, after validation: a
    ``ReduceLROnPlateau`` scheduler receives the validation accuracy (a
    fraction in [0, 1]), any other scheduler is stepped without arguments.
    Previously ``scheduler.step()`` was always called without a metric, which
    raises ``TypeError`` for ``ReduceLROnPlateau``.

    Args:
        model: Network to train; expected to already live on ``device``.
        train_loader: Loader yielding ``(inputs, labels)`` training batches.
        val_loader: Loader yielding ``(inputs, labels)`` validation batches.
        criterion: Loss returning the batch-mean loss.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: LR scheduler, or ``None`` to keep the learning rate fixed.
        num_epochs: Number of epochs to run.
        device: Device the batches are moved to.

    Returns:
        ``model`` with the weights of the best validation epoch loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    needs_metric = isinstance(scheduler, optim.lr_scheduler.ReduceLROnPlateau)

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
                dataloader = train_loader
            else:
                model.eval()
                dataloader = val_loader

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                if is_train:
                    optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean; weight by batch size for a dataset mean.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels).item()

            epoch_loss = running_loss / len(dataloader.dataset)
            epoch_acc = running_corrects / len(dataloader.dataset)

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            if not is_train:
                # Keep a copy of the best weights seen so far.
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

                # Step once per epoch, after the metric is known.
                if scheduler is not None:
                    if needs_metric:
                        scheduler.step(epoch_acc)
                    else:
                        scheduler.step()

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model



# --- 4. Main Execution ---

# Script entry point: trains VGG_B with the single-scale loader.
if __name__ == '__main__':
    # --- Hyperparameters (following the paper as closely as possible) ---
    num_classes = 1000  # ImageNet classes
    batch_size = 256
    initial_lr = 0.01  # 10^-2
    momentum = 0.9
    weight_decay = 5e-4  # 5 * 10^-4
    num_epochs = 74      # Total training epochs (VGG trained for ~74 epochs)
    train_scale = 256    # Single-scale training: S=256
    val_scale = 256  # Validation scale (Q) - often same as training scale in single-scale

    # --- Device ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Data Loaders ---
    # Replace 'path/to/imagenet' with the actual path to your ImageNet dataset.
    #  Make sure your data is organized: path/to/imagenet/train, path/to/imagenet/val
    data_dir = 'path/to/imagenet'
    train_loader_single, train_loader_multi, val_loader = get_data_loaders(data_dir, batch_size, train_scale, val_scale)

    # --- Model Initialization ---
    model = VGG_B(num_classes=num_classes).to(device)
    #  Pretraining (optional, but recommended for deeper networks)
    #   You *could* load weights from a pretrained VGG-A model here if you have it.
    #   vgg_a = VGG_A(num_classes=1000)
    #   vgg_a.load_state_dict(torch.load('path/to/vgg_a_weights.pth'))
    #   # Initialize VGG-B with some layers from VGG-A
    #   model.features[:10].load_state_dict(vgg_a.features[:10].state_dict())
    #   model.classifier.load_state_dict(vgg_a.classifier.state_dict())

    # --- Loss Function ---
    criterion = nn.CrossEntropyLoss()

    # --- Optimizer ---
    # Use SGD with momentum and weight decay, as in the paper
    optimizer = optim.SGD(model.parameters(), lr=initial_lr, momentum=momentum, weight_decay=weight_decay)

    # --- Learning Rate Scheduler ---
    # ReduceLROnPlateau: divide the LR by 10 when the monitored metric stops
    # improving. mode='max' because the metric (validation accuracy) should
    # grow; patience=5 tolerates 5 epochs without improvement.
    # `verbose=True` is not passed: it is deprecated and rejected by recent
    # PyTorch releases.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5)
    # Alternative: StepLR, decaying the LR by 0.1 every 25 epochs (with 74
    # epochs that gives two decays, after epochs 25 and 50).
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=25, gamma=0.1)

    # --- Training ---
    # Train the model (using single-scale training loader)
    model = train_model(model, train_loader_single, val_loader, criterion, optimizer, scheduler, num_epochs, device)

    # --- Save the trained model ---
    torch.save(model.state_dict(), 'vgg_b_trained.pth')

    # For multi-scale training, use train_loader_multi instead:
    # model = train_model(model, train_loader_multi, val_loader, criterion, optimizer, scheduler, num_epochs, device)
    # torch.save(model.state_dict(), 'vgg_b_trained_multi.pth')

### VGG-C Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import math

# --- 1. Model Definition (Configuration C) ---

class VGG_C(nn.Module):
    """VGG configuration "C": 13 conv layers (three of them 1x1) plus 3 FC layers.

    Layout of the convolutional stack (each conv is followed by ReLU, each
    block by a 2x2 max-pool):

        block 1: conv3-64,  conv3-64
        block 2: conv3-128, conv3-128
        block 3: conv3-256, conv3-256, conv1-256
        block 4: conv3-512, conv3-512, conv1-512
        block 5: conv3-512, conv3-512, conv1-512

    Blocks 1 and 2 previously had a single conv each, which gave an 11-conv
    network rather than configuration C. Checkpoints saved from that layout
    do not match this one. An adaptive average pool brings the feature map to
    7x7 before the classifier, so inputs other than 224x224 are accepted.

    Args:
        num_classes: Number of outputs of the last linear layer.
    """

    def __init__(self, num_classes=1000):
        super(VGG_C, self).__init__()

        self.features = self._build_features()

        # Guarantees the 7x7 spatial size the first linear layer expects.
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

        self._initialize_weights()  # Initialize weights

    @staticmethod
    def _build_features():
        """Return the configuration-C convolutional stack as ``nn.Sequential``."""
        # (out_channels, kernel_size) per conv; "M" is a 2x2 / stride-2 max-pool.
        layout = [
            (64, 3), (64, 3), "M",
            (128, 3), (128, 3), "M",
            (256, 3), (256, 3), (256, 1), "M",
            (512, 3), (512, 3), (512, 1), "M",
            (512, 3), (512, 3), (512, 1), "M",
        ]
        layers = []
        in_channels = 3
        for entry in layout:
            if entry == "M":
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            out_channels, kernel = entry
            # padding = kernel // 2 keeps the spatial size (1 for 3x3, 0 for 1x1).
            layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=kernel, padding=kernel // 2))
            layers.append(nn.ReLU(inplace=True))
            in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class scores for the image batch ``x``."""
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # Flatten the feature maps
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Kaiming-normal conv weights, Xavier-normal linear weights, zero biases."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.constant_(m.bias, 0)


# --- 2. Data Augmentation and Loading ---

def get_data_loaders(train_batch_size=256, test_batch_size=64, s_train=256, multi_scale=False, s_min=256, s_max=512):
    """
    Builds CIFAR-10 train/validation/test loaders with VGG-style augmentation.

    Training images are rescaled so that their shorter side equals the
    training scale S, then a random 224x224 crop is taken, randomly flipped
    horizontally and colour-jittered. With ``multi_scale=False`` S is fixed to
    ``s_train``; with ``multi_scale=True`` S is drawn uniformly per image from
    the integers in ``[s_min, s_max]`` (scale jittering). Validation and test
    images are resized to 256, centre-cropped to 224 and never augmented.

    The CIFAR-10 training set is split 80/20 at random into training and
    validation subsets. Both subsets index the same underlying images, but the
    validation subset reads them through the deterministic test transform.

    Args:
        train_batch_size: Batch size for training.
        test_batch_size: Batch size for validation and testing.
        s_train: Single training scale (used if multi_scale=False).
        multi_scale: Whether to use multi-scale training.
        s_min: Minimum scale for multi-scale training.
        s_max: Maximum scale for multi-scale training.

    Returns:
        train_loader: DataLoader for the training subset (shuffled).
        val_loader: DataLoader for the validation subset.
        test_loader: DataLoader for the CIFAR-10 test set.

    Raises:
        ValueError: If ``multi_scale`` is True and ``s_min > s_max``.

    Note:
        The 224x224 random crop needs the rescaled image to be at least
        224 pixels on its shorter side, so scales below 224 will fail.
        The dataset is downloaded to ``./data`` if it is not already there.
    """
    # ImageNet channel statistics, shared by every pipeline.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    if multi_scale:
        if s_min > s_max:
            raise ValueError(f"s_min ({s_min}) must not exceed s_max ({s_max})")
        # One Resize per candidate scale; RandomChoice picks one per image.
        # Built from transform objects (no lambda) so that DataLoader worker
        # processes can pickle the pipeline.
        rescale = transforms.RandomChoice([transforms.Resize(s) for s in range(s_min, s_max + 1)])
    else:
        rescale = transforms.Resize(s_train)

    train_transform = transforms.Compose([
        rescale,
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),  # Random RGB color shift
        transforms.ToTensor(),
        normalize,
    ])

    test_transform = transforms.Compose([
        transforms.Resize(256),  # Shorter side to 256 (as in VGG paper for testing)
        transforms.CenterCrop(224),  # Center crop to 224x224
        transforms.ToTensor(),
        normalize,
    ])

    # Use CIFAR-10 as an example dataset (easily downloadable).  Replace with your dataset.
    # Two views of the same training images: augmented for training, deterministic for validation.
    train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
    val_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=test_transform)
    test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

    # Split the training images into training and validation (80/20 split).
    # The indices are drawn once and applied to both views, so the subsets are
    # disjoint and the validation subset keeps the un-augmented transform.
    num_images = len(train_dataset)
    train_size = int(0.8 * num_images)
    shuffled = torch.randperm(num_images).tolist()
    train_dataset = torch.utils.data.Subset(train_dataset, shuffled[:train_size])
    val_dataset = torch.utils.data.Subset(val_dataset, shuffled[train_size:])

    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=test_batch_size, shuffle=False, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, num_workers=4, pin_memory=True)

    return train_loader, val_loader, test_loader

# --- 3. Training Loop ---

def train(model, train_loader, val_loader, optimizer, criterion, num_epochs=74, device='cuda', initial_lr=0.01):
    """
    Trains a model with per-epoch validation, LR decay and checkpointing.

    Each epoch runs one pass over ``train_loader`` in training mode and one
    pass over ``val_loader`` in evaluation mode, prints the mean per-batch
    losses and the accuracies, and steps a ``ReduceLROnPlateau`` scheduler
    (factor 0.1, patience 10) on the validation loss. Whenever the validation
    loss improves on the best value seen so far, the model's ``state_dict``
    is written to ``best_model.pth`` in the current working directory.

    Args:
        model: The model to train; it must already be on ``device``.
        train_loader: Iterable of ``(inputs, labels)`` training batches that
            supports ``len()``.
        val_loader: Iterable of ``(inputs, labels)`` validation batches that
            supports ``len()``.
        optimizer: The optimizer (e.g., SGD with momentum).
        criterion: The loss function (e.g., CrossEntropyLoss).
        num_epochs: Number of training epochs.
        device: 'cuda' or 'cpu'; batches are moved there before use.
        initial_lr: Unused. Kept for backward compatibility; the learning
            rate actually applied is the one configured on ``optimizer``.

    Returns:
        None. The model is updated in place.
    """
    # The scheduler's `verbose` flag is deprecated in recent PyTorch releases,
    # so learning-rate changes are reported explicitly after each step below.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # --- Training pass ---
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            predicted = outputs.detach().argmax(dim=1)  # Highest-scoring class per sample
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

        train_loss = running_loss / len(train_loader)
        train_accuracy = 100 * correct_train / total_train

        # --- Validation pass ---
        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0
        with torch.no_grad():  # No gradients are needed for evaluation
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()

                predicted = outputs.argmax(dim=1)
                total_val += labels.size(0)
                correct_val += (predicted == labels).sum().item()
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct_val / total_val

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%')

        # Update the learning rate based on validation loss and report any change.
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(val_loss)
        lrs_after = [group['lr'] for group in optimizer.param_groups]
        if lrs_after != lrs_before:
            print(f"Reducing learning rate: {lrs_before} -> {lrs_after}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')  # Checkpoint the best weights so far
            print("saved the best model")


# --- 4. Main Execution ---

# Script entry point: trains VGG_C on the loaders returned by get_data_loaders().
if __name__ == '__main__':
    # Prefer the GPU when one is available; everything below follows this choice.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # 1. Create the model with a 10-way output layer (CIFAR-10 has 10 classes)
    #    and move it to the selected device.
    model = VGG_C(num_classes=10).to(device)

    # 2. Get data loaders (with data augmentation). multi_scale=True selects
    #    the multi-scale branch of the training transform. Note that
    #    test_loader is not used further down in this script.
    train_loader, val_loader, test_loader = get_data_loaders(multi_scale=True)


    # 3. Define the loss function and the optimizer: cross-entropy for
    #    multi-class classification, SGD with momentum 0.9 and weight decay 5e-4.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)

    # 4. Train the model for 74 epochs (as in the VGG paper). train() saves the
    #    weights with the lowest validation loss to 'best_model.pth'.
    train(model, train_loader, val_loader, optimizer, criterion, num_epochs=74, device=device)

    # 5. (Optional) Load the best saved model and evaluate on the test set.
    # best_model = VGG_C(num_classes=10).to(device)
    # best_model.load_state_dict(torch.load('best_model.pth'))
    # best_model.eval()
    # #  ... evaluation code (similar to validation loop) ...

    print("Finished Training")

### VGG-E Architecture

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import random

# --- 1. Model Definition (Configuration E - VGG19) ---

class VGG19(nn.Module):
    """VGG configuration E: sixteen 3x3 convolutions followed by three FC layers.

    The convolutional stack is organised in five blocks (2, 2, 4, 4 and 4
    convolutions), each closed by a 2x2 max-pool. An adaptive average pool
    brings the feature maps to 7x7 before the classifier, so input sizes
    other than 224x224 are accepted as well.

    Args:
        num_classes: Size of the final classification layer.
        init_weights: If True, initialise the weights explicitly (see
            ``_initialize_weights``); if False, PyTorch's default layer
            initialisation is kept and ``pretrain_init`` is ignored.
        pretrain_init: Optional path to a checkpoint of another network.
            When given (and ``init_weights`` is True), every parameter whose
            state-dict key and shape match is copied on top of the random
            initialisation.
    """

    def __init__(self, num_classes=1000, init_weights=True, pretrain_init=None):
        super(VGG19, self).__init__()

        self.features = nn.Sequential(
            # Block 1
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 4
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Block 5
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))  # Adaptive pooling to handle variable sizes

        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),  # Dropout after the first two FC layers
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

        if init_weights:
            # Always start from the random scheme so that layers the checkpoint
            # does not cover are initialised the same way as in a from-scratch
            # run, then overlay whatever the checkpoint provides.
            self._initialize_weights()
            if pretrain_init:
                self._initialize_weights_from_pretrain(pretrain_init)

    def forward(self, x):
        """Return class scores for a batch of images.

        Args:
            x: Input batch with 3 channels in dimension 1 (required by the
                first convolution).

        Returns:
            Tensor with ``num_classes`` scores per sample.
        """
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # Flatten for FC layers
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Randomly initialise every conv and linear layer in place.

        Convolution weights use Xavier/Glorot-normal initialisation, linear
        weights are drawn from a normal distribution with mean 0 and std 0.01,
        and all biases that exist are set to zero.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)  # Mean 0, std 0.01
            else:
                continue
            # ``bias`` is None for layers built with ``bias=False``.
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def _initialize_weights_from_pretrain(self, pretrained_model_path):
        """Copy compatible parameters from a checkpoint of another network.

        The checkpoint may be either a pickled ``nn.Module`` (saved with
        ``torch.save(model, path)``) or a state dict (saved with
        ``torch.save(model.state_dict(), path)``). Entries are copied only
        when both their key and their shape match a parameter or buffer of
        this model; everything else keeps its current value.

        NOTE(review): matching is by state-dict key, so a donor whose
        ``nn.Sequential`` indices differ from this model's (e.g. a shallower
        configuration) only transfers the layers whose keys happen to
        coincide - confirm this is the intended mapping.

        Args:
            pretrained_model_path: Path of the checkpoint file.

        Raises:
            TypeError: If the file contains neither a module nor a dict.
        """
        # SECURITY: ``weights_only=False`` is needed to read whole-module
        # pickles, but unpickling can execute arbitrary code. Only load
        # checkpoints from sources you trust.
        # ``map_location='cpu'`` lets checkpoints saved on a GPU load on
        # CPU-only machines; load_state_dict copies into the existing tensors.
        checkpoint = torch.load(pretrained_model_path, map_location='cpu', weights_only=False)
        if isinstance(checkpoint, nn.Module):
            pretrained_dict = checkpoint.state_dict()
        elif isinstance(checkpoint, dict):
            pretrained_dict = checkpoint
        else:
            raise TypeError(
                f"Expected an nn.Module or a state dict in {pretrained_model_path!r}, "
                f"got {type(checkpoint).__name__}"
            )

        model_dict = self.state_dict()
        # Keep only the entries this model can accept unchanged.
        compatible = {
            k: v for k, v in pretrained_dict.items()
            if k in model_dict and model_dict[k].shape == v.shape
        }
        # strict=False: keys absent from the checkpoint keep their current values.
        self.load_state_dict(compatible, strict=False)


# --- 2. Data Loading and Augmentation ---

class MultiScaleImageDataset(Dataset):  # Custom dataset for multi-scale training
    """Image-file dataset with VGG-style scale jittering for training.

    Every item is decoded from disk as a 3-channel float tensor in [0, 1].
    In training mode the image is rescaled isotropically so that its shorter
    side equals a scale S drawn uniformly from ``[min_scale, max_scale]``,
    then a random 224x224 crop, a random horizontal flip and colour jitter are
    applied before normalisation. In evaluation mode the image is only
    normalised; it is neither resized nor cropped, so batching evaluation
    items requires the files to share one size.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of labels, parallel to ``image_paths``.
        min_scale: Smallest training scale S (shorter side, in pixels).
        max_scale: Largest training scale S (shorter side, in pixels).
        train: Selects the training (True) or evaluation (False) pipeline.
    """

    def __init__(self, image_paths, labels, min_scale=256, max_scale=512, train=True):
        super().__init__()
        self.image_paths = image_paths
        self.labels = labels
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.train = train

        # __getitem__ already yields float tensors, so no ToTensor() here:
        # ToTensor only accepts PIL images / ndarrays and rejects tensors.
        self.base_transform = transforms.Compose([
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
            # ^^^ The VGG paper subtracts the mean RGB, calculated from the entire training set.
            # ^^^ Normalizing with ImageNet stats is standard practice and easier.
        ])

        self.train_transform = transforms.Compose([
            # Fixed-size crop from the rescaled image. A RandomResizedCrop here
            # would re-scale the crop and cancel the scale jitter applied in
            # __getitem__. pad_if_needed covers scales below 224.
            transforms.RandomCrop(224, pad_if_needed=True),
            transforms.RandomHorizontalFlip(),  # Random horizontal flip
            transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),  # Color jitter
            # ^^^ The VGG paper used random RGB shifts. ColorJitter is a common approximation.
            self.base_transform  # Apply base transformations (Normalize)
        ])

        self.test_transform = self.base_transform  # No extra augmentation at test time

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, transform and return the ``(image, label)`` pair at ``idx``.

        Args:
            idx: Index into ``image_paths`` / ``labels``.

        Returns:
            Tuple of the normalised image tensor and its label (returned as
            stored in ``labels``).
        """
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        # Force 3 channels so grayscale / RGBA files work with the RGB Normalize.
        img = torchvision.io.read_image(img_path, mode=torchvision.io.ImageReadMode.RGB)
        img = img / 255.0  # Convert to [0, 1] range

        if self.train:
            # Multi-scale training: random S from [min_scale, max_scale].
            s = random.randint(self.min_scale, self.max_scale)
            # An int size rescales isotropically: the shorter side becomes s.
            img = transforms.functional.resize(img, s)
            img = self.train_transform(img)
        else:
            img = self.test_transform(img)

        return img, label




# --- 3. Training Setup ---

def train_vgg19(train_dataset, val_dataset, pretrained_model_path=None, num_epochs=74, batch_size=256, learning_rate=0.01, num_classes=1000):
    """Trains a VGG19 model.

    Builds a VGG19 (optionally initialised from a checkpoint), then trains it
    with SGD (momentum 0.9, weight decay 5e-4) and cross-entropy loss. After
    every epoch the model is evaluated on ``val_dataset`` and a
    ``ReduceLROnPlateau`` scheduler (factor 0.1, patience 5) is stepped on the
    mean validation loss. Training runs on CUDA when available, else on CPU.

    Args:
        train_dataset: Training dataset.
        val_dataset: Validation dataset.
        pretrained_model_path: Path to a pretrained VGG-A (or shallower) model, or None for random initialization.
        num_epochs: Number of training epochs.
        batch_size: Batch size (used for both loaders).
        learning_rate: Initial learning rate.
        num_classes: Number of output classes of the model.

    Returns:
        Trained VGG19 model.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # A falsy path (None or "") means random initialisation.
    model = VGG19(num_classes=num_classes, init_weights=True, pretrain_init=pretrained_model_path or None)
    if pretrained_model_path:
        print(f"Training with pre-trained initialization from {pretrained_model_path}")
    else:
        print("Training with random initialization")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=5e-4)
    # Learning-rate scheduler. Its `verbose` flag is deprecated in recent
    # PyTorch releases, so changes are reported explicitly after each step.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5)

    # num_workers and pin_memory speed up loading and host-to-device copies.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    for epoch in range(num_epochs):
        # --- Training pass ---
        model.train()
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if (i + 1) % 100 == 0:  # Report the mean loss of the last 100 mini-batches
                print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{i + 1}/{len(train_loader)}], Loss: {running_loss / 100:.4f}")
                running_loss = 0.0

        # --- Validation pass ---
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total
        print(f"Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

        # Step the learning rate scheduler based on validation loss and report any change.
        lrs_before = [group['lr'] for group in optimizer.param_groups]
        scheduler.step(val_loss)
        lrs_after = [group['lr'] for group in optimizer.param_groups]
        if lrs_after != lrs_before:
            print(f"Reducing learning rate: {lrs_before} -> {lrs_after}")

    return model


# --- 4. Example Usage ---

# Script entry point: end-to-end demonstration of train_vgg19() on generated dummy images.
if __name__ == '__main__':
    # Dummy data for demonstration (replace with your actual dataset loading).
    # Labels are random integers in [0, 999].
    num_samples = 1000
    image_paths = [f"dummy_image_{i}.jpg" for i in range(num_samples)]  # Replace with real paths
    labels = [random.randint(0, 999) for _ in range(num_samples)]  # Replace with real labels

    # Create dummy image files. They are generated only when the
    # "dummy_images" directory does not exist yet; an existing directory is
    # reused as-is.
    import os
    if not os.path.exists("dummy_images"):
        os.makedirs("dummy_images")
        for path in image_paths:
            # Create a dummy image filled with random data.
            # Example size; important: it is larger than 224x224.
            dummy_image = torch.rand(3, 256, 256)
            torchvision.utils.save_image(dummy_image, os.path.join("dummy_images", path))
    # Prefix every file name with the directory it was written to.
    image_paths = [os.path.join("dummy_images", path) for path in image_paths]


    # Split into training (first 800) and validation (remaining 200) samples.
    train_paths = image_paths[:800]
    train_labels = labels[:800]
    val_paths = image_paths[800:]
    val_labels = labels[800:]

    train_dataset = MultiScaleImageDataset(train_paths, train_labels, train=True)
    # train=False selects the evaluation pipeline of the dataset.
    val_dataset = MultiScaleImageDataset(val_paths, val_labels, train=False)


    # --- Train the model (with or without pretraining) ---

    # Example 1: Train from scratch (random initialization)
    #trained_model = train_vgg19(train_dataset, val_dataset, pretrained_model_path=None)

    # Example 2: Train with pre-training (using a pre-trained VGG-A, for example).
    # In a real run you would first train a VGG-A model and save it to
    # "vgg_a_model.pth".

    # Stand-in for VGG-A: no trained VGG-A is available here, so a randomly
    # initialized VGG19 is saved and then used to initialize the weights of
    # the VGG-E model.
    vgg_a = VGG19(init_weights=True)
    # Train VGG-A (replace this with your actual VGG-A training code)
    # ... (your VGG-A training code here) ...
    # NOTE: this pickles the whole module object (not just its state_dict),
    # which is the format the pretrained-initialization path reads back.
    torch.save(vgg_a, "vgg_a_model.pth")

    # Now, use the pre-trained VGG-A to initialize and train VGG-E.
    trained_model = train_vgg19(train_dataset, val_dataset, pretrained_model_path="vgg_a_model.pth")

    # Save the trained model (again as a whole-module pickle).
    torch.save(trained_model, "vgg19_model.pth")
    print("Training complete. Model saved.")