In [None]:
import torch
import torch.nn as nn
import torchvision
from torchvision.transforms import InterpolationMode
from torchvision import datasets, transforms
from torchinfo import summary
import matplotlib.pyplot as plt
from tqdm import tqdm
from data_setup import create_dataloaders
import os
import numpy as np
import cv2
from utils import create_writer,save_model ,CLAHETransform
import model
from torch.utils.tensorboard import SummaryWriter
from model import *
import engine

In [2]:
## Helper functions

def simple_moving_average(data, window_size=20):
    """Calculate the trailing Simple Moving Average (SMA) of a sequence.

    Output element ``i`` is the mean of
    ``data[max(0, i - window_size + 1) : i + 1]``, so the first
    ``window_size - 1`` values are averaged over a shorter, partially filled
    window instead of being dropped.

    Args:
        data: Sliceable sequence of numbers (e.g. a list or 1-D NumPy array).
        window_size: Maximum number of trailing samples averaged per point.
            Must be at least 1.

    Returns:
        A list with the same length as ``data`` holding the smoothed values.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        # A non-positive window produces an empty slice, which would otherwise
        # surface as an opaque ZeroDivisionError in the averaging step below.
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    sma = []
    for end in range(1, len(data) + 1):
        # Trailing window ending at (and including) position end - 1
        window = data[max(0, end - window_size):end]
        sma.append(sum(window) / len(window))
    return sma

In [None]:
# Dataset locations (relative to the notebook's working directory)
train_dir = "datasets/train"
val_dir = "datasets/val"

# Setup target device: prefer Apple MPS, then CUDA, otherwise fall back to CPU
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Preprocessing preset bundled with the default VGG19 weights.
# Deliberately NOT named `transforms`: that name is the torchvision.transforms
# module imported at the top of the notebook, and rebinding it would break any
# later `transforms.Compose(...)` call.
vgg19_default_transform = torchvision.models.VGG19_Weights.DEFAULT.transforms()

# Batch size is read from the BATCH_SIZE environment variable. Fall back to 32
# (the batch dimension used in the model-summary cell) instead of crashing with
# a TypeError from int(None) when the variable is not set.
batch_size = int(os.getenv("BATCH_SIZE", "32"))

# Create DataLoaders
train_dataloader, test_dataloader, val_dataloader, class_names = create_dataloaders(
        train_dir=train_dir,
        val_dir=val_dir,
        transform=vgg19_default_transform,
        batch_size=batch_size,
    )

# Create vgg model and place it on the selected device
model_vgg19 = model.create_vgg19(device=device,class_names=class_names).to(device)

### (a)(i) Dataset Splitting and Composition
The dataset was divided into four distinct sets: **training**, **testing**, **validation**, and **production datasets**.

- **Training set**: This set was used to train the model and adjust the parameters during the learning process.
- **Testing set**: This set was used to evaluate the model's performance after training and to measure generalization.
- **Validation set**: This set was employed during training to monitor performance and facilitate early stopping, helping to prevent overfitting.
- **Production dataset**: This set consists of real-world data that the model encounters in a production environment, where it makes predictions without labeled data.

In [None]:
# Number of samples behind each dataloader
train_size = len(train_dataloader.dataset)
val_size = len(val_dataloader.dataset)
test_size = len(test_dataloader.dataset)

# Report the split sizes in the order: training, test, validation
for split_label, split_size in (
    ("Training", train_size),
    ("Test", test_size),
    ("Validation", val_size),
):
    print(f"{split_label} dataset size: {split_size}")

### (a)(ii) [INCOMPLETE] Data Pre-processing and Augmentation
The data preprocessing pipeline included transformations similar to those used in training the VGG model on the ImageNet dataset. These transformations included standard resizing, normalization, and mean subtraction to align with the original VGG training setup.

Various image augmentation techniques were also explored to enhance model generalization, such as:
- Horizontal flipping
- Color jittering
- Rotation adjustments
- CLAHE Histogram Equalization

In [None]:
# Preprocessing pipeline that ships with the default VGG19 weights
default_vgg19_weights = torchvision.models.VGG19_Weights.DEFAULT
vgg_transform = default_vgg19_weights.transforms()
vgg_transform  # bare last expression so the notebook renders the pipeline

In [6]:
## CLAHE Histogram Equalisation

image_path = 'datasets/train/cat/cat.841.jpg'
image = cv2.imread(image_path)  # OpenCV loads images in BGR channel order
if image is None:
    # cv2.imread reports an unreadable/missing file by returning None
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Convert to LAB. The source is BGR, so the BGR conversion code must be used;
# the RGB code would swap the red/blue weights when computing lightness.
lab_image = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)

# Split the LAB image to L, A, and B components
L, A, B = cv2.split(lab_image)

# Apply CLAHE to the L (lightness) channel only, leaving colour untouched
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
L_clahe = clahe.apply(L)

# Merge the CLAHE L' channel back with A and B channels
lab_clahe_image = cv2.merge((L_clahe, A, B))

# Convert L'AB back to RGB
rgb_clahe_image = cv2.cvtColor(lab_clahe_image, cv2.COLOR_LAB2RGB)

# Both arrays are stored as (channel, width, height): axes 0 and 2 of the
# (height, width, channel) image are swapped. The plotting cells index
# channel planes with [0, :, :] and undo this layout before display.
rgb_clahe_image = rgb_clahe_image.transpose(2,1,0)
rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
rgb_image = rgb_image.transpose(2,1,0)

In [None]:
# CLAHE Histogram Equalisation Comparison
# Only two panels are drawn, so use a 1x2 grid; a 1x3 grid leaves the last
# third of the figure empty.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# (pixel data, bar colour, title) per panel. The images are stored
# channel-first, so index 0 selects the first channel plane of each image.
histogram_panels = [
    (rgb_image[0,:,:], 'red', "RGB Channel"),               # RGB histogram
    (rgb_clahe_image[0,:,:], 'green', "RGB LAB CLAHE Channel"),  # RGB CLAHE histogram
]

for ax, (channel_plane, bar_colour, panel_title) in zip(axes, histogram_panels):
    ax.hist(channel_plane.ravel(), bins=256, color=bar_colour, alpha=0.7)
    ax.set_title(panel_title)
    ax.set_xlim([0, 256])
    ax.set_xlabel("Pixel intensity")
    ax.set_ylabel("Pixel count")

# Display the plots
fig.tight_layout()
plt.show()

In [None]:
# CLAHE Histogram Equalisation Example
fig, axes = plt.subplots(1, 2, figsize=(10, 5))  # 1 row, 2 columns

# rgb_image / rgb_clahe_image are stored as (channel, width, height).
# transpose(2, 1, 0) restores the (height, width, channel) layout imshow
# expects. The earlier transpose(1, 2, 0) + np.rot90(k=-1) combination
# produced a horizontally mirrored picture.
image_panels = [
    (rgb_image, "RGB"),
    (rgb_clahe_image, "RGB LAB CLAHE"),
]

for ax, (channel_first_image, panel_title) in zip(axes, image_panels):
    ax.imshow(channel_first_image.transpose(2, 1, 0))
    ax.set_title(panel_title)
    ax.axis('off')  # Hide the axis

# Display the images
fig.tight_layout()
plt.show()

### (b)(i) Model Summary

In [None]:
# Layer-by-layer overview of model_vgg19 for an input of shape (32, 3, 224, 224)
summary_columns = ["input_size", "output_size", "num_params", "trainable"]
summary(
    model=model_vgg19,
    input_size=(32, 3, 224, 224),
    col_names=summary_columns,
    col_width=20,
    row_settings=["var_names"],
)

### (b)(ii) Training Strategy

1. **Forward Pass**  
   The model processes the input data through its layers to produce predictions.

2. **Calculate Loss**  
   The model’s predictions are compared to the ground truth labels to calculate the error using a loss function.

3. **Zero Gradients**  
   Before backpropagation, the accumulated gradients from previous steps are cleared to ensure accurate gradient computation for the current step.

4. **Backpropagation**  
   The gradient of the loss with respect to each trainable parameter is computed. This is done using backpropagation to propagate errors back through the network.

5. **Update Weights**  
   The optimizer updates the model's parameters using the gradients to minimize the loss function and improve predictions.


### (c)(i) Epoch Selection

Selecting the optimal number of epochs is crucial in training machine learning models to prevent overfitting and ensure good generalization on unseen data. Early stopping is an effective strategy that involves monitoring the model’s performance on a validation set and halting training when performance stops improving.

In this project, we utilize the `EarlyStopper` class for epoch selection with the following key features:

- **Patience:** Defines the number of epochs to wait for an improvement in validation loss before stopping. In our implementation, `patience=3` means training halts if validation loss does not improve for three consecutive epochs.

- **Minimum Delta:** Specifies the minimum change in validation loss that qualifies as an improvement. Here, `min_delta=10` indicates that a decrease in validation loss must exceed 10 to reset the patience counter.

Using early stopping optimizes the training process, reduces training time, and enhances the model's generalization ability.


### (c)(ii) Learning Rate Selection 

#### Naive Approach
The naive approach involves starting with a relatively large learning rate, such as **0.1**, and then progressively testing smaller values in an exponential decay manner. This can include values like **0.01**, **0.001**, etc. While this method can yield decent results, it may not always effectively identify the optimal learning rate for the model.

#### Proposed Approach
To enhance the learning rate selection process, we propose a more dynamic strategy:

1. **Initial Learning Rate**: Begin with a low learning rate, such as **1e-5** (the value used in the implementation below).
2. **Exponential Increase**: Increase the learning rate exponentially after every training batch, so that a single pass over the training data sweeps the whole range from the initial to the maximum learning rate.
3. **Monitoring Progress**: For each batch, record both the learning rate and the corresponding training loss.
4. **Identifying Optimal Point**: Analyze the recorded data to identify the point at which the training loss decreases fastest, i.e. where the derivative of the loss with respect to the learning rate is most negative.
5. **Smoothing Noisy Curves**: If the resulting curve is excessively noisy, apply a simple moving average (SMA) to smooth the data, allowing for clearer insights into the loss trends.

Reference: L. N. Smith, [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186), arXiv:1506.01186.


In [None]:
# Learning-rate range test: sweep the learning rate exponentially over one
# pass of the training data and record the loss observed at each rate.
torch.manual_seed(42)

# Start from a freshly created model so the sweep is not affected by earlier cells
model_vgg19 = model.create_vgg19(device=device,class_names=class_names).to(device)

# Sweep bounds and length (one optimizer step per batch)
initial_lr = 1e-5
max_lr = 1e-2
num_batches = len(train_dataloader)

# Per-batch multiplier: applying it num_batches times turns initial_lr into max_lr
scaling_factor = (max_lr / initial_lr) ** (1 / num_batches)

optimizer = torch.optim.Adam(model_vgg19.parameters(), lr=initial_lr)
loss_fn = nn.CrossEntropyLoss()

# One entry per batch: the learning rate used and the loss it produced
learning_rates = []
losses = []

model_vgg19.train()
epoch_loss = 0.0  # running sum of batch losses

for X, y in train_dataloader:
    X, y = X.to(device), y.to(device)

    # Forward pass
    outputs = model_vgg19(X)
    loss = loss_fn(outputs, y)

    # Standard optimisation step: clear old gradients, backpropagate, update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Bookkeeping for this batch
    batch_loss = loss.item()
    epoch_loss += batch_loss

    current_lr = optimizer.param_groups[0]['lr']
    learning_rates.append(current_lr)
    losses.append(batch_loss)

    # Grow the learning rate for the next batch (same value for every group)
    new_lr = current_lr * scaling_factor
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr

# Mean batch loss over the whole sweep
avg_loss = epoch_loss / len(train_dataloader)


# Plotting learning rate against training loss
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(learning_rates, losses, marker='o')
ax.set_title('Learning Rate vs Training Loss')
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Training Loss')
ax.set_xscale('log')  # rates span several orders of magnitude
ax.grid()
plt.show()

In [None]:
# Two panels: raw learning-rate schedule per batch, and smoothed loss vs smoothed learning rate

# Smoothing window (in batches)
window_size = 20

# Smooth only the first 200 recorded batches of the sweep
sma_learning_rates = simple_moving_average(learning_rates[:200], window_size)
sma_losses = simple_moving_average(losses[:200], window_size)

fig, (ax_schedule, ax_smoothed) = plt.subplots(1, 2, figsize=(12, 6))

# Subplot 1: learning rate used at each batch (full, unsmoothed sweep)
ax_schedule.plot(learning_rates, label='Learning Rates', linestyle='-')
ax_schedule.set_title('Learning Rates')
ax_schedule.set_xlabel('Batch')
ax_schedule.set_ylabel('Learning Rate')
ax_schedule.legend()

# Subplot 2: Smoothed Losses vs Learning Rates
ax_smoothed.plot(sma_learning_rates, sma_losses, linestyle='-', label='SMA Losses')
ax_smoothed.set_title('SMA Losses vs Learning Rates')
ax_smoothed.set_xlabel('Learning Rate')
ax_smoothed.set_ylabel('SMA Loss')
ax_smoothed.set_xscale('log')  # learning rates grow exponentially
ax_smoothed.grid()
ax_smoothed.legend()

# Avoid overlapping labels, then render
fig.tight_layout()
plt.show()

In [None]:
# Finite-difference slope of the smoothed loss between consecutive batches,
# smoothed once more with the same moving-average window
loss_derivative = np.gradient(sma_losses)
sma_loss_derivative = simple_moving_average(loss_derivative, window_size)

# Plot the smoothed slope against the smoothed learning rates (first 200 points)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sma_learning_rates[:200], sma_loss_derivative[:200], linestyle='-')
ax.set_title('Derivative of Losses vs SMA Learning Rates')
ax.set_xlabel('SMA Learning Rates')
ax.set_ylabel('Derivative of Losses')
ax.set_xscale('log')
ax.grid()
plt.show()

### Experiment Tracking

In [13]:
# Re-import the module: the setup cell rebinds the name `transforms` to a
# preprocessing preset, so the module has to be restored before Compose is used.
from torchvision import transforms

# ImageNet channel statistics shared by every hand-built pipeline below
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Case 1: VGG Default Transform
transform_case_1 = torchvision.models.VGG19_Weights.DEFAULT.transforms()

# Case 2: CLAHE
transform_case_2 = transforms.Compose([
    CLAHETransform(clip_limit=2.0, tile_grid_size=(8, 8)),  # Apply CLAHE
    transforms.Resize((256, 256),  # Resize to 256x256
                      interpolation=InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),  # Crop to 224x224
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Case 3: Rotation + Horizontal Flip
transform_case_3 = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Case 4: CLAHE + Rotation + Horizontal Flip
# This pipeline used to be an exact copy of Case 2 (no flip, no rotation), so
# the fourth experiment silently repeated the second one.
# NOTE(review): the same transform is handed to create_dataloaders for every
# split, so the random augmentations presumably also hit val/test - confirm.
transform_case_4 = transforms.Compose([
    CLAHETransform(clip_limit=2.0, tile_grid_size=(8, 8)),  # Apply CLAHE
    transforms.Resize((256, 256),  # Resize to 256x256
                      interpolation=InterpolationMode.BICUBIC),
    # Augment before the centre crop so most of the empty corners created by
    # the rotation are cropped away
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.CenterCrop(224),  # Crop to 224x224
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

In [14]:
def create_dataloaders_helper(transformation, batch_size=None):
    """Build the notebook's dataloaders for one preprocessing pipeline.

    Thin wrapper around ``create_dataloaders`` that fixes the dataset
    directories to the notebook-level ``train_dir`` / ``val_dir``.

    Args:
        transformation: Transform passed to ``create_dataloaders`` as
            ``transform``.
        batch_size: Optional explicit batch size. When omitted, the
            ``BATCH_SIZE`` environment variable is used, falling back to 32
            if it is not set (previously an unset variable crashed with a
            ``TypeError`` from ``int(None)``).

    Returns:
        Whatever ``create_dataloaders`` returns; elsewhere in this notebook it
        is unpacked as ``(train, test, val, class_names)``.
    """
    if batch_size is None:
        batch_size = int(os.getenv("BATCH_SIZE", "32"))
    return create_dataloaders(
        train_dir=train_dir,
        val_dir=val_dir,
        transform=transformation,
        batch_size=batch_size,
    )

In [18]:
# Experiment label -> preprocessing pipeline; insertion order is the order in
# which the dataloaders are built and later iterated
experiment_transforms = {
    "Default Transform": transform_case_1,
    "CLAHE Transform": transform_case_2,
    "Rotation + Horizontal Flip Transform": transform_case_3,
    "CLAHE + Rotation + Horizontal Flip Transform": transform_case_4,
}

# One set of dataloaders per experiment, keyed by the experiment label
experiment_dataloaders = {
    label: create_dataloaders_helper(pipeline)
    for label, pipeline in experiment_transforms.items()
}

In [None]:
# 1. Set the random seeds
torch.manual_seed(42)

models = ["vgg11","vgg16","vgg19"]
# Same device preference as the setup cell (MPS, then CUDA, then CPU); the
# earlier two-way check silently dropped CUDA and trained on CPU on such hosts.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
num_epochs = [10]

# Model name -> factory function. Names not listed here fall back to VGG19,
# matching the original if/elif/else chain.
model_factories = {
    "vgg11": create_vgg11,
    "vgg16": create_vgg16,
}

# 2. Keep track of experiment numbers
experiment_number = 0

# 3. Loop through each DataLoader
for dataloader_name, dataloaders in experiment_dataloaders.items():
    # Each entry holds train/test/val loaders at indices 0-2 and the class names last
    train_loader, test_loader, val_loader = dataloaders[0], dataloaders[1], dataloaders[2]
    class_names = dataloaders[-1]

    # 4. Loop through each number of epochs
    for epochs in num_epochs:

        # 5. Loop through each model name and create a new model based on the name
        for model_name in models:

            # 6. Create information print outs
            experiment_number += 1
            print(f"[INFO] Experiment number: {experiment_number}")
            print(f"[INFO] Model: {model_name}")
            print(f"[INFO] DataLoader: {dataloader_name}")
            print(f"[INFO] Number of epochs: {epochs}")

            # 7. Select the model
            # Bound to `experiment_model`, not `model`: `model` is the imported
            # module used by earlier cells (`model.create_vgg19`), and rebinding
            # it here would break those cells when they are re-run.
            create_fn = model_factories.get(model_name, create_vgg19)
            experiment_model = create_fn(device=device, class_names=class_names)

            # 8. Create a new loss and optimizer for every model
            loss_fn = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(params=experiment_model.parameters(), lr=0.0001)

            # 9. Train target model with target dataloaders and track experiments
            engine.train(model=experiment_model,
                  train_dataloader=train_loader,
                  test_dataloader=test_loader,
                  val_dataloader=val_loader,
                  optimizer=optimizer,
                  loss_fn=loss_fn,
                  epochs=epochs,
                  device=device,
                  writer=create_writer(experiment_name=dataloader_name,
                                       model_name=model_name,
                                       extra=f"{epochs}_epochs"))

            # 10. Save the model to file so we can get back the best model
            save_filepath = f"{model_name}_{dataloader_name}_{epochs}_epochs.pth"
            save_model(model=experiment_model,
                       target_dir="models",
                       model_name=save_filepath)
            print("-"*50 + "\n")