In [1]:
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torchmetrics import Accuracy
import random
from tqdm.auto import tqdm
from timeit import default_timer as timer


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
IMAGE_SIZE = (64, 64)  # Setting image size to 64*64, default is 224*224

# Define transform to resize and convert to tensor
transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor()
])

# Load the dataset
# Replace with your dataset directory
dataset_path = r"PATH_TO_TRAIN_DATA"
dataset = datasets.ImageFolder(root=dataset_path, transform=transform)

# Create the DataLoader
loader = DataLoader(dataset, batch_size=64, shuffle=False, num_workers=4)

# Initialize mean and std
mean = 0.0
std = 0.0
total_images_count = 0

# Compute mean and std over the entire dataset
print("Calculating mean and std...")
for images, _ in loader:
    batch_samples = images.size(0)  # batch size (last batch can have smaller size!)
    images = images.view(batch_samples, images.size(1), -1)  # reshape to (B, C, H*W)
    
    mean += images.mean(2).sum(0)
    std += images.std(2).sum(0)
    total_images_count += batch_samples

mean /= total_images_count
std /= total_images_count

print(f"Mean: {mean}")
print(f"Std: {std}")


Calculating mean and std...
Mean: tensor([0.4531, 0.4513, 0.3910])
Std: tensor([0.1808, 0.1760, 0.1748])


In [3]:
# Define transforms for train_dataset
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(64),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

# Define transforms for val_dataset
val_transforms = transforms.Compose([
    transforms.Resize((64,64)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])


In [4]:
train_dataset = ImageFolder(r"PATH_TO_TRAIN_DATA",transform= train_transforms)
val_dataset = ImageFolder(r"PATH_TO_VALIDATION_DATA", transform= val_transforms)

train_loader= DataLoader(train_dataset,batch_size=128,shuffle=True,num_workers=4)
val_loader= DataLoader(val_dataset,batch_size=128,shuffle=False,num_workers=4)

In [5]:
def save_epoch_metrics_csv(model_name, train_loss_history, val_loss_history,
                           train_acc_history, val_top1_acc_history, val_top5_acc_history,
                           filename=None):
    metrics_dict = {
        "epoch": list(range(1, len(train_loss_history) + 1)),
        "train_loss": train_loss_history,
        "val_loss": val_loss_history,
        "train_acc": train_acc_history,
        "val_top1": val_top1_acc_history,
        "val_top5": val_top5_acc_history,
    }
    df = pd.DataFrame(metrics_dict)

    if filename is None:
        filename = f"results/per_model_logs/{model_name}_metrics.csv"
    # Ensure directory exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df.to_csv(filename, index=False)
    print(f"[✓] Saved epoch metrics to {filename}")


def append_model_summary(model_name, train_loss_history, val_loss_history,
                         train_acc_history, val_top1_acc_history, val_top5_acc_history,
                         total_train_time_sec, model, batch_size, optimizer_type,
                         lr_schedule_desc, image_size, architecture_type,
                         summary_csv_path="results/summary/final_model_comparison.csv"):

    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    summary_dict = {
        "model_name": model_name,
        "final_train_loss": train_loss_history[-1],
        "final_val_loss": val_loss_history[-1],
        "final_train_acc": train_acc_history[-1],
        "final_val_top1_acc": val_top1_acc_history[-1],
        "final_val_top5_acc": val_top5_acc_history[-1],
        "total_train_time_sec": total_train_time_sec,
        "total_train_time_min": round(total_train_time_sec / 60, 2),
        "num_params": num_params,
        "batch_size": batch_size,
        "optimizer": optimizer_type,
        "learning_rate_schedule": lr_schedule_desc,
        "image_size": image_size,
        "architecture_type": architecture_type,
    }

    # Ensure output directory exists
    os.makedirs(os.path.dirname(summary_csv_path), exist_ok=True)

    # Append or create
    if os.path.exists(summary_csv_path):
        existing_df = pd.read_csv(summary_csv_path)
        updated_df = pd.concat([existing_df, pd.DataFrame([summary_dict])], ignore_index=True)
    else:
        updated_df = pd.DataFrame([summary_dict])

    updated_df.to_csv(summary_csv_path, index=False)
    print(f"[✓] Appended model summary to {summary_csv_path}")


In [6]:
# Analyzing time taken to train the model
def print_train_time(start: float,
                     end: float,
                     device: torch.device = None):
  total_time = end-start
  print(f"Train time on {device} : {total_time:.3f} seconds")
  return total_time

In [7]:
def train_model(
    model,
    train_loader,
    val_loader,
    loss_fn,
    optimizer_adam,
    optimizer_sgd,
    scheduler_adam,
    scheduler_sgd,
    epochs,
    batch_size,
    image_size,
    architecture_type,
    model_name
):
    # Track metrics
    train_loss_history, val_loss_history = [], []
    train_acc_history, val_top1_acc_history, val_top5_acc_history = [], [], []

    best_top1 = 0.0
    best_model_wts = None

    start_time = timer()

    for epoch in tqdm(range(epochs)):
        print(f"\nEpoch {epoch+1}/{epochs}\n{'-'*20}")

        # ----------------
        # Select optimizer
        # ----------------
        if epoch < 30:
            optimizer = optimizer_adam
            scheduler = scheduler_adam
        else:
            optimizer = optimizer_sgd
            scheduler = scheduler_sgd

        # ----------------
        # Training Phase
        # ----------------
        model.train()
        running_loss = 0.0
        running_correct = 0
        total_samples_train = 0

        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            batch_size_curr = y.size(0)
            total_samples_train += batch_size_curr

            outputs = model(X).float()
            loss = loss_fn(outputs, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * batch_size_curr
            running_correct += (outputs.argmax(dim=1) == y).sum().item()

        train_loss = running_loss / total_samples_train
        train_acc = (running_correct / total_samples_train) * 100

        # ----------------
        # Validation Phase
        # ----------------
        model.eval()
        val_loss_total = 0.0
        val_samples_total = 0
        correct_top1 = 0
        correct_top5 = 0

        with torch.inference_mode():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                batch_size_curr = y.size(0)
                val_samples_total += batch_size_curr

                outputs = model(X).float()
                loss = loss_fn(outputs, y)
                val_loss_total += loss.item() * batch_size_curr

                _, pred_topk = outputs.topk(5, dim=1, largest=True, sorted=True)
                correct = pred_topk.eq(y.view(-1, 1).expand_as(pred_topk))

                correct_top1 += correct[:, :1].sum().item()
                correct_top5 += correct[:, :5].sum().item()

        val_loss = val_loss_total / val_samples_total
        top1_val_acc = (correct_top1 / val_samples_total) * 100
        top5_val_acc = (correct_top5 / val_samples_total) * 100

        # ----------------
        # Logging
        # ----------------
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
        print(f"Val Loss: {val_loss:.4f} | Top-1 Val Acc: {top1_val_acc:.2f}% | Top-5 Val Acc: {top5_val_acc:.2f}%")

        train_loss_history.append(train_loss)
        val_loss_history.append(val_loss)
        train_acc_history.append(train_acc)
        val_top1_acc_history.append(top1_val_acc)
        val_top5_acc_history.append(top5_val_acc)

        # Save best model
        if top1_val_acc > best_top1:
            best_top1 = top1_val_acc
            best_model_wts = model.state_dict().copy()
            os.makedirs("results/checkpoints", exist_ok=True)
            torch.save(best_model_wts, f"results/checkpoints/{model_name}_best.pth")
            print(f"[✓] Saved best model (Top-1 = {best_top1:.2f}%)")

        scheduler.step()

    end_time = timer()
    total_train_time_sec = end_time - start_time

    # Save metrics CSV
    save_epoch_metrics_csv(
        model_name=model_name,
        train_loss_history=train_loss_history,
        val_loss_history=val_loss_history,
        train_acc_history=train_acc_history,
        val_top1_acc_history=val_top1_acc_history,
        val_top5_acc_history=val_top5_acc_history
    )

    # Append summary CSV
    append_model_summary(
        model_name=model_name,
        train_loss_history=train_loss_history,
        val_loss_history=val_loss_history,
        train_acc_history=train_acc_history,
        val_top1_acc_history=val_top1_acc_history,
        val_top5_acc_history=val_top5_acc_history,
        total_train_time_sec=total_train_time_sec,
        model=model,
        batch_size=batch_size,
        optimizer_type="Adam (first 30 epochs) + SGD (rest)",
        lr_schedule_desc="StepLR: Adam(step=10, gamma=0.1), SGD(step=30, gamma=0.1)",
        image_size=f"{image_size[0]}x{image_size[1]}",
        architecture_type=architecture_type
    )

    print_train_time(start_time, end_time, device)
    print(f"[✓] Training complete for {model_name}")


In [8]:
class BasicBlock(nn.Module):
    def __init__(self,in_channels,out_channels,stride=1):
        super(BasicBlock,self) .__init__()

        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels,out_channels,kernel_size=3,stride=stride,padding=1,bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels,out_channels,kernel_size=3,padding=1,bias=False),
            nn.BatchNorm2d(out_channels)    
        )
        self.shortcut = nn.Sequential()
        if stride != 1or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=stride,bias=False),
                nn.BatchNorm2d(out_channels)
            )
        self.relu = nn.ReLU(inplace=True)

    def forward(self,x):
        out = self.residual_function(x)
        out += self.shortcut(x)
        return self.relu(out)


class ResNet34(nn.Module):
    def __init__(self,num_classes):
        super(ResNet34,self) .__init__()
        self.in_channels = 64
        self.layer = nn.Sequential(
            nn.Conv2d(3,64,kernel_size=7,stride=2,padding=3,bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3,stride=2,padding=1)
            )
        self.layer_1 = self.make_layer(64,3)
        self.layer_2 = self.make_layer(128,4,stride=2)
        self.layer_3 = self.make_layer(256,6,stride=2)
        self.layer_4 = self.make_layer(512,3,stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512,num_classes)

    def make_layer(self,out_channels,blocks,stride=1):
        layers=[]
        layers.append(BasicBlock(self.in_channels,out_channels,stride))
        self.in_channels = out_channels
        for _ in range(1,blocks):
            layers.append(BasicBlock(out_channels,out_channels))
        return nn.Sequential(*layers)


    def forward(self,x):
        x = self.layer(x)
        x = self.layer_1(x)
        x = self.layer_2(x)
        x = self.layer_3(x)
        x = self.layer_4(x)
        x = self.avgpool(x)
        x = torch.flatten(x,1)
        x = self.fc(x)
        return x
        

In [9]:
model = ResNet34(num_classes=100).to(device)

# Defining loss function
loss_fn = nn.CrossEntropyLoss()

# Defining optimizers
optimizer_adam = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)
optimizer_sgd = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)

# Using learning rate schedulers 
scheduler_adam = torch.optim.lr_scheduler.StepLR(optimizer_adam, step_size=10, gamma=0.1)
scheduler_sgd = torch.optim.lr_scheduler.StepLR(optimizer_sgd, step_size=30, gamma=0.1)

train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    loss_fn=loss_fn,
    optimizer_adam=optimizer_adam,
    optimizer_sgd=optimizer_sgd,
    scheduler_adam=scheduler_adam,
    scheduler_sgd=scheduler_sgd,
    epochs=100,
    batch_size=128,
    image_size=(64, 64),
    architecture_type="ResNet34",
    model_name="ResNet34_ImageNet100"
)


  0%|          | 0/100 [00:00<?, ?it/s]


Epoch 1/100
--------------------
Train Loss: 3.8077 | Train Acc: 11.63%
Val Loss: 3.4808 | Top-1 Val Acc: 17.78% | Top-5 Val Acc: 42.78%
[✓] Saved best model (Top-1 = 17.78%)

Epoch 2/100
--------------------
Train Loss: 3.2702 | Train Acc: 21.02%
Val Loss: 3.1507 | Top-1 Val Acc: 23.20% | Top-5 Val Acc: 51.28%
[✓] Saved best model (Top-1 = 23.20%)

Epoch 3/100
--------------------
Train Loss: 3.0126 | Train Acc: 25.97%
Val Loss: 3.0251 | Top-1 Val Acc: 25.24% | Top-5 Val Acc: 55.02%
[✓] Saved best model (Top-1 = 25.24%)

Epoch 4/100
--------------------
Train Loss: 2.8332 | Train Acc: 29.91%
Val Loss: 2.8895 | Top-1 Val Acc: 29.20% | Top-5 Val Acc: 58.64%
[✓] Saved best model (Top-1 = 29.20%)

Epoch 5/100
--------------------
Train Loss: 2.6888 | Train Acc: 32.77%
Val Loss: 2.7013 | Top-1 Val Acc: 32.82% | Top-5 Val Acc: 61.28%
[✓] Saved best model (Top-1 = 32.82%)

Epoch 6/100
--------------------
Train Loss: 2.5733 | Train Acc: 35.37%
Val Loss: 2.6773 | Top-1 Val Acc: 33.16% | Top-