In [None]:
# =========================
# Cell 1: Imports & Config
# =========================

import os
import sys

# Add project root (one level up from /notebooks)
sys.path.append("../")

import json
from pathlib import Path
from time import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score
from models.model_v2 import CustomCNN
from utils.metrics import compute_metrics, print_metrics, load_metrics, save_metrics

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

# Resolved against the current working directory, so the notebook is expected
# to be launched from a folder one level below the project root.
PROJECT_ROOT = Path("../").resolve()

# Labeled images; read later by ImageFolder as one sub-folder per class.
TRAIN_ROOT = PROJECT_ROOT / "data" / "train"

MODELS_DIR  = PROJECT_ROOT / "models"
RESULTS_DIR = PROJECT_ROOT / "results"

# Create the output folders up front so saving weights/metrics cannot fail on
# a missing directory after a long training run.
MODELS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

MODEL_WEIGHTS_PATH = MODELS_DIR / "model_v2.pth"
# Previously garbled to `S_PATH`; the metrics-saving cell reads METRICS_PATH,
# so the truncated name caused a NameError on a fresh kernel.
METRICS_PATH       = RESULTS_DIR / "metrics_v2.json"

# Hyperparameters a reader is most likely to tune.
BATCH_SIZE = 32
NUM_EPOCHS = 35
LEARNING_RATE = 1e-3
IMAGE_SIZE = 256  # images are resized to IMAGE_SIZE x IMAGE_SIZE
VAL_SPLIT = 0.2  # 20% of train used as validation


Using device: cuda


In [None]:
# =========================
# Cell 2: Data Transforms & Loaders (train/val split from train/)
# =========================

# Training pipeline: resize, then random flip / rotation / colour jitter /
# random-resized crop for augmentation, then tensor conversion and per-channel
# normalisation (the mean/std values are the commonly used ImageNet statistics).
train_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
    transforms.RandomResizedCrop(IMAGE_SIZE, scale=(0.8, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])


# Validation pipeline: deterministic resize + the same normalisation, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Load the labeled data from data/train (class subfolders cat/, dog/) TWICE, once
# per transform. Both subsets returned by random_split point at the same
# underlying dataset object, so assigning `.dataset.transform` on one of them
# silently changes the other as well; the previous code therefore trained with
# val_transform (no augmentation). Separate dataset objects avoid that aliasing.
full_dataset = datasets.ImageFolder(root=str(TRAIN_ROOT), transform=train_transform)
val_view_dataset = datasets.ImageFolder(root=str(TRAIN_ROOT), transform=val_transform)

# The split indices are only meaningful if both views list the files identically.
assert val_view_dataset.samples == full_dataset.samples, "ImageFolder views disagree on sample order"

class_names = full_dataset.classes
print("Classes:", class_names)
print("Total labeled samples:", len(full_dataset))

# Split into train and validation subsets
val_size = int(VAL_SPLIT * len(full_dataset))
train_size = len(full_dataset) - val_size

# Seed the split so the same images land in validation on every run; otherwise
# metrics and saved checkpoints are not comparable across kernel restarts.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_split, val_split = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)

# Training keeps the augmenting view; validation re-uses the held-out indices
# on the non-augmenting view.
train_dataset = train_split
val_dataset = torch.utils.data.Subset(val_view_dataset, val_split.indices)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

print("Train samples:", len(train_dataset))
print("Val samples:", len(val_dataset))


Classes: ['cat', 'dog']
Total labeled samples: 25000
Train samples: 20000
Val samples: 5000


In [None]:
# =========================
# Cell 3: Initialize Model, Loss, Optimizer
# =========================

# The output layer is sized from the class folders discovered when loading the data.
num_classes = len(class_names)

model = CustomCNN(num_classes=num_classes)
model = model.to(DEVICE)

# Cross-entropy on raw logits; Adam with a small L2 penalty (weight_decay).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=1e-4,
)
# Multiply the learning rate by 0.5 after every 5 calls to scheduler.step().
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)


print(model)


CustomCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (global_pool): AdaptiveAvgPool2d(output_size=(1, 1))
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
   

In [None]:
# =========================
# Cell 4: Training & Evaluation Functions
# =========================

def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and report epoch metrics.

    Args:
        model: Module to train; switched to training mode by this call.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(epoch_loss, epoch_acc, epoch_f1)``: the sample-weighted mean
        loss, the accuracy, and the macro-averaged F1 over every sample that
        was actually processed.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.train()
    running_loss = 0.0
    samples_seen = 0
    all_preds = []
    all_labels = []

    pbar = tqdm(dataloader, desc="Training", leave=False)

    for images, labels in pbar:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once; every .item() call forces a device sync.
        batch_loss = loss.item()
        batch_count = images.size(0)

        # Weight by batch size so a smaller final batch does not skew the mean
        # (assumes the criterion returns a per-batch mean -- TODO confirm if
        # a non-default reduction is ever passed in).
        running_loss += batch_loss * batch_count
        samples_seen += batch_count

        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.detach().cpu().tolist())
        all_labels.extend(labels.detach().cpu().tolist())

        pbar.set_postfix({"loss": round(batch_loss, 4)})

    if samples_seen == 0:
        raise ValueError("train_one_epoch: dataloader yielded no samples")

    # Divide by what was actually processed rather than len(dataloader.dataset):
    # the two differ when the loader uses drop_last or a sub-sampling sampler.
    epoch_loss = running_loss / samples_seen
    epoch_acc = accuracy_score(all_labels, all_preds)
    epoch_f1 = f1_score(all_labels, all_preds, average="macro")

    return epoch_loss, epoch_acc, epoch_f1


def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module to evaluate; switched to eval mode by this call.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(epoch_loss, epoch_acc, epoch_f1)``: the sample-weighted mean
        loss, the accuracy, and the macro-averaged F1 over every sample that
        was actually processed.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    running_loss = 0.0
    samples_seen = 0
    all_preds = []
    all_labels = []

    pbar = tqdm(dataloader, desc="Validating", leave=False)

    # No gradients are needed for scoring; skipping them saves memory and time.
    with torch.no_grad():
        for images, labels in pbar:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # Read the scalar once; every .item() call forces a device sync.
            batch_loss = loss.item()
            batch_count = images.size(0)

            # Weight by batch size so a smaller final batch does not skew the
            # mean (assumes the criterion returns a per-batch mean -- TODO
            # confirm if a non-default reduction is ever passed in).
            running_loss += batch_loss * batch_count
            samples_seen += batch_count

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.detach().cpu().tolist())
            all_labels.extend(labels.detach().cpu().tolist())

            pbar.set_postfix({"loss": round(batch_loss, 4)})

    if samples_seen == 0:
        raise ValueError("evaluate: dataloader yielded no samples")

    # Divide by what was actually processed rather than len(dataloader.dataset):
    # the two differ when the loader uses drop_last or a sub-sampling sampler.
    epoch_loss = running_loss / samples_seen
    epoch_acc = accuracy_score(all_labels, all_preds)
    epoch_f1 = f1_score(all_labels, all_preds, average="macro")

    return epoch_loss, epoch_acc, epoch_f1



In [None]:
# =========================
# Cell 5: Run Training
# =========================

# Per-epoch curves, one list per metric; key order here is the order written to JSON later.
history = {
    metric_name: []
    for metric_name in (
        "train_loss",
        "train_acc",
        "train_f1",
        "val_loss",
        "val_acc",
        "val_f1",
    )
}
# Best validation macro-F1 seen so far; a checkpoint is written only when this improves.
best_val_f1 = 0.0

start_time = time()

for epoch in range(1, NUM_EPOCHS + 1):

    # One optimisation pass, then a gradient-free pass over the validation loader.
    train_loss, train_acc, train_f1 = train_one_epoch(
        model, train_loader, optimizer, criterion, DEVICE
    )
    val_loss, val_acc, val_f1 = evaluate(
        model, val_loader, criterion, DEVICE
    )

    # Record this epoch's numbers under the matching history keys.
    epoch_stats = {
        "train_loss": train_loss,
        "train_acc": train_acc,
        "train_f1": train_f1,
        "val_loss": val_loss,
        "val_acc": val_acc,
        "val_f1": val_f1,
    }
    for metric_name, value in epoch_stats.items():
        history[metric_name].append(value)

    summary = (
        f"Epoch [{epoch}/{NUM_EPOCHS}] "
        f"Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f} "
        f"|| Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}"
    )
    print(summary)

    # Checkpoint only on a strict improvement in validation F1.
    improved = val_f1 > best_val_f1
    if improved:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), MODEL_WEIGHTS_PATH)
        print(f"✅ Saved best model to {MODEL_WEIGHTS_PATH}")

    # Advance the learning-rate schedule once per epoch, after the optimizer updates.
    scheduler.step()

total_time = time() - start_time
print(f"Training completed in {total_time/60:.2f} minutes.")




Epoch [1/35] Train Loss: 0.6391 | Acc: 0.6226 | F1: 0.6196 || Val Loss: 0.7185 | Acc: 0.5856 | F1: 0.5082
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [2/35] Train Loss: 0.6009 | Acc: 0.6738 | F1: 0.6729 || Val Loss: 0.6928 | Acc: 0.6344 | F1: 0.5926
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [3/35] Train Loss: 0.5738 | Acc: 0.7046 | F1: 0.7039 || Val Loss: 0.6448 | Acc: 0.6668 | F1: 0.6431
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [4/35] Train Loss: 0.5554 | Acc: 0.7138 | F1: 0.7135 || Val Loss: 0.5659 | Acc: 0.7064 | F1: 0.6920
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [5/35] Train Loss: 0.5392 | Acc: 0.7285 | F1: 0.7282 || Val Loss: 0.5985 | Acc: 0.6822 | F1: 0.6547




Epoch [6/35] Train Loss: 0.5128 | Acc: 0.7478 | F1: 0.7475 || Val Loss: 0.5002 | Acc: 0.7546 | F1: 0.7541
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [7/35] Train Loss: 0.4990 | Acc: 0.7583 | F1: 0.7582 || Val Loss: 0.4770 | Acc: 0.7742 | F1: 0.7729
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [8/35] Train Loss: 0.4941 | Acc: 0.7627 | F1: 0.7625 || Val Loss: 0.6672 | Acc: 0.6376 | F1: 0.5993




Epoch [9/35] Train Loss: 0.4855 | Acc: 0.7701 | F1: 0.7700 || Val Loss: 0.4610 | Acc: 0.7860 | F1: 0.7846
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [10/35] Train Loss: 0.4783 | Acc: 0.7736 | F1: 0.7735 || Val Loss: 0.4555 | Acc: 0.7874 | F1: 0.7874
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [11/35] Train Loss: 0.4536 | Acc: 0.7903 | F1: 0.7902 || Val Loss: 0.4594 | Acc: 0.7830 | F1: 0.7819




Epoch [12/35] Train Loss: 0.4496 | Acc: 0.7901 | F1: 0.7901 || Val Loss: 0.4741 | Acc: 0.7652 | F1: 0.7593




Epoch [13/35] Train Loss: 0.4425 | Acc: 0.7964 | F1: 0.7963 || Val Loss: 0.4513 | Acc: 0.7870 | F1: 0.7850




Epoch [14/35] Train Loss: 0.4345 | Acc: 0.7996 | F1: 0.7995 || Val Loss: 0.4091 | Acc: 0.8200 | F1: 0.8197
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [15/35] Train Loss: 0.4296 | Acc: 0.8038 | F1: 0.8038 || Val Loss: 0.4213 | Acc: 0.8028 | F1: 0.8013




Epoch [16/35] Train Loss: 0.4146 | Acc: 0.8124 | F1: 0.8124 || Val Loss: 0.4264 | Acc: 0.8010 | F1: 0.7976




Epoch [17/35] Train Loss: 0.4061 | Acc: 0.8209 | F1: 0.8209 || Val Loss: 0.4118 | Acc: 0.8092 | F1: 0.8083




Epoch [18/35] Train Loss: 0.4082 | Acc: 0.8173 | F1: 0.8173 || Val Loss: 0.3888 | Acc: 0.8274 | F1: 0.8268
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [19/35] Train Loss: 0.3964 | Acc: 0.8224 | F1: 0.8223 || Val Loss: 0.4022 | Acc: 0.8152 | F1: 0.8134




Epoch [20/35] Train Loss: 0.3998 | Acc: 0.8216 | F1: 0.8215 || Val Loss: 0.3838 | Acc: 0.8298 | F1: 0.8296
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [21/35] Train Loss: 0.3865 | Acc: 0.8292 | F1: 0.8292 || Val Loss: 0.3829 | Acc: 0.8264 | F1: 0.8255




Epoch [22/35] Train Loss: 0.3845 | Acc: 0.8286 | F1: 0.8285 || Val Loss: 0.3761 | Acc: 0.8326 | F1: 0.8319
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [23/35] Train Loss: 0.3812 | Acc: 0.8323 | F1: 0.8322 || Val Loss: 0.3744 | Acc: 0.8384 | F1: 0.8381
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [24/35] Train Loss: 0.3769 | Acc: 0.8355 | F1: 0.8354 || Val Loss: 0.3728 | Acc: 0.8324 | F1: 0.8321




Epoch [25/35] Train Loss: 0.3747 | Acc: 0.8350 | F1: 0.8350 || Val Loss: 0.3696 | Acc: 0.8368 | F1: 0.8367




Epoch [26/35] Train Loss: 0.3699 | Acc: 0.8376 | F1: 0.8376 || Val Loss: 0.3618 | Acc: 0.8410 | F1: 0.8410
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [27/35] Train Loss: 0.3698 | Acc: 0.8386 | F1: 0.8385 || Val Loss: 0.3632 | Acc: 0.8372 | F1: 0.8369




Epoch [28/35] Train Loss: 0.3702 | Acc: 0.8387 | F1: 0.8386 || Val Loss: 0.3930 | Acc: 0.8190 | F1: 0.8167




Epoch [29/35] Train Loss: 0.3652 | Acc: 0.8405 | F1: 0.8405 || Val Loss: 0.3599 | Acc: 0.8392 | F1: 0.8389




Epoch [30/35] Train Loss: 0.3644 | Acc: 0.8394 | F1: 0.8394 || Val Loss: 0.3575 | Acc: 0.8428 | F1: 0.8428
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [31/35] Train Loss: 0.3576 | Acc: 0.8446 | F1: 0.8446 || Val Loss: 0.3544 | Acc: 0.8452 | F1: 0.8452
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [32/35] Train Loss: 0.3616 | Acc: 0.8438 | F1: 0.8437 || Val Loss: 0.3540 | Acc: 0.8480 | F1: 0.8480
✅ Saved best model to /media/veer/Data/Projects/collaborative_cnn_team08/models/model_v2.pth




Epoch [33/35] Train Loss: 0.3623 | Acc: 0.8413 | F1: 0.8412 || Val Loss: 0.3544 | Acc: 0.8452 | F1: 0.8451




Epoch [34/35] Train Loss: 0.3567 | Acc: 0.8454 | F1: 0.8453 || Val Loss: 0.3532 | Acc: 0.8478 | F1: 0.8477




Epoch [35/35] Train Loss: 0.3576 | Acc: 0.8444 | F1: 0.8444 || Val Loss: 0.3565 | Acc: 0.8448 | F1: 0.8445
Training completed in 54.84 minutes.


In [None]:
# =========================
# Cell 6: Save Metrics to results/metrics_v2.json
# =========================

# Bundle the run configuration with the training curves so the JSON file is
# self-describing. METRICS_PATH is expected to come from the config cell.
metrics = dict(
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    best_val_f1=best_val_f1,
    history=history,
    classes=class_names,
)

with open(METRICS_PATH, mode="w") as metrics_file:
    json.dump(metrics, metrics_file, indent=4)

print(f"Metrics saved to {METRICS_PATH}")
print("Done ✅")


Metrics saved to /media/veer/Data/Projects/collaborative_cnn_team08/results/metrics_v2.json
Done ✅
