In [21]:
# Config
import os

# Raw string literal: the path contains backslashes, and sequences such as
# "\P", "\c" and "\d" are invalid escapes in a normal string (a SyntaxWarning
# on recent Python versions). The raw form yields the exact same value.
DATA_DIR = r'F:\Projects\collaborative_cnn_team08\data'  # root data folder containing training_set/ and test_set/
OUTPUT_DIR = 'results'
MODEL_SAVE_PATH = 'models/model_v1.pth'
FINAL_METRICS_PATH = f"{OUTPUT_DIR}/metrics_v1.json"

# Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 15
LR = 1e-3
NUM_WORKERS = 4
IMAGE_SIZE = (224, 224)

# Make sure the output locations exist before anything tries to write to them.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
print('DATA_DIR =', DATA_DIR)
print('Train folder =', os.path.join(DATA_DIR, 'training_set'))
print('Val folder   =', os.path.join(DATA_DIR, 'test_set'))

DATA_DIR = F:\Projects\collaborative_cnn_team08\data
Train folder = F:\Projects\collaborative_cnn_team08\data\training_set
Val folder   = F:\Projects\collaborative_cnn_team08\data\test_set


In [22]:
# Imports
import time
import random
import os, sys

sys.path.append("../")
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
)

import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets

# Project code ("../" is appended to sys.path above, so this import resolves when the notebook runs from the repo root or one directory below it)
from models.model_v1 import get_model
# from utils.metrics import compute_classification_metrics, save_metrics


In [23]:
# Helper functions

  # Helper functions (corrected)
import os
import json        # <-- REQUIRED
import random
import numpy as np
import torch

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: value handed to every generator (defaults to 42).

    The CUDA generators are seeded as well, but only when CUDA is available.
    """
    # CPU-side generators: stdlib, NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def safe_num_workers(requested):
    """Pick a DataLoader worker count for the current machine.

    Args:
        requested: the desired number of workers.

    Returns:
        0 on Windows (``os.name == 'nt'``); otherwise ``requested`` clamped to
        be non-negative and at most ``max(1, cpu_count - 1)``. Any failure
        while working that out falls back to 0.
    """
    if os.name == 'nt':
        return 0

    try:
        from multiprocessing import cpu_count

        ceiling = max(1, cpu_count() - 1)
        wanted = max(0, requested)
        return min(wanted, ceiling)
    except Exception:
        # Best effort: an unknown CPU count or an unusable request means
        # loading data in the main process.
        return 0


def save_metrics(metrics: dict, path: str):
    """Save metrics dict as JSON.

    Args:
        metrics: dictionary to serialise; it must be JSON-serialisable.
        path: destination file. Missing parent directories are created.

    Raises:
        TypeError: if ``metrics`` contains values ``json.dump`` cannot encode.
        OSError: if the directory or file cannot be created.
    """
    parent = os.path.dirname(path)
    # For a bare filename dirname() is '', and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as f:
        json.dump(metrics, f, indent=2)



In [24]:
# Prepare datasets and dataloaders
# Both splits sit directly under DATA_DIR in fixed sub-folders.
train_dir, val_dir = (
    os.path.join(DATA_DIR, split) for split in ('training_set', 'test_set')
)

# Fail early, with both expected locations in the message, if either is missing.
if not all(os.path.isdir(folder) for folder in (train_dir, val_dir)):
    raise RuntimeError(f"Expected folders not found. Ensure '{train_dir}' and '{val_dir}' exist.")

# Training pipeline: resize, random flip / rotation augmentation, tensor conversion.
train_transform = transforms.Compose([
    transforms.Resize(size=IMAGE_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
])
# Validation pipeline: resize and tensor conversion only (no augmentation).
val_transform = transforms.Compose([
    transforms.Resize(size=IMAGE_SIZE),
    transforms.ToTensor(),
])

train_ds = datasets.ImageFolder(root=train_dir, transform=train_transform)
val_ds = datasets.ImageFolder(root=val_dir, transform=val_transform)

# Only the training loader shuffles; both share batch size and worker count.
num_workers = safe_num_workers(NUM_WORKERS)
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers,
)
val_loader = DataLoader(
    dataset=val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=num_workers,
)

print('Classes:', train_ds.classes)
print('Train samples:', len(train_ds), 'Val samples:', len(val_ds))
print('num_workers =', num_workers)

Classes: ['cats', 'dogs']
Train samples: 8005 Val samples: 2023
num_workers = 0


In [25]:
# Model, criterion, optimizer, device
# Seed every RNG before the model is built.
set_seed(42)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', device)

# One output class per class folder found in the training set.
num_classes = len(train_ds.classes)
model = get_model(num_classes=num_classes, device=device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=LR)


Device: cpu


In [26]:
def compute_classification_metrics(y_true, y_pred, average="macro"):
    """
    Score predicted labels against ground-truth labels.

    Args:
        y_true (list or np.array): Ground truth labels
        y_pred (list or np.array): Predicted labels
        average (str): Averaging mode passed to the F1 / precision / recall
                       scorers. Options: "macro", "micro", "weighted"

    Returns:
        dict: keys "accuracy", "f1", "precision", "recall" (plain floats) and
              "confusion_matrix" (nested list, or None if it could not be
              computed)
    """
    truth = np.array(y_true)
    predicted = np.array(y_pred)

    # Options shared by the three averaged scorers; zero_division=0 is passed
    # through to each of them.
    averaged = {"average": average, "zero_division": 0}
    scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )

    metrics = {"accuracy": float(accuracy_score(truth, predicted))}
    for key, scorer in scorers:
        metrics[key] = float(scorer(truth, predicted, **averaged))

    # The confusion matrix is best effort: a failure is reported as None
    # rather than aborting the whole metrics computation.
    try:
        metrics["confusion_matrix"] = confusion_matrix(truth, predicted).tolist()
    except Exception:
        metrics["confusion_matrix"] = None

    return metrics


def save_metrics(metrics: dict, path: str):
    """
    Save a dictionary of metrics to a JSON file.

    Missing parent directories of ``path`` are created first, so a write does
    not fail just because the output folder has not been created yet. A bare
    filename (no directory component) is written to the current directory.

    Args:
        metrics (dict): metrics dictionary (must be JSON-serialisable)
        path (str): output JSON file path

    Raises:
        TypeError: if ``metrics`` contains values ``json.dump`` cannot encode
        OSError: if the directory or file cannot be created
    """
    parent = os.path.dirname(path)
    # dirname() is '' for a bare filename and os.makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w") as f:
        json.dump(metrics, f, indent=2)


In [27]:
# Training & evaluation helper functions
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and report training metrics.

    Args:
        model: network to train; it is switched to training mode.
        dataloader: iterable yielding ``(imgs, labels)`` batches.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        device: device the batches are moved to before the forward pass.

    Returns:
        dict: the result of ``compute_classification_metrics`` on the labels
        and argmax predictions gathered during the pass, plus a ``'loss'``
        entry holding the per-sample average loss.
    """
    model.train()
    running_loss = 0.0
    seen = 0  # samples actually processed in this pass
    trues, preds = [], []
    for imgs, labels in tqdm(dataloader, desc='Train', leave=False):
        imgs = imgs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outs = model(imgs)
        loss = criterion(outs, labels)
        loss.backward()
        optimizer.step()
        batch_size = imgs.size(0)
        # Weight by batch size so a smaller final batch does not skew the
        # average (assumes criterion returns a per-batch mean).
        running_loss += loss.item() * batch_size
        seen += batch_size
        preds.extend(torch.argmax(outs, dim=1).cpu().tolist())
        trues.extend(labels.cpu().tolist())
    # Divide by the number of samples seen rather than len(dataloader.dataset):
    # the two differ when the loader drops the last batch or samples a subset.
    avg_loss = running_loss / max(1, seen)
    metrics = compute_classification_metrics(trues, preds)
    metrics['loss'] = float(avg_loss)
    return metrics


def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        dataloader: iterable yielding ``(imgs, labels)`` batches.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        dict: the result of ``compute_classification_metrics`` on the labels
        and argmax predictions gathered during the pass, plus a ``'loss'``
        entry holding the per-sample average loss.
    """
    model.eval()
    running_loss = 0.0
    seen = 0  # samples actually processed in this pass
    trues, preds = [], []
    with torch.no_grad():  # no gradients are needed for evaluation
        for imgs, labels in tqdm(dataloader, desc='Val', leave=False):
            imgs = imgs.to(device)
            labels = labels.to(device)
            outs = model(imgs)
            loss = criterion(outs, labels)
            batch_size = imgs.size(0)
            # Weight by batch size so a smaller final batch does not skew the
            # average (assumes criterion returns a per-batch mean).
            running_loss += loss.item() * batch_size
            seen += batch_size
            preds.extend(torch.argmax(outs, dim=1).cpu().tolist())
            trues.extend(labels.cpu().tolist())
    # Divide by the number of samples seen rather than len(dataloader.dataset):
    # the two differ when the loader drops the last batch or samples a subset.
    avg_loss = running_loss / max(1, seen)
    metrics = compute_classification_metrics(trues, preds)
    metrics['loss'] = float(avg_loss)
    return metrics


In [28]:
# Training loop
# Trains for EPOCHS epochs, evaluating on the validation loader after each one.
# Every epoch's metrics are written to their own JSON file, the weights with the
# best validation F1 so far are checkpointed, and the full history is saved at the end.

# Starts below any achievable F1 so the first epoch always produces a checkpoint.
best_val_f1 = -1.0
history = {'epochs': []}
start_time = time.time()

for epoch in range(1, EPOCHS+1):
    t0 = time.time()
    print(f"\n=== Epoch {epoch}/{EPOCHS} ===")
    train_metrics = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_metrics = evaluate(model, val_loader, criterion, device)
    # Epoch time covers both the training pass and the validation pass.
    epoch_time = time.time() - t0
    print('Train:', train_metrics)
    print('Val:  ', val_metrics)

    # Keep the record in memory and also persist it right away, so finished
    # epochs are on disk even if a later epoch is interrupted.
    rec = {'epoch': epoch, 'train': train_metrics, 'val': val_metrics, 'time_s': epoch_time}
    history['epochs'].append(rec)
    save_metrics(rec, os.path.join(OUTPUT_DIR, f'metrics_v1_epoch{epoch}.json'))

    # Checkpoint only on a strict improvement of validation F1; the file at
    # MODEL_SAVE_PATH is overwritten each time.
    val_f1 = float(val_metrics.get('f1', 0.0))
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print('Saved best model to', MODEL_SAVE_PATH)

# Summarise the run and write the complete history (all epochs + summary).
total_time = time.time() - start_time
history['summary'] = {'total_time_s': total_time, 'best_val_f1': best_val_f1}
save_metrics(history, FINAL_METRICS_PATH)
print('\nTraining finished. Total time: {:.1f}s'.format(total_time))
print('Final metrics saved to', FINAL_METRICS_PATH)



=== Epoch 1/15 ===




Train: {'accuracy': 0.5982510930668332, 'f1': 0.5956584558447966, 'precision': 0.6009096615552283, 'recall': 0.5983013420724095, 'confusion_matrix': [[2715, 1285], [1931, 2074]], 'loss': 0.6574149778900409}
Val:   {'accuracy': 0.6198714780029659, 'f1': 0.5955591466868824, 'precision': 0.6580805284437692, 'recall': 0.6199928259501218, 'confusion_matrix': [[875, 136], [633, 379]], 'loss': 0.6401151240431785}
Saved best model to models/model_v1.pth

=== Epoch 2/15 ===




Train: {'accuracy': 0.618738288569644, 'f1': 0.6166345114458903, 'precision': 0.6214737064251081, 'recall': 0.6187848002496878, 'confusion_matrix': [[2773, 1227], [1825, 2180]], 'loss': 0.6411505076976064}
Val:   {'accuracy': 0.6238260009886307, 'f1': 0.6153674339398916, 'precision': 0.6356451246582504, 'recall': 0.6237528490947405, 'confusion_matrix': [[481, 530], [231, 781]], 'loss': 0.6298901717290393}
Saved best model to models/model_v1.pth

=== Epoch 3/15 ===




Train: {'accuracy': 0.6369768894440975, 'f1': 0.6346488543796471, 'precision': 0.6406396995377642, 'recall': 0.6370269975031211, 'confusion_matrix': [[2869, 1131], [1775, 2230]], 'loss': 0.6252156940867646}
Val:   {'accuracy': 0.5684626791893228, 'f1': 0.5259221220988524, 'precision': 0.6064657838056853, 'recall': 0.5683147433566734, 'confusion_matrix': [[272, 739], [134, 878]], 'loss': 0.7708852880082387}

=== Epoch 4/15 ===




Train: {'accuracy': 0.6498438475952529, 'f1': 0.647730672240048, 'precision': 0.6536080281464898, 'recall': 0.6498924781523097, 'confusion_matrix': [[2911, 1089], [1714, 2291]], 'loss': 0.6205748750707792}
Val:   {'accuracy': 0.6080079090459714, 'f1': 0.5772102298827249, 'precision': 0.652116566192827, 'recall': 0.6078746437409837, 'confusion_matrix': [[342, 669], [124, 888]], 'loss': 0.6988726464816973}

=== Epoch 5/15 ===




Train: {'accuracy': 0.6699562773266708, 'f1': 0.6689515549871501, 'precision': 0.6721039600174781, 'recall': 0.6699909488139826, 'confusion_matrix': [[2902, 1098], [1544, 2461]], 'loss': 0.6010734220804981}
Val:   {'accuracy': 0.7024221453287197, 'f1': 0.7019203845664327, 'precision': 0.7038315816406866, 'recall': 0.7024425978270643, 'confusion_matrix': [[752, 259], [343, 669]], 'loss': 0.5711784782353098}
Saved best model to models/model_v1.pth

=== Epoch 6/15 ===




Train: {'accuracy': 0.683947532792005, 'f1': 0.6829049942958549, 'precision': 0.6864625181817906, 'recall': 0.6839836142322098, 'confusion_matrix': [[2967, 1033], [1497, 2508]], 'loss': 0.5897110028910235}
Val:   {'accuracy': 0.6979733069698467, 'f1': 0.6965612982495333, 'precision': 0.7017904278101499, 'recall': 0.698007197507262, 'confusion_matrix': [[775, 236], [375, 637]], 'loss': 0.570603190438322}

=== Epoch 7/15 ===




Train: {'accuracy': 0.6935665209244223, 'f1': 0.6927917696840478, 'precision': 0.6955957996986198, 'recall': 0.6935981585518103, 'confusion_matrix': [[2977, 1023], [1430, 2575]], 'loss': 0.581405112841664}
Val:   {'accuracy': 0.7296094908551656, 'f1': 0.7290732682585497, 'precision': 0.7314846562660724, 'recall': 0.7296316604309121, 'confusion_matrix': [[783, 228], [319, 693]], 'loss': 0.552115708869457}
Saved best model to models/model_v1.pth

=== Epoch 8/15 ===




Train: {'accuracy': 0.7001873828856965, 'f1': 0.6992580069503147, 'precision': 0.7027568387616745, 'recall': 0.7002223782771535, 'confusion_matrix': [[3025, 975], [1425, 2580]], 'loss': 0.574425446294681}
Val:   {'accuracy': 0.7192288680177954, 'f1': 0.7114093690037271, 'precision': 0.7460588215942363, 'recall': 0.7193104115597988, 'confusion_matrix': [[894, 117], [451, 561]], 'loss': 0.5533670250342968}

=== Epoch 9/15 ===




Train: {'accuracy': 0.7051842598376015, 'f1': 0.7043350956141469, 'precision': 0.707631117809013, 'recall': 0.7052180087390761, 'confusion_matrix': [[3037, 963], [1397, 2608]], 'loss': 0.5656454166645262}
Val:   {'accuracy': 0.7103311913000494, 'f1': 0.7012343998870945, 'precision': 0.7396929311822928, 'recall': 0.7104176196228835, 'confusion_matrix': [[895, 116], [470, 542]], 'loss': 0.5622376466645973}

=== Epoch 10/15 ===




Train: {'accuracy': 0.714053716427233, 'f1': 0.7132568320577304, 'precision': 0.7165223803877396, 'recall': 0.7140869225967541, 'confusion_matrix': [[3069, 931], [1358, 2647]], 'loss': 0.5563982343018464}
Val:   {'accuracy': 0.7009391992090954, 'f1': 0.6893954947545051, 'precision': 0.7362452075594791, 'recall': 0.7010346661036895, 'confusion_matrix': [[904, 107], [498, 514]], 'loss': 0.5912070703046839}

=== Epoch 11/15 ===




Train: {'accuracy': 0.7175515302935666, 'f1': 0.7167417869687799, 'precision': 0.7201317538799026, 'recall': 0.7175852059925094, 'confusion_matrix': [[3086, 914], [1347, 2658]], 'loss': 0.5506629934391329}
Val:   {'accuracy': 0.7132970835392981, 'f1': 0.7061731338260248, 'precision': 0.7363706052022971, 'recall': 0.7133742273724212, 'confusion_matrix': [[879, 132], [448, 564]], 'loss': 0.5548876668931939}

=== Epoch 12/15 ===




Train: {'accuracy': 0.7271705184259838, 'f1': 0.7265180499851518, 'precision': 0.7294186810801326, 'recall': 0.7272013108614233, 'confusion_matrix': [[3106, 894], [1290, 2715]], 'loss': 0.5465765488512586}
Val:   {'accuracy': 0.7474048442906575, 'f1': 0.7466558050636343, 'precision': 0.7504204083005415, 'recall': 0.7474319051696164, 'confusion_matrix': [[811, 200], [311, 701]], 'loss': 0.509429448468816}
Saved best model to models/model_v1.pth

=== Epoch 13/15 ===




Train: {'accuracy': 0.7297938788257339, 'f1': 0.729062177866705, 'precision': 0.7323669360620577, 'recall': 0.7298266229712859, 'confusion_matrix': [[3129, 871], [1292, 2713]], 'loss': 0.534618564168786}
Val:   {'accuracy': 0.7004448838358873, 'f1': 0.693193967277707, 'precision': 0.7212141289405021, 'recall': 0.7003690628384216, 'confusion_matrix': [[553, 458], [148, 864]], 'loss': 0.5741409480689601}

=== Epoch 14/15 ===




Train: {'accuracy': 0.7390381011867583, 'f1': 0.7386098914906654, 'precision': 0.7406650393592722, 'recall': 0.7390636704119851, 'confusion_matrix': [[3120, 880], [1209, 2796]], 'loss': 0.5348125541232512}
Val:   {'accuracy': 0.6608996539792388, 'f1': 0.6431647402708267, 'precision': 0.7010722829801777, 'recall': 0.6610100163028818, 'confusion_matrix': [[894, 117], [569, 443]], 'loss': 0.6274031296674415}

=== Epoch 15/15 ===




Train: {'accuracy': 0.7379138038725797, 'f1': 0.737414691189674, 'precision': 0.7397905847429873, 'recall': 0.7379413233458177, 'confusion_matrix': [[3128, 872], [1226, 2779]], 'loss': 0.5281990752824763}
Val:   {'accuracy': 0.7118141374196737, 'f1': 0.7009257942053484, 'precision': 0.7481379113018598, 'recall': 0.7119086295805428, 'confusion_matrix': [[913, 98], [485, 527]], 'loss': 0.5575503963559085}

Training finished. Total time: 2056.5s
Final metrics saved to results/metrics_v1.json
