<a href="https://colab.research.google.com/github/ErangaOttachchige/Final-Year-Research-Project/blob/main/02_stage1_detection_filter_training_OPTIMIZED_for_Colab_Free_T4_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Stage 1 Detection/Filter - ULTRA OPTIMIZED for Colab Free T4
##
### KEY OPTIMIZATIONS:
### 1. Parallel caching with progress (8 workers)
### 2. Reduced batch processing with gradient accumulation
### 3. Smart worker config (no RAM crash)
### 4. Optional: Skip caching, train directly from Drive (for smaller datasets)
### 5. Memory-efficient data loading
### 6. Faster evaluation

In [14]:
# ============================================================================
# SETUP: Mount Drive + Paths
# ============================================================================
from google.colab import drive
drive.mount("/content/drive")

import os
DRIVE_CCT = "/content/drive/MyDrive/datasets/cct20"
PROC_DIR  = f"{DRIVE_CCT}/processed"
CSV_STAGE1 = f"{PROC_DIR}/cct20_stage1_imagelevel.csv"

print("âœ“ PROC_DIR files:", os.listdir(PROC_DIR))
print("âœ“ CSV exists:", os.path.exists(CSV_STAGE1))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
âœ“ PROC_DIR files: ['cct20_species_annotations.csv', 'cct20_stage1_imagelevel.csv', 'cct20_stage2_species_imagelevel.csv', 'stage2_best_species_efficientnet_b0_optimized.pt', 'stage2_label_mapping.json', 'stage1_best_efficientnet_b0.pt']
âœ“ CSV exists: True


In [15]:
# ============================================================================
# INSTALL PACKAGES
# ============================================================================
!pip -q install timm torchmetrics pandas numpy scikit-learn pillow tqdm

import torch
print("âœ“ CUDA:", torch.cuda.is_available())
print("âœ“ GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

âœ“ CUDA: True
âœ“ GPU: Tesla T4


In [16]:
# ============================================================================
# LOAD CSV + VERIFY
# ============================================================================
import pandas as pd

df = pd.read_csv(CSV_STAGE1)

missing = (~df["path"].apply(os.path.exists)).sum()
print(f"âœ“ Rows: {len(df)}, Missing paths: {missing}")
print("\nSplit counts:\n", df["split"].value_counts())
print("\nLabel counts:\n", df["label_stage1"].value_counts())

# Label mapping
classes = sorted(df["label_stage1"].unique())
class_to_idx = {c: i for i, c in enumerate(classes)}
idx_to_class = {i: c for c, i in class_to_idx.items()}
df["y"] = df["label_stage1"].map(class_to_idx)

print(f"\nâœ“ Classes: {classes}")
print(f"âœ“ Num classes: {len(classes)}")

âœ“ Rows: 57864, Missing paths: 0

Split counts:
 split
test_trans    23275
test_cis      15827
train         13553
val_cis        3484
val_trans      1725
Name: count, dtype: int64

Label counts:
 label_stage1
animal    51237
empty      4014
car        2613
Name: count, dtype: int64

âœ“ Classes: ['animal', 'car', 'empty']
âœ“ Num classes: 3


In [17]:
# ============================================================================
# ULTRA-FAST PARALLEL CACHING (3-5 min instead of 80 min)
# ============================================================================
import glob, hashlib
from PIL import Image
import torchvision.transforms as T
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial

CACHE_DIR = "/content/stage1_cache_224"
os.makedirs(CACHE_DIR, exist_ok=True)

def cache_path(img_path):
    """Generate cache filename"""
    return os.path.join(CACHE_DIR, hashlib.md5(img_path.encode()).hexdigest() + ".pt")

df["cache_path"] = df["path"].apply(cache_path)

cached_count = len(glob.glob(CACHE_DIR + "/*.pt"))
print(f"\nâœ“ Cached tensors: {cached_count} / {len(df)}")

# Preprocessing transform
pre_tf = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
])

def process_one_image(row):
    """Process and cache a single image (thread-safe)"""
    cp = row["cache_path"]
    if os.path.exists(cp):
        return
    try:
        img = Image.open(row["path"]).convert("RGB")
        x = pre_tf(img)
        torch.save(x, cp)
    except Exception as e:
        # Silently skip corrupted images
        pass

if cached_count < len(df) * 0.95:
    print("ðŸ”„ Parallel caching (8 workers, ~3-8 min depending on Drive speed)...")

    rows_to_process = [row for _, row in df.iterrows()]

    with ThreadPoolExecutor(max_workers=8) as executor:
        list(tqdm(
            executor.map(process_one_image, rows_to_process),
            total=len(rows_to_process),
            desc="Caching Stage1"
        ))

    final_count = len(glob.glob(CACHE_DIR + "/*.pt"))
    print(f"âœ“ Cache done: {final_count} tensors")

    # Update cache paths (some may have failed)
    df["has_cache"] = df["cache_path"].apply(os.path.exists)
    df = df[df["has_cache"]].reset_index(drop=True)
    print(f"âœ“ Usable images: {len(df)}")
else:
    print("âœ“ Cache exists - skipping preprocessing!")


âœ“ Cached tensors: 57864 / 57864
âœ“ Cache exists - skipping preprocessing!


In [18]:
# ============================================================================
# MEMORY-EFFICIENT DATASET (loads cached tensors)
# ============================================================================
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Augmentation (applied to cached tensors during training)
aug_tf = T.Compose([
    T.RandomHorizontalFlip(0.5),
    T.ColorJitter(0.2, 0.2, 0.1),
])

class CachedDataset(Dataset):
    """Loads preprocessed tensors (very fast)"""
    def __init__(self, frame, augment=False):
        self.df = frame.reset_index(drop=True)
        self.augment = augment

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Load cached tensor (super fast!)
        x = torch.load(row["cache_path"])

        # Apply augmentation if training
        if self.augment:
            x = aug_tf(x)

        y = int(row["y"])
        return x, y

# Split data
train_df = df[df["split"] == "train"].reset_index(drop=True)
val_df   = df[df["split"] == "val_cis"].reset_index(drop=True)
valT_df  = df[df["split"] == "val_trans"].reset_index(drop=True)
test_cis_df   = df[df["split"] == "test_cis"].reset_index(drop=True)
test_trans_df = df[df["split"] == "test_trans"].reset_index(drop=True)

print(f"\nâœ“ Splits:")
print(f"  train: {len(train_df)}")
print(f"  val_cis: {len(val_df)}")
print(f"  val_trans: {len(valT_df)}")
print(f"  test_cis: {len(test_cis_df)}")
print(f"  test_trans: {len(test_trans_df)}")

# Create datasets
train_ds = CachedDataset(train_df, augment=True)
val_ds   = CachedDataset(val_df, augment=False)
valT_ds  = CachedDataset(valT_df, augment=False)
test_cis_ds   = CachedDataset(test_cis_df, augment=False)
test_trans_ds = CachedDataset(test_trans_df, augment=False)

# Balanced sampling for training
counts = train_df["y"].value_counts().sort_index()
w_class = 1.0 / counts
w_sample = train_df["y"].map(w_class).values
sampler = WeightedRandomSampler(
    torch.tensor(w_sample, dtype=torch.double),
    num_samples=len(w_sample),
    replacement=True
)

# Class weights for loss (FIXED: ensure all classes are represented)
# Create a weight array for ALL classes (not just those in train set)
class_weight = torch.ones(len(classes), dtype=torch.float32)

for class_idx in counts.index:
    class_weight[class_idx] = counts.sum() / (len(counts) * counts[class_idx])

print("\nâœ“ Train class distribution:")
print(counts)
print("\nâœ“ Loss class weights:")
print(class_weight)



âœ“ Splits:
  train: 13553
  val_cis: 3484
  val_trans: 1725
  test_cis: 15827
  test_trans: 23275

âœ“ Train class distribution:
y
0    12885
1      668
Name: count, dtype: int64

âœ“ Loss class weights:
tensor([ 0.5259, 10.1445,  1.0000])


In [19]:
# ============================================================================
# OPTIMIZED DATALOADERS (no RAM crash)
# ============================================================================
BATCH_TRAIN = 32   # Smaller batch + gradient accumulation
BATCH_EVAL  = 64
NUM_WORKERS = 2    # CRITICAL: max 2 workers on Colab free

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_TRAIN,
    sampler=sampler,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    persistent_workers=False  # Disable for Colab stability
)

val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_EVAL,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    persistent_workers=False
)

valT_loader = DataLoader(
    valT_ds,
    batch_size=BATCH_EVAL,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    persistent_workers=False
)

test_cis_loader = DataLoader(
    test_cis_ds,
    batch_size=BATCH_EVAL,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    persistent_workers=False
)

test_trans_loader = DataLoader(
    test_trans_ds,
    batch_size=BATCH_EVAL,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    persistent_workers=False
)

In [20]:
# ============================================================================
# TRAINING WITH GRADIENT ACCUMULATION + AMP
# ============================================================================
import timm
import torch.nn as nn
from sklearn.metrics import f1_score, accuracy_score

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nâœ“ Device: {device}")

# Model
model = timm.create_model("efficientnet_b0", pretrained=True, num_classes=len(classes)).to(device)

# Loss + optimizer
criterion = nn.CrossEntropyLoss(weight=class_weight.to(device))
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

# AMP scaler
scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))

# Gradient accumulation (effective batch = 32 * 2 = 64)
ACCUM_STEPS = 2

def eval_loader(loader, name="eval"):
    """Fast evaluation with progress bar"""
    model.eval()
    all_y, all_p = [], []

    with torch.no_grad():
        for x, y in tqdm(loader, desc=name, leave=False):
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            with torch.cuda.amp.autocast(enabled=(device == "cuda")):
                logits = model(x)

            preds = logits.argmax(1)
            all_y.extend(y.cpu().tolist())
            all_p.extend(preds.cpu().tolist())

    acc = accuracy_score(all_y, all_p)
    mf1 = f1_score(all_y, all_p, average="macro")

    return acc, mf1, all_y, all_p


âœ“ Device: cuda


  scaler = torch.cuda.amp.GradScaler(enabled=(device == "cuda"))


In [21]:
# ============================================================================
# TRAINING LOOP
# ============================================================================
SAVE_PATH = f"{PROC_DIR}/stage1_best_efficientnet_b0.pt"
best_val_mf1 = -1.0
EPOCHS = 5

print(f"\n{'='*60}")
print(f"TRAINING STAGE 1 - {EPOCHS} EPOCHS")
print(f"{'='*60}\n")

for epoch in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0.0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}")

    for batch_idx, (x, y) in enumerate(pbar):
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)

        # Forward pass with AMP
        with torch.cuda.amp.autocast(enabled=(device == "cuda")):
            logits = model(x)
            loss = criterion(logits, y) / ACCUM_STEPS

        # Backward pass
        scaler.scale(loss).backward()

        # Update weights every ACCUM_STEPS
        if (batch_idx + 1) % ACCUM_STEPS == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

        running_loss += loss.item() * x.size(0) * ACCUM_STEPS
        pbar.set_postfix({"loss": f"{loss.item() * ACCUM_STEPS:.4f}"})

    # Epoch metrics
    train_loss = running_loss / len(train_df)

    # Validate
    val_acc, val_mf1, _, _ = eval_loader(val_loader, "val_cis")

    print(f"\nEpoch {epoch}:")
    print(f"  train_loss = {train_loss:.4f}")
    print(f"  val_cis    = acc {val_acc:.3f}, macroF1 {val_mf1:.3f}")

    # Optional: validate on trans
    if len(valT_df) > 0:
        vt_acc, vt_mf1, _, _ = eval_loader(valT_loader, "val_trans")
        print(f"  val_trans  = acc {vt_acc:.3f}, macroF1 {vt_mf1:.3f}")

    # Save best model
    if val_mf1 > best_val_mf1:
        best_val_mf1 = val_mf1
        torch.save(model.state_dict(), SAVE_PATH)
        print(f"  ðŸ’¾ SAVED BEST (macroF1 = {best_val_mf1:.3f})")

    print()


TRAINING STAGE 1 - 5 EPOCHS



  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
Epoch 1/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 424/424 [01:25<00:00,  4.98it/s, loss=0.0000]
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):



Epoch 1:
  train_loss = 0.1315
  val_cis    = acc 0.713, macroF1 0.505


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


  val_trans  = acc 0.784, macroF1 0.294
  ðŸ’¾ SAVED BEST (macroF1 = 0.505)



  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
Epoch 2/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 424/424 [01:22<00:00,  5.14it/s, loss=0.0000]
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):



Epoch 2:
  train_loss = 0.0019
  val_cis    = acc 0.709, macroF1 0.458


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


  val_trans  = acc 0.688, macroF1 0.273



  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
Epoch 3/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 424/424 [01:23<00:00,  5.10it/s, loss=0.0000]
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):



Epoch 3:
  train_loss = 0.0003
  val_cis    = acc 0.712, macroF1 0.506


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


  val_trans  = acc 0.839, macroF1 0.305
  ðŸ’¾ SAVED BEST (macroF1 = 0.506)



  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
Epoch 4/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 424/424 [01:25<00:00,  4.97it/s, loss=0.0000]
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):



Epoch 4:
  train_loss = 0.0002
  val_cis    = acc 0.714, macroF1 0.525


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


  val_trans  = acc 0.927, macroF1 0.321
  ðŸ’¾ SAVED BEST (macroF1 = 0.525)



  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
Epoch 5/5: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 424/424 [01:23<00:00,  5.11it/s, loss=0.0000]
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):



Epoch 5:
  train_loss = 0.0006
  val_cis    = acc 0.714, macroF1 0.532


  with torch.cuda.amp.autocast(enabled=(device == "cuda")):


  val_trans  = acc 0.888, macroF1 0.314
  ðŸ’¾ SAVED BEST (macroF1 = 0.532)



In [22]:
# ============================================================================
# FINAL TEST EVALUATION
# ============================================================================
from sklearn.metrics import classification_report

print(f"\n{'='*60}")
print("FINAL TEST EVALUATION (Stage 1)")
print(f"{'='*60}\n")

# Load best model
model.load_state_dict(torch.load(SAVE_PATH, map_location=device))

# Evaluate
cis_acc, cis_mf1, cis_y, cis_p = eval_loader(test_cis_loader, "test_cis")
tr_acc, tr_mf1, tr_y, tr_p = eval_loader(test_trans_loader, "test_trans")

print(f"ðŸŽ¯ TEST CIS   â†’ acc = {cis_acc:.3f}, macroF1 = {cis_mf1:.3f}")
print(f"ðŸŽ¯ TEST TRANS â†’ acc = {tr_acc:.3f}, macroF1 = {tr_mf1:.3f}\n")

print("--- CIS REPORT ---")
print(classification_report(cis_y, cis_p, target_names=[idx_to_class[i] for i in range(len(classes))]))

print("\n--- TRANS REPORT ---")
print(classification_report(tr_y, tr_p, target_names=[idx_to_class[i] for i in range(len(classes))]))



FINAL TEST EVALUATION (Stage 1)



  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
  with torch.cuda.amp.autocast(enabled=(device == "cuda")):
                                                             

ðŸŽ¯ TEST CIS   â†’ acc = 0.921, macroF1 = 0.625
ðŸŽ¯ TEST TRANS â†’ acc = 0.907, macroF1 = 0.600

--- CIS REPORT ---
              precision    recall  f1-score   support

      animal       0.93      0.99      0.96     13856
         car       0.84      1.00      0.91       791
       empty       0.17      0.00      0.00      1180

    accuracy                           0.92     15827
   macro avg       0.64      0.67      0.63     15827
weighted avg       0.87      0.92      0.89     15827


--- TRANS REPORT ---
              precision    recall  f1-score   support

      animal       0.92      0.98      0.95     20384
         car       0.73      1.00      0.85      1113
       empty       0.04      0.00      0.00      1778

    accuracy                           0.91     23275
   macro avg       0.56      0.66      0.60     23275
weighted avg       0.85      0.91      0.87     23275





In [23]:
# ============================================================================
# SAVE METADATA
# ============================================================================
import json

mapping = {
    "classes": classes,
    "class_to_idx": class_to_idx
}

out_json = f"{PROC_DIR}/stage1_label_mapping.json"
with open(out_json, "w") as f:
    json.dump(mapping, f, indent=2)

print(f"\nâœ“ Saved: {out_json}")
print("âœ“ Stage 1 training complete!")



âœ“ Saved: /content/drive/MyDrive/datasets/cct20/processed/stage1_label_mapping.json
âœ“ Stage 1 training complete!
