
# 🚀 Colab Setup — **CNNs-distracted-driving** (hardcoded + config-aware)

This version is **simplified and hardcoded** for your repo and URL, and it **respects your `src/ddriver/config.py`**.
- Repo name fixed to **`CNNs-distracted-driving`**
- Repo URL fixed to **`https://github.com/ClaudiaCPach/CNNs-distracted-driving`**
- Uses your `config.py` convention: when running in Colab, we **set env vars** (`DRIVE_PATH`, `DATASET_ROOT`, `OUT_ROOT`, `CKPT_ROOT`, `FAST_DATA`) so your code reads correct paths via `ddriver.config`.
- Optional `FAST_DATA` at `/content/data` for faster I/O (if you later copy data there).

> Run cells **top → bottom** the first time. Re-run **Update repo** to pull new commits after you push.


In [None]:

# üîß 0) (Optional) quick GPU check
!nvidia-smi || echo "No GPU detected ‚Äî CPU runtime is okay for setup steps."


In [None]:

# üîß 1) Fixed config for your repo + Drive layout
import os

# --- Git checkout ----------------------------------------------------------
REPO_DIRNAME = "CNNs-distracted-driving"                      # hardcoded
REPO_URL = "https://github.com/ClaudiaCPach/" + REPO_DIRNAME
BRANCH = "main"
PROJECT_ROOT = "/content/" + REPO_DIRNAME                     # checkout location inside the Colab VM

# --- Persistent storage on Google Drive ------------------------------------
DRIVE_PATH = "/content/drive/MyDrive/TFM"
DRIVE_DATA_ROOT = DRIVE_PATH + "/data"                        # holds auc.distracted.driver.dataset_v2
OUT_ROOT = DRIVE_PATH + "/outputs"
CKPT_ROOT = DRIVE_PATH + "/checkpoints"

# --- Ephemeral workspace on the VM disk ------------------------------------
FAST_DATA = "/content/data"                                   # target for the optional fast local copy

# Drive is the dataset root to begin with; a later cell may repoint this at FAST_DATA.
DATASET_ROOT = DRIVE_DATA_ROOT


In [None]:

# 🔌 2) Mount Google Drive
# Mounts Drive at /content/drive. DRIVE_PATH and every path derived from it
# live under this mount point, so they only resolve after this cell has run.
from google.colab import drive
# force_remount=False: do not force a remount when the cell is re-run.
drive.mount('/content/drive', force_remount=False)
print("‚úÖ Drive mounted.")


In [None]:

# üìÅ 3) Clone or update the repo (no name inference ‚Äî all hardcoded)
import os, subprocess

def sh(cmd):
    """Echo a shell command, run it with bash, and fail loudly on error.

    Args:
        cmd: Command line handed to ``/bin/bash`` as a single string.

    Raises:
        RuntimeError: If the command exits with a non-zero status.
    """
    print(f"\n$ {cmd}")
    exit_status = subprocess.call(cmd, shell=True, executable="/bin/bash")
    if exit_status == 0:
        return
    raise RuntimeError(f"Command failed with exit code {exit_status}: {cmd}")

# No checkout yet -> clone the branch; otherwise update the existing checkout in place.
if not os.path.isdir(PROJECT_ROOT):
    print(f"‚¨áÔ∏è Cloning {REPO_URL} ‚Üí {PROJECT_ROOT}")
    sh(f"git clone --branch {BRANCH} {REPO_URL} {PROJECT_ROOT}")
else:
    print(f"üìÅ Repo already present at {PROJECT_ROOT}. Pulling latest on branch {BRANCH}...")
    # Chained with && so a failing step aborts the rest and sh() raises.
    update_steps = [
        f"cd {PROJECT_ROOT}",
        f"git fetch origin {BRANCH}",
        f"git checkout {BRANCH}",
        f"git pull --rebase origin {BRANCH}",
    ]
    sh(" && ".join(update_steps))

print("PROJECT_ROOT =", PROJECT_ROOT)


In [None]:

# üì¶ 4) Install the repo (editable) + requirements (uses pyproject.toml if present)
import os, subprocess

def sh(cmd):
    """Print ``$ <cmd>``, execute it via bash, and raise on a non-zero exit.

    Args:
        cmd: Shell command line (a single string) executed by ``/bin/bash``.

    Raises:
        RuntimeError: When the command's exit code is not 0.
    """
    print(f"\n$ {cmd}")
    rc = subprocess.call(cmd, shell=True, executable="/bin/bash")
    if rc:
        raise RuntimeError(f"Command failed with exit code {rc}: {cmd}")

print("üîÑ Upgrading pip/setuptools/wheel...")
sh("python -m pip install --upgrade pip setuptools wheel")

has_pyproject = os.path.exists(os.path.join(PROJECT_ROOT, "pyproject.toml"))
if has_pyproject:
    print("üì¶ Editable install from pyproject.toml ...")
    sh(f"cd {PROJECT_ROOT} && pip install -e .")
else:
    print("‚ö†Ô∏è No pyproject.toml found. Skipping editable install.")

req_path = os.path.join(PROJECT_ROOT, "requirements.txt")
if os.path.exists(req_path):
    print("üìù Installing requirements.txt...")
    sh(f"pip install -r {req_path}")
else:
    print("‚ÑπÔ∏è No requirements.txt found ‚Äî continuing.")


In [None]:

# üå≥ 5) Configure environment for your ddriver.config (Colab branch)
# Your config.py reads env vars and falls back to sensible defaults when in Colab.
import os

# One mapping drives everything below: the process environment, the .env file
# and the printed summary all list the same five variables in the same order.
colab_env = {
    "DRIVE_PATH": DRIVE_PATH,
    "DATASET_ROOT": DATASET_ROOT,
    "OUT_ROOT": OUT_ROOT,
    "CKPT_ROOT": CKPT_ROOT,
    "FAST_DATA": FAST_DATA,
}
for var_name, var_value in colab_env.items():
    os.environ[var_name] = var_value

# Also write a .env (harmless in Colab; helpful if code calls load_dotenv())
env_text = "".join(f"{var_name}={var_value}\n" for var_name, var_value in colab_env.items())
with open(os.path.join(PROJECT_ROOT, ".env"), "w") as env_file:
    env_file.write(env_text)

print("‚úÖ Environment variables set for ddriver.config")
print("\nSummary:")
for var_name in colab_env:
    print(f"{var_name} = {os.environ[var_name]}")


In [None]:

# üîó 6) (Optional) Symlink dataset into repo for familiar paths (scripts that assume PROJECT_ROOT/data/...)
# Not required when using ddriver.config, but convenient for ad-hoc browsing.
import os

LOCAL_DATA_DIR = f"{PROJECT_ROOT}/data"
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)

dataset_link = os.path.join(LOCAL_DATA_DIR, "auc.distracted.driver.dataset_v2")
# NOTE(review): the link is named after the dataset folder but targets
# DATASET_ROOT, which the config cell describes as the directory *containing*
# auc.distracted.driver.dataset_v2 -- confirm this is the intended target.
link_taken = os.path.islink(dataset_link) or os.path.exists(dataset_link)
if link_taken:
    # Covers an existing symlink (even a dangling one) and a real file/dir at that path.
    print("‚ÑπÔ∏è Dataset link already exists.")
else:
    try:
        os.symlink(DATASET_ROOT, dataset_link)
        print(f"üîó Symlinked {dataset_link} ‚Üí {DATASET_ROOT}")
    except OSError as e:
        # Best effort: the link is a convenience only, so report and move on.
        print(f"‚ÑπÔ∏è Symlink skipped or failed: {e}")


In [None]:

# üîç 7) Quick sanity checks
import os, glob

def preview_dir(path, n=10):
    """Print up to ``n`` entry names found directly under ``path``, sorted by name.

    Args:
        path: Directory to list.
        n: Maximum number of entries to print; values <= 0 print no entries.

    Listing failures (missing directory, not a directory, permissions) are
    reported with a "Could not list:" line instead of being raised, because
    this is a best-effort sanity check.
    """
    print(f"Listing up to {n} items under: {path}")
    try:
        names = sorted(os.listdir(path))
    except OSError as e:
        print("Could not list:", e)
        return
    # Slicing (instead of "print, then test the counter") honours n=0.
    for name in names[:max(n, 0)]:
        print("  -", name)

print("\nTop-level DATASET_ROOT:")
dataset_root = os.environ["DATASET_ROOT"]
preview_dir(dataset_root, n=10)

# NOTE(review): this joins the split folder directly under DATASET_ROOT, while
# the copy cells build paths as <root>/auc.distracted.driver.dataset_v2/... --
# confirm which layout the Drive folder really has. The space inside
# "v2_cam1_cam2_ split_by_driver" is reproduced as-is; presumably it matches the
# on-disk folder name -- verify.
cam1_train = os.path.join(dataset_root, "v2_cam1_cam2_ split_by_driver", "Camera 1", "train")
print("\nCamera 1/train class folders (first 10):")
preview_dir(cam1_train, n=10)

# Count regular files in the first three class folders; absent folders are skipped silently.
for cls in ("c0", "c1", "c2"):
    cls_dir = os.path.join(cam1_train, cls)
    if not os.path.isdir(cls_dir):
        continue
    num_imgs = sum(1 for p in glob.glob(os.path.join(cls_dir, "*")) if os.path.isfile(p))
    print(f"  ‚Ä¢ {cls}: {num_imgs} files")


In [None]:

# ‚úÖ 8) Import smoke test (uses your package + config.py)
import sys, os
# Make the repo root and its src/ directory importable in this kernel
# (the second entry is what lets Python find src/ddriver).
for extra_path in (PROJECT_ROOT, os.path.join(PROJECT_ROOT, "src")):
    sys.path.append(extra_path)

try:
    import ddriver
    print("ddriver imported OK from:", ddriver.__file__)
except Exception as e:
    print("‚ö†Ô∏è Import failed ‚Äî check package name/setup.")
    print(e)
else:
    # The package imports; now confirm the config module loads and echo the
    # paths it resolved.
    try:
        from ddriver import config
        print("Loaded ddriver.config successfully.")
        print("config.DATASET_ROOT =", config.DATASET_ROOT)
        print("config.OUT_ROOT     =", config.OUT_ROOT)
        print("config.CKPT_ROOT    =", config.CKPT_ROOT)
        print("config.FAST_DATA    =", config.FAST_DATA)
    except Exception as e:
        print("Note: ddriver.config not imported:", e)


# 📋 9) Generate Manifest and Split CSVs

This step creates the CSV files that tell your code where all the images are and which ones go to train/val/test.

**What this does:**
- Scans all your images in the dataset folder
- Creates a big list (manifest.csv) with info about every image
- Creates three smaller lists (train.csv, val.csv, test.csv) that say which images belong where
- Saves everything to your Google Drive so it's permanent

**Why we need this:**
- Your training code needs to know which images to use
- The manifest remembers which driver each image belongs to (for VAL split)
- The split CSVs organize images into train/val/test groups


In [None]:
# Run the manifest generator
# This is like asking a librarian to catalog all your books and create reading lists

import subprocess
import sys

# Generate the manifest and the train/val/test split CSVs in a child process.
import os  # needed for os.environ / os.pathsep below; imported here so the cell stands alone

# Keeps the repo root importable in THIS kernel. It has no effect on the child
# process launched below, which starts a fresh interpreter with its own sys.path.
sys.path.insert(0, PROJECT_ROOT)

# The package lives under <repo>/src, so expose that directory to the child via
# PYTHONPATH. Without this, `python -m ddriver...` only works when the editable
# install succeeded (the install cell merely warns when pyproject.toml is missing).
child_env = dict(os.environ)
child_env["PYTHONPATH"] = os.pathsep.join(
    entry
    for entry in (os.path.join(PROJECT_ROOT, "src"), child_env.get("PYTHONPATH", ""))
    if entry
)

# Run the manifest script
# --write-split-lists means "also create train.csv, val.csv, test.csv files"
manifest_cmd = f"cd {PROJECT_ROOT} && python -m ddriver.data.manifest --write-split-lists"

print("üî® Generating manifest and split CSVs...")
print(f"Running: {manifest_cmd}\n")

# Output is captured and printed once the command has finished (not streamed).
result = subprocess.run(
    manifest_cmd,
    shell=True,
    capture_output=True,
    text=True,
    env=child_env,
)

# Show what happened
print(result.stdout)
if result.stderr:
    print("Warnings/Errors:")
    print(result.stderr)

if result.returncode == 0:
    print("\n‚úÖ Manifest and split CSVs generated successfully!")
    print(f"   Manifest: {os.environ['OUT_ROOT']}/manifests/manifest.csv")
    print(f"   Train split: {os.environ['OUT_ROOT']}/splits/train.csv")
    print(f"   Val split: {os.environ['OUT_ROOT']}/splits/val.csv")
    print(f"   Test split: {os.environ['OUT_ROOT']}/splits/test.csv")
else:
    print(f"\n‚ùå Error generating manifest (exit code {result.returncode})")
    raise RuntimeError("Manifest generation failed")


In [None]:
# Quick check: Did the CSVs get created?
# This is like checking that the librarian actually wrote down all the book lists

import pandas as pd
from pathlib import Path

out_root = Path(os.environ['OUT_ROOT'])
splits_dir = out_root / "splits"

manifest_path = out_root / "manifests" / "manifest.csv"
train_path = splits_dir / "train.csv"
val_path = splits_dir / "val.csv"
test_path = splits_dir / "test.csv"

print("üìä Checking CSV files...\n")

# Report row count and column names for every CSV that exists; flag the missing ones.
expected_csvs = {
    "Manifest": manifest_path,
    "Train": train_path,
    "Val": val_path,
    "Test": test_path,
}
for name, path in expected_csvs.items():
    if not path.exists():
        print(f"‚ùå {name}: File not found at {path}")
        continue
    df = pd.read_csv(path)
    print(f"‚úÖ {name}: {len(df)} rows, columns: {list(df.columns)}")

# Show a sample from the manifest
if manifest_path.exists():
    print("\nüìÑ Sample from manifest (first 3 rows):")
    preview_cols = ['path', 'class_id', 'driver_id', 'camera', 'split']
    sample = pd.read_csv(manifest_path).head(3)
    print(sample[preview_cols].to_string())


In [None]:
# Create a tiny balanced subset for quick testing
# Run this cell ONCE to create train_small.csv, then use it for fast experiments

import pandas as pd
from pathlib import Path
from ddriver import config

# Build train_small.csv: at most IMAGES_PER_CLASS rows per class_id from train.csv.
IMAGES_PER_CLASS = 20

train_csv = Path(config.OUT_ROOT) / "splits" / "train.csv"
train_small_csv = Path(config.OUT_ROOT) / "splits" / "train_small.csv"

print(f"Reading {train_csv}...")
df = pd.read_csv(train_csv)

# head() keeps the FIRST rows of each class in file order: deterministic, but
# not a random sample. A class with fewer rows than IMAGES_PER_CLASS
# contributes everything it has, so the subset is only balanced when every
# class has at least IMAGES_PER_CLASS rows.
small = df.groupby("class_id").head(IMAGES_PER_CLASS)

# Derive the per-class figure from the data instead of assuming 10 classes.
per_class = small["class_id"].value_counts().sort_index()
if per_class.empty:
    per_class_label = "0"
elif per_class.min() == per_class.max():
    per_class_label = str(per_class.min())
else:
    per_class_label = f"{per_class.min()}-{per_class.max()}"

print(f"Original train.csv: {len(df)} images")
print(f"Small subset: {len(small)} images ({per_class_label} per class)")
print(f"\nClass distribution in small subset:")
print(per_class)

small.to_csv(train_small_csv, index=False)
print(f"\n‚úÖ Saved to {train_small_csv}")

### ⚡️ Tiny-train option

Set `USE_TINY_SPLIT = True` in the training cell below to replace the heavy
`train.csv` with the quick `train_small.csv` (20 images per class). Validation
and test splits stay full so you still see realistic metrics.

Run the "Create a tiny balanced subset" cell once per Drive setup before
enabling this flag.


# 🧪 10) Test dataset.py and datamod.py

Now let's make sure the code that loads images actually works!

**What we're testing:**
1. **dataset.py** - Can it load a single image and give us the right info?
2. **datamod.py** - Can it create data loaders that give us batches of images?

**Why test this:**
- If these don't work, training will fail
- Better to catch problems now than later
- We want to see that images load correctly and labels are right


In [None]:
# Test 1: Can dataset.py load a single image?
# This is like testing if a worker can fetch one book from the library

from ddriver.data.dataset import AucDriverDataset
from torchvision import transforms as T
from pathlib import Path

# Build the CSV paths from the OUT_ROOT environment variable (set in the
# environment cell); ddriver.config is not consulted here.
manifest_csv = Path(os.environ['OUT_ROOT']) / "manifests" / "manifest.csv"
val_split_csv = Path(os.environ['OUT_ROOT']) / "splits" / "val.csv"

print("üß™ Test 1: Testing AucDriverDataset (dataset.py)")
print(f"   Manifest: {manifest_csv}")
print(f"   Using Val split: {val_split_csv}\n")

# Everything runs inside one try block so any failure is reported as
# "Test 1 FAILED" with a traceback and then re-raised to stop the notebook.
try:
    # Create a simple dataset (no fancy transforms, just load the image)
    simple_transforms = T.ToTensor()  # Just convert to tensor, no augmentation

    val_dataset = AucDriverDataset(
        manifest_csv=manifest_csv,
        split_csv=val_split_csv,
        transforms=simple_transforms
    )

    print(f"‚úÖ Dataset created! It has {len(val_dataset)} images in VAL split")

    # Try to load the first image
    print("\nüìñ Loading first image from VAL split...")
    # The sample is used as a mapping with keys 'image', 'label', 'driver_id',
    # 'camera' and 'path' below.
    sample = val_dataset[0]

    print(f"‚úÖ Image loaded successfully!")
    print(f"   Image shape: {sample['image'].shape} (should be [3, height, width])")
    print(f"   Label: {sample['label']} (should be 0-9)")
    print(f"   Driver ID: {sample['driver_id']} (VAL should have driver IDs)")
    print(f"   Camera: {sample['camera']} (should be 'cam1' or 'cam2')")
    print(f"   Path: {sample['path'][:80]}...")  # Show first 80 chars

    # Check that label is valid (0-9); an out-of-range label is only reported,
    # it does not fail the test.
    if 0 <= sample['label'] <= 9:
        print(f"   ‚úÖ Label is valid (0-9)")
    else:
        print(f"   ‚ùå Label {sample['label']} is NOT in range 0-9!")

    # Check that VAL has driver IDs (a missing ID is a warning, not a failure)
    if sample['driver_id'] is not None:
        print(f"   ‚úÖ VAL split has driver ID (as expected)")
    else:
        print(f"   ‚ö†Ô∏è  VAL split missing driver ID (might be okay if this image wasn't in your DRIVER_RANGES)")

    print("\n‚úÖ Test 1 PASSED: dataset.py works!")

except Exception as e:
    print(f"\n‚ùå Test 1 FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise


# 🧵 11) Full pipeline (train → predict → metrics)

Now that data loading is working, these next cells show how to:
1. Register the model you want (e.g., `resnet18` from timm)
2. Run training from the command line helper
3. Generate predictions from a checkpoint
4. Evaluate metrics and save all logs to Drive

> You can change the `RUN_TAG`, model name, epochs, etc. in the code below.


In [None]:
# Register models you want to use (run once per runtime)
# This example uses timm's resnet18.

!pip -q install timm

from ddriver.models import registry

# Register "resnet18" through the project's registry helper (presumably backed
# by timm, per the helper's name), then print the first ten entries of
# available_models() as a quick confirmation.
registry.register_timm_backbone("resnet18")
print("Available models:", registry.available_models()[:10])


## 🚂 11.1 Train a model (adjust these knobs)

- Choose a `RUN_TAG` so logs/checkpoints go into `TFM/checkpoints/runs/<tag>/...`
- Set epochs/batch size to something small for a dry run (1 epoch, 16 batch)
- This command uses the CLI helper (`python -m src.ddriver.cli.train ...`)
- Logs + checkpoints are saved automatically to Google Drive


In [None]:
import subprocess, textwrap, json, time, threading
from pathlib import Path

# ResNet-18 baseline run (change RUN_TAG for each experiment)
# RUN_TAG names the folder under CKPT_ROOT/runs/ that later cells read from.
RUN_TAG = "resnet18_full_v1"     # change me for each experiment
MODEL_NAME = "resnet18"          # must be registered above (timm)

# Training hyperparameters
EPOCHS = 15                      # for real runs; use 1–2 for quick smoke tests
BATCH_SIZE = 32                  # try 64 later if VRAM allows
NUM_WORKERS = 2                  # how many background data-loading workers
IMAGE_SIZE = 224
LR = 1e-3                        # good starting LR for fine-tuning ResNet-18
# NOTE(review): LR_DROP_EPOCH and LR_DROP_FACTOR are NOT passed to the training
# command below; in this notebook they are only read by the Google-Sheet
# logging cell. Confirm whether the CLI has matching flags that should be added.
LR_DROP_EPOCH = None             # drop LR after this epoch (1-based)
LR_DROP_FACTOR = 0.1             # multiply LR by this factor after drop
USE_TINY_SPLIT = False           # True = train_small.csv (debug); False = full train.csv

# All CSV inputs live under OUT_ROOT; only the train split switches with USE_TINY_SPLIT.
manifest_csv = Path(OUT_ROOT) / "manifests" / "manifest.csv"
train_split = "train_small.csv" if USE_TINY_SPLIT else "train.csv"
train_csv = Path(OUT_ROOT) / "splits" / train_split
val_csv = Path(OUT_ROOT) / "splits" / "val.csv"
test_csv = Path(OUT_ROOT) / "splits" / "test.csv"

if USE_TINY_SPLIT:
    print("‚ö° Using train_small.csv (20 imgs/class) for a quick smoke test.")
else:
    print("ü™µ Using full train.csv for a proper run.")

# This is a normal (non-raw) f-string, so each backslash-newline below is
# consumed by Python itself: the shell receives the whole `python -m ...`
# invocation as ONE line, preceded by the `cd` line. Values are interpolated
# unquoted, so paths must not contain spaces. Label smoothing is fixed at 0.05.
train_cmd = textwrap.dedent(f"""
cd {PROJECT_ROOT}
python -m src.ddriver.cli.train \
    --model-name {MODEL_NAME} \
    --epochs {EPOCHS} \
    --batch-size {BATCH_SIZE} \
    --num-workers {NUM_WORKERS} \
    --image-size {IMAGE_SIZE} \
    --lr {LR} \
    --label-smoothing 0.05 \
    --out-tag {RUN_TAG} \
    --manifest-csv {manifest_csv} \
    --train-csv {train_csv} \
    --val-csv {val_csv} \
    --test-csv {test_csv}
""")

print("Running training command and streaming logs:\n", train_cmd)

# Launch training; stderr is merged into stdout so one stream carries every log line.
proc = subprocess.Popen(
    train_cmd,
    shell=True,
    text=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
)

# Set once training has ended so the monitor wakes up immediately instead of
# sleeping out its current 5-second interval.
_monitor_stop = threading.Event()


# Background GPU monitor (prints every 5 seconds)
def _gpu_monitor():
    """Print GPU utilisation and memory every 5 seconds while training runs.

    Best effort: a failing ``nvidia-smi`` call is reported and polling continues.
    """
    while proc.poll() is None and not _monitor_stop.is_set():
        try:
            stats = subprocess.check_output(
                "nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total "
                "--format=csv,nounits,noheader",
                shell=True,
            ).decode("utf-8").strip()
            print(f"[GPU] util%, mem_used, mem_total :: {stats}")
        except Exception as exc:
            print("[GPU] Could not query nvidia-smi:", exc)
        _monitor_stop.wait(5)


monitor_thread = threading.Thread(target=_gpu_monitor, daemon=True)
monitor_thread.start()

# Stream CLI stdout live
if proc.stdout is None:
    raise RuntimeError("Training process has no stdout pipe.")

for line in proc.stdout:
    print(line, end="")

return_code = proc.wait()

# Stop the monitor before printing the verdict so no stray [GPU] line follows it.
_monitor_stop.set()
monitor_thread.join(timeout=10)

# A failed run must not be reported as complete: the cells that follow would
# otherwise silently pick up the history/checkpoints of an OLDER run.
if return_code != 0:
    raise RuntimeError(
        f"Training command failed with exit code {return_code}. See logs above."
    )

print("\n‚úÖ Training run complete!\n")

# --- Display every epoch's metrics so the notebook shows the learning curve ---
def _fmt_metric(value):
    """Format a metric with four decimals, or 'n/a' when it was not recorded."""
    return "n/a" if value is None else f"{value:.4f}"


run_base = Path(CKPT_ROOT) / "runs" / RUN_TAG
# Keep directories only: before Python 3.11, glob("*/") also returned plain
# files, so a stray file could be mistaken for the newest run folder.
all_runs = sorted(p for p in run_base.glob("*") if p.is_dir())
if not all_runs:
    raise FileNotFoundError(f"No run folders found under {run_base}")
# "Latest" means last in name order (run folder names are presumably timestamps -- verify).
latest_run = all_runs[-1]

history_path = latest_run / "history.json"
if not history_path.exists():
    raise FileNotFoundError(f"history.json not found in {latest_run}")

history = json.loads(history_path.read_text()).get("history", [])
print(f"üìä Epoch metrics for run: {latest_run.name}")
for record in history:
    # `or {}` tolerates an explicit null for either split in the JSON.
    train_metrics = record.get("train") or {}
    val_metrics = record.get("val") or {}
    train_loss = train_metrics.get("loss")
    train_acc = train_metrics.get("accuracy")
    val_loss = val_metrics.get("loss")
    val_acc = val_metrics.get("accuracy")
    val_str = (
        f"val_loss={val_loss:.4f} acc={val_acc:.4f}"
        if val_loss is not None and val_acc is not None
        else "val_loss=‚Äî val_acc=‚Äî"
    )
    # Missing train metrics print as n/a instead of raising TypeError inside the format spec.
    print(
        f"  Epoch {str(record.get('epoch', '?')):>2}: "
        f"train_loss={_fmt_metric(train_loss)} acc={_fmt_metric(train_acc)}  "
        f"{val_str}"
    )
print("")


## 📝 11.1a Log training summary to Google Sheet
Run this right after the training cell finishes. It looks up the newest run under `CKPT_ROOT/runs/<RUN_TAG>`, grabs the best/final train + val accuracies, and logs the model/hyperparams so you can compare experiments before doing predictions or metrics.


In [None]:
# üìù Training summary ‚Üí Google Sheet
!pip -q install gspread

import json
from pathlib import Path

import gspread
from google.colab import auth
import google.auth

# Authenticate this Colab session and build a gspread client from its default credentials.
auth.authenticate_user()
creds, _ = google.auth.default()
gc = gspread.authorize(creds)

TRAIN_SHEET_NAME = "TFM Train Logs"   # create this sheet/tab ahead of time
TRAIN_WORKSHEET = "Sheet1"

# Locate the newest run folder for RUN_TAG.
run_base = Path(CKPT_ROOT) / "runs" / RUN_TAG
# Keep directories only: before Python 3.11, glob("*/") also returned plain
# files, so a stray file could be mistaken for the newest run folder.
all_runs = sorted(p for p in run_base.glob("*") if p.is_dir())
if not all_runs:
    raise FileNotFoundError(f"No run folders found under {run_base}")
latest_run = all_runs[-1]
print(f"Logging training summary for run folder: {latest_run}")

history_path = latest_run / "history.json"
if not history_path.exists():
    raise FileNotFoundError(f"history.json not found under {latest_run}")

history_records = json.loads(history_path.read_text()).get("history", [])
if not history_records:
    raise ValueError(f"history.json under {latest_run} has no records.")

# params.json is optional; when it (or a key) is missing, fall back to the
# notebook globals set in the training cell.
params_path = latest_run / "params.json"
params = json.loads(params_path.read_text()) if params_path.exists() else {}

model_name = params.get("model_name", MODEL_NAME)
epochs_cfg = params.get("epochs", EPOCHS)
batch_cfg = params.get("batch_size", BATCH_SIZE)
lr_cfg = params.get("lr", LR)
lr_drop_epoch_cfg = params.get("lr_drop_epoch", LR_DROP_EPOCH)
lr_drop_factor_cfg = params.get("lr_drop_factor", LR_DROP_FACTOR)
image_size_cfg = params.get("image_size", IMAGE_SIZE)
num_workers_cfg = params.get("num_workers", NUM_WORKERS)
use_tiny_cfg = params.get("use_tiny_split", USE_TINY_SPLIT)


def _best_metric(records, split: str) -> tuple[dict, float | None]:
    best_epoch = None
    best_acc = None
    for rec in records:
        split_metrics = rec.get(split) or {}
        acc = split_metrics.get("accuracy")
        if acc is None:
            continue
        if best_acc is None or acc > best_acc:
            best_acc = acc
            best_epoch = rec.get("epoch")
    final_metrics = records[-1].get(split) or {}
    final_acc = final_metrics.get("accuracy")
    return {"epoch": best_epoch, "accuracy": best_acc}, final_acc


def _sheet_cell(value, digits=None):
    """Return '' for a missing value; otherwise the value, rounded when ``digits`` is given."""
    if value is None:
        return ""
    if digits is None:
        return value
    return round(value, digits)


best_train, final_train = _best_metric(history_records, "train")
best_val, final_val = _best_metric(history_records, "val")

# Column order: run identity, hyperparameters, then best/final accuracies.
run_identity = [RUN_TAG, latest_run.name, model_name]
hyperparams = [
    epochs_cfg,
    batch_cfg,
    lr_cfg,
    lr_drop_epoch_cfg,
    lr_drop_factor_cfg,
    image_size_cfg,
    num_workers_cfg,
    use_tiny_cfg,
]
accuracy_summary = [
    _sheet_cell(best_train["epoch"]),
    _sheet_cell(best_train["accuracy"], 4),
    _sheet_cell(best_val["epoch"]),
    _sheet_cell(best_val["accuracy"], 4),
    _sheet_cell(final_train, 4),
    _sheet_cell(final_val, 4),
]
row = run_identity + hyperparams + accuracy_summary

# The spreadsheet and worksheet must already exist.
ws = gc.open(TRAIN_SHEET_NAME).worksheet(TRAIN_WORKSHEET)
ws.append_row(row, value_input_option="USER_ENTERED")
print(f"Appended training summary for {latest_run.name} ‚úÖ")



In [None]:
# üîÑ Optional: copy + compress dataset subset ‚Üí fast local SSD (/content/data)
# Re-encodes JPEGs once (quality 80, short side 320px) before landing in /content/data.

import importlib
import os
from pathlib import Path

from ddriver.data.fastcopy import CompressionSpec, copy_splits_with_compression

# Source (Drive) and destination (VM disk) use the same dataset folder name.
SRC_ROOT = Path(DRIVE_DATA_ROOT, "auc.distracted.driver.dataset_v2")
DST_ROOT = Path(FAST_DATA, "auc.distracted.driver.dataset_v2")

# Splits to copy in this cell; the test split is handled by a separate cell.
splits_dir = Path(OUT_ROOT) / "splits"
split_csvs = {
    split_name: splits_dir / f"{split_name}.csv"
    for split_name in ("train", "val", "train_small")
}

# Re-encode settings: short side 320 px, JPEG quality 80.
compression_spec = CompressionSpec(target_short_side=320, jpeg_quality=80)

summary = copy_splits_with_compression(
    split_csvs=split_csvs,
    src_root=SRC_ROOT,
    dst_root=DST_ROOT,
    compression=compression_spec,
    skip_existing=True,
)

print(
    f"\nüìâ FAST_DATA copy stats: processed {summary['processed']} of {summary['total']} files "
    f"(skipped {summary['skipped']} already present)."
)
print(f"Compressed dataset root: {summary['dst_root']}")

# Repoint the notebook global and the environment variable at the local copy,
# then reload ddriver.config so the change is visible through it as well.
DATASET_ROOT = FAST_DATA
os.environ["DATASET_ROOT"] = str(DATASET_ROOT)
try:
    from ddriver import config as _ddriver_config
    importlib.reload(_ddriver_config)
    print("\n‚ö° Copy complete. DATASET_ROOT now points to the local FAST_DATA copy for this runtime:")
    print("   ddriver.config.DATASET_ROOT =", _ddriver_config.DATASET_ROOT)
except Exception as exc:
    # The environment variable is already updated; only the reload failed.
    print("\n‚ö° Copy complete and DATASET_ROOT env updated, but could not reload ddriver.config:", exc)
print("   (Re-run env summary if you want to rewrite .env, but training now uses /content/data.)")


In [None]:
# üîÑ Optional: copy + compress TEST split ‚Üí /content/data (same settings as train/val)

import importlib
from pathlib import Path

from ddriver.data.fastcopy import CompressionSpec, copy_splits_with_compression

# Same source/destination roots and compression settings as the train/val copy cell.
SRC_ROOT = Path(DRIVE_DATA_ROOT, "auc.distracted.driver.dataset_v2")
DST_ROOT = Path(FAST_DATA, "auc.distracted.driver.dataset_v2")

split_csvs = {"test": Path(OUT_ROOT, "splits", "test.csv")}

compression_spec = CompressionSpec(target_short_side=320, jpeg_quality=80)

summary = copy_splits_with_compression(
    split_csvs=split_csvs,
    src_root=SRC_ROOT,
    dst_root=DST_ROOT,
    compression=compression_spec,
    skip_existing=True,
)

print(
    f"\nüìâ FAST_DATA test copy stats: processed {summary['processed']} of {summary['total']} "
    f"(skipped {summary['skipped']} already present)."
)
print(f"Compressed dataset root: {summary['dst_root']}")

# This cell does not change DATASET_ROOT itself (the train/val copy cell does);
# it only reloads ddriver.config and echoes what the module currently resolves.
try:
    from ddriver import config as _ddriver_config
    importlib.reload(_ddriver_config)
    print("\n‚ö° Test copy complete. ddriver.config now sees:")
    print("   ddriver.config.DATASET_ROOT =", _ddriver_config.DATASET_ROOT)
except Exception as exc:
    print("\n‚ö° Test copy complete; config reload optional:", exc)

## 👀 11.1b Optional: sanity-check a few images
Run this right after the copy+compress cell to view originals from Drive next to their compressed FAST_DATA twins. You can change `NUM_SAMPLES` or switch which split to inspect if you want more spot checks.


In [None]:
# üëÄ Visual sanity check: Drive vs FAST_DATA
import random
from pathlib import Path

import pandas as pd
from IPython.display import display
from PIL import Image

NUM_SAMPLES = 3          # number of image pairs to show
SPLIT_FOR_CHECK = "val"  # one of 'train', 'val', 'train_small'

# Resolve the split CSV and fail early with a clear message if it is missing or has no rows.
split_csv = Path(OUT_ROOT, "splits", f"{SPLIT_FOR_CHECK}.csv")
if not split_csv.exists():
    raise FileNotFoundError(f"Split CSV not found: {split_csv}. Run manifest generation first.")

df = pd.read_csv(split_csv)
if df.empty:
    raise ValueError(f"No rows in {split_csv}; cannot sample images.")

marker = "auc.distracted.driver.dataset_v2"
marker_lower = marker.lower()

def _relative_path(path_str: str) -> Path:
    path_str = str(path_str)
    path = Path(path_str)
    if path.is_absolute():
        lowered = path_str.lower()
        idx = lowered.find(marker_lower)
        if idx == -1:
            raise ValueError(f"Could not locate dataset marker '{marker}' inside: {path_str}")
        rel = Path(path_str[idx:])
    else:
        rel = path
    parts = rel.parts
    if parts and parts[0].lower() == marker_lower:
        rel = Path(*parts[1:])
    return rel

def _show_image(caption, image_path):
    """Print a caption, then display an in-memory copy of the image at ``image_path``."""
    print(caption)
    with Image.open(image_path) as handle:
        # copy() detaches the pixels from the file handle before it is closed.
        display(handle.copy())


# Fixed seed: the same images are picked on every run.
sample_count = min(NUM_SAMPLES, len(df))
sample_paths = df["path"].sample(n=sample_count, replace=False, random_state=42).tolist()

for idx, sample_path in enumerate(sample_paths, 1):
    rel = _relative_path(sample_path)
    # NOTE(review): _relative_path() strips the dataset folder name, and both
    # paths below are joined directly under DRIVE_DATA_ROOT / FAST_DATA, whereas
    # the copy cells use <root>/auc.distracted.driver.dataset_v2 as source and
    # destination roots -- confirm these locations agree with the real layout.
    drive_img_path = Path(DRIVE_DATA_ROOT) / rel
    fast_img_path = Path(FAST_DATA) / rel

    if not drive_img_path.exists():
        raise FileNotFoundError(f"Drive image missing: {drive_img_path}")
    if not fast_img_path.exists():
        raise FileNotFoundError(
            f"FAST_DATA image missing: {fast_img_path}. Run the copy+compress cell first."
        )

    print(f"\nSample {idx}: {rel}")
    _show_image("Drive (original):", drive_img_path)
    _show_image("FAST_DATA (compressed):", fast_img_path)



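## 📦 11.2 Pick the latest checkpoint file

This cell looks inside `CKPT_ROOT/runs/<RUN_TAG>/`, selects a run folder (the newest by default), and picks the checkpoint named by `CHECKPOINT_NAME` (`best.pt` by default). Set `CHECKPOINT_NAME = None` to take the last match among `epoch_*.pt`, `best.pt`, and `last.pt` instead. Use this path in the prediction step.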


In [None]:
from pathlib import Path

import re


def _natural_key(ckpt_path):
    """Sort key that orders embedded integers numerically (epoch_2 before epoch_10)."""
    return [
        int(token) if token.isdigit() else token
        for token in re.split(r"(\d+)", ckpt_path.name)
    ]


RUN_TAG = globals().get("RUN_TAG", "resnet18_full_v1")  # reuse your latest training tag by default

run_base = Path(CKPT_ROOT) / "runs" / RUN_TAG
# Keep directories only: before Python 3.11, glob("*/") also returned plain
# files, so a stray file could show up as a selectable "run".
runs = sorted(p for p in run_base.glob("*") if p.is_dir())
if not runs:
    raise FileNotFoundError(f"No run folders found under {run_base}")

# ---- choose which run folder to use ----
RUN_IDX = -1          # -1 = newest, 0 = oldest, or any index from the printout below
print("Available runs:")
for idx, run_dir in enumerate(runs):
    print(f"  [{idx}] {run_dir.name}")
target_run = runs[RUN_IDX]
print(f"\nSelected run: {target_run}\n")

# ---- choose which checkpoint (epoch) inside that run ----
# Patterns are collected in this order, so with CHECKPOINT_NAME = None the
# "last match" is last.pt if present, else best.pt, else the highest epoch file.
checkpoint_patterns = ["epoch_*.pt", "best.pt", "last.pt"]
checkpoints = []
for pattern in checkpoint_patterns:
    # Natural sort: a plain lexicographic sort would put epoch_10.pt before epoch_2.pt.
    checkpoints.extend(sorted(target_run.glob(pattern), key=_natural_key))

if not checkpoints:
    raise FileNotFoundError(f"No checkpoints found under {target_run}")

CHECKPOINT_NAME = "best.pt"  # or "last.pt", or None to take the last match
if CHECKPOINT_NAME:
    chosen_ckpt = target_run / CHECKPOINT_NAME
    if not chosen_ckpt.exists():
        available = ", ".join(p.name for p in checkpoints)
        raise FileNotFoundError(f"{chosen_ckpt} not found (available: {available})")
else:
    chosen_ckpt = checkpoints[-1]

LATEST_CKPT = chosen_ckpt
print("Using checkpoint:", LATEST_CKPT)

## 🔮 11.3 Generate predictions CSV

- Uses the checkpoint above
- Choose which split to predict on (`val` or `test`)
- Saves CSV under `OUT_ROOT/preds/<split>/<out_tag>.csv`


In [None]:
# Imported here so this cell also works in a fresh runtime where the training
# cell (which originally brought these names in) has not been executed.
import subprocess
import textwrap

# Same idea as RUN_TAG in the checkpoint cell: reuse the training cell's values
# when they exist, otherwise fall back to that cell's defaults.
MODEL_NAME = globals().get("MODEL_NAME", "resnet18")
BATCH_SIZE = globals().get("BATCH_SIZE", 32)
NUM_WORKERS = globals().get("NUM_WORKERS", 2)
IMAGE_SIZE = globals().get("IMAGE_SIZE", 224)

PRED_SPLIT = "val"           # or "test"
PRED_TAG = f"{RUN_TAG}_{PRED_SPLIT}"

# Normal (non-raw) f-string: each backslash-newline is consumed by Python, so
# the shell receives the whole `python -m ...` invocation as one line.
predict_cmd = textwrap.dedent(f"""
cd {PROJECT_ROOT}
python -m src.ddriver.cli.predict \
    --model-name {MODEL_NAME} \
    --checkpoint {LATEST_CKPT} \
    --split {PRED_SPLIT} \
    --batch-size {BATCH_SIZE} \
    --num-workers {NUM_WORKERS} \
    --image-size {IMAGE_SIZE} \
    --out-tag {PRED_TAG}
""")

print("Running prediction command:\n", predict_cmd)
# Output is not captured, so the CLI writes straight to the cell output.
result = subprocess.run(predict_cmd, shell=True, text=True)
if result.returncode != 0:
    raise RuntimeError("Prediction command failed. See logs above.")
print("\n‚úÖ Predictions completed! Check OUT_ROOT/preds/")


## 📊 11.4 Evaluate metrics

- Uses the `src.ddriver.eval.metrics` module (run as `python -m src.ddriver.eval.metrics`)
- Reads the manifest + split CSV + predictions CSV
- Saves results under `OUT_ROOT/metrics/<tag>/<timestamp>/`
- Shows accuracy + macro F1 + per-driver/camera (optional)


In [None]:
from pathlib import Path

# Inputs for the metrics CLI: manifest, the split that was predicted on, and the
# predictions CSV written by the prediction cell.
out_root = Path(OUT_ROOT)
manifest_path = out_root / "manifests" / "manifest.csv"
split_csv_path = out_root / "splits" / f"{PRED_SPLIT}.csv"
preds_csv_path = out_root / "preds" / PRED_SPLIT / f"{PRED_TAG}.csv"
METRICS_TAG = PRED_TAG

# Normal (non-raw) f-string: each backslash-newline is consumed by Python, so
# the shell receives the whole `python -m ...` invocation as one line.
metrics_cmd = textwrap.dedent(f"""
cd {PROJECT_ROOT}
python -m src.ddriver.eval.metrics \
    --manifest {manifest_path} \
    --split-csv {split_csv_path} \
    --predictions {preds_csv_path} \
    --out-tag {METRICS_TAG} \
    --per-driver \
    --per-camera
""")

print("Running metrics command:\n", metrics_cmd)
result = subprocess.run(metrics_cmd, shell=True, text=True)
metrics_failed = result.returncode != 0
if metrics_failed:
    raise RuntimeError("Metrics command failed. See logs above.")
print("\n‚úÖ Metrics saved under OUT_ROOT/metrics/")


In [None]:
# Test 2: Can datamod.py create data loaders and load batches?
# This is like testing if the teacher can organize students into groups and give them work
#
# The cell builds a config from ddriver.config paths, builds the loaders, pulls
# one batch from the train and val loaders and prints what it finds.
# A wrong train image shape is a hard failure (the cell raises); missing VAL
# driver IDs only produces a warning.

from ddriver.data.datamod import build_dataloaders, make_cfg_from_config
import torch

print("üß™ Test 2: Testing build_dataloaders (datamod.py)\n")

try:
    # Create config using the helper that uses ddriver.config paths
    # This is the easy way - it automatically finds your CSVs!
    cfg = make_cfg_from_config(
        batch_size=4,  # Small batch for testing (faster)
        num_workers=2,  # Use 2 workers (Colab might have limited CPUs)
        image_size=224,  # Standard image size
    )

    print("‚úÖ Config created using ddriver.config paths:")
    print(f"   Manifest: {cfg.manifest_csv}")
    print(f"   Train: {cfg.train_split_csv}")
    print(f"   Val: {cfg.val_split_csv}")
    print(f"   Test: {cfg.test_split_csv}\n")

    # Build the data loaders
    print("üî® Building data loaders...")
    loaders = build_dataloaders(cfg)

    print("‚úÖ Data loaders created!")
    print(f"   Available splits: {list(loaders.keys())}\n")

    # Test train loader: fetch only the first batch
    print("üì¶ Testing TRAIN loader...")
    train_loader = loaders["train"]
    train_batch = next(iter(train_loader))

    print(f"   ‚úÖ Train batch loaded!")
    print(f"   Batch size: {train_batch['image'].shape[0]} images")
    print(f"   Image shape: {train_batch['image'].shape} (should be [batch_size, 3, 224, 224])")
    print(f"   Labels: {train_batch['label'].tolist()} (should be list of 0-9)")
    print(f"   Driver IDs: {train_batch['driver_id']} (train should mostly be None)")
    print(f"   Cameras: {train_batch['camera']}")

    # Check image shape is correct. A mismatch must fail the test: raising here
    # routes it through the FAILED handler below instead of letting the cell go
    # on to print "Test 2 PASSED".
    expected_shape = (cfg.batch_size, 3, cfg.image_size, cfg.image_size)
    if train_batch['image'].shape == expected_shape:
        print(f"   ‚úÖ Image shape is correct: {train_batch['image'].shape}")
    else:
        raise RuntimeError(
            f"Image shape wrong! Got {train_batch['image'].shape}, expected {expected_shape}"
        )

    # Test val loader: fetch only the first batch
    print("\nüì¶ Testing VAL loader...")
    val_loader = loaders["val"]
    val_batch = next(iter(val_loader))

    print(f"   ‚úÖ Val batch loaded!")
    print(f"   Batch size: {val_batch['image'].shape[0]} images")
    print(f"   Image shape: {val_batch['image'].shape}")
    print(f"   Labels: {val_batch['label'].tolist()}")
    print(f"   Driver IDs: {val_batch['driver_id']} (VAL should have driver IDs!)")

    # Check that VAL has driver IDs (warning only; does not fail the test)
    val_has_ids = any(did is not None for did in val_batch['driver_id'])
    if val_has_ids:
        print(f"   ‚úÖ VAL batch has driver IDs (as expected)")
    else:
        print(f"   ‚ö†Ô∏è  VAL batch missing driver IDs (check your DRIVER_RANGES in manifest.py)")

    # Report the value range of the train batch for a manual sanity check
    # (should be in range roughly -2 to 2 after ImageNet normalization).
    # This is informational only; nothing is asserted on it.
    img_min, img_max = train_batch['image'].min().item(), train_batch['image'].max().item()
    print(f"\n   Image value range: [{img_min:.3f}, {img_max:.3f}]")
    print(f"   (Should be roughly -2 to 2 after ImageNet normalization)")

    print("\n‚úÖ Test 2 PASSED: datamod.py works! Data loaders are ready for training!")

except Exception as e:
    # Top-level boundary of the test cell: report, show the traceback, and
    # re-raise so the notebook run stops on failure.
    print(f"\n‚ùå Test 2 FAILED: {e}")
    import traceback
    traceback.print_exc()
    raise


### ✅ You're all set!

**What just happened:**
1. ✅ Mounted Google Drive
2. ✅ Cloned/updated your repo
3. ✅ Installed the package
4. ✅ Set up paths (works on Colab and Mac!)
5. ✅ Generated manifest.csv and train/val/test split CSVs
6. ✅ Tested that dataset.py can load images
7. ✅ Tested that datamod.py can create data loaders
8. ✅ (Optional) Registered a model + ran training → prediction → metrics pipeline

**Your CSVs are saved in Google Drive:**
- `OUT_ROOT/manifests/manifest.csv` - Big list of all images
- `OUT_ROOT/splits/train.csv` - Training images
- `OUT_ROOT/splits/val.csv` - Validation images (with driver IDs!)
- `OUT_ROOT/splits/test.csv` - Test images

**Next steps:**
- Adjust the training/prediction cells (epochs, batch size, tags) to run bigger experiments
- All paths use `ddriver.config` so it works on Colab and Mac
- Re-run **Clone/Update** cell after pushing new commits
- Optional: copy some data into `/content/data` to use `FAST_DATA` for speed



### ✅ You’re set!
- Your repo + URL are **hardcoded**.
- `ddriver.config` will see the Colab env vars and resolve paths there.
- Re-run **Clone/Update** after pushing new commits.
- Optional: copy some data into `/content/data` to use `FAST_DATA` for speed, then call `ddriver.config.dataset_dir(prefer_fast=True)` in your scripts.


In [None]:
# ---- Colab cell: append metrics + params to Google Sheet ----
!pip -q install gspread

import json
from pathlib import Path

import gspread
from google.colab import auth
import google.auth

# Authenticate this Colab session, then build a gspread client from the
# default Google credentials.
auth.authenticate_user()
creds, _ = google.auth.default()
gc = gspread.authorize(creds)

EVAL_SHEET_NAME = "TFM Eval Logs"   # create this sheet/tab ahead of time
EVAL_WORKSHEET = "Sheet1"

# Reuse the tag left in the notebook namespace by the metrics/prediction cells
# when present; otherwise fall back to the hardcoded default.
METRICS_TAG = (
    globals().get("METRICS_TAG")
    or globals().get("PRED_TAG")
    or "resnet18_full_v1_val"
)  # match the --out-tag you used
metrics_root = Path(OUT_ROOT) / "metrics" / METRICS_TAG
# Keep run *directories* only. A trailing "/" in a pathlib glob pattern is
# ignored on Python < 3.11, so "*/" would also pick up stray files there;
# filtering with is_dir() behaves the same on every version.
runs = sorted(entry for entry in metrics_root.glob("*") if entry.is_dir())
if not runs:
    raise FileNotFoundError(f"No metrics runs found under {metrics_root}")
# Last entry in name order. NOTE(review): assumes run folder names sort
# chronologically (e.g. timestamps) - confirm against the metrics CLI.
latest_metrics = runs[-1]
print("Logging metrics folder:", latest_metrics)

def _read_json(path: Path, *, required: bool = True) -> dict:
    if not path.exists():
        if required:
            raise FileNotFoundError(f"Expected file missing: {path}")
        return {}
    return json.loads(path.read_text())

# Load the artifacts of the selected run. Only metrics.json is mandatory;
# inputs.json and params.json fall back to empty dicts when absent.
metrics = _read_json(latest_metrics / "metrics.json")
inputs, params = (
    _read_json(latest_metrics / optional_name, required=False)
    for optional_name in ("inputs.json", "params.json")
)

overall = metrics.get("overall", {})
macro = overall.get("macro_avg", {})

# Headline scores, rounded to 4 decimals; a missing score is logged as 0.0.
accuracy_cell = round(overall.get("accuracy", 0.0), 4)
macro_f1_cell = round(macro.get("f1", 0.0), 4)
# Params are stored as sorted-key JSON, cut to the first 500 characters.
params_cell = json.dumps(params, sort_keys=True)[:500]

# One spreadsheet row per metrics run, in column order.
row = [
    str(latest_metrics),
    inputs.get("predictions", ""),
    inputs.get("split_source", ""),
    metrics.get("num_examples", ""),
    accuracy_cell,
    macro_f1_cell,
    params_cell,
]

spreadsheet = gc.open(EVAL_SHEET_NAME)
ws = spreadsheet.worksheet(EVAL_WORKSHEET)
ws.append_row(row, value_input_option="USER_ENTERED")
print(f"Appended metrics run {latest_metrics.name} to {EVAL_SHEET_NAME}/{EVAL_WORKSHEET} ‚úÖ")

### 📊 11.4a Visualize Confusion Matrix

Quick peek at where the model confuses classes using the most recent metrics run.


In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Reuse the tag left in the notebook namespace by the metrics/prediction cells
# when present; otherwise fall back to the hardcoded default.
METRICS_TAG = (
    globals().get("METRICS_TAG")
    or globals().get("PRED_TAG")
    or "resnet18_full_v1_val"
)  # change if you used a different --out-tag
metrics_root = Path(OUT_ROOT) / "metrics" / METRICS_TAG
# Keep run *directories* only. A trailing "/" in a pathlib glob pattern is
# ignored on Python < 3.11, so "*/" would also pick up stray files there;
# filtering with is_dir() behaves the same on every version.
runs = sorted(entry for entry in metrics_root.glob("*") if entry.is_dir())
if not runs:
    raise FileNotFoundError(f"No metrics runs found under {metrics_root}")
# Last entry in name order. NOTE(review): assumes run folder names sort
# chronologically (e.g. timestamps) - confirm against the metrics CLI.
latest_metrics = runs[-1]
print("Reading confusion matrix from:", latest_metrics)

def _save_heatmap(frame, *, fmt, title, out_path):
    """Draw ``frame`` as an annotated heatmap, save it to ``out_path``, then show it.

    Args:
        frame: DataFrame whose rows are labelled "True class" and whose
            columns are labelled "Predicted class" on the plot.
        fmt: Annotation format string passed to ``sns.heatmap``.
        title: Figure title.
        out_path: Destination passed to ``plt.savefig``.
    """
    plt.figure(figsize=(8, 6))
    sns.heatmap(frame, annot=True, fmt=fmt, cmap="Blues")
    plt.title(title)
    plt.ylabel("True class")
    plt.xlabel("Predicted class")
    plt.tight_layout()
    # Save before show() so the file is written from the still-open figure.
    plt.savefig(out_path)
    plt.show()


# Read explicitly as UTF-8 rather than the platform's locale default.
metrics = json.loads((latest_metrics / "metrics.json").read_text(encoding="utf-8"))
cm_info = metrics.get("confusion_matrix")
if not cm_info:
    raise ValueError("confusion_matrix missing from metrics.json")

# The same label list names both the rows and the columns of the matrix.
labels = cm_info["rows_cols_labels"]
cm_df = pd.DataFrame(cm_info["matrix"], index=labels, columns=labels)

# Raw counts.
counts_path = latest_metrics / "confusion_matrix_counts.png"
_save_heatmap(
    cm_df,
    fmt="d",
    title=f"Confusion matrix ‚Äì {METRICS_TAG}",
    out_path=counts_path,
)
print("Saved counts heatmap to", counts_path)

# Divide every row by its own total; rows whose total is 0 are divided by 1
# instead, which avoids a 0/0 and leaves them as all zeros.
cm_norm = cm_df.div(cm_df.sum(axis=1).replace(0, 1), axis=0)
norm_path = latest_metrics / "confusion_matrix_normalized.png"
_save_heatmap(
    cm_norm,
    fmt=".2f",
    title=f"Normalized confusion matrix ‚Äì {METRICS_TAG}",
    out_path=norm_path,
)
print("Saved normalized heatmap to", norm_path)
