# Kaggle: YOLOv12n Training Pipeline (Complete)

This notebook trains Ultralytics YOLOv12n on a YOLO-formatted dataset hosted in `/kaggle/input`. It includes:
- Environment setup and dependency installation
- Configuration and reproducibility
- Accelerator detection (CUDA GPU / Apple MPS / CPU) and the matching AMP dtype
- Data sanity checks (counts, empty labels, class distribution)
- Training with Ultralytics (mirrors your local hyperparameters)
- Metrics visualization and inference preview
- Artifact export to `/kaggle/working` for download

In [None]:
# 1) Import and Install Dependencies
# - Ensure ultralytics and common libs are available on Kaggle

import sys, subprocess, os

def pip_install(pkg):
    """Install a pip requirement using the interpreter that runs this notebook.

    Args:
        pkg: Requirement specifier handed to pip unchanged
            (e.g. ``"pandas>=2.0.0"``).

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    print(f"Installing {pkg}...")
    # Invoke pip as a module of ``sys.executable`` so the package is installed
    # for the same interpreter that is executing this code.
    pip_command = [sys.executable, "-m", "pip", "install", "-q"]
    subprocess.check_call(pip_command + [pkg])

# Core requirements, installed one at a time in this order
for requirement in (
    "ultralytics>=8.1.0,<9",
    "pandas>=2.0.0",
    "numpy>=1.24.0",
    "scikit-learn>=1.2.0",
    "matplotlib>=3.7.0",
):
    pip_install(requirement)

# Optional visualization tools: OpenCV is installed only when importing it fails
try:
    import cv2  # noqa
except Exception:
    pip_install("opencv-python")

from ultralytics import YOLO
import pandas as pd
import numpy as np
import json, random, time, glob
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
# 2) Configuration and Reproducibility

class Cfg:
    """Notebook-wide settings, each overridable through an environment variable.

    The environment is read once, when this class body executes, so any
    override must be exported before this cell runs.
    """

    # --- Dataset -----------------------------------------------------------
    # Slug of the dataset uploaded to Kaggle Datasets (adjust to your dataset),
    # e.g. 'username/indian-urban-dataset-yolo'
    dataset_slug = os.getenv('DATASET_SLUG', 'username/indian-urban-dataset-yolo')

    # --- Paths inside Kaggle -----------------------------------------------
    input_root = Path('/kaggle/input')
    work_root = Path('/kaggle/working')
    # Optional sub-directory name to prefer inside the dataset mount, if present
    data_dirname = os.getenv('DATA_DIRNAME', 'Indian_Urban_Dataset_yolo')

    # --- Training ----------------------------------------------------------
    weights = os.getenv('WEIGHTS', 'yolov12n.pt')
    epochs = int(os.getenv('EPOCHS', 80))
    imgsz = int(os.getenv('IMGSZ', 640))
    batch = int(os.getenv('BATCH', 16))
    device = os.getenv('DEVICE', '')  # '' = auto

    # --- Reproducibility ---------------------------------------------------
    seed = int(os.getenv('SEED', 42))


# Shared settings object used by the cells below
cfg = Cfg()

# Seeding
import torch
import random

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Ask cuDNN for deterministic kernels and switch off its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

seed_everything(cfg.seed)

# Resolve dataset mount dir: the folder under input_root is named after the
# last component of the slug ('username/name' -> 'name')
slug_tail = cfg.dataset_slug.rsplit('/', 1)[-1]
DATASET_MOUNT = cfg.input_root / slug_tail
assert DATASET_MOUNT.exists(), f"Dataset mount not found at {DATASET_MOUNT}. Attach the dataset in Kaggle (Add Data)."

# Decide where the dataset actually lives:
#   - a zip anywhere under the mount -> extract the first one into the working dir
#   - otherwise use the mount itself, preferring cfg.data_dirname when it exists
import zipfile
zip_candidates = list(DATASET_MOUNT.rglob('*.zip'))
if not zip_candidates:
    candidate = DATASET_MOUNT / cfg.data_dirname
    DATASET_DIR = candidate if candidate.exists() else DATASET_MOUNT
else:
    first_zip = zip_candidates[0]
    extract_root = cfg.work_root / 'extracted_dataset'
    extract_root.mkdir(parents=True, exist_ok=True)
    print('Found zip:', first_zip)
    with zipfile.ZipFile(first_zip, 'r') as zf:
        zf.extractall(extract_root)
    # A single top-level directory in the extraction root is treated as the
    # dataset; with zero or several directories the extraction root is used.
    subdirs = [entry for entry in extract_root.iterdir() if entry.is_dir()]
    if len(subdirs) == 1:
        DATASET_DIR = subdirs[0]
    else:
        DATASET_DIR = extract_root

print('DATASET_DIR =', DATASET_DIR)

# Locate data.yaml anywhere under DATASET_DIR.
# The recursive search already includes DATASET_DIR / 'data.yaml', so no
# separate root check is needed. Matches are sorted by depth (then path) so
# the shallowest file wins and the choice does not depend on the order in
# which the filesystem lists entries.
import glob as _glob
candidates = sorted(
    (p for p in DATASET_DIR.rglob('data.yaml') if p.is_file()),
    key=lambda p: (len(p.parts), str(p)),
)
DATA_YAML = candidates[0] if candidates else None

assert DATA_YAML is not None and DATA_YAML.exists(), f"data.yaml not found anywhere under {DATASET_DIR}"
if len(candidates) > 1:
    # Surface ambiguity instead of silently picking one file.
    print(f"Note: {len(candidates)} data.yaml files found; using the shallowest one.")
print('DATA_YAML   =', DATA_YAML)


In [None]:
# 3) Detect Accelerator (CUDA GPU / Apple MPS / CPU) and AMP dtype
#
# Sets two globals:
#   device_name: 'cuda', 'mps' or 'cpu'
#   amp_dtype:   torch.float16 (cuda), torch.bfloat16 (mps), torch.float32 (cpu),
#                or None when torch itself cannot be imported

try:
    import torch
except Exception as e:
    # No torch means there is no dtype object to report either; fall back
    # without touching the `torch` name.
    print('Torch not available, defaulting to CPU', e)
    device_name = 'cpu'
    amp_dtype = None
else:
    # Probe for torch.backends.mps with getattr so a build that lacks the
    # attribute falls through to CPU instead of raising.
    mps_backend = getattr(torch.backends, 'mps', None)
    if torch.cuda.is_available():
        device_name = 'cuda'
    elif mps_backend is not None and mps_backend.is_available():
        device_name = 'mps'
    else:
        device_name = 'cpu'
    print('Auto device =', device_name)

    amp_dtype = {
        'cuda': torch.float16,
        'mps': torch.bfloat16,
    }.get(device_name, torch.float32)

print('AMP dtype =', amp_dtype)


In [None]:
# 4) Load Data from /kaggle/input (or extracted) and basic EDA

import yaml
with open(DATA_YAML, 'r') as yaml_file:
    data_cfg = yaml.safe_load(yaml_file)

print('data.yaml contents:')
print(json.dumps(data_cfg, indent=2))

# Derive absolute split paths.
# Every relative split entry is resolved against DATASET_DIR. When data.yaml
# carries a non-empty 'path' key, the in-memory copy is overwritten with
# DATASET_DIR so both always agree.
base_path = DATASET_DIR
if 'path' in data_cfg and data_cfg['path']:
    data_cfg['path'] = str(base_path)

split_paths = {}
for split_name in ('train', 'val'):
    raw = data_cfg[split_name]
    if str(raw).startswith('/'):
        # Absolute entries are taken as-is
        resolved = Path(raw)
    else:
        resolved = base_path / raw
    split_paths[split_name] = resolved
    print(split_name, 'path =', resolved)
    assert resolved.exists(), f"Missing split path: {resolved}"

# Count images/labels and empty labels
from collections import Counter

def yolo_stats(labels_root: Path):
    """Summarise the YOLO label files found recursively under ``labels_root``.

    Args:
        labels_root: Directory searched recursively for ``*.txt`` files.

    Returns:
        A ``(num_files, num_empty, class_counts)`` tuple where ``num_empty``
        counts files that hold no annotation line (zero bytes or whitespace
        only) and ``class_counts`` maps each class token (the first field of
        a line, kept as a string) to its number of occurrences, ordered by
        the integer value of the token.

    Raises:
        ValueError: If a class token cannot be converted to ``int`` while
            ordering the distribution.
    """
    label_files = list(labels_root.rglob('*.txt'))
    empty = 0
    cls_count = Counter()
    for label_file in label_files:
        # Cheap size check first so zero-byte files are never opened.
        if label_file.stat().st_size == 0:
            empty += 1
            continue
        rows = (line.split() for line in label_file.read_text().splitlines())
        class_tokens = [fields[0] for fields in rows if fields]
        if not class_tokens:
            # Only blank lines: no annotations, so it counts as empty too.
            empty += 1
            continue
        cls_count.update(class_tokens)
    ordered = dict(sorted(cls_count.items(), key=lambda item: int(item[0])))
    return len(label_files), empty, ordered

# Infer each split's labels directory from its image directory and print counts.
# Both common layouts are handled by swapping the last 'images' path component
# for 'labels' (images/train -> labels/train, train/images -> train/labels).
# The previous grandparent-based guess is kept only as a fallback.
_IMAGE_SUFFIXES = {'.jpg', '.jpeg', '.png', '.bmp', '.webp', '.tif', '.tiff'}


def _labels_dir_for(img_dir, split):
    """Return the labels directory paired with ``img_dir`` for ``split``."""
    parts = list(img_dir.parts)
    for idx in range(len(parts) - 1, -1, -1):
        if parts[idx] == 'images':
            parts[idx] = 'labels'
            return Path(*parts)
    # No 'images' component: assume <root>/<something>/<split> next to <root>/labels/<split>
    return img_dir.parent.parent / 'labels' / split


for split in ['train', 'val']:
    img_dir = split_paths[split]
    labels_dir = _labels_dir_for(img_dir, split)
    if not labels_dir.exists():
        # Make a wrong guess visible instead of silently reporting zero labels.
        print(f"WARNING: labels directory not found for split={split}: {labels_dir}")
    num_labels, empty_labels, dist = yolo_stats(labels_dir)
    # Case-insensitive suffix match so .JPG / .jpeg files are counted as well.
    num_images = sum(
        1
        for image_path in img_dir.rglob('*')
        if image_path.is_file() and image_path.suffix.lower() in _IMAGE_SUFFIXES
    )
    print(f"Split={split}: images={num_images}, label_files={num_labels}, empty_labels={empty_labels}")
    print('class distribution:', dist)


In [None]:
# Patch data.yaml for Kaggle runtime (write to /kaggle/working)

import yaml
# Write a copy of data.yaml whose 'path' points at the resolved dataset root.
# The copy goes to the working directory; the original file is only read.
WORK_DATA_YAML = cfg.work_root / 'data_kaggle.yaml'

with open(DATA_YAML, 'r') as f:
    cfg_yaml = yaml.safe_load(f)

# DATASET_DIR is already the resolved dataset root (the data_dirname
# sub-directory or the extracted folder when one was found), and it is the
# base the sanity-check cell validated the split paths against. Appending
# cfg.data_dirname again would point at a directory that need not exist.
cfg_yaml['path'] = str(DATASET_DIR)

with open(WORK_DATA_YAML, 'w') as f:
    yaml.safe_dump(cfg_yaml, f)

print('Wrote patched data.yaml to:', WORK_DATA_YAML)
print('Using path:', cfg_yaml['path'])

In [None]:
# 5) Training with Ultralytics YOLO

# Mirror your local hyperparameters
# The run name is defined once so the folder Ultralytics writes to and the
# folder inspected below cannot drift apart.
RUN_NAME = 'yolov12n_indian_urban_kaggle'

print('Starting training...')
model = YOLO(cfg.weights)
train_result = model.train(
    data=str(WORK_DATA_YAML),
    epochs=cfg.epochs,
    imgsz=cfg.imgsz,
    batch=cfg.batch,
    device=cfg.device,  # ''=auto; Kaggle will pick GPU if available
    seed=cfg.seed,  # hand the configured seed to the trainer, which otherwise uses its own default
    project=str(cfg.work_root),
    name=RUN_NAME,
    exist_ok=True,  # reuse the same run folder on re-runs
)
print(train_result)

# Locate artifacts
ART_DIR = cfg.work_root / RUN_NAME
print('Artifacts at:', ART_DIR)
assert ART_DIR.exists(), 'Training artifact directory missing'
WEIGHTS_BEST = ART_DIR / 'weights' / 'best.pt'
WEIGHTS_LAST = ART_DIR / 'weights' / 'last.pt'
print('Best weights:', WEIGHTS_BEST)
print('Last weights:', WEIGHTS_LAST)


In [None]:
# 6) Metrics visualization (results.csv, results.png)

import pandas as pd
import matplotlib.pyplot as plt

# Import the display helpers explicitly so this cell does not depend on the
# `display` global that IPython injects into notebook sessions.
from IPython.display import Image, display

results_csv = ART_DIR / 'results.csv'
results_png = ART_DIR / 'results.png'

if results_csv.exists():
    df = pd.read_csv(results_csv)
    if df.empty:
        # Selecting the last row of an empty frame would raise IndexError.
        print('results.csv has no rows at', results_csv)
    else:
        display(df.tail(3))
        print('Final row:')
        display(df.iloc[[-1]])
else:
    print('results.csv not found at', results_csv)

if results_png.exists():
    display(Image(filename=str(results_png)))
else:
    print('results.png not found at', results_png)


In [None]:
# 7) Inference preview on a few validation images

from IPython.display import display, Image

val_dir = DATASET_DIR / data_cfg['val']
assert val_dir.exists(), f"Validation image directory not found: {val_dir}"

# Up to 8 preview images: .jpg files first, .png only when there are no .jpg
subset = sorted(glob.glob(str(val_dir / '*.jpg')))[:8]
if not subset:
    subset = sorted(glob.glob(str(val_dir / '*.png')))[:8]

print('Preview images:', len(subset))

pred_dir = ART_DIR / 'predict'

if subset:
    model_inf = YOLO(str(WEIGHTS_BEST if WEIGHTS_BEST.exists() else WEIGHTS_LAST))
    # Save predictions under the training artifacts folder for easy browsing.
    # exist_ok=True keeps re-runs writing into ART_DIR/predict; without it a
    # second run is saved to an incremented folder while the images shown
    # below would still come from the first run.
    res = model_inf.predict(source=subset, imgsz=cfg.imgsz, conf=0.25,
                            project=str(ART_DIR), name='predict', save=True,
                            exist_ok=True)
else:
    # Nothing to predict on; skip the call rather than pass an empty source.
    print('No .jpg/.png images found directly under', val_dir, '- skipping prediction')

# Show the result images saved by Ultralytics in ART_DIR/predict
print('Predictions directory:', pred_dir)

if pred_dir.exists():
    imgs = sorted(glob.glob(str(pred_dir / '*.jpg')))
    if not imgs:
        imgs = sorted(glob.glob(str(pred_dir / '*.png')))
    for p in imgs[:6]:
        display(Image(filename=p))
else:
    print('No prediction images found (Ultralytics may have saved elsewhere)')


In [None]:
# Patch data.yaml for Kaggle runtime (write to /kaggle/working)

import yaml
# NOTE(review): this repeats the patch step that appears before the training
# cell. The training cell reads WORK_DATA_YAML, so running this cell afterwards
# only rewrites the file for later use.
WORK_DATA_YAML = cfg.work_root / 'data_kaggle.yaml'

# Load the original dataset config and redirect 'path' to our resolved DATASET_DIR
cfg_yaml = yaml.safe_load(DATA_YAML.read_text())
cfg_yaml['path'] = str(DATASET_DIR)

# Serialise first, then write the patched copy to the working directory
patched_text = yaml.safe_dump(cfg_yaml)
WORK_DATA_YAML.write_text(patched_text)

print('Wrote patched data.yaml to:', WORK_DATA_YAML)
print('Using path:', cfg_yaml['path'])


# Quick Start (Kaggle)

1) Add your dataset to Kaggle Datasets. Its slug will look like `username/indian-urban-dataset-yolo`.
2) Open a new Kaggle Notebook (GPU if available), add the dataset via the sidebar.
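3) Before running the config cell, set environment variables if you need to override the defaults (they are read once, when that cell runs):
   - `DATASET_SLUG` (e.g., `username/indian-urban-dataset-yolo`) and, optionally, `DATA_DIRNAME`
   - `WEIGHTS` (default `yolov12n.pt`), `EPOCHS`, `IMGSZ`, `BATCH`, `DEVICE`, `SEED`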
4) Run all cells top-to-bottom.
5) Outputs:
   - Artifacts: `/kaggle/working/yolov12n_indian_urban_kaggle` (weights, results.csv/png)
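   - Zipped bundle: `/kaggle/working/yolov12n_artifacts.zip` (not created by the cells above; add an export cell that zips the artifacts folder if you want it)
   - Convenience copies: `/kaggle/working/best.pt`, `/kaggle/working/last.pt` (also need an export cell; otherwise take the weights from `/kaggle/working/yolov12n_indian_urban_kaggle/weights/`)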
6) Download the zip and `best.pt` from the right-hand “Output” panel.
