# 🛣️ Train Pothole Detector (RDD2022) on Colab GPU

This notebook automates the training of a YOLOv12 model for Pothole Detection using the sekilab RDD2022 dataset.

**Steps:**
1.  Check GPU
2.  Install Dependencies
3.  Download Dataset
4.  Prepare Data (convert XML annotations to YOLO labels, train/val split)
5.  Train Model
6.  Download Weights

In [None]:
# 1. Check GPU
!nvidia-smi

In [None]:
# 2. Install Dependencies
!pip install ultralytics

In [None]:
# 3. Download Dataset (RDD2022 - India Only)
# This cell reads the dataset archive from Google Drive: upload 'RDD2022_India.zip'
# to the root of your Drive (MyDrive) before running it.
# If you uploaded the zip straight to Colab Files instead, skip the Drive mount
# and change the source path of the `cp` command below accordingly.

from google.colab import drive
drive.mount('/content/drive')

# Copy dataset from Drive to Colab local (faster training)
# NOTE: Make sure the file uploaded to Drive/Colab is named exactly 'RDD2022_India.zip'
!cp '/content/drive/MyDrive/RDD2022_India.zip' '/content/dataset.zip'
!unzip -q '/content/dataset.zip' -d '/content/dataset'

In [None]:
# 4. Data Preparation Script (Same as local)
import os
import glob
import xml.etree.ElementTree as ET
import shutil
import random
from tqdm import tqdm
import yaml

# CONFIG
BASE_DIR = '/content'


def _find_dataset_dir(pattern):
    """Return the first directory matching the recursive glob *pattern*.

    Matches are sorted so the choice is deterministic when the archive
    contains several candidate folders, and so the images and annotations
    lookups resolve inside the same (lexicographically first) tree.

    Raises:
        FileNotFoundError: if nothing matches, instead of the bare
            ``IndexError`` that indexing an empty glob result would give.
    """
    matches = sorted(p for p in glob.glob(pattern, recursive=True) if os.path.isdir(p))
    if not matches:
        raise FileNotFoundError(
            f"No directory matches {pattern!r}. Check that the dataset was "
            "unzipped into /content/dataset and that its layout is as expected."
        )
    return matches[0]


# The zip may unpack with extra parent folders (e.g. 'RDD2022_India/India/train/...'),
# so the raw folders are located with a recursive search rather than a fixed path.
RAW_IMAGES_DIR = _find_dataset_dir("/content/dataset/**/train/images")
RAW_XML_DIR = _find_dataset_dir("/content/dataset/**/train/annotations/xmls")

# Output layout: <PROCESSED_DIR>/{images,labels}/{train,val}
PROCESSED_DIR = os.path.join(BASE_DIR, "dataset", "processed_rdd")
IMAGES_TRAIN_DIR = os.path.join(PROCESSED_DIR, "images", "train")
IMAGES_VAL_DIR = os.path.join(PROCESSED_DIR, "images", "val")
LABELS_TRAIN_DIR = os.path.join(PROCESSED_DIR, "labels", "train")
LABELS_VAL_DIR = os.path.join(PROCESSED_DIR, "labels", "val")

# Class names kept from the annotations; the list index is the YOLO class id.
CLASSES = ['D00', 'D01', 'D10', 'D11', 'D20', 'D40', 'D43', 'D44']
CLASS_MAP = {name: i for i, name in enumerate(CLASSES)}
# Fraction of the image/annotation pairs assigned to the training split.
SPLIT_RATIO = 0.8

def convert_xml_to_yolo(xml_file, output_txt_path, class_map=None):
    """Convert one VOC-style XML annotation into a YOLO label file.

    Every ``<object>`` whose ``<name>`` is a key of the class map becomes one
    line ``"<class_id> <x_center> <y_center> <width> <height>"``, where the
    four box values are divided by the image size read from ``<size>``.

    Boxes are clamped to the image bounds, and boxes with no area after
    clamping are dropped, so every written value lies in [0, 1].

    Args:
        xml_file: Path of the XML annotation to read.
        output_txt_path: Path of the label file to write. It is only written
            when at least one usable box was found.
        class_map: Optional mapping of class name to integer class id.
            Defaults to the module-level ``CLASS_MAP``.

    Returns:
        True if a label file was written; False if the annotation has no
        usable boxes or could not be read (the reason is printed).
    """
    if class_map is None:
        class_map = CLASS_MAP
    try:
        root = ET.parse(xml_file).getroot()
        size = root.find('size')
        # int(float(...)) also accepts sizes written as e.g. "600.0".
        w = int(float(size.find('width').text))
        h = int(float(size.find('height').text))
        if w <= 0 or h <= 0:
            raise ValueError(f"invalid image size {w}x{h}")

        yolo_lines = []
        for obj in root.findall('object'):
            cls_name = obj.find('name').text
            if cls_name not in class_map:
                continue
            cls_id = class_map[cls_name]
            xmlbox = obj.find('bndbox')
            # Order each pair so a swapped min/max cannot yield a negative size.
            x_lo, x_hi = sorted((float(xmlbox.find('xmin').text),
                                 float(xmlbox.find('xmax').text)))
            y_lo, y_hi = sorted((float(xmlbox.find('ymin').text),
                                 float(xmlbox.find('ymax').text)))
            # Clamp to the image so normalised values never leave [0, 1].
            x_lo, x_hi = max(x_lo, 0.0), min(x_hi, float(w))
            y_lo, y_hi = max(y_lo, 0.0), min(y_hi, float(h))
            if x_hi <= x_lo or y_hi <= y_lo:
                # Zero-area box, or box entirely outside the image.
                continue
            x = (x_lo + x_hi) / 2.0 / w
            y = (y_lo + y_hi) / 2.0 / h
            bw = (x_hi - x_lo) / w
            bh = (y_hi - y_lo) / h
            yolo_lines.append(f"{cls_id} {x:.6f} {y:.6f} {bw:.6f} {bh:.6f}")

        if not yolo_lines:
            return False
        with open(output_txt_path, 'w') as f:
            f.write('\n'.join(yolo_lines))
        return True
    except (ET.ParseError, OSError, AttributeError, TypeError, ValueError) as e:
        # Missing tags surface as AttributeError/TypeError, bad numbers as ValueError.
        print(f"Skipping {xml_file}: {e}")
        return False

print("üöÄ Starting Data Prep...")
for d in [IMAGES_TRAIN_DIR, IMAGES_VAL_DIR, LABELS_TRAIN_DIR, LABELS_VAL_DIR]:
    os.makedirs(d, exist_ok=True)

# Pair each annotation with the .jpg of the same base name; annotations
# without a matching image are ignored. glob order is arbitrary, so the list
# is sorted to give the seeded shuffle below a stable starting order.
xml_files = sorted(glob.glob(os.path.join(RAW_XML_DIR, "*.xml")))
dataset = []
for xml_path in xml_files:
    basename = os.path.splitext(os.path.basename(xml_path))[0]
    jpg_path = os.path.join(RAW_IMAGES_DIR, basename + ".jpg")
    if os.path.exists(jpg_path):
        dataset.append((jpg_path, xml_path))

if not dataset:
    # Fail here with a clear message rather than later inside training.
    raise RuntimeError(
        f"No image/annotation pairs found in {RAW_IMAGES_DIR} and {RAW_XML_DIR}"
    )

# A private, seeded generator makes the train/val split identical on every
# run without touching the global random state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset)
split = int(len(dataset) * SPLIT_RATIO)
train, val = dataset[:split], dataset[split:]

def process(subset, img_dir, lbl_dir):
    """Write YOLO labels for *subset* and copy the images that received one.

    Args:
        subset: Iterable of ``(image_path, xml_path)`` pairs.
        img_dir: Directory the images are copied into.
        lbl_dir: Directory the ``.txt`` label files are written into.
    """
    for image_path, annotation_path in tqdm(subset):
        image_name = os.path.basename(image_path)
        stem = os.path.splitext(image_name)[0]
        label_path = os.path.join(lbl_dir, stem + ".txt")
        # The image is only copied when the conversion reports success.
        if not convert_xml_to_yolo(annotation_path, label_path):
            continue
        shutil.copy(image_path, os.path.join(img_dir, image_name))

# Convert and copy both splits into their images/labels folders.
process(subset=train, img_dir=IMAGES_TRAIN_DIR, lbl_dir=LABELS_TRAIN_DIR)
process(subset=val, img_dir=IMAGES_VAL_DIR, lbl_dir=LABELS_VAL_DIR)

# Dataset description file passed to the trainer via `data=`.
# 'train' and 'val' are given relative to 'path'; 'names' maps class id -> name.
data_yaml = dict(
    path=PROCESSED_DIR,
    train='images/train',
    val='images/val',
    names=dict(enumerate(CLASSES)),
)
with open('/content/rdd_data.yaml', 'w') as f:
    yaml.dump(data_yaml, f)

print("‚úÖ Ready for Training!")

In [None]:
# 5. Train Model
!yolo task=detect mode=train model=yolo12n.pt data='/content/rdd_data.yaml' epochs=50 imgsz=640

In [None]:
# 6. Download Weights
import glob
import os

from google.colab import files

# Ultralytics numbers the run folder (train, train2, train3, ...) when training
# is re-run, so look across all of them and download the most recent best.pt
# instead of assuming the first run's folder.
weight_files = glob.glob('/content/runs/detect/train*/weights/best.pt')
if not weight_files:
    raise FileNotFoundError(
        "No best.pt found under /content/runs/detect/train*/weights; "
        "run the training cell first."
    )
files.download(max(weight_files, key=os.path.getmtime))