# 🌊 FloodWatch AI — High-Accuracy YOLO Training (v2)

This notebook trains a **high-accuracy flood-aware YOLOv8s** model on GPU with an expanded dataset,
quality-audited labels, class balancing, and enhanced augmentations.

**Upgrades from v1:**
| Setting | Baseline (v1) | High-Accuracy (v2) |
| --- | --- | --- |
| Model | YOLOv8**n** | YOLOv8**s** (or **m**) |
| Resolution | 640px | **768px** |
| Epochs | 40 | **100** |
| Early Stopping | patience=10 | patience=**20** |
| Augmentations | Default | **Mosaic + HSV + Flip + Scale + Mixup** |
| Dataset | ~781 images | **≥1200 images** (expanded + balanced) |
| Labels | Raw | **Audited & cleaned** |

**Target:** mAP50 ≥ 0.80 (baseline: 0.69)

> ⚡ **Runtime:** Go to `Runtime → Change runtime type → GPU (T4 or better)` before running any cells.

## 1️⃣ Install Dependencies

In [None]:
!pip install -q ultralytics roboflow Pillow
import torch
# Report the PyTorch build and whether a CUDA device is visible to this runtime.
print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    # Device properties expose the VRAM size as `total_memory` (in bytes);
    # the previously used `total_mem` attribute does not exist and raised
    # AttributeError on every GPU runtime.
    print(f'GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
else:
    print('‚ö†Ô∏è No GPU detected! Go to Runtime ‚Üí Change runtime type ‚Üí GPU')

## 2️⃣ Download & Expand Flood Dataset

Downloads the primary flood dataset and additional sources to reach ≥1200 images.

In [None]:
from roboflow import Roboflow
import os, glob, shutil

# SECURITY NOTE(review): a Roboflow API key is committed in this notebook.
# Supply your own key through the ROBOFLOW_API_KEY environment variable; the
# literal below is kept only as a fallback so the existing workflow keeps
# running. Rotate the committed key before sharing the notebook.
API_KEY = os.environ.get("ROBOFLOW_API_KEY", "yUaG6RQfZ6ZrAFlwj6w7")

# ── Primary dataset: yolo-floods-relief (781 images) ──
rf = Roboflow(api_key=API_KEY)
project = rf.workspace("modellabel").project("yolo-floods-relief")
version = project.version(1)
# Export in YOLOv8 layout into ./flood_dataset; the later cells read
# <split>/images and <split>/labels from that directory.
dataset = version.download("yolov8", location="./flood_dataset")
print("‚úÖ Primary flood dataset downloaded!")

# Count current images per split
for split in ['train', 'valid', 'test']:
    imgs = glob.glob(f'./flood_dataset/{split}/images/*')
    print(f'  {split}: {len(imgs)} images')

In [None]:
# ── Optional: pull in more Roboflow flood datasets to grow beyond 781 images ──
# Each entry is a (workspace, project, version, description) tuple.
ADDITIONAL_DATASETS = [
    # Uncomment and add datasets you have access to:
    # ("workspace-name", "flood-detection-project", 1, "FloodNet subset"),
]

for ws, proj, ver, desc in ADDITIONAL_DATASETS:
    print(f'Downloading: {desc}...')
    try:
        extra_root = f'./_extra_{proj}_v{ver}'
        rf.workspace(ws).project(proj).version(ver).download('yolov8', location=extra_root)

        # Fold the extra dataset into the primary one, split by split.
        for split in ['train', 'valid', 'test']:
            src_imgs = f'{extra_root}/{split}/images'
            if not os.path.isdir(src_imgs):
                continue
            src_lbls = f'{extra_root}/{split}/labels'
            # Images are always merged; labels only when the extra dataset
            # actually ships a labels folder for this split.
            transfers = [(src_imgs, f'./flood_dataset/{split}/images')]
            if os.path.isdir(src_lbls):
                transfers.append((src_lbls, f'./flood_dataset/{split}/labels'))
            for src_dir, dst_dir in transfers:
                for entry in os.listdir(src_dir):
                    # Never overwrite a file that is already in the primary dataset.
                    if os.path.exists(os.path.join(dst_dir, entry)):
                        continue
                    shutil.copy2(os.path.join(src_dir, entry), dst_dir)
        print(f'  ‚úÖ {desc} merged')
    except Exception as e:
        # Best effort: a failing source is reported and the loop moves on.
        print(f'  ‚ùå Failed: {e}')

# Final count across all splits
total = 0
for split in ['train', 'valid', 'test']:
    n_imgs = len(glob.glob(f'./flood_dataset/{split}/images/*'))
    print(f'  {split}: {n_imgs} images')
    total += n_imgs
print(f'  Total: {total} images')
if total < 1200:
    print(f'‚ö†Ô∏è Need {1200 - total} more images. Add extra datasets above or upload manually.')
else:
    print('‚úÖ Target ‚â•1200 reached!')

### 📤 Upload Additional Images (Optional)

If you have extra flood images, upload them to Colab and merge:

In [None]:
# Uncomment to upload and merge extra images from Google Drive or local files
# from google.colab import drive
# drive.mount('/content/drive')
# EXTRA_DIR = '/content/drive/MyDrive/flood_extra_images'
#
# if os.path.isdir(EXTRA_DIR):
#     dst = './flood_dataset/train/images'
#     for f in os.listdir(EXTRA_DIR):
#         if f.lower().endswith(('.jpg', '.jpeg', '.png')):
#             shutil.copy2(os.path.join(EXTRA_DIR, f), dst)
#     print(f'Extra images merged from {EXTRA_DIR}')

## 3️⃣ Normalize Labels (Polygon → BBox)

**Critical step:** Some flood datasets use segmentation polygon labels.
YOLO detection training requires exactly 5 values per line: `class x_center y_center width height`.
This cell converts all polygon annotations to bounding box equivalents.

In [None]:
def polygon_to_bbox(parts):
    """Turn a tokenised polygon label line into a YOLO bounding-box line.

    Args:
        parts: Tokens of one label line: the class id followed by alternating
            x, y coordinates.

    Returns:
        The string ``"<cls> <x_center> <y_center> <width> <height>"`` with six
        decimals per value, or ``None`` when the line has fewer than five
        tokens, cannot be parsed, has an odd number of coordinates, or yields
        a box with no width or height after clamping to [0, 1].
    """
    if len(parts) < 5:
        return None
    try:
        class_id = int(parts[0])
        values = [float(token) for token in parts[1:]]
    except (ValueError, IndexError):
        return None
    if len(values) % 2:
        return None

    x_coords = values[0::2]
    y_coords = values[1::2]
    if not (x_coords and y_coords):
        return None

    # Extent of the polygon, clamped to the normalised image area.
    left = max(0, min(x_coords))
    right = min(1, max(x_coords))
    top = max(0, min(y_coords))
    bottom = min(1, max(y_coords))

    width = right - left
    height = bottom - top
    if width <= 0 or height <= 0:
        return None

    x_center = left + width / 2
    y_center = top + height / 2
    return f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}"

# Rewrite label files so that every remaining line is a 5-field detection box.
poly_count = 0      # polygon lines successfully converted to a bbox
bbox_count = 0      # lines that already had exactly 5 fields
files_fixed = 0     # label files rewritten on disk
dropped_count = 0   # polygon lines that could not be converted

for split in ["train", "valid", "test"]:
    lbl_dir = f"./flood_dataset/{split}/labels"
    if not os.path.isdir(lbl_dir):
        continue
    for fname in sorted(os.listdir(lbl_dir)):
        if not fname.endswith(".txt"):
            continue
        fpath = os.path.join(lbl_dir, fname)
        with open(fpath) as f:
            lines = f.readlines()
        converted = []
        had_polygon = False
        for line in lines:
            parts = line.strip().split()
            if len(parts) == 5:
                converted.append(line.strip())
                bbox_count += 1
            elif len(parts) > 5:
                # Any polygon line forces a rewrite, even when conversion
                # fails. Previously an unconvertible polygon stayed in the
                # file, and the audit step then read its first four
                # coordinates as if they were a bounding box.
                had_polygon = True
                bbox_line = polygon_to_bbox(parts)
                if bbox_line:
                    converted.append(bbox_line)
                    poly_count += 1
                else:
                    dropped_count += 1
        if had_polygon:
            with open(fpath, "w") as f:
                # One annotation per line; an empty list yields an empty file.
                f.writelines(f"{row}\n" for row in converted)
            files_fixed += 1

print(f"Polygons converted to bbox: {poly_count}")
print(f"Already bbox: {bbox_count}")
print(f"Files fixed: {files_fixed}")
print(f"Unconvertible polygon lines dropped: {dropped_count}")
print("\u2705 All labels are now in detection bbox format!")

## 4️⃣ Audit & Clean Labels

Automatically detect and fix annotation quality issues.

In [None]:
import json
from collections import defaultdict

# Class ids accepted by the audit; annotations with any other id are dropped.
VALID_CLASS_IDS = {0, 1, 2, 3, 4, 5}
# Minimum w*h of a box (normalised coordinates); smaller boxes are dropped.
MIN_BOX_AREA = 0.001
# Two same-class boxes with IoU above this value count as duplicates.
DUPLICATE_IOU_THRESHOLD = 0.95

def compute_iou(b1, b2):
    """Return the intersection-over-union of two YOLO boxes.

    Args:
        b1: Tuple ``(cls, x_center, y_center, width, height)``.
        b2: Tuple with the same layout as ``b1``.

    Returns:
        The IoU as a float; ``0.0`` when the boxes do not overlap or when the
        union area is not positive. The class ids are ignored.
    """
    _, cx_a, cy_a, w_a, h_a = b1
    _, cx_b, cy_b, w_b, h_b = b2

    # Corner coordinates of both boxes.
    left_a, top_a = cx_a - w_a / 2, cy_a - h_a / 2
    right_a, bottom_a = cx_a + w_a / 2, cy_a + h_a / 2
    left_b, top_b = cx_b - w_b / 2, cy_b - h_b / 2
    right_b, bottom_b = cx_b + w_b / 2, cy_b + h_b / 2

    # Intersection rectangle.
    left = max(left_a, left_b)
    top = max(top_a, top_b)
    right = min(right_a, right_b)
    bottom = min(bottom_a, bottom_b)
    if right <= left or bottom <= top:
        return 0.0

    inter = (right - left) * (bottom - top)
    union = w_a * h_a + w_b * h_b - inter
    if union > 0:
        return inter / union
    return 0.0

def audit_and_fix(dataset_dir, fix=True):
    """Audit the YOLO label files of a dataset and optionally repair them.

    Every ``.txt`` file under ``<dataset_dir>/<split>/labels`` (splits
    ``train``, ``valid`` and ``test``) is parsed line by line. Lines with
    fewer than five fields or unparsable numbers are skipped. An annotation is
    discarded when its class id is not in ``VALID_CLASS_IDS``, when its area
    after clipping is below ``MIN_BOX_AREA``, or when it overlaps an earlier
    kept box of the same class with IoU above ``DUPLICATE_IOU_THRESHOLD``.
    Centres are clamped to [0, 1] and width/height are shrunk so the box does
    not extend past the image border.

    Args:
        dataset_dir: Root directory holding the split sub-directories.
        fix: When true, rewrite cleaned label files and delete label files
            (plus the matching image) that end up with no annotations.
            When false, only collect statistics.

    Returns:
        A plain dict mapping an issue name to its count. Possible keys are
        ``invalid_class``, ``tiny_box``, ``clipped_box``, ``duplicate``,
        ``empty_label`` and ``files_fixed``; a key is present only when its
        count is non-zero.
    """
    stats = defaultdict(int)
    for split in ['train', 'valid', 'test']:
        lbl_dir = os.path.join(dataset_dir, split, 'labels')
        if not os.path.isdir(lbl_dir):
            continue
        img_dir = os.path.join(dataset_dir, split, 'images')
        for fname in sorted(os.listdir(lbl_dir)):
            if not fname.endswith('.txt'):
                continue
            fpath = os.path.join(lbl_dir, fname)
            with open(fpath) as f:
                lines = f.readlines()

            anns = []
            clipped = False  # True once any kept box had its coordinates changed
            for line in lines:
                parts = line.strip().split()
                if len(parts) < 5:
                    continue
                try:
                    cls = int(parts[0])
                    xc, yc, w, h = (float(p) for p in parts[1:5])
                except ValueError:
                    # Non-numeric fields: treat the line as unusable.
                    continue

                if cls not in VALID_CLASS_IDS:
                    stats['invalid_class'] += 1
                    continue

                before = (xc, yc, w, h)
                xc = max(0, min(1, xc))
                yc = max(0, min(1, yc))
                # Shrink symmetrically around the centre until the box fits.
                if xc - w / 2 < 0: w = xc * 2
                if xc + w / 2 > 1: w = (1 - xc) * 2
                if yc - h / 2 < 0: h = yc * 2
                if yc + h / 2 > 1: h = (1 - yc) * 2
                if w * h < MIN_BOX_AREA:
                    stats['tiny_box'] += 1
                    continue
                # Only changes visible at the 6-decimal output precision count,
                # so float rounding noise does not trigger a rewrite.
                if any(abs(new - old) > 1e-6 for new, old in zip((xc, yc, w, h), before)):
                    stats['clipped_box'] += 1
                    clipped = True
                anns.append((cls, xc, yc, w, h))

            # Remove near-identical same-class boxes, keeping the first seen.
            final = []
            for a in anns:
                dup = any(a[0] == e[0] and compute_iou(a, e) > DUPLICATE_IOU_THRESHOLD for e in final)
                if dup:
                    stats['duplicate'] += 1
                else:
                    final.append(a)

            if not final:
                stats['empty_label'] += 1
                if fix:
                    os.remove(fpath)
                    # Build the image path from the sibling images directory
                    # instead of string-replacing inside the label path.
                    stem = os.path.splitext(fname)[0]
                    for ext in ('.jpg', '.jpeg', '.png'):
                        img = os.path.join(img_dir, stem + ext)
                        if os.path.isfile(img):
                            os.remove(img)
                            break
                continue

            # Rewrite when a line was removed OR a kept box was clipped.
            # Previously clipped coordinates were written back only if the
            # number of lines also changed.
            if fix and (clipped or len(final) != len(lines)):
                with open(fpath, 'w') as f:
                    for cls, xc, yc, w, h in final:
                        f.write(f'{cls} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}\n')
                stats['files_fixed'] += 1
    return dict(stats)

# Run the audit in repair mode: label files are rewritten or deleted in place.
report = audit_and_fix('./flood_dataset', fix=True)
print('\nüìã Label Audit Report:')
# One line per issue type, sorted by name; only non-zero counters are present.
for k, v in sorted(report.items()):
    print(f'  {k:20s}: {v}')
print('‚úÖ Labels cleaned!')

## 5️⃣ Balance Class Distribution

In [None]:
from collections import Counter
from PIL import Image, ImageOps

# Display names for class ids 0-5, used when printing the class distribution.
CLASS_NAMES = {0:'person', 1:'car', 2:'bicycle', 3:'motorcycle', 4:'bus', 5:'truck'}

def analyze_classes(dataset_dir, split='train'):
    """Count annotations per class in one split of a YOLO dataset.

    Args:
        dataset_dir: Root directory of the dataset.
        split: Name of the split whose ``labels`` folder is scanned.

    Returns:
        A ``(counts, imgs_per_cls)`` pair. ``counts`` is a ``Counter`` mapping
        class id to number of annotation lines; ``imgs_per_cls`` is a
        ``defaultdict(set)`` mapping class id to the label file names that
        contain the class. Both are empty when the labels folder is missing.
    """
    counts = Counter()
    imgs_per_cls = defaultdict(set)
    lbl_dir = os.path.join(dataset_dir, split, 'labels')
    if not os.path.isdir(lbl_dir):
        return counts, imgs_per_cls
    for fname in os.listdir(lbl_dir):
        if not fname.endswith('.txt'):
            continue
        with open(os.path.join(lbl_dir, fname)) as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) < 5:
                    continue
                try:
                    cls = int(parts[0])
                except ValueError:
                    # Skip lines whose class id is not an integer. Only the
                    # parse is guarded, so unrelated errors are no longer
                    # swallowed by a bare ``except``.
                    continue
                counts[cls] += 1
                imgs_per_cls[cls].add(fname)
    return counts, imgs_per_cls

# Show the training-split class distribution, then oversample minority classes.
counts, imgs_per_cls = analyze_classes('./flood_dataset')
print('üìä Class Distribution (Before):')
for cls_id in sorted(counts):
    name = CLASS_NAMES.get(cls_id, f'cls_{cls_id}')
    print(f'  {cls_id} ({name:12s}): {counts[cls_id]:5d} annotations, {len(imgs_per_cls[cls_id]):4d} images')

# Detect and fix imbalance
if counts:
    # Upper median of the per-class annotation counts; a class is a
    # "minority" when it has fewer than half that many annotations.
    median = sorted(counts.values())[len(counts)//2]
    minorities = {c: v for c, v in counts.items() if v < median * 0.5}
    if minorities:
        print(f'\n‚ö†Ô∏è Minority classes detected, oversampling...')
        img_dir = './flood_dataset/train/images'
        lbl_dir = './flood_dataset/train/labels'
        dup_count = 0
        for cls_id, cls_imgs in imgs_per_cls.items():
            if cls_id not in minorities:
                continue
            for label_fname in sorted(cls_imgs):
                base = os.path.splitext(label_fname)[0]
                # Files produced by an earlier run of this cell are not
                # augmented again; otherwise re-running would create
                # *_dup_dup / *_flip_flip copies.
                if base.endswith(('_dup', '_flip')):
                    continue

                # Find the image that belongs to this label file.
                src = None
                src_ext = None
                for ext in ('.jpg', '.jpeg', '.png'):
                    candidate = os.path.join(img_dir, base + ext)
                    if os.path.isfile(candidate):
                        src, src_ext = candidate, ext
                        break
                if src is None:
                    # Without an image there is nothing to oversample, and
                    # writing a label alone would inflate the "After" counts.
                    continue

                # Plain duplicate of the label/image pair
                new_lbl = os.path.join(lbl_dir, f'{base}_dup.txt')
                if not os.path.exists(new_lbl):
                    shutil.copy2(os.path.join(lbl_dir, label_fname), new_lbl)
                    shutil.copy2(src, os.path.join(img_dir, f'{base}_dup{src_ext}'))
                    dup_count += 1

                # Horizontally flipped copy: mirror x_center, keep the rest
                flip_lbl = os.path.join(lbl_dir, f'{base}_flip.txt')
                if not os.path.exists(flip_lbl):
                    flipped = []
                    with open(os.path.join(lbl_dir, label_fname)) as f:
                        for line in f:
                            parts = line.strip().split()
                            if len(parts) >= 5:
                                parts[1] = str(round(1.0 - float(parts[1]), 6))
                                flipped.append(' '.join(parts))
                    with open(flip_lbl, 'w') as f:
                        f.write('\n'.join(flipped) + '\n')
                    # Context manager closes the source file handle promptly.
                    with Image.open(src) as img:
                        ImageOps.mirror(img).save(os.path.join(img_dir, f'{base}_flip{src_ext}'))
                    dup_count += 1
        print(f'  Added {dup_count} augmented images')
        counts2, _ = analyze_classes('./flood_dataset')
        print('\nüìä Class Distribution (After):')
        for cls_id in sorted(counts2):
            name = CLASS_NAMES.get(cls_id, f'cls_{cls_id}')
            print(f'  {cls_id} ({name:12s}): {counts2[cls_id]:5d} annotations')
    else:
        print('\n‚úÖ Classes are balanced!')

## 5️⃣ Prepare Dataset Config

In [None]:
import yaml

# Emit a self-contained dataset YAML with an absolute root and the six class names.
DATA_YAML = './flood_dataset.yaml'

config = dict(
    path=os.path.abspath('./flood_dataset'),
    train='train/images',
    val='valid/images',
    test='test/images',
    # Class id -> name, ids assigned in list order starting at 0.
    names=dict(enumerate(['person', 'car', 'bicycle', 'motorcycle', 'bus', 'truck'])),
)

with open(DATA_YAML, 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

# Echo the file back so the written config is visible in the cell output.
print('Dataset config:')
with open(DATA_YAML) as f:
    print(f.read())

# Final image counts per split and overall
total = 0
for split in ['train', 'valid', 'test']:
    n = len(glob.glob(f'./flood_dataset/{split}/images/*'))
    print(f'{split}: {n} images')
    total += n
print(f'Total: {total} images')

## 6️⃣ Choose Model Size

Select based on your GPU memory:
- **T4 (15 GB):** `yolov8s.pt` at 768px ✅
- **A100/V100 (40+ GB):** `yolov8m.pt` at 832px 🚀

In [None]:
# ═══ MODEL SELECTION ═══
# Change these based on your GPU:
MODEL_NAME = 'yolov8s.pt'   # Options: 'yolov8s.pt', 'yolov8m.pt'
IMG_SIZE = 768               # Options: 768, 832
EPOCHS = 100
PATIENCE = 20

# Auto-detect GPU and suggest settings
if torch.cuda.is_available():
    # VRAM in GB. The attribute is `total_memory`; `total_mem` does not exist
    # and raised AttributeError on every GPU runtime.
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'GPU Memory: {gpu_mem:.1f} GB')
    if gpu_mem >= 30:
        print('üí° You can use yolov8m.pt at 832px!')
    elif gpu_mem >= 14:
        print('üí° yolov8s.pt at 768px is optimal for your GPU')
    else:
        # Only this branch changes a setting: drop the resolution on small GPUs.
        print('üí° Consider yolov8s.pt at 640px for lower VRAM')
        IMG_SIZE = 640

print(f'\nSelected: {MODEL_NAME} @ {IMG_SIZE}px for {EPOCHS} epochs')

## 7️⃣ Train High-Accuracy Flood Model 🚀

Full training with enhanced augmentations and optimized hyperparameters.

In [None]:
# == YOLOv8m Maximum-Accuracy Training ==
# NOTE(review): the assignments below overwrite MODEL_NAME / IMG_SIZE / EPOCHS /
# PATIENCE chosen in the model-selection cell - confirm which set is intended.
import torch
from ultralytics import YOLO  # was never imported; YOLO(...) raised NameError

MODEL_NAME = "yolov8m.pt"
IMG_SIZE = 832
EPOCHS = 200
PATIENCE = 40
BATCH = -1  # auto-fit GPU memory

if torch.cuda.is_available():
    # `total_memory` (bytes) is the real attribute name; `total_mem` raised AttributeError.
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {torch.cuda.get_device_name(0)} ({vram:.1f} GB)")
    if vram >= 24:
        MODEL_NAME = "yolov8l.pt"
        print(f"Using {MODEL_NAME} (large VRAM)")

model = YOLO(MODEL_NAME)

# Bbox tightening: scale width and height of every 5-field box by 0.94
# (about 3% trimmed from each side), with a floor of 0.01.
# The rewrite is destructive, so a marker file records that it already ran;
# without it every re-run of this cell shrank the boxes again.
# Delete the marker after re-downloading the dataset.
# NOTE(review): valid/test labels are tightened too, which changes the
# ground truth used for evaluation - confirm this is intended.
TIGHTEN_MARKER = "./flood_dataset/.bbox_tightened"
if os.path.exists(TIGHTEN_MARKER):
    print("Bboxes already tightened - skipping")
else:
    for split in ["train", "valid", "test"]:
        lbl_dir = f"./flood_dataset/{split}/labels"
        if not os.path.isdir(lbl_dir):
            continue
        for fname in sorted(os.listdir(lbl_dir)):
            if not fname.endswith(".txt"):
                continue
            fpath = os.path.join(lbl_dir, fname)
            with open(fpath) as f:
                lines = f.readlines()
            out = []
            for line in lines:
                p = line.strip().split()
                if len(p) != 5:
                    # Not a plain bbox line: keep it unchanged.
                    out.append(line.strip())
                    continue
                try:
                    c = int(p[0])
                    xc, yc, w, h = [float(x) for x in p[1:5]]
                except ValueError:
                    # Unparsable numbers: keep the line unchanged.
                    out.append(line.strip())
                    continue
                w = max(w * 0.94, 0.01)
                h = max(h * 0.94, 0.01)
                out.append(f"{c} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}")
            with open(fpath, "w") as f:
                f.write("\n".join(out) + "\n")
    with open(TIGHTEN_MARKER, "w") as f:
        f.write("scale=0.94\n")
    print("Bboxes tightened")

# Train
results = model.train(
    # Use the YAML written by the dataset-config cell (absolute root and the
    # six class names) rather than the data.yaml shipped with the download.
    data=DATA_YAML,
    epochs=EPOCHS, imgsz=IMG_SIZE, batch=BATCH,
    # Fall back to CPU instead of failing when no CUDA device is present.
    patience=PATIENCE, device=0 if torch.cuda.is_available() else "cpu",
    project="./runs", name="flood_maxacc", exist_ok=True, pretrained=True,
    optimizer="AdamW", lr0=0.002, lrf=0.01, weight_decay=0.0005,
    warmup_epochs=5, cos_lr=True,
    mosaic=1.0, mixup=0.1, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4,
    scale=0.5, translate=0.1, perspective=0.0005,
    flipud=0.2, fliplr=0.5, degrees=5.0, shear=2.0,
    copy_paste=0.1, erasing=0.1,
    box=7.5, cls=0.5, dfl=1.5,
    save=True, save_period=25, plots=True, val=True, verbose=True,
)
print("Training complete!")

## 7️⃣ Evaluate & Compare with Baseline

In [None]:
from IPython.display import Image, display

# Output directory of the training run. It must match the project/name pair
# passed to model.train() (project="./runs", name="flood_maxacc"). The old
# value 'runs/detect/yolov8_flood_highacc' pointed at a directory this
# notebook never writes, so no plots were shown and the export cell could
# not find best.pt.
RUN_DIR = 'runs/flood_maxacc'

# Training curves
results_img = glob.glob(f'{RUN_DIR}/results.png')
if results_img:
    print('üìà Training Curves:')
    display(Image(filename=results_img[0], width=900))

# Confusion matrix
cm_img = glob.glob(f'{RUN_DIR}/confusion_matrix.png')
if cm_img:
    print('\nüìä Confusion Matrix:')
    display(Image(filename=cm_img[0], width=600))

# Normalized confusion matrix
cm_norm = glob.glob(f'{RUN_DIR}/confusion_matrix_normalized.png')
if cm_norm:
    display(Image(filename=cm_norm[0], width=600))

In [None]:
# == Evaluate & Compare ==
# Pull the headline validation metrics out of the training result and compare
# mAP50 with the v1 baseline.
BASELINE_MAP50 = 0.69

# Fall back to an empty dict when the result object carries no results_dict.
metrics = getattr(results, "results_dict", {})
p, r, m50, m95 = [
    metrics.get(key)
    for key in (
        "metrics/precision(B)",
        "metrics/recall(B)",
        "metrics/mAP50(B)",
        "metrics/mAP50-95(B)",
    )
]

rule = "=" * 55
print(rule)
print("FloodWatch YOLO - Max-Accuracy Results")
print(rule)
print(f"Model: {MODEL_NAME} | Resolution: {IMG_SIZE}px | Epochs: {EPOCHS}")
print(f"Precision:  {p}")
print(f"Recall:     {r}")
print(f"mAP50:      {m50}")
print(f"mAP50-95:   {m95}")
if m50 is not None:
    delta = m50 - BASELINE_MAP50
    # Anything that is not strictly better than the baseline is a regression.
    if delta > 0:
        status = "IMPROVED"
    else:
        status = "REGRESSION"
    print(f"Baseline mAP50: {BASELINE_MAP50:.4f} | Delta: {delta:+.4f} | {status}")
print(rule)


### Sample Predictions

In [None]:
# Run predictions on a few validation images and show the annotated results.
# Sorted so the same six images are picked on every run.
val_images = sorted(glob.glob('./flood_dataset/valid/images/*'))

# `best_model` was never defined anywhere in this notebook (NameError).
# Reuse the trained `model` object; Ultralytics is expected to reload the
# best checkpoint into it once train() finishes - TODO confirm for the
# installed ultralytics version.
best_model = model

if val_images:
    preds = best_model.predict(
        source=val_images[:6],
        save=True,
        conf=0.4,
        project='runs/detect',
        name='flood_highacc_predictions',
        exist_ok=True,
    )
    # Annotated images are read back from the prediction output directory.
    pred_imgs = sorted(glob.glob('runs/detect/flood_highacc_predictions/*.jpg'))
    for img in pred_imgs[:6]:
        display(Image(filename=img, width=500))
        print()

## 8️⃣ Export Production Model

In [None]:
import shutil
from google.colab import files

# Copy the best checkpoint of the run to a stable file name and download it.
best_pt = f'{RUN_DIR}/weights/best.pt'
output_pt = 'yolov8_flood_highacc.pt'
shutil.copy2(best_pt, output_pt)

print(f'Model size: {os.path.getsize(output_pt) / 1024 / 1024:.1f} MB')
# `metrics` is the plain dict taken from results.results_dict in the
# evaluation cell, so the values are looked up by key. The former
# `metrics.box.mp` attribute access raised AttributeError on that dict.
for label, key in (
    ('Precision:  ', 'metrics/precision(B)'),
    ('Recall:     ', 'metrics/recall(B)'),
    ('mAP50:      ', 'metrics/mAP50(B)'),
    ('mAP50-95:   ', 'metrics/mAP50-95(B)'),
):
    value = metrics.get(key)
    print(f'{label}{value:.4f}' if value is not None else f'{label}n/a')
print()
print('üì• Downloading model...')
print('Place this file at: models/yolov8_flood_highacc.pt in your FloodWatch project')

files.download(output_pt)

In [None]:
# ── Also save the last.pt as backup ──
# Copies the final-epoch checkpoint next to the exported model; nothing
# happens when the run directory holds no last.pt.
last_pt = f'{RUN_DIR}/weights/last.pt'
if os.path.isfile(last_pt):
    shutil.copy2(last_pt, 'yolov8_flood_highacc_last.pt')
    print('Backup weights saved: yolov8_flood_highacc_last.pt')
    # files.download('yolov8_flood_highacc_last.pt')  # Uncomment to download

## 🔟 Export to ONNX (Optional)

For faster CPU inference or edge deployment.

In [None]:
# Uncomment to export to ONNX
# best_model.export(format='onnx', imgsz=IMG_SIZE)
# onnx_path = f'{RUN_DIR}/weights/best.onnx'
# if os.path.isfile(onnx_path):
#     shutil.copy2(onnx_path, 'yolov8_flood_highacc.onnx')
#     files.download('yolov8_flood_highacc.onnx')
#     print(f'ONNX size: {os.path.getsize("yolov8_flood_highacc.onnx") / 1024 / 1024:.1f} MB')