In [6]:
import torch
print("CUDA available:", torch.cuda.is_available())

# ================================
# Step 1: Upgrade pip, setuptools, and wheel
# ================================
!pip install --upgrade pip setuptools wheel  # Upgrade pip, setuptools, and wheel

# ================================
# Step 2: Uninstall conflicting packages
# ================================
!pip uninstall -y protobuf google-api-core google-cloud-bigquery google-cloud-translate google-ai-generativelanguage rich

# ================================
# Step 3: Install compatible versions of dependencies
# ================================
!pip install protobuf==3.19.5 google-api-core==2.15.0 google-cloud-bigquery==3.24.0 google-cloud-translate==3.12.1 google-ai-generativelanguage==0.6.15 rich==12.4.4

# ================================
# Step 4: Install torch_xla for TPU support
# ================================
!pip install torch_xla

# ================================
# Step 5: Install YOLO and other required libraries
# ================================
!pip install -U "ultralytics>=8.3.0" torch torchvision torchaudio pyyaml tqdm opencv-python matplotlib \
    protobuf<6.0.0 \
    numpy<2.1.0 \
    google-api-core>=2.15.0,<3.0.0 \
    pandas-gbq>=0.29.1 \
    google-cloud-bigquery-storage>=2.30.0,<3.0.0 \
    rich<14 \
    scikit-learn>=1.3.1,<2.0.0 \
    "tensorflow-cpu==2.18.0"

# ================================
# Step 6: Install ONNX and ONNX Runtime
# ================================
!pip install onnx onnxruntime

print("done")


CUDA available: True
Found existing installation: protobuf 6.32.1
Uninstalling protobuf-6.32.1:
  Successfully uninstalled protobuf-6.32.1
[0mCollecting protobuf==3.19.5
  Using cached protobuf-3.19.5-py2.py3-none-any.whl.metadata (828 bytes)
Collecting google-api-core==2.15.0
  Downloading google_api_core-2.15.0-py3-none-any.whl.metadata (2.7 kB)
Collecting google-cloud-bigquery==3.24.0
  Downloading google_cloud_bigquery-3.24.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting google-cloud-translate==3.12.1
  Using cached google_cloud_translate-3.12.1-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting google-ai-generativelanguage==0.6.15
  Using cached google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting rich==12.4.4
  Using cached rich-12.4.4-py3-none-any.whl.metadata (18 kB)
INFO: pip is looking at multiple versions of google-ai-generativelanguage to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Cann

In [None]:
!pip uninstall -y wandb

import os
import torch
from pathlib import Path
from ultralytics import YOLO
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.nn as nn

# ============================================================
# Base Paths
# ============================================================
# Read-only Kaggle inputs and the writable working directory.
BASE_DIR = Path("/kaggle/input")
WORKING_DIR = Path("/kaggle/working")
# Directory under which the per-phase run folders are looked up.
RUNS_DIR = WORKING_DIR.joinpath("runs", "detect")

# Dataset description file and the pretrained checkpoint used as a starting point.
DATASET_PATH = BASE_DIR.joinpath("reefscape", "data.yaml")
MODEL_PATH = "/kaggle/input/yolo11n/other/default/1/yolo11n.pt"

# ============================================================
# Helpers
# ============================================================
def print_header(text):
    """Print ``text`` as a banner framed by two 65-character ``=`` rules.

    A blank line is emitted before the first rule so the banner stands apart
    from whatever was printed previously.
    """
    rule = "=" * 65
    print(f"\n{rule}\n🚀 {text}\n{rule}")

# ============================================================
# DDP Initialization
# ============================================================
def setup_ddp(rank, world_size):
    """Initialize the default process group for single-node distributed training.

    Args:
        rank: Index of this process within the group. On this single-node
            setup it is also used as the CUDA device index for the process.
        world_size: Total number of processes that will join the group.

    Note:
        ``init_process_group`` waits until ``world_size`` processes have
        joined, so calling this from a single process with ``world_size > 1``
        blocks until the rendezvous times out.
    """
    # Keep a rendezvous address/port that a launcher (e.g. torchrun) already
    # exported; only fall back to the single-node defaults when none is set.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')

    # NCCL only works with CUDA devices; use gloo otherwise so the helper is
    # usable on CPU-only machines as well.
    use_nccl = torch.cuda.is_available() and dist.is_nccl_available()
    if use_nccl:
        # Bind this process to its own GPU before NCCL is initialized so that
        # ranks do not all end up on device 0.
        torch.cuda.set_device(rank)

    dist.init_process_group(
        backend='nccl' if use_nccl else 'gloo',
        rank=rank,
        world_size=world_size
    )

def cleanup_ddp():
    """Destroy the default process group if one is currently initialized.

    Safe to call when no group was ever created (or it was already torn
    down), so it can be used unconditionally in cleanup paths.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

# ============================================================
# Training Function for Distributed Data Parallel (DDP)
# ============================================================
def train_phase(rank, world_size, name, base_model, data, epochs, batch, imgsz=640, lr0=None):
    """Run, resume, or skip one YOLO training phase.

    Multi-GPU training is delegated to Ultralytics: the ``YOLO`` wrapper is
    kept (it is the object that owns ``train(data=..., epochs=...)``) and a
    list of GPU indices is passed as ``device``. Wrapping ``model.model`` in
    ``DistributedDataParallel`` here would replace the wrapper with a plain
    ``nn.Module`` whose ``train(mode)`` method rejects these arguments.

    Args:
        rank: GPU index used when ``world_size`` is 1.
        world_size: Number of GPUs to train on; GPUs ``0 .. world_size - 1``
            are used when it is greater than 1.
        name: Run name; results are written to ``RUNS_DIR / name``.
        base_model: Weights to start from when there is nothing to resume.
        data: Path to the dataset YAML.
        epochs: Number of training epochs.
        batch: Batch size passed to the trainer.
        imgsz: Training image size.
        lr0: Optional initial learning rate; the trainer default is used when
            ``None``.
    """
    print_header(f"Starting Phase: {name}")

    project_dir = RUNS_DIR / name
    last_weights = project_dir / "weights" / "last.pt"

    args = dict(
        data=str(data),
        epochs=epochs,
        batch=batch,
        imgsz=imgsz,
        # A list of indices asks Ultralytics to launch its own DDP workers.
        device=list(range(world_size)) if world_size > 1 else rank,
        name=name,
        # project/name must resolve to `project_dir`, otherwise the resume
        # check above and the best.pt lookup in main() never find the weights.
        project=str(RUNS_DIR),
        exist_ok=True,  # keep writing to `project_dir` instead of "<name>2", ...
        save=True,
        workers=8,
        # No caching here
    )

    if lr0 is not None:
        args["lr0"] = lr0

    # If session expired, resume automatically
    if last_weights.exists():
        model = YOLO(str(last_weights))
        # NOTE(review): relies on Ultralytics marking the checkpoints of a
        # finished run with epoch = -1 (resuming such a checkpoint is
        # rejected by the trainer) -- confirm for the installed version.
        ckpt = getattr(model, "ckpt", None) or {}
        if ckpt.get("epoch", -1) < 0:
            print(f"Phase '{name}' already finished ({last_weights}); skipping.")
            return
        print(f"Resuming from previous session: {last_weights}")
        # Restores optimizer state and epoch counter rather than starting a
        # fresh run that merely begins from the last weights.
        args["resume"] = True
    else:
        print(f"Starting new training run: {name}")
        model = YOLO(base_model)

    # Start training
    model.train(**args)
    print(f"✅ Phase '{name}' complete!")

# ============================================================
# Main Training Pipeline with DDP
# ============================================================
def main(rank, world_size):
    """Run the general training phase followed by the fine-tuning phase.

    No process group is created here: Ultralytics starts and tears down its
    own DDP workers when it is given several devices, and initializing a
    ``world_size``-process group from this single process would block while
    waiting for ranks that are never started.

    Args:
        rank: GPU index used when ``world_size`` is 1.
        world_size: Number of GPUs to train on.
    """
    # -----------------------
    # Phase 1: General Train
    # -----------------------
    train_phase(
        rank=rank,
        world_size=world_size,
        name="coral_algae_phase1",
        base_model=MODEL_PATH,
        data=DATASET_PATH,
        epochs=150,
        batch=64,
        imgsz=640
    )

    # -----------------------
    # Phase 2: Fine-Tune
    # -----------------------
    # Start from the best phase-1 weights when they exist, else from the
    # pretrained base model.
    phase1_best = RUNS_DIR / "coral_algae_phase1" / "weights" / "best.pt"
    fine_tune_base = str(phase1_best if phase1_best.exists() else MODEL_PATH)

    train_phase(
        rank=rank,
        world_size=world_size,
        name="coral_algae_finetune",
        base_model=fine_tune_base,
        data=DATASET_PATH,
        epochs=50,
        batch=8,
        imgsz=640,
        lr0=0.001
    )

    print_header("✅ All training phases complete!")

# ============================================================
# Run the Script for Distributed Training
# ============================================================
if __name__ == "__main__":
    # Number of GPUs taking part in the run (2 in this case).
    world_size = 2

    # Export the world size for distributed training.
    os.environ["WORLD_SIZE"] = f"{world_size}"

    # A launcher such as torchrun exports RANK; default to rank 0 without one.
    rank = int(os.environ.get("RANK", 0))

    # Kick off the training pipeline.
    main(rank, world_size)
