In [None]:
import torch
import time 
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torch.utils.data import Dataset, DataLoader
import yaml
from PIL import Image
import os

# Config
# Module-level settings read by the dataset class and the training loop below.
data_yaml = 'data.yaml'  # Dataset YAML; must provide 'path', the split keys and 'names'
batch_size = 2  # Images per DataLoader batch
image_size = 512  # Images are resized to image_size x image_size; boxes are scaled to match
epochs = 20  # Number of passes over the training set

# Dataset
class YOLODataset(Dataset):
    """Object-detection dataset backed by YOLO-format label files.

    Each item is ``(image, target)``: ``image`` is a float tensor resized to
    ``image_size`` x ``image_size`` and ``target`` is a dict with the keys
    ``boxes`` (absolute ``[x1, y1, x2, y2]``), ``labels``, ``image_id``,
    ``area`` and ``iscrowd``. Images without (valid) labels yield empty
    targets instead of raising.
    """

    def __init__(self, yaml_path, mode='train'):
        """Index the images of one dataset split.

        Args:
            yaml_path: Path to a YAML file providing ``path`` (dataset root),
                an entry named after ``mode`` (image directory relative to
                the root) and ``names`` (the class names).
            mode: Key of the split to load from the YAML file.
        """
        with open(yaml_path) as f:
            data = yaml.safe_load(f)

        self.img_dir = os.path.join(data['path'], data[mode])
        # Swap only the LAST 'images' occurrence so a dataset root that itself
        # contains the word (e.g. '/data/images_v2/images/train') stays intact.
        self.label_dir = 'labels'.join(self.img_dir.rsplit('images', 1))
        # Sorted so indices (and therefore image_id) are reproducible between
        # runs; extensions are matched case-insensitively.
        self.images = sorted(
            f for f in os.listdir(self.img_dir)
            if f.lower().endswith(('.jpg', '.png', '.jpeg'))
        )
        self.classes = data['names']

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.images)

    @staticmethod
    def _parse_yolo_lines(lines, size):
        """Convert YOLO label lines into absolute corner boxes.

        Args:
            lines: Iterable of strings of the form ``"class xc yc w h"`` with
                coordinates normalized to the image size.
            size: Side length in pixels of the (square) resized image.

        Returns:
            ``(boxes, labels)`` where ``boxes`` is a list of
            ``[x1, y1, x2, y2]`` floats clamped to ``[0, size]`` and
            ``labels`` is a list of ints (class id + 1, 0 being background).
            Lines that do not have exactly five fields and boxes without a
            positive width and height are skipped.
        """
        limit = float(size)
        boxes, labels = [], []
        for line in lines:
            parts = line.split()
            if len(parts) != 5:  # blank line or other annotation format
                continue
            class_id, xc, yc, w, h = map(float, parts)
            # Convert YOLO (center, size) to Pascal VOC corners in pixels
            x1 = min(max((xc - w / 2) * size, 0.0), limit)
            y1 = min(max((yc - h / 2) * size, 0.0), limit)
            x2 = min(max((xc + w / 2) * size, 0.0), limit)
            y2 = min(max((yc + h / 2) * size, 0.0), limit)
            # The detection model rejects boxes with non-positive extent
            if x2 <= x1 or y2 <= y1:
                continue
            boxes.append([x1, y1, x2, y2])
            labels.append(int(class_id) + 1)  # +1 because background is class 0
        return boxes, labels

    @staticmethod
    def _build_target(boxes, labels, idx):
        """Pack box/label lists into the target dict, handling the empty case."""
        if boxes:
            box_tensor = torch.tensor(boxes, dtype=torch.float32)
        else:
            # Keep the (N, 4) layout even when there is nothing to detect
            box_tensor = torch.zeros((0, 4), dtype=torch.float32)
        heights = box_tensor[:, 3] - box_tensor[:, 1]
        widths = box_tensor[:, 2] - box_tensor[:, 0]
        return {
            'boxes': box_tensor,
            'labels': torch.tensor(labels, dtype=torch.int64),
            'image_id': torch.tensor([idx]),
            'area': heights * widths,
            'iscrowd': torch.zeros(len(labels), dtype=torch.int64)
        }

    def __getitem__(self, idx):
        """Load the image at ``idx`` and its target dict."""
        # Load image (the context manager releases the file handle promptly)
        img_path = os.path.join(self.img_dir, self.images[idx])
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')
        img = torchvision.transforms.functional.to_tensor(img)
        img = torchvision.transforms.functional.resize(img, [image_size]*2)

        # Load labels
        label_path = os.path.join(self.label_dir,
                                  os.path.splitext(self.images[idx])[0] + '.txt')
        boxes, labels = [], []
        if os.path.exists(label_path):
            with open(label_path) as f:
                boxes, labels = self._parse_yolo_lines(f, image_size)

        return img, self._build_target(boxes, labels, idx)

# Model
def create_model(num_classes):
    """Assemble a Faster R-CNN detector on a pretrained MobileNetV2 backbone.

    Args:
        num_classes: Number of classes handed to ``FasterRCNN``; the caller in
            this file passes the dataset class count plus one for background.

    Returns:
        The constructed ``FasterRCNN`` model.
    """
    # Only the convolutional feature stack of MobileNetV2 is used.
    feature_extractor = torchvision.models.mobilenet_v2(weights='DEFAULT').features
    # Channel count of the MobileNetV2 feature output, exposed for the detector.
    feature_extractor.out_channels = 1280

    detector_parts = {
        # One feature map -> one tuple of sizes and one tuple of aspect ratios.
        'rpn_anchor_generator': AnchorGenerator(
            sizes=((32, 64, 128, 256),),
            aspect_ratios=((0.5, 1.0, 2.0),)
        ),
        'box_roi_pool': torchvision.ops.MultiScaleRoIAlign(
            featmap_names=['0'],
            output_size=7,
            sampling_ratio=2
        ),
    }

    return FasterRCNN(feature_extractor, num_classes=num_classes, **detector_parts)

# Training
def train():
    """Train the detector on the ``train`` split.

    Reads the module-level settings ``data_yaml``, ``batch_size`` and
    ``epochs``, builds the dataset, loader and model, and runs an AdamW
    training loop that prints per-batch and per-epoch statistics. The GPU is
    used when one is available; otherwise training falls back to the CPU
    instead of crashing.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Data
    train_set = YOLODataset(data_yaml, 'train')
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        # Turn a list of (image, target) pairs into (images, targets) tuples;
        # targets differ in length per image, so nothing is stacked.
        collate_fn=lambda x: tuple(zip(*x))
    )

    # Model (+1 output class for background)
    model = create_model(len(train_set.classes) + 1).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

    print(f"\n🚀 Starting training on {len(train_set)} images")
    print(f"📦 Batch size: {batch_size} | 🔄 Total batches: {len(train_loader)}")
    print(f"🔥 Epochs: {epochs} | 💻 Device: {next(model.parameters()).device}\n")

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        start_time = time.time()

        for batch_idx, (images, targets) in enumerate(train_loader):
            batch_start = time.time()

            # Move to the training device
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward + backward; in train mode the model returns a dict of losses
            optimizer.zero_grad()
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())

            losses.backward()
            optimizer.step()

            batch_time = time.time() - batch_start
            # Read the scalar once: every .item() call forces a device sync
            loss_value = losses.item()
            epoch_loss += loss_value
            avg_loss = epoch_loss / (batch_idx + 1)
            mem_gb = torch.cuda.memory_allocated() / 1e9 if device.type == 'cuda' else 0.0

            print(
                f"\rEpoch {epoch+1}/{epochs} | "
                f"Batch {batch_idx+1}/{len(train_loader)} | "
                f"Loss: {loss_value:.3f} (avg: {avg_loss:.3f}) | "
                f"Time: {batch_time:.2f}s/batch | "
                f"Mem: {mem_gb:.2f}GB",
                end="", flush=True
            )

        epoch_time = time.time() - start_time
        print(f"\n✅ Epoch {epoch+1} complete | "
              f"Avg loss: {epoch_loss/len(train_loader):.4f} | "
              f"Time: {epoch_time:.1f}s | "
              f"LR: {optimizer.param_groups[0]['lr']:.2e}\n")


if __name__ == '__main__':
    train()

In [None]:
import torch
import time 
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torch.utils.data import Dataset, DataLoader
import yaml
from PIL import Image
import os
from tqdm import tqdm

# Config
# Module-level settings read by the dataset class and the training loop below.
data_yaml = 'data.yaml'  # Dataset YAML; must provide 'path', the split keys and 'names'
batch_size = 2  # Images per DataLoader batch (training and validation)
image_size = 512  # Images are resized to image_size x image_size; boxes are scaled to match
epochs = 20  # Number of passes over the training set
checkpoint_dir = 'checkpoints'  # Where best_model.pth and the periodic epoch_N.pth files are written
# Created eagerly (when this cell runs) so that saving a checkpoint cannot fail on a missing directory.
os.makedirs(checkpoint_dir, exist_ok=True)

# Dataset
class YOLODataset(Dataset):
    """Object-detection dataset backed by YOLO-format label files.

    Each item is ``(image, target)``: ``image`` is a float tensor resized to
    ``image_size`` x ``image_size`` and ``target`` is a dict with the keys
    ``boxes`` (absolute ``[x1, y1, x2, y2]``), ``labels``, ``image_id``,
    ``area`` and ``iscrowd``. With ``augment=True`` a color jitter and a
    random horizontal flip are applied; the flip mirrors the boxes together
    with the pixels.
    """

    def __init__(self, yaml_path, mode='train', augment=False):
        """Index the images of one dataset split.

        Args:
            yaml_path: Path to a YAML file providing ``path`` (dataset root),
                an entry named after ``mode`` (image directory relative to
                the root) and ``names`` (the class names).
            mode: Key of the split to load from the YAML file.
            augment: Enable training-time augmentation.
        """
        with open(yaml_path) as f:
            data = yaml.safe_load(f)

        self.img_dir = os.path.join(data['path'], data[mode])
        # Swap only the LAST 'images' occurrence so a dataset root that itself
        # contains the word (e.g. '/data/images_v2/images/train') stays intact.
        self.label_dir = 'labels'.join(self.img_dir.rsplit('images', 1))
        # Sorted so indices (and therefore image_id) are reproducible between
        # runs; extensions are matched case-insensitively.
        self.images = sorted(
            f for f in os.listdir(self.img_dir)
            if f.lower().endswith(('.jpg', '.png', '.jpeg'))
        )
        self.classes = data['names']
        self.augment = augment

        # Define transforms
        self.base_transform = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Resize([image_size]*2)
        ])

        if self.augment:
            # Only photometric changes live in the image-only pipeline. The
            # horizontal flip is geometric and is applied in __getitem__,
            # where the boxes can be mirrored as well; flipping the image
            # alone would leave the boxes pointing at the wrong pixels.
            self.transform = torchvision.transforms.Compose([
                torchvision.transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
                self.base_transform
            ])
            self.flip_prob = 0.5
        else:
            self.transform = self.base_transform
            self.flip_prob = 0.0

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.images)

    @staticmethod
    def _parse_yolo_lines(lines, size):
        """Convert YOLO label lines into absolute corner boxes.

        Args:
            lines: Iterable of strings of the form ``"class xc yc w h"`` with
                coordinates normalized to the image size.
            size: Side length in pixels of the (square) resized image.

        Returns:
            ``(boxes, labels)`` where ``boxes`` is a list of
            ``[x1, y1, x2, y2]`` floats clamped to ``[0, size]`` and
            ``labels`` is a list of ints (class id + 1, 0 being background).
            Lines that do not have exactly five fields and boxes without a
            positive width and height are skipped.
        """
        limit = float(size)
        boxes, labels = [], []
        for line in lines:
            parts = line.split()
            if len(parts) != 5:  # Make sure line has correct format
                continue
            class_id, xc, yc, w, h = map(float, parts)
            # Convert YOLO (center, size) to Pascal VOC corners in pixels
            x1 = min(max((xc - w / 2) * size, 0.0), limit)
            y1 = min(max((yc - h / 2) * size, 0.0), limit)
            x2 = min(max((xc + w / 2) * size, 0.0), limit)
            y2 = min(max((yc + h / 2) * size, 0.0), limit)
            # The detection model rejects boxes with non-positive extent
            if x2 <= x1 or y2 <= y1:
                continue
            boxes.append([x1, y1, x2, y2])
            labels.append(int(class_id) + 1)  # +1 because background is class 0
        return boxes, labels

    @staticmethod
    def _hflip_boxes(boxes, size):
        """Mirror ``[x1, y1, x2, y2]`` boxes around the vertical image axis."""
        return [[size - x2, y1, size - x1, y2] for x1, y1, x2, y2 in boxes]

    @staticmethod
    def _build_target(boxes, labels, idx):
        """Pack box/label lists into the target dict, handling the empty case."""
        if boxes:
            box_tensor = torch.tensor(boxes, dtype=torch.float32)
        else:
            # Keep the (N, 4) layout even when there is nothing to detect
            box_tensor = torch.zeros((0, 4), dtype=torch.float32)
        heights = box_tensor[:, 3] - box_tensor[:, 1]
        widths = box_tensor[:, 2] - box_tensor[:, 0]
        return {
            'boxes': box_tensor,
            'labels': torch.tensor(labels, dtype=torch.int64),
            'image_id': torch.tensor([idx]),
            'area': heights * widths,
            'iscrowd': torch.zeros(len(labels), dtype=torch.int64)
        }

    def __getitem__(self, idx):
        """Load the image at ``idx`` and its target dict."""
        # Load image (the context manager releases the file handle promptly)
        img_path = os.path.join(self.img_dir, self.images[idx])
        with Image.open(img_path) as raw:
            img = raw.convert('RGB')
        img = self.transform(img)

        # Load labels
        label_path = os.path.join(self.label_dir,
                                  os.path.splitext(self.images[idx])[0] + '.txt')
        boxes, labels = [], []
        if os.path.exists(label_path):
            with open(label_path) as f:
                boxes, labels = self._parse_yolo_lines(f, image_size)

        # Geometric augmentation: flip pixels and boxes together. torch's RNG
        # is used, as torchvision's own random transforms do.
        if self.flip_prob > 0 and torch.rand(1).item() < self.flip_prob:
            img = img.flip(-1)  # last dimension of the C x H x W tensor is width
            boxes = self._hflip_boxes(boxes, image_size)

        return img, self._build_target(boxes, labels, idx)

# Model
def create_model(num_classes):
    """Build the Faster R-CNN detector used for training.

    A pretrained MobileNetV2 feature stack serves as the backbone; the region
    proposal network and the RoI pooling stage are configured for its single
    feature map.

    Args:
        num_classes: Number of classes handed to ``FasterRCNN``; the caller in
            this file passes the dataset class count plus one for background.

    Returns:
        The constructed ``FasterRCNN`` model.
    """
    mobilenet = torchvision.models.mobilenet_v2(weights='DEFAULT')
    trunk = mobilenet.features
    trunk.out_channels = 1280  # MobilenetV2 feature dimension

    anchor_sizes = ((32, 64, 128, 256),)
    anchor_ratios = ((0.5, 1.0, 2.0),)
    rpn_anchors = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=anchor_ratios)

    box_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'], output_size=7, sampling_ratio=2
    )

    detector = FasterRCNN(
        trunk,
        num_classes=num_classes,
        rpn_anchor_generator=rpn_anchors,
        box_roi_pool=box_pooler,
    )
    return detector

# Validation function
def validate(model, data_loader, device):
    """Return the mean total detection loss of ``model`` over ``data_loader``.

    The model only returns its loss dict in training mode, so it is switched
    to train mode for the forward passes. Batch-norm layers are put back into
    eval mode first so that validation batches do not overwrite the running
    statistics learned during training. No gradients are computed.

    Args:
        model: Detection model called as ``model(images, targets)``.
        data_loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the batches are moved to.

    Returns:
        The summed losses averaged over the batches, or ``nan`` when the
        loader yields no batches. The model is left in eval mode.
    """
    # Train mode to get losses, but with normalization statistics frozen.
    model.train()
    for module in model.modules():
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module.eval()

    val_loss = 0.0
    num_batches = 0
    try:
        with torch.no_grad():
            for images, targets in tqdm(data_loader, desc="Validating"):
                images = [img.to(device) for img in images]
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())

                val_loss += losses.item()
                num_batches += 1
    finally:
        # Same final state as before, also when a batch raises
        model.eval()

    if num_batches == 0:
        return float('nan')
    return val_loss / num_batches

# Training
def _collate_batch(batch):
    """Turn a list of (image, target) pairs into (images, targets) tuples.

    Defined at module level (not as a lambda) so that DataLoader worker
    processes can pickle it on platforms that start workers with "spawn".
    """
    return tuple(zip(*batch))


def train():
    """Train the detector with validation, LR scheduling and checkpointing.

    Reads the module-level settings ``data_yaml``, ``batch_size``, ``epochs``
    and ``checkpoint_dir``. After every epoch the model is validated; the
    weights with the lowest validation loss are saved as ``best_model.pth``
    and a checkpoint is written every fifth epoch. Mixed precision is used
    only when training runs on a CUDA device.
    """
    # Data
    train_set = YOLODataset(data_yaml, 'train', augment=True)
    val_set = YOLODataset(data_yaml, 'val')  # Assuming 'val' exists in your data.yaml

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=_collate_batch,
        num_workers=4
    )

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        collate_fn=_collate_batch,
        num_workers=2
    )

    # Model (+1 output class for background)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    use_amp = device.type == 'cuda'
    model = create_model(len(train_set.classes) + 1).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    # With enabled=False the scaler and autocast are transparent no-ops,
    # so the same loop runs unchanged on the CPU.
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    print(f"\n🚀 Starting training on {len(train_set)} images, validating on {len(val_set)} images")
    print(f"📦 Batch size: {batch_size} | 🔄 Total batches: {len(train_loader)}")
    print(f"🔥 Epochs: {epochs} | 💻 Device: {device}\n")

    best_val_loss = float('inf')

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        start_time = time.time()

        # Training phase
        for batch_idx, (images, targets) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")):
            # Move to device
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

            # Forward + backward with mixed precision
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())

            scaler.scale(losses).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once: every .item() call forces a device sync
            loss_value = losses.item()
            epoch_loss += loss_value

            # Print batch stats
            if (batch_idx + 1) % 10 == 0:
                avg_loss = epoch_loss / (batch_idx + 1)
                mem_gb = torch.cuda.memory_allocated() / 1e9 if use_amp else 0.0
                print(
                    f"\rEpoch {epoch+1}/{epochs} | "
                    f"Batch {batch_idx+1}/{len(train_loader)} | "
                    f"Loss: {loss_value:.3f} (avg: {avg_loss:.3f}) | "
                    f"Mem: {mem_gb:.2f}GB",
                    end="", flush=True
                )

        # Validation phase
        val_loss = validate(model, val_loader, device)
        lr_scheduler.step()

        epoch_time = time.time() - start_time
        train_loss = epoch_loss / len(train_loader)

        print(f"\n✅ Epoch {epoch+1} complete | "
              f"Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f} | "
              f"Time: {epoch_time:.1f}s | LR: {optimizer.param_groups[0]['lr']:.2e}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), os.path.join(checkpoint_dir, 'best_model.pth'))
            print(f"💾 Saved best model with val loss: {val_loss:.4f}")

        # Periodic checkpoint
        if (epoch + 1) % 5 == 0:
            checkpoint_path = os.path.join(checkpoint_dir, f'epoch_{epoch+1}.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print(f"💾 Saved checkpoint at epoch {epoch+1}")


if __name__ == '__main__':
    train()

In [None]:
import torch
import torchvision
from torchvision.models.detection import FasterRCNN
import yaml
from torchvision.models.detection.rpn import AnchorGenerator
import os
from PIL import Image
import numpy as np
from tqdm import tqdm
from torchmetrics.detection import MeanAveragePrecision

# CONFIGURATION
# Module-level settings for the evaluation cell.
MODEL_PATH = "checkpoints/best_model.pth"  # State dict loaded by evaluate_model()
DATA_YAML = "data.yaml"  # Dataset YAML; only its 'names' entry is read here
# NOTE: image/label directories are plain relative paths; they are not derived from DATA_YAML.
IMAGE_DIR = "images/val"
LABEL_DIR = "labels/val"
IMAGE_SIZE = 512  # Images are resized to IMAGE_SIZE x IMAGE_SIZE; boxes are scaled to match
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CONFIDENCE_THRESHOLD = 0.5  # Predictions scoring at or below this are dropped before scoring

# Load class names from data.yaml
# This runs when the cell is executed, so DATA_YAML must exist at that point.
with open(DATA_YAML) as f:
    data = yaml.safe_load(f)
CLASS_NAMES = data['names']
NUM_CLASSES = len(CLASS_NAMES) + 1  # +1 for background

def create_model():
    """Rebuild the detector architecture for loading trained weights.

    The MobileNetV2 backbone is created without pretrained weights
    (``weights=None``); the number of classes comes from the module-level
    ``NUM_CLASSES``.

    Returns:
        The constructed ``FasterRCNN`` model.
    """
    # Only the convolutional feature stack of MobileNetV2 is used.
    net = torchvision.models.mobilenet_v2(weights=None).features
    net.out_channels = 1280

    rpn_anchor_generator = AnchorGenerator(
        sizes=((32, 64, 128, 256),), aspect_ratios=((0.5, 1.0, 2.0),)
    )
    box_roi_pool = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'], output_size=7, sampling_ratio=2
    )

    return FasterRCNN(
        net,
        num_classes=NUM_CLASSES,
        rpn_anchor_generator=rpn_anchor_generator,
        box_roi_pool=box_roi_pool,
    )

def load_data():
    """Read every image in ``IMAGE_DIR`` together with its YOLO labels.

    Images are converted to RGB tensors and resized to ``IMAGE_SIZE`` x
    ``IMAGE_SIZE``. Label files are looked up in ``LABEL_DIR`` under the
    image's base name with a ``.txt`` extension; lines that do not have
    exactly five fields are ignored.

    Returns:
        A list of ``(image_tensor, target)`` pairs where ``target`` holds
        ``boxes`` (float32, ``[x1, y1, x2, y2]`` in pixels) and ``labels``
        (int64, class id + 1). Images without labels get empty tensors.
    """
    accepted_suffixes = ('.jpg', '.png', '.jpeg')
    file_names = [name for name in os.listdir(IMAGE_DIR)
                  if name.endswith(accepted_suffixes)]

    preprocess = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Resize((IMAGE_SIZE, IMAGE_SIZE))
    ])

    samples = []
    for file_name in file_names:
        picture = Image.open(os.path.join(IMAGE_DIR, file_name)).convert('RGB')
        img_tensor = preprocess(picture)

        stem = os.path.splitext(file_name)[0]
        annotation_path = os.path.join(LABEL_DIR, stem + '.txt')
        corner_boxes = []
        class_labels = []

        if os.path.exists(annotation_path):
            with open(annotation_path) as handle:
                for row in handle:
                    fields = row.strip().split()
                    if len(fields) != 5:
                        continue
                    class_id, xc, yc, w, h = map(float, fields)
                    # YOLO (center, size), normalized -> pixel corner coordinates
                    corner_boxes.append([
                        (xc - w/2) * IMAGE_SIZE,
                        (yc - h/2) * IMAGE_SIZE,
                        (xc + w/2) * IMAGE_SIZE,
                        (yc + h/2) * IMAGE_SIZE,
                    ])
                    class_labels.append(int(class_id) + 1)

        # Empty annotations still need correctly shaped tensors.
        if corner_boxes:
            box_tensor = torch.tensor(corner_boxes, dtype=torch.float32)
        else:
            box_tensor = torch.zeros((0, 4), dtype=torch.float32)
        if class_labels:
            label_tensor = torch.tensor(class_labels, dtype=torch.int64)
        else:
            label_tensor = torch.zeros(0, dtype=torch.int64)

        samples.append((img_tensor, {'boxes': box_tensor, 'labels': label_tensor}))
    return samples

def _first_metric(results, keys):
    """Return the first metric found under ``keys`` as a float, else 0.0."""
    for key in keys:
        if key in results:
            return float(results[key])
    return 0.0


def _per_class_ap(results, class_names):
    """Pair every class name with its AP, or ``None`` when none was computed.

    ``map_per_class`` only covers the classes the metric actually observed,
    in the order given by ``classes``, so values are matched by label id
    rather than by position in ``class_names``.
    """
    ap_values = torch.atleast_1d(results.get('map_per_class', torch.tensor([]))).tolist()
    observed = results.get('classes')
    if observed is None:
        # Metric versions without a 'classes' entry: fall back to assuming
        # the values are ordered by label id starting at 1.
        label_ids = list(range(1, len(ap_values) + 1))
    else:
        label_ids = [int(label) for label in torch.atleast_1d(observed).tolist()]
    # Negative values are the metric's "not available" sentinel.
    ap_by_label = {label: ap for label, ap in zip(label_ids, ap_values) if ap >= 0}
    # Labels were shifted by one at load time (0 is background).
    return [(name, ap_by_label.get(index + 1)) for index, name in enumerate(class_names)]


def evaluate_model():
    """Evaluate the saved checkpoint on the validation data and print metrics.

    Loads the weights from ``MODEL_PATH`` onto ``DEVICE``, runs the model on
    every sample returned by ``load_data()``, keeps predictions scoring above
    ``CONFIDENCE_THRESHOLD`` and reports mAP@0.5, the mean per-class AP,
    recall and the per-class AP values.
    """
    model = create_model().to(DEVICE)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    model.eval()

    data = load_data()
    metric = MeanAveragePrecision(
        box_format='xyxy',
        iou_thresholds=[0.5],
        class_metrics=True
    )

    with torch.no_grad():
        for img, target in tqdm(data, desc="Evaluating"):
            img = img.unsqueeze(0).to(DEVICE)
            prediction = model(img)[0]

            keep = prediction['scores'] > CONFIDENCE_THRESHOLD
            filtered_pred = {
                'boxes': prediction['boxes'][keep],
                'scores': prediction['scores'][keep],
                'labels': prediction['labels'][keep]
            }

            # Move target to same device
            target = {
                'boxes': target['boxes'].to(DEVICE),
                'labels': target['labels'].to(DEVICE)
            }

            metric.update([filtered_pred], [target])

    results = metric.compute()

    # CLASS_NAMES may be a list or an {id: name} mapping, depending on the YAML
    if isinstance(CLASS_NAMES, dict):
        class_names = list(CLASS_NAMES.values())
    else:
        class_names = list(CLASS_NAMES)

    per_class = _per_class_ap(results, class_names)
    available = [ap for _, ap in per_class if ap is not None]
    metrics = {
        'map_50': _first_metric(results, ('map_50', 'map@0.5')),
        # Mean of the per-class AP values (classes without a value are left out)
        'precision': sum(available) / len(available) if available else 0.0,
        # The metric reports recall as 'mar_<max detections>'; with a single
        # IoU threshold of 0.5, 'mar_100' is the recall at IoU 0.5. The older
        # key names are kept as fallbacks.
        'recall': _first_metric(results, ('mar_100', 'mar_50', 'recall@0.5')),
        'per_class': per_class
    }

    print("\n📊 Evaluation Results:")
    print(f"mAP@0.5: {metrics['map_50']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")

    print("\nPer-Class AP@0.5:")
    for class_name, ap in metrics['per_class']:
        if ap is None:
            print(f"{class_name}: n/a")
        else:
            print(f"{class_name}: {ap:.4f}")


if __name__ == '__main__':
    evaluate_model()