In [2]:
!pip install torch torchvision pycocotools matplotlib numpy Pillow psutil
!pip install ultralytics  # For YOLOv8

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [4]:
# Download COCO 2017 Val
!mkdir -p data/coco
!wget -nc http://images.cocodataset.org/annotations/annotations_trainval2017.zip -P data/coco
!wget -nc http://images.cocodataset.org/zips/val2017.zip -P data/coco
!unzip -n data/coco/annotations_trainval2017.zip -d data/coco
!unzip -n data/coco/val2017.zip -d data/coco/images

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: data/coco/images/val2017/000000212226.jpg  
 extracting: data/coco/images/val2017/000000231527.jpg  
 extracting: data/coco/images/val2017/000000578922.jpg  
 extracting: data/coco/images/val2017/000000062808.jpg  
 extracting: data/coco/images/val2017/000000119038.jpg  
 extracting: data/coco/images/val2017/000000114871.jpg  
 extracting: data/coco/images/val2017/000000463918.jpg  
 extracting: data/coco/images/val2017/000000365745.jpg  
 extracting: data/coco/images/val2017/000000320425.jpg  
 extracting: data/coco/images/val2017/000000481404.jpg  
 extracting: data/coco/images/val2017/000000314294.jpg  
 extracting: data/coco/images/val2017/000000335328.jpg  
 extracting: data/coco/images/val2017/000000513688.jpg  
 extracting: data/coco/images/val2017/000000158548.jpg  
 extracting: data/coco/images/val2017/000000132116.jpg  
 extracting: data/coco/images/val2017/000000415238.jpg  
 extracting: data/coco/

In [14]:
import torch
import torchvision
from torchvision.models.detection import maskrcnn_resnet50_fpn
from ultralytics import YOLO
from PIL import Image
import numpy as np
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils
import time
import cv2
import matplotlib.pyplot as plt
import psutil
import os

In [33]:
NUM_IMAGES = 1000
CONFIDENCE_THRESHOLD = 0.5

In [15]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load Mask R-CNN
maskrcnn_model = maskrcnn_resnet50_fpn(pretrained=True).to(device).eval()

# Load YOLOv8-seg
yolov8_model = YOLO('yolov8x-seg.pt').to(device)



In [34]:
from PIL import Image
from torchvision.transforms import functional as F
import torch
import numpy as np
from pycocotools import mask as maskUtils
import time
import cv2

def evaluate_model(model, coco, img_ids, model_type='maskrcnn'):
    """Evaluate model on COCO dataset"""
    metrics = []

    for img_id in img_ids[:NUM_IMAGES]:  # Evaluate on first 50 images
        # Load image and annotations
        img_info = coco.loadImgs(img_id)[0]
        img_path = f"data/coco/images/val2017/{img_info['file_name']}"
        ann_ids = coco.getAnnIds(imgIds=img_id)
        annotations = coco.loadAnns(ann_ids)

        # Get ground truth
        gt_masks = [coco.annToMask(ann) for ann in annotations]
        gt_boxes = [ann['bbox'] for ann in annotations]

        # Get image dimensions
        img = Image.open(img_path)
        img_width, img_height = img.size

        # Run inference
        start_time = time.time()

        if model_type == 'maskrcnn':
            # Mask R-CNN inference
            image_tensor = F.to_tensor(img).unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = model(image_tensor)[0]

            # Process Mask R-CNN outputs
            pred_masks = outputs['masks'].cpu().numpy().squeeze(1)
            scores = outputs['scores'].cpu().numpy()
            keep = scores > CONFIDENCE_THRESHOLD  # Confidence threshold
            pred_masks = pred_masks[keep]
            pred_boxes = outputs['boxes'].cpu().numpy()[keep]

        else:  # YOLOv8
            # YOLOv8 inference
            try:
                results = model(img_path)
                result = results[0]  # Get first (and only) result

                # Process YOLOv8 outputs
                if result.masks is None:  # No detections
                    pred_masks = np.zeros((0, img_height, img_width))
                    pred_boxes = np.zeros((0, 4))
                else:
                    pred_masks = result.masks.data.cpu().numpy()
                    pred_boxes = result.boxes.xyxy.cpu().numpy()

                    # Resize masks to original image dimensions
                    resized_masks = []
                    for mask in pred_masks:
                        mask = cv2.resize(mask.squeeze(), (img_width, img_height))
                        resized_masks.append(mask)
                    pred_masks = np.array(resized_masks)

            except Exception as e:
                print(f"Error processing image {img_id}: {str(e)}")
                pred_masks = np.zeros((0, img_height, img_width))
                pred_boxes = np.zeros((0, 4))

        inference_time = time.time() - start_time

        # Calculate metrics if we have predictions
        if len(pred_boxes) > 0:
            # Convert masks to RLE format for pycocotools
            pred_rles = [maskUtils.encode(np.asarray(mask > 0.5, order='F'))
                        for mask in pred_masks]
            gt_rles = [maskUtils.encode(np.asarray(mask, order='F'))
                      for mask in gt_masks]

            # Calculate IoU
            ious = maskUtils.iou(pred_rles, gt_rles, [0]*len(gt_rles))
            miou = np.mean(ious) if len(ious) > 0 else 0

            # Calculate mAP (simplified version)
            tp = np.sum(np.max(ious, axis=1) > 0.5) if len(ious) > 0 else 0
            precision = tp / len(pred_boxes)

        else:
            miou = 0
            precision = 0

        metric = {
            'mAP': precision,
            'mIoU': miou,
            'Dice': 2 * miou / (1 + miou) if miou > 0 else 0,
            'FPS': 1 / (inference_time + 1e-6)  # Avoid division by zero
        }

        print("Metric for:", img_path)
        print(metric)
        metrics.append(metric)

    # Aggregate metrics
    return {
        'mAP': np.mean([r['mAP'] for r in metrics]),
        'mIoU': np.mean([r['mIoU'] for r in metrics]),
        'Dice': np.mean([r['Dice'] for r in metrics]),
        'FPS': np.mean([r['FPS'] for r in metrics])
    }

In [35]:
# Initialize COCO API
coco = COCO('data/coco/annotations/instances_val2017.json')
img_ids = coco.getImgIds()[:NUM_IMAGES]

# Evaluate models
maskrcnn_results = evaluate_model(maskrcnn_model, coco, img_ids, 'maskrcnn')
yolov8_results = evaluate_model(yolov8_model, coco, img_ids, 'yolov8')

# Display results
print("Mask R-CNN Results:", maskrcnn_results)
print("YOLOv8 Results:", yolov8_results)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Speed: 2.0ms preprocess, 45.7ms inference, 2.5ms postprocess per image at shape (1, 3, 448, 640)
Metric for: data/coco/images/val2017/000000397133.jpg
{'mAP': np.float64(0.4117647058823529), 'mIoU': np.float64(0.017405522619574485), 'Dice': np.float64(0.03421550646739061), 'FPS': 10.877000421551482}

image 1/1 /content/data/coco/images/val2017/000000037777.jpg: 448x640 1 bottle, 1 bowl, 1 banana, 5 oranges, 2 chairs, 1 potted plant, 1 dining table, 1 oven, 1 sink, 1 refrigerator, 45.0ms
Speed: 2.9ms preprocess, 45.0ms inference, 2.7ms postprocess per image at shape (1, 3, 448, 640)
Metric for: data/coco/images/val2017/000000037777.jpg
{'mAP': np.float64(0.26666666666666666), 'mIoU': np.float64(0.03709713436467117), 'Dice': np.float64(0.07154032758444943), 'FPS': 14.342384320999303}

image 1/1 /content/data/coco/images/val2017/000000252219.jpg: 448x640 3 persons, 1 car, 2 traffic lights, 1 backpack, 2 umbrellas, 47.1ms
Spe

In [36]:
import pandas as pd

results_df = pd.DataFrame({
    'Model': ['Mask R-CNN', 'YOLOv8'],
    'mAP@0.5': [maskrcnn_results['mAP'], yolov8_results['mAP']],
    'mIoU': [maskrcnn_results['mIoU'], yolov8_results['mIoU']],
    'Dice': [maskrcnn_results['Dice'], yolov8_results['Dice']],
    'FPS': [maskrcnn_results['FPS'], yolov8_results['FPS']]
})

print(results_df)

        Model   mAP@0.5      mIoU      Dice        FPS
0  Mask R-CNN  0.653778  0.194674  0.285754   6.502511
1      YOLOv8  0.746885  0.221345  0.313887  13.991079


In [22]:
COCO_CLASS_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
    'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A',
    'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [27]:
def visualize_results(img_id, coco):
    img_info = coco.loadImgs(img_id)[0]
    img_path = f"data/coco/images/val2017/{img_info['file_name']}"
    image = np.array(Image.open(img_path))
    img_height, img_width = image.shape[:2]

    # Create figure
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 8))

    # Ground truth visualization
    ax1.imshow(image)
    for ann in coco.loadAnns(coco.getAnnIds(imgIds=img_id)):
        # Draw bounding box
        bbox = ann['bbox']
        x, y, w, h = bbox
        rect = plt.Rectangle((x, y), w, h, fill=False,
                           color='red', linewidth=2, linestyle='-')
        ax1.add_patch(rect)

        # Draw segmentation mask
        mask = coco.annToMask(ann)
        color = np.random.random(3)
        image_masked = image.copy()
        image_masked[mask == 1] = image_masked[mask == 1] * 0.7 + color * 255 * 0.3
        ax1.imshow(image_masked)

        # Add label
        class_name = COCO_CLASS_NAMES[ann['category_id']]
        ax1.text(x, y - 5, class_name,
                bbox=dict(facecolor='red', alpha=0.5),
                fontsize=8, color='white')
    ax1.set_title('Ground Truth')

    # Mask R-CNN visualization
    ax2.imshow(image)
    image_tensor = F.to_tensor(Image.open(img_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = maskrcnn_model(image_tensor)[0]

    for i in range(len(outputs['boxes'])):
        if outputs['scores'][i] < CONFIDENCE_THRESHOLD:  # Confidence threshold
            continue

        # Draw bounding box
        box = outputs['boxes'][i].cpu().numpy()
        x1, y1, x2, y2 = box
        width = x2 - x1
        height = y2 - y1
        rect = plt.Rectangle((x1, y1), width, height,
                           fill=False, color='blue', linewidth=2)
        ax2.add_patch(rect)

        # Draw segmentation mask
        mask = outputs['masks'][i].squeeze().cpu().numpy() > CONFIDENCE_THRESHOLD
        color = np.random.random(3)
        image_masked = image.copy()
        image_masked[mask] = image_masked[mask] * 0.7 + color * 255 * 0.3
        ax2.imshow(image_masked)

        # Add label and confidence
        class_name = COCO_CLASS_NAMES[outputs['labels'][i].item()]
        score = outputs['scores'][i].item()
        ax2.text(x1, y1 - 5, f"{class_name}: {score:.2f}",
                bbox=dict(facecolor='blue', alpha=0.5),
                fontsize=8, color='white')
    ax2.set_title('Mask R-CNN')

    # YOLOv8 visualization
    ax3.imshow(image)
    results = yolov8_model(img_path)
    result = results[0]

    if result.masks is not None:
        for i in range(len(result.boxes)):
            if result.boxes.conf[i] < CONFIDENCE_THRESHOLD:  # Confidence threshold
                continue

            # Draw bounding box
            box = result.boxes.xyxy[i].cpu().numpy()
            x1, y1, x2, y2 = box
            width = x2 - x1
            height = y2 - y1
            rect = plt.Rectangle((x1, y1), width, height,
                               fill=False, color='green', linewidth=2)
            ax3.add_patch(rect)

            # Draw segmentation mask
            mask = result.masks.data[i].cpu().numpy()
            mask = cv2.resize(mask.squeeze(), (img_width, img_height)) > CONFIDENCE_THRESHOLD
            color = np.random.random(3)
            image_masked = image.copy()
            image_masked[mask] = image_masked[mask] * 0.7 + color * 255 * 0.3
            ax3.imshow(image_masked)

            # Add label and confidence
            class_id = int(result.boxes.cls[i].item())
            class_name = yolov8_model.names[class_id]
            score = result.boxes.conf[i].item()
            ax3.text(x1, y1 - 5, f"{class_name}: {score:.2f}",
                    bbox=dict(facecolor='green', alpha=0.5),
                    fontsize=8, color='white')
    ax3.set_title('YOLOv8')

    plt.tight_layout()
    plt.show()

In [28]:
for img_id in img_ids[:3]:
    visualize_results(img_id, coco)

Output hidden; open in https://colab.research.google.com to view.