# Video Object Detection using YOLO

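This notebook runs YOLO (You Only Look Once) object detection on a video file, writes an annotated copy of the video together with a JSON file of the detections, and, when ground-truth annotations are available, evaluates the detections using Intersection over Union (IoU).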


## 1. Setup and Imports


In [None]:
import cv2
import numpy as np
import json
import os
from pathlib import Path
from typing import List, Tuple, Dict
from ultralytics import YOLO
import matplotlib.pyplot as plt
from collections import defaultdict

# Reached only if every import above succeeded; a failed import raises before this line runs.
print("Libraries imported successfully!")


## 2. Configuration


In [None]:
# Configuration
# All tunable settings for the notebook live in this cell; later cells only read them.

# Source video that is opened with cv2.VideoCapture.
INPUT_VIDEO = "input_video.mp4"  # Change this to your video file path
# Annotated copy of the video; the detections JSON path is derived from this name.
OUTPUT_VIDEO = "output_video.mp4"  # Output video with bounding boxes
# Weights file handed to ultralytics.YOLO(...).
MODEL_NAME = "yolov8n.pt"  # YOLO model (yolov8n.pt, yolov8s.pt, yolov8m.pt, etc.)
# Passed to the model as `conf=` on every frame.
CONFIDENCE_THRESHOLD = 0.25  # Minimum confidence for detections
# Only detections whose class id equals this value are drawn and stored.
DETECTION_CLASS = 0  # 0 = person in COCO dataset
# JSON file read by the evaluation cell; it is looked up by frame number as a
# string key, and each value is used as a list of [x1, y1, x2, y2] boxes.
GROUND_TRUTH_FILE = "ground_truth.json"  # Ground truth annotations (if available)
# Minimum IOU for a detection/ground-truth pair to count as a match.
IOU_THRESHOLD = 0.5  # IOU threshold for matching detections to ground truth


## 3. Helper Functions


In [None]:
def calculate_iou(boxA: List[float], boxB: List[float]) -> float:
    """
    Return the Intersection over Union (IOU) of two axis-aligned boxes.

    Args:
        boxA: First box as [x1, y1, x2, y2].
        boxB: Second box as [x1, y1, x2, y2].

    Returns:
        Intersection area divided by union area, or 0.0 when the union
        area is zero.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Overlap extent along each axis; clamped to zero when the boxes do not
    # overlap on that axis.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter_area = overlap_w * overlap_h

    # Union = sum of both areas minus the part counted twice.
    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union_area = area_a + area_b - inter_area

    # Guard the division for two zero-area boxes.
    return inter_area / union_area if union_area != 0 else 0.0


## 4. Load YOLO Model


In [None]:
# Load the YOLO weights named by MODEL_NAME.
print(f"Loading YOLO model: {MODEL_NAME}")
model = YOLO(MODEL_NAME)
print("Model loaded successfully!")

# Display model info
print(f"\nModel classes: {len(model.names)} classes")
# Report the class selected by DETECTION_CLASS rather than a hard-coded
# "class 0 (person)", so the line stays correct when the config changes.
print(f"Detection class {DETECTION_CLASS}: {model.names[DETECTION_CLASS]}")


## 5. Process Video and Detect Objects


In [None]:
# Run the detector over every frame of INPUT_VIDEO, draw the boxes of the
# configured class, write the annotated frames to OUTPUT_VIDEO and collect the
# detections per frame in `all_detections`.

# Open video file
cap = cv2.VideoCapture(INPUT_VIDEO)

if not cap.isOpened():
    raise ValueError(f"Could not open video file: {INPUT_VIDEO}")

# Get video properties
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 would make the
# written video play at a different speed than the input.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # cap.get() yields 0 when the property is unavailable; fall back so the
    # duration calculation and the VideoWriter do not operate on 0 FPS.
    print("Warning: could not read FPS from the video; assuming 30 FPS")
    fps = 30.0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

print(f"Video properties:")
print(f"  Resolution: {frame_width}x{frame_height}")
print(f"  FPS: {fps:.2f}")
print(f"  Total frames: {total_frames}")
print(f"  Duration: {total_frames/fps:.2f} seconds")

# Define codec and create VideoWriter
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (frame_width, frame_height))
if not out.isOpened():
    # Fail loudly instead of silently producing no output file.
    cap.release()
    raise ValueError(f"Could not open output video for writing: {OUTPUT_VIDEO}")

# Label text follows the configured class instead of a hard-coded "Person".
class_label = str(model.names[DETECTION_CLASS]).capitalize()

# Storage for detections: frame index -> list of detection dicts
all_detections = {}
frame_count = 0
total_detections = 0

print(f"\nProcessing video...")

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Detect objects
        results = model(frame, conf=CONFIDENCE_THRESHOLD, verbose=False)

        frame_detections = []
        annotated_frame = frame.copy()

        for result in results:
            boxes = result.boxes
            # Convert the tensors once per frame rather than once per box.
            xyxy = boxes.xyxy.cpu().numpy()  # rows of [x1, y1, x2, y2]
            confidences = boxes.conf.cpu().numpy()
            class_ids = boxes.cls.cpu().numpy()

            for box, raw_confidence, raw_class in zip(xyxy, confidences, class_ids):
                confidence = float(raw_confidence)
                class_id = int(raw_class)

                # Keep only the configured class.
                if class_id != DETECTION_CLASS:
                    continue

                x1, y1, x2, y2 = map(int, box)

                # Draw bounding box
                cv2.rectangle(annotated_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # Draw label with confidence on a filled background
                label = f"{class_label} {confidence:.2f}"
                label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)
                cv2.rectangle(annotated_frame, (x1, y1 - label_size[1] - 10),
                              (x1 + label_size[0], y1), (0, 255, 0), -1)
                cv2.putText(annotated_frame, label, (x1, y1 - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)

                frame_detections.append({
                    'bbox': box.tolist(),
                    'confidence': confidence,
                    'class_id': class_id
                })

        # Save detections for this frame
        all_detections[frame_count] = frame_detections
        total_detections += len(frame_detections)

        # Write frame to output video
        out.write(annotated_frame)

        frame_count += 1
        if frame_count % 30 == 0:
            print(f"Processed {frame_count}/{total_frames} frames...")
finally:
    # Release both handles even if inference or drawing raises, so the
    # output file is finalized and the input is not left open.
    cap.release()
    out.release()

print(f"\nProcessing complete!")
print(f"Total frames processed: {frame_count}")
print(f"Total detections: {total_detections}")
if frame_count > 0:
    print(f"Average detections per frame: {total_detections/frame_count:.2f}")
else:
    # Avoid dividing by zero when the capture yields no frames.
    print("Average detections per frame: n/a (no frames were read)")
print(f"Output video saved to: {OUTPUT_VIDEO}")


## 6. Save Detections


In [None]:
# Save detections to JSON
# Build the path from the video's stem instead of str.replace('.mp4', ...):
# with any other extension the replace is a no-op and the JSON would be
# written over the output video itself.
output_video_path = Path(OUTPUT_VIDEO)
detections_path = str(
    output_video_path.with_name(f"{output_video_path.stem}_detections.json")
)
with open(detections_path, 'w') as f:
    # Integer frame indices become string keys in the JSON file.
    json.dump(all_detections, f, indent=2)

print(f"Detections saved to: {detections_path}")


## 7. Evaluate with Ground Truth (if available)


In [None]:
def match_detections_to_ground_truth(detections: List[Dict],
                                     ground_truth: List[List[float]],
                                     iou_threshold: float = 0.5) -> Tuple[List[float], List[bool]]:
    """
    Greedily pair detections with ground-truth boxes by descending IOU.

    Args:
        detections: Detection dicts; each must provide a 'bbox' entry that
            is passed to calculate_iou.
        ground_truth: Ground-truth boxes, each passed to calculate_iou.
        iou_threshold: Minimum IOU for a pair to be accepted as a match.

    Returns:
        A tuple (matched_ious, match_flags): the IOU of every accepted pair
        in the order the pairs were accepted, and one boolean per detection
        telling whether that detection was matched. Each detection and each
        ground-truth box takes part in at most one pair.
    """
    match_flags = [False] * len(detections)
    if not detections or not ground_truth:
        return [], match_flags

    # Score every detection/ground-truth pair, keeping only pairs that
    # overlap at all (a pair with zero IOU is never a match).
    candidates = []
    for det_idx, det in enumerate(detections):
        det_bbox = det['bbox']
        for gt_idx, gt_bbox in enumerate(ground_truth):
            iou = calculate_iou(det_bbox, gt_bbox)
            if iou > 0:
                candidates.append((iou, det_idx, gt_idx))

    # Best overlap first. The sort is stable, so equal IOUs stay in
    # detection-major order.
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    matched_ious = []
    gt_taken = [False] * len(ground_truth)
    for iou, det_idx, gt_idx in candidates:
        # Everything from here on is below the threshold as well.
        if not iou >= iou_threshold:
            break
        if match_flags[det_idx] or gt_taken[gt_idx]:
            continue
        matched_ious.append(iou)
        match_flags[det_idx] = True
        gt_taken[gt_idx] = True

    return matched_ious, match_flags


In [None]:
# Evaluate the stored detections against ground truth, when a ground-truth
# file exists; otherwise just explain how to obtain one.
if not os.path.exists(GROUND_TRUTH_FILE):
    print(f"Ground truth file not found: {GROUND_TRUTH_FILE}")
    print("To create ground truth annotations, use the annotate_ground_truth.py script")
else:
    print(f"Loading ground truth from: {GROUND_TRUTH_FILE}")
    with open(GROUND_TRUTH_FILE, 'r') as f:
        ground_truth = json.load(f)

    # IOUs of all matched pairs across frames, plus per-frame metric dicts
    all_ious = []
    frame_metrics = {}

    for frame_num_str, frame_detections in all_detections.items():
        frame_num = int(frame_num_str)
        gt_key = str(frame_num)  # ground truth is keyed by frame number as a string

        # Frames without annotations are left out of the evaluation.
        if gt_key not in ground_truth:
            continue

        gt_boxes = ground_truth[gt_key]
        matched_ious, match_flags = match_detections_to_ground_truth(
            frame_detections, gt_boxes, IOU_THRESHOLD
        )
        all_ious.extend(matched_ious)

        # Counts for this frame
        true_positives = len(matched_ious)
        false_positives = match_flags.count(False)
        false_negatives = len(gt_boxes) - true_positives

        # Each ratio falls back to 0 when its denominator is empty.
        predicted_count = true_positives + false_positives
        if predicted_count > 0:
            precision = true_positives / predicted_count
        else:
            precision = 0

        annotated_count = true_positives + false_negatives
        if annotated_count > 0:
            recall = true_positives / annotated_count
        else:
            recall = 0

        if (precision + recall) > 0:
            f1_score = 2 * (precision * recall) / (precision + recall)
        else:
            f1_score = 0

        if matched_ious:
            avg_iou = np.mean(matched_ious)
        else:
            avg_iou = 0

        frame_metrics[frame_num] = {
            'true_positives': true_positives,
            'false_positives': false_positives,
            'false_negatives': false_negatives,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'avg_iou': avg_iou,
            'matched_ious': matched_ious
        }

    # Overall metrics over every matched pair
    if all_ious:
        mean_iou = np.mean(all_ious)
        median_iou = np.median(all_ious)
    else:
        mean_iou = 0
        median_iou = 0

    separator = '=' * 80
    print(f"\n{separator}")
    print(f"EVALUATION RESULTS")
    print(f"{separator}")
    print(f"Mean IOU: {mean_iou:.4f}")
    print(f"Median IOU: {median_iou:.4f}")
    print(f"Total Matches: {len(all_ious)}")
    print(f"\nPer-frame metrics:")
    for frame_num in sorted(frame_metrics):
        metrics = frame_metrics[frame_num]
        print(f"\nFrame {frame_num}:")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1 Score: {metrics['f1_score']:.4f}")
        print(f"  Average IOU: {metrics['avg_iou']:.4f}")

    # Histogram of matched IOUs with the mean marked; skipped when nothing matched
    if all_ious:
        plt.figure(figsize=(10, 6))
        plt.hist(all_ious, bins=20, edgecolor='black')
        plt.xlabel('IOU Value')
        plt.ylabel('Frequency')
        plt.title('Distribution of IOU Values')
        plt.axvline(mean_iou, color='r', linestyle='--', label=f'Mean IOU: {mean_iou:.3f}')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()


In [None]:
# Display a few sample frames from the output video in a 2x2 grid
cap = cv2.VideoCapture(OUTPUT_VIDEO)
if not cap.isOpened():
    # Without this check every read below fails and four empty plots are shown.
    raise ValueError(f"Could not open video file: {OUTPUT_VIDEO}")

# Frames at the start, 1/4, 1/2 and 3/4 of the video
sample_frames = [0, total_frames // 4, total_frames // 2, 3 * total_frames // 4]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.flatten()

try:
    for idx, frame_num in enumerate(sample_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()

        # Hide the ticks on every panel, including ones whose frame is missing.
        axes[idx].axis('off')

        if ret:
            # Convert BGR to RGB for matplotlib
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            axes[idx].imshow(frame_rgb)
            axes[idx].set_title(f"Frame {frame_num}")
        else:
            # Make a failed read visible instead of leaving a blank panel.
            axes[idx].set_title(f"Frame {frame_num} (unavailable)")
finally:
    # Release the capture even if conversion or plotting raises.
    cap.release()

plt.tight_layout()
plt.show()
