# Digeiz Technical Test
This test consists in reordering and cleaning a video where the frames have been shuffled and some unrelated frames have been added.

The solution here works by tracking objects across frames (using RT-DETRv2 by Baidu), removing frames where no object is detected, and then computing similarities between frames (in terms of a weighted sum of grayscale histogram distance and main bounding box tracking).

Using this similarity metric, the algorithm then finds a path that maximizes the total similarity between adjacent frames (equivalently, a path of least total distance between frames), starting from candidate frames that are the most distant on average from the other frames. After that, a coherence check removes the frames that are not coherent with their neighbours in terms of tracked objects.

The removed frames are saved in a folder, while a cleaned video and its reversed version (as our algorithm doesn't know in which order the original video might have been filmed) are generated.

All the information regarding the frames and their processing is saved in a JSON file.

### Imports

In [1]:
import cv2
import torch
import numpy as np
from pathlib import Path
from typing import Optional, Tuple, List, Dict
import urllib.request
from PIL import Image
from torchvision.transforms import functional as F
from dataclasses import dataclass
import json
import subprocess
import tempfile
import shutil
import os
from os import path

### A Data Class for storing information about frames:

In [2]:
@dataclass
class FrameDetection:
    """Store detection information for a frame.

    Bundles one decoded video frame together with the detector output that
    was computed for it.
    """
    # Position of the frame in the order it was read from the input video.
    frame_idx: int
    # Pixel data of the frame as read through OpenCV.
    frame: np.ndarray
    # One row per detection, [x1, y1, x2, y2] in pixel coordinates.
    boxes: np.ndarray
    # Confidence score of each detection (same order as `boxes`).
    scores: np.ndarray
    # Class id of each detection (same order as `boxes`).
    labels: np.ndarray
    # True when at least one detection passed the confidence threshold.
    has_detections: bool
    # Highest score among the detections, or 0.0 when there are none.
    max_confidence: float

### The main class for processing videos using RT-DETRv2:

In [3]:


class RTDETRv2VideoPipeline:
    """
    Pipeline for processing videos with RT-DETRv2 object detection.
    Supports filtering frames without objects and reordering shuffled frames.
    """

    def __init__(self, model_name: str = "rtdetrv2_r50vd", device: Optional[str] = None):
        """
        Set up the pipeline: choose a device, load the model and build the class palette.

        Args:
            model_name: Model variant (rtdetrv2_r18vd, rtdetrv2_r34vd, rtdetrv2_r50vd, rtdetrv2_r101vd)
            device: Device to run inference on ('cuda', 'cpu', or None for auto-detect)
        """
        # Fall back to CUDA when available, otherwise CPU, if no device was requested.
        if not device:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.model_name = model_name
        self.model = None

        # Loading the class names loads the model as a side effect, so the
        # device and model name must already be set at this point.
        class_names = self._load_coco_classes()
        self.coco_classes = class_names
        self.colors = self._generate_colors(len(class_names))

    def _load_coco_classes(self):
        """Load COCO dataset class names."""
        self.load_model()
        return self.model.config.id2label

    def _generate_colors(self, n: int):
        """Generate distinct colors for each class."""
        np.random.seed(42)
        return [(np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255))
                for _ in range(n)]

    def load_model(self, weights_path: Optional[str] = None):
        """
        Load the detection model and move it to the configured device in eval mode.

        Args:
            weights_path: Path to model weights. If None, the model and its image
                processor are fetched through Hugging Face transformers.
        """
        print(f"Loading RT-DETRv2 model on {self.device}...")

        if not weights_path:
            # Hugging Face route: no local repo required.
            from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

            default_checkpoint = 'PekingU/rtdetr_r50vd'
            checkpoints = {
                'rtdetrv2_r18vd': 'PekingU/rtdetr_r18vd',
                'rtdetrv2_r34vd': 'PekingU/rtdetr_r34vd',
                'rtdetrv2_r50vd': default_checkpoint,
                'rtdetrv2_r101vd': 'PekingU/rtdetr_r101vd'
            }

            # Unknown variant names fall back to the r50vd checkpoint.
            if self.model_name in checkpoints:
                checkpoint = checkpoints[self.model_name]
            else:
                checkpoint = default_checkpoint

            self.processor = RTDetrImageProcessor.from_pretrained(checkpoint)
            self.model = RTDetrForObjectDetection.from_pretrained(checkpoint)
        else:
            # Local weights route (requires the RT-DETR repo on the import path).
            # NOTE(review): this branch does not set self.processor.
            from rtdetrv2_pytorch.src.zoo.rtdetr import RTDETR
            self.model = RTDETR.from_pretrained(weights_path)

        self.model.to(self.device)
        self.model.eval()
        print("Model loaded successfully!")

    def preprocess_frame(self, frame: np.ndarray) -> Tuple[torch.Tensor, Tuple[int, int]]:
        """
        Preprocess a video frame for model input.

        Args:
            frame: Input frame (BGR format from OpenCV)

        Returns:
            Preprocessed tensor and original size
        """
        # Convert BGR to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(frame_rgb)

        # Process with model processor
        inputs = self.processor(images=pil_image, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        return inputs, frame.shape[:2]

    def postprocess_predictions(self, outputs, orig_size: Tuple[int, int],
                               confidence_threshold: float = 0.5):
        """
        Post-process model outputs to get bounding boxes.

        Args:
            outputs: Model outputs exposing ``logits`` and ``pred_boxes``; only the
                first item of the batch is used
            orig_size: Original frame size (height, width)
            confidence_threshold: Minimum confidence for detections (strictly greater)

        Returns:
            Tuple ``(boxes, scores, labels)`` of NumPy arrays. ``boxes`` has one
            ``[x1, y1, x2, y2]`` row in pixels per kept detection.
        """
        # Get predictions for the single image in the batch
        logits = outputs.logits[0]
        boxes = outputs.pred_boxes[0]

        # RT-DETR's classification head is trained with a sigmoid (focal-style)
        # loss, so class scores are independent sigmoids - the same activation
        # Hugging Face's own post-processing applies. A softmax across classes
        # would inflate the score of background queries into false detections.
        probs = logits.sigmoid()
        scores, labels = probs.max(-1)

        # Filter by confidence
        keep = scores > confidence_threshold
        scores = scores[keep].detach().cpu().numpy()
        labels = labels[keep].detach().cpu().numpy()
        boxes = boxes[keep].detach().cpu().numpy()

        # Convert boxes from normalized [cx, cy, w, h] to pixel [x1, y1, x2, y2]
        h, w = orig_size
        half_w = boxes[:, 2] / 2
        half_h = boxes[:, 3] / 2
        boxes_converted = boxes.copy()
        boxes_converted[:, 0] = (boxes[:, 0] - half_w) * w  # x1
        boxes_converted[:, 1] = (boxes[:, 1] - half_h) * h  # y1
        boxes_converted[:, 2] = (boxes[:, 0] + half_w) * w  # x2
        boxes_converted[:, 3] = (boxes[:, 1] + half_h) * h  # y2

        return boxes_converted, scores, labels

    def draw_detections(self, frame: np.ndarray, boxes, scores, labels) -> np.ndarray:
        """
        Render each detection as a colored rectangle with a "class: score" caption.

        The input frame is left untouched; drawing happens on a copy.

        Args:
            frame: Input frame
            boxes: Bounding boxes, one [x1, y1, x2, y2] entry per detection
            scores: Confidence scores
            labels: Class labels (used to pick the color and the class name)

        Returns:
            Annotated copy of the frame
        """
        font = cv2.FONT_HERSHEY_SIMPLEX
        canvas = frame.copy()

        for box, score, label in zip(boxes, scores, labels):
            left, top, right, bottom = box.astype(int)
            box_color = self.colors[label]

            # Outline of the detected object
            cv2.rectangle(canvas, (left, top), (right, bottom), box_color, 2)

            caption = f"{self.coco_classes[label]}: {score:.2f}"

            # Filled strip just above the box, sized to fit the caption
            (caption_w, caption_h), _ = cv2.getTextSize(caption, font, 0.5, 2)
            strip_top_left = (left, top - caption_h - 10)
            strip_bottom_right = (left + caption_w, top)
            cv2.rectangle(canvas, strip_top_left, strip_bottom_right, box_color, -1)

            # White caption text on top of the strip
            cv2.putText(canvas, caption, (left, top - 5),
                        font, 0.5, (255, 255, 255), 2)

        return canvas

    def compute_frame_similarity(self, frame1: np.ndarray, frame2: np.ndarray) -> float:
        """
        Compare two frames through the correlation of their grayscale histograms.

        Args:
            frame1: First frame (converted from BGR to grayscale)
            frame2: Second frame (converted from BGR to grayscale)

        Returns:
            Result of cv2.compareHist with HISTCMP_CORREL on the two normalized
            256-bin histograms (higher is more similar)
        """
        def normalized_gray_histogram(image):
            # 256-bin intensity histogram of the grayscale image, normalized and flattened
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
            return cv2.normalize(hist, hist).flatten()

        first_hist = normalized_gray_histogram(frame1)
        second_hist = normalized_gray_histogram(frame2)

        return cv2.compareHist(first_hist, second_hist, cv2.HISTCMP_CORREL)

    def check_bbox_coherence(self, fd1: FrameDetection, fd2: FrameDetection,
                            target_class: Optional[int] = None,
                            iou_threshold: float = 0.3,
                            check_all_objects: bool = True) -> bool:
        """
        Check if bounding boxes are coherent between two adjacent frames.

        Args:
            fd1: First frame detection
            fd2: Second frame detection
            target_class: Class ID to prioritize (or None to check all)
            iou_threshold: Minimum IoU to consider boxes coherent
            check_all_objects: If True, check all common objects; if False, only target_class

        Returns:
            True if boxes are coherent, False otherwise
        """
        if check_all_objects or target_class is None:
            # Check if ANY objects are consistent between frames (more robust)
            # Group boxes by class
            boxes_by_class_1 = {}
            for box, label in zip(fd1.boxes, fd1.labels):
                label_int = int(label)
                if label_int not in boxes_by_class_1:
                    boxes_by_class_1[label_int] = []
                boxes_by_class_1[label_int].append(box)

            boxes_by_class_2 = {}
            for box, label in zip(fd2.boxes, fd2.labels):
                label_int = int(label)
                if label_int not in boxes_by_class_2:
                    boxes_by_class_2[label_int] = []
                boxes_by_class_2[label_int].append(box)

            # Find common object classes between frames
            common_classes = set(boxes_by_class_1.keys()) & set(boxes_by_class_2.keys())

            if not common_classes:
                # No common objects - frames are incoherent
                return False


            # Check each common class - if ANY has good IoU, frames are coherent
            classes_to_check = common_classes
            max_iou_overall = 0
            best_class = None

            for cls in classes_to_check:
                boxes1 = boxes_by_class_1[cls]
                boxes2 = boxes_by_class_2[cls]

                # Find best matching box for this class
                max_iou_class = 0
                for b1 in boxes1:
                    for b2 in boxes2:
                        iou = self.compute_bbox_iou(b1, b2)
                        if iou > max_iou_class:
                            max_iou_class = iou

                if max_iou_class > max_iou_overall:
                    max_iou_overall = max_iou_class
                    best_class = cls

                # Early exit if we found a good match
                if max_iou_overall >= iou_threshold:
                    return True

            # No class had sufficient IoU
            return False

        else:

            # Get boxes for target class in both frames
            boxes1 = [box for box, label in zip(fd1.boxes, fd1.labels) if int(label) == target_class]
            boxes2 = [box for box, label in zip(fd2.boxes, fd2.labels) if int(label) == target_class]

            # If target object missing in either frame
            if not boxes1 or not boxes2:
                return False

            # Find best matching box (highest IoU)
            max_iou = 0
            for b1 in boxes1:
                for b2 in boxes2:
                    iou = self.compute_bbox_iou(b1, b2)
                    max_iou = max(max_iou, iou)

            return max_iou >= iou_threshold

    def filter_incoherent_frames(self, frame_detections: List[FrameDetection],
                                 target_class: Optional[int] = None,
                                 iou_threshold: float = 0.3,
                                 window_size: int = 2,
                                 check_all_objects: bool = True) -> List[FrameDetection]:
        """
        Filter out frames with incoherent bounding boxes compared to adjacent frames.

        Args:
            frame_detections: Ordered list of FrameDetection objects
            target_class: Target object class to prioritize (or None)
            iou_threshold: Minimum IoU between adjacent frames to keep frame
            window_size: Number of adjacent frames to check (1=prev only, 2=prev+next)
            check_all_objects: If True, check ALL common objects between frames (recommended);
                              If False, only check target_class (strict tracking)

        Returns:
            Filtered list of FrameDetection objects
        """
        if len(frame_detections) <= 2:
            return frame_detections

        print(f"Filtering frames with incoherent bounding boxes...")
        print(f"  IoU threshold: {iou_threshold}, Window size: {window_size}")
        print(f"  Check all objects: {check_all_objects}")

        # Analyze object distribution
        if check_all_objects or target_class is None:
            all_classes = set()
            for fd in frame_detections:
                all_classes.update([int(l) for l in fd.labels])
            print(f"  Total unique object types in sequence: {len(all_classes)}")
            print(f"  Object types: {[self.coco_classes[c] for c in sorted(all_classes)]}")

        # Mark frames as coherent or incoherent
        keep_flags = [False] * len(frame_detections)

        # Check each frame against its neighbors
        for i in range(len(frame_detections)):
            coherence_count = 0
            checks = 0

            # Check against previous frame
            if i > 0:
                if self.check_bbox_coherence(frame_detections[i-1], frame_detections[i],
                                            target_class, iou_threshold, check_all_objects):
                    coherence_count += 1
                checks += 1

            # Check against next frame (if window_size >= 2)
            if window_size >= 2 and i < len(frame_detections) - 1:
                if self.check_bbox_coherence(frame_detections[i], frame_detections[i+1],
                                            target_class, iou_threshold, check_all_objects):
                    coherence_count += 1
                checks += 1

            # Keep frame if it's coherent with at least one neighbor
            if checks > 0 and coherence_count > 0:
                keep_flags[i] = True

        # Ensure we keep at least some frames
        if sum(keep_flags) == 0:
            print("  WARNING: All frames marked as incoherent! Keeping all frames.")
            return frame_detections

        # Filter frames
        filtered = [fd for fd, keep in zip(frame_detections, keep_flags) if keep]
        removed = len(frame_detections) - len(filtered)

        print(f"  Removed {removed} incoherent frames")
        print(f"  Kept {len(filtered)} coherent frames")

        # Show which frames were removed
        if removed > 0 and removed <= 20:
            removed_indices = [fd.frame_idx for fd, keep in zip(frame_detections, keep_flags) if not keep]
            print(f"  Removed frame indices: {removed_indices}")

        return filtered

    def compute_bbox_iou(self, box1: np.ndarray, box2: np.ndarray) -> float:
        """
        Intersection over Union (IoU) of two axis-aligned boxes.

        Args:
            box1: First box [x1, y1, x2, y2]
            box2: Second box [x1, y1, x2, y2]

        Returns:
            Intersection area divided by union area, or 0.0 when the union is zero
        """
        # Corners of the overlapping rectangle (if the boxes overlap at all)
        left = max(box1[0], box2[0])
        top = max(box1[1], box2[1])
        right = min(box1[2], box2[2])
        bottom = min(box1[3], box2[3])

        # A negative extent means no overlap along that axis
        overlap_w = max(0, right - left)
        overlap_h = max(0, bottom - top)
        intersection = overlap_w * overlap_h

        area_1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area_2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area_1 + area_2 - intersection

        return intersection / union if union != 0 else 0.0

    def find_most_present_object(self, frame_detections: List[FrameDetection]) -> Tuple[Optional[int], Optional[int]]:
        """
        Find the most frequently detected object class and where it is seen best.

        Every detection in every frame counts once towards its class. For the
        winning class, the detection with the highest score is located and the
        position (within ``frame_detections``) of its frame is reported.

        Args:
            frame_detections: List of FrameDetection objects

        Returns:
            Tuple of (most_common_class_id, position in ``frame_detections`` of the
            frame holding the highest-scoring detection of that class), or
            (None, None) when there are no detections at all
        """
        from collections import Counter

        class_counter = Counter()
        # class id -> (best score so far, frame_idx of the frame it came from).
        # Only the best detection per class is needed, so boxes are not stored.
        best_by_class = {}

        for fd in frame_detections:
            for label, _box, score in zip(fd.labels, fd.boxes, fd.scores):
                label = int(label)
                class_counter[label] += 1

                current_best = best_by_class.get(label)
                # Strict comparison keeps the earliest detection on ties
                if current_best is None or score > current_best[0]:
                    best_by_class[label] = (score, fd.frame_idx)

        if not class_counter:
            return None, None

        most_common_class = class_counter.most_common(1)[0][0]
        best_frame_idx = best_by_class[most_common_class][1]

        # Translate the original frame index into a position in the given list
        for idx, fd in enumerate(frame_detections):
            if fd.frame_idx == best_frame_idx:
                return most_common_class, idx

        return most_common_class, 0

    def compute_object_tracking_score(self, fd1: FrameDetection, fd2: FrameDetection,
                                     target_class: int) -> float:
        """
        Score how well fd2 follows fd1, based on the tracked object and on visual similarity.

        When the target class appears in both frames the score is
        0.7 * (best IoU between its boxes) + 0.3 * (visual similarity).
        Otherwise it is half of the visual similarity.

        Args:
            fd1: First frame detection
            fd2: Second frame detection
            target_class: Class ID to track

        Returns:
            Tracking score (higher is better)
        """
        # TODO: Multiple object tracking score (instead of main object across frames)

        def boxes_of_target(fd):
            return [box for box, label in zip(fd.boxes, fd.labels) if int(label) == target_class]

        first_boxes = boxes_of_target(fd1)
        second_boxes = boxes_of_target(fd2)
        visual_sim = self.compute_frame_similarity(fd1.frame, fd2.frame)

        # Target missing from one or both frames: rely on appearance only, discounted
        if not (first_boxes and second_boxes):
            return visual_sim * 0.5

        # Best overlap between any pair of target boxes
        best_iou = 0
        for box_a in first_boxes:
            for box_b in second_boxes:
                best_iou = max(best_iou, self.compute_bbox_iou(box_a, box_b))

        # Object overlap dominates; visual similarity contributes the remainder
        return 0.7 * best_iou + 0.3 * visual_sim

    def find_candidate_start_frames(self, frame_detections: List[FrameDetection],
                                   similarity_matrix: np.ndarray,
                                   num_candidates: int = 5) -> List[int]:
        """
        Find candidate frames to use as starting points for TSP optimization.

        Frames are ranked by their average dissimilarity (1 - similarity) to all
        frames, most dissimilar first; such frames are likely to sit at the
        "edges" of the sequence. Ties keep the original frame order.

        When there are fewer frames than requested candidates, every frame is
        returned. (The previous fallback loop could not add anything in that
        case and raised ValueError through a zero range step.)

        Args:
            frame_detections: List of FrameDetection objects (only its length is used)
            similarity_matrix: Precomputed pairwise similarity matrix
            num_candidates: Number of candidate starting frames to return;
                values <= 0 yield an empty list

        Returns:
            List of at most num_candidates positions to try as starting points
        """
        n = len(frame_detections)
        wanted = max(num_candidates, 0)

        # Average dissimilarity of each frame to all frames
        avg_distances = [np.mean(1 - similarity_matrix[i]) for i in range(n)]

        # Most different frames first; sorted() is stable, so ties keep index order
        ranked = sorted(range(n), key=lambda i: avg_distances[i], reverse=True)

        candidates = ranked[:wanted]
        print(f"    Selected diverse frames (high avg distance from others)")

        return candidates

    def reorder_frames(self, frame_detections: List[FrameDetection],
                      use_object_tracking: bool = True,
                      num_start_candidates: int = 5) -> List[FrameDetection]:
        """
        Reorder shuffled frames so that adjacent frames are as similar as possible.

        A pairwise similarity matrix is built, several starting frames are
        tried, and the route with the highest total adjacent similarity wins.

        Args:
            frame_detections: List of FrameDetection objects
            use_object_tracking: If True, track the most present object for reordering
            num_start_candidates: Number of different starting frames to try

        Returns:
            Reordered list of FrameDetection objects (the input itself when it
            holds one frame or none)
        """
        if len(frame_detections) <= 1:
            return frame_detections

        print("Reordering shuffled frames using TSP optimization with multiple starting points...")

        # Pick the object class to follow, if tracking was requested
        target_class = None
        if use_object_tracking:
            target_class, _ = self.find_most_present_object(frame_detections)
            if target_class is None:
                print("  No objects detected for tracking, using visual similarity only")
                use_object_tracking = False
            else:
                class_name = self.coco_classes[target_class]
                print(f"  Tracking most present object: '{class_name}' (class {target_class})")

        n = len(frame_detections)
        print(f"  Computing similarity matrix for {n} frames...")

        track_object = use_object_tracking and target_class is not None

        # Fill the upper triangle and mirror it; the diagonal stays at zero
        similarity_matrix = np.zeros((n, n))
        for i, first in enumerate(frame_detections):
            if i % 20 == 0:
                print(f"    Computing similarities: {i}/{n} frames...")
            for j in range(i + 1, n):
                second = frame_detections[j]
                if track_object:
                    score = self.compute_object_tracking_score(first, second, target_class)
                else:
                    score = self.compute_frame_similarity(first.frame, second.frame)
                similarity_matrix[i][j] = score
                similarity_matrix[j][i] = score

        print(f"  Similarity matrix computed!")

        print(f"  Finding candidate starting frames...")
        candidate_starts = self.find_candidate_start_frames(
            frame_detections,
            similarity_matrix,
            num_start_candidates,
        )

        candidate_total = len(candidate_starts)
        print(f"  Trying {candidate_total} different starting frames...")

        # Solve once per starting frame and remember the best route
        best_route, best_start = None, None
        best_score = -float('inf')

        for rank, start_idx in enumerate(candidate_starts, start=1):
            print(f"\n  Candidate {rank}/{candidate_total}: Starting from frame {frame_detections[start_idx].frame_idx}")

            print(f"    Using nearest neighbours + 2-opt optimization...")
            route = self.tsp_nearest_neighbor_2opt(similarity_matrix, start_idx, max_iterations=1000)

            # Sum of similarities along consecutive route entries
            total_score = sum(similarity_matrix[a][b] for a, b in zip(route, route[1:]))
            avg_score = total_score / (len(route) - 1)

            print(f"    Total score: {total_score:.2f}, Average: {avg_score:.3f}")

            if total_score > best_score:
                best_score, best_route, best_start = total_score, route, start_idx
                print(f"    ✓ New best solution!")

        print(f"\n  Best starting frame: {frame_detections[best_start].frame_idx}")
        print(f"  Best total similarity score: {best_score:.2f}")
        print(f"  Best average adjacent frame similarity: {best_score / (n - 1):.3f}")

        # Materialize the winning order
        ordered = [frame_detections[position] for position in best_route]

        print(f"Frame reordering complete!")

        return ordered

    def tsp_nearest_neighbor_2opt(self, similarity_matrix: np.ndarray, start: int,
                  max_iterations: int = 1000) -> List[int]:
        """
        2-opt local search for TSP. Good balance of quality and speed.
        """
        n = len(similarity_matrix)

        # Start with nearest neighbor solution
        route = self._nearest_neighbor_tour(similarity_matrix, start)

        def calculate_total_distance(route):
            return -sum(similarity_matrix[route[i]][route[i+1]]
                       for i in range(len(route) - 1))

        best_distance = calculate_total_distance(route)
        improved = True
        iteration = 0

        while improved and iteration < max_iterations:
            improved = False
            iteration += 1

            for i in range(1, n - 2):
                for j in range(i + 1, n - 1):
                    # Try reversing segment [i:j+1]
                    new_route = route[:i] + route[i:j+1][::-1] + route[j+1:]
                    new_distance = calculate_total_distance(new_route)

                    if new_distance < best_distance:
                        route = new_route
                        best_distance = new_distance
                        improved = True
                        break

                if improved:
                    break

            if iteration % 100 == 0:
                print(f"    2-opt iteration {iteration}, score: {-best_distance:.2f}")

        print(f"    2-opt converged after {iteration} iterations")
        return route


    def _nearest_neighbor_tour(self, similarity_matrix: np.ndarray, start: int) -> List[int]:
        """
        Greedy nearest neighbor heuristic for TSP.
        """
        n = len(similarity_matrix)
        unvisited = set(range(n))
        route = [start]
        unvisited.remove(start)

        current = start
        while unvisited:
            # Find nearest unvisited node (highest similarity)
            next_node = max(unvisited, key=lambda x: similarity_matrix[current][x])
            route.append(next_node)
            unvisited.remove(next_node)
            current = next_node

        return route

    def process_video_with_filtering(self,
                                    input_path: str,
                                    output_path: str,
                                    confidence_threshold: float = 0.5,
                                    remove_empty_frames: bool = True,
                                    reorder_shuffled: bool = False,
                                    filter_incoherent: bool = False,
                                    bbox_iou_threshold: float = 0.3,
                                    draw_bboxes: bool = True,
                                    generate_reversed: bool = False,
                                    save_removed_frames: bool = False,
                                    removed_frames_folder: str = "removed_frames",
                                    save_metadata: bool = True,
                                    show_progress: bool = True):
        """
        Process a video with object detection, filtering, and optional reordering.

        Args:
            input_path: Path to input video
            output_path: Path to save output video
            confidence_threshold: Minimum confidence for detections
            remove_empty_frames: Remove frames without detected objects
            reorder_shuffled: Reorder frames if they are shuffled
            filter_incoherent: Remove frames with incoherent bounding boxes after reordering
            bbox_iou_threshold: Minimum IoU between adjacent frames for coherence check
            draw_bboxes: Draw bounding boxes on output frames
            generate_reversed: Also generate a reversed version of the video
            save_removed_frames: Save removed frames as images to a folder
            removed_frames_folder: Folder path for saving removed frames
            save_metadata: Save detection metadata to JSON
            show_progress: Display progress information
        """
        if self.model is None:
            self.load_model()


        # Determine output path
        temp_output = None
        final_output = output_path


        output_path_for_cv = output_path
        # Ensure output has .mp4 extension
        if not output_path_for_cv.lower().endswith('.mp4'):
            output_path_for_cv = output_path_for_cv.rsplit('.', 1)[0] + '.mp4'

        # Open video
        cap = cv2.VideoCapture(input_path)

        if not cap.isOpened():
            raise ValueError(f"Could not open video: {input_path}")

        # Get video properties
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        print(f"Processing video: {input_path}")
        print(f"Total frames: {total_frames}, FPS: {fps}, Resolution: {width}x{height}")
        print(f"Settings: remove_empty={remove_empty_frames}, reorder={reorder_shuffled}")

        # Process all frames and store detections
        frame_detections = []
        frame_count = 0

        print("\nPhase 1: Detecting objects in all frames...")

        try:
            while True:
                ret, frame = cap.read()

                if not ret:
                    break

                # Run detection on every frame
                with torch.no_grad():
                    inputs, orig_size = self.preprocess_frame(frame)
                    outputs = self.model(**inputs)
                    boxes, scores, labels = self.postprocess_predictions(
                        outputs, orig_size, confidence_threshold
                    )

                # Store detection info
                has_detections = len(boxes) > 0
                max_conf = float(np.max(scores)) if has_detections else 0.0

                frame_detection = FrameDetection(
                    frame_idx=frame_count,
                    frame=frame.copy(),
                    boxes=boxes,
                    scores=scores,
                    labels=labels,
                    has_detections=has_detections,
                    max_confidence=max_conf
                )

                frame_detections.append(frame_detection)
                frame_count += 1

                if show_progress and frame_count % 30 == 0:
                    progress = (frame_count / total_frames) * 100
                    detected = sum(1 for fd in frame_detections if fd.has_detections)
                    print(f"  Progress: {progress:.1f}% ({frame_count}/{total_frames} frames, {detected} with detections)")

        finally:
            cap.release()

        # Filter frames without detections if requested
        if remove_empty_frames:
            print(f"\nPhase 2: Filtering frames without detections...")
            original_count = len(frame_detections)

            # Separate kept and removed frames
            kept_frames = []
            removed_frames = []
            removed_frames_info = []

            for fd in frame_detections:
                if fd.has_detections:
                    kept_frames.append(fd)
                else:
                    removed_frames.append(fd)

            removed_count = len(removed_frames)
            print(f"  Removed {removed_count} frames without detections")
            print(f"  Kept {len(kept_frames)} frames with detections")

            # Save removed frames if requested
            if save_removed_frames and removed_frames:
                print(f"  Saving {len(removed_frames)} removed frames to {removed_frames_folder}/...")
                if path.exists(removed_frames_folder) == False:
                    os.mkdir(removed_frames_folder)
                for fd in removed_frames:
                    filename = f"removed_empty_frame_{fd.frame_idx:06d}.jpg"
                    filepath = os.path.join(removed_frames_folder, filename)
                    cv2.imwrite(filepath, fd.frame)
                    removed_frames_info.append({
                        'original_frame_idx': fd.frame_idx,
                        'reason': 'no_detections',
                        'saved_path': filepath
                    })
                print(f"  ✓ Saved empty frames")

            frame_detections = kept_frames

        # Reorder frames if requested
        if reorder_shuffled and len(frame_detections) > 1:
            print(f"\nPhase 3: Reordering shuffled frames...")
            frame_detections = self.reorder_frames(
                frame_detections,
                use_object_tracking=True,
                num_start_candidates=5,  # Try 5 different starting frames
            )

        # Filter incoherent frames if requested
        if filter_incoherent and len(frame_detections) > 2:
            phase_num = 4 if reorder_shuffled else 3
            print(f"\nPhase {phase_num}: Filtering frames with incoherent bounding boxes...")

            # Keep track of which frames to remove
            original_count = len(frame_detections)

            filtered_frames = self.filter_incoherent_frames(
                frame_detections,
                target_class=None,  # Will auto-detect most common class
                iou_threshold=bbox_iou_threshold,
                window_size=2,  # Check both previous and next frames
                check_all_objects=True  # Check ANY common objects (not just primary)
            )

            # Find removed frames
            if save_removed_frames:
                kept_indices = {fd.frame_idx for fd in filtered_frames}
                removed_incoherent = [fd for fd in frame_detections if fd.frame_idx not in kept_indices]

                if removed_incoherent:
                    print(f"  Saving {len(removed_incoherent)} incoherent frames to {removed_frames_folder}/...")
                    for fd in removed_incoherent:
                        filename = f"removed_incoherent_frame_{fd.frame_idx:06d}.jpg"
                        filepath = os.path.join(removed_frames_folder, filename)

                        # Draw bboxes on saved frame to show why it was removed
                        annotated = self.draw_detections(fd.frame, fd.boxes, fd.scores, fd.labels)
                        cv2.imwrite(filepath, annotated)

                        removed_frames_info.append({
                            'original_frame_idx': fd.frame_idx,
                            'reason': 'incoherent_bboxes',
                            'num_detections': len(fd.boxes),
                            'detected_classes': [self.coco_classes[int(label)] for label in fd.labels],
                            'saved_path': filepath
                        })
                    print(f"  ✓ Saved incoherent frames with bounding boxes")

            frame_detections = filtered_frames

        # Write output video
        phase_num = 3
        if reorder_shuffled:
            phase_num += 1
        if filter_incoherent:
            phase_num += 1
        print(f"\nPhase {phase_num}: Writing output video...")

        # Helper function to write video
        def write_video(frame_list, output_file, video_label="output"):
            """Write frames to an MP4 file and return the path actually written.

            Uses ``fps``, ``width``, ``height``, ``draw_bboxes`` and
            ``show_progress`` from the enclosing scope.

            Args:
                frame_list: Sequence of frame records exposing ``frame``,
                    ``boxes``, ``scores`` and ``labels``.
                output_file: Requested output path. If it does not end in
                    ``.mp4`` its extension is replaced with ``.mp4``.
                video_label: Label used in the progress messages.

            Returns:
                The path of the file that was written (always ends in ``.mp4``).

            Raises:
                RuntimeError: If none of the candidate codecs yields an open writer.
            """
            out_file = output_file
            if not out_file.lower().endswith('.mp4'):
                # splitext only strips a real file extension, so a dot that
                # belongs to a directory component (e.g. "./out/video") is kept.
                out_file = os.path.splitext(out_file)[0] + '.mp4'

            # Try codecs in order of preference and keep the first that opens.
            codecs_to_try = [('avc1', 'H.264'), ('mp4v', 'MPEG-4'), ('X264', 'x264')]
            out_writer = None
            for codec, name in codecs_to_try:
                fourcc = cv2.VideoWriter_fourcc(*codec)
                candidate = cv2.VideoWriter(out_file, fourcc, fps, (width, height))
                if candidate.isOpened():
                    print(f"    Using codec: {name} ({codec})")
                    out_writer = candidate
                    break
                candidate.release()

            if out_writer is None:
                raise RuntimeError("Could not create video writer")

            total = len(frame_list)
            try:
                for written, fd in enumerate(frame_list, start=1):
                    # Draw detections if requested
                    if draw_bboxes:
                        output_frame = self.draw_detections(fd.frame, fd.boxes, fd.scores, fd.labels)
                    else:
                        output_frame = fd.frame

                    out_writer.write(output_frame)

                    if show_progress and written % 50 == 0:
                        progress = (written / total) * 100
                        print(f"    Writing {video_label}: {progress:.1f}% ({written}/{total} frames)")
            finally:
                # Release the writer even if drawing or writing a frame fails.
                out_writer.release()

            # Report the file that really exists on disk, not the requested name.
            return out_file

        # Write forward video
        print(f"  Writing forward video...")
        final_output = write_video(frame_detections, output_path, "forward video")

        # Write reversed video if requested
        reversed_output = None
        if generate_reversed:
            print(f"\n  Writing reversed video...")
            reversed_path = output_path.rsplit('.', 1)[0] + '_reversed.mp4'
            reversed_frames = list(reversed(frame_detections))
            reversed_output = write_video(reversed_frames, reversed_path, "reversed video")

        metadata = {
            'input_video': input_path,
            'output_video': final_output,
            'reversed_video': reversed_output,
            'original_frames': total_frames,
            'output_frames': len(frame_detections),
            'removed_frames': total_frames - len(frame_detections),
            'removed_frames_saved': save_removed_frames,
            'removed_frames_folder': removed_frames_folder if save_removed_frames else None,
            'fps': fps,
            'resolution': [width, height],
            'confidence_threshold': confidence_threshold,
            'draw_bboxes': draw_bboxes,
            'frames': [],
            'removed_frames_details': removed_frames_info if save_removed_frames else []
        }

        for idx, fd in enumerate(frame_detections):
            if save_metadata:
                frame_meta = {
                    'output_frame_idx': idx,
                    'original_frame_idx': fd.frame_idx,
                    'num_detections': len(fd.boxes),
                    'max_confidence': float(fd.max_confidence),
                    'detected_classes': [self.coco_classes[int(label)] for label in fd.labels]
                }
                metadata['frames'].append(frame_meta)

        # Save metadata
        if save_metadata:
            metadata_path = final_output.rsplit('.', 1)[0] + '_metadata.json'
            with open(metadata_path, 'w') as f:
                json.dump(metadata, f, indent=2)
            print(f"\nMetadata saved to: {metadata_path}")

        print(f"\n{'='*60}")
        print(f"Video processing complete!")
        print(f"Original frames: {total_frames}")
        print(f"Output frames: {len(frame_detections)}")
        print(f"Frames removed: {total_frames - len(frame_detections)}")
        if save_removed_frames and removed_frames_info:
            print(f"Removed frames saved to: {removed_frames_folder}/ ({len(removed_frames_info)} images)")
        print(f"Bounding boxes drawn: {'Yes' if draw_bboxes else 'No'}")
        print(f"Forward video saved to: {final_output}")
        if generate_reversed:
            print(f"Reversed video saved to: {reversed_output}")
        print(f"{'='*60}")




### Now let's instantiate the pipeline with the model...

In [None]:
# Build the detection pipeline. The model name should be one of:
#   rtdetrv2_r18vd, rtdetrv2_r34vd, rtdetrv2_r50vd, rtdetrv2_r101vd
pipeline = RTDETRv2VideoPipeline(model_name="rtdetrv2_r50vd")

Loading RT-DETRv2 model on cpu...


### And run it (don't forget to upload the corrupted video to the environment!)

In [None]:
# Clean and reorder the corrupted video (arguments grouped by pipeline phase)
pipeline.process_video_with_filtering(
    # Input / output locations
    input_path="/content/corrupted_video.mp4",
    output_path="corrected_video.mp4",
    # Detection
    confidence_threshold=0.5,
    # Frame filtering and reordering
    remove_empty_frames=True,    # Drop frames in which no object is detected
    reorder_shuffled=True,       # Reorder shuffled frames (tries 5 starting points)
    filter_incoherent=True,      # Drop frames with inconsistent bounding boxes
    bbox_iou_threshold=0.3,      # Min IoU between adjacent frames (0.2-0.5 typical)
    # Video output
    draw_bboxes=True,            # Draw bounding boxes on output (set False for clean video)
    generate_reversed=True,      # Also write the reversed video
    # Removed frames and reporting
    save_removed_frames=True,    # Save removed frames as images
    removed_frames_folder="/content/removed_frames",  # Folder for removed frames
    save_metadata=True,          # Save frame metadata to JSON
    show_progress=True,
)

    # Note: The pipeline has these main phases:
    # 1. Detect objects in all frames
    # 2. Remove frames without detections (optional) - saved to folder
    # 3. Reorder shuffled frames using TSP (optional)
    # 4. Filter frames with incoherent bounding boxes (optional) - saved to folder
    # 5. Write output video(s) - forward and/or reversed
    #
    # Output files:
    # - corrected_video.mp4: Forward video (with or without bboxes)
    # - corrected_video_reversed.mp4: Reversed video (if generate_reversed=True)
    # - corrected_video_metadata.json: Frame metadata (if save_metadata=True)
    # - removed_frames/: Folder with removed frame images (if save_removed_frames=True)
    #   - removed_empty_frame_XXXXXX.jpg: Frames without detections
    #   - removed_incoherent_frame_XXXXXX.jpg: Frames with incoherent bboxes (annotated)

### The resulting videos can now be downloaded from the environment, and you can check which frames were removed in the dedicated folder.