In [None]:
!pip install ultralytics
!pip install opencv-python
!pip install filterpy
!pip install scipy
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install torch torchvision
!pip install pillow

import numpy as np
import cv2
from ultralytics import YOLO
from filterpy.kalman import KalmanFilter
from scipy.optimize import linear_sum_assignment
import matplotlib.pyplot as plt
from pathlib import Path
import os
import pandas as pd
from collections import defaultdict
from IPython.display import clear_output
from google.colab import drive

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image


Collecting ultralytics
  Downloading ultralytics-8.3.234-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.234-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.18-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.234 ultralytics-thop-2.0.18
Collecting filterpy
  Downloading filterpy-1.4.5.zip (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.0/178.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: filterpy
  Building wheel for filterpy (setup.py) ... [?25l[?25hdone
  Created wheel for filterpy: filename=filterpy-1.4.5-py3-n

In [None]:
# Mount Google Drive at /content/drive so the Drive-hosted paths used later in
# this notebook (input videos, YOLO weights, output folder) can be resolved.
drive.mount('/content/drive')

class KalmanBoxTracker:
    """
    Kalman-filter state for a single tracked bounding box.

    The 7-dimensional filter state is ``[x, y, s, r, vx, vy, vs]``: box centre
    (x, y), area ``s``, aspect ratio ``r`` (width / height) and the per-step
    velocities of x, y and s.  Only ``[x, y, s, r]`` is measured.
    """
    # Class-wide counter used to hand out a distinct ``id`` to every tracker.
    count = 0

    def __init__(self, bbox):
        """
        Initialize a tracker using initial bounding box.

        Args:
            bbox: [x1, y1, x2, y2] or [x1, y1, x2, y2, score]
        """
        # Constant-velocity model: x, y and s each advance by their velocity
        # every step; the aspect ratio r has no velocity term.
        self.kf = KalmanFilter(dim_x=7, dim_z=4)

        self.kf.F = np.array([
            [1,0,0,0,1,0,0],
            [0,1,0,0,0,1,0],
            [0,0,1,0,0,0,1],
            [0,0,0,1,0,0,0],
            [0,0,0,0,1,0,0],
            [0,0,0,0,0,1,0],
            [0,0,0,0,0,0,1]
        ])

        # The measurement is the first four state components.
        self.kf.H = np.array([
            [1,0,0,0,0,0,0],
            [0,1,0,0,0,0,0],
            [0,0,1,0,0,0,0],
            [0,0,0,1,0,0,0]
        ])

        self.kf.R[2:,2:] *= 10.      # more measurement noise on area / ratio
        self.kf.P[4:,4:] *= 1000.    # velocities start out highly uncertain
        self.kf.P *= 10.
        self.kf.Q[-1,-1] *= 0.01     # area velocity gets the smallest process noise
        self.kf.Q[4:,4:] *= 0.01

        self.kf.x[:4] = self.convert_bbox_to_z(bbox)

        self.time_since_update = 0
        self.id = KalmanBoxTracker.count
        KalmanBoxTracker.count += 1
        self.history = []
        self.hits = 0
        self.hit_streak = 0
        self.age = 0

        # Store confidence score (0.0 when the box carries none)
        self.last_score = bbox[4] if len(bbox) > 4 else 0.0

    def update(self, bbox):
        """Updates the state vector with observed bbox and resets the miss counter."""
        self.time_since_update = 0
        self.history = []
        self.hits += 1
        self.hit_streak += 1
        self.kf.update(self.convert_bbox_to_z(bbox))
        self.last_score = bbox[4] if len(bbox) > 4 else 0.0

    def predict(self):
        """Advances the state vector and returns the predicted box, shape (1, 4)."""
        # If the next area (area + area velocity) would be non-positive, stop
        # the area from shrinking any further.
        if (self.kf.x[6] + self.kf.x[2]) <= 0:
            self.kf.x[6] *= 0.0
        self.kf.predict()
        self.age += 1
        # A second consecutive prediction without an update breaks the streak.
        if self.time_since_update > 0:
            self.hit_streak = 0
        self.time_since_update += 1
        self.history.append(self.convert_x_to_bbox(self.kf.x))
        return self.history[-1]

    def get_state(self):
        """Returns the current bounding box estimate, shape (1, 4)."""
        return self.convert_x_to_bbox(self.kf.x)

    @staticmethod
    def convert_bbox_to_z(bbox):
        """
        Convert [x1,y1,x2,y2] to the measurement column [x,y,s,r], shape (4, 1).

        A zero-height box has no defined aspect ratio; it is reported with
        r = 0 instead of dividing by zero.
        """
        w = bbox[2] - bbox[0]
        h = bbox[3] - bbox[1]
        x = bbox[0] + w/2.
        y = bbox[1] + h/2.
        s = w * h
        r = w / float(h) if h != 0 else 0.0
        return np.array([x, y, s, r]).reshape((4, 1))

    @staticmethod
    def convert_x_to_bbox(x, score=None):
        """
        Convert a state/measurement [x,y,s,r,...] to [x1,y1,x2,y2].

        Returns shape (1, 4), or (1, 5) with ``score`` appended when it is
        given.  If ``s * r`` is non-positive or not finite (the filter can
        drift there) the width/height cannot be recovered, so a zero-size box
        at the centre is returned rather than NaNs.
        """
        # Flatten first: the filter state is a column vector, and mixing its
        # (1,)-shaped slices with a scalar score would build a ragged array.
        state = np.asarray(x, dtype=float).reshape(-1)
        cx, cy, area, ratio = state[0], state[1], state[2], state[3]

        width_sq = area * ratio
        if np.isfinite(width_sq) and width_sq > 0:
            w = np.sqrt(width_sq)
            h = area / w
        else:
            w = h = 0.0

        corners = [cx - w/2., cy - h/2., cx + w/2., cy + h/2.]
        if score is None:
            return np.array(corners).reshape((1, 4))
        return np.array(corners + [score]).reshape((1, 5))

class AppearanceExtractor:
    """
    CNN-based appearance feature extractor for Deep SORT.

    Uses an ImageNet-pretrained ResNet18 with its final classification layer
    removed, yielding one 512-dimensional descriptor per image crop.
    """
    def __init__(self, use_gpu=True):
        """
        Args:
            use_gpu: Run on CUDA when it is available; otherwise fall back to CPU.
        """
        self.device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
        print(f"Appearance extractor using device: {self.device}")

        # Use ResNet18 as feature extractor.  Newer torchvision releases
        # deprecate ``pretrained=True`` in favour of an explicit weights enum;
        # IMAGENET1K_V1 is the checkpoint ``pretrained=True`` used to load.
        try:
            resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        except AttributeError:
            # Older torchvision without the weights enum.
            resnet = models.resnet18(pretrained=True)
        # Drop the last child (the fully connected classifier) and keep the rest.
        self.model = nn.Sequential(*list(resnet.children())[:-1])
        self.model.to(self.device)
        self.model.eval()

        self.feature_dim = 512

        # Image preprocessing: fixed-size crop, tensor conversion, then
        # per-channel normalization.
        self.transform = transforms.Compose([
            transforms.Resize((128, 64)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])
        ])

    def extract(self, frame, bbox):
        """
        Extract appearance feature from bounding box region.

        Args:
            frame: image array indexed [row, col, channel]; converted from BGR
                to RGB before being fed to the network.
            bbox: [x1, y1, x2, y2] in pixel coordinates.

        Returns:
            L2-normalized feature vector of length ``feature_dim``.  A zero
            vector is returned when the box has non-finite coordinates or is
            empty after clipping to the frame.
        """
        # int() cannot convert NaN/inf, so reject such boxes up front.
        if not np.all(np.isfinite(np.asarray(bbox, dtype=float))):
            return np.zeros(self.feature_dim)

        x1, y1, x2, y2 = [int(v) for v in bbox]

        # Clip the box to the frame.
        h, w = frame.shape[:2]
        x1 = max(0, x1)
        y1 = max(0, y1)
        x2 = min(w, x2)
        y2 = min(h, y2)

        if x2 <= x1 or y2 <= y1:
            return np.zeros(self.feature_dim)

        crop = frame[y1:y2, x1:x2]

        crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        crop_pil = Image.fromarray(crop_rgb)

        # Add a batch dimension of 1 before running the network.
        crop_tensor = self.transform(crop_pil).unsqueeze(0).to(self.device)

        with torch.no_grad():
            features = self.model(crop_tensor)
            features = features.squeeze()

        features = features.cpu().numpy()
        norm = np.linalg.norm(features)
        if norm > 0:
            features = features / norm

        return features


def cosine_distance(a, b):
    """
    Return one minus the dot product of ``a`` and ``b``.

    No normalization is performed here, so the value is the cosine distance
    only when both vectors already have unit length.
    """
    similarity = np.dot(a, b)
    return 1.0 - similarity

class DeepSORTTrack:
    """One Deep SORT track: a Kalman box tracker plus a gallery of appearance features."""

    def __init__(self, detection, track_id, feature):
        """
        Start a track from its first detection.

        Args:
            detection: [x1, y1, x2, y2, score]
            track_id: unique ID assigned by the caller
            feature: appearance feature vector of that detection
        """
        self.track_id = track_id
        self.kalman = KalmanBoxTracker(detection)

        # Gallery of the most recent appearance features (at most 100).
        self.max_features = 100
        self.features = [feature]

        self.age = 0
        self.hits = 1
        self.time_since_update = 0

    def predict(self):
        """Advance the motion model one step and return the predicted box."""
        predicted_box = self.kalman.predict()[0]
        self.age += 1
        # The track was already unmatched before this step: drop its hit count.
        already_missed = self.time_since_update > 0
        if already_missed:
            self.hits = 0
        self.time_since_update += 1
        return predicted_box

    def update(self, detection, feature):
        """Feed a matched detection and its feature into the track."""
        self.kalman.update(detection)

        gallery = self.features
        gallery.append(feature)
        # Keep the gallery bounded by discarding the oldest entry.
        if len(gallery) > self.max_features:
            del gallery[0]

        self.hits += 1
        self.time_since_update = 0

    def get_state(self):
        """Return the current box estimate from the Kalman tracker."""
        return self.kalman.get_state()[0]

    def get_score(self):
        """Return the confidence score last stored by the Kalman tracker."""
        return self.kalman.last_score

    def min_cost_feature(self, feature):
        """Return the smallest cosine distance from ``feature`` to the gallery (1.0 if empty)."""
        if not self.features:
            return 1.0
        return min(cosine_distance(feature, stored) for stored in self.features)

class DeepSORT:
    """
    Deep SORT: Simple Online Realtime Tracking with Deep Association Metric.
    IMPROVED PARAMETERS FOR BETTER ID PERSISTENCE.

    Each frame, existing tracks are advanced with their Kalman filters, an
    appearance feature is extracted for every detection, and tracks are
    matched to detections with the Hungarian algorithm on a blended
    motion/appearance cost.
    """
    # Finite stand-in written over NaN/inf cost entries.  It is far above any
    # sensible ``max_match_cost`` so such pairs are never accepted, while the
    # assignment solver (which rejects NaN entries) still gets a valid matrix.
    _INVALID_COST = 1e6

    def __init__(self, max_age=90, min_hits=1, iou_threshold=0.2, lambda_param=0.0,
                 max_match_cost=0.8, min_new_track_score=0.3):
        """
        Args:
            max_age: Maximum frames to keep track without detection (increased to 90)
            min_hits: Minimum hits to confirm track (decreased to 1)
            iou_threshold: IOU threshold for matching (decreased to 0.2).
                NOTE: stored on the instance but not consulted by the matching
                code in this class.
            lambda_param: Weight of the motion (1 - IOU) cost; the appearance
                cost gets ``1 - lambda_param``.  0 = appearance only.
            max_match_cost: An assigned track/detection pair is accepted only
                when its cost is strictly below this value.
            min_new_track_score: Unmatched detections need at least this
                confidence to start a new track.
        """
        self.max_age = max_age
        self.min_hits = min_hits
        self.iou_threshold = iou_threshold
        self.lambda_param = lambda_param
        self.max_match_cost = max_match_cost
        self.min_new_track_score = min_new_track_score

        self.tracks = []
        self.next_id = 1
        self.frame_count = 0

        print("Initializing Deep SORT appearance extractor...")
        self.appearance_extractor = AppearanceExtractor()
        print("Deep SORT ready!")

    def update(self, detections, frame):
        """
        Update tracker with new detections.

        Args:
            detections: numpy array (N, 5) - [x1, y1, x2, y2, conf]
            frame: current frame (numpy array)

        Returns:
            tracks: numpy array (M, 6) - [x1, y1, x2, y2, track_id, score]
        """
        self.frame_count += 1

        # Step 1: Predict new locations
        predicted_boxes = [track.predict() for track in self.tracks]
        predicted_boxes = np.array(predicted_boxes) if predicted_boxes else np.empty((0, 4))

        # Step 2: Extract appearance features for detections
        detection_features = [
            self.appearance_extractor.extract(frame, det[:4]) for det in detections
        ]

        # Steps 3 + 4: cost matrix and Hungarian assignment (only possible
        # when there is at least one track and one detection).
        matches = []
        unmatched_detections = set(range(len(detections)))
        if len(self.tracks) > 0 and len(detections) > 0:
            cost_matrix = self._compute_cost_matrix(
                predicted_boxes, detections, detection_features
            )
            row_ind, col_ind = linear_sum_assignment(cost_matrix)

            # The solver always pairs as many rows/columns as it can; keep
            # only the pairs that are cheap enough.
            for r, c in zip(row_ind, col_ind):
                if cost_matrix[r, c] < self.max_match_cost:
                    matches.append((r, c))
                    unmatched_detections.discard(c)

        # Step 5: Update matched tracks
        for track_idx, det_idx in matches:
            self.tracks[track_idx].update(
                detections[det_idx],
                detection_features[det_idx]
            )

        # Step 6: Create new tracks from sufficiently confident leftovers.
        # Sorted so new IDs are handed out in detection order.
        for det_idx in sorted(unmatched_detections):
            if detections[det_idx][4] >= self.min_new_track_score:
                new_track = DeepSORTTrack(
                    detections[det_idx],
                    self.next_id,
                    detection_features[det_idx]
                )
                self.tracks.append(new_track)
                self.next_id += 1

        # Step 7: Remove dead tracks
        self.tracks = [t for t in self.tracks if t.time_since_update < self.max_age]

        # Step 8: Return confirmed tracks (every track is reported during the
        # first ``min_hits`` frames).
        output_tracks = []
        for track in self.tracks:
            if track.hits >= self.min_hits or self.frame_count <= self.min_hits:
                state = track.get_state()
                score = track.get_score()
                output_tracks.append(
                    np.array([state[0], state[1], state[2], state[3], track.track_id, score])
                )

        if len(output_tracks) > 0:
            return np.array(output_tracks)
        return np.empty((0, 6))

    def _compute_cost_matrix(self, predicted_boxes, detections, detection_features):
        """
        Compute combined motion + appearance cost matrix.

        Entry (i, j) is ``lambda * (1 - IOU) + (1 - lambda) * appearance`` for
        track i and detection j.  The returned matrix is always finite.
        """
        n_tracks = len(self.tracks)
        n_detections = len(detections)
        cost_matrix = np.zeros((n_tracks, n_detections))

        # With lambda == 0 the motion term has no weight.  Skip it entirely:
        # a NaN predicted box would otherwise turn ``0 * nan`` into NaN.
        use_motion = self.lambda_param != 0
        appearance_weight = 1 - self.lambda_param

        for i, (track, pred_box) in enumerate(zip(self.tracks, predicted_boxes)):
            for j, (det, det_feature) in enumerate(zip(detections, detection_features)):
                # Appearance cost
                cost = appearance_weight * track.min_cost_feature(det_feature)

                if use_motion:
                    # Motion cost (1 - IOU)
                    motion_cost = 1.0 - self._compute_iou(pred_box, det[:4])
                    cost += self.lambda_param * motion_cost

                cost_matrix[i, j] = cost

        # Replace anything non-finite so the assignment solver never sees it.
        invalid = ~np.isfinite(cost_matrix)
        if invalid.any():
            cost_matrix[invalid] = self._INVALID_COST

        return cost_matrix

    @staticmethod
    def _compute_iou(box1, box2):
        """
        Compute IOU between two [x1, y1, x2, y2] boxes.

        Returns 0 when the union is not a positive number (both boxes empty,
        or a box with NaN / inverted coordinates).
        """
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])

        intersection = max(0, x2 - x1) * max(0, y2 - y1)
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union = area1 + area2 - intersection

        # ``not (union > 0)`` is also true for NaN.
        if not union > 0:
            return 0.0
        return intersection / union

class RobotDetector:
    """YOLO-based detector for FRC robots."""
    def __init__(self, model_name='best.pt', conf_threshold=0.05, target_class=0):
        """
        Args:
            model_name: Path to YOLO model
            conf_threshold: Minimum confidence (lowered to 0.05)
            target_class: Class id to keep; boxes of any other class are
                dropped.  Pass ``None`` to keep every class.  Adjust based on
                your model.
        """
        print(f"Loading YOLO model: {model_name}")
        self.model = YOLO(model_name)
        self.conf_threshold = conf_threshold
        self.target_class = target_class
        print(" Model loaded successfully!")

    def detect(self, frame):
        """
        Detect robots in frame.

        Runs the model with ``conf_threshold`` and keeps the boxes whose class
        matches ``target_class``.

        Returns: numpy array (N, 5) - [x1, y1, x2, y2, conf]; shape (0, 5)
        when nothing is kept.
        """
        results = self.model(frame, conf=self.conf_threshold, verbose=False)

        detections = []
        for result in results:
            for box in result.boxes:
                # Each box exposes its values as tensors; move them to the
                # CPU and convert to NumPy before use.
                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                conf = box.conf[0].cpu().numpy()
                cls = box.cls[0].cpu().numpy()

                if self.target_class is None or cls == self.target_class:
                    detections.append([x1, y1, x2, y2, conf])

        if len(detections) > 0:
            return np.array(detections)
        return np.empty((0, 5))

def draw_tracks_on_frame(frame, tracks):
    """
    Draw bounding boxes and IDs on a copy of ``frame``.

    Args:
        frame: image array; it is copied, never modified.
        tracks: iterable of [x1, y1, x2, y2, track_id, score] rows.

    Returns:
        The annotated copy.  Rows with non-finite coordinates or ID are
        skipped because they cannot be converted to pixel positions.
    """
    frame_copy = frame.copy()

    # Fixed-seed palette so a given track ID always gets the same colour.
    # A private RandomState yields the same values as seeding the global
    # generator with 42, without resetting NumPy's global random state.
    colors = np.random.RandomState(42).randint(0, 255, size=(1000, 3), dtype=np.uint8)

    for track in tracks:
        x1, y1, x2, y2, track_id, score = track
        if not np.all(np.isfinite([x1, y1, x2, y2, track_id])):
            continue
        x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
        track_id = int(track_id)

        # OpenCV wants plain Python ints for the colour tuple.
        color = tuple(int(c) for c in colors[track_id % len(colors)])

        cv2.rectangle(frame_copy, (x1, y1), (x2, y2), color, 2)

        # Filled label background directly above the box, then white text.
        label = f"Robot {track_id} ({score:.2f})"
        label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
        cv2.rectangle(frame_copy, (x1, y1 - label_size[1] - 10),
                     (x1 + label_size[0], y1), color, -1)
        cv2.putText(frame_copy, label, (x1, y1 - 5),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

    return frame_copy

def process_video_to_csv(video_path, output_csv_path, model_name='best.pt', display_every=100, save_frames=True, frames_output_folder=None):
    """
    Process video with Deep SORT and save to CSV.

    CSV format: video_name, frame, timestamp, track_id, x1, y1, x2, y2, score

    Args:
        video_path: Video file to process.
        output_csv_path: Destination CSV file.
        model_name: Path to the YOLO weights passed to ``RobotDetector``.
        display_every: Print a progress line every this many frames.
        save_frames: Also write annotated sample frames as JPEGs.
        frames_output_folder: Parent folder for the sample frames; defaults
            to ``sample_frames`` next to the CSV.  Frames go into a subfolder
            named after the video.

    Returns:
        pandas DataFrame with one row per reported track per frame.  It has
        the CSV columns even when nothing was tracked.
    """
    csv_columns = ['video_name', 'frame', 'timestamp', 'track_id',
                   'x1', 'y1', 'x2', 'y2', 'score']

    # Initialize detector and tracker
    detector = RobotDetector(model_name=model_name, conf_threshold=0.05)
    tracker = DeepSORT(max_age=90, min_hits=1, iou_threshold=0.2, lambda_param=0.0)

    # Reset tracker count
    KalmanBoxTracker.count = 0

    # Create folder for sample frames
    video_frames_folder = None
    if save_frames:
        if frames_output_folder is None:
            frames_output_folder = Path(output_csv_path).parent / 'sample_frames'
        frames_output_folder = Path(frames_output_folder)
        frames_output_folder.mkdir(exist_ok=True, parents=True)

        video_frames_folder = frames_output_folder / Path(video_path).stem
        video_frames_folder.mkdir(exist_ok=True, parents=True)

    video_name = Path(video_path).name
    frame_count = 0
    tracking_data = []
    frames_saved = 0

    # Open video
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            print(f" Warning: could not open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Some containers report no (or a zero) frame rate / frame count;
        # never divide by those values.
        fps_is_valid = bool(np.isfinite(fps)) and fps > 0

        print(f"\nProcessing: {video_name}")
        print(f"Resolution: {width}x{height} @ {fps:.2f} FPS")
        print(f"Total frames: {total_frames}")

        # Roughly ten samples per video, but never more often than every 30 frames.
        save_interval = max(total_frames // 10, 30)

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            timestamp = frame_count / fps if fps_is_valid else float('nan')

            # Detect robots
            detections = detector.detect(frame)

            # Update tracker (Deep SORT needs frame for appearance)
            tracks = tracker.update(detections, frame)

            # Save sample frames
            if save_frames and (frame_count % save_interval == 0 or frame_count == 1) and len(tracks) > 0:
                frame_with_tracks = draw_tracks_on_frame(frame, tracks)
                frame_filename = video_frames_folder / f"frame_{frame_count:05d}.jpg"
                cv2.imwrite(str(frame_filename), frame_with_tracks)
                frames_saved += 1

            # Store tracking data
            for track in tracks:
                x1, y1, x2, y2, track_id, score = track

                tracking_data.append({
                    'video_name': video_name,
                    'frame': frame_count,
                    'timestamp': timestamp,
                    'track_id': int(track_id),
                    'x1': x1,
                    'y1': y1,
                    'x2': x2,
                    'y2': y2,
                    'score': score
                })

            # Display progress
            if frame_count % display_every == 0:
                if total_frames > 0:
                    print(f"Progress: {frame_count}/{total_frames} ({100*frame_count/total_frames:.1f}%) - Current tracks: {len(tracks)}")
                else:
                    print(f"Progress: {frame_count} frames - Current tracks: {len(tracks)}")
    finally:
        # Release the capture even if detection or tracking raised.
        cap.release()

    # Create DataFrame (explicit columns keep the header for empty results)
    df = pd.DataFrame(tracking_data, columns=csv_columns)

    # Save to CSV
    df.to_csv(output_csv_path, index=False)

    print(f" Saved tracking data to: {output_csv_path}")
    print(f"  Total detections: {len(df)}")
    if len(df) > 0:
        print(f"  Unique track IDs: {df['track_id'].nunique()}")
        print(f"  Max track ID seen: {df['track_id'].max()}")
    if save_frames:
        print(f"  Sample frames saved: {frames_saved} in {video_frames_folder}")

    return df

def process_folder_to_csv(data_folder, output_folder, model_name='best.pt', save_sample_frames=True):
    """
    Process all MP4 videos in folder with Deep SORT.

    Each video gets ``<stem>_tracking.csv`` in ``output_folder``; the results
    of all successfully processed videos are also concatenated into
    ``all_videos_combined.csv``.  A video that raises is reported and skipped
    so the remaining videos are still processed.

    Args:
        data_folder: Folder searched (non-recursively) for ``.mp4`` files,
            matched case-insensitively.
        output_folder: Folder for CSVs and sample frames; created if missing.
        model_name: Path to the YOLO weights.
        save_sample_frames: Forwarded to ``process_video_to_csv`` as ``save_frames``.

    Returns:
        dict mapping video file name to its tracking DataFrame (empty dict
        when no videos were found).
    """
    import traceback

    data_folder = Path(data_folder)
    output_folder = Path(output_folder)
    output_folder.mkdir(exist_ok=True, parents=True)

    # Match on the lower-cased suffix so each file is listed exactly once
    # (two separate '*.mp4' / '*.MP4' globs return duplicates on
    # case-insensitive filesystems), and sort for a reproducible order.
    if data_folder.is_dir():
        video_files = sorted(
            entry for entry in data_folder.iterdir()
            if entry.is_file() and entry.suffix.lower() == '.mp4'
        )
    else:
        video_files = []

    print(f"Found {len(video_files)} MP4 videos in {data_folder}")

    if len(video_files) == 0:
        print(" No MP4 files found.")
        return {}

    all_results = {}
    all_dataframes = []

    for i, video_path in enumerate(video_files):
        print(f"\n{'='*60}")
        print(f"Video {i+1}/{len(video_files)}")
        print(f"{'='*60}")

        output_csv = output_folder / f"{video_path.stem}_tracking.csv"

        try:
            df = process_video_to_csv(
                video_path,
                output_csv,
                model_name=model_name,
                save_frames=save_sample_frames,
                frames_output_folder=output_folder / 'sample_frames'
            )
            all_results[video_path.name] = df
            all_dataframes.append(df)
        except Exception as e:
            # Best effort: report the failure and move on to the next video.
            print(f" Error processing {video_path.name}: {e}")
            traceback.print_exc()
            continue

    # Create combined CSV
    if all_dataframes:
        combined_df = pd.concat(all_dataframes, ignore_index=True)
        combined_csv_path = output_folder / "all_videos_combined.csv"
        combined_df.to_csv(combined_csv_path, index=False)
        print(f"\n{'='*60}")
        print(f" Saved combined tracking data to: {combined_csv_path}")
        print(f"  Total rows: {len(combined_df)}")
        print(f"  Videos processed: {len(all_dataframes)}")

    # Summary
    print("\n" + "="*60)
    print("BATCH PROCESSING SUMMARY - Deep SORT")
    print("="*60)
    for video_name, df in all_results.items():
        if len(df) > 0:
            print(f"\n{video_name}:")
            print(f"  - Total detections: {len(df)}")
            print(f"  - Unique track IDs: {df['track_id'].nunique()}")
            print(f"  - Max track ID: {df['track_id'].max()}")
            print(f"  - Duration: {df['timestamp'].max():.2f}s")
            print(f"  - Avg detections/frame: {len(df)/df['frame'].max():.2f}")

    return all_results

# ============================================================================
# MAIN EXECUTION
# ============================================================================

# Input videos, output folder and YOLO weights all live under one Drive folder.
COLAB_ROOT = '/content/drive/MyDrive/Colab Notebooks'
DATA_FOLDER = COLAB_ROOT + '/Test'
OUTPUT_FOLDER = COLAB_ROOT + '/DeepSORTTesting'
MODEL_PATH = COLAB_ROOT + '/best.pt'

# Settings applied to every video by process_video_to_csv:
#   - max_age: 90 frames - keeps tracks alive longer during occlusion
#   - min_hits: 1 - confirms tracks immediately
#   - iou_threshold: 0.2
#   - lambda: 0.0 - appearance-based matching only
#   - YOLO conf: 0.05 - detects more robots
#   - Appearance model: ResNet18 pre-trained features

# Process all videos
results = process_folder_to_csv(
    DATA_FOLDER,
    OUTPUT_FOLDER,
    model_name=MODEL_PATH,
    save_sample_frames=True,
)

Mounted at /content/drive
Found 1 MP4 videos in /content/drive/MyDrive/Colab Notebooks/Test

Video 1/1
Loading YOLO model: /content/drive/MyDrive/Colab Notebooks/best.pt
 Model loaded successfully!
Initializing Deep SORT appearance extractor...
Appearance extractor using device: cpu




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 66.2MB/s]


Deep SORT ready!

Processing: test.mp4
Resolution: 1920x1080 @ 29.99 FPS
Total frames: 4766
Progress: 100/4766 (2.1%) - Current tracks: 11
Progress: 200/4766 (4.2%) - Current tracks: 15
Progress: 300/4766 (6.3%) - Current tracks: 13
Progress: 400/4766 (8.4%) - Current tracks: 12
Progress: 500/4766 (10.5%) - Current tracks: 9
Progress: 600/4766 (12.6%) - Current tracks: 9
Progress: 700/4766 (14.7%) - Current tracks: 9
Progress: 800/4766 (16.8%) - Current tracks: 8
Progress: 900/4766 (18.9%) - Current tracks: 11
Progress: 1000/4766 (21.0%) - Current tracks: 16
Progress: 1100/4766 (23.1%) - Current tracks: 13
Progress: 1200/4766 (25.2%) - Current tracks: 11
Progress: 1300/4766 (27.3%) - Current tracks: 18
Progress: 1400/4766 (29.4%) - Current tracks: 10
Progress: 1500/4766 (31.5%) - Current tracks: 12
Progress: 1600/4766 (33.6%) - Current tracks: 18
Progress: 1700/4766 (35.7%) - Current tracks: 18
Progress: 1800/4766 (37.8%) - Current tracks: 9
Progress: 1900/4766 (39.9%) - Current tracks