In [1]:
pip install ultralytics opencv-python-headless


Collecting opencv-python-headless
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading opencv_python_headless-4.10.0.84-cp37-abi3-win_amd64.whl (38.8 MB)
   ---------------------------------------- 0.0/38.8 MB ? eta -:--:--
   - -------------------------------------- 1.0/38.8 MB 8.4 MB/s eta 0:00:05
   --- ------------------------------------ 3.7/38.8 MB 11.5 MB/s eta 0:00:04
   -------- ------------------------------- 7.9/38.8 MB 15.2 MB/s eta 0:00:03
   ----------- ---------------------------- 11.3/38.8 MB 16.0 MB/s eta 0:00:02
   ---------------- ----------------------- 16.0/38.8 MB 17.3 MB/s eta 0:00:02
   --------------------- ------------------ 20.7/38.8 MB 18.4 MB/s eta 0:00:01
   ------------------------- -------------- 24.6/38.8 MB 19.3 MB/s eta 0:00:01
   ---------------------------- ----------- 28.0/38.8 MB 18.7 MB/s eta 0:00:01
   ------------------------------- -------- 30.4/38.8 MB 17.7 MB/s eta 0:00:01
   -------------------

In [10]:
import os
import pandas as pd

# Path to annotation folder
annotation_root = r"C:\Users\pc\Downloads\VisDrone2019-VID-train\Annotation\Train-ann"

# Initialize a set to store unique class IDs
class_ids = set()

# Process all annotation files
for ann_file in os.listdir(annotation_root):
    if ann_file.endswith(".txt"):
        with open(os.path.join(annotation_root, ann_file), "r") as file:
            for line in file:
                parts = line.strip().split(",")
                if len(parts) > 7:
                    class_id = int(parts[7])  # Extract object_category
                    class_ids.add(class_id)

# Print all unique class IDs and their count
print(f"Unique class IDs: {sorted(class_ids)}")
print(f"Total number of classes: {len(class_ids)}")


Unique class IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Total number of classes: 12


In [3]:
pip install  torch torchvision





In [5]:
import cv2
import os
import glob
import torch
from ultralytics import YOLO  # Correct import for YOLOv8

# Paths
image_root = r"C:\Users\pc\Downloads\VisDrone2019-VID-train\images\train"
output_root = r"C:\Users\pc\Downloads\VisDrone2019-VID-train\Output"

# YOLO setup
yolo_weights = r"C:\Users\pc\Downloads\best.pt"  # Path to your trained YOLOv8 model
yolo_confidence_threshold = 0.5  # Confidence threshold for detections

# Load YOLOv8 model
model = YOLO(yolo_weights)

# Ensure output folder exists
os.makedirs(output_root, exist_ok=True)

def annotate_frame(frame, detections):
    """
    Draw bounding boxes and labels on a frame.
    """
    for det in detections:
        x1, y1, x2, y2 = det[:4]  # Always unpack the first four values
        conf = det[4] if len(det) > 4 else 0.0  # Confidence (default to 0.0 if missing)
        cls = det[5] if len(det) > 5 else -1    # Class ID (default to -1 if missing)

        # Format label with class name and confidence
        label = f"{model.names[int(cls)]} {conf:.2f}" if cls >= 0 else "Unknown"
        
        # Draw rectangle
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
        # Put label
        cv2.putText(frame, label, (int(x1), int(y1) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
    return frame

def process_video_folder(image_folder, output_path):
    """
    Process images in a folder and create a video with annotations.
    """
    # Get sorted list of image files
    image_files = sorted(glob.glob(os.path.join(image_folder, "*.jpg")))
    if not image_files:
        print(f"No images found in {image_folder}")
        return

    # Get video properties
    sample_frame = cv2.imread(image_files[0])
    height, width, _ = sample_frame.shape
    fps = 10  # Default FPS for output video

    # Initialize video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Process images
    for img_path in image_files:
        frame = cv2.imread(img_path)

        # Perform YOLO inference
        results = model.predict(frame, conf=yolo_confidence_threshold, verbose=False)

        # Combine bounding boxes, confidence scores, and class IDs
        detections = torch.cat((
            results[0].boxes.xyxy,  # Bounding box coordinates
            results[0].boxes.conf.unsqueeze(1),  # Confidence scores
            results[0].boxes.cls.unsqueeze(1)    # Class IDs
        ), dim=1).cpu().numpy()

        # Annotate frame
        annotated_frame = annotate_frame(frame, detections)

        # Write frame to video
        video_writer.write(annotated_frame)

    video_writer.release()
    print(f"Video saved to {output_path}")

# Main processing loop
for subfolder in os.listdir(image_root):
    image_folder = os.path.join(image_root, subfolder)
    output_path = os.path.join(output_root, subfolder + ".mp4")

    if os.path.isdir(image_folder):
        print(f"Processing folder: {image_folder}")
        process_video_folder(image_folder, output_path)
    else:
        print(f"Skipping: {image_folder}")


Processing folder: C:\Users\pc\Downloads\VisDrone2019-VID-train\images\train\uav0000013_00000_v
Video saved to C:\Users\pc\Downloads\VisDrone2019-VID-train\Output\uav0000013_00000_v.mp4
Processing folder: C:\Users\pc\Downloads\VisDrone2019-VID-train\images\train\uav0000013_01073_v
Video saved to C:\Users\pc\Downloads\VisDrone2019-VID-train\Output\uav0000013_01073_v.mp4
Processing folder: C:\Users\pc\Downloads\VisDrone2019-VID-train\images\train\uav0000013_01392_v
Video saved to C:\Users\pc\Downloads\VisDrone2019-VID-train\Output\uav0000013_01392_v.mp4
Processing folder: C:\Users\pc\Downloads\VisDrone2019-VID-train\images\train\uav0000020_00406_v
Video saved to C:\Users\pc\Downloads\VisDrone2019-VID-train\Output\uav0000020_00406_v.mp4
Processing folder: C:\Users\pc\Downloads\VisDrone2019-VID-train\images\train\uav0000071_03240_v
Video saved to C:\Users\pc\Downloads\VisDrone2019-VID-train\Output\uav0000071_03240_v.mp4
Processing folder: C:\Users\pc\Downloads\VisDrone2019-VID-train\images

KeyboardInterrupt: 

In [10]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


Looking in indexes: https://download.pytorch.org/whl/cu118


In [6]:
from ultralytics import YOLO

# Load a pre-trained YOLOv8 model
model = YOLO('yolov8s.pt')  # You can also use yolov8n.pt, yolov8m.pt, yolov8l.pt

# Train the model using CPU (since CUDA is not available)
model.train(data=r'C:\Users\pc\Downloads\VisDrone2019-VID-train\config.yaml', epochs=3, imgsz=640, batch=16, device='cuda')


Ultralytics 8.3.35  Python-3.12.7 torch-2.5.1+cpu 


ValueError: Invalid CUDA 'device=0' requested. Use 'device=cpu' or pass valid CUDA device(s) if available, i.e. 'device=0' or 'device=0,1,2,3' for Multi-GPU.

torch.cuda.is_available(): False
torch.cuda.device_count(): 0
os.environ['CUDA_VISIBLE_DEVICES']: None
See https://pytorch.org/get-started/locally/ for up-to-date torch install instructions if no CUDA devices are seen by torch.


In [20]:
import cv2
import os
import glob

# Paths
image_root = r"C:\Users\pc\Downloads\VisDrone2019-VID-train\images\Train-sequ"
annotation_root = r"C:\Users\pc\Downloads\VisDrone2019-VID-train\Annotation\Train-ann"
output_root = r"C:\Users\pc\Downloads\VisDrone2019-VID-train\Output"

# YOLO setup
yolo_weights = "yolov8s.pt"  # Update with your YOLO weights path
yolo_confidence_threshold = 0.5

# Load YOLO model
import torch
model = torch.hub.load('ultralytics/yolov8', 'custom', path=yolo_weights)

# Ensure output folder exists
os.makedirs(output_root, exist_ok=True)

def annotate_frame(frame, detections):
    """
    Draw bounding boxes and labels on a frame.
    """
    for det in detections:
        x1, y1, x2, y2, conf, cls = det
        label = f"{model.names[int(cls)]} {conf:.2f}"
        # Draw rectangle
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
        # Put label
        cv2.putText(frame, label, (int(x1), int(y1)-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)
    return frame

def process_video_folder(image_folder, annotation_folder, output_path):
    """
    Process images in a folder and create a video with annotations.
    """
    # Get sorted list of image files
    image_files = sorted(glob.glob(os.path.join(image_folder, "*.jpg")))
    if not image_files:
        return

    # Get video properties
    sample_frame = cv2.imread(image_files[0])
    height, width, _ = sample_frame.shape
    fps = 10  # Default FPS for output video
    
    # Initialize video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Process images
    for img_path in image_files:
        frame = cv2.imread(img_path)

        # Perform YOLO inference
        results = model(frame)
        detections = results.xyxy[0].cpu().numpy()  # Extract results as [x1, y1, x2, y2, conf, class]

        # Annotate frame
        annotated_frame = annotate_frame(frame, detections)

        # Write frame to video
        video_writer.write(annotated_frame)

    video_writer.release()

# Main processing loop
for subfolder in os.listdir(image_root):
    image_folder = os.path.join(image_root, subfolder)
    annotation_file = os.path.join(annotation_root, subfolder + ".txt")
    output_path = os.path.join(output_root, subfolder + ".mp4")

    if os.path.isdir(image_folder) and os.path.isfile(annotation_file):
        print(f"Processing folder: {image_folder}")
        process_video_folder(image_folder, annotation_file, output_path)
    else:
        print(f"Skipping: {image_folder}")


Using cache found in C:\Users\pc/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-11-24 Python-3.12.7 torch-2.5.1+cpu CPU



YOLOv8s summary (fused): 168 layers, 11,156,544 parameters, 0 gradients, 28.6 GFLOPs


Adding AutoShape... 


Exception: 'Detect' object has no attribute 'grid'. Cache may be out of date, try `force_reload=True` or see https://docs.ultralytics.com/yolov5/tutorials/pytorch_hub_model_loading for help.

In [4]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.

In [14]:
import torch
import time
import cv2
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
from sklearn.metrics import precision_score, recall_score
import numpy as np

# Paths
MODEL_PATH = 'best.pt'
TEST_VIDEO = r'C:\Users\pc\Downloads\input.mp4'

# Initialize YOLO model
model = YOLO(MODEL_PATH)

# Initialize DeepSORT tracker
tracker = DeepSort(max_age=30, n_init=3, nms_max_overlap=1.0)

# Metrics initialization
precision_list = []
recall_list = []
fps_list = []
total_id_switches = 0
total_missed_objects = 0

# Ground truth annotations for precision and recall computation
# Format: [(bbox, class_id)] - Use your actual ground truth values here
ground_truth = [
    # Example: (bbox, class_id)
    # [(x_min, y_min, x_max, y_max), class_id]
    # These would be replaced by the actual annotations of your dataset
    # Example for frame 1: [([50, 50, 150, 150], 0), ([100, 100, 200, 200], 1)]
]

# Calculate IoU (Intersection over Union) between two bounding boxes
def calculate_iou(box1, box2):
    # Unpack the bounding box coordinates
    x1_min, y1_min, x1_max, y1_max = box1
    x2_min, y2_min, x2_max, y2_max = box2

    # Compute the intersection
    inter_x_min = max(x1_min, x2_min)
    inter_y_min = max(y1_min, y2_min)
    inter_x_max = min(x1_max, x2_max)
    inter_y_max = min(y1_max, y2_max)

    inter_width = max(0, inter_x_max - inter_x_min)
    inter_height = max(0, inter_y_max - inter_y_min)
    intersection_area = inter_width * inter_height

    # Compute the areas of both bounding boxes
    box1_area = (x1_max - x1_min) * (y1_max - y1_min)
    box2_area = (x2_max - x2_min) * (y2_max - y2_min)

    # Compute the union
    union_area = box1_area + box2_area - intersection_area

    # Compute IoU
    iou = intersection_area / union_area if union_area != 0 else 0
    return iou

# Start processing the video
cap = cv2.VideoCapture(TEST_VIDEO)
frame_count = 0
previous_track_ids = set()  # To keep track of previous frame's track IDs
start_time = time.time()

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    start = time.time()

    # YOLO detection
    results = model(frame)
    detections = results[0].boxes

    # Extract bounding boxes and class IDs
    boxes = []
    scores = []
    class_ids = []
    raw_detections = []
    for box in detections:
        # Ensure the bounding box is in the correct format [x_min, y_min, x_max, y_max]
        bbox = [box.xyxy[0][0].item(), box.xyxy[0][1].item(), box.xyxy[0][2].item(), box.xyxy[0][3].item()]
        score = box.conf.item()  # Convert tensor to float
        class_id = int(box.cls)  # Convert tensor to int

        # Add detection to the list
        raw_detections.append((bbox, score, class_id))

    # Prepare embeddings (for DeepSORT, if needed)
    embeddings = None  # Replace with actual embeddings if required

    # Pass raw_detections, embeddings, and frame to the tracker
    tracks = tracker.update_tracks(raw_detections, embeddings, frame)

    # Evaluate tracking metrics
    tracked_objects = {track.track_id: track.to_tlbr() for track in tracks if track.is_confirmed()}

    # Calculate ID switches and missed objects
    current_track_ids = set([track.track_id for track in tracks if track.is_confirmed()])
    id_switches = len(current_track_ids - previous_track_ids)  # Compare with previous track IDs
    total_id_switches += id_switches

    # Missed objects are those that were in the previous frame but are missing now
    total_missed_objects += len(previous_track_ids - current_track_ids)

    # Update the previous track IDs for the next frame
    previous_track_ids = current_track_ids

    # Add detection precision/recall computation
    y_true = []
    y_pred = []

    for gt_bbox, gt_class in ground_truth:
        best_match = None
        best_iou = 0  # Intersection over Union (IoU)

        for pred_bbox, _, pred_class in raw_detections:
            iou = calculate_iou(gt_bbox, pred_bbox)
            if iou > best_iou:
                best_iou = iou
                best_match = pred_class  # Get the predicted class with the highest IoU

        if best_match is not None and best_iou > 0.5:  # Only consider if IoU > 0.5
            y_true.append(gt_class)
            y_pred.append(best_match)

    # Compute precision and recall for this frame
    if y_true and y_pred:
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    else:
        precision = 0
        recall = 0

    precision_list.append(precision)
    recall_list.append(recall)

    # Compute FPS
    fps = 1 / (time.time() - start)
    fps_list.append(fps)

cap.release()
end_time = time.time()

# Compute Final Metrics
average_precision = sum(precision_list) / len(precision_list) if precision_list else 0
average_recall = sum(recall_list) / len(recall_list) if recall_list else 0
average_fps = sum(fps_list) / len(fps_list) if fps_list else 0

# Compute MOTA (Multiple Object Tracking Accuracy) - Avoid division by zero
total_objects = len(tracked_objects)  # Total number of objects tracked
mota = (total_objects - total_missed_objects - total_id_switches) / total_objects if total_objects > 0 else 0

# Display Metrics
print("Metrics Summary:")
print(f"Precision: {average_precision:.2f}")
print(f"Recall: {average_recall:.2f}")
print(f"Average FPS: {average_fps:.2f}")
print(f"ID Switches: {total_id_switches}")
print(f"Missed Objects: {total_missed_objects}")
print(f"MOTA: {mota:.2f}")



0: 384x640 12 cars, 5 vans, 1 bus, 243.8ms
Speed: 7.0ms preprocess, 243.8ms inference, 4.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 cars, 4 vans, 1 bus, 221.8ms
Speed: 5.0ms preprocess, 221.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 cars, 3 vans, 1 bus, 254.4ms
Speed: 3.0ms preprocess, 254.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 cars, 2 vans, 1 bus, 214.1ms
Speed: 4.0ms preprocess, 214.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 13 cars, 3 vans, 1 bus, 283.3ms
Speed: 4.0ms preprocess, 283.3ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 cars, 2 vans, 2 buss, 208.7ms
Speed: 4.0ms preprocess, 208.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 cars, 2 vans, 1 bus, 200.8ms
Speed: 4.0ms preprocess, 200.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 3