In [None]:
pip install effdet

In [1]:
import torch
import cv2
import torchvision.ops as ops
from torchvision import transforms
import numpy as np
from pathlib import Path
from effdet import create_model
from effdet.config import get_efficientdet_config
from boxmot import BotSort
from boxmot.ops import letterbox


# Select the compute device: prefer CUDA, then Apple's MPS backend, and fall
# back to the CPU so the notebook runs on hosts without an accelerator.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Load EfficientDet model
model_name = 'tf_efficientdet_d0'  # You can choose a different variant like 'tf_efficientdet_d3'
config = get_efficientdet_config(model_name)
# bench_task='predict' wraps the network so that calling it returns detections.
model = create_model(model_name, bench_task='predict', pretrained=True).to(device)
model.eval()

# Initialize the tracker on the same device as the detector
tracker = BotSort(
    reid_weights=Path('osnet_x0_25_msmt17.pt'),  # Path to ReID model
    device=device,
    half=False
)

# Input resolution the detector config asks for; frames are resized to this.
input_size = config.image_size

# HWC uint8 image -> CHW float tensor in [0, 1], then ImageNet mean/std
# normalisation. These statistics are listed in RGB channel order.
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Open the video source: 0 is the default webcam, or pass 'path/to/your.avi'
video_source = 0
vid = cv2.VideoCapture(video_source)
if not vid.isOpened():
    # Fail loudly instead of silently skipping the processing loop.
    raise RuntimeError(f"Could not open video source: {video_source!r}")

# Minimum detection score forwarded to the tracker. Constant, so it is set
# once here rather than on every iteration.
confidence_threshold = 0.5

try:
    while True:
        # Capture frame-by-frame
        ret, frame = vid.read()

        # ret is False when no frame could be read (end of video / camera error)
        if not ret:
            break

        # Resize to the detector's input size. With scaleFill=True the frame is
        # stretched, and `ratio` / (dw, dh) describe how to map boxes back.
        frame_letterbox, ratio, (dw, dh) = letterbox(frame, new_shape=input_size, auto=False, scaleFill=True)

        # OpenCV delivers BGR frames, while the normalisation statistics in
        # `preprocess` are in RGB order, so swap channels before normalising.
        frame_rgb = cv2.cvtColor(frame_letterbox, cv2.COLOR_BGR2RGB)
        frame_tensor = preprocess(frame_rgb).unsqueeze(0).to(device)

        # Perform detection (batch of one -> take the first element)
        with torch.no_grad():
            detections = model(frame_tensor)[0]

        # Assuming detections is shaped [100, 6], with
        # [x1, y1, x2, y2, confidence, class] -- TODO confirm against effdet.
        # Keep confident detections and move them to the host as a numpy array
        # (N X (x, y, x, y, conf, cls)), which is what the tracker consumes.
        dets = detections[detections[:, 4] >= confidence_threshold].cpu().numpy()

        # Rescale coordinates from letterbox space back to the original frame
        dets[:, [0, 2]] = (dets[:, [0, 2]] - dw) / ratio[0]
        dets[:, [1, 3]] = (dets[:, [1, 3]] - dh) / ratio[1]

        # Update the tracker with the original (BGR) frame
        res = tracker.update(dets, frame)  # --> M X (x, y, x, y, id, conf, cls, ind)

        # Plot tracking results on the image
        tracker.plot_results(frame, show_trajectories=True)

        # Display the frame
        cv2.imshow('BoXMOT + EfficientDet', frame)

        # Poll the keyboard for 1 ms; press 'q' to exit
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
finally:
    # Release the capture device and close windows even when the loop is left
    # through an exception or a kernel interrupt.
    vid.release()
    cv2.destroyAllWindows()


  from .autonotebook import tqdm as notebook_tqdm
[32m2024-10-25 20:09:26.729[0m | [1mINFO    [0m | [36mboxmot.utils.torch_utils[0m:[36mselect_device[0m:[36m52[0m - [1mYolo Tracking v11.0.4 🚀 Python-3.11.5 torch-2.2.2MPS[0m
[32m2024-10-25 20:09:26.749[0m | [32m[1mSUCCESS [0m | [36mboxmot.appearance.reid_model_factory[0m:[36mload_pretrained_weights[0m:[36m183[0m - [32m[1mLoaded pretrained weights from osnet_x0_25_msmt17.pt[0m


KeyboardInterrupt: 