In [None]:
pip install effdet

In [None]:
import torch
import cv2
import torchvision.ops as ops
from torchvision import transforms
import numpy as np
from pathlib import Path
from effdet import create_model
from effdet.config import get_efficientdet_config
from boxmot import BotSort


# Function for letterbox resizing (padding to maintain aspect ratio)
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):
    """Resize ``img`` to fit ``new_shape`` and pad the remainder with ``color``.

    Args:
        img: Image array whose first two dimensions are (height, width).
        new_shape: Target size, either a single int (square) or a
            (height, width) pair.
        color: Fill value used for the constant border.
        auto: If true, the padding is reduced to its remainder modulo 32.
        scaleFill: Only consulted when ``auto`` is false. If true, the image
            is stretched to exactly ``new_shape`` and no padding is added.
        scaleup: If false, the scale factor is capped at 1.0 so the image is
            never enlarged.

    Returns:
        A tuple ``(img, ratio, (dw, dh))`` where ``ratio`` holds the
        (width, height) scale factors that were applied and ``(dw, dh)`` is
        the padding per side (half of the total, possibly fractional).
    """
    src_h, src_w = img.shape[0], img.shape[1]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Uniform scale factor (new / old) that makes the image fit the target.
    scale = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = (scale, scale)  # width, height ratios
    resized_wh = (int(round(src_w * scale)), int(round(src_h * scale)))
    pad_w = dst_w - resized_wh[0]
    pad_h = dst_h - resized_wh[1]

    if auto:
        # Minimum rectangle: keep only the padding remainder modulo 32.
        pad_w = np.mod(pad_w, 32)
        pad_h = np.mod(pad_h, 32)
    elif scaleFill:
        # Stretch: fill the whole target, so each axis gets its own ratio.
        pad_w, pad_h = 0.0, 0.0
        resized_wh = (dst_w, dst_h)
        ratio = (dst_w / src_w, dst_h / src_h)  # width, height ratios

    # The padding is shared between the two opposite sides.
    pad_w = pad_w / 2
    pad_h = pad_h / 2

    if (src_w, src_h) != resized_wh:
        img = cv2.resize(img, resized_wh, interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 offsets split an odd total padding into floor/ceil halves.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return img, ratio, (pad_w, pad_h)

# Load EfficientDet model
# Pick the best available accelerator instead of hard-coding 'mps', so the
# script also runs on machines without Apple MPS support.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model_name = 'tf_efficientdet_d0'  # You can choose a different variant like 'tf_efficientdet_d3'
config = get_efficientdet_config(model_name)
# bench_task='predict' wraps the network so that calling it returns detections.
model = create_model(model_name, bench_task='predict', pretrained=True).to(device)
model.eval()

# Initialize the tracker
tracker = BotSort(
    reid_weights=Path('osnet_x0_25_msmt17.pt'),  # Path to ReID model
    device=device,  # Run the ReID model on the same device as the detector
    half=False
)

# Network input size; passed to letterbox() as new_shape, which reads it as
# (height, width).
input_size = config.image_size

# ToTensor turns an HWC uint8 image into a CHW float tensor in [0, 1];
# Normalize then applies the ImageNet per-channel statistics, which are given
# in RGB order, so the image handed to `preprocess` must be RGB.
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Open the video source
vid = cv2.VideoCapture(0)  # or 'path/to/your.avi'

# Minimum detection score for a box to be handed to the tracker.
# Loop-invariant, so it is defined once here.
confidence_threshold = 0.5

try:
    while True:
        # Capture frame-by-frame
        ret, frame = vid.read()

        # ret is False when no frame could be read (end of video or no camera)
        if not ret:
            break

        # Resize to the network input size. With auto=False and scaleFill=True
        # letterbox() stretches the frame to exactly `input_size`: there is no
        # padding (dw == dh == 0) and `ratio` holds per-axis scale factors.
        frame_letterbox, ratio, (dw, dh) = letterbox(frame, new_shape=input_size, auto=False, scaleFill=True)

        # OpenCV delivers frames in BGR order, while the normalization
        # statistics and the pretrained weights assume RGB.
        frame_rgb = cv2.cvtColor(frame_letterbox, cv2.COLOR_BGR2RGB)

        # Preprocess frame for EfficientDet (to tensor, normalize, add batch dim)
        frame_tensor = preprocess(frame_rgb).unsqueeze(0).to(device)

        # Perform detection
        with torch.no_grad():
            detections = model(frame_tensor)[0]

        # Assuming detections is shaped [100, 6], with [x1, y1, x2, y2, confidence, class]

        # Filter detections based on confidence threshold
        # (boolean-mask indexing yields a copy, so the in-place edits below
        # do not touch `detections`)
        mask = detections[:, 4] >= confidence_threshold
        filtered_dets = detections[mask]

        # Rescale coordinates from network-input space back to the original frame size
        filtered_dets[:, 0] = (filtered_dets[:, 0] - dw) / ratio[0]
        filtered_dets[:, 1] = (filtered_dets[:, 1] - dh) / ratio[1]
        filtered_dets[:, 2] = (filtered_dets[:, 2] - dw) / ratio[0]
        filtered_dets[:, 3] = (filtered_dets[:, 3] - dh) / ratio[1]

        # Convert to numpy array (N X (x, y, x, y, conf, cls)). The array stays
        # floating point; the class column is truncated to whole numbers, which
        # matches what the previous mixed float/int concatenation produced.
        dets = filtered_dets.cpu().numpy()
        dets[:, 5] = np.trunc(dets[:, 5])

        # Update the tracker
        res = tracker.update(dets, frame)  # --> M X (x, y, x, y, id, conf, cls, ind)

        # Plot tracking results on the image
        tracker.plot_results(frame, show_trajectories=True)

        # Display the frame
        cv2.imshow('BoXMOT + EfficientDet', frame)

        # Poll the keyboard for 1 ms; press 'q' to exit
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
finally:
    # Release resources even if detection or tracking raised, so the camera
    # is not left locked and no window is left open.
    vid.release()
    cv2.destroyAllWindows()
