In [None]:
!pip install timm

In [None]:
!pip install -q transformers torch torchvision pillow opencv-python

In [None]:
import torch
import cv2
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Source clip to analyse and destination for the annotated copy.
input_video_path = "/content/fall.mp4"
output_video_path = "/content/output_annotated.mp4"

# The processor prepares frames for the model and post-processes its outputs;
# both are loaded from the same pretrained checkpoint.
processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

# Load the detector, move it to the selected device and switch it to
# evaluation mode (both calls return the module itself, so they chain).
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device).eval()


In [None]:

# Open the source clip and fail fast if OpenCV cannot read it; otherwise the
# properties queried below would be meaningless and the writer would be
# configured with them anyway.
cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    raise OSError(f"Could not open input video: {input_video_path}")

# Keep the frame rate as a float: truncating a fractional rate such as 29.97
# to 29 makes the annotated video play slower than the source.
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # Covers 0, negative and NaN readings from containers that do not report
    # a usable rate; fall back to a conventional default.
    fps = 30.0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# The writer is given the same frame size as the input because the frames are
# annotated in place and written back unchanged in size.
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
if not out.isOpened():
    # Do not leave the capture handle dangling when the writer cannot start.
    cap.release()
    raise OSError(f"Could not open output video for writing: {output_video_path}")


In [None]:

# Run the detector on every frame, draw the detections onto the frame and
# append it to the output video.
#
# The loop is wrapped in try/finally so that the capture and the writer are
# released even if inference raises or the cell is interrupted; without that
# the output file is never finalised.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read failure): nothing more to process.
            break

        # OpenCV delivers BGR frames; convert to RGB before handing the image
        # to the processor.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb_frame)

        inputs = processor(images=pil_image, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # PIL reports size as (width, height); reverse it to (height, width)
        # and create the tensor directly on the target device.
        target_sizes = torch.tensor([pil_image.size[::-1]], device=device)
        results = processor.post_process_object_detection(
            outputs,
            target_sizes=target_sizes,
            threshold=0.7
        )[0]

        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
            x_min, y_min, x_max, y_max = map(int, box.tolist())
            class_name = model.config.id2label[label.item()]
            # (0, 0, 255) is red in OpenCV's BGR channel order.
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)
            cv2.putText(
                frame,
                f"{class_name}: {score.item():.2f}",
                # Place the caption just above the box, clamped to the top edge.
                (x_min, max(y_min - 10, 0)),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6,
                (0, 0, 255),
                2
            )

        out.write(frame)
finally:
    cap.release()
    out.release()
