In [None]:
!pip install --quiet transformers timm

In [None]:
from io import BytesIO

import cv2
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
from PIL import Image, ImageDraw
from transformers import AutoImageProcessor, AutoModelForObjectDetection

In [None]:
# Pick the compute device: CUDA when torch reports it as available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Checkpoint identifier shared by the image processor and the detection model
model_name = "facebook/detr-resnet-50"

# The image processor applies all the transformations the model needs on its inputs
processor = AutoImageProcessor.from_pretrained(model_name)

# Load the detector from the Hugging Face Hub, move it to the current device
# and switch it to evaluation mode in a single chained expression
model = AutoModelForObjectDetection.from_pretrained(model_name).to(device).eval()

In [None]:
# Download the image from the web
url = "https://www.purina.co.uk/sites/default/files/2023-03/Hero%20Pedigree%20Cats.jpg"

# Fail fast and loudly on network problems: without a timeout the request can
# hang indefinitely, and without the status check an HTTP error page would be
# handed to PIL and surface as a confusing "cannot identify image" error.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode from the fully downloaded body (`response.content` has any transfer
# compression already undone, unlike the raw socket stream) and normalise the
# image to 3-channel RGB so palette / RGBA / grayscale sources behave the same
# in the cells below.
image = Image.open(BytesIO(response.content)).convert("RGB")

In [None]:
# Visualize the image: a bare expression on the last line of a cell is
# rendered by the notebook as that cell's output
image

In [None]:
# No training happens here, so run everything with gradient tracking disabled
with torch.inference_mode():
    # The post-processing step needs the original image size to map the
    # predictions back; PIL reports (width, height), so the pair is reversed
    target_sizes = torch.tensor([image.size[::-1]])

    # Apply the necessary transformations to the image
    inputs = processor(images=image, return_tensors="pt")

    # Move the prepared batch to the current device and run the model on it
    batch = inputs.to(device)
    outputs = model(**batch)

    # Post-process the predictions and keep the entry for our single image
    detections = processor.post_process_object_detection(
        outputs,
        target_sizes=target_sizes,
        threshold=0.5,
    )
    results = detections[0]

In [None]:
# Print one line per detection stored in `results`
detections = zip(
    results["scores"].tolist(),
    results["labels"].tolist(),
    results["boxes"].tolist(),
)
for confidence, class_id, corners in detections:
    # Translate the numeric class id into its human-readable name
    class_name = model.config.id2label[class_id]
    rounded_box = [round(coord, 2) for coord in corners]
    print(
        f"Detected {class_name} with confidence "
        f"{round(confidence, 3)} at location {rounded_box}"
    )

In [None]:
# Visualize the predictions.
# Draw on a copy so that `image` itself stays untouched: ImageDraw paints
# directly onto the image it is given, so drawing on the original would stack
# annotations on every re-run of this cell and would feed an already-annotated
# picture to the model if the inference cell were executed again.
annotated_image = image.copy()
draw = ImageDraw.Draw(annotated_image)

for label, box in zip(results["labels"], results["boxes"]):
    # Each box is unpacked as two opposite corners: (x, y) and (x2, y2)
    x, y, x2, y2 = box.tolist()
    draw.rectangle((x, y, x2, y2), outline="red", width=1)
    # Put the class name at the first corner of the box
    draw.text((x, y), model.config.id2label[label.item()], fill="black")

annotated_image

In [None]:
# Project Task
# Build a pipeline that downloads a YouTube video from its URL and then detects all the objects in it

# You can use the pytube package (https://pytube.io/en/latest/), which you can install with:
# !pip install git+https://github.com/ytdl-org/ytdl-nightly.git
# !pip install pytube

# Then download the video and read it frame by frame with OpenCV (https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html)
# For each frame, predict the objects, draw the bounding boxes around them, and write the annotated frame to the output video (you might want to use the FMP4 codec for that)

In [None]:
!pip install --quiet git+https://github.com/ytdl-org/ytdl-nightly.git
!pip install --quiet pytube

In [None]:
# Imported here rather than in the top import cell because pytube is only
# installed by the cell directly above
from pytube import YouTube

# Address of the video to process
video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

# pytube handle for that video
yt = YouTube(video_url)