# Exploring Video Datasets

In [None]:
%pip install -U opencv-python timm transformers
%restart_python

This notebook unpacks a single video into frames, runs the DETR (DEtection TRansformer) object-detection model on the first frame, and returns the bounding boxes.
At the moment, everything runs serially.

We can investigate ways to run this in parallel.
We can also leverage an LLM to annotate and describe the image if we wish.

In [None]:
# setup configs
import os
import cv2

# The three path components below are joined into /Volumes/<catalog>/<schema>/<raw_data>.
catalog = "brian_ml_dev"        # first path component
schema = "image_processing"     # second path component
raw_data = "raw_data"           # third path component: folder holding the video files

In [None]:
# Folder that holds the raw video files.
video_folder = f'/Volumes/{catalog}/{schema}/{raw_data}'
# os.listdir returns entries in arbitrary order; sort them so that files[0]
# refers to the same file on every run.
files = sorted(os.listdir(video_folder))
files

In [None]:
# Test read: decode every frame of the first file in the folder into memory.
# NOTE: all frames are kept in a Python list, so memory use grows with video length.
if not files:
    raise FileNotFoundError(f"No files found in {video_folder}")

first_file = os.path.join(video_folder, files[0])
capture = cv2.VideoCapture(first_file)

frames = []
try:
    # An unreadable path or unsupported container would otherwise produce an
    # empty frame list and only fail later, in a different cell.
    if not capture.isOpened():
        raise OSError(f"Could not open video file: {first_file}")

    while True:
        success, frame = capture.read()
        if not success:
            # read() reports failure once no further frame can be decoded.
            break
        frames.append(frame)
finally:
    # Always free the decoder handle, even if reading raised.
    capture.release()

# Number of frames decoded (kept for compatibility with the original counter).
frame_index = len(frames)

In [None]:
from PIL import Image
# Use the first decoded frame as the sample image (OpenCV frames are in BGR order).
bgr_image_array = frames[0]
# Swap the channel order to RGB before handing the array to PIL.
rgb_array = cv2.cvtColor(bgr_image_array, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(rgb_array)
# Render the frame inline in the notebook output.
display(pil_image)

In [None]:
from transformers import DetrImageProcessor, DetrForObjectDetection

# Load model and image processor.
# DetrFeatureExtractor is a deprecated alias of DetrImageProcessor that is
# scheduled for removal; because this notebook installs the latest transformers,
# use DetrImageProcessor directly. The variable keeps the name
# `feature_extractor` so the cells that follow keep working unchanged.
DETR_CHECKPOINT = "facebook/detr-resnet-50"
feature_extractor = DetrImageProcessor.from_pretrained(DETR_CHECKPOINT)
model = DetrForObjectDetection.from_pretrained(DETR_CHECKPOINT)

In [None]:
import torch

# Turn the PIL frame into PyTorch tensors for the model.
inputs = feature_extractor(images=pil_image, return_tensors="pt")

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Post-processing receives the frame's (height, width) as a one-row tensor,
# together with 0.3 as the `threshold` argument.
image_height, image_width = pil_image.height, pil_image.width
target_sizes = torch.tensor([(image_height, image_width)])
results = feature_extractor.post_process_object_detection(
    outputs,
    target_sizes=target_sizes,
    threshold=0.3,
)

In [None]:
# Print one line per detection: class name, score (2 decimals), rounded box coordinates.
for result in results:
    detections = zip(result["scores"], result["labels"], result["boxes"])
    for score_tensor, label_tensor, box_tensor in detections:
        # Unwrap the tensors into plain Python values before formatting.
        score = score_tensor.item()
        label = label_tensor.item()
        box = [round(coordinate, 2) for coordinate in box_tensor.tolist()]
        print(f"{model.config.id2label[label]}: {score:.2f} {box}")

In [None]:
import numpy as np

# Draw on a copy so the original frame stays untouched.
# NOTE: despite its name, `rgb_image` is built from the BGR frame and is never
# channel-swapped in this cell.
rgb_image = np.array(bgr_image_array)

box_color = (0, 255, 0)  # green; the same tuple is used for the box and its label

for result in results:
    for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
        # Box coordinates are truncated to ints before being passed to OpenCV.
        x1, y1, x2, y2 = (int(value) for value in box.tolist())
        top_left, bottom_right = (x1, y1), (x2, y2)
        cv2.rectangle(rgb_image, top_left, bottom_right, box_color, 2)

        text = f"{model.config.id2label[label_id.item()]} {score:.2f}"
        # Place the label 5 px above the box, clamped so it never goes above the image.
        label_origin = (x1, max(0, y1 - 5))
        cv2.putText(rgb_image, text, label_origin,
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 1)


In [None]:
def cv2_imshow(a):
  """A replacement for cv2.imshow() for use in Jupyter notebooks.

  Values are clipped to [0, 255] and cast to uint8, colour images are converted
  from OpenCV's BGR(A) channel order to RGB(A), and the result is rendered
  inline with display().

  Args:
    a : np.ndarray. shape (N, M) or (N, M, 1) is an NxM grayscale image. shape
      (N, M, 3) is an NxM BGR color image. shape (N, M, 4) is an NxM BGRA color
      image.
  """
  a = a.clip(0, 255).astype('uint8')
  if a.ndim == 3:
    channels = a.shape[2]
    if channels == 1:
      # Single-channel grayscale: drop the trailing axis. The BGR2RGB
      # conversion rejects 1-channel input, so this shape used to raise
      # even though the docstring lists it as supported.
      a = a[:, :, 0]
    elif channels == 4:
      # cv2 stores colors as BGRA; convert to RGBA
      a = cv2.cvtColor(a, cv2.COLOR_BGRA2RGBA)
    else:
      # cv2 stores colors as BGR; convert to RGB
      a = cv2.cvtColor(a, cv2.COLOR_BGR2RGB)
  display(Image.fromarray(a))


In [None]:
# Render the annotated frame inline; cv2_imshow converts BGR to RGB before display.
cv2_imshow(rgb_image)
# The cv2.imshow companions below stay disabled: the image is rendered with
# display(), not in an OpenCV window.
#cv2.waitKey(0)
#cv2.destroyAllWindows()