# Using YOLO to create better bounding boxes

In [3]:
from ultralytics import YOLO
import os
import json
import tqdm
import torch

from video_dataset import load_rgb_frames_from_video
# Dataset locations, relative to this notebook.
raw_path = "../data/WLASL2000/"
instance_path = "./preprocessed_labels/asl100/train_instances.json"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cuda'

In [4]:
# Smoke test: load the pretrained YOLO11n checkpoint and run it on a sample image.
model = YOLO("yolo11n.pt")

# Calling the model on an image source returns a list of results.
results = model("https://ultralytics.com/images/bus.jpg")

# Dump the Boxes object (detected bounding boxes) of every result.
for detection in results:
  print(detection.boxes)


Found https://ultralytics.com/images/bus.jpg locally at bus.jpg
image 1/1 /home/luke/ExtraStorage/WLASL/lukes-code/bus.jpg: 640x480 4 persons, 1 bus, 27.9ms
Speed: 1.5ms preprocess, 27.9ms inference, 106.6ms postprocess per image at shape (1, 3, 640, 480)
ultralytics.engine.results.Boxes object with attributes:

cls: tensor([5., 0., 0., 0., 0.], device='cuda:0')
conf: tensor([0.9402, 0.8882, 0.8783, 0.8558, 0.6219], device='cuda:0')
data: tensor([[3.8327e+00, 2.2936e+02, 7.9619e+02, 7.2841e+02, 9.4015e-01, 5.0000e+00],
        [6.7102e+02, 3.9483e+02, 8.0981e+02, 8.7871e+02, 8.8822e-01, 0.0000e+00],
        [4.7405e+01, 3.9957e+02, 2.3930e+02, 9.0420e+02, 8.7825e-01, 0.0000e+00],
        [2.2306e+02, 4.0869e+02, 3.4447e+02, 8.6044e+02, 8.5577e-01, 0.0000e+00],
        [2.1726e-02, 5.5607e+02, 6.8886e+01, 8.7236e+02, 6.2192e-01, 0.0000e+00]], device='cuda:0')
id: None
is_track: False
orig_shape: (1080, 810)
shape: torch.Size([5, 6])
xywh: tensor([[400.0136, 478.8883, 792.3620, 499.0480

In [5]:
# Inspect the detections for the single test image.
result = results[0]  # Results come back as a list; take the first entry
print(result.boxes.xyxy)  # Boxes as (x1, y1, x2, y2) corner coordinates

tensor([[3.8327e+00, 2.2936e+02, 7.9619e+02, 7.2841e+02],
        [6.7102e+02, 3.9483e+02, 8.0981e+02, 8.7871e+02],
        [4.7405e+01, 3.9957e+02, 2.3930e+02, 9.0420e+02],
        [2.2306e+02, 4.0869e+02, 3.4447e+02, 8.6044e+02],
        [2.1726e-02, 5.5607e+02, 6.8886e+01, 8.7236e+02]], device='cuda:0')


In [6]:
# Keep only the detections whose class id is 0 (the 'person' class).
is_person = result.boxes.cls == 0
person_bboxes = result.boxes.xyxy[is_person]
print(person_bboxes)  # xyxy boxes of the detected people

tensor([[6.7102e+02, 3.9483e+02, 8.0981e+02, 8.7871e+02],
        [4.7405e+01, 3.9957e+02, 2.3930e+02, 9.0420e+02],
        [2.2306e+02, 4.0869e+02, 3.4447e+02, 8.6044e+02],
        [2.1726e-02, 5.5607e+02, 6.8886e+01, 8.7236e+02]], device='cuda:0')


In [11]:
def get_largest_bbox(bboxes):
  """Return the smallest box that encloses every box in ``bboxes``.

  Despite the name, this does not select the single biggest box. It takes the
  minimum over all top-left corners and the maximum over all bottom-right
  corners, i.e. the union (enclosing) box of the inputs.

  Args:
    bboxes: Boxes in ``[x1, y1, x2, y2]`` form. May be a list/tuple of
      4-element sequences, any array-like exposing ``.tolist()`` (for
      example a NumPy array or torch tensor with one box per row), or ``None``.

  Returns:
    ``[x_min, y_min, x_max, y_max]`` as a list, or ``None`` when there are
    no boxes.
  """
  if bboxes is None:
    return None
  # Arrays and tensors raise on a plain truthiness test ("ambiguous truth
  # value"), so normalise them to nested Python lists before checking length.
  if hasattr(bboxes, "tolist"):
    bboxes = bboxes.tolist()
  if len(bboxes) == 0:
    return None
  # Transpose rows into per-coordinate columns; unpacking into exactly four
  # names keeps the requirement that boxes have four coordinates.
  x1s, y1s, x2s, y2s = zip(*bboxes)
  return [min(x1s), min(y1s), max(x2s), max(y2s)]

from video_dataset import load_rgb_frames_from_video
# Run person detection over one sample clip and pool the boxes from all frames.
sample = '../data/WLASL2000/00295.mp4'
frames = load_rgb_frames_from_video(sample, 0, 100, all=True).float()  # float32 for the model
results = model(frames, device=device)

bboxes = []
for frame_idx, result in enumerate(results):
  print(f"Frame {frame_idx}: {result.boxes.xyxy}")
  # Class id 0 is 'person'; frames without a person contribute nothing.
  person_bboxes = result.boxes.xyxy[result.boxes.cls == 0]
  bboxes += person_bboxes.tolist()

# Collapse the pooled boxes into one box covering all of them.
largest_bbox = get_largest_bbox(bboxes)
print("Largest bounding box:", largest_bbox)


0: 256x256 1 person, 0.8ms
1: 256x256 1 person, 0.8ms
2: 256x256 1 person, 0.8ms
3: 256x256 1 person, 0.8ms
4: 256x256 1 person, 0.8ms
5: 256x256 1 person, 0.8ms
6: 256x256 1 person, 0.8ms
7: 256x256 1 person, 0.8ms
8: 256x256 1 person, 0.8ms
9: 256x256 1 person, 0.8ms
10: 256x256 1 person, 0.8ms
11: 256x256 1 person, 0.8ms
12: 256x256 1 person, 0.8ms
13: 256x256 1 person, 0.8ms
14: 256x256 1 person, 0.8ms
15: 256x256 1 person, 0.8ms
16: 256x256 1 person, 0.8ms
17: 256x256 1 person, 0.8ms
18: 256x256 1 person, 0.8ms
19: 256x256 1 person, 0.8ms
20: 256x256 1 person, 0.8ms
Speed: 0.0ms preprocess, 0.8ms inference, 0.5ms postprocess per image at shape (1, 3, 256, 256)
Frame 0: tensor([[ 60.0529,  44.7802, 212.9297, 220.2935]], device='cuda:0')
Frame 1: tensor([[ 60.2409,  44.6726, 212.7652, 220.2927]], device='cuda:0')
Frame 2: tensor([[ 60.2409,  44.6726, 212.7652, 220.2927]], device='cuda:0')
Frame 3: tensor([[ 60.2730,  44.6530, 212.5301, 220.3191]], device='cuda:0')
Frame 4: tensor([

In [12]:
# Visualize the enclosing bounding box on the video and write it to an output file.
import cv2
output_path = './output/output.mp4'
# cv2.VideoWriter does not raise when the target directory is missing; it just
# writes nothing. Create the directory up front so the file really is saved.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Reorder to channels-last and cast to uint8 under a NEW name, so `frames`
# stays a tensor and this cell can be re-run without reloading the video.
# Assumes `frames` is (N, C, H, W) with values in 0-255 -- TODO confirm.
# NOTE(review): the frames presumably are RGB while OpenCV writes BGR, so
# colours in the output may be swapped -- confirm against the loader.
frames_uint8 = frames.permute(0, 2, 3, 1).cpu().numpy().astype('uint8')
frame_height, frame_width = frames_uint8.shape[1:3]

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, 30.0, (frame_width, frame_height))
if not out.isOpened():
  raise RuntimeError(f"Could not open video writer for {output_path}")

# The box is identical for every frame, so convert it to integer corners once.
box_corners = None
if largest_bbox is not None:
  x_min, y_min, x_max, y_max = map(int, largest_bbox)
  box_corners = ((x_min, y_min), (x_max, y_max))

try:
  for frame in frames_uint8:
    if box_corners is not None:
      cv2.rectangle(frame, box_corners[0], box_corners[1], (255, 0, 0), 2)
    out.write(frame)
finally:
  # Always release the writer so the container is finalised even on error.
  out.release()
print(f"Output video saved to {output_path}")

Output video saved to ./output/output.mp4


### It seems that particular video is actually really bad

Going to try a different video, and later exclude videos that are too short.

In [13]:
# Repeat the person detection on a second sample clip.
sample = '../data/WLASL2000/00333.mp4'
frames = load_rgb_frames_from_video(sample, 0, 100, all=True).float()  # float32 for the model
results = model(frames, device=device)

bboxes = []
for frame_idx, result in enumerate(results):
  print(f"Frame {frame_idx}: {result.boxes.xyxy}")
  # Class id 0 is 'person'; frames without a person contribute nothing.
  person_bboxes = result.boxes.xyxy[result.boxes.cls == 0]
  bboxes += person_bboxes.tolist()



0: 256x256 1 person, 16.6ms
1: 256x256 1 person, 16.6ms
2: 256x256 1 person, 16.6ms
3: 256x256 1 person, 16.6ms
4: 256x256 1 person, 16.6ms
5: 256x256 1 person, 16.6ms
6: 256x256 1 person, 16.6ms
7: 256x256 1 person, 16.6ms
8: 256x256 1 person, 16.6ms
9: 256x256 1 person, 16.6ms
10: 256x256 1 person, 16.6ms
11: 256x256 1 person, 16.6ms
12: 256x256 1 person, 16.6ms
13: 256x256 1 person, 16.6ms
14: 256x256 1 person, 16.6ms
15: 256x256 1 person, 16.6ms
16: 256x256 1 person, 16.6ms
17: 256x256 1 person, 16.6ms
18: 256x256 1 person, 16.6ms
19: 256x256 1 person, 16.6ms
20: 256x256 1 person, 16.6ms
21: 256x256 1 person, 16.6ms
22: 256x256 1 person, 16.6ms
23: 256x256 1 person, 16.6ms
24: 256x256 1 person, 16.6ms
25: 256x256 1 person, 16.6ms
26: 256x256 1 person, 16.6ms
27: 256x256 1 person, 16.6ms
28: 256x256 1 person, 16.6ms
29: 256x256 1 person, 16.6ms
30: 256x256 1 person, 16.6ms
31: 256x256 1 person, 16.6ms
32: 256x256 1 person, 16.6ms
33: 256x256 1 person, 16.6ms
34: 256x256 1 person, 1

In [14]:
# Merge all pooled person boxes into one enclosing box: get_largest_bbox takes
# the min of the top-left corners and the max of the bottom-right corners.
largest_bbox = get_largest_bbox(bboxes)
print("Largest bounding box:", largest_bbox)

Largest bounding box: [47.234710693359375, 40.5211181640625, 210.39431762695312, 223.93048095703125]


In [15]:
# Draw the enclosing box on every frame and write the clip to `output_path`.
# cv2.VideoWriter does not raise when the target directory is missing; it just
# writes nothing. Create the directory up front so the file really is saved.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Reorder to channels-last and cast to uint8 under a NEW name, so `frames`
# stays a tensor and this cell can be re-run without reloading the video.
# Assumes `frames` is (N, C, H, W) with values in 0-255 -- TODO confirm.
frames_uint8 = frames.permute(0, 2, 3, 1).cpu().numpy().astype('uint8')
frame_height, frame_width = frames_uint8.shape[1:3]

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, 30.0, (frame_width, frame_height))
if not out.isOpened():
  raise RuntimeError(f"Could not open video writer for {output_path}")

# The box is identical for every frame, so convert it to integer corners once.
box_corners = None
if largest_bbox is not None:
  x_min, y_min, x_max, y_max = map(int, largest_bbox)
  box_corners = ((x_min, y_min), (x_max, y_max))

try:
  for frame in frames_uint8:
    if box_corners is not None:
      cv2.rectangle(frame, box_corners[0], box_corners[1], (255, 0, 0), 2)
    out.write(frame)
finally:
  # Always release the writer so the container is finalised even on error.
  out.release()
print(f"Output video saved to {output_path}")

Output video saved to ./output/output.mp4


### That seems to have worked very well, testing on another

In [16]:
# Third sample: detect people, compute the enclosing box, and write the annotated clip.
sample = '../data/WLASL2000/10892.mp4'
frames = load_rgb_frames_from_video(sample, 0, 100, all=True)
frames = frames.float()  # Convert to float32
results = model(frames, device=device)  # Run inference on the frames
bboxes = []
for i, result in enumerate(results):
  print(f"Frame {i}: {result.boxes.xyxy}")
  # Class id 0 is 'person'.
  person_bboxes = result.boxes.xyxy[result.boxes.cls == 0]
  if len(person_bboxes) > 0:
    bboxes.extend(person_bboxes.tolist())

# Collapse the pooled boxes into one box covering all of them.
largest_bbox = get_largest_bbox(bboxes)
print("Largest bounding box:", largest_bbox)

# cv2.VideoWriter does not raise when the target directory is missing; it just
# writes nothing. Create the directory up front so the file really is saved.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Reorder to channels-last and cast to uint8 under a NEW name, so `frames`
# stays a tensor and the writing step can be repeated.
# Assumes `frames` is (N, C, H, W) with values in 0-255 -- TODO confirm.
frames_uint8 = frames.permute(0, 2, 3, 1).cpu().numpy().astype('uint8')
frame_height, frame_width = frames_uint8.shape[1:3]

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, 30.0, (frame_width, frame_height))
if not out.isOpened():
  raise RuntimeError(f"Could not open video writer for {output_path}")

# The box is identical for every frame, so convert it to integer corners once.
box_corners = None
if largest_bbox is not None:
  x_min, y_min, x_max, y_max = map(int, largest_bbox)
  box_corners = ((x_min, y_min), (x_max, y_max))

try:
  for frame in frames_uint8:
    if box_corners is not None:
      cv2.rectangle(frame, box_corners[0], box_corners[1], (255, 0, 0), 2)
    out.write(frame)
finally:
  # Always release the writer so the container is finalised even on error.
  out.release()
print(f"Output video saved to {output_path}")


0: 256x256 1 person, 4.0ms
1: 256x256 1 person, 4.0ms
2: 256x256 1 person, 4.0ms
3: 256x256 1 person, 4.0ms
4: 256x256 1 person, 4.0ms
5: 256x256 1 person, 4.0ms
6: 256x256 1 person, 4.0ms
7: 256x256 1 person, 4.0ms
8: 256x256 1 person, 4.0ms
9: 256x256 1 person, 4.0ms
10: 256x256 1 person, 4.0ms
11: 256x256 1 person, 4.0ms
12: 256x256 1 person, 4.0ms
13: 256x256 1 person, 4.0ms
14: 256x256 1 person, 4.0ms
15: 256x256 1 person, 4.0ms
16: 256x256 1 person, 4.0ms
17: 256x256 1 person, 4.0ms
18: 256x256 1 person, 4.0ms
19: 256x256 1 person, 4.0ms
20: 256x256 1 person, 4.0ms
21: 256x256 1 person, 4.0ms
22: 256x256 1 person, 4.0ms
23: 256x256 1 person, 4.0ms
24: 256x256 1 person, 4.0ms
25: 256x256 1 person, 4.0ms
26: 256x256 1 person, 4.0ms
27: 256x256 1 person, 4.0ms
28: 256x256 1 person, 4.0ms
29: 256x256 1 person, 4.0ms
30: 256x256 1 person, 4.0ms
31: 256x256 1 person, 4.0ms
32: 256x256 1 person, 4.0ms
33: 256x256 1 person, 4.0ms
34: 256x256 1 person, 4.0ms
35: 256x256 1 person, 4.0ms
3

In [None]:
# Load the pretrained YOLO11n checkpoint, the same "yolo11n.pt" weights used by
# the exploratory cells earlier in this notebook. The previous "yolov11.pt" name
# matches no checkpoint used elsewhere here.
model = YOLO("yolo11n.pt")
model.to(device)  # Move the model to the appropriate device
model.eval()  # Set the model to evaluation mode

# Destination for the augmented instances and a log of problematic videos.
output = './output/train_instances.json'
log = './output/bounding_box_issues.txt'
with open(instance_path, 'r') as f:
  instances = json.load(f)

new_instances = []

for instance in tqdm.tqdm(instances, desc="Processing instances"):
  vid_path = os.path.join(raw_path, instance['video_id'] + '.mp4')
  frames = load_rgb_frames_from_video(vid_path, start=instance['frame_start'],
                                      end=instance['frame_end'])
  
  # Run inference on the frames
  results = model(frames, verbose=False, device=device)
  
  # Extract bounding boxes and scores
  bboxes = []
  for result in results:
    for box in result.boxes:
      bbox = box.xyxy.cpu().numpy().tolist()  # Convert to list
      score = box.conf.cpu().item()  # Get confidence score
      bboxes.append({'bbox': bbox, 'score': score})
  
  instance['bboxes'] = bboxes
  