# Bird Instance Segmentation Notebook (part 1 of the code pipeline)

This notebook performs **instance segmentation** on video frames, visualizes the results, and fine-tunes a pretrained YOLOv8 (or YOLO11) segmentation model. Add your annotated frames (downloaded from Roboflow), then set your video path and run!

## 1. Install Dependencies

In [1]:
# Install required libraries
!pip install --upgrade --pre ultralytics
!pip install ipywidgets



## 2. Imports & Configuration

In [2]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import cv2
from tqdm import tqdm
from ultralytics import YOLO
from tqdm import tqdm
import yaml
import random, shutil
import tempfile

#plotting
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from ipywidgets import interact, IntSlider



In [3]:

# Root folder of the annotated images downloaded from the manual segmentation (Roboflow export).
# NOTE(review): this is an absolute, machine-specific path; point it at your own export before running.
# Earlier export, kept for reference:
#SRC_ROOT_DIR = Path("/Users/inesaitsahalia/Desktop/labeling_data/yolo/birdLabels.v3i.yolov7pytorch")
SRC_ROOT_DIR = Path("/Users/inesaitsahalia/Desktop/labeling_data/yolo/bird labels.v7i.yolov11")

# Destination root (relative to the working directory) for the train/valid copy made by split_dataset().
TARGET_ROOT_DIR = Path("training_images")


## 3. Create training and test sets

In [5]:
#split the dataset into training and validation randomly 

def split_dataset(target_root_dir, images_dir, labels_dir, train_ratio=0.8, seed=None):
    """
    Randomly split an image/label folder into training and validation subsets.

    Images (and their same-stem ``.txt`` label files, when present) are *copied*
    into ``<target_root_dir>/train/{images,labels}`` and
    ``<target_root_dir>/valid/{images,labels}``. The source folders are not modified.

    Parameters
    ----------
    target_root_dir : str or Path
        Root folder that receives the ``train`` and ``valid`` subfolders.
    images_dir : str or Path
        Folder containing the source images (.jpg / .jpeg / .png, any letter case).
    labels_dir : str or Path
        Folder containing the label files; an image without a label file is
        still copied, just without a label.
    train_ratio : float
        Fraction of the images assigned to the training subset.
    seed : int or None
        When given, the shuffle is reproducible for the same folder contents.
        When None, the module-level ``random`` state is used.

    Raises
    ------
    FileNotFoundError
        If ``images_dir`` does not exist.
    """
    target_root_dir = Path(target_root_dir)
    images_dir = Path(images_dir)
    labels_dir = Path(labels_dir)

    # Debugging: print out what's in directory
    print(f"Looking for images in: {images_dir.resolve()}")
    print("Exists?", images_dir.exists())
    if not images_dir.is_dir():
        # Fail before creating any output folders
        raise FileNotFoundError(f"Image directory not found: {images_dir}")

    # Create directories if they don't exist
    for dir_name in ['train/images', 'train/labels', 'valid/images', 'valid/labels']:
        (target_root_dir / dir_name).mkdir(parents=True, exist_ok=True)

    # Get all image files. Sorting first makes a seeded shuffle independent of
    # the (arbitrary) order in which the OS lists the folder.
    image_files = sorted(
        f for f in os.listdir(images_dir)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    # Randomly shuffle the files
    if seed is None:
        random.shuffle(image_files)
    else:
        random.Random(seed).shuffle(image_files)

    # Split into train and validation sets
    split_idx = int(len(image_files) * train_ratio)
    train_files = image_files[:split_idx]
    valid_files = image_files[split_idx:]

    # Copy files to their respective directories
    for files, subset in [(train_files, target_root_dir / 'train'), (valid_files, target_root_dir / 'valid')]:
        for img_file in files:
            dest_image = subset / 'images' / img_file
            print(f"Copying {img_file} to {dest_image}...")
            shutil.copy2(images_dir / img_file, dest_image)

            # Copy the corresponding label file, if there is one
            label_file = os.path.splitext(img_file)[0] + '.txt'
            src_label = labels_dir / label_file
            if src_label.exists():
                shutil.copy2(src_label, subset / 'labels' / label_file)

    # Print statistics
    print(f"Total images: {len(image_files)}")
    print(f"Training set: {len(train_files)}")
    print(f"Validation set: {len(valid_files)}")

In [6]:
# Copy the exported 'train' folder into an 80/20 train/valid split under TARGET_ROOT_DIR
split_dataset(
    target_root_dir=TARGET_ROOT_DIR,
    images_dir=SRC_ROOT_DIR / 'train' / 'images',
    labels_dir=SRC_ROOT_DIR / 'train' / 'labels',
    train_ratio=0.8,
)

Looking for images in: /Users/inesaitsahalia/Desktop/labeling_data/yolo/bird labels.v7i.yolov11/train/images
Exists? True
Moving rgb_cranes_mp4-0561_jpg.rf.7ada39dc225b23e3db3cfed176027b28.jpg to training_images/train / images / rgb_cranes_mp4-0561_jpg.rf.7ada39dc225b23e3db3cfed176027b28.jpg...
Moving thermal_12_17_mov-0043_jpg.rf.8fae4551f9af795f9ea12bc5fd021d34.jpg to training_images/train / images / thermal_12_17_mov-0043_jpg.rf.8fae4551f9af795f9ea12bc5fd021d34.jpg...
Moving rgb_cranes_mp4-0753_jpg.rf.b18aa461c35ac8cdfc9eadd162d60ef3.jpg to training_images/train / images / rgb_cranes_mp4-0753_jpg.rf.b18aa461c35ac8cdfc9eadd162d60ef3.jpg...
Moving thermal_12_17_mov-0009_jpg.rf.515b34319ebe83227bc04fa5cf224a18.jpg to training_images/train / images / thermal_12_17_mov-0009_jpg.rf.515b34319ebe83227bc04fa5cf224a18.jpg...
Moving thermal_12_17_mov-0031_jpg.rf.8adefbc014e5672ccf67f8625f524b3a.jpg to training_images/train / images / thermal_12_17_mov-0031_jpg.rf.8adefbc014e5672ccf67f8625f524b

## 4. Load Pretrained Segmentation Model

In [7]:
# Load a pretrained segmentation model; this is the starting point that gets fine-tuned below
# model = YOLO('yolov8n-seg.pt')  # YOLOv8 alternative, nano version
model = YOLO('yolo11n-seg.pt')  # nano version




#### Create the data file for the model using the path you want. Note: if the Roboflow export is in YOLO11 format, you can skip this step, because `data.yaml` is generated automatically.

In [None]:

# # Write a simple dataset YAML for YOLO
# def create_data_yaml(train_dir, val_dir, nc, names, save_path="data.yaml"):
#     data = {
#         'train': str(train_dir),
#         'val': str(val_dir),
#         'nc': nc,
#         'names': names
#     }
#     save_path = Path(save_path)
#     save_path.parent.mkdir(parents=True, exist_ok=True)  # ensure folder exists
#     with open(save_path, 'w') as f:
#         yaml.dump(data, f)
#     print(f"✓ Wrote {save_path.resolve()}")


In [None]:
# create_data_yaml(
#     train_dir=SRC_ROOT_DIR / 'train' / 'images',
#     val_dir=SRC_ROOT_DIR / 'valid' / 'images',
#     nc=1,
#     names=['bird'],
#     save_path=TARGET_ROOT_DIR / 'data.yaml'
# )

## 5. Train the model so it doesn't suck

In [None]:

# Train the model
# NOTE: this trains on the dataset described by the data.yaml under SRC_ROOT_DIR
# (the export's own train/ and valid/ folders, as the log shows), not on the copy
# that split_dataset() made under TARGET_ROOT_DIR.
results = model.train(
    data=str(SRC_ROOT_DIR / 'data.yaml'),
    epochs=50,
    # Training takes a single integer size; the former (320, 240) tuple was
    # reduced to 320 anyway ("Image sizes 320 train, 320 val" in the log).
    imgsz=320,
    batch=4,
    device='cpu'  # CPU training; switch to a GPU device (e.g. 0 for CUDA, 'mps' on Apple silicon) if available
)

print('Training complete! :D')

Ultralytics 8.3.148 🚀 Python-3.11.11 torch-2.7.0 CPU (Apple M4)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/Users/inesaitsahalia/Desktop/labeling_data/yolo/bird labels.v7i.yolov11/data.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=(320, 240), int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n-seg.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train8, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patien

[34m[1mtrain: [0mScanning /Users/inesaitsahalia/Desktop/labeling_data/yolo/bird labels.v7i.yolov11/train/labels... 472 images, 3 backgrounds, 0 corrupt: 100%|██████████| 472/472 [00:00<00:00, 4401.02it/s]

[34m[1mtrain: [0mNew cache created: /Users/inesaitsahalia/Desktop/labeling_data/yolo/bird labels.v7i.yolov11/train/labels.cache
[34m[1mval: [0mFast image access ✅ (ping: 0.1±0.1 ms, read: 95.1±48.0 MB/s, size: 32.4 KB)



[34m[1mval: [0mScanning /Users/inesaitsahalia/Desktop/labeling_data/yolo/bird labels.v7i.yolov11/valid/labels... 41 images, 18 backgrounds, 0 corrupt: 100%|██████████| 41/41 [00:00<00:00, 4846.31it/s]

[34m[1mval: [0mNew cache created: /Users/inesaitsahalia/Desktop/labeling_data/yolo/bird labels.v7i.yolov11/valid/labels.cache
Plotting labels to runs/segment/train8/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 90 weight(decay=0.0), 101 weight(decay=0.0005), 100 bias(decay=0.0)
Image sizes 320 train, 320 val
Using 0 dataloader workers
Logging results to [1mruns/segment/train8[0m
Starting training for 50 epochs...

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
       1/50         0G      2.383      3.198       2.45     0.9193         43        320: 100%|██████████| 118/118 [00:56<00:00,  2.08it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 6/6 [00:01<00:00,  3.71it/s]

                   all         41        389    0.00717      0.226     0.0456     0.0171    0.00676      0.213      0.027    0.00858

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size



  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
       2/50         0G      2.234      2.977      1.415     0.8907         18        320: 100%|██████████| 118/118 [00:55<00:00,  2.14it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 6/6 [00:01<00:00,  4.48it/s]

                   all         41        389      0.652      0.216      0.212     0.0954      0.376      0.141      0.109     0.0372

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size



  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
       3/50         0G      2.283       2.88      1.297     0.8925         43        320: 100%|██████████| 118/118 [00:55<00:00,  2.12it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 6/6 [00:01<00:00,  4.18it/s]

                   all         41        389       0.63      0.219      0.223     0.0982      0.372      0.162      0.137     0.0527

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size



  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
       4/50         0G      2.181      2.846      1.188      0.885         39        320: 100%|██████████| 118/118 [00:56<00:00,  2.09it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 6/6 [00:01<00:00,  4.63it/s]

                   all         41        389      0.696      0.234      0.249      0.106       0.41      0.172      0.143     0.0475

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size



  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
  xy = xy @ M.T  # transform
       5/50         0G      2.267      2.812      1.129     0.8818         40        320:  90%|████████▉ | 106/118 [00:50<00:05,  2.04it/s]

## test the model on other videos 

In [None]:

# Path to the fine-tuned weights written by the training run above.
# NOTE(review): 'train8' is the run folder of one specific training run (see the
# "Logging results to runs/segment/train8" log line); update it to match your own run.
model_path = 'runs/segment/train8/weights/best.pt'
# Alternative: the pretrained nano model (presumably for when no fine-tuned weights exist yet)
#model = YOLO('yolo11n-seg.pt')
model = YOLO(model_path)


In [None]:
# Check that the notebook runs from the expected location

# Show the current working directory
print(f"Current working directory: {os.getcwd()}")

# Show what is inside the videos directory (or report that it is missing)
video_dir = "videos"
if not os.path.exists(video_dir):
    print(f"Directory {video_dir} does not exist!")
else:
    print(f"Files in {video_dir}:")
    for file in os.listdir(video_dir):
        print(f"  - {file}")


In [None]:
# Video to run the model on (for now still testing on the one video)
VIDEO_PATH = "videos/thermal_15_20.mp4"

# Fail fast if the file is missing
if not Path(VIDEO_PATH).is_file():
    raise FileNotFoundError(f"Video file not found at: {VIDEO_PATH!r}")

# Fail fast if OpenCV cannot decode it; the capture is only a probe and is released right away
cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    raise IOError(f"OpenCV failed to open video file: {VIDEO_PATH!r}")
cap.release()

# Output folders for the extracted frames and the annotations
FRAMES_DIR, ANNOTATED_DIR = "frames", "annotations"
for _output_dir in (FRAMES_DIR, ANNOTATED_DIR):
    os.makedirs(_output_dir, exist_ok=True)

### split the videos into frames

In [None]:
#turn the video into images you'll be able to predict the model on later

def video_to_frames(video_path, out_dir, prefix="frame"):
    """
    Extract every frame of a video and save each one as a numbered .jpg.

    Frames are written to ``out_dir`` as ``<prefix>_000000.jpg``,
    ``<prefix>_000001.jpg``, ... in reading order.

    Parameters
    ----------
    video_path : str or Path
        Video file to read.
    out_dir : str or Path
        Folder that receives the frames; created if it does not exist.
    prefix : str
        File-name prefix of the saved frames.

    Raises
    ------
    IOError
        If the video cannot be opened or a frame cannot be written.
    """
    # cv2.imwrite does not create missing folders; it only reports failure
    os.makedirs(out_dir, exist_ok=True)

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        cap.release()
        raise IOError(f"OpenCV failed to open video file: {str(video_path)!r}")

    idx = 0
    try:
        # The reported frame count can be missing; let the progress bar run open-ended then
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        with tqdm(total=total if total > 0 else None, desc="Extracting frames") as pbar:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame_path = os.path.join(out_dir, f"{prefix}_{idx:06d}.jpg")
                # imwrite returns False on failure instead of raising
                if not cv2.imwrite(frame_path, frame):
                    raise IOError(f"Failed to write frame to {frame_path!r}")
                idx += 1
                pbar.update(1)
    finally:
        # Release the capture even when extraction is interrupted
        cap.release()
    print(f"Saved {idx} frames to {out_dir}")

# Extract the frames of VIDEO_PATH into FRAMES_DIR
video_to_frames(VIDEO_PATH, FRAMES_DIR)

In [None]:
# Folder of extracted frames to run the model on
inference_source = FRAMES_DIR
# stream=True makes the call return results lazily, one frame at a time
results = model(inference_source, stream=True)

# Only display every `step`-th frame
step = 1000
for i, result in enumerate(results):
    if i % step == 0:
        result.show()




## try to align the frames of the rgb to the thermal

test version for a single image

In [None]:


# --- Load images ---
rgb_path = TARGET_ROOT_DIR / 'rgb_trial.jpeg'
thermal_path = TARGET_ROOT_DIR / 'thermal_trial.png'
# Pass plain strings: not every OpenCV version accepts pathlib.Path objects
rgb_img = cv2.imread(str(rgb_path))
thermal_img = cv2.imread(str(thermal_path))

# cv2.imread returns None (instead of raising) when a file is missing or unreadable
if rgb_img is None:
    raise FileNotFoundError(f"Could not read RGB image: {str(rgb_path)!r}")
if thermal_img is None:
    raise FileNotFoundError(f"Could not read thermal image: {str(thermal_path)!r}")

# Resize both images to the same (width, height) so they can be overlaid after alignment
# (optional: crop instead if needed)
rgb_crop = cv2.resize(rgb_img, (640, 480))
thermal_crop = cv2.resize(thermal_img, (640, 480))



def select_points(img, window_name, min_points: int = 4):
    """
    Collect mouse-clicked points on an image shown in an OpenCV window.

    Each left-click records the pixel position and marks it with a green dot
    on a working copy, so the caller's image is not modified. Pressing
    Enter/Return or Esc ends the selection, but only once at least
    ``min_points`` points have been clicked; otherwise a reminder is printed
    and the window stays open.

    Returns
    -------
    np.ndarray[float32]  shape = (N, 2)
        The clicked (x, y) positions, in click order.
    """
    clicked = []
    canvas = img.copy()

    def _on_mouse(event, x, y, flags, param):
        # Only left-button presses add a point
        if event != cv2.EVENT_LBUTTONDOWN:
            return
        clicked.append((x, y))
        cv2.circle(canvas, (x, y), 5, (0, 255, 0), -1)
        cv2.imshow(window_name, canvas)

    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
    cv2.setMouseCallback(window_name, _on_mouse)
    cv2.imshow(window_name, canvas)

    print(f"🖱  Click at least {min_points} points, "
          "then press Esc to continue…")

    confirm_keys = (13, 27)  # 13 = Enter/Return, 27 = Esc
    enough = False
    while not enough:
        # Poll with a 1 ms wait so the window keeps handling mouse events
        key = cv2.waitKey(1) & 0xFF
        if key not in confirm_keys:
            continue
        enough = len(clicked) >= min_points
        if not enough:
            print(f" Need {min_points} points; you have {len(clicked)}.")
    cv2.destroyWindow(window_name)
    return np.array(clicked, dtype=np.float32)


print("Select points in RGB image")
pts_rgb = select_points(rgb_crop, "RGB - select points")

print("Select corresponding points in Thermal image")
pts_thm = select_points(thermal_crop, "Thermal - select points")

# A homography needs at least 4 point pairs; pairs are matched by click order
assert len(pts_rgb) >= 4 and len(pts_thm) == len(pts_rgb), "Need at least 4 matching points"

# --- Compute Homography & Warp ---
# H maps thermal pixel coordinates onto RGB pixel coordinates
H, _ = cv2.findHomography(pts_thm, pts_rgb, cv2.RANSAC)
if H is None:
    # findHomography returns no matrix when it cannot estimate one
    raise ValueError(
        "Could not estimate a homography from the selected points; "
        "re-run and pick points that are spread out and not collinear."
    )
aligned_thm = cv2.warpPerspective(thermal_crop, H, (rgb_crop.shape[1], rgb_crop.shape[0]))

# --- Overlay ---
alpha = 0.5
overlay = cv2.addWeighted(rgb_crop, alpha, aligned_thm, 1 - alpha, 0)

# Convert BGR to RGB for display (OpenCV loads images as BGR, matplotlib expects RGB)
rgb_crop = cv2.cvtColor(rgb_crop, cv2.COLOR_BGR2RGB)
aligned_thm = cv2.cvtColor(aligned_thm, cv2.COLOR_BGR2RGB)
overlay = cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB)

# --- Plot ---
fig, axs = plt.subplots(1, 3, figsize=(18, 6))
axs[0].imshow(rgb_crop)
axs[0].set_title('RGB Cropped')
axs[1].imshow(aligned_thm)
axs[1].set_title('Aligned Thermal')
axs[2].imshow(overlay)
axs[2].set_title('Overlay')
for ax in axs: ax.axis('off')
plt.tight_layout()
plt.show()


### Modify the RGB FOV to (roughly) fit the crop of the thermal video
This step uses the separate GUI script `rotate_crop_gui.py` (stored outside this notebook).

In [None]:
import subprocess
import sys

vid_in = "videos/rgb_518152_A.mp4"
vid_out = "videos/rgb_518152_A_adjusted.mp4"

# Run the external script with the same interpreter as this kernel (a bare
# "python" may resolve to a different environment). check=True raises
# CalledProcessError if the script exits with an error instead of continuing silently.
subprocess.run(
    [
        sys.executable, "rotate_crop_gui.py",
        "--input", vid_in,
        "--output", vid_out,
    ],
    check=True,
)


### align spatially

In [None]:

# --- Configuration ---
# RGB input: the rotated/cropped video written by the rotate_crop_gui.py step
RGB_PATH = "videos/rgb_518152_A_adjusted.mp4"
# Thermal input video
THERMAL_PATH = "videos/thermal_12_17.mov"
# Folder that receives the per-frame aligned/overlay PNGs
OUTPUT_DIR = "aligned_frames"

# 0-based frame indices at which manual point selection is triggered. Frames read
# before the first successful alignment are skipped, because no homography exists yet.
RECALIBRATION_FRAMES = [1]  # manual alignment trigger frames (good for checking in longer vids)
FRAME_SIZE = (640, 480)     # (width, height) both videos are resized to before alignment; also the size of the output videos
OVERLAY_ALPHA = 0.3  # weight of the RGB frame in the overlay; the aligned thermal frame gets 1 - OVERLAY_ALPHA
MAX_FRAMES = 300            # Set to None to process the full video

# --- Utility: Interactive point selection ---
def select_points(img, window_name):
    """
    Collect mouse-clicked points on an image shown in an OpenCV window.

    Each left-click records the pixel position and draws a green dot directly
    on ``img`` (pass a copy to keep the original clean). Any key press closes
    the window; no minimum number of points is enforced here, so the caller
    has to validate the result.

    NOTE: this redefines the ``select_points`` from the single-image cell with
    a different signature (no ``min_points``); whichever cell ran last wins.

    Returns
    -------
    np.ndarray[float32]
        The clicked (x, y) positions, in click order.
    """
    clicked = []

    def _on_mouse(event, x, y, flags, param):
        # Only left-button presses add a point
        if event != cv2.EVENT_LBUTTONDOWN:
            return
        clicked.append((x, y))
        cv2.circle(img, (x, y), 5, (0, 255, 0), -1)
        cv2.imshow(window_name, img)

    # The window must exist before a mouse callback can be attached to it
    cv2.imshow(window_name, img)
    cv2.setMouseCallback(window_name, _on_mouse)
    print("Click any 4+ corresponding points in the image in the **same** order, then press any key to continue :)")
    cv2.waitKey(0)
    cv2.destroyWindow(window_name)
    return np.array(clicked, dtype=np.float32)

# --- Load videos ---
rgb_video = cv2.VideoCapture(RGB_PATH)
thermal_video = cv2.VideoCapture(THERMAL_PATH)

if not rgb_video.isOpened() or not thermal_video.isOpened():
    # Release whichever capture did open before bailing out
    rgb_video.release()
    thermal_video.release()
    raise IOError("yoinks! Could not open one or both video files.")

# --- Get video info ---
# Only as many frames as the shorter video has can be paired up
frame_count = int(min(rgb_video.get(cv2.CAP_PROP_FRAME_COUNT), thermal_video.get(cv2.CAP_PROP_FRAME_COUNT)))
# Keep the fractional frame rate (e.g. 29.97); truncating it to an int would
# change the playback speed of the output videos
fps = rgb_video.get(cv2.CAP_PROP_FPS)

# --- Limit frame count if MAX_FRAMES is set ---
if MAX_FRAMES is not None:
    frame_count = min(frame_count, MAX_FRAMES)

# --- Setup output ---
os.makedirs(OUTPUT_DIR, exist_ok=True)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out_aligned = cv2.VideoWriter('aligned_thermal.mp4', fourcc, fps, FRAME_SIZE)
out_aligned_rgb = cv2.VideoWriter('aligned_rgb.mp4', fourcc, fps, FRAME_SIZE)
out_overlay = cv2.VideoWriter('overlay_video.mp4', fourcc, fps, FRAME_SIZE)

# Homography mapping thermal pixels onto RGB pixels; None until the first manual alignment succeeds
current_H = None

try:
    for frame_idx in tqdm(range(frame_count), desc="Processing video"):
        ret_rgb, frame_rgb = rgb_video.read()
        ret_thm, frame_thm = thermal_video.read()
        if not (ret_rgb and ret_thm):
            break

        frame_rgb = cv2.resize(frame_rgb, FRAME_SIZE)
        frame_thm = cv2.resize(frame_thm, FRAME_SIZE)

        # --- Manual alignment setup ---
        if frame_idx in RECALIBRATION_FRAMES:
            print(f"\n📌 Frame {frame_idx}: Manual alignment")
            print("➡ Select corresponding points on RGB frame...")
            # Copies are passed because select_points draws the clicked dots onto its input
            pts_rgb = select_points(frame_rgb.copy(), f"RGB @ {frame_idx}")
            print("➡ Select corresponding points on Thermal frame...")
            pts_thm = select_points(frame_thm.copy(), f"Thermal @ {frame_idx}")

            new_H = None
            if pts_rgb.shape[0] >= 4 and pts_rgb.shape == pts_thm.shape:
                # findHomography returns no matrix when it cannot estimate one
                new_H, _ = cv2.findHomography(pts_thm, pts_rgb, cv2.RANSAC)
            if new_H is None:
                # Keep any previously valid homography for the following frames
                print("yikes! Invalid point selection. Skipping frame.")
                continue
            current_H = new_H
            print("✔ Homography updated.")

        if current_H is None:
            print(f"⚠ Skipping frame {frame_idx}: No valid homography available yet.")
            continue

        # --- Align thermal frame ---
        aligned_thm = cv2.warpPerspective(frame_thm, current_H, FRAME_SIZE)

        # --- Match channels if needed ---
        if len(aligned_thm.shape) < 3 or aligned_thm.shape[2] != 3:
            aligned_thm = cv2.cvtColor(aligned_thm, cv2.COLOR_GRAY2BGR)

        # --- Create overlay ---
        overlay = cv2.addWeighted(frame_rgb, OVERLAY_ALPHA, aligned_thm, 1 - OVERLAY_ALPHA, 0)

        # --- Save frames ---
        aligned_path = os.path.join(OUTPUT_DIR, f"aligned_{frame_idx:05d}.png")
        overlay_path = os.path.join(OUTPUT_DIR, f"overlay_{frame_idx:05d}.png")
        cv2.imwrite(aligned_path, aligned_thm)
        cv2.imwrite(overlay_path, overlay)

        # --- Write video frames ---
        out_aligned.write(aligned_thm)
        out_aligned_rgb.write(frame_rgb)
        out_overlay.write(overlay)
finally:
    # --- Cleanup ---
    # Runs even if the loop is interrupted, so the captures are freed and the
    # output videos are finalized
    rgb_video.release()
    thermal_video.release()
    out_aligned.release()
    out_aligned_rgb.release()
    out_overlay.release()

print("\n✅ Done! Saved:")
print("   ▶ Aligned thermal video → 'aligned_thermal.mp4'")
print("   ▶ Aligned RGB video → 'aligned_rgb.mp4'")
print("   ▶ RGB-thermal overlay video → 'overlay_video.mp4'")
print(f"   ▶ Aligned frames saved in → '{OUTPUT_DIR}/'")


### align temporally

In [None]:
#turn the video into images you'll be able to predict the model on later

def video_to_frames(video_path, out_dir, prefix="frame"):
    """
    Extract every frame of a video and save each one as a numbered .jpg.

    Frames are written to ``out_dir`` as ``<prefix>_000000.jpg``,
    ``<prefix>_000001.jpg``, ... in reading order.

    NOTE: this repeats the ``video_to_frames`` definition from the earlier
    frame-extraction cell; keep the two in sync (or keep only one).

    Parameters
    ----------
    video_path : str or Path
        Video file to read.
    out_dir : str or Path
        Folder that receives the frames; created if it does not exist.
    prefix : str
        File-name prefix of the saved frames.

    Raises
    ------
    IOError
        If the video cannot be opened or a frame cannot be written.
    """
    # cv2.imwrite does not create missing folders; it only reports failure
    os.makedirs(out_dir, exist_ok=True)

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        cap.release()
        raise IOError(f"OpenCV failed to open video file: {str(video_path)!r}")

    idx = 0
    try:
        # The reported frame count can be missing; let the progress bar run open-ended then
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        with tqdm(total=total if total > 0 else None, desc="Extracting frames") as pbar:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame_path = os.path.join(out_dir, f"{prefix}_{idx:06d}.jpg")
                # imwrite returns False on failure instead of raising
                if not cv2.imwrite(frame_path, frame):
                    raise IOError(f"Failed to write frame to {frame_path!r}")
                idx += 1
                pbar.update(1)
    finally:
        # Release the capture even when extraction is interrupted
        cap.release()
    print(f"Saved {idx} frames to {out_dir}")


In [None]:
# Adjusted and aligned videos, and the folders their frames are extracted into
VIDEO_PATH_RGB = 'processed_videos/aligned_rgb.mp4'
FRAMES_DIR_RGB = 'processed_videos/RGB_frames'
VIDEO_PATH_TH = 'processed_videos/aligned_thermal.mp4'
FRAMES_DIR_TH = 'processed_videos/THM_frames'

# NOTE(review): the spatial-alignment cell writes 'aligned_rgb.mp4' and
# 'aligned_thermal.mp4' to the working directory, while this cell reads them from
# 'processed_videos/'. Presumably they are moved by hand in between; confirm, or
# make the two paths agree.

# cv2.imwrite does not create missing folders, so make sure the frame folders exist
os.makedirs(FRAMES_DIR_RGB, exist_ok=True)
os.makedirs(FRAMES_DIR_TH, exist_ok=True)

# Run this to extract frames from the adjusted and aligned videos
video_to_frames(VIDEO_PATH_RGB, FRAMES_DIR_RGB)
video_to_frames(VIDEO_PATH_TH, FRAMES_DIR_TH)



## run the tracking on the modified version

In [None]:
# path to the new aligned frame folders
inference_source = 'processed_videos/THM_frames'
# stream=True yields results one frame at a time instead of keeping the results
# for the whole folder in memory
results = model(inference_source, stream=True)

# Display only every `step`-th frame
step = 50
for i, result in enumerate(results):
    if i % step != 0:
        continue

    # result.plot() renders masks and boxes into a BGR array (OpenCV convention);
    # matplotlib expects RGB, so convert before showing
    rendered_img = cv2.cvtColor(result.plot(), cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(10, 10))
    plt.imshow(rendered_img)
    plt.title(f'Frame {i}')
    plt.axis('off')
    plt.show()