In [None]:
# Tested on python 3.10.11 on Windows 11 PC with RTX 3080 Ti GPU
%cd ..; %pip install -e .; %cd notebooks # installs segment-anything 
%pip install numpy matplotlib opencv-python-headless scipy torch torchvision scikit-learn pyyaml ultralytics jupyter_bbox_widget ipywidgets # installs dependencies
# restart kernel

# Begin

- Import necessary packages and initialise SAM model and mask generator
- Specify you SAM model directory and type (If you don't have a SAM model, download [here](https://github.com/facebookresearch/segment-anything#model-checkpoints))

In [3]:
import numpy as np # for operations on masks

import matplotlib.pyplot as plt # for plotting images
import cv2 # for image processing
from scipy import ndimage # for image processing
import base64 # for encoding images

import os # for file operations
import shutil # for file operations
import glob # for file operations
import pickle # for data serialization
import json # for reading json files

import torch # for deep learning
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor # for the SAM model
from sklearn.model_selection import train_test_split # for splitting the dataset
from IPython.display import clear_output # for clearing the output
import yaml # for creating yaml file for YOLO dataset

from ultralytics import YOLO # for YOLO model

from jupyter_bbox_widget import BBoxWidget # for creating bounding box widget
import ipywidgets as widgets # for creating widgets

sam_checkpoint = "../models/sam_vit_h_4b8939.pth" # Path to the checkpoint file
model_type = 'vit_h' # Model type

device = "cuda" if torch.cuda.is_available() else "cpu" # Use GPU if available, otherwise use CPU

sam = sam_model_registry[model_type](checkpoint=sam_checkpoint) # Load the model
sam.to(device=device) # Move the model to the device

mask_generator = SamAutomaticMaskGenerator(sam) # Create a mask generator
mask_predictor = SamPredictor(sam) # Create a mask predictor

Functions

In [4]:
# this function returns the area of a mask (number of pixels)
def get_area(mask):
    area = 0
    for row in mask:
        for col in row:
            if col:
                area += 1
    return area


# this function returns the index of the mask with the largest area
def get_max_area(masks):
    max_area = 0
    idx = 0
    for i in range(len(masks)):
        if(get_area(masks[i]) > max_area):
            max_area = get_area(masks[i])
            idx = i
    return idx


# this function draws the mask on the image
def overlay_mask_on_image(image, coord):
    # Ensure the mask is in 8-bit format
    image = cv2.drawContours(image, coord, -1, (0, 0, 255), 1)
    return image


# process the mask to remove the holes in the mask and return the largest region
def process_mask(mask):
    # Identify each separate region in the mask.
    labeled_mask, num_labels = ndimage.label(mask)
    
    # Count the size of each region.
    region_sizes = np.bincount(labeled_mask.flatten())
    
    # The first region (index 0) is the background, which we don't want to consider.
    region_sizes[0] = 0
    
    # Find the largest region.
    largest_region = np.argmax(region_sizes)
    
    # Create a mask that only includes the largest region.
    largest_mask = (labeled_mask == largest_region)
    
    # Fill in the holes in this region.
    filled_mask = ndimage.binary_fill_holes(largest_mask)
    
    return filled_mask



# extract the coordinates of the segment from SAM and store them in a list
def extract_segment(mask):
    binary_mask = np.array(mask) # get the segmentation of the mask and convert it to a numpy array
    binary_mask = (binary_mask * 255).astype(np.uint8) # convert the mask to a binary mask

    binary_mask = (process_mask(binary_mask) * 255).astype(np.uint8) # returns a single pixel_array with no holes in the mask


    contours, hierarchy = cv2.findContours(binary_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) # find contours from the binary image
    polygon_coords = [] # stores the coordinates of the vertices of the polygon
    
    for contour in contours:
        epsilon = 0.0001 * cv2.arcLength(contour, True) # approximate contour with accuracy proportional to the contour perimeter
        approx = cv2.approxPolyDP(contour, epsilon, True) # approximate contour with the Douglas-Peucker algorithm

        polygon_coords.append(approx) # add the coordinates of the vertices of the polygon to the list
    return polygon_coords


# this function is to encode the image for rendering by the widget
def encode_image_existing_mask(filepath, data):
    image = cv2.imread(filepath)
    segmentations = []
    for i in data:
        segmentations.append(i[1:])
    
    for segmentation in segmentations:
        points = np.array(segmentation).reshape(-1, 2) * [image.shape[1], image.shape[0]]
        points = points.astype(int)
        image = cv2.polylines(image, [points], isClosed=True, color=(0, 0, 255), thickness=1)
    is_success, im_buf_arr = cv2.imencode(".jpg", image)
    byte_im = im_buf_arr.tobytes()

    encoded = base64.b64encode(byte_im).decode('utf-8')
    return "data:image/jpg;base64,"+encoded


def encode_image(filepath):
    with open(filepath, 'rb') as f:
        image_bytes = f.read()
    encoded = str(base64.b64encode(image_bytes), 'utf-8')

    return "data:image/jpg;base64,"+encoded

# this function is to encode the image with mask for rendering by the widget, whilst also returning the polygon coordinates of the mask and the original height and width of the image
def encode_image_mask(filepath, boxes):
    # read in the image file
    image = cv2.imread(filepath)
    h, w = image.shape[:2]
    poly_coords_list = []
    for box in boxes:
        # convert the bbox to format expected by mask_predictor
        box = np.array([
            box['x'],
            box['y'],
            box['x'] + box['width'],
            box['y'] + box['height']
        ])

        mask_predictor.set_image(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        # predict the masks
        masks, scores, logits = mask_predictor.predict(
            box = box,
            multimask_output = True
        )

        # get the index of the mask with the largest area
        idx = get_max_area(masks)
        mask = masks[idx]

        # convert the pixel array format of the mask to a polygon coordinates
        polygon_coords = extract_segment(mask)
        poly_coords_list.append(polygon_coords)
        # overlay the mask on the image
        image = overlay_mask_on_image(image, polygon_coords)

    # convert the image with mask back to bytes
    is_success, im_buf_arr = cv2.imencode(".jpg", image)
    byte_im = im_buf_arr.tobytes()

    # encode to Base64 for rendering on the web
    encoded = base64.b64encode(byte_im).decode('utf-8')
    
    return "data:image/jpg;base64,"+encoded, poly_coords_list, h, w



# converts the list of numpy array to a list a list for easier manipulation
def numpy_to_list(numpy_arr):
    coord_list = [coord[0].tolist() for array in numpy_arr for coord in array] # Convert each numpy array in the list to a regular list and extract the inner lists into coord_list
    flat_list = [] # stores the flattened list of coordinates
    for coord in coord_list: # convert the list of lists into a flat list
        flat_list.append(coord[0]) # append the x coordinate
        flat_list.append(coord[1]) # append the y coordinate
    return flat_list


# this is used to extract the frames from a video file and output into specified directory as jpg images
def extract_frames(video_path, output_dir, frame_interval=300):
    filename = os.path.splitext(os.path.basename(video_path))[0]
    os.makedirs(output_dir, exist_ok=True)

    video = cv2.VideoCapture(video_path)

    if not video.isOpened():
        print(f"Could not open video file: {video_path}")
        return

    fps = video.get(cv2.CAP_PROP_FPS)

    if fps >= 50:
        frame_interval *= 2
    
    frame_index = 0

    while True:
        success, frame = video.read()
        if not success: 
            break

        if frame_index % frame_interval == 0:
            output_path = os.path.join(output_dir, f"{filename}_frame_{frame_index}.png")
            cv2.imwrite(output_path, frame)

        frame_index += 1

    video.release()


# this is used to split the images into training and validation sets into the dataset folder
def split_data(src_directory, out_directory, test_size=0.2):
    os.makedirs(out_directory, exist_ok=True) # create the dataset directory

    os.makedirs(os.path.join(out_directory, 'train', 'images'), exist_ok=False) # create the train images directory
    os.makedirs(os.path.join(out_directory, 'valid', 'images'), exist_ok=False) # create the valid images directory
    os.makedirs(os.path.join(out_directory, 'train', 'labels'), exist_ok=False) # create the train labels directory
    os.makedirs(os.path.join(out_directory, 'valid', 'labels'), exist_ok=False) # create the valid labels directory

    all_files = os.listdir(src_directory) # get all the files in the source directory
    train_files, valid_files = train_test_split(all_files, test_size=test_size, random_state=42) # split the files into training and validation sets

    # Move files into the train and valid directories
    for file_name in train_files:
        shutil.copy(os.path.join(src_directory, file_name), os.path.join(out_directory, 'train', 'images', file_name))
    for file_name in valid_files:
        shutil.copy(os.path.join(src_directory, file_name), os.path.join(out_directory, 'valid', 'images', file_name))


# this is used to load the images from the specified directory and output the data in the format required for YOLO training
def load_images_from_video(img_path, vid_path, ds_path, frame_interval):
    for filename in os.listdir(vid_path):
        extract_frames(os.path.join(vid_path, filename), img_path, frame_interval)
    
    split_data(img_path, ds_path) # splits into train, valid sets and moves into ds_path folder


# this is used to format the YOLO data into appropriate txt files for use in YOLO training
def output_to_txt(data_dict, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for image_name, content in data_dict.items():
        # remove the file extension from the image file name
        base_name = os.path.splitext(image_name)[0]
        # create the output file name by adding .txt extension
        output_file_name = base_name + '.txt'
        output_file_path = os.path.join(output_dir, output_file_name)
        
        with open(output_file_path, 'w') as f:
            for line in content:
                line_str = [str(item) for item in line]  # Convert all items to strings
                f.write(' '.join(line_str))  # Join all items in the line with ',' as separator
                f.write('\n')  # Write a new line after each line
    


# this is used to move the data from 3-dataset to dataset for use in YOLO training
def move_files(src_img, src_label, dest_img, dest_label):
    # Check if destination directories exist, if not, create them
    os.makedirs(dest_img, exist_ok=True)
    os.makedirs(dest_label, exist_ok=True)

    all_images = os.listdir(src_img)
    all_labels = os.listdir(src_label)
    
    for image in all_images:
        shutil.copy(os.path.join(src_img, image), os.path.join(dest_img, image))

    
    for label in all_labels:
        shutil.copy(os.path.join(src_label, label), os.path.join(dest_label, label))


# this is used to move the videos from 1-source to dataset/sources
def move_source_vid(src_vid, dest_vid):
    # Check if destination directories exist, if not, create them
    os.makedirs(dest_vid, exist_ok=True)

    all_videos = os.listdir(src_vid)
    
    for video in all_videos:
        shutil.copy(os.path.join(src_vid, video), os.path.join(dest_vid, video))


# clears directory
def clear_directory(directory):
    # Be careful with this function! It deletes all files and subdirectories in the specified directory
    shutil.rmtree(directory)
    os.mkdir(directory)


# this is used to create the data.yaml (necessary for YOLO training) file in the dataset folder 
def create_yaml(labels, path, output_path, train_path="train/images", val_path="valid/images", ):
    # copies labels with the last two elemens removed
    my_dict = {i: labels[i] for i in range(len(labels))}

    data = {
        'names': my_dict,
        'path': path,
        'train': train_path,
        'val': val_path
    }

    with open(output_path, 'w') as outfile: # write the data to the yaml file
        yaml.dump(data, outfile, default_flow_style=False)


# cleans the dataset
def delete_empty_labels_and_images(label_dir, image_dir):
    # Get a list of all txt files in label directory
    label_files = glob.glob(os.path.join(label_dir, '*.txt'))
    
    for label_file in label_files:
        # Check if the file is empty
        if os.stat(label_file).st_size == 0:
            # If empty, delete the label file
            os.remove(label_file)
            
            # Construct the corresponding image file path
            image_file = os.path.join(image_dir, os.path.splitext(os.path.basename(label_file))[0] + '.png')
            
            # Delete the corresponding image file, if it exists
            if os.path.exists(image_file):
                os.remove(image_file)



# Generate dataset

This generates a dataset from the video sources placed in `1-source` directory. Run this if you don't have an existing set of images to annotate and want to create a set of images from video sources

In [5]:
source_dir = 'X:/1-source' # specify where you want to store the source videos
source_extracted_dir = 'X:/2-source-extracted' # specify where you want to store the extracted frames from the source videos
staging_dir = 'X:/3-dataset' # specify where you want to store the YOLO-labelled dataset
ds_dir = 'X:/dataset' # this directory is the actual dataset for YOLO training ; dataset in the staging dir will be moved here after labelling

staging_dir_train_images = os.path.join(staging_dir, 'train', 'images') # staging directory for train images
staging_dir_train_labels = os.path.join(staging_dir, 'train', 'labels') # staging directory for train labels
staging_dir_valid_images = os.path.join(staging_dir, 'valid', 'images') # staging directory for valid images
staging_dir_valid_labels = os.path.join(staging_dir, 'valid', 'labels') # staging directory for valid labels

ds_dir_train_images = os.path.join(ds_dir, 'train', 'images') # dataset directory for train images
ds_dir_train_labels = os.path.join(ds_dir, 'train', 'labels') # dataset directory for train labels
ds_dir_valid_images = os.path.join(ds_dir, 'valid', 'images') # dataset directory for valid images
ds_dir_valid_labels = os.path.join(ds_dir, 'valid', 'labels') # dataset directory for valid labels
ds_dir_source = os.path.join(ds_dir, 'source') # dataset directory for sources

In [6]:
frame_interval = 300 # specify the frame interval to extract from the video

# extract frames from videos in 1-source folder, extract them to 2-source-extracted folder, and split them into train and valid sets in 3-dataset folder
load_images_from_video(source_extracted_dir, source_dir, staging_dir, frame_interval)

If you already have a collection of images, simply place all your images in a directory `sam-2-yolo/2-source-extracted/` and run the following cell

In [60]:
split_data(source_extracted_dir, staging_dir) # splits images into train, valid sets and moves into ../3-dataset folder

# Annotate training set

In [7]:
classes = ['person', 'wall', 'floor', 'ceiling', 'door', 'window', 'tools', 'building material', 'pipes', 'ceiling hole'] # list of classes

In [8]:
path = staging_dir_train_images # path to training images
images = sorted(os.listdir(path))

annotations = {} # dictionary with key = image name, value = corresponding bbox
data = {} # dictionary with key = image name, value = list of list containing: [label_id, x1, y1, x2, y2, ...]
cur_img_idx = 0 # current image index

Initialise the widget

In [9]:
# initialise the bbox widget
w_bbox = BBoxWidget(
    image = encode_image(os.path.join(path, images[cur_img_idx])),
    classes=classes
)
# a progress bar to show how far we got
w_progress = widgets.IntProgress(value=0, max=len(images), description='Progress')

# initialise the buttons
button_next = widgets.Button(description="Next")
button_prev = widgets.Button(description="Previous")
# combine the buttons and the bbox widget into a container
w_container = widgets.VBox([
    w_progress,
    button_prev,
    button_next,
    w_bbox

])

# function that updates the image when the buttons are clicked so that the next or previous image is shown
def update_image(change):
    global cur_img_idx
    annotations[images[cur_img_idx]] = w_bbox.bboxes # save the annotations for the current image before moving to the next image

    if images[cur_img_idx] not in data: # this is for the first time the image is shown
        data[images[cur_img_idx]] = []
    
    # move the current image index forward or backward
    if change.description == "Next":
        cur_img_idx = (cur_img_idx + 1) % len(images)
    elif change.description == "Previous":
        cur_img_idx = (cur_img_idx - 1) % len(images)
    w_progress.value = cur_img_idx # update the progress bar
    if images[cur_img_idx] not in data: # if the next image is not in the data dictionary, add it as an empty list
        data[images[cur_img_idx]] = []

    # check if annotations[cur_img_idx] exists
    if images[cur_img_idx] in annotations:
        w_bbox.bboxes = annotations[images[cur_img_idx]] # if it exists, load the annotations for the image
    else:
        w_bbox.bboxes = [] # if it doesn't exist, the image has no annotations, so set the bounding boxes to an empty list

    w_bbox.image = encode_image_existing_mask(os.path.join(path, images[cur_img_idx]), data[images[cur_img_idx]])
# add the update_image function to the buttons
button_next.on_click(update_image)
button_prev.on_click(update_image)

# defines what happens when the submit button is clicked, which is to run SAM with the bounding boxes specified by the user and to display the result and save the result to the data dictionary for conversion to YOLO format later
@w_bbox.on_submit
def submit():
    data[images[cur_img_idx]] = []

    if len(w_bbox.bboxes) > 0:
        w_bbox.image, poly_coords_list, h, w = encode_image_mask(os.path.join(path, images[cur_img_idx]), w_bbox.bboxes)
        i = 0
        for polygon_coords in poly_coords_list:
            label_id = [classes.index(w_bbox.bboxes[i]['label'])]
            flat_segment_coords = numpy_to_list(polygon_coords)

            for j in range(len(flat_segment_coords)): # normalise the coordinates of the segment
                if j%2 == 0:
                    flat_segment_coords[j] = flat_segment_coords[j]/w
                else:
                    flat_segment_coords[j] = flat_segment_coords[j]/h
            
            data[images[cur_img_idx]].append(label_id + flat_segment_coords)
            i += 1
    else:
        w_bbox.image = encode_image(os.path.join(path, images[cur_img_idx]))
        data[images[cur_img_idx]] = [] # if no bounding boxes are specified, then the image is not annotated

In [10]:
clear_output()
w_container # display the widget

VBox(children=(IntProgress(value=0, description='Progress', max=56), Button(description='Previous', style=Butt…

Extract annotations to corresponding .txt file and delete unlabelled images

In [79]:
output_to_txt(data, staging_dir_train_labels) # output the data to txt files for YOLO training
delete_empty_labels_and_images(staging_dir_train_labels, staging_dir_train_images) # delete empty labels and images

# Annotate validation set

In [80]:
path = staging_dir_valid_images # path to validation images
images = sorted(os.listdir(path))

annotations = {} # dictionary with key = image name, value = corresponding bbox
data = {} # dictionary with key = image name, value = list of list containing: [label_id, x1, y1, x2, y2, ...]
cur_img_idx = 0 # current image index

In [82]:
# initialise the bbox widget
w_bbox = BBoxWidget(
    image = encode_image(os.path.join(path, images[cur_img_idx])),
    classes=classes
)
# a progress bar to show how far we got
w_progress = widgets.IntProgress(value=0, max=len(images), description='Progress')

# initialise the buttons
button_next = widgets.Button(description="Next")
button_prev = widgets.Button(description="Previous")
# combine the buttons and the bbox widget into a container
w_container = widgets.VBox([
    w_progress,
    button_prev,
    button_next,
    w_bbox

])

# function that updates the image when the buttons are clicked so that the next or previous image is shown
def update_image(change):
    global cur_img_idx
    annotations[images[cur_img_idx]] = w_bbox.bboxes # save the annotations for the current image before moving to the next image

    if images[cur_img_idx] not in data: # this is for the first time the image is shown
        data[images[cur_img_idx]] = []
    
    # move the current image index forward or backward
    if change.description == "Next":
        cur_img_idx = (cur_img_idx + 1) % len(images)
    elif change.description == "Previous":
        cur_img_idx = (cur_img_idx - 1) % len(images)
    w_progress.value = cur_img_idx # update the progress bar
    if images[cur_img_idx] not in data: # if the next image is not in the data dictionary, add it as an empty list
        data[images[cur_img_idx]] = []

    # check if annotations[cur_img_idx] exists
    if images[cur_img_idx] in annotations:
        w_bbox.bboxes = annotations[images[cur_img_idx]] # if it exists, load the annotations for the image
    else:
        w_bbox.bboxes = [] # if it doesn't exist, the image has no annotations, so set the bounding boxes to an empty list

    w_bbox.image = encode_image_existing_mask(os.path.join(path, images[cur_img_idx]), data[images[cur_img_idx]])
# add the update_image function to the buttons
button_next.on_click(update_image)
button_prev.on_click(update_image)

# defines what happens when the submit button is clicked, which is to run SAM with the bounding boxes specified by the user and to display the result and save the result to the data dictionary for conversion to YOLO format later
@w_bbox.on_submit
def submit():
    data[images[cur_img_idx]] = []

    if len(w_bbox.bboxes) > 0:
        w_bbox.image, poly_coords_list, h, w = encode_image_mask(os.path.join(path, images[cur_img_idx]), w_bbox.bboxes)
        i = 0
        for polygon_coords in poly_coords_list:
            label_id = [classes.index(w_bbox.bboxes[i]['label'])]
            flat_segment_coords = numpy_to_list(polygon_coords)

            for j in range(len(flat_segment_coords)): # normalise the coordinates of the segment
                if j%2 == 0:
                    flat_segment_coords[j] = flat_segment_coords[j]/w
                else:
                    flat_segment_coords[j] = flat_segment_coords[j]/h
            
            data[images[cur_img_idx]].append(label_id + flat_segment_coords)
            i += 1
    else:
        w_bbox.image = encode_image(os.path.join(path, images[cur_img_idx]))
        data[images[cur_img_idx]] = [] # if no bounding boxes are specified, then the image is not annotated

In [83]:
clear_output()
w_container # display the widget

VBox(children=(IntProgress(value=0, description='Progress', max=5), Button(description='Previous', style=Butto…

Extract annotations to corresponding .txt file and delete unlabelled images

In [84]:
output_to_txt(data, staging_dir_valid_labels) # output the data to txt files for YOLO training
delete_empty_labels_and_images(staging_dir_valid_labels, staging_dir_valid_images) # delete empty labels and images

# Clean up

Move annotated dataset from temporary space (`3-dataset`) to YOLO training directory (`dataset`)

In [85]:
move_files(staging_dir_train_images, staging_dir_train_labels, ds_dir_train_images, ds_dir_train_labels) # moves the data collected in staging to the 3-dataset folder for splitting into train and valid sets
move_files(staging_dir_valid_images, staging_dir_valid_labels, ds_dir_valid_images, ds_dir_valid_labels) # moves the data collected in staging to the 3-dataset folder for splitting into train and valid sets
move_source_vid(source_dir, ds_dir_source) # moves the videos from source to dataset/sources for documentation/backup

Clears `1-source`, `2-source-extracted` and `3-dataset` to make space for next batch

In [4]:
if(input("Are you sure you want to clear the source videos, extracted images, and dataset? (y/n): ")) == 'y':
    clear_directory(source_dir)
    clear_directory(source_extracted_dir)
    clear_directory(staging_dir)

PermissionError: [WinError 5] Access is denied: 'X:/1-source'

Now, you can add more video to `1-source` and add more data to the dataset ; Or you can proceed with YOLO Training below

# Edit

In [None]:
path = '../dataset/train/images/' # path to validation images
images = sorted(os.listdir(path))
path = '../dataset/train/valid'
images.append(os.listdir(path))



# YOLO Training

In [117]:
abspath_ds = os.path.abspath('../dataset/') # get the absolute path of the dataset folder
output_path = '../dataset/data.yaml' # path to the data.yaml file

create_yaml(classes, abspath_ds, output_path) # create the data.yaml file in the dataset folder

In [18]:
# YOLO Training
model = YOLO('../models/yolov8x-seg.pt')
model.train(data='../dataset/data.yaml', epochs=300, imgsz=640, batch=8)

New https://pypi.org/project/ultralytics/8.0.120 available  Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.110  Python-3.10.11 torch-2.0.1 CUDA:0 (NVIDIA GeForce RTX 3080 Ti Laptop GPU, 16384MiB)
[34m[1myolo\engine\trainer: [0mtask=segment, mode=train, model=../models/yolov8x-seg.pt, data=../dataset/data.yaml, epochs=300, patience=50, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=False, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, fraction=1.0, profile=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, show=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, vid_stride=1, line_width=None, visualize=False, augm

KeyboardInterrupt: 

In [13]:
trained_model = YOLO(r'C:\Users\Adrian\Desktop\projects\sam-2-yolo\notebooks\runs\segment\train5\weights\best.pt')
trained_model = YOLO(r'C:\Users\Adrian\Desktop\projects\sam-2-yolo\models\yolov8x-seg.pt')
results = trained_model.predict(source="../y2mate.is - TIMELAPSE of 15 TAKEOFFs in 1MIN _ World's Busiest Single Runway Airport - CSMIA Mumbai.-C1XFFvtKYy0-720p-1687243551.mp4", show=True)



    causing potential out-of-memory errors for large sources or long-running streams/videos.

    Usage:
        results = model(source=..., stream=True)  # generator of Results objects
        for r in results:
            boxes = r.boxes  # Boxes object for bbox outputs
            masks = r.masks  # Masks object for segment masks outputs
            probs = r.probs  # Class probabilities for classification outputs

video 1/1 (1/2250) C:\Users\Adrian\Desktop\projects\sam-2-yolo\y2mate.is - TIMELAPSE of 15 TAKEOFFs in 1MIN _ World's Busiest Single Runway Airport - CSMIA Mumbai.-C1XFFvtKYy0-720p-1687243551.mp4: 384x640 (no detections), 39.5ms
video 1/1 (2/2250) C:\Users\Adrian\Desktop\projects\sam-2-yolo\y2mate.is - TIMELAPSE of 15 TAKEOFFs in 1MIN _ World's Busiest Single Runway Airport - CSMIA Mumbai.-C1XFFvtKYy0-720p-1687243551.mp4: 384x640 (no detections), 148.1ms
video 1/1 (3/2250) C:\Users\Adrian\Desktop\projects\sam-2-yolo\y2mate.is - TIMELAPSE of 15 TAKEOFFs in 1MIN _ World's

KeyboardInterrupt: 

In [19]:
video_path = "../test.mp4"
cap = cv2.VideoCapture(video_path)

while cap.isOpened():
    success, frame = cap.read()

    if success:
        results = trained_model(frame)
        annotated_frame = results[0].plot()
        cv2.imshow("YOLOv8 Inference", annotated_frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    else:
        break
cap.release()
cv2.destroyAllWindows()


0: 384x640 1 airtug, 5 planes, 88.7ms
Speed: 3.0ms preprocess, 88.7ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 airtug, 5 planes, 79.8ms
Speed: 1.0ms preprocess, 79.8ms inference, 6.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 airtug, 5 planes, 73.0ms
Speed: 2.0ms preprocess, 73.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 airtug, 5 planes, 44.2ms
Speed: 2.0ms preprocess, 44.2ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 airtug, 5 planes, 45.0ms
Speed: 1.0ms preprocess, 45.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 airtug, 5 planes, 32.0ms
Speed: 2.2ms preprocess, 32.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 airtug, 5 planes, 32.0ms
Speed: 2.0ms preprocess, 32.0ms inference, 5.0ms postprocess per image at shape (1, 3, 640, 640)

0: 384x640 1 airtug, 5 planes, 29.4ms
Speed: 1.

# Model Performance