In [1]:
from ultralytics import YOLO

In [2]:
model = YOLO(r"runs\detect\YOLOv8s_FSL_100_v1\weights\best.pt")

In [3]:
# results = model.predict(source=1, show=True, imgsz=640, conf=0.65)



In [8]:
import cv2
from ultralytics import YOLO
import numpy as np

model_path = r"runs\detect\YOLOv8s_FSL_100_v1\weights\best.pt"
device = "cuda:0"
confidence_threshold = 0.5
video_source = 1
frame_limit = 20

yolo = YOLO(model_path)
yolo.to(device)

videoCap = cv2.VideoCapture(video_source)

def getColours(cls_num):
    base_colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
    color_index = cls_num % len(base_colors)
    increments = [(1, -2, 1), (-2, 1, -1), (1, -1, 2)]
    color = [base_colors[color_index][i] + increments[color_index][i] * 
             (cls_num // len(base_colors)) % 256 for i in range(3)]
    return tuple(color)

current_letter = None
previous_letter = None
consecutive_frames = 0
detected_letters = []

while True:
    ret, frame = videoCap.read()
    if not ret:
        continue

    results = yolo.track(frame, stream=True, device=device)

    detected_letter = None
    for result in results:
        class_names = result.names

        for box in result.boxes:
            if box.conf[0] > confidence_threshold:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cls = int(box.cls[0])
                detected_letter = class_names[cls]
                color = getColours(cls)

                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                cv2.putText(frame, f'{detected_letter} {box.conf[0]:.2f}', 
                            (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    if detected_letter is not None:
        if detected_letter == current_letter:
            consecutive_frames += 1
        else:
            if consecutive_frames >= frame_limit and current_letter != previous_letter:
                detected_letters.append(current_letter)
            current_letter = detected_letter
            consecutive_frames = 1

    if detected_letter is not None and consecutive_frames >= frame_limit and current_letter != previous_letter:
        detected_letters.append(current_letter)
        previous_letter = current_letter

    height, width, _ = frame.shape
    text_window = np.full((height // 3, width, 3), 255, dtype=np.uint8)

    if detected_letters:
        text_to_display = ''.join(detected_letters)
        cv2.putText(text_window, text_to_display, 
                    (20, text_window.shape[0] // 2 + 10), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 0), 3)

    combined_frame = cv2.vconcat([frame, text_window])

    cv2.imshow('Sign Language Detection', combined_frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

videoCap.release()
cv2.destroyAllWindows()



0: 480x640 (no detections), 11.2ms
Speed: 2.0ms preprocess, 11.2ms inference, 0.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 19.0ms
Speed: 1.9ms preprocess, 19.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 10.3ms
Speed: 1.0ms preprocess, 10.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.7ms
Speed: 0.0ms preprocess, 11.7ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 12.5ms
Speed: 0.0ms preprocess, 12.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 9.8ms
Speed: 2.3ms preprocess, 9.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 9.3ms
Speed: 0.4ms preprocess, 9.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 5.2ms
Speed: 1.0ms preprocess, 5.2ms inferen

In [20]:
#Display and save in 'outputs' directory

import cv2
from ultralytics import YOLO
import numpy as np
import os

# Ensure the output directory exists
output_dir = "Outputs"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Path to save the output video
output_path = os.path.join(output_dir, "sign_language_detection_output.avi")

# Model and video settings
model_path = r"runs\detect\YOLOv8s_FSL_100_v1\weights\best.pt"
device = "cuda:0"
confidence_threshold = 0.5
video_source = 1
frame_limit = 20

# Initialize YOLO model
yolo = YOLO(model_path)
yolo.to(device)

# Initialize video capture
videoCap = cv2.VideoCapture(video_source)
frame_width = int(videoCap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(videoCap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define VideoWriter to save the combined frame video
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter(output_path, fourcc, 20, (frame_width, frame_height + frame_height // 3))

# Function to get colors
def getColours(cls_num):
    base_colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]
    color_index = cls_num % len(base_colors)
    increments = [(1, -2, 1), (-2, 1, -1), (1, -1, 2)]
    color = [base_colors[color_index][i] + increments[color_index][i] * 
             (cls_num // len(base_colors)) % 256 for i in range(3)]
    return tuple(color)

# Initialize detection variables
current_letter = None
previous_letter = None
consecutive_frames = 0
detected_letters = []

while True:
    ret, frame = videoCap.read()
    if not ret:
        continue

    results = yolo.track(frame, stream=True, device=device)

    detected_letter = None
    for result in results:
        class_names = result.names

        for box in result.boxes:
            if box.conf[0] > confidence_threshold:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cls = int(box.cls[0])
                detected_letter = class_names[cls]
                color = getColours(cls)

                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                cv2.putText(frame, f'{detected_letter} {box.conf[0]:.2f}', 
                            (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    # Update detected letters based on consecutive frames
    if detected_letter is not None:
        if detected_letter == current_letter:
            consecutive_frames += 1
        else:
            if consecutive_frames >= frame_limit and current_letter != previous_letter:
                detected_letters.append(current_letter)
            current_letter = detected_letter
            consecutive_frames = 1

    if detected_letter is not None and consecutive_frames >= frame_limit and current_letter != previous_letter:
        detected_letters.append(current_letter)
        previous_letter = current_letter

    # Create the text window with detected letters
    height, width, _ = frame.shape
    text_window = np.full((height // 3, width, 3), 255, dtype=np.uint8)

    if detected_letters:
        text_to_display = ''.join(detected_letters)
        cv2.putText(text_window, text_to_display, 
                    (20, text_window.shape[0] // 2 + 10), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 0), 3)

    # Combine the frame and text window
    combined_frame = cv2.vconcat([frame, text_window])

    # Display the combined frame
    cv2.imshow('Sign Language Detection', combined_frame)

    # Write the combined frame to the output video file
    out.write(combined_frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
videoCap.release()
out.release()
cv2.destroyAllWindows()



0: 480x640 (no detections), 19.4ms
Speed: 2.2ms preprocess, 19.4ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.0ms
Speed: 2.0ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.9ms
Speed: 1.0ms preprocess, 11.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.0ms
Speed: 1.0ms preprocess, 11.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 11.3ms
Speed: 1.3ms preprocess, 11.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 10.4ms
Speed: 1.3ms preprocess, 10.4ms inference, 1.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 9.8ms
Speed: 2.0ms preprocess, 9.8ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 o, 1 r, 15.5ms
Speed: 1.4ms preprocess, 15.5ms inference,