# Label ASL dataset with bounding boxes

In [None]:
import os
# Work from the project root so the relative dataset paths used in later
# cells resolve, then show its contents as a quick sanity check.
os.chdir('/home/jovyan/yolo')
os.listdir('.')

In [None]:
import cv2
import matplotlib.pyplot as plt

# One sample label file and the image it belongs to (same stem, different extension).
label = 'datasets/American-Sign-Language-Letters-1/A0_jpg.rf.7a3779f9166b9d8f3c6e91cc6ff41edb.txt'
image = 'datasets/American-Sign-Language-Letters-1/A0_jpg.rf.7a3779f9166b9d8f3c6e91cc6ff41edb.jpg'

with open(label, 'r') as f:
    # Only the first annotation row is used; readline() avoids loading the whole file.
    first_row = f.readline().split()

# Fail with a clear message instead of an IndexError / unpacking error further down.
if not first_row:
    raise ValueError(f"Label file has no annotation on its first line: {label}")

# Drop the first value (presumably the class id) and keep the remaining numbers,
# which are passed to plot_image_with_bbox as a normalized xywh box.
line = [float(x) for x in first_row][1:]
print(line)


In [None]:
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

def plot_image_with_bbox(image_path, bbox_xywhn):
    """
    Read an image and plot it with one bounding box given in normalized xywh format.

    Parameters:
        image_path (str): Path to the image file.
        bbox_xywhn (np.ndarray or list or tensor): Normalized bounding box
            (x_center, y_center, width, height). It is flattened before use,
            so it must hold exactly four values.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
        ValueError: If ``bbox_xywhn`` does not contain exactly four values.
    """
    # cv2.imread signals a missing/unreadable file by returning None rather
    # than raising, so check explicitly before converting colors.
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # OpenCV loads images as BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    img_h, img_w = image.shape[:2]

    # Ensure bbox is a flat numpy array
    bbox = np.array(bbox_xywhn).flatten()
    if bbox.size != 4:
        raise ValueError(
            f"Expected 4 bbox values (x_center, y_center, width, height), got {bbox.size}"
        )
    x_center, y_center, w, h = bbox

    # Convert the normalized center-based box to an absolute top-left corner
    # plus width/height, which is what patches.Rectangle takes.
    abs_x = (x_center - w / 2) * img_w
    abs_y = (y_center - h / 2) * img_h
    abs_w = w * img_w
    abs_h = h * img_h

    # Plot
    fig, ax = plt.subplots(1)
    ax.imshow(image)
    rect = patches.Rectangle((abs_x, abs_y), abs_w, abs_h,
                             linewidth=2, edgecolor='red', facecolor='none')
    ax.add_patch(rect)
    ax.axis('off')
    plt.show()

In [None]:
# Draw the sample image together with the box parsed from its label file.
plot_image_with_bbox(image_path=image, bbox_xywhn=line)

# Create ASL Dataset with bounding boxes

In [None]:
# Download the dataset from Kaggle
#!wget https://www.kaggle.com/api/v1/datasets/download/debashishsau/aslamerican-sign-language-aplhabet-dataset -O datasets/aslamerican-sign-language-aplhabet-dataset.zip
#!unzip datasets/aslamerican-sign-language-aplhabet-dataset.zip -d datasets/aslamerican-sign-language-aplhabet-dataset

In [None]:
# from ultralytics import YOLO
# # path to trained key point estimation model 
# model_path = "/home/jovyan/yolo/runs/pose/train/weights/best.pt"
# # Load the model
# model = YOLO(model_path)


In [None]:
# import os
# import cv2
# import numpy as np
# import json


# class_names = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", 
#                 "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
#                 "U", "V", "W", "X", "Y", "Z"]

# output_folder_images = "/home/jovyan/yolo/datasets/ASL_test/train/images"
# output_folder_labels = "/home/jovyan/yolo/datasets/ASL_test/train/labels"

# os.makedirs(output_folder_images, exist_ok=True)
# os.makedirs(output_folder_labels, exist_ok=True)

# class_states = {}
# for class_index, class_name in enumerate(class_names): 
#     image_folder = f"/home/jovyan/yolo/datasets/ASL_Alphabet_Dataset/asl_alphabet_train/{class_name}"

#     # Efficient inference with streaming
#     results = model.predict(source=image_folder, stream=True)

#     states = {}
#     for i, result in enumerate(results):

#         image_name = os.path.basename(result.path)
#         new_image_name = f"{class_name}_{image_name}"

#         if len(result.boxes) == 0:
#             print(f"No bounding boxes detected in image: {image_name}")
#             states["no_bbox"] = states.get("no_bbox", 0) + 1
#             continue
#         elif len(result.boxes) > 1:
#             print(f"Multiple bounding boxes detected in image: {image_name}")
#             states["multiple_bbox"] = states.get("multiple_bbox", 0) + 1
#             continue
#         elif result.boxes[0].conf[0] < 0.5:
#             print(f"Low confidence in image: {image_name}")
#             states["low_confidence"] = states.get("low_confidence", 0) + 1
#             continue
        
#         # save original image at output_folder_images
#         image_path = os.path.join(image_folder, image_name)
#         new_image_path = os.path.join(output_folder_images, new_image_name)
#         cv2.imwrite(new_image_path, cv2.imread(image_path))

#         # save the bounding box as a txt file ("class_index x_center y_center width height") in output_folder_labels
#         label_path = os.path.join(output_folder_labels, new_image_name.replace(".jpg", ".txt"))

#         #result.boxes[0].xywhn returns tensor([[0.5577, 0.3010, 0.4520, 0.5482]], device='cuda:0') 
#         x, y, w, h = result.boxes[0].xywhn[0].cpu().numpy()

#         with open(label_path, "w") as f:
#             f.write(f"{class_index} {x} {y} {w} {h}")

#         class_states[class_name] = states

# # save the class states to a json file
# with open("class_states.json", "w") as f:
#     json.dump(class_states, f, indent=4)    

# print(class_states)

In [5]:
from ultralytics import YOLO
# Weights of the previously trained model, taken from the runs/pose directory.
model_path = "/home/jovyan/yolo/runs/pose/train/weights/best.pt"
model = YOLO(model_path)

# One class per letter A-Z, in alphabetical order.
class_names = [letter for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]

In [None]:
from tqdm import tqdm
import os
import json
import cv2


# Destination folders for accepted images and their label files.
output_folder_images = "./data/ASL/data/images"
output_folder_labels = "./data/ASL/data/labels"

os.makedirs(output_folder_labels, exist_ok=True)
os.makedirs(output_folder_images, exist_ok=True)

# Begin with an empty mapping and replace it with the saved per-class
# statistics when a previous run left a class_states.json behind.
class_states = {}
if os.path.exists("class_states.json"):
    with open("class_states.json", "r") as f:
        class_states = json.load(f)

def process_result(result, class_index, class_name, states, image_folder):
    """
    Keep one prediction as a labeled sample, or count why it was rejected.

    A result is accepted only when it has exactly one box, that box has a
    confidence of at least 0.5, and the source image can be read. For an
    accepted result the image is written to ``output_folder_images`` as
    ``{class_name}_{original file name}`` and a one-line label
    ``class_index x y w h`` (taken from the box's ``xywhn``) is written to
    ``output_folder_labels`` under the same stem with a ``.txt`` extension.

    Parameters:
        result: Prediction result exposing ``path`` and ``boxes``.
        class_index (int): Class id written at the start of the label line.
        class_name (str): Prefix added to the output file names.
        states (dict): Rejection counters, updated in place. Keys used:
            "no_bbox", "multiple_bbox", "low_confidence", "unreadable_image".
        image_folder (str): Folder that contains the source image.

    Returns:
        None
    """
    image_name = os.path.basename(result.path)
    new_image_name = f"{class_name}_{image_name}"

    box_count = len(result.boxes)
    if box_count == 0:
        states["no_bbox"] = states.get("no_bbox", 0) + 1
        return

    if box_count > 1:
        states["multiple_bbox"] = states.get("multiple_bbox", 0) + 1
        return

    if result.boxes[0].conf[0] < 0.5:
        states["low_confidence"] = states.get("low_confidence", 0) + 1
        return

    # Read and save image. If the image cannot be decoded, skip the sample
    # entirely so no label is written without a matching image.
    image_path = os.path.join(image_folder, image_name)
    image = cv2.imread(image_path)
    if image is None:
        states["unreadable_image"] = states.get("unreadable_image", 0) + 1
        return
    cv2.imwrite(os.path.join(output_folder_images, new_image_name), image)

    # Save label. Swap the real extension for ".txt": a plain
    # str.replace(".jpg", ".txt") leaves names such as "x.jpeg" untouched,
    # which would put the label under the image's extension.
    x, y, w, h = result.boxes[0].xywhn[0].cpu().numpy()
    label_stem = os.path.splitext(new_image_name)[0]
    label_path = os.path.join(output_folder_labels, label_stem + ".txt")
    with open(label_path, "w") as f:
        f.write(f"{class_index} {x} {y} {w} {h}")

# Run the model over every class folder, keep the accepted samples via
# process_result, and record per class how many images were rejected and why.
for class_index, class_name in tqdm(enumerate(class_names), desc="Processing classes", total=len(class_names)):

    image_folder = f"./data/ASL_Alphabet_Dataset/asl_alphabet_train/{class_name}"

    # Non-streaming, batch prediction: all results for the folder are returned
    # as a list, which is what makes len(results) available for the progress bar.
    results = model.predict(source=image_folder, stream=False, batch=512, verbose=False)

    states = {}
    for result in tqdm(results, desc=f"Processing {class_name}", total=len(results)):
        process_result(result, class_index, class_name, states, image_folder)

    class_states[class_name] = states

    # Checkpoint after every class so an interruption part-way through the
    # alphabet does not lose the statistics gathered so far. After the last
    # class the file content is the same as a single final dump.
    with open("class_states.json", "w") as f:
        json.dump(class_states, f, indent=4)

print(class_states)


Processing A: 100%|██████████| 8458/8458 [00:10<00:00, 831.14it/s] 
Processing B: 100%|██████████| 8309/8309 [00:12<00:00, 672.56it/s]
Processing C: 100%|██████████| 8146/8146 [00:09<00:00, 887.94it/s] 
Processing D: 100%|██████████| 7629/7629 [00:09<00:00, 824.27it/s]
Processing E: 100%|██████████| 7744/7744 [00:10<00:00, 765.14it/s] 
Processing F: 100%|██████████| 8031/8031 [00:11<00:00, 676.83it/s]
Processing G: 100%|██████████| 7844/7844 [00:09<00:00, 793.32it/s]
Processing H: 100%|██████████| 7906/7906 [00:09<00:00, 852.89it/s]
Processing I: 100%|██████████| 7953/7953 [00:09<00:00, 808.25it/s] 
Processing J: 100%|██████████| 7503/7503 [00:06<00:00, 1073.27it/s]
Processing K: 100%|██████████| 7876/7876 [00:10<00:00, 749.78it/s]]
Processing L: 100%|██████████| 7939/7939 [00:11<00:00, 708.54it/s]]
Processing M: 100%|██████████| 7900/7900 [00:06<00:00, 1134.92it/s]
Processing N: 100%|██████████| 7932/7932 [00:06<00:00, 1276.69it/s]
Processing O: 100%|██████████| 8140/8140 [00:07<00:00

{'A': {'multiple_bbox': 42, 'no_bbox': 1739, 'low_confidence': 609}, 'B': {'multiple_bbox': 42, 'low_confidence': 224, 'no_bbox': 387}, 'C': {'low_confidence': 497, 'multiple_bbox': 130, 'no_bbox': 1613}, 'D': {'multiple_bbox': 153, 'no_bbox': 1055, 'low_confidence': 640}, 'E': {'multiple_bbox': 49, 'no_bbox': 1014, 'low_confidence': 490}, 'F': {'multiple_bbox': 39, 'no_bbox': 216, 'low_confidence': 245}, 'G': {'multiple_bbox': 369, 'low_confidence': 342, 'no_bbox': 702}, 'H': {'multiple_bbox': 318, 'no_bbox': 1016, 'low_confidence': 499}, 'I': {'no_bbox': 1194, 'low_confidence': 624, 'multiple_bbox': 181}, 'J': {'no_bbox': 1138, 'low_confidence': 403, 'multiple_bbox': 30}, 'K': {'multiple_bbox': 166, 'no_bbox': 369, 'low_confidence': 204}, 'L': {'multiple_bbox': 135, 'no_bbox': 181, 'low_confidence': 112}, 'M': {'multiple_bbox': 211, 'low_confidence': 1116, 'no_bbox': 2109}, 'N': {'multiple_bbox': 141, 'no_bbox': 2723, 'low_confidence': 1221}, 'O': {'no_bbox': 2357, 'low_confidence': 




# Analyse new Dataset

In [None]:
from collections import defaultdict

# Location of the generated dataset before it is split.
base_dir = "/home/jovyan/yolo/datasets/ASL_test"
input_images_dir = os.path.join(base_dir, "data/images")
input_labels_dir = os.path.join(base_dir, "data/labels")

# Map each class prefix (the part of the file name before the first "_")
# to the list of image file names that carry it.
class_groups = defaultdict(list)
for fname in os.listdir(input_images_dir):
    if not fname.endswith((".jpg", ".jpeg")):
        continue
    class_prefix = fname.partition("_")[0]  # e.g. "A" for A_image1.jpg
    class_groups[class_prefix].append(fname)



In [16]:
# Report how many images were collected for each class prefix.
for class_prefix in class_groups:
    images = class_groups[class_prefix]
    print(f"Class {class_prefix} has {len(images)} images.")

Class D has 493 images.
Class U has 541 images.
Class W has 678 images.
Class Y has 386 images.
Class N has 197 images.
Class P has 454 images.
Class B has 654 images.
Class C has 108 images.
Class Z has 291 images.
Class Q has 503 images.
Class I has 320 images.
Class K has 584 images.
Class O has 38 images.
Class E has 269 images.
Class L has 620 images.
Class A has 405 images.
Class F has 696 images.
Class V has 622 images.
Class H has 542 images.
Class G has 372 images.
Class R has 397 images.
Class X has 165 images.
Class T has 215 images.
Class S has 220 images.
Class J has 161 images.
Class M has 113 images.


In [None]:
import os
import shutil
import random
from collections import defaultdict

# Source folders of the unsplit dataset.
base_dir = "/home/jovyan/yolo/datasets/ASL_test"
input_images_dir = os.path.join(base_dir, "data/images")
input_labels_dir = os.path.join(base_dir, "data/labels")

# Target layout: <base_dir>/<split>/images and <base_dir>/<split>/labels.
output_dirs = {
    split: {
        kind: os.path.join(base_dir, f"{split}/{kind}")
        for kind in ("images", "labels")
    }
    for split in ("train", "val", "test")
}

# Make sure every target folder exists before files are moved into it.
for split, folders in output_dirs.items():
    os.makedirs(folders["images"], exist_ok=True)
    os.makedirs(folders["labels"], exist_ok=True)

# Fraction of each class that goes into each split.
split_ratios = dict(train=0.8, val=0.1, test=0.1)

# Collect image file names per class prefix (e.g., A_, B_, ...) so each
# class can be split on its own.
class_groups = defaultdict(list)
for fname in os.listdir(input_images_dir):
    if not fname.endswith((".jpg", ".jpeg")):
        continue
    class_prefix = fname.partition("_")[0]  # text before the first "_", e.g. "A" for A_image1.jpg
    class_groups[class_prefix].append(fname)

# Split every class into train/val/test and move its images and labels.
# A dedicated, seeded generator makes the split reproducible.
split_rng = random.Random(42)
missing_labels = 0

for class_name in sorted(class_groups):
    # os.listdir returns names in arbitrary order, so sort before shuffling;
    # otherwise the seed alone would not determine the split.
    image_list = sorted(class_groups[class_name])
    split_rng.shuffle(image_list)
    total = len(image_list)

    # Truncating boundaries: whatever is left after train and val goes to test.
    train_end = int(split_ratios["train"] * total)
    val_end = train_end + int(split_ratios["val"] * total)

    split_map = {
        "train": image_list[:train_end],
        "val": image_list[train_end:val_end],
        "test": image_list[val_end:]
    }

    for split, images in split_map.items():
        for img_name in images:
            # Move image
            src_img = os.path.join(input_images_dir, img_name)
            dst_img = os.path.join(output_dirs[split]["images"], img_name)
            shutil.move(src_img, dst_img)

            # Move label. Derive the name from the stem so ".jpeg" images map
            # to "<stem>.txt" too; str.replace(".jpg", ".txt") leaves them unchanged.
            label_name = os.path.splitext(img_name)[0] + ".txt"
            src_label = os.path.join(input_labels_dir, label_name)
            dst_label = os.path.join(output_dirs[split]["labels"], label_name)
            if os.path.exists(src_label):
                shutil.move(src_label, dst_label)
            else:
                missing_labels += 1

# Surface images that ended up without a label instead of skipping them silently.
if missing_labels:
    print(f"Warning: {missing_labels} images were moved without a matching label file.")

print("Dataset split completed.")

Dataset split completed.


# Train YOLOv11 on the new dataset to predict the letter and its bounding box

In [None]:
from ultralytics import YOLO

# Start from the yolo11n.pt weights.
model = YOLO("yolo11n.pt")

# Train on the split dataset described by data.yaml.
results = model.train(
    data="/home/jovyan/yolo/datasets/ASL_test/data.yaml",
    epochs=50,
    batch=256,
    imgsz=640,
)

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 57.2MB/s]


Ultralytics 8.3.134 🚀 Python-3.10.12 torch-2.7.0+cu128 CUDA:0 (NVIDIA A100-PCIE-40GB, 40442MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=256, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/home/jovyan/yolo/datasets/ASL_test/data.yaml, degrees=0.0, deterministic=True, device=4, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspecti

[34m[1mtrain: [0mScanning /home/jovyan/yolo/datasets/ASL_test/train/labels... 119040 images, 8025 backgrounds, 0 corrupt: 100%|██████████| 127065/127065 [01:47<00:00, 1177.72it/s]


[34m[1mtrain: [0mNew cache created: /home/jovyan/yolo/datasets/ASL_test/train/labels.cache
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 425.6±195.8 MB/s, size: 12.1 KB)


[34m[1mval: [0mScanning /home/jovyan/yolo/datasets/ASL_test/val/labels... 14871 images, 994 backgrounds, 0 corrupt: 100%|██████████| 15865/15865 [00:15<00:00, 1007.95it/s]


[34m[1mval: [0mNew cache created: /home/jovyan/yolo/datasets/ASL_test/val/labels.cache
Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.002), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 50 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/50      37.4G      1.505      4.632      1.805        523        640:   2%|▏         | 11/497 [00:55<30:44,  3.80s/it] 

# Test on Test-Dataset

In [2]:
# Evaluate the model on the test set 
from ultralytics import YOLO

# Best weights from the detect_asl training run.
model_path = "/home/jovyan/yolo/runs/detect_asl/train/weights/best.pt"
model = YOLO(model_path)

# Validate against the "test" split declared in data.yaml.
results = model.val(
    data="/home/jovyan/yolo/datasets/ASL_test/data.yaml",
    split="test",
)

Ultralytics 8.3.134 🚀 Python-3.10.12 torch-2.7.0+cu128 CUDA:0 (NVIDIA A100-PCIE-40GB, 40442MiB)
YOLO11n summary (fused): 100 layers, 2,587,222 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 517.1±81.7 MB/s, size: 13.3 KB)


[34m[1mval: [0mScanning /home/jovyan/yolo/datasets/ASL_test/test/labels... 14904 images, 1025 backgrounds, 0 corrupt: 100%|██████████| 15929/15929 [00:13<00:00, 1224.85it/s]


[34m[1mval: [0mNew cache created: /home/jovyan/yolo/datasets/ASL_test/test/labels.cache


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 996/996 [00:58<00:00, 17.13it/s]


                   all      15929      14904      0.998      0.997      0.995      0.964
                     A        567        567      0.998      0.998      0.995      0.974
                     B        701        701          1          1      0.995      0.981
                     C        581        581      0.992       0.99      0.994      0.957
                     D        530        530      0.996      0.995      0.995      0.954
                     E        593        593      0.999      0.997      0.995      0.978
                     F        684        684      0.997      0.997      0.995      0.979
                     G        607        607      0.998      0.997      0.994       0.93
                     H        554        554      0.997      0.996      0.995      0.962
                     I        564        564      0.993      0.996      0.995      0.969
                     J        578        578          1      0.995      0.995      0.966
                     

# App.py on KI-Server

Running YOLO training inside a Jupyter notebook does not work very well: in Jupyter or IPython, memory leaks are much more common because of how the kernel retains references to objects. Use standalone .py scripts if possible.

In [8]:
%%writefile /home/jovyan/yolo/yolo.py
from ultralytics import YOLO

# Load the last.pt checkpoint from the earlier detect_asl/train2 run.
model = YOLO("/home/jovyan/yolo/runs/detect_asl/train2/weights/last.pt")

# Train for a few more epochs; the run is stored under the detect_asl project.
results = model.train(
    data="/home/jovyan/yolo/datasets/ASL_test/data.yaml",
    epochs=5,
    batch=128,
    imgsz=640,
    project="runs/detect_asl",
)


Overwriting /home/jovyan/yolo/yolo.py
