# **ULGEN - YOLO v8 Object Detection Model (COCO Subset)**

## 1 - Importing required libraries

In [1]:
from ultralytics import YOLO

# 2 - Loading YAML and Data

In [7]:
yaml_content = """
path: /content/datasets/coco
train: images/train2017
val: images/val2017
names:
  0: person
  1: bicycle
  2: car
  3: motorcycle
  4: airplane
  5: bus
  6: train
  7: truck
  8: boat
  9: traffic light
  10: fire hydrant
  11: stop sign
  12: parking meter
  13: bench
  14: bird
  15: cat
  16: dog
  17: horse
  18: sheep
  19: cow
  20: elephant
  21: bear
  22: zebra
  23: giraffe
  24: backpack
  25: umbrella
  26: handbag
  27: tie
  28: suitcase
  29: frisbee
  30: skis
  31: snowboard
  32: sports ball
  33: kite
  34: baseball bat
  35: baseball glove
  36: skateboard
  37: surfboard
  38: tennis racket
  39: bottle
  40: wine glass
  41: cup
  42: fork
  43: knife
  44: spoon
  45: bowl
  46: banana
  47: apple
  48: sandwich
  49: orange
  50: broccoli
  51: carrot
  52: hot dog
  53: pizza
  54: donut
  55: cake
  56: chair
  57: couch
  58: potted plant
  59: bed
  60: dining table
  61: toilet
  62: tv
  63: laptop
  64: mouse
  65: remote
  66: keyboard
  67: cell phone
  68: microwave
  69: oven
  70: toaster
  71: sink
  72: refrigerator
  73: book
  74: clock
  75: vase
  76: scissors
  77: teddy bear
  78: hair drier
  79: toothbrush
"""

with open("./datasets/coco/coco.yaml", "w") as f:
    f.write(yaml_content)

# 3 - Converting COCO labels to YOLO labels

In [8]:
import json, os, tqdm
from pathlib import Path

def coco_to_yolo(coco_json, img_dir, out_label_dir):
    os.makedirs(out_label_dir, exist_ok=True)
    data = json.load(open(coco_json))
    images = {img["id"]: img for img in data["images"]}
    cats = {cat["id"]: cat["name"] for cat in data["categories"]}
    img_to_anns = {}

    for ann in data["annotations"]:
        img_to_anns.setdefault(ann["image_id"], []).append(ann)

    # Mapping COCO categories with ID's 0...79
    cat_ids = sorted(cats.keys())
    cat_to_idx = {cid: i for i, cid in enumerate(cat_ids)}

    for img_id, meta in tqdm.tqdm(images.items()):
        width, height = meta["width"], meta["height"]
        anns = img_to_anns.get(img_id, [])
        lines = []
        for a in anns:
            x, y, bw, bh = a["bbox"]
            cx, cy = x + bw / 2, y + bh / 2

            # Normalize
            nx, ny = cx / width, cy / height
            nw, nh = bw / width, bh/ height
            cls = cat_to_idx[a["category_id"]]
            lines.append(f"{cls} {nx:.6f} {ny:.6f} {nw:.6f} {nh:.6f}")
        
        # Label filename
        stem = Path(meta["file_name"]).stem
        with open(os.path.join(out_label_dir, f"{stem}.txt"), "w", encoding="utf-8") as f:
            f.write("\n".join(lines))



In [9]:
coco_to_yolo("./datasets/coco/annotations/instances_train2017.json", "./datasets/coco/images/train2017", "./datasets/coco/labels/train2017")
coco_to_yolo("./datasets/coco/annotations/instances_val2017.json", "./datasets/coco/images/val2017", "./datasets/coco/labels/val2017")

100%|██████████| 118287/118287 [01:39<00:00, 1185.75it/s]
100%|██████████| 5000/5000 [00:03<00:00, 1316.14it/s]


# 4 - Model Training

In [2]:
model = YOLO("yolov8n.pt")
print(model)

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

In [3]:
import torch

print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

In [4]:
results = model.train(data="./datasets/coco/coco.yaml", epochs=100, imgsz=512, batch=8, fraction=0.1)

Ultralytics 8.3.202  Python-3.10.18 torch-2.8.0+cu128 CUDA:0 (NVIDIA GeForce RTX 3050 Ti Laptop GPU, 4096MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=8, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=./datasets/coco/coco.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=0.1, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=512, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train7, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=10

# 4 - Model Evaluation / Testing

In [5]:
import cv2

model = YOLO("runs/detect/train7/weights/best.pt")
cap = cv2.VideoCapture("test.mp4")

while True:
    ok, frame = cap.read()
    if not ok:
        break

    frame = cv2.resize(frame, (1280, 720))

    res = model(frame, imgsz=512, conf=0.4)[0]
    for b in res.boxes:
        x1, y1, x2, y2 = map(int, b.xyxy[0])
        cls = int(b.cls[0])
        conf = float(b.conf[0])
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f"{model.names[cls]} {conf*100:.1f}%", (x1, y1 - 6), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
    
    cv2.imshow("Detections", frame)

    if cv2.waitKey(1) & 0xFF == 27:
        break

cap.release()
cv2.destroyAllWindows()


0: 288x512 13 persons, 1 car, 1 traffic light, 23.1ms
Speed: 12.4ms preprocess, 23.1ms inference, 16.2ms postprocess per image at shape (1, 3, 288, 512)

0: 288x512 11 persons, 1 car, 1 traffic light, 22.5ms
Speed: 2.5ms preprocess, 22.5ms inference, 14.3ms postprocess per image at shape (1, 3, 288, 512)

0: 288x512 11 persons, 1 car, 1 traffic light, 19.6ms
Speed: 2.5ms preprocess, 19.6ms inference, 18.3ms postprocess per image at shape (1, 3, 288, 512)

0: 288x512 8 persons, 1 car, 1 traffic light, 20.3ms
Speed: 2.6ms preprocess, 20.3ms inference, 15.2ms postprocess per image at shape (1, 3, 288, 512)

0: 288x512 10 persons, 2 cars, 1 traffic light, 19.3ms
Speed: 2.3ms preprocess, 19.3ms inference, 19.7ms postprocess per image at shape (1, 3, 288, 512)

0: 288x512 8 persons, 1 traffic light, 20.2ms
Speed: 5.8ms preprocess, 20.2ms inference, 24.8ms postprocess per image at shape (1, 3, 288, 512)

0: 288x512 8 persons, 1 traffic light, 26.2ms
Speed: 3.4ms preprocess, 26.2ms inference,