# Model Training

First let's import the libraries that we are going to use for this tasks

In [68]:
import os
import shutil
import random
import yaml
from ultralytics import YOLO
import xml.etree.ElementTree as ET

Next, we define the path location for our images

In [13]:
image_dir = "../data/raw/may/images"   
label_dir = "../data/raw/may/labels"   
os.makedirs(label_dir, exist_ok = True)
os.makedirs(image_dir, exist_ok = True)

### Pre-Preprocessing

We want the images and the labels to have standard naming format such that the name tells you the period and the image type.

For example; "may_afternoon_0_lwir_89.xml"

First rename the images

In [18]:
prefix = "may_afternoon_"

for filename in os.listdir(image_dir):

    if filename.endswith(".jpg") and not filename.startswith(prefix):
        
        old_path = os.path.join(image_dir, filename)
        new_filename = prefix + filename
        new_path = os.path.join(image_dir, new_filename)

        os.rename(old_path, new_path)

Next rename the labels

In [16]:
for filename in os.listdir(label_dir):

    if filename.endswith(".xml") and not filename.startswith(prefix):
        
        old_path = os.path.join(label_dir, filename)
        new_filename = prefix + filename
        new_path = os.path.join(label_dir, new_filename)

        os.rename(old_path, new_path)

Each of the objects in the images are labelled as either *'ap_metal', 'ap_plastic', 'at_metal'* or , *'at_plastic'*. Let us write a code that iterates through all the labels and extract these unique classes.

In [32]:
unique_classes = set()

for filename in os.listdir(label_dir):
    
    if filename.endswith(".xml"):
        
        filepath = os.path.join(label_dir, filename)
        tree = ET.parse(filepath)
        root = tree.getroot()
        
        # Iterate over each object tag
        for obj in root.findall("object"):
            
            class_name = obj.find("name").text
            unique_classes.add(class_name)

# Convert to a sorted list
class_list = sorted(list(unique_classes))

print("Unique classes found:", class_list)

Unique classes found: ['ap_metal', 'ap_plastic', 'at_metal', 'at_plastic']


Next, we need to convert the labels into a format that is acceptable by YOLO. We achieve this by writing a function that accepts the ".xml" annotation file, it extracts the image width and height, loops over each label to find the image class, check if its known against the class list from the previous code and then convert the class names into unique index as YOLO only recognizes IDs and not names.

The function then extracts bounding box coordinates from the .xml file before converting it to YOLO format by normalizing them from 0 to 1. 

In [38]:
def convert_voc_to_yolo(xml_file):
    """
    This function reads .xml annotation file, 
    extracts bounding boxes and class names 
    before converting them to YOLO format of 
    one string per object.

    Parameters
    ----------
    xml_file : string
        The path to a Pascal VOC-style XML 
        annotation label file.

    Returns
    -------
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()
    w = int(root.find("size/width").text)
    h = int(root.find("size/height").text)
    
    yolo_lines = []
    for obj in root.findall("object"):
        
        cls = obj.find("name").text
        if cls not in class_list:
            
            continue
        cls_id = class_list.index(cls)
        xmlbox = obj.find("bndbox")
        xmin = int(xmlbox.find("xmin").text)
        ymin = int(xmlbox.find("ymin").text)
        xmax = int(xmlbox.find("xmax").text)
        ymax = int(xmlbox.find("ymax").text)

        # Convert to YOLO format
        x_center = ((xmin + xmax) / 2) / w
        y_center = ((ymin + ymax) / 2) / h
        bw = (xmax - xmin) / w
        bh = (ymax - ymin) / h
        yolo_lines.append(f"{cls_id} {x_center} {y_center} {bw} {bh}")
        
    return yolo_lines

We loop over the .xml files in label folder, convert the annotations from VOC format to YOLO using the our function and then save them as text files. 

In [55]:
text_dir = "../data/raw/may/text" 
os.makedirs(text_dir, exist_ok = True)

for xml_file in os.listdir(label_dir):
    
    if not xml_file.endswith(".xml"):
        
        continue
        
    xml_path = os.path.join(label_dir, xml_file)
    txt_path = os.path.join(text_dir, xml_file.replace(".xml", ".txt"))
    
    yolo_data = convert_voc_to_yolo(xml_path)
    with open(txt_path, "w") as f:
        
        f.write("\n".join(yolo_data))

In [64]:
output_base = "../results/may/dataset"
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1

#Shuffle the original images
images = [f for f in os.listdir(image_dir) if f.endswith((".jpg", ".png"))]
random.shuffle(images)

# Compute split indices
total = len(images)
train_end = int(total * train_ratio)
val_end = train_end + int(total * val_ratio)

# Split image filenames
split_data = {"train": images[:train_end], "val": images[train_end:val_end],
    "test": images[val_end:]}

In [None]:
# Create folder structure and copy files
for split in ["train", "val", "test"]:
    img_out_dir = os.path.join(output_base, "images", split)
    lbl_out_dir = os.path.join(output_base, "labels", split)
    os.makedirs(img_out_dir, exist_ok=True)
    os.makedirs(lbl_out_dir, exist_ok=True)

    for img_file in split_data[split]:
        # Copy image
        shutil.copy(os.path.join(image_dir, img_file), os.path.join(img_out_dir, img_file))

        # Copy corresponding label
        txt_file = os.path.splitext(img_file)[0] + ".txt"
        src_lbl = os.path.join(label_dir, txt_file)
        if os.path.exists(src_lbl):
            shutil.copy(src_lbl, os.path.join(lbl_out_dir, txt_file))
        else:
            pass #print(f"⚠️ Label not found for image: {img_file}")

In [71]:
data = {
    "path":output_base, 
    "train": os.path.join("../results/may/dataset/images/train"),
    "val": os.path.join("../results/may/dataset/images/val"),
    "test": os.path.join("../results/may/dataset/images/test"),
    "nc": len(class_list),
    "names": class_list,
}

yaml_path = os.path.join(output_base, "data.yaml")
with open(yaml_path, "w") as f:
    
    yaml.dump(data, f, default_flow_style=False)

✅ data.yaml created!


In [70]:
model = YOLO("yolov8n.pt") 

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt': 100% ━━━━━━━━━━━━ 6.2/6.2MB 3.1MB/s 2.0s0s


In [75]:
os.listdir(output_base)

['.DS_Store', 'images', 'labels', 'data.yaml']

In [77]:
yaml_file = os.path.join(output_base, "data.yaml")
print(yaml_file)

../results/may/dataset/data.yaml


In [None]:
model.train(data = yaml_path, epochs=50, imgsz=640, batch=16)

Ultralytics 8.3.186 🚀 Python-3.11.11 torch-2.8.0 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=../results/may/dataset/data.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train3, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective=0.0, plots=True, pose=12.0, pretrain

Matplotlib is building the font cache; this may take a moment.


Overriding model.yaml nc=80 with nc=4

                   from  n    params  module                                       arguments                     
  0                  -1  1       464  ultralytics.nn.modules.conv.Conv             [3, 16, 3, 2]                 
  1                  -1  1      4672  ultralytics.nn.modules.conv.Conv             [16, 32, 3, 2]                
  2                  -1  1      7360  ultralytics.nn.modules.block.C2f             [32, 32, 1, True]             
  3                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  4                  -1  2     49664  ultralytics.nn.modules.block.C2f             [64, 64, 2, True]             
  5                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  6                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  7                  -1  1    295424  ultralytics

  i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
  ret = um.true_divide(


[K       2/50         0G          0      105.2          0          0        640: 100% ━━━━━━━━━━━━ 13/13 0.20it/s 1:05s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 2/2 0.30it/s 6.6s
                   all         55          0          0          0          0          0

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
  ret = um.true_divide(


[K       3/50         0G          0      96.14          0          0        640: 100% ━━━━━━━━━━━━ 13/13 0.21it/s 1:03s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 2/2 0.30it/s 6.6s
                   all         55          0          0          0          0          0

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
  ret = um.true_divide(


[K       4/50         0G          0      89.84          0          0        640: 100% ━━━━━━━━━━━━ 13/13 0.21it/s 1:02s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 2/2 0.31it/s 6.5s
                   all         55          0          0          0          0          0

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
  ret = um.true_divide(


[K       5/50         0G          0      84.56          0          0        640: 100% ━━━━━━━━━━━━ 13/13 0.21it/s 1:02s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 2/2 0.31it/s 6.5s
                   all         55          0          0          0          0          0

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
  ret = um.true_divide(


[K       6/50         0G          0      80.36          0          0        640: 100% ━━━━━━━━━━━━ 13/13 0.21it/s 1:02s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 2/2 0.30it/s 6.6s
                   all         55          0          0          0          0          0

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
  ret = um.true_divide(


[K       7/50         0G          0      76.43          0          0        640: 100% ━━━━━━━━━━━━ 13/13 0.21it/s 1:03s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 2/2 0.31it/s 6.5s
                   all         55          0          0          0          0          0

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  i = smooth(f1_curve.mean(0), 0.1).argmax()  # max F1 index
  ret = um.true_divide(


[K       8/50         0G          0      79.43          0          0        640:  23% ━━╸───────── 3/13 0.12it/s 21.1s