In [30]:
import os
def extract_data_from_xml(root_dir):
    xml_path = os.path.join(root_dir, 'words.xml') 
    tree = ET.parse(xml_path)
    root = tree.getroot()

    image_paths = []
    image_sizes = []
    image_labels = []
    bounding_boxes = []

    for image in root:
        bbs_of_image = []
        labels_of_image = []

        for bbs in image.findall('taggedRectangles'):
            for bb in bbs:
                if not bb[0].text.isalnum():
                    continue
                if 'é' in bb[0].text.lower() or 'ñ' in bb[0].text.lower():
                        continue
                
                bbs_of_image.append(
                    [
                        float(bb.attrib['x']),
                        float(bb.attrib['y']),
                        float(bb.attrib['width']),
                        float(bb.attrib['height'])
                    ]
                )
                labels_of_image.append(bb[0].text.lower())
        
        image_paths.append(image[0].text)
        image_sizes.append((int(image[1].attrib['x']), int(image[1].attrib['y'])))
        bounding_boxes.append(bbs_of_image)
        image_labels.append(labels_of_image) 

    return image_paths, image_sizes, image_labels, bounding_boxes

In [31]:
dataset_dir = "Datasets/icdar2003/SceneTrialTrain"
img_paths , img_sizes, img_labels, bboxes = extract_data_from_xml(dataset_dir)

In [32]:
def convert_to_yolov8_format(image_paths, image_sizes, bounding_boxes):

    """
    Normalize bounding box
    Parameters:
       image_paths (list): The list contains image paths
       image_size (list): The list contains image resolutions
       bounding_boxes(list): The list contains bounding box
    Returns:
       yolov8_data(list) : list contains (image_path, image_size, bbboxes)
    """
    #Declare an empty list to contain the result
    yolov8_data = []

    for image_path, image_size, bboxes in zip(image_paths, image_sizes, bounding_boxes):
        image_width, image_height = image_size

        # Declare the empty list to contain label( new format)
        yolov8_labels = []

        # Browse through  each bbox 
        for bbox in bboxes:
            x, y, w, h = bbox

            # normalize bbox
            # current format : (x_min, y_min, width, height)
            # yolo of format: (x_center, y_center, width, height)
            center_x = (x + w/2) / image_width
            center_y = (y + h/2) / image_height
            normalized_width = w / image_width
            normalized_height = h / image_height

            # set default class code = 0 because has 1 class is 'text'
            class_id = 0

            yolov8_label = f"{class_id} {center_x} {center_y} {normalized_width} {normalized_height}"
            yolov8_labels.append(yolov8_label)

        yolov8_data.append((image_path,yolov8_labels))
    return yolov8_data

In [33]:
class_labels = ['text']
yolov8_data = convert_to_yolov8_format(
    img_paths, 
    img_sizes, 
    bboxes
)

 <font face="Arial" size="6"> 4.Train, val, test split</font>


In [35]:
from sklearn.model_selection import train_test_split
seed = 0
val_size = 0.2
test_size = 0.125

train_data, test_data = train_test_split(
    yolov8_data,
    test_size = val_size,
    random_state = seed,
    shuffle = True
)

test_data, valid_data = train_test_split(
    test_data,
    test_size = test_size,
    random_state = seed,
    shuffle = True
)

<font face = "Arial" size = "6">5. Save data </font>

In [36]:
import shutil
def save_data(data, src_img_dir, save_dir):
    """
    build folder to containing data in YOLO format 
    parameters:
       data(list) : The list containing image label information
       src_img_dir(str): Path to the original data directory
       save_dir(str): Path to the orginal data directory

    """
    # create the new folder data  
    os.makedirs(save_dir, exist_ok = True)

    # create the images folder and labels folder
    os.makedirs(os.path.join(save_dir, 'images'), exist_ok = True)
    os.makedirs(os.path.join(save_dir, 'labels') , exist_ok = True)

    for image_path, yolov8_labels in data:
        # copy image from original folder to images folder
        shutil.copy(
            os.path.join(src_img_dir, image_path),
            os.path.join(save_dir, 'images')
        )
        image_name = os.path.basename(image_path)
        image_name = os.path.splitext(image_name)[0]

        with open(os.path.join(save_dir, 'labels', f"{image_name}.txt"), 'w') as f:
            for label in yolov8_labels : 
                f.write(f"{label}\n")
                

In [37]:
import os
save_yolo_data_dir = 'Datasets/yolo_data'
os.makedirs(save_yolo_data_dir, exist_ok = True)
save_train_dir = os.path.join(save_yolo_data_dir, 
                              'train')
save_val_dir = os.path.join(
    save_yolo_data_dir,
    'validation'
)

save_test_dir = os.path.join(
    save_yolo_data_dir,
    'test'
)
save_data(
    train_data, 
    dataset_dir,
    save_train_dir
)
save_data(
    test_data,
    dataset_dir,
    save_test_dir

)
save_data(
    valid_data,
    dataset_dir,
    save_val_dir
)


<font face="Arial" size="6"> 6. Create yaml file </font>

In [39]:
import yaml
data_yaml = {
    'path' : 'yolo_data',
    'train': 'train/images',
    'test' : 'test/images',
    'val' : 'validation/images',
    'nc' : 1, 
    'names' : class_labels
}
yolo_yaml_path = os.path.join(
    save_yolo_data_dir,
    'data.yml'
)

with open(yolo_yaml_path, 'w') as f:
    yaml.dump(data_yaml, f, default_flow_style = False)


<font face = 'Arial' size = '6'> 7.Training </font>

In [40]:
from ultralytics import YOLO

#load a model
model = YOLO('yolov8s.yaml').load('yolov8s.pt')

#train the model
epochs = 200
imgsz = 1024
resuls = model.train(
    data = yolo_yaml_path, 
    epochs = epochs,
    imgsz = imgsz,
    project = 'models',
    name = 'yolov8/detect/train'
)

Transferred 355/355 items from pretrained weights
Ultralytics YOLOv8.2.92  Python-3.12.4 torch-2.3.0 CPU (AMD Ryzen 5 5500U with Radeon Graphics)
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=yolov8s.yaml, data=Datasets/yolo_data\data.yml, epochs=200, time=None, patience=100, batch=16, imgsz=1024, save=True, save_period=-1, cache=False, device=None, workers=8, project=models, name=train5, exist_ok=False, pretrained=yolov8s.pt, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, sav

100%|██████████| 755k/755k [00:00<00:00, 3.99MB/s]

Overriding model.yaml nc=80 with nc=1

                   from  n    params  module                                       arguments                     
  0                  -1  1       928  ultralytics.nn.modules.conv.Conv             [3, 32, 3, 2]                 
  1                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  2                  -1  1     29056  ultralytics.nn.modules.block.C2f             [64, 64, 1, True]             
  3                  -1  1     73984  ultralytics.nn.modules.conv.Conv             [64, 128, 3, 2]               
  4                  -1  2    197632  ultralytics.nn.modules.block.C2f             [128, 128, 2, True]           
  5                  -1  1    295424  ultralytics.nn.modules.conv.Conv             [128, 256, 3, 2]              
  6                  -1  2    788480  ultralytics.nn.modules.block.C2f             [256, 256, 2, True]           
  7                  -1  1   1180672  ultralytics




 17            [-1, 12]  1         0  ultralytics.nn.modules.conv.Concat           [1]                           
 18                  -1  1    493056  ultralytics.nn.modules.block.C2f             [384, 256, 1]                 
 19                  -1  1    590336  ultralytics.nn.modules.conv.Conv             [256, 256, 3, 2]              
 20             [-1, 9]  1         0  ultralytics.nn.modules.conv.Concat           [1]                           
 21                  -1  1   1969152  ultralytics.nn.modules.block.C2f             [768, 512, 1]                 
 22        [15, 18, 21]  1   2116435  ultralytics.nn.modules.head.Detect           [1, [128, 256, 512]]          
YOLOv8s summary: 225 layers, 11,135,987 parameters, 11,135,971 gradients, 28.6 GFLOPs

Transferred 349/355 items from pretrained weights
Freezing layer 'model.22.dfl.conv.weight'


[34m[1mtrain: [0mScanning D:\NLP_CVS\Datasets\yolo_data\train\labels... 200 images, 7 backgrounds, 0 corrupt: 100%|██████████| 200/200 [00:00<00:00, 314.97it/s]

[34m[1mtrain: [0mNew cache created: D:\NLP_CVS\Datasets\yolo_data\train\labels.cache



[34m[1mval: [0mScanning D:\NLP_CVS\Datasets\yolo_data\validation\labels... 7 images, 0 backgrounds, 0 corrupt: 100%|██████████| 7/7 [00:00<00:00, 355.02it/s]

[34m[1mval: [0mNew cache created: D:\NLP_CVS\Datasets\yolo_data\validation\labels.cache





Plotting labels to models\yolov8\detect\train5\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 1024 train, 1024 val
Using 0 dataloader workers
Logging results to [1mmodels\yolov8\detect\train5[0m
Starting training for 200 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  0%|          | 0/13 [00:00<?, ?it/s]