In [1]:
import os
import sys
import json
import base64
import pandas as pd
import cv2
import yaml
import shutil
import albumentations as A
from matplotlib import pyplot as plt
from pathlib import Path
from ultralytics import YOLOWorld

In [2]:
import re
import nltk
import itertools
# Make sure the NLTK stopword corpus is available locally; clean_caption and
# create_inferior_labels below read it via nltk.corpus.stopwords.
nltk.download('stopwords')

def clean_caption(caption):
    """Strip punctuation and English stopwords from a caption.

    Args:
        caption: Free-text caption, e.g. ``"blue, yellow, and white cargo aircraft"``.

    Returns:
        The remaining words joined by single spaces, in their original order
        and original letter case. Returns an empty string when nothing is left.
    """
    # Drop every character that is neither a word character nor whitespace.
    caption = re.sub(r'[^\w\s]', '', caption)
    # Load the stopword list once per call (not once per word) and use a set
    # so each membership test is O(1).
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    # Stopword matching is case-insensitive; the kept words keep their case.
    return ' '.join(word for word in caption.split() if word.lower() not in stopwords)

def create_inferior_labels(caption):
    """Generate progressively less specific variants of a label.

    Punctuation and English stopwords are removed first. Every non-empty
    order-preserving combination of the remaining words is then emitted,
    longest first. For example, "red cargo airplane" yields
    "red cargo airplane", "red cargo", "red airplane", "cargo airplane",
    "red", "cargo" and "airplane".

    Args:
        caption: Free-text caption describing an object.

    Returns:
        List of label strings, ordered from the full cleaned caption down to
        single words. Empty list when no words survive the cleaning.
    """
    # Remove punctuation, then stopwords (case-insensitive match).
    caption = re.sub(r'[^\w\s]', '', caption)
    # Load the stopword list once per call instead of once per word.
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    word_list = [word for word in caption.split() if word.lower() not in stopwords]

    # itertools.combinations keeps the original word order inside each subset.
    return [
        ' '.join(subset)
        for size in range(len(word_list), 0, -1)
        for subset in itertools.combinations(word_list, size)
    ]

# print(create_inferior_labels('blue, yellow, and white cargo aircraft'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zedon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Convert data/vlm.jsonl (one JSON record per image, with pixel-space
# [x, y, w, h] boxes and free-text captions) into a YOLO dataset:
#   data/yolo/images/{train,val}/<image>
#   data/yolo/labels/{train,val}/<image stem>.txt
#   data/yolo/dataset_open.yaml
# Each distinct cleaned caption becomes one class; the first TRAIN_FRACTION of
# the records (in file order) go to "train", the remainder to "val".
input_dir = Path("data")
tables = []
counter = 0
TRAIN_FRACTION = 0.8
annotation_path = input_dir / "vlm.jsonl"

# Derive the split point from the actual number of non-blank records instead
# of hard-coding the dataset size.
with open(annotation_path, "r", encoding="utf-8") as f:
    num_records = sum(1 for line in f if line.strip())
num_train_files = int(num_records * TRAIN_FRACTION)

yolo_root = Path("data/yolo")

image_dir = yolo_root / "images"
label_dir = yolo_root / "labels"

image_train_dir = image_dir / "train"
image_val_dir = image_dir / "val"
label_train_dir = label_dir / "train"
label_val_dir = label_dir / "val"

# Start from a clean slate so stale images/labels from earlier runs cannot leak in.
shutil.rmtree(image_dir, ignore_errors=True)
shutil.rmtree(label_dir, ignore_errors=True)
for split_dir in (image_train_dir, image_val_dir, label_train_dir, label_val_dir):
    split_dir.mkdir(parents=True, exist_ok=True)

# `labels` keeps class names in class-id order (written to the YAML below);
# `label_to_index` gives O(1) caption -> class-id lookup.
labels = []
label_to_index = {}

# Offline augmentation pipeline. GridDistortion(p=0.0) never fires, so the
# pipeline is currently a pass-through apart from the bbox min_visibility filter;
# uncomment entries to enable them.
tranform = A.Compose([
    A.GridDistortion(p=0.0),
    # A.RandomCrop(width=640, height=640, p=0.5),
    # A.ShiftScaleRotate(p=1),
    # A.HorizontalFlip(p=0.5),
    # A.VerticalFlip(p=0.5),
    # A.RandomBrightnessContrast(p=1),
    # A.HueSaturationValue(p=1),
    # A.AdvancedBlur(blur_limit=3, p=0.5),
    # A.RandomRain(p=0.5),
    # A.GaussNoise(p=1, var_limit=(10.0, 150.0), per_channel=True),
    # A.ISONoise(p=1, intensity=(0.1, 0.5), color_shift=(0.01, 0.05)),
], bbox_params=A.BboxParams(format='yolo', label_fields=['captions'], min_visibility=0.2))

with open(annotation_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip() == "":
            continue

        sub_dir = "train" if counter < num_train_files else "val"

        instance = json.loads(line.strip())
        image_path = input_dir / "images" / instance["image"]
        image = cv2.imread(str(image_path))
        # cv2.imread signals failure by returning None rather than raising.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img_h, img_w, _ = image.shape

        bboxes = []
        captions = []
        for annotation in instance["annotations"]:
            caption = clean_caption(annotation["caption"]) #'/'.join(create_inferior_labels(annotation["caption"]))
            x, y, w, h = annotation["bbox"]
            if caption not in label_to_index:
                label_to_index[caption] = len(labels)
                labels.append(caption)
            captions.append(caption)
            # Pixel-space top-left (x, y, w, h) -> normalized centre (cx, cy, w, h).
            yolo_x = (x + w/2) / img_w
            yolo_y = (y + h/2) / img_h
            yolo_w = w / img_w
            yolo_h = h / img_h
            bboxes.append([yolo_x, yolo_y, yolo_w, yolo_h])

        transformed = tranform(image=image, captions=captions, bboxes=bboxes)
        image = transformed["image"]
        bboxes = transformed["bboxes"]
        captions = transformed["captions"]

        cv2.imwrite(str(image_dir / sub_dir / instance["image"]), cv2.cvtColor(image, cv2.COLOR_RGB2BGR))

        # Swap only the file extension; a plain str.replace("jpg", "txt") would
        # also rewrite "jpg" inside the stem and ignore other extensions.
        label_path = label_dir / sub_dir / Path(instance["image"]).with_suffix(".txt")
        with open(label_path, "w") as label_file:
            for caption, bbox in zip(captions, bboxes):
                yolo_x, yolo_y, yolo_w, yolo_h = bbox
                label_file.write(f"{label_to_index[caption]} {yolo_x} {yolo_y} {yolo_w} {yolo_h}\n")

        counter += 1

# write yaml file
with open(yolo_root / "dataset_open.yaml", "w") as f:
    yaml.dump({
        "names": dict(enumerate(labels)),
        "val": "images/val",
        "train": "images/train",
        "path": os.path.abspath(yolo_root),
    }, f)

In [5]:
from ultralytics.data.augment import Albumentations
from ultralytics.utils import LOGGER, colorstr

def __init__(self, p=1.0):
    """Replacement constructor for Ultralytics' ``Albumentations`` wrapper.

    Builds a custom albumentations pipeline (flips, shift/scale/rotate,
    brightness/contrast, blur, Gaussian noise, CLAHE, JPEG compression) with
    YOLO-format bounding boxes and stores it on ``self.transform``.

    Args:
        p: Stored on ``self.p``; not otherwise used in this constructor.

    If albumentations cannot be imported, ``self.transform`` stays ``None``.
    Any other failure while building the pipeline is logged, not raised.
    """
    self.p = p
    self.transform = None
    prefix = colorstr("albumentations: ")
    try:
        import albumentations as A

        # Augmentations applied during training; edit this list to tune them.
        pipeline = [
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.ShiftScaleRotate(p=1),
            A.RandomBrightnessContrast(p=1),
            A.AdvancedBlur(blur_limit=3, p=0.2),
            A.GaussNoise(p=1, var_limit=(10.0, 300.0), per_channel=True),
            A.CLAHE(p=0.5),
            A.ImageCompression(quality_lower=75, p=0.2),
        ]

        self.transform = A.Compose(
            pipeline,
            bbox_params=A.BboxParams(format="yolo", label_fields=["class_labels"]),
        )
        # Log only the transforms that can actually fire (non-zero probability).
        enabled = [
            f"{step}".replace("always_apply=False, ", "")
            for step in pipeline
            if step.p
        ]
        LOGGER.info(prefix + ", ".join(enabled))
    except ImportError:
        # Albumentations package not installed, skip
        pass
    except Exception as e:
        LOGGER.info(f"{prefix}{e}")


# Monkeypatch: every Albumentations instance created from here on uses the
# constructor defined above.
Albumentations.__init__ = __init__

In [6]:
# Load YOLO-World weights from the checkpoint file 'vlm_album_large.pt'.
# NOTE(review): this is not a stock Ultralytics model name, so it is presumably
# a locally fine-tuned checkpoint that must already exist in the working
# directory (the trailing comment lists an alternative path) — confirm.
model = YOLOWorld('vlm_album_large.pt') # 'data/yolo/vlm_yolo_album_large/weights/best.pt'

In [7]:
# Fine-tune the loaded model on the dataset generated above.
train_args = dict(
    data="data/yolo/dataset_open.yaml",
    epochs=10,
    batch=8,
    imgsz=640,
    device=0,
    save_period=10,
    # project/name select the run directory data/yolo/vlm_album_large;
    # exist_ok=True reuses that directory instead of creating a new one.
    project="data/yolo",
    name="vlm_album_large",
    exist_ok=True,
    verbose=True,
    seed=0,
    # NOTE(review): patience equals epochs, so early stopping presumably
    # cannot trigger before the final epoch — confirm this is intended.
    patience=10,
)
model.train(**train_args)
    


New https://pypi.org/project/ultralytics/8.2.18 available  Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.2.14  Python-3.10.11 torch-2.3.0+cu118 CUDA:0 (NVIDIA GeForce RTX 4060 Laptop GPU, 8188MiB)
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=vlm_album_large.pt, data=data/yolo/dataset_open.yaml, epochs=10, time=None, patience=10, batch=8, imgsz=640, save=True, save_period=10, cache=False, device=0, workers=8, project=data/yolo, name=vlm_album_large, exist_ok=True, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=Non

  return F.conv2d(input, weight, bias, self.stride,


[34m[1mAMP: [0mchecks passed 


[34m[1mtrain: [0mScanning C:\Users\zedon\Documents\GitHub\til-24-overflow\data\yolo\labels\train... 4085 images, 0 backgrounds, 0 corrupt: 100%|██████████| 4085/4085 [00:08<00:00, 475.85it/s]


[34m[1mtrain: [0mNew cache created: C:\Users\zedon\Documents\GitHub\til-24-overflow\data\yolo\labels\train.cache
[34m[1malbumentations: [0mHorizontalFlip(p=0.5), VerticalFlip(p=0.5), ShiftScaleRotate(p=1.0, shift_limit_x=(-0.0625, 0.0625), shift_limit_y=(-0.0625, 0.0625), scale_limit=(-0.09999999999999998, 0.10000000000000009), rotate_limit=(-45, 45), interpolation=1, border_mode=4, value=0.0, mask_value=0.0, rotate_method='largest_box'), RandomBrightnessContrast(p=1.0, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), brightness_by_max=True), AdvancedBlur(p=0.2, blur_limit=(3, 3), sigma_x_limit=(0.2, 1.0), sigma_y_limit=(0.2, 1.0), rotate_limit=(-90, 90), beta_limit=(0.5, 8.0), noise_limit=(0.9, 1.1)), GaussNoise(p=0.2, var_limit=(10.0, 150.0), per_channel=True, mean=0.0), CLAHE(p=0.5, clip_limit=(1, 4.0), tile_grid_size=(8, 8)), ImageCompression(p=0.2, quality_range=(75, 100), compression_type=0)


[34m[1mval: [0mScanning C:\Users\zedon\Documents\GitHub\til-24-overflow\data\yolo\labels\val... 1022 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1022/1022 [00:02<00:00, 506.65it/s]

[34m[1mval: [0mNew cache created: C:\Users\zedon\Documents\GitHub\til-24-overflow\data\yolo\labels\val.cache





Plotting labels to data\yolo\vlm_album_large\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=7.7e-05, momentum=0.9) with parameter groups 104 weight(decay=0.0), 115 weight(decay=0.0005), 121 bias(decay=0.0)


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mdata\yolo\vlm_album_large[0m
Starting training for 10 epochs...
Closing dataloader mosaic
[34m[1malbumentations: [0mHorizontalFlip(p=0.5), VerticalFlip(p=0.5), ShiftScaleRotate(p=1.0, shift_limit_x=(-0.0625, 0.0625), shift_limit_y=(-0.0625, 0.0625), scale_limit=(-0.09999999999999998, 0.10000000000000009), rotate_limit=(-45, 45), interpolation=1, border_mode=4, value=0.0, mask_value=0.0, rotate_method='largest_box'), RandomBrightnessContrast(p=1.0, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), brightness_by_max=True), AdvancedBlur(p=0.2, blur_limit=(3, 3), sigma_x_limit=(0.2, 1.0), sigma_y_limit=(0.2, 1.0), rotate_limit=(-90, 90), beta_limit=(0.5, 8.0), noise_limit=(0.9, 1.1)), GaussNoise(p=0.2, var_limit=(10.0, 150.0), per_channel=True, mean=0.0), CLAHE(p=0.5, clip_limit=(1, 4.0), tile_grid_size=(8, 8)), ImageCompression(p=0.2, quality_range=(75, 100), compression_type=0)

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size

       1/10      6.35G       1.19      0.939     0.9731         23        640: 100%|██████████| 511/511 [07:12<00:00,  1.18it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 64/64 [00:11<00:00,  5.38it/s]

                   all       1022       5702      0.852      0.816      0.874      0.555






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/10      6.48G      1.149     0.9033     0.9574         46        640:  38%|███▊      | 193/511 [02:40<04:25,  1.20it/s]


KeyboardInterrupt: 

In [3]:
# Evaluate the currently loaded model on the validation split declared in
# dataset_open.yaml; the returned metrics object is the cell's output.
val_args = dict(
    data="data/yolo/dataset_open.yaml",
    batch=8,
    imgsz=640,
    device=0,
    verbose=True,
    # Same project/name as the training run, with exist_ok=True so the
    # existing directory is reused.
    project="data/yolo",
    name="vlm_album_large",
    exist_ok=True,
)
model.val(**val_args)

Ultralytics YOLOv8.2.14  Python-3.10.11 torch-2.3.0+cu118 CUDA:0 (NVIDIA GeForce RTX 4060 Laptop GPU, 8188MiB)
YOLOv8l-worldv2 summary (fused): 295 layers, 46807922 parameters, 0 gradients, 220.8 GFLOPs


[34m[1mval: [0mScanning C:\Users\zedon\Documents\GitHub\til-24-overflow\data\yolo\labels\val.cache... 1022 images, 24 backgrounds, 0 corrupt: 100%|██████████| 1022/1022 [00:00<?, ?it/s]
  return F.conv2d(input, weight, bias, self.stride,
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 128/128 [00:34<00:00,  3.67it/s]


                   all       1022       4210      0.773      0.656      0.724      0.484
          grey missile       1022         26      0.406      0.269      0.373      0.238
red white blue light aircraft       1022         32       0.93      0.832      0.886      0.567
   green black missile       1022         40      0.858        0.7      0.781      0.581
  white red helicopter       1022         33      0.875      0.638      0.723      0.438
grey camouflage fighter jet       1022         33      0.483      0.424      0.376       0.29
grey white fighter plane       1022         42      0.907      0.738      0.827      0.611
     white black drone       1022         28      0.762       0.75      0.802       0.54
white black fighter jet       1022         37      0.899       0.72      0.841      0.642
         white missile       1022         41      0.574      0.395      0.413      0.234
black white commercial aircraft       1022         28      0.769      0.714      0.775      0.5

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,
        62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
       124, 125])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x000002089C8A8A90>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([      