In [1]:
import os
import sys
import json
import base64
import pandas as pd
import cv2
import yaml
import shutil
import albumentations as A
from matplotlib import pyplot as plt
from pathlib import Path
from ultralytics import YOLOWorld

INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.8 (you have 1.4.7). Upgrade using: pip install --upgrade albumentations


In [2]:
import re
import nltk
import itertools
# Make sure the NLTK stopword corpus is available locally; the caption helpers below read it.
nltk.download('stopwords')

def clean_caption(caption, stopwords=None):
    """Strip punctuation and stopwords from a caption.

    Args:
        caption: Raw caption text.
        stopwords: Optional collection of lowercase words to drop. When omitted,
            NLTK's English stopword list is used.

    Returns:
        The surviving words, in their original order and casing, joined by
        single spaces. An empty string if nothing survives.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # Resolve the stopword collection once per call and use a set, instead of
    # re-evaluating the corpus lookup (and scanning a list) for every word.
    stopwords = frozenset(stopwords)

    # Drop everything that is neither a word character nor whitespace
    caption = re.sub(r'[^\w\s]', '', caption)
    return ' '.join(word for word in caption.split() if word.lower() not in stopwords)

def create_inferior_labels(caption, stopwords=None):
    """Generate progressively less specific variants of a label.

    Punctuation and stopwords are removed, then every non-empty subset of the
    remaining words is emitted with the original word order preserved. For
    example, "red cargo airplane" yields "red cargo airplane", "red cargo",
    "red airplane", "cargo airplane", "red", "cargo" and "airplane".

    Args:
        caption: Raw caption text.
        stopwords: Optional collection of lowercase words to drop. When omitted,
            NLTK's English stopword list is used.

    Returns:
        A list of label strings ordered from the longest variant to the
        single-word ones. A caption with n remaining words produces 2**n - 1
        labels, so the output grows quickly for long captions.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # Resolve the stopword collection once per call and use a set, instead of
    # re-evaluating the corpus lookup (and scanning a list) for every word.
    stopwords = frozenset(stopwords)

    # Remove punctuation, then stopwords
    caption = re.sub(r'[^\w\s]', '', caption)
    word_list = [word for word in caption.split() if word.lower() not in stopwords]

    # All order-preserving word subsets, largest first
    return [
        ' '.join(subset)
        for size in range(len(word_list), 0, -1)
        for subset in itertools.combinations(word_list, size)
    ]

# print(create_inferior_labels('blue, yellow, and white cargo aircraft'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gatastol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# --- Dataset locations and split bookkeeping ---------------------------------
input_dir = Path("data")
tables = []
counter = 0  # running count of files written by the augmentation loop
# Presumably 5107 source images x 0.95 train fraction x 5 augmentation passes -- TODO confirm
num_train_files = int(5107 * 0.95 *5)

yolo_root = input_dir / "yolo"

image_dir, label_dir = yolo_root / "images", yolo_root / "labels"
image_train_dir, image_val_dir = image_dir / "train", image_dir / "val"
label_train_dir, label_val_dir = label_dir / "train", label_dir / "val"

# Wipe any previously generated images/labels, then recreate the empty split folders
for stale_dir in (image_dir, label_dir):
    shutil.rmtree(stale_dir, ignore_errors=True)
for split_dir in (image_train_dir, image_val_dir, label_train_dir, label_val_dir):
    split_dir.mkdir(parents=True, exist_ok=True)

# Class names in order of first appearance; a caption's position is its class id
labels = []

# Augmentation pipeline applied to each image (and its YOLO-format boxes) before it is
# written to the generated dataset. Transforms with p=1 always run; the rest are sampled.
tranform = A.Compose([
    # Geometry
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.ShiftScaleRotate(p=1, border_mode=cv2.BORDER_CONSTANT, value=0),  # pad exposed borders with black
    # Photometric changes and sensor-style noise
    A.RandomBrightnessContrast(p=1),
    A.Blur(blur_limit=3, p=0.2),
    A.GaussNoise(p=1, var_limit=(0.0, 400.0), per_channel=True),
    A.CLAHE(p=0.5),
    A.ImageCompression(quality_lower=75, p=0.2),
    # Weather effects
    A.RandomRain(p=0.1),
    A.RandomFog(p=0.1),
    # MultiplicativeNoise has no `var_limit` option (that is a GaussNoise parameter); its
    # strength is set through `multiplier`, which is left at the library default here.
    A.MultiplicativeNoise(p=1, per_channel=True),
    A.RandomSunFlare(p=0.1),
    A.GridDistortion(p=0.2),
], bbox_params=A.BboxParams(
    format='yolo',
    # `captions` is passed alongside `bboxes` so a caption is dropped together with its box
    label_fields=['captions'],
    min_visibility=0.2,
))

# Parse the annotation file once; every augmentation pass reuses the same records.
with open(input_dir / "vlm.jsonl", "r") as f:
    instances = [json.loads(line) for line in f if line.strip()]

augmentation_passes = range(3, 8)

# `num_train_files` counts training files over all passes. Dividing it by the number of
# passes gives a cut-off on the source-image index, so every augmented copy of a given
# source image lands in the same split. Comparing the running `counter` against
# `num_train_files` instead would send whole passes to train and fill val with augmented
# copies of images the model has already been trained on.
train_images_per_pass = num_train_files // len(augmentation_passes)

# caption -> class id, kept in sync with `labels` so lookups do not scan the list
label_ids = {name: class_id for class_id, name in enumerate(labels)}

for i in augmentation_passes:
    for source_index, instance in enumerate(instances):
        sub_dir = "train" if source_index < train_images_per_pass else "val"

        image_path = input_dir / "images" / instance["image"]
        image = cv2.imread(str(image_path))
        if image is None:
            # cv2.imread signals an unreadable/missing file by returning None
            raise FileNotFoundError(f"Could not read image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        img_h, img_w, _ = image.shape

        bboxes = []
        captions = []
        for annotation in instance["annotations"]:
            caption = clean_caption(annotation["caption"])
            if caption not in label_ids:
                label_ids[caption] = len(labels)
                labels.append(caption)
            captions.append(caption)

            # The box is treated as [left, top, width, height] in pixels and converted to
            # normalized YOLO [center_x, center_y, width, height].
            x, y, w, h = annotation["bbox"]
            bboxes.append([
                (x + w/2) / img_w,
                (y + h/2) / img_h,
                w / img_w,
                h / img_h,
            ])

        transformed = tranform(image=image, captions=captions, bboxes=bboxes)
        image = transformed["image"]
        bboxes = transformed["bboxes"]
        captions = transformed["captions"]

        # Derive output names from the stem so a non-".jpg" source name can never make the
        # label file share (and overwrite) the image file's path.
        source_name = Path(instance["image"])
        output_stem = f"{source_name.stem}-{i}"
        cv2.imwrite(
            str(image_dir / sub_dir / source_name.with_name(output_stem + ".jpg")),
            cv2.cvtColor(image, cv2.COLOR_RGB2BGR),
        )
        with open(label_dir / sub_dir / source_name.with_name(output_stem + ".txt"), "w") as label_file:
            for caption, bbox in zip(captions, bboxes):
                yolo_x, yolo_y, yolo_w, yolo_h = bbox
                label_file.write(f"{label_ids[caption]} {yolo_x} {yolo_y} {yolo_w} {yolo_h}\n")

        counter += 1
        
# Describe the generated dataset for the trainer: class-id -> name mapping, the split
# folders, and the absolute dataset root they are given relative to.
class_names = {class_id: name for class_id, name in enumerate(labels)}
dataset_config = {
    "names": class_names,
    "val": "images/val",
    "train": "images/train",
    "path": os.path.abspath(yolo_root),
}
with open(yolo_root / "dataset_open.yaml", "w") as config_file:
    yaml.dump(dataset_config, config_file)

  warn(


In [4]:
# from ultralytics.data.augment import Albumentations
# from ultralytics.utils import LOGGER, colorstr

# def __init__(self, p=1.0):
#     """Initialize the transform object for YOLO bbox formatted params."""
#     self.p = p
#     self.transform = None
#     prefix = colorstr("albumentations: ")
#     try:
#         import albumentations as A

#         # Define your desired transformations here
#         T = [
#             A.HorizontalFlip(p=0.5),
#             A.VerticalFlip(p=0.5),
#             A.ShiftScaleRotate(p=1),
#             A.RandomBrightnessContrast(p=1),
#             A.AdvancedBlur(blur_limit=3, p=0.2),
#             A.GaussNoise(p=1, var_limit=(0.0, 300.0), per_channel=True),
#             A.CLAHE(p=0.5),
#             A.ImageCompression(quality_lower=75, p=0.2),
#             A.RandomRain(p=0.1),
#             A.RandomFog(p=0.1),
#             A.MultiplicativeNoise(p=1, var_limit=(0.0, 300.0), per_channel=True),
#             A.RandomSunFlare(p=0.1),
#             A.GridDistortion(p=0.2)
#         ]

#         self.transform = A.Compose(T, bbox_params=A.BboxParams(format="yolo", label_fields=["class_labels"]))
#         LOGGER.info(prefix + ", ".join(f"{x}".replace("always_apply=False, ", "") for x in T if x.p))
#     except ImportError:
#         # Albumentations package not installed, skip
#         pass
#     except Exception as e:
#         LOGGER.info(f"{prefix}{e}")

# Albumentations.__init__ = __init__

In [2]:
# Load YOLO-World weights from a local checkpoint path. The path lies inside the project/name
# directory used by the training cell, so this presumably continues from an earlier run -- TODO confirm.
model = YOLOWorld('data/yolo/vlm_album_large/weights/best.pt') # 'data/yolo/vlm_yolo_album_large/weights/best.pt'

In [3]:
# Fine-tune the loaded model on the generated dataset. Results go to data/yolo/vlm_album_large;
# exist_ok=True reuses that run directory instead of creating a new one.
# Currently disabled options: save_period=1, patience=10
train_settings = {
    "data": "data/yolo/dataset_open.yaml",
    "epochs": 1,
    "batch": 8,
    "imgsz": 640,
    "device": 0,
    "project": "data/yolo",
    "name": "vlm_album_large",
    "exist_ok": True,
    "verbose": True,
    "seed": 0,
}
model.train(**train_settings)
    


New https://pypi.org/project/ultralytics/8.2.25 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.2.18 🚀 Python-3.10.12 torch-2.0.1+rocm5.4.2 CUDA:0 (AMD Radeon RX 6800 XT, 16368MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=data/yolo/vlm_album_large/weights/last.pt, data=data/yolo/dataset_open.yaml, epochs=1, time=None, patience=100, batch=8, imgsz=640, save=True, save_period=-1, cache=False, device=0, workers=8, project=data/yolo, name=vlm_album_large, exist_ok=True, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=F

[34m[1mtrain: [0mScanning /home/gatastol/Documents/GitHub/til-24-overflow/data/yolo/labels/train.cache... 38046 images, 1 backgrounds, 0 corrupt: 100%|██████████| 38046/38046 [00:00<?, ?it/s]


[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01), CLAHE(p=0.01, clip_limit=(1, 4.0), tile_grid_size=(8, 8))


[34m[1mval: [0mScanning /home/gatastol/Documents/GitHub/til-24-overflow/data/yolo/labels/val.cache... 2810 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2810/2810 [00:00<?, ?it/s]


Plotting labels to data/yolo/vlm_album_large/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=7.7e-05, momentum=0.9) with parameter groups 104 weight(decay=0.0), 115 weight(decay=0.0005), 121 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mdata/yolo/vlm_album_large[0m
Starting training for 1 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/1      5.98G      1.214      1.044       0.97         71        640:   9%|▉         | 442/4756 [04:29<43:48,  1.64it/s]

In [3]:
# test model on validation set
model.val(
    data="data/yolo/dataset_open.yaml", 
    batch=8, 
    imgsz=640, 
    device=0, 
    verbose=True,
    project="data/yolo",
    name="vlm_album_large",
    exist_ok=True,
)

Ultralytics YOLOv8.2.18 🚀 Python-3.10.12 torch-2.0.1+rocm5.4.2 CUDA:0 (AMD Radeon Graphics, 512MiB)


: 