В этом ноутбуке датасет загружается из Kaggle и конвертируется в формат YOLO.

## Скачивание датасета

In [33]:
import json
import os
from pathlib import Path
from collections import Counter, defaultdict
from shutil import rmtree, copy

import numpy as np
from tqdm import tqdm

In [5]:
# Kaggle API credentials: replace both placeholders with real values before running.
os.environ.update(
    {
        "KAGGLE_USERNAME": "<введите имя пользователя>",
        "KAGGLE_KEY": "<введите ключ>",
    }
)

Авторизуемся в Kaggle

In [12]:
# NOTE(review): this import is presumably kept below the cell that sets
# KAGGLE_USERNAME / KAGGLE_KEY because the kaggle package looks for credentials
# when it is imported — confirm before moving it into the top import cell.
from kaggle import api

# Authenticate the Kaggle API client.
api.authenticate()

In [13]:
def download_dataset(dataset_id: str, output_dataset_path: Path | str) -> None:
    """Download a Kaggle dataset by its id and unpack it.

    Args:
        dataset_id: Kaggle dataset identifier passed to the Kaggle API as-is.
        output_dataset_path: directory the archive is downloaded and unzipped into.
    """
    download_options = {"path": output_dataset_path, "unzip": True, "quiet": False}
    api.dataset_download_files(dataset_id, **download_options)

Скачиваем датасет

In [14]:
# Fetch the dataset archive into the local "data" directory.
download_dataset(
    dataset_id="watchman/rtsd-dataset",
    output_dataset_path="data",
)

Downloading rtsd-dataset.zip to data


100%|██████████| 17.1G/17.1G [38:57<00:00, 7.87MB/s]  





## EDA

In [28]:
# TODO: plots will definitely go here

In [44]:
# Directory the dataset was downloaded and unzipped into.
dataset_path = Path("data")

In [84]:
# Parse the training annotations from JSON.
with (dataset_path / 'train_anno.json').open() as annotation_file:
    train_anno = json.load(annotation_file)

In [None]:
# Parse the validation annotations from JSON.
with (dataset_path / 'val_anno.json').open() as annotation_file:
    val_anno = json.load(annotation_file)

In [85]:
# Lookup table: category id -> category name, taken from the training annotations.
class_names = dict(
    (category['id'], category['name']) for category in train_anno['categories']
)

In [86]:
# Number of training annotations per category name.
samples_count_by_category = Counter(
    class_names[annotation['category_id']]
    for annotation in train_anno['annotations']
)

In [87]:
# Categories with fewer training annotations than this are treated as invalid.
MIN_SAMPLES_PER_CATEGORY = 25

# category name -> annotation count, for every under-represented category
invalid_categories = {
    name: count
    for name, count in samples_count_by_category.items()
    if count < MIN_SAMPLES_PER_CATEGORY
}


In [88]:
def filter_invalid_categories(annotation: dict, invalid_categories: dict[str, int]) -> dict:
    """Remove the given categories from a COCO-style annotation dict.

    Drops the categories themselves, every annotation that refers to one of them,
    and every image that is left without any annotation afterwards.

    Args:
        annotation: dict with 'annotations', 'categories' and 'images' lists.
            It is updated in place.
        invalid_categories: category names to remove, mapped to their sample
            counts. Only the names (keys) are used; the counts are ignored.

    Returns:
        The same ``annotation`` object, after filtering.
    """
    # ``invalid_categories`` is keyed by category *name*, while annotations refer to
    # categories by numeric *id*, so translate the names into ids before filtering.
    invalid_category_ids = {
        cat['id'] for cat in annotation['categories'] if cat['name'] in invalid_categories
    }

    valid_annotations = [
        ann for ann in annotation['annotations']
        if ann['category_id'] not in invalid_category_ids
    ]
    valid_categories = [
        cat for cat in annotation['categories']
        if cat['id'] not in invalid_category_ids
    ]

    # Keep only the images that still have at least one annotation.
    valid_image_ids = {ann['image_id'] for ann in valid_annotations}
    valid_images = [img for img in annotation['images'] if img['id'] in valid_image_ids]

    annotation['annotations'] = valid_annotations
    annotation['categories'] = valid_categories
    annotation['images'] = valid_images
    return annotation

In [89]:
# Save the filtered training annotations next to the original file.
with (dataset_path / 'train_annotation.json').open('w') as output_file:
    filtered_train_anno = filter_invalid_categories(train_anno, invalid_categories)
    json.dump(filtered_train_anno, output_file)

In [90]:
# Save the filtered validation annotations next to the original file.
with (dataset_path / 'val_annotation.json').open('w') as output_file:
    filtered_val_anno = filter_invalid_categories(val_anno, invalid_categories)
    json.dump(filtered_val_anno, output_file)

Уберем невалидные лейблы из labels.txt

In [91]:
# Read labels.txt (one label per line) and keep only labels of valid categories.
with open(dataset_path / "labels.txt") as labels_file:
    labels = [
        line
        for line in labels_file.read().splitlines()
        if line not in invalid_categories
    ]

# Write the surviving labels back out, one per line.
with open(dataset_path / "valid_labels.txt", 'w') as valid_labels_file:
    valid_labels_file.writelines(f"{label}\n" for label in labels)

## Конвертация в YOLO формат

Так как датасет размечен в формате COCO, его необходимо конвертировать в формат YOLO.

In [92]:
# Root directory of the converted (YOLO) dataset.
output_dataset_path = Path('output_data')

In [93]:
# Start from an empty output directory: wipe any previous result, then recreate it.
output_dir_already_present = output_dataset_path.exists()
if output_dir_already_present:
    rmtree(output_dataset_path)
output_dataset_path.mkdir(parents=True)

In [None]:
def convert_coco_json(annotation_path: Path | str, output_dir: Path | str) -> None:
    """Конвертирует датасет в формате COCO в формат YOLO."""
    yolo_annotation_path: Path = Path(output_dir)  / annotation_path.stem / "labels"
    yolo_annotation_path.mkdir(parents=True)
    with open(annotation_path) as f:
        data = json.load(f)

    images = {'%g' % x['id']: x for x in data['images']}
    img_to_annotaitons = defaultdict(list)
    for ann in data['annotations']:
        img_to_annotaitons[ann['image_id']].append(ann)

    # Write labels file
    for img_id, anns in tqdm(img_to_annotaitons.items(), desc=f'Annotations {annotation_path}'):
        img = images['%g' % img_id]
        h, w, filename = img['height'], img['width'], img['file_name'].split('/')[1]

        bboxes = []
        for ann in anns:
            if ann['iscrowd']:
                continue
            box = np.array(ann['bbox'], dtype=np.float64)
            box[:2] += box[2:] / 2 
            box[[0, 2]] /= w 
            box[[1, 3]] /= h  
            if box[2] <= 0 or box[3] <= 0: 
                continue

            cls = ann['category_id'] - 1 
            box = [cls] + box.tolist()
            if box not in bboxes:
                bboxes.append(box)

        # Write
        with open((yolo_annotation_path  / Path(filename).name).with_suffix('.txt'), 'a') as file:
            for i in range(len(bboxes)):
                line = *(bboxes[i]),
                file.write(('%g ' * len(line)).rstrip() % line + '\n')


In [94]:
# Convert the filtered train and val annotation files, in that order.
for annotation_file in ("train_annotation.json", "val_annotation.json"):
    convert_coco_json(dataset_path / annotation_file, output_dataset_path)

Annotations data/train_annotation.json: 100%|██████████| 29479/29479 [00:02<00:00, 14217.27it/s]
Annotations data/val_annotation.json: 100%|██████████| 2715/2715 [00:00<00:00, 14624.06it/s]


Добавим изображения в датасет YOLO

In [95]:
# File stems of every generated label file, per split.
train_labels_dir = output_dataset_path / "train_annotation" / "labels"
val_labels_dir = output_dataset_path / "val_annotation" / "labels"
train_labels = [label_file.stem for label_file in train_labels_dir.iterdir()]
val_labels = [label_file.stem for label_file in val_labels_dir.iterdir()]

In [96]:
# Image directories that sit next to the label directories of each split.
val_images_path = output_dataset_path / "val_annotation" / "images"
train_images_path = output_dataset_path / "train_annotation" / "images"
for images_dir in (val_images_path, train_images_path):
    images_dir.mkdir(parents=True, exist_ok=True)

In [97]:
images_path = dataset_path / 'rtsd-frames/rtsd-frames'

# Membership tests against a list are linear scans; with one test per frame in the
# directory that adds up, so look the stems up in sets instead.
train_label_stems = set(train_labels)
val_label_stems = set(val_labels)

# Copy every frame that has a label file into the images directory of its split.
for filename in tqdm(images_path.iterdir(), desc='Image copy...'):
    if filename.stem in train_label_stems:
        copy(filename, train_images_path / filename.name)
    if filename.stem in val_label_stems:
        copy(filename, val_images_path / filename.name)

Image copy...: 179138it [01:24, 2111.16it/s]


Создадим файл traffic-sign.yaml

In [98]:
# Reload the filtered label list (one label per line).
with (dataset_path / "valid_labels.txt").open() as valid_labels_file:
    labels = valid_labels_file.read().splitlines()

In [99]:
# Write the dataset config: image directories, class count and class names.
with open(output_dataset_path / "traffic-sign.yaml", 'w') as yaml_file:
    yaml_lines = [
        f"train: {train_images_path}",
        f"val: {val_images_path}",
        f"nc: {len(labels)}",
        f"names: {labels}",
    ]
    # No trailing newline after the last entry.
    yaml_file.write("\n".join(yaml_lines))

In [100]:
# Print the generated config file to check its contents.
!cat output_data/traffic-sign.yaml

train: output_data/train_annotation/images
val: output_data/val_annotation/images
nc: 117
names: ['2_1', '1_23', '1_17', '3_24', '8_2_1', '5_20', '5_19_1', '5_16', '3_25', '6_16', '2_2', '2_4', '8_13_1', '4_2_1', '1_20_3', '1_25', '3_4', '8_3_2', '3_4_1', '4_1_6', '4_2_3', '4_1_1', '1_33', '5_15_5', '3_27', '1_15', '4_1_2_1', '6_3_1', '8_1_1', '6_7', '5_15_3', '7_3', '1_19', '6_4', '8_1_4', '1_16', '1_11_1', '6_6', '5_15_1', '7_2', '5_15_2', '7_12', '3_18', '5_6', '5_5', '7_4', '4_1_2', '8_2_2', '7_11', '1_22', '1_27', '2_3_2', '5_15_2_2', '1_8', '3_13', '2_3', '2_3_3', '7_7', '1_11', '8_13', '1_12_2', '1_20', '1_12', '3_32', '2_5', '3_1', '4_8_2', '3_20', '3_2', '5_22', '7_5', '8_4_1', '3_14', '1_2', '1_20_2', '4_1_4', '7_6', '8_3_1', '4_3', '4_1_5', '8_2_3', '8_2_4', '3_10', '4_2_2', '7_1', '3_28', '4_1_3', '5_3', '3_31', '6_2', '1_21', '3_21', '1_13', '1_14', '6_15_2', '2_6', '3_18_2', '4_1_2_2', '3_19', '8_5_4', '5_15_7', '5_14', '5_21', '1_1', '6_15_1', '8_6_4', '8_15', '3_11',