In [None]:
import zipfile

# Extract to an absolute path so the result does not depend on the kernel's
# current working directory: the dataset paths used later in this notebook
# are rooted at /content/license_plate_dataset.
with zipfile.ZipFile('/content/Vehicle Registration Plates.v1-original-images.coco (2).zip', 'r') as archive:
    archive.extractall('/content/license_plate_dataset')

In [None]:
import os
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value, Image
from tqdm import tqdm
import json

In [None]:
# Root folder of the unpacked dataset.
DATASET_ROOT = "/content/license_plate_dataset/Vehicle Registration Plates.v1-original-images.coco"

# Annotation files and image folders sit in sibling sub-directories of the root.
ANNOTATION_DIR = DATASET_ROOT + "/annotations"
IMAGE_DIR = DATASET_ROOT + "/images"

# Names of the dataset splits processed by this notebook.
SPLITS = [
    "train",
    "valid",
    "test",
]

In [None]:
def clean_coco_json(input_json, output_json, source_category_id=1):
    """Rewrite a COCO annotation file so it holds a single ``license_plate`` class.

    The category list is replaced by one entry with id 0. Annotations whose
    ``category_id`` equals ``source_category_id`` are relabelled to 0; every
    other annotation is removed, so the output never references a category
    that is absent from ``categories``.

    Args:
        input_json: Path of the COCO JSON file to read.
        output_json: Path the cleaned JSON is written to.
        source_category_id: Category id in the input file that is mapped to
            the new id 0. Defaults to 1.

    Returns:
        None. The cleaned document is written to ``output_json`` and a
        confirmation line is printed.
    """
    with open(input_json, "r", encoding="utf-8") as f:
        coco = json.load(f)

    # 1. Fix categories (keep only license_plate as id=0)
    coco["categories"] = [{
        "id": 0,
        "name": "license_plate",
        "supercategory": "none"
    }]

    # 2. Keep only the source category, remapped to id 0. Annotations of any
    #    other category are really dropped (not merely flagged), because their
    #    category_id would no longer exist in the category list above.
    kept = []
    for ann in coco.get("annotations", []):
        if ann["category_id"] == source_category_id:
            ann["category_id"] = 0
            kept.append(ann)
    coco["annotations"] = kept

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(coco, f)

    print(f"Saved cleaned file → {output_json}")


In [None]:
# Write a "<split>_clean.json" next to every original "<split>.json".
for split_name in SPLITS:
    clean_coco_json(
        f"{ANNOTATION_DIR}/{split_name}.json",
        f"{ANNOTATION_DIR}/{split_name}_clean.json",
    )


In [None]:
def load_coco_as_hf(json_path, images_dir):
    """Build a Hugging Face ``Dataset`` from a COCO annotation file.

    Each record describes one image and has two fields:

    * ``image``: the image file path (``images_dir`` joined with the
      image's ``file_name``), stored as a string.
    * ``objects``: a dict with ``bbox`` (the annotations' ``bbox`` values,
      copied unchanged) and ``category`` (their ``category_id`` values).

    Images are skipped when the file does not exist on disk or when no
    usable annotation refers to them. Annotations carrying a truthy
    ``ignore`` field are not usable and are left out.

    Args:
        json_path: Path of the COCO JSON annotation file.
        images_dir: Directory that contains the image files.

    Returns:
        The ``Dataset`` created by ``Dataset.from_list`` from the records.
    """
    with open(json_path, "r") as f:
        coco = json.load(f)

    # Build image_id → image_info map
    images = {img["id"]: img for img in coco["images"]}

    # Build image_id → annotations map, leaving out annotations that were
    # flagged with "ignore" so their labels never reach the dataset.
    ann_map = {}
    for ann in coco["annotations"]:
        if ann.get("ignore"):
            continue
        ann_map.setdefault(ann["image_id"], []).append(ann)

    records = []

    for img_id, img_info in tqdm(images.items()):
        img_path = os.path.join(images_dir, img_info["file_name"])

        # Listed in the JSON but missing on disk: nothing to load.
        if not os.path.exists(img_path):
            continue

        # Images without any usable annotation are not exported.
        anns = ann_map.get(img_id, [])
        if not anns:
            continue

        records.append({
            "image": img_path,
            "objects": {
                "bbox": [ann["bbox"] for ann in anns],   # COCO format
                "category": [ann["category_id"] for ann in anns]
            }
        })

    return Dataset.from_list(records)


In [None]:
# Per-image list of boxes; every box is a fixed-length sequence of four float32 values.
bbox_feature = Sequence(Sequence(Value("float32"), length=4))

# Per-image list of class labels; the label set has the single name "license_plate".
category_feature = Sequence(ClassLabel(names=["license_plate"]))

# Schema the split datasets are cast to.
features = Features({
    "image": Image(),
    "objects": {
        "bbox": bbox_feature,
        "category": category_feature,
    },
})

In [None]:
# Load the cleaned annotations of each split and cast the result to the
# shared schema. The split name is both the JSON file prefix and the image
# sub-folder name; splits are processed in train, valid, test order.
train_ds, val_ds, test_ds = (
    load_coco_as_hf(
        f"{ANNOTATION_DIR}/{split_name}_clean.json",
        f"{IMAGE_DIR}/{split_name}",
    ).cast(features)
    for split_name in ("train", "valid", "test")
)

In [None]:
# Bundle the three splits; the "valid" split is exposed under the key "validation".
split_datasets = {
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
}
dataset = DatasetDict(split_datasets)

dataset

In [None]:
# Fetch one training record (index 55) and display it for inspection.
train_split = dataset["train"]
sample = train_split[55]
sample

In [None]:
import matplotlib.pyplot as plt

# Visual sanity check: show the sample image with every annotated box drawn
# on top, not just the first one, so multi-object images are fully checked.
img = sample["image"]

fig, ax = plt.subplots()
ax.imshow(img)
for x, y, w, h in sample["objects"]["bbox"]:
    # Each box is used as (x, y, width, height), which is exactly the
    # (anchor point, width, height) argument order of Rectangle.
    ax.add_patch(
        plt.Rectangle((x, y), w, h, fill=False, edgecolor="red", linewidth=2)
    )
# Trailing semicolon keeps the notebook from echoing the axis-limits tuple.
ax.axis("off");

In [None]:
# Repository the dataset is uploaded to.
HUB_REPO_ID = "justjuu/license-plate-detection"
dataset.push_to_hub(HUB_REPO_ID)

In [None]:
# Display the schema of the training split.
train_features = dataset["train"].features
train_features