In [None]:
#@title Imports
import transformers
from datasets import load_dataset
import torch
import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [None]:
#@title Training Pipeline
# Load the license-plate detection dataset by its Hugging Face repository id.
dataset = load_dataset("justjuu/license-plate-detection")
# Bare expression so the notebook renders a summary of the loaded object.
dataset

In [None]:
# Feature object describing the "category" field of the training split's objects.
categories = dataset["train"].features["objects"]["category"]

# Two-way lookup tables between integer class ids and class names.
id2label = dict(enumerate(categories.feature.names))
label2id = {name: class_id for class_id, name in id2label.items()}

# Colour used for the box and caption of each class id when plotting.
COLOR_MAP = {0: "GREEN"}

In [None]:
def visualize_random_samples(dataset, split = "train", num_sample = 3):
  """Plot randomly chosen images of a split with their bounding boxes drawn on top.

  Args:
    dataset: Mapping from split name to an indexable collection of rows. Each
      row must provide an "image" and an "objects" dict holding parallel
      "bbox" (each unpacked as x, y, w, h) and "category" lists.
    split: Name of the split to sample from.
    num_sample: Number of distinct rows to plot. Capped at the size of the
      split so that asking for more rows than exist does not raise.
  """
  # Colour used when a class id has no entry in COLOR_MAP.
  fallback_color = "RED"

  # Look the split up once instead of on every access.
  split_data = dataset[split]

  # random.sample raises ValueError when asked for more items than the
  # population holds, so never request more rows than the split contains.
  num_sample = min(num_sample, len(split_data))
  sample_idx = random.sample(range(len(split_data)), num_sample)

  for idx in sample_idx:
    sample = split_data[idx]
    image = sample["image"]
    objects = sample["objects"]

    fig, ax = plt.subplots(1, figsize = (8, 8))
    ax.imshow(image)
    ax.axis("off")

    for bbox, label in zip(objects["bbox"], objects["category"]):
      x, y, w, h = bbox
      # Fall back to a default colour rather than failing on unmapped class ids.
      color = COLOR_MAP.get(label, fallback_color)

      rect = patches.Rectangle(
          (x, y), w, h,
          linewidth = 2,
          edgecolor = color,
          facecolor = "none"
      )
      ax.add_patch(rect)

      # Caption slightly above the box's (x, y) corner, on a translucent black background.
      ax.text(x, y-5, id2label[label], color = color, fontsize = 10,bbox=dict(facecolor="black", alpha=0.5, pad=1))

In [None]:
# Preview random training images with their boxes, using the function's defaults (split="train", num_sample=3).
visualize_random_samples(dataset)

In [None]:
from transformers import AutoModelForObjectDetection, AutoImageProcessor

# Checkpoint identifier used for both the model and its image processor.
MODEL_ID = "PekingU/rtdetr_v2_r50vd"

# Load the pretrained detector with this notebook's label maps attached.
# ignore_mismatched_sizes=True is passed so loading tolerates weights whose
# shapes differ from the checkpoint (presumably the class head, given the new label set).
model = AutoModelForObjectDetection.from_pretrained(
    MODEL_ID,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Matching image processor, requesting the fast implementation.
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID, use_fast=True)

In [None]:
from dataclasses import dataclass, asdict
from typing import List, Tuple

@dataclass
class SingleCOCOAnnotation:
  """Typed record for one annotated object: image id, category id and bounding box."""
  # Identifier of the image this annotation belongs to.
  image_id: int
  # Integer class id of the annotated object.
  category_id: int
  # Bounding-box coordinates.
  # NOTE(review): presumably COCO-style [x, y, width, height]; values may be floats in practice — confirm.
  bbox: List[int]

@dataclass
class ImageAnnotation:
  """Typed record grouping every annotation that belongs to one image."""
  # Identifier of the annotated image.
  image_id: int
  # Annotations attached to this image, one entry per object.
  annotations: List[SingleCOCOAnnotation]

In [None]:
def hf_to_coco_targets(images, objects, image_id):
    """
    Convert Hugging Face object-detection samples to COCO-style
    targets required by RT-DETR v2 image processor.

    Args:
        images (List[PIL.Image]): list of images. Only used to pair each
            image with its objects; the image contents are never read.
        objects (List[dict]): list of objects dicts from dataset
            each dict contains:
                - "bbox": List[List[x, y, w, h]]
                - "category": List[int]
        image_id (int | list | tuple): identifier written into the targets.
            A single value is applied to every image in the call. A list or
            tuple supplies one id per image, in order.

    Returns:
        List[dict]: COCO-style targets (one per image), each of the form
            {"image_id": ..., "annotations": [...]} where every annotation
            holds "image_id", "bbox", "category_id", "area" and "iscrowd".

    Raises:
        ValueError: if `image_id` is a list/tuple with fewer entries than
            there are (image, objects) pairs.
    """
    # A list/tuple means "one id per image"; anything else is a shared id.
    per_image_ids = list(image_id) if isinstance(image_id, (list, tuple)) else None

    targets = []

    for index, (_, obj) in enumerate(zip(images, objects)):
        if per_image_ids is None:
            current_id = image_id
        elif index < len(per_image_ids):
            current_id = per_image_ids[index]
        else:
            raise ValueError(
                f"image_id has {len(per_image_ids)} entries but more images were provided"
            )

        annotations = [
            {
                "image_id": current_id,
                "bbox": [x, y, w, h],
                "category_id": int(category_id),
                # Box area derived from width * height.
                "area": float(w * h),
                "iscrowd": 0,
            }
            for (x, y, w, h), category_id in zip(obj["bbox"], obj["category"])
        ]

        targets.append({
            "image_id": current_id,
            "annotations": annotations
        })

    return targets

In [None]:
# random.randrange excludes the upper bound, so the index is always valid.
# (random.randint includes it and could yield len(dataset["train"]), which is out of range.)
random_idx = random.randrange(len(dataset["train"]))

# Fetch the row once and read both fields from it.
random_sample = dataset["train"][random_idx]
random_image = random_sample["image"]
random_objects = random_sample["objects"]

In [None]:
# Convert the single random sample into COCO-style targets (image_id fixed to 1) and display the result.
random_sample_coco_annotation = hf_to_coco_targets([random_image], [random_objects], 1)
random_sample_coco_annotation

In [None]:
# Run the image processor on the sample image together with its COCO-style annotations,
# requesting PyTorch tensors ("pt"), then display the processed output.
random_sample_preprocessed = image_processor.preprocess(images=random_image, annotations=random_sample_coco_annotation, return_tensors = "pt")
random_sample_preprocessed

In [None]:
def preprocess_batch(batch, image_processor=image_processor):
  """Run the image processor over a raw batch, attaching COCO-style annotations.

  Args:
    batch: Mapping with parallel "image" and "objects" sequences.
    image_processor: Object whose `preprocess` method is applied to the batch.
      Defaults to the `image_processor` that exists when this function is defined.

  Returns:
    The value returned by `image_processor.preprocess` for the batch's images
    and annotations, requested with return_tensors="pt".
  """
  pairs = list(zip(batch["image"], batch["objects"]))
  images = [image for image, _ in pairs]

  # Each image uses its position within the batch as its image_id.
  coco_annotations = []
  for image_id, (image, objects) in enumerate(pairs):
    coco_annotations += hf_to_coco_targets([image], [objects], image_id)

  return image_processor.preprocess(images = images, annotations = coco_annotations, return_tensors = "pt")

In [None]:
# Attach the preprocessing transform to each of the three splits.
for split_name in ("train", "validation", "test"):
  dataset[split_name] = dataset[split_name].with_transform(preprocess_batch)

In [None]:
from typing import List, Dict, Any

def data_collate_function(preprocessed_batch: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Combine individually preprocessed samples into one batch dictionary.

    Args:
        preprocessed_batch: A list of dictionaries, one per preprocessed sample.
            Each must contain "pixel_values" and "labels"; "pixel_mask" is optional.

    Returns:
        A dictionary with the keys:
            - "pixel_values": the samples' pixel_values tensors stacked along a new first dimension.
            - "labels": a list with each sample's labels entry, left unstacked.
            - "pixel_mask": the stacked pixel masks, present only when the
              first sample contains a "pixel_mask" key.
    """
    def _stack(key: str) -> torch.Tensor:
        # Stack one tensor field across all samples.
        return torch.stack([item[key] for item in preprocessed_batch])

    batch = {
        "pixel_values": _stack("pixel_values"),
        # Labels are per-sample objects, so they are collected in a list rather than stacked.
        "labels": [item["labels"] for item in preprocessed_batch],
    }

    # The first sample decides whether masks are included for the whole batch.
    if "pixel_mask" in preprocessed_batch[0]:
        batch["pixel_mask"] = _stack("pixel_mask")

    return batch

In [None]:
from transformers import TrainingArguments

BATCH_SIZE = 12
DATALOADER_NUM_WORKERS = 0 # 0 keeps data loading in the main process; raise it (e.g. towards os.cpu_count()) if the runtime allows

# Set number of epochs to how many laps you'd like to do over the data
NUM_EPOCHS = 3

# Setup hyperparameters for training from the DETR paper(s)
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-4
MAX_GRAD_NORM = 0.1
WARMUP_RATIO = 0.05 # learning rate warmup from 0 to learning_rate as a ratio of total steps (e.g. 0.05 = 5% of total steps)

# Mixed precision (fp16) is only requested when a CUDA device is present;
# asking for it on a CPU-only runtime makes TrainingArguments raise.
USE_FP16 = torch.cuda.is_available()

# Create directory to save models to
OUTPUT_DIR = "rtdetr_v2_r50vd-v1"
print(f"[INFO] Saving model to: {OUTPUT_DIR}")

# Create TrainingArguments to pass to Trainer
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    max_grad_norm=MAX_GRAD_NORM,
    num_train_epochs=NUM_EPOCHS,
    lr_scheduler_type="linear",
    warmup_ratio=WARMUP_RATIO,
    # warmup_steps=2000, # number of warmup steps from 0 to learning_rate (overrides warmup_ratio, found this to be too long for our dataset)
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    fp16=USE_FP16, # use mixed precision training when CUDA is available
    dataloader_num_workers=DATALOADER_NUM_WORKERS,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False, # want to minimize eval_loss (e.g. lower is better)
    report_to="none", # don't save experiments to a third party service
    push_to_hub=False,
    eval_do_concat_batches=False, # this defaults to True but we'll set it to False for our evaluation function
    # save_safetensors=False # turn this off to prevent potential checkpoint issues
)

In [None]:
from transformers import Trainer

# Wire the model, training arguments, collate function and the
# train/validation splits into a Trainer.
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collate_function,
    train_dataset = dataset['train'],
    eval_dataset = dataset['validation'],
    compute_metrics = None  # no custom metric function is supplied
)
# Run fine-tuning and keep the object returned by train().
training_results = trainer.train()

In [None]:
#@title Pushing the Model to Huggingface
# Target repository id for the upload.
HF_REPO_ID = "justjuu/rtdetr-v2-license-plate-detection"
# Overwrite the label maps stored in the model config with a single "license_plate" class (id 0).
# NOTE(review): this replaces the dataset-derived id2label/label2id passed when the model was loaded — confirm the names are meant to differ.
model.config.id2label = {0: "license_plate"}
model.config.label2id = {"license_plate": 0}
# Upload the model and the image processor to the same repository.
model.push_to_hub(HF_REPO_ID)
image_processor.push_to_hub(HF_REPO_ID)

In [None]:
from transformers import AutoModelForObjectDetection, AutoImageProcessor

# Reload the model and its image processor from the repository named by HF_REPO_ID.
model = AutoModelForObjectDetection.from_pretrained(HF_REPO_ID)
image_processor = AutoImageProcessor.from_pretrained(HF_REPO_ID)

In [None]:
from transformers import AutoModelForObjectDetection, AutoImageProcessor
from PIL import Image
import torch

# Repository id shared by the processor and the model loaded below.
inference_checkpoint = "justjuu/rtdetr-v2-license-plate-detection"

# Open the test image and convert it to RGB.
image = Image.open("/content/500_4148.jpg").convert("RGB")

processor = AutoImageProcessor.from_pretrained(inference_checkpoint)
model = AutoModelForObjectDetection.from_pretrained(inference_checkpoint)

# Prepare the image as PyTorch tensors for the model.
inputs = processor(image, return_tensors="pt")

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# (height, width) of the input image, handed to post-processing as target_sizes.
original_size = torch.tensor([(image.height, image.width)])

# Post-process the raw outputs with a 0.5 threshold.
results = processor.post_process_object_detection(
    outputs,
    target_sizes=original_size,
    threshold=0.5,
)

In [None]:
# Display the post-processed detections computed in the previous cell.
results