In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local packages are picked up without a restart.
%load_ext autoreload
%autoreload 2

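Reference — Hugging Face documentation for `Mask2FormerImageProcessor.post_process_instance_segmentation` (see the `return_binary_maps` parameter, which is used below to get one binary mask per instance): https://huggingface.co/docs/transformers/en/model_doc/mask2former#transformers.Mask2FormerImageProcessor.post_process_instance_segmentation.return_binary_maps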

In [None]:
from pathlib import Path

import numpy as np
import tlc
import torch
import torchvision.transforms as T
from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation, Mask2FormerImageProcessor

from tlc_tools.common import infer_torch_device

In [None]:
# 3LC project / dataset / table names used when creating the table below
# (PROJECT_NAME is also used when the run is initialised).
PROJECT_NAME = "3LC Tutorials"
DATASET_NAME = "Mask2Former Example"
TABLE_NAME = "Horses"
# Hugging Face Hub checkpoint id; loaded for both the image processor and the model.
HF_MODEL_ID = "facebook/mask2former-swin-tiny-coco-instance"

In [None]:
# Load Mask2Former fine-tuned on COCO instance segmentation.
# do_rescale=False: table_map hands the processor tensors produced by T.ToTensor(),
# which are already floats in [0, 1], so the processor should not rescale them again.
# use_fast=False requests the non-fast processor class, matching the annotation below.
image_processor: Mask2FormerImageProcessor = AutoImageProcessor.from_pretrained(
    HF_MODEL_ID,
    use_fast=False,
    do_rescale=False,
)
model = Mask2FormerForUniversalSegmentation.from_pretrained(HF_MODEL_ID)

# Move the model to whichever device tlc_tools selects.
device = infer_torch_device()
# Trailing semicolon: keep the cell from printing the full module repr as its output.
model.to(device);

In [None]:
# Folder of input images (the "105.horse" category directory of Caltech-256).
# NOTE(review): machine-specific absolute path — point this at the local copy of
# the data before running.
image_folder = Path("C:/Data/Caltech256/256_ObjectCategories/105.horse")

In [None]:
# Wrap every class name from the model config in a 3LC MapElement, keyed by class id;
# used further down as the value map of the predicted "label" property.
value_map = {
    class_id: tlc.MapElement(class_name)
    for class_id, class_name in model.config.id2label.items()
}

In [None]:
# Build a 3LC table from the images found in image_folder.
# include_label_column=False: no label column is requested for the images.
# if_exists="rename": presumably an existing table with the same name is kept and
# this one is stored under a new name — confirm against the 3LC documentation.
table = tlc.Table.from_image_folder(
    image_folder,
    include_label_column=False,
    table_name=TABLE_NAME,
    dataset_name=DATASET_NAME,
    project_name=PROJECT_NAME,
    if_exists="rename",
)

In [None]:
def table_map(sample):
    """Turn one table sample into the feature dict consumed by the model.

    Args:
        sample: An image object exposing ``convert``, ``height`` and ``width``
            (assumed to be a PIL image — TODO confirm against the table's sample type).

    Returns:
        A plain dict with the image processor's outputs, where "pixel_values" has its
        leading batch axis squeezed away, plus "original_size" as a tensor
        ``[height, width]`` of the unprocessed image.
    """
    rgb_image = sample.convert("RGB")
    image_tensor = T.ToTensor()(rgb_image)

    encoded = image_processor(images=image_tensor, return_tensors="pt")

    # Copy into a plain dict first, then overwrite/add the two entries.
    features = dict(encoded)
    features.update(
        {
            # The processor returns a batch of one; drop that axis.
            "pixel_values": encoded["pixel_values"].squeeze(0),
            # Remember the source resolution so predictions can be resized back later.
            "original_size": torch.tensor([sample.height, sample.width]),
        }
    )
    return features


# Register table_map on the table; presumably 3LC then applies it to every sample
# read from the table (e.g. during metrics collection) — confirm in the 3LC docs.
table.map(table_map)

In [None]:
def collect_fn(batch, predictor_output):
    """Convert the model output for one batch into per-image instance-mask records.

    Args:
        batch: The collated batch; only ``batch["original_size"]`` is read, as an
            iterable of (height, width) pairs (written per sample by table_map).
        predictor_output: Predictor result whose ``forward`` attribute is handed to
            ``image_processor.post_process_instance_segmentation``.

    Returns:
        ``{"predicted_masks": [...]}`` with one dict per image holding
        "image_height", "image_width", "masks" (uint8 array laid out as
        height x width x num_instances) and "instance_properties" with the
        per-instance "label" ids and "scores".
    """
    original_sizes = [(int(h), int(w)) for h, w in batch["original_size"]]

    results = image_processor.post_process_instance_segmentation(
        predictor_output.forward,
        target_sizes=original_sizes,
        return_binary_maps=True,
    )

    predicted_instances = []

    for result, (height, width) in zip(results, original_sizes):
        segments_info = result["segments_info"]
        labels = [segment["label_id"] for segment in segments_info]
        scores = [segment["score"] for segment in segments_info]

        if segments_info:
            masks = result["segmentation"].cpu().numpy()
            if masks.ndim == 2:
                # Defensive: a single 2-D map becomes one instance channel.
                masks = np.expand_dims(masks, axis=2)
            else:
                # (num_instances, H, W) -> (H, W, num_instances)
                masks = masks.transpose(1, 2, 0)
            masks = masks.astype(np.uint8)
        else:
            # No instance cleared the threshold. In that case "segmentation" is not a
            # stack of binary maps (documented as None; the implementation appears to
            # leave a 2-D map filled with -1 — verify for the installed version).
            # Casting that map would fabricate one all-255 mask with no matching
            # label/score, so emit an explicitly empty mask stack instead.
            masks = np.zeros((height, width, 0), dtype=np.uint8)

        instances = {
            "image_height": height,
            "image_width": width,
            "masks": masks,
            "instance_properties": {"label": labels, "scores": scores},
        }
        predicted_instances.append(instances)

    return {"predicted_masks": predicted_instances}


# Wrap collect_fn in a 3LC metrics collector and declare the schema of the single
# column it returns, "predicted_masks": instance-segmentation masks flagged as
# predictions, each instance carrying a categorical "label" (names from value_map)
# and a "scores" value.
# NOTE(review): "scores" holds the confidence reported by post-processing but is
# declared with tlc.IoU — confirm this is the intended schema type.
metrics_collector = tlc.FunctionalMetricsCollector(
    collect_fn,
    column_schemas={
        "predicted_masks": tlc.InstanceSegmentationMasks(
            "predicted_masks",
            instance_properties_structure={
                "label": tlc.CategoricalLabel("label", value_map),
                "scores": tlc.IoU("scores"),
            },
            is_prediction=True,
        ),
    },
    compute_aggregates=False,
)

In [None]:
# Start a 3LC run in the tutorial project; the collected metrics are presumably
# attached to this run — confirm in the 3LC docs.
run = tlc.init(project_name=PROJECT_NAME, run_name="Collect Segmentation Metrics")

# Collect metrics over the table with the model as predictor, in batches of 4;
# metrics_collector supplies the "predicted_masks" column. Aggregates are disabled.
tlc.collect_metrics(
    table,
    metrics_collector,
    predictor=model,
    collect_aggregates=False,
    dataloader_args={"batch_size": 4},
)

# Mark the run as completed.
run.set_status_completed()