In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

Download the dataset with the Kaggle CLI before running this notebook: `kaggle competitions download -c sartorius-cell-instance-segmentation`

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pycocotools.mask as mask_utils
import tlc
from tqdm import tqdm

In [None]:
# Project, dataset and table names passed to tlc.Table.from_dict when the table
# is created further down.
PROJECT_NAME = "SEGMENTATION"
DATASET_NAME = "SARTORIUS_CELL_INSTANCE_SEGMENTATION_TRAIN"
TABLE_NAME = "initial"

In [None]:
# Root directory of the extracted dataset; the cells below read train.csv and
# train/<image id>.png from it.
# Set the SARTORIUS_DATASET_ROOT environment variable to use a different
# location. An unset or empty variable falls back to the original default path.
DATASET_ROOT = Path(os.environ.get("SARTORIUS_DATASET_ROOT") or "C:/Data/sartorius-cell-instance-segmentation")
assert DATASET_ROOT.exists(), f"Dataset root {DATASET_ROOT} does not exist"

In [None]:
# Register a URL alias that maps the token to the local train directory.
# NOTE(review): presumably this is what lets tlc.Url(...).to_relative() further
# down emit alias-based image paths instead of absolute ones — confirm against
# the tlc documentation.
tlc.register_url_alias("SARTORIUS_CELL_INSTANCE_SEGMENTATION_TRAIN", DATASET_ROOT / "train")

In [None]:
# Register a project-level alias for the same token, pointing at an S3 location
# (presumably a hosted copy of the train images — TODO confirm).
# NOTE(review): this token is identical to the one registered for the local
# train directory with tlc.register_url_alias — confirm which registration
# takes precedence when both are present.
tlc.register_project_url_alias(
    "SARTORIUS_CELL_INSTANCE_SEGMENTATION_TRAIN",
    "s3://3lc-projects/data/sartorius-cell-instance-segmentation/train",
    project=PROJECT_NAME,
    root="s3://3lc-projects",
)

In [None]:
# Load the annotation CSV. The cells below read the columns id, cell_type,
# width, height, sample_id and annotation, and treat each row as one annotation.
train_csv_file = DATASET_ROOT / "train.csv"
assert train_csv_file.exists(), f"Train CSV file {train_csv_file} does not exist"

train_df = pd.read_csv(train_csv_file)

In [None]:
# Distinct cell types in sorted order; a type's position in this list is its index.
cell_types = sorted(set(train_df["cell_type"]))
cell_types_to_index = {name: position for position, name in enumerate(cell_types)}

print(cell_types_to_index)

In [None]:
# Collect per-image metadata plus the list of instance annotations, keyed by image id
image_annotations = {}

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    image_id = row["id"]

    # First row seen for this image: store its metadata and check the file is on disk
    if image_id not in image_annotations:
        image_path = DATASET_ROOT / "train" / f"{image_id}.png"
        image_annotations[image_id] = dict(
            width=row["width"],
            height=row["height"],
            sample_id=row["sample_id"],
            image_path=image_path,
            annotations=[],
        )
        assert image_path.exists(), f"Image {image_path} does not exist"

    # Each CSV row contributes one annotation; the whitespace-separated
    # annotation string is parsed into a flat list of ints
    image_annotations[image_id]["annotations"].append(
        {
            "cell_type": row["cell_type"],
            "cell_type_index": cell_types_to_index[row["cell_type"]],
            "segmentation": [int(token) for token in row["annotation"].split()],
        }
    )

In [None]:
def starts_lengths_to_coco_rle(starts_lengths, image_height, image_width):
    """Re-encode a flat (start, length, start, length, ...) run list as a COCO RLE.

    Args:
        starts_lengths: Flat sequence of ints alternating run start and run
            length. Starts are 1-based offsets into the flattened image.
        image_height: Number of rows of the mask.
        image_width: Number of columns of the mask.

    Returns:
        The "counts" entry of the encoding produced by pycocotools'
        ``mask_utils.encode`` for the reconstructed binary mask.
    """
    values = np.array(starts_lengths, dtype=int)
    run_starts = values[0::2] - 1  # 1-based -> 0-based offsets
    run_lengths = values[1::2]

    # Paint every run into a flat mask, then fold it into (height, width)
    # using numpy's default row-by-row order.
    flat_mask = np.zeros(image_height * image_width, dtype=np.uint8)
    for begin, run in zip(run_starts, run_lengths):
        flat_mask[begin : begin + run] = 1
    mask_2d = flat_mask.reshape(image_height, image_width)

    # The encoder is handed a Fortran-ordered copy of the mask.
    return mask_utils.encode(np.asfortranarray(mask_2d))["counts"]

In [None]:
# Now image_annotations contains all annotations grouped by image_id
# We can process them further as needed:
def annotations_to_3lc_format(image_annotations):
    """Convert one image's grouped annotations into the segmentation column value.

    Args:
        image_annotations: Record for a single image with the keys
            "height" and "width" (image size) and "annotations", a list of
            dicts that each hold "cell_type_index" (int) and "segmentation"
            (flat list of ints: start, length, start, length, ...).

    Returns:
        {
            "image_height": int,
            "image_width": int,
            "rles": list[bytes],
            "instance_properties": {
                "cell_type": list[int],
            }
        }
        "rles" and "cell_type" are index-aligned: entry i of each describes
        the same instance.
    """
    height = image_annotations["height"]
    width = image_annotations["width"]

    # Single pass over the instances, encoding each mask and keeping its class index
    encoded_instances = [
        (starts_lengths_to_coco_rle(instance["segmentation"], height, width), instance["cell_type_index"])
        for instance in image_annotations["annotations"]
    ]

    return {
        "image_height": height,
        "image_width": width,
        "rles": [rle for rle, _ in encoded_instances],
        "instance_properties": {
            "cell_type": [type_index for _, type_index in encoded_instances],
        },
    }


# Build the three index-aligned table columns, one entry per image
sample_ids, image_paths, segmentations = [], [], []

for image_id, image_data in tqdm(image_annotations.items(), total=len(image_annotations)):
    sample_ids.append(image_data["sample_id"])

    # Store the image location as a relative URL string rather than an absolute path
    image_url = tlc.Url(DATASET_ROOT / "train" / f"{image_id}.png")
    image_paths.append(image_url.to_relative().to_str())

    segmentations.append(annotations_to_3lc_format(image_data))

In [None]:
# Column name -> list of per-image values (the three lists are index-aligned)
table_data = dict(sample_id=sample_ids, image=image_paths, segmentations=segmentations)

# Explicit schemas for the image and segmentation columns; "sample_id" gets none here
image_schema = tlc.PILImage("image")
cell_type_label = tlc.CategoricalLabel("cell_type", list(cell_types_to_index))
segmentation_schema = tlc.InstanceSegmentationMasks(
    "segmentations",
    instance_properties_structure={"cell_type": cell_type_label},
).schema
table_schemas = {"image": image_schema, "segmentations": segmentation_schema}

# Create the table under the configured project/dataset/table names
table = tlc.Table.from_dict(
    table_data,
    structure=table_schemas,
    project_name=PROJECT_NAME,
    dataset_name=DATASET_NAME,
    table_name=TABLE_NAME,
    if_exists="rename",
)

In [None]:
# Display the URL of the table that was just created
table.url

In [None]:
# Display the first row via table_rows
# NOTE(review): presumably the stored row representation, as opposed to table[0] — confirm against the tlc docs
table.table_rows[0]

In [None]:
# Display the first row via plain indexing
# NOTE(review): presumably the sample view, as opposed to table.table_rows[0] — confirm against the tlc docs
table[0]