In [1]:
import spatialdata
import skimage.io as io
import numpy as np
import opendvp

  from pkg_resources import DistributionNotFound, get_distribution


In [None]:
sdata = spatialdata.datasets.blobs()

In [25]:
path_to_test_mask = "test_data/blobs_labels.tif"

In [None]:
import time

import geopandas as gpd
import numpy as np
import tifffile
from shapely.geometry import MultiPolygon
from shapely.geometry import Polygon
from skimage import measure

from opendvp.utils import logger


def mask_to_polygons(
    mask_path: str,
    simplify: float | None = None,
    max_memory_mb: int = 16000,
) -> gpd.GeoDataFrame:
    """Convert a labeled segmentation mask (TIFF file) into a GeoDataFrame of polygons and/or multipolygons.

    Parameters:
    ----------
    mask_path : str
        Path to a 2D labeled segmentation mask TIFF. Pixel values represent cell IDs; background is 0.
    simplify : float, optional
        Tolerance for geometry simplification. If None, no simplification is performed.
    max_memory_mb : int, optional
        Maximum memory (in MB) allowed to safely process the image (default: 16000).

    Returns:
    -------
    gpd.GeoDataFrame
        A GeoDataFrame containing polygons/multipolygons and their cell IDs.

    Raises:
    ------
    ValueError
        If the estimated memory usage exceeds max_memory_mb or cell IDs exceed int32 range.
    """
    with tifffile.TiffFile(mask_path) as tif:  # type: ignore
        shape = tif.series[0].shape
        dtype = tif.series[0].dtype
        estimated_bytes = np.prod(shape) * np.dtype(dtype).itemsize
        estimated_mb = estimated_bytes / (1024 ** 2)
        logger.info(f"  Mask shape: {shape}, dtype: {dtype}, estimated_mb: {estimated_mb:.1f}")

    if estimated_mb > max_memory_mb:
        raise ValueError(f"Estimated mask size is {estimated_mb:.2f} MB, exceeding {max_memory_mb:.1f} MB.")

    # Load the image data and ensure it's a 2D array
    array = tifffile.imread(mask_path)  # type: ignore
    array = np.squeeze(array)  # type: ignore

    # Find contours using skimage.measure.find_contours
    contours_gdf = []
    start_time_contours = time.time()
    for label_value in np.unique(array[array > 0]):
        binary_mask = (array == label_value).astype(np.uint8)
        contours = measure.find_contours(binary_mask, 0.5)
        for contour in contours:
            # Convert contour points to a Shapely polygon
            polygon = Polygon(contour[:, ::-1])
            if not polygon.is_valid:
                polygon = polygon.buffer(0)
            contours_gdf.append({'cellId': int(label_value), 'geometry': polygon})

    logger.info(f"Extracted contours in {time.time() - start_time_contours:.2f} seconds")

    # Create GeoDataFrame from contours
    gdf = gpd.GeoDataFrame(contours_gdf, crs="EPSG:4326")

    # Group by 'cellId' and combine geometries into MultiPolygons if necessary
    start_time_grouping = time.time()
    gdf = gdf.dissolve(by='cellId', aggfunc='first')
    gdf['geometry'] = gdf['geometry'].apply(lambda x: x if x.geom_type != 'MultiPolygon' else MultiPolygon(list(x.geoms)))
    gdf = gdf.reset_index()
    logger.info(f"Combined geometries in {time.time() - start_time_grouping:.2f} seconds")

    if simplify is not None:
        logger.info(f"Simplifying the geometry with tolerance {simplify}")
        gdf['geometry'] = gdf['geometry'].simplify(simplify, preserve_topology=True)

    # Ensure 'cellId' is integer type
    gdf['cellId'] = gdf['cellId'].astype(int)

    logger.success(" -- Created geodataframe from segmentation mask -- ")

    return gdf

In [28]:
gdf = mask_to_polygons(path_to_test_mask)

[32m15:07:28.16[0m | [1mINFO[0m |   Mask shape: (512, 512), dtype: int16, estimated_mb: 0.5
[32m15:07:28.21[0m | [1mINFO[0m | Extracted contours in 0.05 seconds
[32m15:07:28.24[0m | [1mINFO[0m | Combined geometries in 0.02 seconds
[32m15:07:28.24[0m | [32m[1mSUCCESS[0m |  -- Created geodataframe from segmentation mask -- 


In [38]:
gdf2 = mask_to_polygons(path_to_test_mask)

[32m15:16:57.84[0m | [1mINFO[0m |   Mask shape: (512, 512), dtype: int16, estimated_mb: 0.5
[32m15:16:57.89[0m | [1mINFO[0m | Extracted contours in 0.05 seconds
[32m15:16:57.91[0m | [1mINFO[0m | Combined geometries in 0.01 seconds
[32m15:16:57.91[0m | [32m[1mSUCCESS[0m |  -- Created geodataframe from segmentation mask -- 


In [41]:
gdf3 = mask_to_polygons(path_to_test_mask, simplify=1)

[32m15:20:11.95[0m | [1mINFO[0m |   Mask shape: (512, 512), dtype: int16, estimated_mb: 0.5
[32m15:20:12.00[0m | [1mINFO[0m | Extracted contours in 0.05 seconds
[32m15:20:12.01[0m | [1mINFO[0m | Combined geometries in 0.01 seconds
[32m15:20:12.01[0m | [1mINFO[0m | Simplifying the geometry with tolerance 1
[32m15:20:12.02[0m | [32m[1mSUCCESS[0m |  -- Created geodataframe from segmentation mask -- 


In [39]:
gdf2

Unnamed: 0,cellId,geometry
0,1,"MULTIPOLYGON (((379.5 0, 380 0.5, 380.5 0, 379..."
1,2,"MULTIPOLYGON (((350.5 300, 350 299.5, 349.5 30..."
2,3,"POLYGON ((118 304.5, 117 304.5, 116 304.5, 115..."
3,4,"POLYGON ((74 416.5, 73 416.5, 72 416.5, 71 416..."
4,5,"POLYGON ((250 257.5, 249 257.5, 248 257.5, 247..."
5,6,"POLYGON ((466 259.5, 465 259.5, 464.5 259, 464..."
6,8,"MULTIPOLYGON (((0 319.5, 0 320.5, 0.5 320, 0 3..."
7,9,"POLYGON ((138 458.5, 137 458.5, 136 458.5, 135..."
8,10,"POLYGON ((460 300.5, 459.5 300, 459 299.5, 458..."
9,11,"POLYGON ((102 74.5, 101 74.5, 100 74.5, 99 74...."


In [42]:
gdf3.to_file("blobs_labels_shapes_3.geojson")

In [35]:
sdata

SpatialData object
├── Images
│     ├── 'blobs_image': DataArray[cyx] (3, 512, 512)
│     └── 'blobs_multiscale_image': DataTree[cyx] (3, 512, 512), (3, 256, 256), (3, 128, 128)
├── Labels
│     ├── 'blobs_labels': DataArray[yx] (512, 512)
│     └── 'blobs_multiscale_labels': DataTree[yx] (512, 512), (256, 256), (128, 128)
├── Points
│     └── 'blobs_points': DataFrame with shape: (<Delayed>, 4) (2D points)
├── Shapes
│     ├── 'blobs_circles': GeoDataFrame shape: (5, 2) (2D shapes)
│     ├── 'blobs_multipolygons': GeoDataFrame shape: (2, 1) (2D shapes)
│     └── 'blobs_polygons': GeoDataFrame shape: (5, 1) (2D shapes)
└── Tables
      └── 'table': AnnData (26, 3)
with coordinate systems:
    ▸ 'global', with elements:
        blobs_image (Images), blobs_multiscale_image (Images), blobs_labels (Labels), blobs_multiscale_labels (Labels), blobs_points (Points), blobs_circles (Shapes), blobs_multipolygons (Shapes), blobs_polygons (Shapes)

In [32]:
im = sdata['blobs_image'].values

In [33]:
im.shape

(3, 512, 512)

In [34]:
io.imsave("test_data/multichannel_blobs.tif",im)