In [11]:
from loguru import logger
import anndata as ad
import sys
sys.path.append("../src/opendvp/")
import anndata_utils
import imaging
import time

import tifffile
import numpy as np
import geopandas as gpd
from shapely.geometry import shape as shapely_shape, MultiPolygon
from skimage.measure import find_contours
from rasterio.features import shapes
import shapely
import os

import matplotlib.colors as mcolors
import re
from itertools import cycle

import json

In [3]:
def mask_to_polygons(mask_path, savepath=None, simplify=None, max_memory_mb=16000):
    """
    Converts a labeled segmentation mask (TIFF file) into a GeoDataFrame with polygons or multipolygons.

    Args:
        mask_path (str): Path to a 2D labeled segmentation mask TIFF. Pixel values represent cell IDs; background is 0.
            Singleton axes (e.g. a shape of (1, H, W)) are squeezed away before processing.
        savepath (str, optional): If given, the resulting GeoDataFrame is also written to this path as GeoJSON.
        simplify (float, optional): If given, tolerance passed to ``GeoSeries.simplify`` (with
            ``preserve_topology=True``) before saving/returning. ``None`` keeps the raw pixel outlines.
        max_memory_mb (int): Maximum memory (in MB) allowed to safely process the image. The estimate only
            covers the raw array as stored in the TIFF; the int32 copy and the boolean foreground mask
            created during processing come on top of it.

    Returns:
        geopandas.GeoDataFrame: A GeoDataFrame with columns ``cellId`` and ``geometry`` (one row per cell ID;
        cells made of several disconnected regions are stored as MultiPolygons).

    Raises:
        ValueError: If the estimated memory usage exceeds the max_memory_mb, if the mask is not 2D after
            squeezing singleton axes, or if the largest cell ID does not fit into int32.
    """

    logger.info(f" -- Convering {mask_path} to geodataframe of polygons -- ")

    # Read only the TIFF metadata first so an oversized mask is rejected before it is loaded
    with tifffile.TiffFile(mask_path) as tif:
        shape = tif.series[0].shape
        dtype = tif.series[0].dtype

    # Accumulate the pixel count in int64: np.prod otherwise uses the platform default integer,
    # which can be 32-bit and would overflow for multi-gigapixel masks.
    estimated_bytes = int(np.prod(shape, dtype=np.int64)) * np.dtype(dtype).itemsize
    estimated_mb = estimated_bytes / (1024 ** 2)
    logger.info(f"  Mask shape: {shape}, dtype: {dtype}, estimated_mb: {estimated_mb:.1f}")

    if estimated_mb > max_memory_mb:
        raise ValueError(f"Estimated memory usage is {estimated_mb:.2f} MB, exceeding the threshold of {max_memory_mb:.1f} MB.")

    # Load the image data and drop singleton axes (squeeze returns a view, no extra copy)
    array = np.squeeze(tifffile.imread(mask_path))
    if array.ndim != 2:
        # Fail early with a clear message, before the int32 copy below is allocated
        raise ValueError(f"Expected a 2D mask after removing singleton axes, got shape {array.shape}.")

    # Convert to int32 (the error message below records why: rasterio does not take uint32/int64)
    start_time = time.time()
    max_label = array.max()
    logger.debug(f"Calculated max pixel value in {time.time() - start_time:.1f} seconds")
    if max_label > np.iinfo(np.int32).max:
        raise ValueError("Cell IDs exceed int32 range, and rasterio doesn't support uint32 or int64.")
    # copy=False avoids a needless duplicate when the mask already is int32
    array = array.astype(np.int32, copy=False)

    # Dictionary to store geometries grouped by cell ID
    cell_geometries = {}

    # Extract shapes and corresponding values; background (0) is masked out
    start_time = time.time()
    for shape_dict, cell_id in shapes(array, mask=(array > 0)):
        cell_geometries.setdefault(int(cell_id), []).append(shapely_shape(shape_dict))
    logger.info(f"Transformed pixel mask into polygons in {time.time() - start_time:.1f} seconds")

    # Combine multiple polygons into MultiPolygons if needed
    cell_ids = list(cell_geometries)
    geometries = [
        polygons[0] if len(polygons) == 1 else MultiPolygon(polygons)
        for polygons in cell_geometries.values()
    ]

    # Create a GeoDataFrame. Columns are given explicitly so that a mask without any labeled
    # pixel still yields a (empty) frame with a geometry column the CRS can be attached to.
    gdf = gpd.GeoDataFrame(
        {'cellId': cell_ids, 'geometry': geometries},
        geometry='geometry',
        crs="EPSG:4326",
    )

    if simplify is not None:
        logger.info(f"Simplifying the geometry with tolerance {simplify}")
        gdf['geometry'] = gdf['geometry'].simplify(simplify, preserve_topology=True)

    if savepath is not None:
        logger.info(f"Writing geodataframe as GeoJSON here {savepath}")
        start_time = time.time()
        gdf.to_file(savepath, driver="GeoJSON")
        logger.info(f"Writing of file took {time.time() - start_time:.1f} seconds")

    logger.success(" -- Created geodataframe from segmentation mask -- ")

    return gdf

In [6]:
# Vectorise the segmentation mask 992.tif into polygons and keep a GeoJSON copy in the qc folder.
mask_tif_path = "/Volumes/Extreme SSD/P26_BCCN/P26E07_E08_E09_SOPA_Gates/expanded-masks/992.tif"
mask_polygons_path = "/Users/jnimoca/Jose_BI/4_Projects/P26_BCCN/P26E10_E11_ImageAnalysis/data/perSample/992/qc/992_mask_polygons.geojson"

gdf = mask_to_polygons(
    mask_path=mask_tif_path,
    max_memory_mb=16000,
    simplify=1,
    savepath=mask_polygons_path,
)

[32m2025-04-10 08:30:02.163[0m | [1mINFO    [0m | [36m__main__[0m:[36mmask_to_polygons[0m:[36m16[0m - [1m -- Convering /Volumes/Extreme SSD/P26_BCCN/P26E07_E08_E09_SOPA_Gates/expanded-masks/992.tif to geodataframe of polygons -- [0m
[32m2025-04-10 08:30:02.174[0m | [1mINFO    [0m | [36m__main__[0m:[36mmask_to_polygons[0m:[36m24[0m - [1m  Mask shape: (1, 48157, 73093), dtype: uint32, estimated_mb: 13427.5[0m
[32m2025-04-10 08:30:19.645[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mmask_to_polygons[0m:[36m36[0m - [34m[1mCalculated max pixel value in 3.1 seconds[0m
[32m2025-04-10 08:33:27.773[0m | [1mINFO    [0m | [36m__main__[0m:[36mmask_to_polygons[0m:[36m54[0m - [1mTransformed pixel mask into polygons in 182.8 seconds[0m
[32m2025-04-10 08:33:28.839[0m | [1mINFO    [0m | [36m__main__[0m:[36mmask_to_polygons[0m:[36m66[0m - [1mSimplifying the geometry with tolerance 1[0m
[32m2025-04-10 08:35:24.203[0m | [1mINFO    [0m | [3

In [13]:
# Display the polygon table returned by mask_to_polygons (columns: cellId, geometry).
gdf

Unnamed: 0,cellId,geometry
0,202520,"POLYGON ((68595 0, 68596 21, 68613 21, 68615 2..."
1,202521,"POLYGON ((71124 0, 71121 6, 71121 11, 71123 17..."
2,906503,"POLYGON ((44292 12, 44284 17, 44283 26, 44285 ..."
3,202522,"POLYGON ((72803 0, 72801 12, 72803 32, 72807 3..."
4,207429,"POLYGON ((43404 9, 43395 13, 43393 24, 43395 2..."
...,...,...
1134342,140705,"POLYGON ((64083 48111, 64076 48114, 64069 4812..."
1134343,298936,"POLYGON ((63616 48117, 63606 48120, 63599 4812..."
1134344,140727,"POLYGON ((63733 48125, 63719 48132, 63716 4813..."
1134345,140716,"POLYGON ((63914 48120, 63907 48124, 63905 4812..."


In [None]:
def color_geojson_w_adata(
        geodataframe,
        geodataframe_index_key,
        adata,
        adata_obs_index_key,
        adata_obs_category_key,
        color_dict,
        export_path,
        simplify_value=1,
        return_gdf=False
):

    """
    Add classification colors from an AnnData object to a GeoDataFrame for QuPath visualization.

    Every polygon gets an ``objectType`` of "detection" and a ``classification`` dict of the form
    ``{'name': <class>, 'color': [R, G, B]}``. Polygons without a match in ``adata.obs`` are
    classified as 'filtered_out' (black unless ``color_dict`` defines that class). The input
    GeoDataFrame is not modified; a copy is annotated and written to ``export_path``.

    Parameters
    ----------
    geodataframe : geopandas.GeoDataFrame
        GeoDataFrame containing polygons to annotate.

    geodataframe_index_key : str or None
        Column in the GeoDataFrame whose values are matched against `adata.obs[adata_obs_index_key]`.
        If None (or empty), the GeoDataFrame index, cast to str, is used instead.

    adata : anndata.AnnData
        AnnData object containing cell annotations in `adata.obs`.

    adata_obs_index_key : str
        Column name in `adata.obs` used to match to `geodataframe_index_key`.

    adata_obs_category_key : str
        Column in `adata.obs` that defines the classification/grouping to color.

    color_dict : dict or None
        Dictionary mapping class names to colors in a format accepted by
        `parse_color_for_qupath` (e.g. hex strings such as '#ff0000').
        If None or empty, a default color cycle over the categories is used.

    export_path : str
        Path where the output GeoJSON will be saved.

    simplify_value : float or None, optional
        Tolerance value for geometry simplification (higher = more simplified).
        None skips simplification. default = 1

    return_gdf : bool, optional
        If True, returns the modified GeoDataFrame with classifications.

    Returns
    -------
    geopandas.GeoDataFrame or None
        Returns the updated GeoDataFrame if `return_gdf=True`, else writes to file only.

    Raises
    ------
    KeyError
        If a class present in the matched data has no entry in `color_dict`.
    """

    logger.info(" -- Adding color to polygons for QuPath visualization -- ")

    gdf = geodataframe.copy()
    gdf['objectType'] = "detection"

    phenotypes_series = adata.obs.set_index(adata_obs_index_key)[adata_obs_category_key]

    if geodataframe_index_key:
        # The dtype comparison only makes sense (and only works) when a key column was given
        gdf_dtype = gdf[geodataframe_index_key].dtype
        adata_dtype = phenotypes_series.index.dtype
        if gdf_dtype != adata_dtype:
            logger.warning(f"Data types between geodaframe {gdf_dtype} and adataobs col {adata_dtype} do not match")
        logger.info(f"Matching gdf[{geodataframe_index_key}] to adata.obs[{adata_obs_index_key}]")
        classes = gdf[geodataframe_index_key].map(phenotypes_series)
    else:
        logger.info("geodataframe index key not passed, using index")
        gdf.index = gdf.index.astype(str)
        classes = gdf.index.to_series().map(phenotypes_series)

    # Work on plain objects rather than a categorical: filling a categorical requires the fill
    # value to be a registered category, and registering one that already exists raises.
    classes = classes.astype(object)
    classes = classes.where(classes.notna(), 'filtered_out').astype(str)
    classes = classes.where(classes != 'nan', 'filtered_out')

    if color_dict:
        logger.info(f"Using color_dict found in table.uns[{color_dict}]")
        color_dict = parse_color_for_qupath(color_dict)
    else:
        logger.info("No color_dict found, using defaults")
        default_colors = [[31, 119, 180], [255, 127, 14], [44, 160, 44], [214, 39, 40], [148, 103, 189]]
        # astype("category") lets the default palette work for non-categorical obs columns too
        categories = adata.obs[adata_obs_category_key].astype("category").cat.categories.astype(str)
        color_dict = dict(zip(categories, cycle(default_colors)))

    # Class names were cast to str above, so look colors up by str keys as well
    color_dict = {str(name): rgb for name, rgb in color_dict.items()}
    if 'filtered_out' not in color_dict:
        color_dict['filtered_out'] = [0,0,0]

    # Report all classes without a color at once instead of failing on the first offending row
    missing_classes = sorted(set(classes.unique()) - set(color_dict))
    if missing_classes:
        raise KeyError(f"No color defined in color_dict for classes: {missing_classes}")

    # Plain comprehension instead of a row-wise DataFrame.apply
    gdf['classification'] = [{'name': name, 'color': color_dict[name]} for name in classes]
    # A 'class' column is never part of the output
    gdf = gdf.drop(columns='class', errors='ignore')

    #simplify the geometry
    if simplify_value is not None:
        logger.info(f"Simplifying the geometry with tolerance {simplify_value}")
        start_time = time.time()
        gdf['geometry'] = gdf['geometry'].simplify(simplify_value, preserve_topology=True)
        logger.info(f"Simplified all polygons in {time.time() - start_time:.1f} seconds")

    logger.info("Writing polygons as geojson file")
    start_time = time.time()
    gdf.to_file(export_path, driver='GeoJSON')
    logger.info(f"File written in {time.time() - start_time:.1f} seconds")

    if return_gdf:
        return gdf
    
def parse_color_for_qupath(color_dict):
    """
    Convert a mapping of class names to colors into 0-255 integer RGB lists.

    Accepted formats for each color:
      * hex string, '#RGB' or '#RRGGBB' (case-insensitive);
      * tuple or list of three numbers that all lie within 0-1: interpreted as RGB
        fractions and scaled by 255 (truncated towards zero);
      * tuple or list of three whole numbers within 0-255: used as they are.

    Args:
        color_dict (dict): Mapping of class name to color in one of the formats above.

    Returns:
        dict: New dictionary mapping each class name to ``[R, G, B]`` with integers in 0-255.
        The input dictionary is not modified.

    Raises:
        ValueError: If a color matches none of the accepted formats.
    """
    parsed_colors = {}
    for name, color in color_dict.items():
        rgb = None
        if isinstance(color, str):
            hex_match = re.fullmatch(r'#([0-9a-fA-F]{3}|[0-9a-fA-F]{6})', color)
            if hex_match is not None:
                digits = hex_match.group(1)
                if len(digits) == 3:
                    # Shorthand '#abc' stands for '#aabbcc'
                    digits = ''.join(ch * 2 for ch in digits)
                # Parse the hex pairs directly: exact, no float round trip
                rgb = [int(digits[i:i + 2], 16) for i in (0, 2, 4)]
        elif isinstance(color, (tuple, list)) and len(color) == 3:
            try:
                if all(0 <= c <= 1 for c in color):
                    # Handle RGB fractions (0-1)
                    rgb = [int(c * 255) for c in color]
                elif all(0 <= c <= 255 and float(c).is_integer() for c in color):
                    # Already 0-255 channel values
                    rgb = [int(c) for c in color]
            except (TypeError, ValueError):
                # Non-numeric components: reported as an invalid format below
                rgb = None

        if rgb is None:
            raise ValueError(f"Invalid color format for '{name}': {color}")
        parsed_colors[name] = rgb

    return parsed_colors

In [9]:
# Load the AnnData checkpoint (file name: 7_adata_pheno) whose obs table is used for coloring below.
adata_checkpoint_path = "/Users/jnimoca/Jose_BI/4_Projects/P26_BCCN/P26E10_E11_ImageAnalysis/data/perSample/992/adata_checkpoints/7_adata_pheno/20250303_1112_7_adata_pheno_adata.h5ad"
adata = ad.read_h5ad(adata_checkpoint_path)

In [12]:
# Load the class-name -> color mapping used to color the polygons.
path_to_json = "/Users/jnimoca/Jose_BI/4_Projects/P26_BCCN/P26E10_E11_ImageAnalysis/assets/pheno_colors.json"
# Explicit UTF-8 so the file is read the same way regardless of the platform's locale encoding
with open(path_to_json, encoding="utf-8") as f:
    pheno_colors = json.load(f)

In [14]:
# Color the mask polygons by adata.obs["phenotype"] (gdf.cellId matched to obs.CellID) and
# write them as GeoJSON; geometries are left as they are (simplify_value=None).
pheno_geojson_path = "/Users/jnimoca/Jose_BI/4_Projects/P26_BCCN/P26E10_E11_ImageAnalysis/data/perSample/992/qc/992_color_pheno.geojson"

color_geojson_w_adata(
    geodataframe=gdf,
    adata=adata,
    geodataframe_index_key="cellId",
    adata_obs_index_key="CellID",
    adata_obs_category_key="phenotype",
    color_dict=pheno_colors,
    simplify_value=None,
    return_gdf=False,
    export_path=pheno_geojson_path,
)

[32m2025-04-10 08:38:31.924[0m | [1mINFO    [0m | [36m__main__[0m:[36mcolor_geojson_w_adata[0m:[36m17[0m - [1m -- Adding color to polygons for QuPath visualization -- [0m
[32m2025-04-10 08:38:31.979[0m | [1mINFO    [0m | [36m__main__[0m:[36mcolor_geojson_w_adata[0m:[36m30[0m - [1mgeodataframe[cellId] used as matching column[0m
[32m2025-04-10 08:38:32.007[0m | [1mINFO    [0m | [36m__main__[0m:[36mcolor_geojson_w_adata[0m:[36m45[0m - [1mUsing color_dict found in table.uns[{'B_cells': '#ff7df3', 'CD4_Tcells': '#6efa6e', 'CD8_Tcells': '#2020d8', 'Cancer_cells': '#f55148', 'Vimentin_cells': '#866a21', 'Macrophages': '#ffff00', 'COL1A1_cells': '#ff7f0e', 'Unknown': '#7f7f7f'}][0m
[32m2025-04-10 08:38:35.690[0m | [1mINFO    [0m | [36m__main__[0m:[36mcolor_geojson_w_adata[0m:[36m66[0m - [1mWriting polygons as geojson file[0m
[32m2025-04-10 08:39:09.412[0m | [1mINFO    [0m | [36m__main__[0m:[36mcolor_geojson_w_adata[0m:[36m69[0m - [1mFi

In [15]:
# Run the Voronoi export from the local `imaging` module on adata.obs for image 992, grouped by
# "phenotype". NOTE(review): what exactly is written into output_path is defined in imaging.py,
# which is not shown here.
voronoi_output_dir = "/Users/jnimoca/Jose_BI/4_Projects/P26_BCCN/P26E10_E11_ImageAnalysis/data/perSample/992/qc/"

imaging.adataobs_to_voronoi_geojson(
    df=adata.obs,
    imageid=992,
    category_1="phenotype",
    subset=None,
    output_path=voronoi_output_dir,
)

[32m2025-04-10 08:43:38.741[0m | [34m[1mDEBUG   [0m | [36mimaging[0m:[36madataobs_to_voronoi_geojson[0m:[36m51[0m - [34m[1m df shape: (1005051, 24)[0m
[32m2025-04-10 08:43:38.782[0m | [34m[1mDEBUG   [0m | [36mimaging[0m:[36madataobs_to_voronoi_geojson[0m:[36m59[0m - [34m[1m df shape after imageid subset: (1005051, 24)[0m
[32m2025-04-10 08:43:38.787[0m | [1mINFO    [0m | [36mimaging[0m:[36madataobs_to_voronoi_geojson[0m:[36m60[0m - [1mProcessing 992, loaded dataframe[0m
[32m2025-04-10 08:43:38.789[0m | [1mINFO    [0m | [36mimaging[0m:[36madataobs_to_voronoi_geojson[0m:[36m76[0m - [1mRunning Voronoi[0m
[32m2025-04-10 08:44:20.899[0m | [1mINFO    [0m | [36mimaging[0m:[36madataobs_to_voronoi_geojson[0m:[36m86[0m - [1mVoronoi done[0m
[32m2025-04-10 08:44:20.931[0m | [1mINFO    [0m | [36mimaging[0m:[36madataobs_to_voronoi_geojson[0m:[36m90[0m - [1mTransformed to geodataframe[0m
[32m2025-04-10 08:44:20.936[0m | [1m