In [1]:
#imports
import spatialdata
import dask_image.imread
import dask.array as da
import geopandas as gpd
import anndata as ad
import napari_spatialdata
import time

import sys, os
sys.path.append(os.path.abspath("../functions/"))
import filtering
import anndata_utils

# Report the version of each core dependency so the run can be reproduced.
versioned_packages = (spatialdata, gpd, ad, napari_spatialdata)
for package in versioned_packages:
    print(f"{package.__name__}: {package.__version__}")



spatialdata: 0.2.3
geopandas: 1.0.1
anndata: 0.11.3
napari_spatialdata: 0.5.3


In [2]:
# Load the exemplar001 SpatialData object from its Zarr store (relative path).
sdata = spatialdata.read_zarr("../data/exemplar001/exemplar001.zarr")
# Display the object's repr to list the elements it contains.
sdata

SpatialData object, with associated Zarr store: /Users/jnimoca/Jose_BI/1_Pipelines/openDVP/data/exemplar001/exemplar001.zarr
├── Images
│     └── 'image': DataArray[cyx] (12, 3139, 2511)
├── Labels
│     └── 'mask': DataArray[yx] (3139, 2511)
└── Tables
      └── 'imaging': AnnData (9711, 12)
with coordinate systems:
    ▸ 'global', with elements:
        image (Images), mask (Labels)

In [4]:
# Show the variable (marker) annotations of the 'imaging' table.
sdata['imaging'].var

Unnamed: 0,math,marker
DNA_6,DNA,6.0
ELANE,ELANE,
CD57,CD57,
CD45,CD45,
DNA_7,DNA,7.0
CD11B,CD11B,
SMA,SMA,
CD16,CD16,
DNA_8,DNA,8.0
ECAD,ECAD,


# Filter by value threshold in a feature

In [5]:
# Name of the 'imaging' table variable whose values are explored and thresholded below.
feature_to_filter = "DNA_6"

### Explore values

In [6]:
import plotly.graph_objects as go
import numpy as np
import pandas as pd

def plot_dynamic_histogram(df, column, bins=100):
    """
    Create a dynamic histogram with a threshold slider,
    showing counts to the left and right of the threshold.

    The slider starts on the step closest to the column mean, so the
    threshold value shown by the slider, the dashed red line and the two
    counts all describe the same threshold from the first render.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        column (str): The name of the column to plot.
        bins (int): Passed to plotly as ``nbinsx`` for the histogram and
            used as the number of evenly spaced slider steps between the
            column minimum and maximum.

    Raises:
        ValueError: If ``column`` is not in ``df`` or has no non-missing values.
    """
    # Validate input
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")

    data = df[column].dropna()  # Handle missing values
    if data.empty:
        raise ValueError(f"No valid data in column '{column}'.")

    def calculate_counts(data, threshold):
        """Return (values strictly below threshold, values at or above it)."""
        left_count = (data < threshold).sum()
        right_count = (data >= threshold).sum()
        return left_count, right_count

    def update_slider(threshold):
        """Build the layout update (count annotations + threshold line) for one threshold."""
        left_count, right_count = calculate_counts(data, threshold)
        annotations = [
            dict(x=0.02, y=1.1, xref="paper", yref="paper", text=f"Left Count: {left_count}", showarrow=False),
            dict(x=0.98, y=1.1, xref="paper", yref="paper", text=f"Right Count: {right_count}", showarrow=False)]
        shapes = [dict(
            type="line",
            x0=threshold, y0=0, x1=threshold, y1=1,
            xref="x", yref="paper",
            line=dict(color="red", width=2, dash="dash"))]
        return {"annotations": annotations, "shapes": shapes}

    # Candidate thresholds offered by the slider
    thresholds = np.linspace(data.min(), data.max(), bins)

    # Start at the slider step nearest the mean. Previously the slider was
    # created with active=0 (the minimum) while the line and counts were drawn
    # at the mean, so the displayed threshold contradicted the figure.
    if len(thresholds) > 0:
        initial_step = int(np.abs(thresholds - data.mean()).argmin())
        initial_threshold = thresholds[initial_step]
    else:
        # No slider steps (bins=0): fall back to the plain mean.
        initial_step = 0
        initial_threshold = data.mean()

    # Create figure with the histogram trace
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=data, nbinsx=bins, name=column))

    # Initial annotations/line plus the slider; every step is a precomputed
    # relayout, so the figure stays interactive without a live kernel callback.
    fig.update_layout(
        **update_slider(initial_threshold),
        sliders=[{
            "active": initial_step,
            "currentvalue": {"prefix": "Threshold: "},
            "pad": {"t": 50},
            "steps": [
                {
                    "label": str(round(threshold, 2)),
                    "method": "relayout",
                    "args": [update_slider(threshold)]
                }
                for threshold in thresholds
            ]
        }]
    )

    # Show figure
    fig.show()

In [7]:
# Put the table's expression matrix into a DataFrame (one column per variable),
# then explore the distribution of the chosen feature interactively.
imaging_table = sdata['imaging']
df = pd.DataFrame(columns=imaging_table.var_names, data=imaging_table.X)
plot_dynamic_histogram(df, feature_to_filter, bins=300)

In [8]:
# From the histogram, 5k looks like a good minimum threshold for the DNA_6 feature.
# From the histogram, 25k looks like a good maximum threshold for the DNA_6 feature.
# I will now filter the data based on these two thresholds.

### Label cells not passing filter

In [9]:
# Lower bound: flag cells whose `feature_to_filter` value is above 5000.
# The helper's return value replaces the 'imaging' table in sdata.
# NOTE(review): the obs preview further down shows a boolean column named
# DNA_6_above_5000 after this call — confirm the naming in filtering.py.
sdata['imaging'] = filtering.filter_by_abs_value(
    adata=sdata['imaging'], 
    marker=feature_to_filter,
    value=5000,
    direction='above',
    plot=False)

[32m2025-01-14 16:24:21.169[0m | [1mINFO    [0m | [36mfiltering[0m:[36mfilter_by_abs_value[0m:[36m37[0m - [1m ---- filter_by_abs_value : version number 1.1.0 ----[0m
[32m2025-01-14 16:24:21.174[0m | [1mINFO    [0m | [36mfiltering[0m:[36mfilter_by_abs_value[0m:[36m71[0m - [1mNumber of cells with DNA_6 above 5000: 9627[0m
[32m2025-01-14 16:24:21.175[0m | [1mINFO    [0m | [36mfiltering[0m:[36mfilter_by_abs_value[0m:[36m86[0m - [1m ---- filter_by_abs_value is done, took 0s  ----[0m

Key `imaging` already exists. Overwriting it in-memory.



In [10]:
# Upper bound: flag cells whose `feature_to_filter` value is below 25000.
# The helper's return value replaces the 'imaging' table in sdata.
# NOTE(review): the obs preview further down shows a boolean column named
# DNA_6_below_25000 after this call — confirm the naming in filtering.py.
sdata['imaging'] = filtering.filter_by_abs_value(
    adata=sdata['imaging'], 
    marker=feature_to_filter,
    value=25000,
    direction='below',
    plot=False)

[32m2025-01-14 16:24:21.183[0m | [1mINFO    [0m | [36mfiltering[0m:[36mfilter_by_abs_value[0m:[36m37[0m - [1m ---- filter_by_abs_value : version number 1.1.0 ----[0m
[32m2025-01-14 16:24:21.187[0m | [1mINFO    [0m | [36mfiltering[0m:[36mfilter_by_abs_value[0m:[36m71[0m - [1mNumber of cells with DNA_6 below 25000: 9565[0m
[32m2025-01-14 16:24:21.188[0m | [1mINFO    [0m | [36mfiltering[0m:[36mfilter_by_abs_value[0m:[36m86[0m - [1m ---- filter_by_abs_value is done, took 0s  ----[0m

Key `imaging` already exists. Overwriting it in-memory.



In [12]:
# since two filters were applied, let's merge them
# Build the column names from feature_to_filter instead of hard-coding "DNA_6",
# so this cell keeps working when the feature chosen above is changed.
# NOTE(review): assumes filter_by_abs_value names its columns
# "<marker>_<direction>_<value>", as the obs preview suggests — confirm in filtering.py.
obs = sdata['imaging'].obs
above_col = f"{feature_to_filter}_above_5000"
below_col = f"{feature_to_filter}_below_25000"
missing_cols = [col for col in (above_col, below_col) if col not in obs.columns]
if missing_cols:
    raise KeyError(f"Run both threshold filters first; missing obs columns: {missing_cols}")
# A cell passes only if it is inside both bounds.
obs[f"{feature_to_filter}_filter"] = obs[above_col] & obs[below_col]

In [13]:
# Preview the per-cell annotations, including the filter columns added above.
sdata['imaging'].obs.head()

Unnamed: 0,CellID,Y_centroid,X_centroid,Area,MajorAxisLength,MinorAxisLength,Eccentricity,Orientation,Extent,Solidity,DNA_6_above_5000,DNA_6_below_25000,DNA_6_filter
0,1,258.003817,1768.854962,262,18.967496,17.676168,0.362667,0.102867,0.811146,0.949275,True,True,True
1,2,662.910714,1071.553571,112,13.456567,10.658541,0.61043,-0.149277,0.783217,0.949153,True,True,True
2,3,667.654867,1109.185841,226,19.693311,15.227421,0.634128,1.009633,0.69969,0.900398,True,True,True
3,4,674.85022,1326.76652,227,17.455343,16.809391,0.269523,1.005338,0.700617,0.926531,True,True,True
4,5,676.988571,907.4,175,16.430375,13.860761,0.536963,1.339714,0.729167,0.906736,False,True,False


## QC2: visualize the filtered cells

### Preparation 1: Transform the segmentation mask into polygons

In [15]:
# Convert the label mask into polygons and register them as a shapes element.
mask_polygons = spatialdata.to_polygons(sdata['mask'])
sdata['mask_polygons'] = spatialdata.models.ShapesModel.parse(mask_polygons)

# The table and the polygons must describe the same number of cells.
n_table_cells = sdata['imaging'].shape[0]
n_polygons = sdata['mask_polygons'].shape[0]
assert n_table_cells == n_polygons, "Number of cells in imaging and mask do not match."
sdata

### Preparation 2: Link quantification table to polygons

We now have to match the index of `sdata['mask_polygons']` to a column in the table's `obs`; this will be `CellID`. Let's confirm that they match.

In [21]:
# Peek at both ends of the polygon index and report its dtype.
polygon_index = sdata['mask_polygons'].index
polygon_ids = polygon_index.to_list()
print(polygon_ids[:5])
print(polygon_ids[-5:])
print("Index datatype:", polygon_index.dtype)

[1, 2, 3, 4, 5]
[9707, 9708, 9709, 9710, 9711]
Index datatype: int64


In [22]:
# Peek at both ends of the CellID column and report its dtype.
cell_id_column = sdata['imaging'].obs['CellID']
cell_ids = cell_id_column.to_list()
print(cell_ids[:5])
print(cell_ids[-5:])
print("CellID datatype:", cell_id_column.dtype)

[1, 2, 3, 4, 5]
[9707, 9708, 9709, 9710, 9711]
CellID datatype: int64


In [24]:
# Annotate the polygons with the table: every row points at the
# 'mask_polygons' element and is matched to a polygon through CellID.
region_name = "mask_polygons"
imaging_table = sdata['imaging']
imaging_table.obs["shapes"] = region_name
imaging_table.obs["shapes"] = imaging_table.obs["shapes"].astype("category")
table = spatialdata.models.TableModel.parse(
    imaging_table.copy(),
    region=region_name,
    region_key="shapes",
    instance_key="CellID",
)
sdata["imaging"] = table

# Known issue: after this re-linking, napari interactive no longer loads the
# original mask; the cause has not been identified yet.


Key `imaging` already exists. Overwriting it in-memory.



In [27]:
# QC2: visual check of the filtered cells.
# Create the napari-spatialdata viewer for the whole SpatialData object and start it.
interactive = napari_spatialdata.Interactive(sdata)
interactive.run()

# Unfortunately, color palettes are still not available.
# See issue: https://github.com/scverse/napari-spatialdata/issues/324


Transforming to str index.

[32m2025-01-14 16:05:48.756[0m | [1mINFO    [0m | [36mnapari_spatialdata._view[0m:[36m_on_layer_update[0m:[36m355[0m - [1mUpdating layer.[0m
[32m2025-01-14 16:05:48.756[0m | [1mINFO    [0m | [36mnapari_spatialdata._view[0m:[36m_on_layer_update[0m:[36m355[0m - [1mUpdating layer.[0m
Traceback (most recent call last):
  File "/Users/jnimoca/Jose_BI/1_Pipelines/openDVP/.pixi/envs/default/lib/python3.12/site-packages/napari_spatialdata/_sdata_widgets.py", line 97, in <lambda>
    self.elements_widget.itemDoubleClicked.connect(lambda item: self._onClick(item.text()))
                                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jnimoca/Jose_BI/1_Pipelines/openDVP/.pixi/envs/default/lib/python3.12/site-packages/napari_spatialdata/_sdata_widgets.py", line 121, in _onClick
    self.viewer_model.add_sdata_labels(sdata, text, selected_cs, multi)
  File "/Users/jnimoca/Jose_BI/1_Pipelines/openDVP/.pixi