In [None]:
import cv2
import numpy as np
import pandas as pd
import re
import sys
import tifffile

from pathlib import Path
from tqdm import tqdm

from steinbock import io
from steinbock.segmentation import deepcell

# **Data formatting**

In this notebook, images and masks from the "Pilot" experiment are formatted so that they can be used to generate the islet segmentation model.

This notebook is *for documentation only*. The output files, which are used in the `islet_segmentation.ipynb` notebook, can be downloaded from Zenodo (see the `islet_segmentation.ipynb` notebook).

The "raw images" were generated from `.mcd` files using [steinbock](https://github.com/BodenmillerGroup/steinbock). The "raw masks" were generated from the raw images using ilastik and CellProfiler.

## **Settings**

In [None]:
# Root folder holding all the data read and written by this notebook
DIR_DATA = Path("/home/ubuntu/Data2/islet_segmentation/")
assert DIR_DATA.exists(), f"{DIR_DATA} does not exist"

# Image size setting
IMG_SIZE = 160

## **Process training data**

### **Settings**
Directories containing raw images and masks (tiff format)

In [None]:
# Folder containing the antibody panel files
DIR_PANELS = DIR_DATA / "panels"

# Raw inputs (tiff): images and masks
DIR_RAW_IMAGES = DIR_DATA / "raw_images"
DIR_RAW_MASKS = DIR_DATA / "raw_masks"

# Processed outputs (png): images and masks
DIR_IMAGES = DIR_DATA / "images"
DIR_MASKS = DIR_DATA / "masks"

# Create the input and output folders that do not exist yet
# (the panel folder is not created here)
for _data_dir in (DIR_RAW_IMAGES, DIR_RAW_MASKS, DIR_IMAGES, DIR_MASKS):
    _data_dir.mkdir(exist_ok=True)

### **List input images and masks**

In [None]:
# Recursively collect the ".tiff" entries found below the raw image folder
input_images = list(
    filter(lambda path: path.name.endswith(".tiff"), DIR_RAW_IMAGES.rglob("*"))
)
print("\nImages:\n", [path.name for path in input_images[:5]])

# Same recursive search below the raw mask folder
input_masks = list(
    filter(lambda path: path.name.endswith(".tiff"), DIR_RAW_MASKS.rglob("*"))
)
print("\nMasks:\n", [path.name for path in input_masks[:5]])

### **Load antibody panels**

#### List panel files

In [None]:
# Map each antibody panel name to its panel file ("panel_<name>.csv")
panels = {
    name: DIR_PANELS / f"panel_{name}.csv"
    for name in ("Islet", "Immune", "Islet2", "Lympho", "Myelo")
}

#### Load and display the panels

In [None]:
# Loop through the panels.
# On the first run, `panels` maps each panel name to the path of its CSV file;
# this cell replaces every path by the loaded (and subset) table.
for panel_name, panel_path in panels.items():
    print("Panel:", panel_name)

    if isinstance(panel_path, pd.DataFrame):
        # The cell was already executed in this session: the entry already
        # holds a table. Reuse it so that re-running the cell does not fail
        # when the path check below receives a DataFrame.
        cur_panel = panel_path
    else:
        # Load the panel file
        assert Path.exists(panel_path), f"{panel_path} does not exist"
        cur_panel = pd.read_csv(panel_path, sep = ',', index_col = False)

    # Subset the panel: keep only the rows whose "full" column equals 1
    assert "full" in cur_panel.columns, f"'full' column missing in panel {panel_name}"
    cur_panel = cur_panel[cur_panel["full"]==1]
    panels[panel_name] = cur_panel

    # Display the panel
    display(panels[panel_name].head())

## **Process input images and masks**

### **Process images**

An image stack containing only islet channels is generated and saved as a `.png` file.

In [None]:
# Regular expression used to parse the raw image file names:
#   <caseid: 3-4 digits>_<panel: 5-6 alphanumeric characters>_ROI_<acqid: 1-3 digits>
# The pattern is used with `search`, so anything following the acquisition ID
# is ignored. No trailing `*` is used: in a regular expression it would
# quantify the `acqid` group itself, making the ID optional (group == None).
regex_img = r"(?P<caseid>[0-9]{3,4})_(?P<panel>[a-zA-Z0-9]{5,6})_ROI_(?P<acqid>[0-9]{1,3})"

# Define image preprocessing options
# (passed to `deepcell.create_segmentation_stack` when processing the images)
channelwise_zscore = False
channelwise_minmax = True
aggr_func = np.mean

In [None]:
# Compile the file name pattern once: it is identical for every image
re_comp = re.compile(regex_img)

for panel_name, panel in panels.items():
    print("Processing", panel_name, "panel")

    # Define channels to use for segmentation ("seg_islets" column in panels).
    # Entries equal to 0 are replaced by NaN before being passed as channel
    # groups (presumably so that steinbock ignores these channels - TODO confirm).
    islet_channels = panel["seg_islets"].values
    islet_channels = np.where(islet_channels == 0, np.nan, islet_channels)

    # Define the input folder
    img_subdir = DIR_RAW_IMAGES / panel_name

    # Generate and save one islet stack per raw image
    for img_path in tqdm(io.list_image_files(img_subdir)):

        # Parse the file name first, so that an unexpected name fails with a
        # clear message before the image is read and processed
        fn_match = re_comp.search(img_path.name)
        if fn_match is None or fn_match.group("acqid") is None:
            raise ValueError(
                f"Unexpected image file name (pattern: {regex_img}): {img_path.name}"
            )

        # Rename the image. The acquisition ID is zero-padded to three digits,
        # as done for the masks, so that image and mask file names can be
        # matched later on.
        fn_caseid = fn_match.group("caseid")
        fn_panel = fn_match.group("panel")
        fn_acqid = int(fn_match.group("acqid"))
        fn_out = f"{fn_caseid}_{fn_panel}_{fn_acqid:03}" + ".png"

        # Build the islet stack from the raw image
        islet_img = deepcell.create_segmentation_stack(
            img = io.read_image(img_path),
            channelwise_minmax = channelwise_minmax,
            channelwise_zscore = channelwise_zscore,
            channel_groups = islet_channels,
            aggr_func = aggr_func
        )

        # Save the islet stack
        io.write_image(islet_img, DIR_IMAGES / fn_out)

### **Process masks**

Islet masks are binarized and saved as `.png` files.

In [None]:
# Regular expression used to parse the raw mask file names:
#   <caseid: 3-4 digits>_<panel: 5-6 alphanumeric characters>_s0_a<acqid: 1-3 digits>_ac_full_islet_mask
# The pattern is used with `search`, so no trailing wildcard is needed
# (a trailing `*` would only make the final "k" optional/repeatable).
regex_mask = r"(?P<caseid>[0-9]{3,4})_(?P<panel>[a-zA-Z0-9]{5,6})_s0_a(?P<acqid>[0-9]{1,3})_ac_full_islet_mask"

In [None]:
# Compile the file name pattern once: it is identical for every mask
re_comp = re.compile(regex_mask)

for panel_name, panel in panels.items():
    print("Processing", panel_name, "panel")

    # Define the input folder and list its ".tiff" files (non-recursive)
    masks_subdir = DIR_RAW_MASKS / panel_name
    panel_masks = [x for x in masks_subdir.iterdir() if x.name.endswith(".tiff")]

    for mask_file in tqdm(panel_masks):

        # Parse the file name; fail with a clear message on unexpected names
        fn_match = re_comp.search(mask_file.name)
        if fn_match is None:
            raise ValueError(
                f"Unexpected mask file name (pattern: {regex_mask}): {mask_file.name}"
            )

        # Rename the mask (acquisition ID zero-padded to three digits)
        fn_caseid = fn_match.group("caseid")
        fn_panel = fn_match.group("panel")
        fn_acqid = int(fn_match.group("acqid"))
        fn_out = f"{fn_caseid}_{fn_panel}_{fn_acqid:03}" + ".png"

        # Read the input file and binarize it: every value > 0 becomes 1,
        # everything else 0 (uint8). Done with NumPy so that the result does
        # not depend on the array dtypes accepted by `cv2.threshold`.
        mask = tifffile.imread(mask_file)
        mask = (mask > 0).astype("uint8")

        # Save the binary mask.
        # NOTE(review): `tifffile.imwrite` writes TIFF-encoded data, only the
        # file suffix is ".png". Kept unchanged because the readers of these
        # files are not visible here - TODO confirm the expected format.
        tifffile.imwrite(DIR_MASKS / fn_out, mask)

### **Remove images without an associated mask and masks without an associated image**

In [None]:
# List the processed images and masks (".png" file names, non-recursive)
input_images = [x for x in DIR_IMAGES.iterdir() if x.name.endswith(".png")]
input_masks = [x for x in DIR_MASKS.iterdir() if x.name.endswith(".png")]

# File names as sets, built once: membership tests are then constant-time
# instead of rebuilding and scanning a list for every file
image_names = {x.name for x in input_images}
mask_names = {x.name for x in input_masks}

missing_masks = []
missing_images = []

# Delete the images for which no mask with the same file name exists
for img in input_images:
    if img.name not in mask_names:
        missing_masks.append(img.name)
        img.unlink()

# Delete the masks for which no image with the same file name exists
for mask in input_masks:
    if mask.name not in image_names:
        missing_images.append(mask.name)
        mask.unlink()

print("Removed images (missing mask):", len(missing_masks))
print("Removed masks (missing images):", len(missing_images))

In [None]:
# List the packages installed in the current conda environment (shell command)
!conda list