# Pixie: pixel clustering notebook

In [None]:
# Make the parent directory importable by appending it to the module search path
import sys

sys.path.append('..')

In [None]:
import json
import os
import subprocess
from datetime import datetime as dt

import feather
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import random
from matplotlib import rc_file_defaults
from alpineer import io_utils, load_utils

from ark.analysis import visualize
from ark.phenotyping import (pixie_preprocessing,
                             pixel_cluster_utils,
                             pixel_som_clustering,
                             pixel_meta_clustering)
from ark.utils import data_utils, example_dataset, plot_utils
from ark.utils.metacluster_remap_gui import (MetaClusterData, MetaClusterGui,
                                             colormap_helper,
                                             metaclusterdata_from_files)

In [None]:
# Seed handed to the random sampling and clustering steps so reruns are reproducible
SEED: int = 42

# Dataset name and results sub-folder; both are interpolated into the config path
DATASET: str = "IMMUcan_2022_CancerExample"
RESULTS_DIR: str = 'Blur=2'


In [None]:
# Location of the JSON config for the selected dataset / results folder.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
CONFIG_PATH = f"/home/dani/Documents/Thesis/Methods/IMCBenchmark/output/{DATASET}/pixie/{RESULTS_DIR}/config.json"

# load the params (JSON is UTF-8 by specification, so do not depend on the locale default)
with open(CONFIG_PATH, encoding="utf-8") as f:
    pixie_config = json.load(f)

# assign the params to variables
input_dir = pixie_config['input_dir']
output_dir = pixie_config['output_dir']
fovs = pixie_config['fovs']
channels = pixie_config['channels']

# Draw a reproducible validation subset of at most 4 FOVs. Capping the sample
# size at len(fovs) avoids the ValueError that random.sample raises when asked
# for more items than the population contains.
random.seed(SEED)
validation_fovs = random.sample(fovs, min(4, len(fovs)))

print(f'Data Folder: {input_dir}')
print(f'Output Folder: {output_dir}\n')
print(f'FOVS: {fovs}\n')
print(f'FOVS for validation: {validation_fovs}\n')

## 1: Set file paths and parameters

In [None]:
# define the output directory of the pixel clustering (relative to output_dir)
pixel_output_dir = 'pixel_output'

# exist_ok=True makes the creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test
os.makedirs(os.path.join(output_dir, pixel_output_dir), exist_ok=True)

* `tiff_dir`: path to the directory containing your imaging data. Images should be single-channel TIFFs.
* `masks_dir`: path to the directory containing your segmentation masks (where each pixel value corresponds to a cell ID, can be generated using the segmentation notebook at github.com/angelolab/ark-analysis). Set this argument to `None` if you do not have segmentation labels or wish to run pixel clustering without them (they are required for cell clustering)
* `seg_suffix`: the suffix plus the file extension of the segmented images for each FOV. Note that these should be the same for all FOVs. This argument will be ignored if `masks_dir` is set to `None`

In [None]:
# the extracted image data and the segmentation masks both sit directly under input_dir
tiff_dir, masks_dir = (os.path.join(input_dir, sub) for sub in ("images", "masks"))

# suffix (including the file extension) of the segmentation mask files
seg_suffix = '_whole_cell.tiff'

### Define multiprocessing parameters

Turning on multiprocessing provides a speed boost; however, it is not always cross-platform compatible. If you receive errors such as hanging cells without progress updates, try setting `multiprocess` back to `False`.

In [None]:
# Both values come from the 'pixels' section of the config:
#   multiprocess - True turns multiprocessing on
#   batch_size   - number of FOVs processed in parallel (ignored when multiprocess is False)
multiprocess, batch_size = (
    pixie_config['pixels']['multiprocess'],
    pixie_config['pixels']['batch_size'],
)

print(f'Multiprocess: {multiprocess}')
print(f'Batch size: {batch_size}')

## 2: Preprocess

This notebook does not apply a run-specific prefix: all data directories/files created during pixel clustering are placed directly inside `pixel_output_dir`.

The following data directories/files will be created inside `pixel_output_dir`:

* `preprocessed_dir`: directory name where the preprocessed pixel data are stored
* `subsetted_dir`: directory name where the subsetted pixel data are stored
* `norm_vals_name`: file name where the values used to normalize each channel are stored

In [None]:
# locations (relative to output_dir) of the preprocessed pixel data, the
# subsetted training data, and the channel normalization values file
pixel_data_dir, pixel_subset_dir, norm_vals_name = (
    os.path.join(pixel_output_dir, leaf)
    for leaf in ('pixel_mat_data', 'pixel_mat_subset', 'channel_norm_post_rowsum.feather')
)

Set the following arguments:

* `channels`: channels to run pixel clustering on
* `blur_factor`: sigma (standard deviation) for the Gaussian blur. Higher values are more aggressive in smoothing signal.
* `subset_proportion`: the fraction of pixels to take from each FOV for training. Sampling is random.

In [None]:
# channel lists are read from the top level of the config
channels, type_channels = pixie_config['channels'], pixie_config['type_channels']

# blur sigma and training-subset fraction are read from the 'pixels' section
blur_factor, subset_proportion = (
    pixie_config['pixels'][key] for key in ('blur_factor', 'subset_proportion')
)

print(f'Channels to use: {channels}\n')
print(f'Blur Factor: {blur_factor}')
print(f'Subset Proportion: {subset_proportion}\n')

During pixel preprocessing, the following is done for each FOV:

* Gaussian blur each channel separately
* Remove empty pixels
* For the remaining pixels, normalize each pixel by the sum of all the channels
* Subset a `subset_proportion` fraction of non-empty, normalized pixels. This creates the subsetted dataset for training

Note: if you get integer overflow errors loading in your data, try changing the `dtype` argument to a larger type.

In [None]:
# Preprocess every FOV: the full pixel data is stored under pixel_data_dir and
# the randomly subsetted training data under pixel_subset_dir.
pixie_preprocessing.create_pixel_matrix(
    fovs, channels, output_dir, tiff_dir, masks_dir,
    # input layout
    img_sub_folder=None,
    seg_suffix=seg_suffix,
    # preprocessing parameters
    blur_factor=blur_factor,
    subset_proportion=subset_proportion,
    seed=SEED,
    # output locations
    pixel_output_dir=pixel_output_dir,
    data_dir=pixel_data_dir,
    subset_dir=pixel_subset_dir,
    norm_vals_name=norm_vals_name,
    # execution
    multiprocess=multiprocess,
    batch_size=batch_size,
)

## 3: Pixel clustering

### 3.1: Train pixel SOM

Train the pixel SOM using the subsetted data. Training is done using a self-organizing map (SOM).

The following data directories/files will be created for pixel clustering:

* `pixel_som_weights_name`: file name to store the pixel SOM weights
* `pc_chan_avg_som_cluster_name`: file name to store the average channel expression across all pixel SOM clusters
* `pc_chan_avg_meta_cluster_name`: same as above for pixel meta clusters
* `pixel_meta_cluster_remap_name`: file name to store the SOM cluster to meta cluster manual mappings created using the GUI below

In [None]:
# every pixel clustering artefact is stored inside pixel_output_dir:
# SOM weights, per-SOM-cluster averages, per-meta-cluster averages, and the
# SOM-to-meta cluster remapping saved by the GUI
(
    pixel_som_weights_name,
    pc_chan_avg_som_cluster_name,
    pc_chan_avg_meta_cluster_name,
    pixel_meta_cluster_remap_name,
) = (
    os.path.join(pixel_output_dir, file_name)
    for file_name in (
        'pixel_som_weights.feather',
        'pixel_channel_avg_som_cluster.csv',
        'pixel_channel_avg_meta_cluster.csv',
        'pixel_meta_cluster_mapping.csv',
    )
)

Each channel is normalized by its 99.9th-percentile value across the entire dataset before training. These values are saved to `norm_vals_name`.

For a full set of parameters you can customize for `train_pixel_som`, please consult <a href=https://ark-analysis.readthedocs.io/en/latest/_markdown/ark.phenotyping.html#ark.phenotyping.pixel_cluster_utils.train_pixel_som>pixel training docs</a>.

In [None]:
# Train the SOM on the subsetted pixel data. The returned object is kept in
# pixel_pysom because the cluster-assignment cell needs it.
pixel_pysom = pixel_som_clustering.train_pixel_som(
    fovs, channels, output_dir,
    # training settings
    num_passes=1,
    seed=SEED,
    # files read / written during training
    subset_dir=pixel_subset_dir,
    norm_vals_name=norm_vals_name,
    som_weights_name=pixel_som_weights_name,
)

### 3.2: Assign pixel SOM clusters

Use the SOM weights learned from `train_pixel_som` to assign pixel clusters to the full preprocessed dataset.

Note that each channel is normalized by the respective value stored in `norm_vals_name` (computed in `train_pixel_som`) prior to cluster assignment.

`generate_som_avg_files` will compute the average channel expression across all pixel SOM clusters, as well as the number of pixels in each pixel SOM cluster (the data in `pc_chan_avg_som_cluster_name`). This is needed for consensus clustering.

In [None]:
# Step 1: label the full preprocessed dataset with the trained SOM
pixel_som_clustering.cluster_pixels(
    fovs, channels, output_dir, pixel_pysom,
    data_dir=pixel_data_dir,
    batch_size=batch_size,
    multiprocess=multiprocess,
)

# Step 2: write the per-SOM-cluster summary file (needed for consensus clustering)
pixel_som_clustering.generate_som_avg_files(
    fovs, channels, output_dir, pixel_pysom,
    data_dir=pixel_data_dir,
    seed=SEED,
    pc_chan_avg_som_cluster_name=pc_chan_avg_som_cluster_name,
)

### 3.3: Run pixel consensus clustering

Use consensus hierarchical clustering to cluster pixel SOM clusters into a user-defined number of meta clusters. The consensus clusters are trained on the average channel expression across all pixel SOM clusters (the data stored in `pc_chan_avg_som_cluster_name`). These values are z-scored and capped at the value specified in the `cap` argument prior to consensus clustering. This helps improve meta clustering performance.

After consensus clustering, the following are computed by `generate_meta_avg_files`:

* The average channel expression across all pixel meta clusters, and the number of pixels per meta cluster (the data in `pc_chan_avg_meta_cluster_name`)
* The meta cluster mapping for each pixel SOM cluster in `pc_chan_avg_som_cluster_name` (data is resaved, same data except with an associated meta cluster column)

For a full set of parameters you can customize for `pixel_consensus_cluster`, please consult <a href=https://ark-analysis.readthedocs.io/en/latest/_markdown/ark.phenotyping.html#ark.phenotyping.pixel_cluster_utils.pixel_consensus_cluster>pixel consensus clustering docs</a>

* `max_k`: the number of consensus clusters desired
* `cap`: used to clip z-scored values prior to consensus clustering (in the range `[-cap, cap]`)

In [None]:
# number of meta clusters and z-score clipping bound, both from the 'pixels' config section
max_k, cap = pixie_config['pixels']['meta_max_k'], pixie_config['pixels']['meta_cap']
print(f'For metaclustering using max_k: {max_k} and z-score cap: [-{cap}, +{cap}].\n')

# Consensus (hierarchical) clustering on the average expression of each pixel
# SOM cluster; the result is kept because the summary step below needs it.
pixel_cc = pixel_meta_clustering.pixel_consensus_cluster(
    fovs, channels, output_dir,
    # clustering settings
    max_k=max_k,
    cap=cap,
    seed=SEED,
    # data locations
    data_dir=pixel_data_dir,
    pc_chan_avg_som_cluster_name=pc_chan_avg_som_cluster_name,
    # execution
    multiprocess=multiprocess,
    batch_size=batch_size,
)

# Write the per-meta-cluster summary file and attach the meta cluster labels
# to the SOM cluster summary file.
pixel_meta_clustering.generate_meta_avg_files(
    fovs, channels, output_dir, pixel_cc,
    data_dir=pixel_data_dir,
    seed=SEED,
    pc_chan_avg_som_cluster_name=pc_chan_avg_som_cluster_name,
    pc_chan_avg_meta_cluster_name=pc_chan_avg_meta_cluster_name,
)

## 4: Visualize results

### 4.1: Interactive adjustments to relabel pixel meta clusters

The visualization shows the z-scored average channel expression per pixel SOM and meta cluster. The heatmaps are faceted by pixel SOM clusters on the left and pixel meta clusters on the right.

## Usage

### Quickstart
- **Select**: Left Click
- **Remap**: **New metacluster** button or Right Click
- **Edit Metacluster Name**: Textbox at bottom right of the heatmaps.

### Selection and remapping details
- To select a SOM cluster, click on its respective position in the **selected** bar. Click on it again to deselect.
- To select a meta cluster, click on its corresponding color in the **metacluster** bar. Click on it again to deselect.
- To remap the selected clusters, click the **New metacluster** button (alternatively, right click anywhere). Note that remapping an entire metacluster deletes it.
- To clear the selected SOM/meta clusters, use the **Clear Selection** button.
- **After remapping a meta cluster, make sure to deselect the newly created one to prevent unwanted combinations.**

### Other features and notes
- You will likely need to zoom out to see the entire visualization. To toggle Zoom, use Ctrl -/Ctrl + on Windows or ⌘ +/⌘ - on Mac.
- The bars at the top show the number of pixels in each SOM cluster.
- The text box at the bottom right allows you to rename a particular meta cluster. This can be useful as remapping may cause inconsistent numbering.
- Adjust the z-score limit using the slider on the bottom left to adjust your dynamic range.
- When meta clusters are combined or a meta cluster is renamed, the change is immediately saved to `pixel_meta_cluster_remap_name`.
- **You won't be able to advance in the notebook until you've clicked `New metacluster` or renamed a meta cluster at least once. If you don't want to make changes, just click `New metacluster` to trigger a save before continuing.**

In [None]:
# Switch matplotlib to the interactive widget backend (IPython magic; only
# valid inside a notebook), reset rcParams to the rc-file defaults, and turn
# interactive mode on before the GUI is built.
%matplotlib widget
rc_file_defaults()
plt.ion()

# Load the pixel SOM cluster averages, restricted to the channels listed in
# type_channels, as the data backing the remapping GUI.
pixel_mcd = metaclusterdata_from_files(
    os.path.join(output_dir, pc_chan_avg_som_cluster_name),
    cluster_type='pixel',
    prefix_trim=None,
    subset_channels=type_channels
)
# File the GUI writes the SOM -> meta cluster mapping to; the remapping cells
# further down read this same file.
pixel_mcd.output_mapping_filename = os.path.join(output_dir, pixel_meta_cluster_remap_name)
# Keep a reference to the GUI object: its colormap is read later when the
# meta cluster colour scheme is generated.
pixel_mcg = MetaClusterGui(pixel_mcd, width=9)

Relabel the pixel meta clusters using the mapping, and recompute the meta cluster average files with the new meta cluster names.

In [None]:
# Apply the mapping saved by the GUI to the meta cluster labels of the pixel data
pixel_meta_clustering.apply_pixel_meta_cluster_remapping(
    fovs, channels, output_dir,
    pixel_data_dir,
    pixel_meta_cluster_remap_name,
    batch_size=batch_size,
    multiprocess=multiprocess,
)

# Recompute the per-meta-cluster channel averages and carry the new names over
# to the SOM cluster average data
pixel_meta_clustering.generate_remap_avg_files(
    fovs, channels, output_dir,
    pixel_data_dir,
    pixel_meta_cluster_remap_name,
    pc_chan_avg_som_cluster_name,
    pc_chan_avg_meta_cluster_name,
    seed=SEED,
)

Generate the color scheme returned by the interactive reclustering process. This will be for visualizing the pixel phenotype maps.

In [None]:
# Build the meta cluster colour scheme from the mapping file written by the
# GUI and the colormap currently held by the GUI object. The helper returns
# two values; only the first is used (it is passed as `metacluster_colors`
# when the phenotype maps are plotted).
raw_cmap, _ = colormap_helper.generate_meta_cluster_colormap_dict(
    pixel_mcd.output_mapping_filename,
    pixel_mcg.im_cl.cmap
)

### 4.2: Generate pixel phenotype maps

Generate pixel phenotype maps, in which each pixel in the image corresponds to its pixel meta cluster. Select a small subset of your FOVs to view within this notebook. If you wish to generate and save masks for a large number of FOVs, they will be created and saved in batches. Note that because each pixel value corresponds to a metacluster number, masks likely will not render with colors in image viewer software.

Files will be written as `{fov_name}_pixel_mask.tiff` in `pixel_output_dir`

In [None]:
# name of the first '.tiff' file listed in the first FOV's image folder
chan_file = io_utils.list_files(os.path.join(tiff_dir, fovs[0]), substrs=['.tiff'])[0]

# generate and save the pixel cluster masks for each fov in validation_fovs
data_utils.generate_and_save_pixel_cluster_masks(
    fovs=validation_fovs,
    # inputs
    base_dir=output_dir,
    tiff_dir=tiff_dir,
    chan_file=chan_file,
    pixel_data_dir=pixel_data_dir,
    pixel_cluster_col='pixel_meta_cluster',
    # outputs
    save_dir=os.path.join(output_dir, pixel_output_dir),
    sub_dir='pixel_masks',
    name_suffix='_pixel_mask',
)

Load a subset of the pixel phenotype maps that you would like to preview.

In [None]:
# Preview each validation FOV: read its saved mask back from disk, then plot it
# with the colour scheme held in raw_cmap.
for pixel_fov in validation_fovs:
    # load only this FOV's mask file from the pixel_masks folder
    pixel_cluster_mask = load_utils.load_imgs_from_dir(
        data_dir=os.path.join(output_dir, pixel_output_dir, "pixel_masks"),
        files=[pixel_fov + "_pixel_mask.tiff"],
        match_substring="_pixel_mask",
        trim_suffix="_pixel_mask",
        xr_channel_names=None,
        xr_dim_name="pixel_mask",
    )

    # overlay the meta cluster labels using the saved remapping file
    plot_utils.plot_pixel_cell_cluster_overlay(
        pixel_cluster_mask,
        [pixel_fov],
        os.path.join(output_dir, pixel_meta_cluster_remap_name),
        metacluster_colors=raw_cmap,
    )

## 5: Save parameters for use in cell clustering

The following parameters are saved:

* `pixel_data_dir`: name of the directory containing the full pixel data with the pixel SOM and meta cluster assignments
* `pc_chan_avg_som_cluster_name`: name of the file containing the average channel expression per pixel SOM cluster
* `pc_chan_avg_meta_cluster_name`: same as above for pixel meta clusters

The file will be saved as `cell_clustering_params.json` in `pixel_output_dir`. Note that the `pixel_output_dir` you use in `1_Pixie_Cluster_Pixels.ipynb` should be the same as in `2_Pixie_Cluster_Cells.ipynb`.

In [None]:
# Parameters handed over to the cell clustering notebook: the clustered pixel
# data directory and the two channel-average summary files.
cell_clustering_params = {
    'pixel_data_dir': pixel_data_dir,
    'pc_chan_avg_som_cluster_name': pc_chan_avg_som_cluster_name,
    'pc_chan_avg_meta_cluster_name': pc_chan_avg_meta_cluster_name,
}

# serialize the dict and write it as cell_clustering_params.json inside pixel_output_dir
with open(os.path.join(output_dir, pixel_output_dir, 'cell_clustering_params.json'), 'w') as fh:
    fh.write(json.dumps(cell_clustering_params))

### 5.1 Save images for Mantis Viewer

Mantis Viewer is a visualization tool for multi-dimensional imaging in pathology.

In [None]:
# Assemble a Mantis Viewer project for the validation FOVs from the raw images,
# the saved pixel masks, the meta cluster mapping and the segmentation masks.
plot_utils.create_mantis_dir(
    fovs=validation_fovs,
    mantis_project_path=os.path.join(output_dir, pixel_output_dir, "mantis"),
    # image data
    img_data_path=tiff_dir,
    # pixel cluster masks and their meta cluster mapping
    mask_output_dir=os.path.join(output_dir, pixel_output_dir, "pixel_masks"),
    mask_suffix="_pixel_mask",
    mapping=os.path.join(output_dir, pixel_meta_cluster_remap_name),
    # segmentation masks
    seg_dir=masks_dir,
    seg_suffix_name=seg_suffix,
)