# Single Cell Decompression Training

## Import Libraries

First, we import the necessary libraries.

In [1]:
# Import libraries
import anndata as ad
from pathlib import Path
import errno
import os
import sys

## Helper Functions

Next, we import some helper functions, including the CISI segmentation training function.

In [4]:
## CISI
# Locate the project's code directory. Note that '__file__' is a quoted string
# literal, not the __file__ variable (presumably because notebooks do not
# define it), so dirname() returns '' and the relative path below resolves
# against the current working directory.
THIS_DIR = os.path.dirname('__file__')
CODE_DIR = os.path.abspath(os.path.join(THIS_DIR, '../..', 'code'))
# Make the code directory importable. Skip the append when the entry is
# already present so that re-running this cell does not accumulate duplicates.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

# Import the CISI training function
from train_dictionary_and_compositions import train_U_and_A

## Inputs

In the first part, we specify the paths to the input files (.h5ad files created from an R SpatialExperiment) and where the outputs should be stored. (Note: There are also some lines for reading in the TIFF files generated by the steinbock pipeline and the panel metadata file. These are for reading pixel-level expression values into an anndata object, so that CISI can be run directly on pixels. This has not worked so far, so that code is commented out in this file.) It is advisable to check that the input files actually exist before continuing. An example of how to do so is provided in analysis_utils and shown (commented out) below.

In [5]:
# Input locations: root data volume and the preprocessed single-cell file
training_data_path = Path('/mnt/bb_dqbm_volume')
spe_path = training_data_path / 'data/Tonsil_th152/preprocessed_data/spe.h5ad'
'''
tiffs_path = Path(os.path.join(training_data_path, 'data/Tonsil_th152/steinbock/img'))
panel_path = Path(os.path.join(training_data_path, 'data/Tonsil_th152/steinbock/panel.csv'))
'''

# Output location for this example run
out_path = training_data_path / 'analysis/Tonsil_th152/example'

# Ensure the output directory (and any missing parents) exists
out_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Disabled cell: the whole validation snippet is wrapped in a bare string
# literal, so none of it is executed.
# NOTE(review): the snippet references panel_path and tiffs_path, which are
# themselves disabled in the input-paths cell; define them before re-enabling
# this check.
'''
# Helper fncs
import analysis_utils


# Check that input files/dictionary exist
if not analysis_utils.is_valid_file(spe_path, ['.h5ad']):
    # If file is not found, throw error
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                            spe_path)
if not analysis_utils.is_valid_file(panel_path, ['.csv']):
    # If file is not found, throw error
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                            panel_path)
if not analysis_utils.is_valid_directory(tiffs_path):
    # If directory is not found or doesn't contain TIFF files, throw an error
    raise Exception('Input TIFFS path {0} directory does not exist or does\
    not contain any valid TIFF files.'.format(tiffs_path)) from FileNotFoundError
'''

Next, we read in the input files for training. For this, we use the Tonsil th152 dataset, which consists of 5 ROIs; we read in the data after it has been processed by steinbock into segmented single cells and saved as an anndata object (.h5ad). From this we create one numpy array of channels x cells and one numpy array of channels x pixels.

In [16]:
# Load the single-cell data (SpatialExperiment exported from R as .h5ad)
spe = ad.read_h5ad(spe_path)
# Flag channels whose name matches 'Histone' or 'Ir<digit>' (case-insensitive)
# and keep only the remaining ones
is_excluded = spe.var.index.str.contains('Histone|Ir[0-9]', regex=True, case=False)
spe = spe[:, ~is_excluded]
print(spe)

View of AnnData object with n_obs × n_vars = 162624 × 40
    obs: 'sample_id', 'ObjectNumber', 'area', 'major_axis_length', 'minor_axis_length', 'eccentricity', 'width_px', 'height_px', 'ROI'
    var: 'channel', 'name', 'keep', 'ilastik', 'deepcell', 'Tube.Number', 'Metal', 'use_channel', 'channel_name'
    uns: 'X_name'
    layers: 'compcounts', 'compexprs', 'exprs', 'log_exprs'


In [None]:
# Disabled cell: the pixel-level loading code is wrapped in a bare string
# literal, so it is not executed.
# NOTE(review): it relies on analysis_utils, tiffs_path and panel_path, none of
# which are defined by the active cells of this notebook.
'''
# Pixel-level 
images = analysis_utils.anndata_from_tiff(tiffs_path, panel_path)
# Remove uninteresting proteins/channels
images = images[:, ~images.var.index.str.contains('Histone|Ir[0-9]', regex=True, case=False)]
print(images)
'''

We now run the CISI training function for IMC. To have a look at the different parameter options, please refer to the code directly.

In [None]:
# Run CISI training on the single-cell AnnData object.
# The five return values are unpacked inside parentheses so the assignment
# target can span two lines; without them the first line is evaluated as a
# standalone expression and raises NameError.
# The second positional argument is the output directory defined earlier as
# out_path (the previous spelling, outpath, was never defined).
# See the train_U_and_A source for the meaning of the individual parameters.
(training_res, training_res_no_noise, U_best,
 Phi_best, X_test_index) = train_U_and_A(
    spe,
    out_path,
    test_set=('20220520_TsH_th152_cisi1_001',),
    layer=None,
    d=80,
    lda1=3,
    lda2=0.2,
    maxItr=10,
    UW=None,
    posW=False,
    posU=True,
    use_chol=False,
    module_lower=1,
    activity_lower=1,
    donorm=False,
    mode_smaf=1,
    mink=0,
    U0=[],
    U0_delta=0.1,
    doprint=False,
    normalization='paper_norm',
    THREADS_smaf=4,
    nmeasurements=10,
    maxcomposition=3,
    mode_phi='G',
    lasso_sparsity=0.2,
    THREADS_A=20,
    num_phi=1,
    THREADS_A_and_U=20,
    split_by='roi',
    k_cv=4,
    test_size=None,
    threshold_cond_prob=10.0,
    save='no_noise',
    snr=5,
    analysis_normalization=True,
    maxItr_A=2000,
    num_blocks=20,
    best_A_method='mean',
)