# Single Cell Decompression Training

## Import Libraries

First, we import the necessary libraries.

In [12]:
# Import libraries
import anndata as ad
import tifffile
from pathlib import Path
import errno
import os
import numpy as np
import pandas as pd

## Helper Functions

Next, we define some helper functions.

In [2]:
# Function that checks that file_path is an existing file and has a certain extension
def is_valid_file(file_path, extension):
    """Return True if ``file_path`` is an existing regular file with an allowed suffix.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the file to check.
    extension : str or iterable of str
        Allowed suffix(es) including the leading dot, e.g. ``'.csv'`` or
        ``['.tif', '.tiff']``.

    Returns
    -------
    bool
        True only when the path is an existing regular file whose suffix is
        exactly one of the allowed suffixes.
    """
    # A bare string must be treated as ONE suffix. Testing `suffix in '.tiff'`
    # directly is a substring test, which accepts files without any suffix
    # ('' is a substring of everything) as well as '.tif', '.ti', ...
    if isinstance(extension, str):
        allowed = (extension,)
    else:
        allowed = tuple(extension)

    file = Path(file_path)
    # is_file() is already False for paths that do not exist
    return file.is_file() and file.suffix in allowed
    
    
# Function that checks if directory exists and contains at least one
# non-hidden file with one of the given extensions
def is_valid_directory(directory_path, extensions=('.tiff', '.tif')):
    """Return True if ``directory_path`` is a directory holding at least one matching file.

    Parameters
    ----------
    directory_path : str or os.PathLike
        Directory to inspect.
    extensions : iterable of str, optional
        Accepted file suffixes including the leading dot. Defaults to the two
        common TIFF suffixes; '.tif' is listed explicitly because the earlier
        substring-based check accepted it implicitly.

    Returns
    -------
    bool
        False when the path does not exist, is not a directory, or contains
        no non-hidden file with an accepted suffix.
    """
    directory = Path(directory_path)
    # Also covers "exists but is a regular file", where iterdir() would raise
    if not directory.is_dir():
        return False

    # iterdir() yields paths that already include the directory, so they are
    # passed on unchanged (joining them onto the directory again would
    # duplicate the prefix for relative paths). Hidden files are skipped.
    return any(is_valid_file(entry, tuple(extensions))
               for entry in directory.iterdir()
               if not entry.name.startswith('.'))

## Inputs

In the first part, we specify the paths to the input files (the .h5ad file created from an R SpatialExperiment object, the TIFF files generated by the steinbock pipeline, and the panel metadata file) and where the outputs should be stored.

In [10]:
# Specify input paths
spe_path = Path('spe.h5ad')
tiffs_path = Path('../img')
panel_path = Path('../panel.csv')


# Check that input files/directory exist
if not is_valid_file(spe_path, ['.h5ad']):
    # If file is not found, throw error
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                            spe_path)
if not is_valid_file(panel_path, ['.csv']):
    # If file is not found, throw error
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                            panel_path)
if not is_valid_directory(tiffs_path):
    # If directory is not found or doesn't contain TIFF files, throw an error.
    # The message is assembled from adjacent literals: a backslash continuation
    # inside a single literal would embed the next line's indentation in it.
    tiffs_error = ('Input TIFFS directory {0} does not exist or does '
                   'not contain any valid TIFF files.').format(tiffs_path)
    raise FileNotFoundError(tiffs_error)


# Specify output path
out_path = Path('../preprocessed_data')

# Create output directory if it doesn't exist
out_path.mkdir(parents=True, exist_ok=True)

Next, we read in the input files for training. We use the Tonsil th152 dataset, which consists of 5 ROIs, and read the data in twice: once after it has been processed by steinbock into segmented single cells, and once directly from the hot pixel corrected TIFF image files. From this we create one numpy array of channels x cells and one numpy array of channels x pixels.

In [62]:
# Read in SpatialExperiment converted to anndata by cellconverter in R.
# The file location comes from spe_path; printing the object shows its
# dimensions (n_obs x n_vars) and the names of its obs/var/uns/layers fields.
spe = ad.read_h5ad(spe_path)
print(spe)

AnnData object with n_obs × n_vars = 156121 × 43
    obs: 'sample_id', 'ObjectNumber', 'area', 'major_axis_length', 'minor_axis_length', 'eccentricity', 'width_px', 'height_px', 'ROI'
    var: 'channel', 'name', 'keep', 'ilastik', 'deepcell', 'Tube.Number', 'Metal', 'use_channel'
    uns: 'X_name'
    layers: 'exprs', 'log_exprs'


In [64]:
# Read in TIFF images and flatten into one numpy array (channels x pixels).
# The directory is listed once and sorted: iterdir() returns entries in
# arbitrary order, so two separate scans are not guaranteed to pair names with
# pixel data, and an unsorted listing makes the pixel order irreproducible.
image_files = sorted(f for f in tiffs_path.iterdir()
                     if f.is_file() and not f.name.startswith('.'))
image_names = [f.stem for f in image_files]
# The listed paths already contain tiffs_path, so they are read as they are
images_unflattened = [tifffile.imread(str(f)) for f in image_files]
# Each image is indexed as (channels, dim1, dim2); collapse the last two axes
images_list = [img.reshape(img.shape[0], (img.shape[1]*img.shape[2])) for img in images_unflattened]

# Read in panel metadata
images_panel = pd.read_csv(panel_path)

# Create anndata from images
# Add image intensities per pixel matrix (pixels x channels after transposing)
images = ad.AnnData(np.transpose(np.hstack(images_list)))

# Add variable names
num_pixels = [img.shape[1] for img in images_list]
images.var_names = images_panel['name']

# Prefix that is stripped from the image name to obtain the ROI label
roi_prefix = '20220520_TsH_th152_cisi1_00'

# Add observation metadata (one row per pixel, images in sorted file order)
sample_ids = [name for name, n in zip(image_names, num_pixels) for _ in range(n)]
images.obs = pd.DataFrame({
    'sample_id': sample_ids,
    'ObjectNumber': [j for n in num_pixels for j in range(n)],
    'ROI': [name.replace(roi_prefix, '')
            for name, n in zip(image_names, num_pixels) for _ in range(n)]
})
# NOTE(review): the observation index ends up as the (non-unique) sample_id,
# exactly as before. The previous code also assigned unique '<image>_<pixel>'
# obs_names first, but replacing obs and its index discarded them, so that
# dead assignment was dropped. Confirm whether unique names were intended.
images.obs.index = sample_ids

# Add panel data as variable metadata
images.var = images_panel
images.var.index = images.var['name'].tolist()
print(images)


OverloadedDict, wrapping:
	OrderedDict()
With overloaded keys:
	['neighbors'].
