# Single Cell Decompression Training

## Import Libraries

First, we import the necessary libraries.

In [1]:
# Import libraries
import anndata as ad
from pathlib import Path
import errno
import os
# import numpy as np
# import pandas as pd

## Helper Functions

Next, we import some helper functions.

In [2]:
import preprocessing_utils

## Inputs

In the first part we specify the paths to the input files (.h5ad files created from R SpatialExperiment, TIFF files generated by the steinbock pipeline and panel metadata file) and where the outputs should be stored.

In [3]:
# Root directory of the training dataset; every other path is derived from it
training_data_path = Path('/mnt/bb_dqbm_volume/data/Tonsil_th152')

# Input locations, joined onto the root with pathlib's "/" operator
spe_path = training_data_path / 'preprocessed_data/spe.h5ad'
tiffs_path = training_data_path / 'steinbock/img'
panel_path = training_data_path / 'steinbock/panel.csv'

# Output location
out_path = training_data_path / 'training'

# Make sure the output directory (and any missing parents) is present;
# exist_ok=True keeps re-running this cell from raising
out_path.mkdir(parents=True, exist_ok=True)

In [4]:
# Check that the input files/directory exist before any data is loaded.
# Each failed check raises FileNotFoundError so that a missing input stops the
# notebook here instead of surfacing later as an unrelated error.
if not preprocessing_utils.is_valid_file(spe_path, ['.h5ad']):
    # If file is not found, throw error
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                            spe_path)
if not preprocessing_utils.is_valid_file(panel_path, ['.csv']):
    # If file is not found, throw error
    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                            panel_path)
if not preprocessing_utils.is_valid_directory(tiffs_path):
    # If directory is not found or doesn't contain TIFF files, throw an error.
    # The message is built from adjacent string literals rather than a
    # backslash continuation inside one literal, which would embed the next
    # line's leading indentation in the message text.
    raise FileNotFoundError(
        'Input TIFFS path {0} directory does not exist or does '
        'not contain any valid TIFF files.'.format(tiffs_path))

Next, we read in the input files for training. For this, we use the Tonsil th152 dataset, which consists of 5 ROIs. We read the data in twice: once after it has been processed by steinbock into segmented single cells, and once directly from the hot pixel corrected TIFF image files. From this we create one numpy array of channels x cells and one numpy array of channels x pixels.

In [5]:
# Read in SpatialExperiment converted to anndata by cellconverter in R.
# ad.read_h5ad loads the .h5ad file at spe_path into an AnnData object.
spe = ad.read_h5ad(spe_path)
# Print the object's summary (dimensions and annotation fields) as a sanity check
print(spe)

AnnData object with n_obs × n_vars = 162624 × 43
    obs: 'sample_id', 'ObjectNumber', 'area', 'major_axis_length', 'minor_axis_length', 'eccentricity', 'width_px', 'height_px', 'ROI'
    var: 'channel', 'name', 'keep', 'ilastik', 'deepcell', 'Tube.Number', 'Metal', 'use_channel'
    uns: 'X_name'
    layers: 'exprs', 'log_exprs'


In [6]:
# Build an AnnData object from the TIFF images in tiffs_path and the panel
# file at panel_path using the project helper.
# NOTE(review): presumably one observation per pixel with the panel columns
# as variable annotations - confirm against preprocessing_utils.anndata_from_tiff
images = preprocessing_utils.anndata_from_tiff(tiffs_path, panel_path)
# Print the object's summary (dimensions and annotation fields) as a sanity check
print(images)


AnnData object with n_obs × n_vars = 9250000 × 43
    obs: 'sample_id', 'ObjectNumber', 'ROI'
    var: 'channel', 'name', 'keep', 'ilastik', 'deepcell', 'Tube Number', 'Metal'
