In [None]:
import sys
sys.path.insert(0, '../')
import anndata as ad
import scanpy as sc
import matplotlib.pyplot as plt
import os
from random import seed
import scvi
from utils.utils import correction_validity, setup_logger, create_directory, nuclei_clustering, \
                        setup_reference_label, DEGs_HVGs, scIntegration, UMAP_visualization, save_anndata

# __Setup Non-Tuned Parameters (Unsupervised Model - [scVI](https://www.nature.com/articles/s41592-018-0229-2))__
⭐ <u>SEED</u>: random seed, guarantee reproducibility of results;<br>
⭐ <u>accelerator</u>: accelerator type - e.g., 'gpu';<br>

⭐ <u>dir_adata</u>: directory of AnnData to be integrated, should be with dimension n_cell X n_genes [e.g., all protein coding genes];<br>
⭐ <u>layer</u>: layers in AnnData, scVI and scANVI use raw counts;<br>
⭐ <u>level</u>: cell type resolution level; e.g., level = 1 indicates all cell types, level = 2 indicates cell type subtypes, like Neuron subtypes, etc.;<br>
⭐ <u>celltype</u>: cell type of the AnnData to be integrated (e.g., 'All');<br>
⭐ <u>dataset_key</u>: column name in integrated AnnData that denotes dataset;<br>

⭐ <u>dir_scvi</u>: directory of saving scVI (unsupervised) model;<br>
⭐ <u>dir_hvg</u>: directory of saving highly variable genes;<br>
⭐ <u>top_hvg_genes</u>: number of top highly variable genes considered;<br>
⭐ <u>subset</u>: Inplace subset to highly-variable genes if True otherwise merely indicate highly variable genes;<br>
⭐ <u>span</u>: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3';<br>

⭐ <u>use_unsupervised</u>: use unsupervised model (True -> scVI) or semi-supervised model (False -> scANVI);<br>

⭐ <u>train_ratio</u>: ratio of training data; the remaining (1 - train_ratio) is shared by the validation and test data;<br>
⭐ <u>test_ratio</u>: fraction of the non-training data (validation + test) that is assigned to the test set, e.g., train_ratio = 0.6, test_ratio = 0.5, implies training data is 60%, validation data is 20% and test data is 20%;<br>
⭐ <u>shuffle</u>: whether or not to shuffle the data before splitting;<br>
⭐ <u>correction</u>: various types of corrections to be adjusted during integration, it can be 'no_batch_covariates': no batch and any covariates adjustments; 'batch_only': only adjust batch; 'covariates_only': only adjust covariates; 'batch_covariates': adjust both batch and covariates;<br>
⭐ <u>batch_key</u>: batch column in AnnData.obs;<br>
⭐ <u>categorical_covariate_keys</u>: list of categorical covariates to be adjusted during integration, e.g., sex, brain_region;<br>
⭐ <u>continuous_covariate_keys</u>: list of continuous covariates to be adjusted during integration, e.g., QC factors.<br>

⭐ <u>clustering_resolution</u>: resolution of cell/nuclei clustering;<br>
⭐ <u>clustering_method</u>: clustering algorithm, either 'leiden' or 'louvain'.<br>

In [None]:
# --- Reproducibility and hardware -------------------------------------------
SEED = 1234
accelerator = "gpu"

# --- Input AnnData -----------------------------------------------------------
dir_adata = "../test_Data/raw_count_before_integration.h5ad"
layer = "counts"
level = 1
celltype = "All"
dataset_key = "Dataset"

# --- Output locations and highly-variable-gene selection ---------------------
dir_scvi = f"../output_test/Level{level}/{celltype}/scVI"
dir_hvg = f"../output_test/Level{level}/{celltype}_HVG.txt"
top_hvg_genes = 2000
subset = True
span = 1.0

# --- Model family: True -> scVI (unsupervised), False -> scANVI --------------
use_unsupervised = False

# --- Data split and batch/covariate correction -------------------------------
train_ratio = 0.6
test_ratio = 0.5
shuffle = True
correction = "batch_covariates"
batch_key = "BatchID"
categorical_covariate_keys = None
continuous_covariate_keys = [
    "n_genes_by_counts",
    "total_counts",
    "pct_counts_mt",
    "pct_counts_rb",
]

# Check the correction mode against the batch/covariate keys chosen above.
# NOTE(review): the exact checks live in utils.utils.correction_validity and
# are not visible here -- presumably it rejects inconsistent combinations.
correction_validity(
    correction=correction,
    batch_key=batch_key,
    continuous_covariate_keys=continuous_covariate_keys,
    categorical_covariate_keys=categorical_covariate_keys,
)

# --- Clustering --------------------------------------------------------------
clustering_resolution = 0.5
clustering_method = "leiden"

# __Setup Additional Non-Tuned Parameters (Semi-supervised Model - [scANVI](https://www.embopress.org/doi/full/10.15252/msb.20209620))__
⭐ <u>dir_scanvi</u>: directory of saving scANVI (semi-supervised) model;<br>
⭐ <u>labels_key</u>: label column in AnnData.obs, e.g., Column where you save the cell type labels;<br>
⭐ <u>scanvi_epochs</u>: number of epochs for training scANVI (semi-supervised) model;<br>
⭐ <u>dir_deg</u>: directory of saving differentially expressed genes (if using semi-supervised model);<br>
⭐ <u>deg_group</u>: column in AnnData.obs that is used as the group variable when computing differentially expressed genes (if using semi-supervised model);<br>
⭐ <u>top_deg_num</u>: number of top differentially expressed genes to be kept;<br>
⭐ <u>use_sampling</u>: whether or not to downsample when computing differentially expressed genes;<br>
⭐ <u>sampling_ratio</u>: downsampling ratio when computing differentially expressed genes;<br>

⭐ <u>prediction_key</u>: column in AnnData.obs that stores predicted cell type annotations;<br>
⭐ <u>use_reference</u>: whether or not to use reference cell labels for integration;<br>
⭐ <u>reference_datasets</u>: list of reference datasets to be used; prefer reference_datasets when you have multiple reference datasets that have not been merged;<br>
⭐ <u>dir_reference</u>: directory of the reference dataset; prefer dir_reference when you already have a single reference file, e.g., reference_adata.h5ad;<br>
⭐ <u>ref_label</u>: column in AnnData.obs that indicates the reference cell type labels;<br>

In [None]:

# Settings that only matter for the semi-supervised (scANVI) model.
# When the unsupervised model is selected they are all set to None.
if use_unsupervised:
    dir_scanvi = None
    labels_key = None
    scanvi_epochs = None
    dir_deg = None
    deg_group = None
    top_deg_num = None
    use_sampling = None
    sampling_ratio = None
    prediction_key = None
    use_reference = None
    ref_label = None
else:
    dir_scanvi = f'../output_test/Level{level}/{celltype}/scANVI'
    labels_key = 'seed_label'
    scanvi_epochs = 5
    dir_deg = f'../output_test/Level{level}/{celltype}_DEG.pkl'
    deg_group = labels_key  # DEGs are grouped by the label column
    top_deg_num = 500
    use_sampling = True
    sampling_ratio = 0.1
    prediction_key = 'C_scANVI'
    use_reference = True
    ref_label = 'Super_Celltype'

# Explicit exception instead of `assert`: assertions are removed when Python
# runs with -O, which would silently skip this validation.
if use_sampling and not 0 < sampling_ratio < 1:
    raise ValueError('Choose the correct sampling ratio.')

# Reference selection; these two are set regardless of the model family.
reference_datasets = ['37824655', '37824663']
dir_reference = None



# __Setup Tuned HyperParameters__
 
⭐ <u>enable_parameter_tuning</u>: boolean variable, indicating whether to tune hyperparameters or not. If enable_parameter_tuning = False, hyperparameters are set to their default values;<br>
⭐ <u>nums_random_trials</u>: number of random trials when performing hyperparameter tuning;<br>
⭐ <u>n_latent</u>: dimension of latent embeddings;<br>
⭐ <u>n_hidden</u>: dimensions of hidden layers;<br>
⭐ <u>n_layers</u>: number of hidden layers;<br>
⭐ <u>dispersion</u>: dispersion pattern of gene expression;<br>
⭐ <u>lr</u>: learning rate;<br>
⭐ <u>max_epochs</u>: number of epochs used to train the scVI (unsupervised) model.<br>

### __All parameters to be tuned follow the same <u>Dictionary</u> data structure.__ 

🔹 <u>default</u>: parameter default value;<br>
🔹 <u>continuous_low</u>: set it when the parameter is assumed to be continuous; <u>continuous_low</u> denotes the lower bound during tuning;<br>
🔹 <u>continuous_high</u>: set it when the parameter is assumed to be continuous; <u>continuous_high</u> denotes the upper bound during tuning;<br>
🔹 <u>categorical_choices</u>: setup it when parameter is assumed to be categorical, put all categorical values in a list;<br>
🔹 <u>log</u>: log = True when parameter is assumed to be continuous, otherwise log = False.<br>


In [None]:
# Master switch for hyperparameter tuning and the number of random trials.
enable_parameter_tuning = False
nums_random_trials = 5

# Every hyperparameter below uses the same five-key layout:
# default / continuous_low / continuous_high / categorical_choices / log.

# Dimension of the latent embedding.
n_latent = dict(
    default=50,
    continuous_low=None,
    continuous_high=None,
    categorical_choices=[10, 30, 50],
    log=False,
)

# Width of the hidden layers.
n_hidden = dict(
    default=64,
    continuous_low=None,
    continuous_high=None,
    categorical_choices=[64, 128, 256],
    log=False,
)

# Number of hidden layers.
n_layers = dict(
    default=1,
    continuous_low=None,
    continuous_high=None,
    categorical_choices=[1, 2, 3],
    log=False,
)

# Dispersion pattern of gene expression.
dispersion = dict(
    default="gene",
    continuous_low=None,
    continuous_high=None,
    categorical_choices=["gene", "gene-batch"],
    log=False,
)

# Learning rate: the only continuous entry, described by bounds not choices.
lr = dict(
    default=5e-4,
    continuous_low=1e-5,
    continuous_high=1e-3,
    categorical_choices=None,
    log=True,
)

# Number of training epochs for the scVI (unsupervised) model.
max_epochs = dict(
    default=10,
    continuous_low=None,
    continuous_high=None,
    categorical_choices=[50, 100, 200],
    log=False,
)



In [None]:
# Use the same seed for Python's `random` module and for scvi-tools.
seed(SEED)
scvi.settings.seed = SEED

# Logger named "TACA"; the log file name follows the cell type and the log
# directory follows the resolution level.
logger = setup_logger(
    name="TACA",
    log_file=f"{celltype}.log",
    log_dir=f"../output_test/Level{level}",
)

In [None]:
# Load and preprocess the AnnData to be integrated.
adata = ad.read_h5ad(dir_adata)

# Keep a copy of X under the configured `layer` name rather than a hard-coded
# "counts", so the layer written here is always the one passed to DEGs_HVGs
# and scIntegration further down.
# NOTE(review): assumes adata.X holds raw counts at this point -- confirm.
adata.layers[layer] = adata.X.copy()

n_cells, n_genes = adata.shape
logger.info(f"AnnData has {n_cells:,} cell/nuclei and {n_genes:,} genes.")

# The semi-supervised model needs a label column; setup_reference_label
# returns the AnnData used from here on.
if not use_unsupervised:
    adata = setup_reference_label(
        adata=adata,
        labels_key=labels_key,
        ref_label=ref_label,
        celltype=celltype,
        dataset_key=dataset_key,
        reference_datasets=reference_datasets,
        dir_reference=dir_reference,
    )



In [None]:
# Output folders for the models: the scVI folder is always created, the
# scANVI folder only when a path was configured for it.
_model_dirs = [dir_scvi]
if dir_scanvi is not None:
    _model_dirs.append(dir_scanvi)

for _model_dir in _model_dirs:
    create_directory(_model_dir)

## Function DEGs_HVGs

- Compute highly variable genes (HVGs) and differentially expressed genes (DEGs, if using semi-supervised model);

- Subset anndata.

In [None]:

# Gene selection: forwards the DEG and HVG settings configured above and
# continues with the AnnData that DEGs_HVGs returns.
adata = DEGs_HVGs(
    adata=adata,
    dir_deg=dir_deg,
    dir_hvg=dir_hvg,
    labels_key=labels_key,
    deg_group=deg_group,
    top_deg_num=top_deg_num,
    use_sampling=use_sampling,
    sampling_ratio=sampling_ratio,
    use_reference=use_reference,
    top_hvg_genes=top_hvg_genes,
    subset=subset,
    layer=layer,
    batch_key=batch_key,
    span=span,
)


## Function scIntegration

- Integration with either unsupervised ([scVI](https://www.nature.com/articles/s41592-018-0229-2)) or semi-supervised ([scANVI](https://www.embopress.org/doi/full/10.15252/msb.20209620)) model

    1. split adata into adata_train, adata_val, adata_test using train_ratio and test_ratio predefined;

    2. tune hyperparameters based on evidence lower bound (ELBO) value, and generate best hyperparameters;

    3. generate entire nuclei embedding (and cell type predictions if using semi-supervised model) using best hyperparameters.


In [None]:
# With tuning enabled each hyperparameter is forwarded as its full
# search-space dictionary; otherwise only its "default" entry is passed.
_search_spaces = {
    "n_latent": n_latent,
    "n_hidden": n_hidden,
    "n_layers": n_layers,
    "dispersion": dispersion,
    "lr": lr,
    "max_epochs": max_epochs,
}
_hyperparameters = {
    _name: (_space if enable_parameter_tuning else _space["default"])
    for _name, _space in _search_spaces.items()
}

adata = scIntegration(
    adata=adata,
    level=level,
    train_ratio=train_ratio,
    test_ratio=test_ratio,
    seed_value=SEED,
    shuffle=shuffle,
    adjust_cols=[batch_key],
    accelerator=accelerator,
    labels_key=labels_key,
    prediction_key=prediction_key,
    layer=layer,
    batch_key=batch_key,
    categorical_covariate_keys=categorical_covariate_keys,
    continuous_covariate_keys=continuous_covariate_keys,
    enable_parameter_tuning=enable_parameter_tuning,
    **_hyperparameters,
    scanvi_epochs=scanvi_epochs,
    dir_scvi=dir_scvi,
    dir_scanvi=dir_scanvi,
    use_unsupervised=use_unsupervised,
    nums_random_trials=nums_random_trials,
)


## __Function nuclei_clustering__
- Cluster all nuclei with learned embedding using either 'leiden' or 'louvain' clustering algorithm.

In [None]:
# Cluster with the configured algorithm and resolution; the same SEED is
# forwarded as in the other steps.
adata = nuclei_clustering(
    adata=adata,
    level=level,
    method=clustering_method,
    resolution=clustering_resolution,
    seed_value=SEED,
    use_unsupervised=use_unsupervised,
)

## __Function UMAP_visualization__
- Generate 2D UMAP embedding for visualization.

In [None]:
# Compute the 2D UMAP embedding and continue with the returned AnnData.
adata = UMAP_visualization(
    adata=adata,
    seed_value=SEED,
    level=level,
    use_unsupervised=use_unsupervised,
)

# Draw on an explicit Axes and keep scanpy from showing the plot itself.
# Otherwise scanpy plots on its own figure (ignoring the 6x6 size) and may
# show it before savefig runs, so the saved PNG can be blank or wrongly sized.
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(adata, ax=ax, show=False)
fig.savefig('./test1.png')

## __Function save_anndata__
- Load AnnData object before integration (dir_adata), should be with dimension n_cell X n_genes [e.g., all protein coding genes];<br>
- Add cell type prediction if any;
- Add cell embeddings from unsupervised model (e.g., scVI);
- Add umap embeddings;

In [None]:
# Save into the folder of the model that was actually used:
# scVI when unsupervised, scANVI otherwise.
if use_unsupervised:
    _dir_save = dir_scvi
else:
    _dir_save = dir_scanvi

save_anndata(
    dir_adata=dir_adata,
    adata=adata,
    level=level,
    prediction_key=prediction_key,
    use_unsupervised=use_unsupervised,
    dir_save=_dir_save,
)