In [None]:
import os
import sys
sys.path.insert(0, '../')
from utils.constants import GENE_SELECT_INTERSECTION, GENE_SELECT_UNION, GENE_SELECT_PROTEIN_CODING
from utils.utils import setup_logger, merge_datasets, load_protein_coding_genes, create_directory

# __Setup Variables & Constants__

⭐ <u>dir_datasets</u>: directory of all datasets;<br>
⭐ <u>dir_integration</u>: directory to save AnnData that represents merged datasets;<br>
⭐ <u>dir_data</u>: directory of auxiliary files;<br>

⭐ <u>ref_label</u>: column name in integrated AnnData that denotes cell type; <br>
⭐ <u>cell_type_key</u>: cell type key for each independent dataset;<br>
⭐ <u>dataset_key</u>: column name in integrated AnnData that denotes dataset;<br>
⭐ <u>sampleid_key</u>: column name in integrated AnnData that denotes sample ID;<br>
⭐ <u>batch_key</u>: column name in integrated AnnData that denotes batch ID;<br>
⭐ <u>kept_genes</u>: rules for gene selection when merging different datasets;<br>
⭐ <u>GENE_SELECT_PROTEIN_CODING</u>: when merging multiple datasets, keep all protein coding genes that are sequenced by at least one dataset;<br>
⭐ <u>GENE_SELECT_INTERSECTION</u>: when merging multiple datasets, keep only the common genes sequenced by all merged datasets;<br>
⭐ <u>GENE_SELECT_UNION</u>: when merging multiple datasets, keep all genes that are sequenced by at least one dataset;<br>


⭐ <u>add_meta</u>: boolean variable, whether or not to add meta information; <br>
⭐ <u>meta_cols</u>: types of meta information to be added;<br>


In [None]:
# --- Directories ---
dir_datasets = '../test_Data'      # each sub-directory here is treated as one dataset
dir_integration = '../test_Data'   # the merged AnnData file is written here
dir_data = '../data'               # auxiliary files (e.g. the protein coding gene list)

# --- Column names / keys passed to merge_datasets ---
ref_label = 'Super_Celltype'
cell_type_key = 'cell_type'
dataset_key = 'Dataset'
sampleid_key = 'SampleID'
batch_key = 'BatchID'

# --- Gene selection rule used when merging ---
kept_genes = GENE_SELECT_PROTEIN_CODING
# Fail fast if the rule is not one of the three supported constants.
assert kept_genes in (
    GENE_SELECT_PROTEIN_CODING,
    GENE_SELECT_INTERSECTION,
    GENE_SELECT_UNION,
)

# --- Optional meta information ---
add_meta = True
meta_cols = ['Sex', 'BrainRegion', 'Assay']


In [None]:
# Create the "TACA" logger used by the remaining cells; the log file name and
# the log directory are handed to the project's setup_logger helper.
logger = setup_logger(
    name='TACA',
    log_file='preintegration.log',
    log_dir='../output_test/',
)

In [None]:
# Treat every sub-directory of dir_datasets as one dataset (plain files are ignored).
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted to keep the dataset order identical across runs and machines.
all_ds = sorted(
    name
    for name in os.listdir(dir_datasets)
    if os.path.isdir(os.path.join(dir_datasets, name))
)
logger.info('There are {:,} datasets used for integration.'.format(len(all_ds)))

In [None]:
# Read the protein coding gene list from the auxiliary data directory and keep
# the loaded entries as a set.
protein_coding_path = os.path.join(dir_data, 'protein_coding_genes.tsv')
protein_coding_genes = set(load_protein_coding_genes(dir_file=protein_coding_path))
logger.info(
    'There are totally {:,} protein coding genes.'.format(len(protein_coding_genes))
)

## __Merge Datasets for All CellTypes__
- Import reference cell type annotations, if any;
- Import 'SampleID' (or defined by sampleid_key) from each dataset into AnnData object;
- Import 'BatchID' (or defined by batch_key) from each dataset into AnnData object;
- Import 'Dataset' (or defined by dataset_key) from each dataset into AnnData object;
- Import additional meta information (defined by meta_cols) from each dataset into AnnData object;
- Delete all pre-existing cell embeddings if any;
- Merge all datasets;
- Save the merged AnnData object.


In [None]:
# Merge all discovered datasets with the settings chosen in the configuration cell.
adata = merge_datasets(
    all_datasets=all_ds,
    dir_datasets=dir_datasets,
    ref_label=ref_label,
    cell_type_key=cell_type_key,
    dataset_key=dataset_key,
    sampleid_key=sampleid_key,
    batch_key=batch_key,
    kept_genes=kept_genes,
    protein_coding_genes=protein_coding_genes,
    add_meta=add_meta,
    meta_cols=meta_cols,
)

# create_directory is called on the output location before the merged object
# is written there as an .h5ad file.
create_directory(dir_integration)
merged_path = os.path.join(dir_integration, 'raw_count_before_integration.h5ad')
adata.write_h5ad(merged_path)