### Load needed libraries

In [None]:
import importlib
import iterative_scANVI
importlib.reload(iterative_scANVI)
from iterative_scANVI import *
%matplotlib inline

# Set the `n_jobs` setting on `sc` to 32.
# NOTE(review): `sc` is not imported explicitly in this notebook; it is presumably
# scanpy, brought into scope by `from iterative_scANVI import *` — confirm.
sc.settings.n_jobs = 32

# Working directory at the time this cell runs; the other cells build every
# input/output path relative to it.
pwd = os.getcwd()

### Self projection with reference data

In [None]:
## Reference MTG dataset
dataset, technology, region, date = "reference", "singleomeCR6", "MTG", "2022-04-08"

mtg_h5ad = os.path.join(pwd, "input", f"{region}_{dataset}_{technology}.{date}.h5ad")
adata_ref_mtg = sc.read_h5ad(filename=mtg_h5ad)

# MTG clusters that are not kept as supertypes; they are relabelled "Unknown" below.
low_confidence = [
    "L2/3 IT_4", "L2/3 IT_9", "L2/3 IT_11",
    "L5 IT_4",
    "L5/6 NP_5",
    "Micro-PVM_3",
    "Pvalb_4", "Pvalb_11",
    "Sncg_7",
    "Sst_6", "Sst_8", "Sst_14", "Sst_15", "Sst_16", "Sst_17", "Sst_18",
    "Sst_21", "Sst_24", "Sst_26",
    "Vip_3", "Vip_7", "Vip_8", "Vip_10", "Vip_17", "Vip_20", "Vip_22",
]

# "supertype" starts as the cluster label; go through object dtype so that
# "Unknown" can be written even if it is not an existing category.
mtg_supertype = adata_ref_mtg.obs["cluster"].astype("object")
mtg_supertype.loc[adata_ref_mtg.obs["cluster"].isin(low_confidence)] = "Unknown"
adata_ref_mtg.obs["supertype"] = mtg_supertype.astype("category")

## Reference A9 dataset
dataset, technology, region, date = "reference", "singleomeCR6", "A9", "2022-08-19"

a9_h5ad = os.path.join(pwd, "input", f"{region}_{dataset}_{technology}.{date}.h5ad")
adata_ref = sc.read_h5ad(filename=a9_h5ad)

# Every cell of this dataset starts out unlabelled at all three levels.
for level in ("class", "subclass", "supertype"):
    adata_ref.obs[level] = "Unknown"

# Stack the MTG reference underneath, keeping the original cell names.
adata_ref = adata_ref.concatenate(adata_ref_mtg, index_unique=None)
adata_ref.uns["Great Apes Metadata"] = adata_ref_mtg.uns["Great Apes Metadata"]
del adata_ref_mtg

## Fill in the "Unknown" labels from a saved iterative scANVI results table
output_dir = os.path.join(pwd, "output", f"{region}_{dataset}_{technology}_MTG_liftover")
results_file = "iterative_scANVI_results.2023-06-08.csv"
scANVI_results = pd.read_csv(os.path.join(output_dir, results_file), index_col=0)

# Rows whose input "class" was "Unknown" receive the corresponding *_scANVI value.
was_unknown = scANVI_results["class"] == "Unknown"
unknown_cells = scANVI_results[was_unknown].index
for level in ("class", "subclass", "supertype"):
    adata_ref.obs.loc[unknown_cells, level] = scANVI_results.loc[was_unknown, level + "_scANVI"].copy()

adata_ref.obs["Source"] = "Allen"

### Map external datasets to the reference

In [None]:
# Collect every external query file from the input directory.
external_datasets = np.array(
    glob.glob(os.path.join(pwd, "input", "*_external_singleomeCR6_078.2023-10-12.h5ad"))
)

# Order the files by the four-digit token captured from the file name
# (a name that does not match the pattern is used whole as its own key).
# NOTE(review): the dots in the pattern are unescaped, so each matches any character.
sort_keys = [
    re.sub(
        "^PFC_[A-Za-z]+_([0-9]{4})_external_singleomeCR6_078.2023-10-12.h5ad",
        "\\1",
        os.path.basename(path),
    )
    for path in external_datasets
]
external_datasets = external_datasets[np.argsort(sort_keys)]

for path in external_datasets:
    file_name = os.path.basename(path)
    print(file_name)

    adata_query = sc.read_h5ad(path)

    # Source label = file name without the "PFC_" token and the common suffix.
    Source = file_name.replace("PFC_", "").replace("_external_singleomeCR6_078.2023-10-12.h5ad", "")
    adata_query.obs["Source"] = Source

    # Map this query against the reference; results go to a per-source directory.
    iterative_scANVI(
        adata_query,
        adata_ref,
        output_dir=os.path.join(pwd, "external", "iterative_scANVI", Source),
        labels_keys=["class", "subclass", "supertype"],
        layer="UMIs",
        batch_key="Source",
        categorical_covariate_keys=["library_prep"],
        scVI_model_args={
            "n_layers": 2,
            "dispersion": "gene-batch",
        },
        scANVI_model_args={
            "n_layers": 2,
            "dispersion": "gene-batch",
        },
    )

### Save the AnnData for each of the external datasets for reference

In [None]:
for path in external_datasets:
    file_name = os.path.basename(path)
    print(file_name)

    adata_query = sc.read_h5ad(path)

    # NOTE(review): the reference's `var` table is replaced by a copy of this
    # query's `var` on every pass, so `adata_ref` is modified by this loop.
    adata_ref.var = adata_query.var.copy()
    adata_query.obs.index.name = "index_name"

    # Source label = file name without the "PFC_" token and the common suffix.
    Source = file_name.replace("PFC_", "").replace("_external_singleomeCR6_078.2023-10-12.h5ad", "")
    adata_query.obs["Source"] = Source

    # Built fresh for each dataset so every call receives its own dictionaries.
    model_args = {
        "layer": "UMIs",
        "batch_key": "Source",
        "categorical_covariate_keys": ["library_prep"],
        "continuous_covariate_keys": None,
        "scVI_model_args": {
            "n_layers": 2,
            "dispersion": "gene-batch",
        },
        "scANVI_model_args": {
            "n_layers": 2,
            "dispersion": "gene-batch",
        },
    }

    save_anndata(
        adata_query=adata_query,
        adata_ref=adata_ref,
        split_key=None,
        groupby="class",
        output_dir=os.path.join(pwd, "external", "iterative_scANVI", Source),
        date="2023-11-13",
        model_args=model_args,
        n_cores=32,
        cluster_cells=False,
    )