### Load needed libraries

In [None]:
# Import the local pipeline module and reload it so that edits made to it on
# disk are picked up without restarting the kernel.
import importlib
import iterative_scANVI
importlib.reload(iterative_scANVI)
# NOTE: this star import rebinds the name `iterative_scANVI` from the module to
# whatever the module exports under that name (it is called as a function in
# the cells below). It is presumably also the source of the otherwise
# unimported names used throughout this notebook (os, sc, pd, np, re, ...)
# -- TODO confirm against the module and consider importing them explicitly.
from iterative_scANVI import *
%matplotlib inline

# Number of parallel jobs scanpy is allowed to use.
sc.settings.n_jobs = 32

# All "input"/"output" paths below are built relative to the directory the
# notebook was started from.
pwd = os.getcwd()

### Self projection with reference data

In [None]:
## Load the reference 10x snRNA-seq singleome dataset
dataset = "reference"
technology = "singleomeCR6"
region = "MTG"
date = "2022-04-08"

adata_ref = sc.read_h5ad(filename=os.path.join(pwd, "input", region + "_" + dataset + "_" + technology + "." + date + ".h5ad"))

# Run one self-projection per donor. For each donor, the "<level>_held" columns
# keep the real label for that donor's cells and are "Unknown" everywhere else.
# NOTE(review): as written, the single donor is the labelled side (passed as
# the second argument) and all other donors are passed first. If a
# leave-one-donor-out evaluation is intended the mask may need inverting --
# confirm against iterative_scANVI's signature.
for donor in adata_ref.obs["donor_name"].unique():
    ref = adata_ref.obs["donor_name"] == donor

    for level in ["class", "subclass", "cluster"]:
        held = level + "_held"
        adata_ref.obs[held] = "Unknown"
        # Copy from the label column for this level. The original indexed with
        # the donor name, which is not a column of .obs and raised a KeyError.
        adata_ref.obs.loc[ref, held] = adata_ref.obs.loc[ref, level]
        adata_ref.obs[held] = adata_ref.obs[held].astype("category")

    iterative_scANVI(
        adata_ref[~ref],
        adata_ref[ref],
        output_dir=os.path.join(pwd, "output", region + "_" + dataset + "_" + technology + "_" + donor),
        labels_keys=["class_held", "subclass_held", "cluster_held"],
        **{
            "categorical_covariate_keys": ["donor_name"],
            "continuous_covariate_keys": ["nFeature_RNA"]
          }
    )

In [None]:
# Define MTG supertypes based on results above

# Clusters whose cells are relabelled "Unknown" rather than kept as supertypes.
low_confidence = [
    "L2/3 IT_4", "L2/3 IT_9", "L2/3 IT_11",
    "L5 IT_4",
    "L5/6 NP_5",
    "Micro-PVM_3",
    "Pvalb_4", "Pvalb_11",
    "Sncg_7",
    "Sst_6", "Sst_8", "Sst_14", "Sst_15", "Sst_16", "Sst_17", "Sst_18", "Sst_21", "Sst_24", "Sst_26",
    "Vip_3", "Vip_7", "Vip_8", "Vip_10", "Vip_17", "Vip_20", "Vip_22",
]

# Start from the cluster labels as plain objects so that the new "Unknown"
# value can be written, then convert back to a categorical at the end.
adata_ref.obs["supertype"] = adata_ref.obs["cluster"].copy().astype("object")
is_low_confidence = adata_ref.obs["cluster"].isin(low_confidence)
adata_ref.obs.loc[is_low_confidence, "supertype"] = "Unknown"
adata_ref.obs["supertype"] = adata_ref.obs["supertype"].astype("category")

adata_ref.obs["Source"] = "10x"

In [None]:
# Re-run the model with only the selected supertypes

# Same per-donor self-projection as before, but using the "supertype" labels
# (low-confidence clusters collapsed to "Unknown") as the finest level.
for donor in adata_ref.obs["donor_name"].unique():
    ref = adata_ref.obs["donor_name"] == donor

    for level in ["class", "subclass", "supertype"]:
        held = level + "_held"
        adata_ref.obs[held] = "Unknown"
        # Copy from the label column for this level. The original indexed with
        # the donor name, which is not a column of .obs and raised a KeyError.
        adata_ref.obs.loc[ref, held] = adata_ref.obs.loc[ref, level]
        adata_ref.obs[held] = adata_ref.obs[held].astype("category")

    iterative_scANVI(
        adata_ref[~ref],
        adata_ref[ref],
        output_dir=os.path.join(pwd, "output", region + "_" + dataset + "_" + technology + "_supertype_" + donor),
        labels_keys=["class_held", "subclass_held", "supertype_held"],
        **{
            "categorical_covariate_keys": ["donor_name"],
            "continuous_covariate_keys": ["nFeature_RNA"]
          }
    )

### Self projection restricted to the MERFISH (smFISH) gene panel

In [None]:
## Load the list of MERFISH genes to evaluate supertype performance with this feature set
mFISH_genes = pd.read_csv(os.path.join(pwd, "input", "human_vizgen_genelist.csv"), index_col=0)

# Same per-donor self-projection as the supertype run, with the gene panel
# passed through "user_genes".
for donor in adata_ref.obs["donor_name"].unique():
    ref = adata_ref.obs["donor_name"] == donor

    for level in ["class", "subclass", "supertype"]:
        held = level + "_held"
        adata_ref.obs[held] = "Unknown"
        # Copy from the label column for this level. The original indexed with
        # the donor name, which is not a column of .obs and raised a KeyError.
        adata_ref.obs.loc[ref, held] = adata_ref.obs.loc[ref, level]
        adata_ref.obs[held] = adata_ref.obs[held].astype("category")

    iterative_scANVI(
        adata_ref[~ref],
        adata_ref[ref],
        output_dir=os.path.join(pwd, "output", region + "_" + dataset + "_" + technology + "_supertype_MERFISH_" + donor),
        labels_keys=["class_held", "subclass_held", "supertype_held"],
        **{
            "categorical_covariate_keys": ["donor_name"],
            # The comma after this entry was missing, which made the cell a SyntaxError.
            "continuous_covariate_keys": ["nFeature_RNA"],
            # Three copies of the panel; presumably one per entry in
            # labels_keys -- TODO confirm against iterative_scANVI.
            "user_genes": [mFISH_genes.index.to_list(), mFISH_genes.index.to_list(), mFISH_genes.index.to_list()]
          }
    )

### Run on AD singleome and multiome combined data

In [None]:
dataset = "AD"
region = "MTG"
date = "2022-04-08"

# Combined singleome + multiome query data for this region.
query_path = os.path.join(pwd, "input", f"{region}_{dataset}_combined.{date}.h5ad")
adata_query = sc.read_h5ad(filename=query_path)

# Map the query against the full reference at class, subclass and supertype
# level, using the same covariates as the self-projection runs.
iterative_scANVI(
    adata_query,
    adata_ref,
    output_dir=os.path.join(pwd, "output", f"{region}_{dataset}"),
    labels_keys=["class", "subclass", "supertype"],
    categorical_covariate_keys=["donor_name"],
    continuous_covariate_keys=["nFeature_RNA"],
)

### Save subclass AnnData objects for manual curation

In [None]:
# Write one AnnData object per subclass for manual curation.
# The original passed `adata`, which is not defined until a later cell, so this
# raised a NameError on a fresh kernel. The query object loaded above is the
# only query AnnData in scope at this point.
# NOTE(review): presumably save_anndata uses `date` to locate the
# iterative_scANVI results written to output_dir -- confirm.
save_anndata(
    adata=adata_query,
    adata_ref=adata_ref,
    split_key="subclass_scANVI",
    groupby="supertype",
    output_dir=os.path.join(pwd, "output", region + "_" + dataset),
    date="2022-03-24",
    model_args={
        "layer": "UMIs",
        "categorical_covariate_keys": ["donor_name"],
        "continuous_covariate_keys": ["nFeature_RNA"]
    },
    **{
        "n_cores": 32,
        "cluster_cells": True
    }
)

# The per-subclass AnnData files written above were then manually curated in cellxgene

### Leiden clustering refinement

In [None]:
region = "MTG"
dataset = "AD"

# Non-neuronal subclasses to refine, in processing order. Every entry is of
# type "glia"; Micro-PVM additionally carries an nFeature_RNA cutoff.
glia_subclasses = ["Astro", "Endo", "Micro-PVM", "Oligo", "OPC", "VLMC"]
groups = {name: {"type": "glia"} for name in glia_subclasses}
groups["Micro-PVM"]["cutoffs"] = {"nFeature_RNA": (1000, "gt")}

clean_taxonomies(
    groups=groups,
    splitby="supertype_scANVI",
    reference_key="reference_cell",
    object_dir=os.path.join(pwd, "output", region + "_" + dataset, "objects"),
    layer="UMIs",
    categorical_covariate_keys=["donor_name", "sex", "ch_race___1", "method"],
    continuous_covariate_keys=["nFeature_RNA", "age_at_death", "fraction_mito"],
    diagnostic_plots=["sex", "age_at_death", "donor_name", "ch_cognitivestatus_id", "braak", "adneurochange", "method", "ch_race___1", "doublet_score", "nFeature_RNA", "fraction_mito"],
    use_markers=True,
)

In [None]:
# The original set region = "DLPFC" here. The per-subclass objects are written
# to, and the corrections later read from, the MTG_AD output directory, so the
# neuronal pass must point at the same place.
region = "MTG"
dataset = "AD"

# Neuronal subclasses to process, in order. L5 ET additionally carries a
# doublet_score cutoff.
excitatory_subclasses = ["L2/3 IT", "L4 IT", "L5 ET", "L5 IT", "L5/6 NP", "L6 CT", "L6 IT", "L6 IT Car3", "L6b"]
inhibitory_subclasses = ["Lamp5", "Lamp5_Lhx6", "Pax6", "Pvalb", "Sncg", "Sst", "Sst Chodl", "Vip"]

groups = {name: {"type": "excitatory"} for name in excitatory_subclasses}
groups.update({name: {"type": "inhibitory"} for name in inhibitory_subclasses})
groups["L5 ET"]["cutoffs"] = {"doublet_score": (0.7, "lt")}

clean_taxonomies(
    groups=groups,
    splitby="supertype_scANVI",
    reference_key="reference_cell",
    object_dir=os.path.join(pwd, "output", region + "_" + dataset, "objects"),
    **{
        "layer": "UMIs",
        "categorical_covariate_keys": ["donor_name", "sex", "ch_race___1", "method"],
        "continuous_covariate_keys": ["nFeature_RNA", "age_at_death", "fraction_mito"],
        "diagnostic_plots": ["sex", "age_at_death", "donor_name", "ch_cognitivestatus_id", "braak", "adneurochange", "method", "ch_race___1", "doublet_score", "nFeature_RNA", "fraction_mito"],
        "use_markers": True,
        # Unlike the glia pass, supertype refinement is switched off here.
        "refine_supertypes": False
    }
)

### Expand the non-neuronal taxonomy and pull final QC together

In [None]:
region = "MTG"
dataset = "AD"
output_dir = os.path.join(pwd, "output", region + "_" + dataset)
corrected_label = "supertype_scANVI"
results_file = "iterative_scANVI_results.2022-03-24.csv"
split_key = "subclass_scANVI"

scANVI_results = pd.read_csv(os.path.join(output_dir, results_file), index_col=0)

# Directory names that correspond to a split value. Model directories elsewhere
# in this notebook replace "/" with " " in the subclass name, so both spellings
# are accepted.
split_values = scANVI_results[split_key].astype("category").cat.categories
valid_dirs = set(split_values) | {str(value).replace("/", " ") for value in split_values}

# Seed with an empty frame so that `corrected_label` stays the first column,
# then gather every per-subclass corrections table and concatenate once.
correction_frames = [pd.DataFrame(columns=[corrected_label])]
for i in os.listdir(os.path.join(output_dir, "objects")):
    if not os.path.isdir(os.path.join(output_dir, "objects", i)) or i.startswith("."):
        continue

    # The original test (`i in categories is False`) is a chained comparison
    # that is always False, so nothing was ever skipped and non-subclass
    # directories (e.g. "models") were read as if they held corrections.
    if i not in valid_dirs:
        continue

    print(str(datetime.now()) + " -- " + i)
    correction_frames.append(pd.read_csv(os.path.join(output_dir, "objects", i, "corrections.csv"), index_col=0))

if len(correction_frames) == 1:
    raise FileNotFoundError("No corrections.csv files found under " + os.path.join(output_dir, "objects"))

corrected = pd.concat(correction_frames)

# Cells without a Leiden-refined label fall back to their scANVI label.
leiden_label = corrected_label + "_leiden"
missing_leiden = corrected[leiden_label].isna()
corrected.loc[missing_leiden, leiden_label] = corrected.loc[missing_leiden, corrected_label]

# Swap the original label column for the corrected columns, aligned on cell index.
scANVI_results = scANVI_results.drop(corrected_label, axis=1)
scANVI_results = pd.concat([scANVI_results, corrected.loc[scANVI_results.index, :]], axis=1)
scANVI_results.to_csv(os.path.join(output_dir, "iterative_scANVI_results_refined." + str(datetime.date(datetime.now())) + ".csv"))

### Add in finalized scANVI results to the AnnData

In [None]:
dataset = "AD"
region = "MTG"

# NOTE(review): this file is read from the working directory, whereas the other
# inputs are read from the "input" folder -- confirm the location.
adata = sc.read_h5ad("MTG_combined.2022-04-08.h5ad")

output_dir = os.path.join(pwd, "output", region + "_" + dataset)
results_file = "iterative_scANVI_results_refined.2022-04-13.csv"

# Keep only result columns that are not already present in adata.obs.
scANVI_results = pd.read_csv(os.path.join(output_dir, results_file), index_col=0)
new_columns = np.setdiff1d(scANVI_results.columns, adata.obs.columns)
scANVI_results = scANVI_results.loc[:, new_columns]

# If the two tables disagree on the number of cells, restrict to the shared ones.
n_result_cells = scANVI_results.shape[0]
n_adata_cells = adata.shape[0]
if n_result_cells != n_adata_cells:
    common_cells = np.intersect1d(adata.obs_names, scANVI_results.index)
    adata = adata[common_cells].copy()
    print("WARNING: Mismatch between cells in scANVI results and merged AnnData object, using " + str(len(common_cells)) + " common cells. Was this expected?") 

aligned_results = scANVI_results.loc[adata.obs_names, :]
adata.obs = pd.concat([adata.obs, aligned_results], axis=1)

# Fold the "Micro-PVM_reassign" placeholder back into concrete labels.
reassigned = adata.obs["supertype_scANVI"] == "Micro-PVM_reassign"
adata.obs.loc[reassigned, "subclass_scANVI"] = "Micro-PVM"
adata.obs.loc[reassigned, "supertype_scANVI"] = "Micro-PVM_2"

reassigned_leiden = adata.obs["supertype_scANVI_leiden"] == "Micro-PVM_reassign"
adata.obs.loc[reassigned_leiden, "supertype_scANVI_leiden"] = "Micro-PVM_Unknown_200"

### Rename new populations and create colors

In [None]:
# Reference colour table: one colour per subclass and one per cluster.
color_table = pd.read_csv(os.path.join(pwd, "input", "Great_Ape_subclass_cluster_colors.csv"))
subclass_colors = color_table.loc[:,["subclass", "subclass_color"]].drop_duplicates()
subclass_colors.index = subclass_colors["subclass"].copy()

cluster_colors = color_table.loc[:,["cluster", "new_cluster_color"]].drop_duplicates()
cluster_colors.index = cluster_colors["cluster"].copy()

# Every Leiden-refined label that has no entry in the colour table gets a new
# name and a colour derived from a related reference colour.
unknowns = np.setdiff1d(adata.obs["supertype_scANVI_leiden"], cluster_colors["cluster"])
tag = "SEAAD"
added = {}  # base name -> number of new populations created from it so far
for i in unknowns:
    # Fraction of this population's cells carrying each scANVI supertype label.
    fractions = adata.obs.loc[:,["supertype_scANVI", "supertype_scANVI_leiden"]].groupby("supertype_scANVI_leiden").value_counts(sort=False)[i]
    fractions = fractions[fractions > 0] / fractions.sum()
    dominant = fractions > 0.85
    if dominant.any():
        # One supertype dominates: name the population after it and number
        # from 1 for each base.
        base = fractions[dominant].index[0]
        added[base] = added.get(base, 0) + 1
        number = added[base]
    else:
        # No dominant supertype: name it after the label with its
        # "_Unknown_<n>" suffix removed, and offset the number by the numeric
        # suffix of the last supertype in `fractions`.
        base = re.sub("_Unknown_[0-9]{1,3}", "", i)
        added[base] = added.get(base, 0) + 1
        number = added[base] + int(re.sub("^(.*)_([0-9]{1,2})$", "\\2", fractions.index[-1]))
    
    new_name = base + "_" + str(number) + "-" + tag
    
    # Bases without an underscore are looked up in the subclass colours, others
    # in the cluster colours; Micro-PVM is special-cased to a cluster colour.
    # NOTE(review): a subclass name that itself contains "_" takes the cluster
    # lookup -- confirm that such keys exist in the colour table.
    if "_" not in base:
        ref_color = subclass_colors.loc[base, "subclass_color"]
        if base == "Micro-PVM":
            ref_color = cluster_colors.loc["Micro-PVM_3", "new_cluster_color"]
    else:
        ref_color = cluster_colors.loc[base, "new_cluster_color"]
        
    # Shift saturation (bright colours) or value (dark colours) in HSV space,
    # by a larger step for each additional population sharing the base.
    # NOTE(review): the shifted component is not clipped to [0, 1].
    new_color = colors.rgb_to_hsv(colors.to_rgb(ref_color))
    
    if new_color[2] > 0.5:
        if new_color[1] > 0.5:
            new_color[1] = new_color[1] - 0.2 * added[base]
        else:
            new_color[1] = new_color[1] + 0.2 * added[base]
    else:
        new_color[2] = new_color[2] + 0.2 * added[base]
        
    new_color = colors.to_hex(colors.hsv_to_rgb(new_color))
    
    print("Converting " + i + " to " + new_name + " with color " + new_color)
    # Go through object dtype so that a label outside the current categories
    # can be written, then restore the categorical.
    adata.obs["supertype_scANVI_leiden"] = adata.obs["supertype_scANVI_leiden"].astype("object")
    adata.obs.loc[adata.obs["supertype_scANVI_leiden"] == i, "supertype_scANVI_leiden"] = new_name
    adata.obs["supertype_scANVI_leiden"] = adata.obs["supertype_scANVI_leiden"].astype("category")
    # A plain nested list replaces the deprecated np.matrix for the new row.
    new_row = pd.DataFrame([[new_name, new_color]], columns=["cluster", "new_cluster_color"], index=[new_name])
    cluster_colors = pd.concat([cluster_colors, new_row], axis=0)


# Collapse both tables to {name: colour} dictionaries for use as plotting palettes.
subclass_colors = subclass_colors.loc[:, "subclass_color"].to_dict()
cluster_colors = cluster_colors.loc[:, "new_cluster_color"].to_dict()

### Write out final AnnData object/CSVs that includes all nuclei

In [None]:
# Flag nuclei kept for analysis. A nucleus is dropped only when its refined
# label is not one of the "supertype" categories AND does not contain
# "_Unknown_". The original expression never closed its outer parenthesis
# (SyntaxError) and combined a Python list with a Series using `&`.
# NOTE(review): labels carrying the "-SEAAD" tag match neither condition and
# are therefore dropped here -- confirm that this is intended.
leiden_labels = adata.obs["supertype_scANVI_leiden"]
outside_reference = ~leiden_labels.isin(adata.obs["supertype"].cat.categories)
not_flagged_unknown = ~leiden_labels.str.contains("_Unknown_")
to_keep = ~(outside_reference & not_flagged_unknown)
adata.obs["for_analysis"] = to_keep
    
adata.write(os.path.join(pwd, "output", region + "_" + dataset, "raw." + str(datetime.date(datetime.now())) + ".h5ad"))

### Remove low quality cells, train model for final representation and write out

In [None]:
# Restrict to the nuclei flagged for analysis.
# NOTE: this rebinds `adata`, so the cell is not safe to run twice in a row.
adata = adata[to_keep].copy()

final_model_args = dict(n_layers=2, n_latent=20, dispersion="gene-label")
final_model_dir = os.path.join(pwd, "output", region + "_" + dataset, "final_model")

print("Setting up AnnData...")
scvi.model.SCVI.setup_anndata(
    adata,
    layer="UMIs",
    categorical_covariate_keys=["donor_name"],
    continuous_covariate_keys=["nFeature_RNA"],
    labels_key="supertype_scANVI_leiden"
)

# Reuse a previously saved model when one exists; otherwise train and save it.
if os.path.exists(final_model_dir):
    print("Loading model...")
    final_model = scvi.model.SCVI.load(final_model_dir, adata)
else:
    print("Creating model...")
    final_model = scvi.model.SCVI(adata, **final_model_args)
    print("Training model...")
    final_model.train(max_epochs=200)
    final_model.save(final_model_dir)

print("Calculcating latent representation and UMAP...")
with parallel_backend('threading', n_jobs=32):
    adata.obsm["X_scVI"] = final_model.get_latent_representation()
    sc.pp.neighbors(adata, use_rep="X_scVI")
    sc.tl.umap(adata)
    
# UMAPs coloured by subclass and by refined supertype, using the palettes
# built in the renaming cell.
plt.rcParams["figure.figsize"] = (10, 10)
sc.pl.umap(adata, color="subclass_scANVI", palette=subclass_colors, legend_loc="on data", frameon=False, size=3)
sc.pl.umap(adata, color="supertype_scANVI_leiden", palette=cluster_colors, legend_loc="on data", frameon=False, size=3, legend_fontoutline=3)

final_path = os.path.join(pwd, "output", region + "_" + dataset, "final." + str(datetime.date(datetime.now())) + ".h5ad")
adata.write(final_path)

### Write out Subclass specific latent spaces and UMAP coordinates

In [None]:
# For every subclass, load its saved scVI model and write out the latent
# representation, the UMAP coordinates and the matching cell names next to it.
for i in adata.obs["subclass_scANVI"].cat.categories:

    # The original referred to an undefined `split_value`; the loop variable is
    # the subclass name. "/" is replaced with " " to match the directory names.
    model_dir = os.path.join(pwd, "output", region + "_" + dataset, "objects", "models", i.replace("/", " "), "scVI_model")

    # Genes listed in the model's var_names.csv; the subset is restricted to them.
    markers = pd.read_csv(os.path.join(model_dir, "var_names.csv"), header=None)
    markers = markers[0].to_list()

    sub = adata[(adata.obs["subclass_scANVI"] == i), markers].copy()
    
    model = scvi.model.SCVI.load(model_dir, sub)
    
    with parallel_backend('threading', n_jobs=32):
        sub.obsm["X_scVI"] = model.get_latent_representation()
        sc.pp.neighbors(sub, use_rep="X_scVI")
        sc.tl.umap(sub)

    np.save(os.path.join(model_dir, "X_scVI.npy"), sub.obsm["X_scVI"])
    np.save(os.path.join(model_dir, "X_umap.npy"), sub.obsm["X_umap"])
    pd.DataFrame(sub.obs_names).to_csv(os.path.join(model_dir, "obs_names.csv"), index=False)

### Train a model on the MGE neighborhood

In [None]:
# Subset to the Sst and Pvalb subclasses.
in_neighborhood = adata.obs["subclass_scANVI"].isin(["Sst", "Pvalb"])
sub = adata[in_neighborhood].copy()

final_model_args = dict(n_layers=2, n_latent=20, dispersion="gene-label")
neighborhood_model_dir = os.path.join(pwd, "output", region + "_" + dataset, "neighborhood", "Sst_Pvalb", "final_model")

print("Setting up AnnData...")
scvi.model.SCVI.setup_anndata(
    sub,
    layer="UMIs",
    categorical_covariate_keys=["donor_name"],
    continuous_covariate_keys=["nFeature_RNA"],
    labels_key="supertype_scANVI_leiden"
)

# Reuse a previously saved model when one exists; otherwise train and save it.
if os.path.exists(neighborhood_model_dir):
    print("Loading model...")
    final_model = scvi.model.SCVI.load(neighborhood_model_dir, sub)
else:
    print("Creating model...")
    final_model = scvi.model.SCVI(sub, **final_model_args)
    print("Training model...")
    final_model.train(max_epochs=200)
    final_model.save(neighborhood_model_dir)

print("Calculcating latent representation and UMAP...")
with parallel_backend('threading', n_jobs=32):
    sub.obsm["X_scVI"] = final_model.get_latent_representation()
    sc.pp.neighbors(sub, use_rep="X_scVI")
    sc.tl.umap(sub)

# Store the embeddings and the matching cell names alongside the model.
np.save(os.path.join(neighborhood_model_dir, "X_scVI.npy"), sub.obsm["X_scVI"])
np.save(os.path.join(neighborhood_model_dir, "X_umap.npy"), sub.obsm["X_umap"])
pd.DataFrame(sub.obs_names).to_csv(os.path.join(neighborhood_model_dir, "obs_names.csv"), index=False)