In [None]:
import glob
import os
import random
import re
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import rapids_singlecell as rsc
import scanpy as sc
import scvi
import seaborn as sns
import torch
from matplotlib import pyplot as plt
from scipy import stats as sp_stats

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Number of parallel jobs scanpy is allowed to use.
sc.settings.n_jobs = 32

# Working directory at kernel start; the "input", "output" and "external"
# paths used by later cells are joined onto this.
pwd = os.getcwd()

### Define highly informative features from MTG reference

In [None]:
## Load the reference MTG dataset
dataset = "reference"
technology = "singleomeCR6"
region = "MTG"
date = "2022-04-08"
adata_ref = sc.read_h5ad(filename=os.path.join(pwd, "input", region + "_" + dataset + "_" + technology + "." + date + ".h5ad"))

## Derive the informative gene list once and cache it as output/de_genes.csv
de_genes_path = os.path.join(pwd, "output", "de_genes.csv")

# Run the marker search only when the cached list is MISSING. The next cell reads
# this file unconditionally, so it has to be produced on a fresh run.
if not os.path.exists(de_genes_path):
    # Keep at most 100 randomly chosen cells per cluster so that every cluster
    # carries comparable weight in the rank tests.
    # NOTE(review): random.sample is unseeded here, so the selected cells (and
    # possibly the gene list) differ between runs; seed `random` if that matters.
    cells = []
    for i in adata_ref.obs["cluster"].cat.categories:
        tmp = adata_ref[adata_ref.obs["cluster"] == i].obs_names.to_list()
        if len(tmp) > 100:
            cells = cells + random.sample(tmp, k=100)
        else:
            cells = cells + tmp
    adata_ds = adata_ref[cells].copy()

    # Cluster-vs-rest markers computed separately inside each class.
    marker_genes_a = {}
    for i in adata_ref.obs["class"].cat.categories:
        try:
            adata_tmp = adata_ds[adata_ds.obs["class"] == i].copy()
            sc.tl.rank_genes_groups(adata_tmp,
                                    groupby="cluster",
                                    method="wilcoxon",
                                    tie_correct=True)

            result = adata_tmp.uns['rank_genes_groups']
            groups = result['names'].dtype.names
            tmp = {group: pd.DataFrame({key: result[key][group] for key in ['names', 'pvals_adj', 'logfoldchanges']}) for group in groups}

            # Accumulate into marker_genes_a itself; the per-class tables were
            # previously written to a different name and then lost.
            marker_genes_a.update(tmp)
        except ZeroDivisionError:
            # Best effort: a class for which the test raises ZeroDivisionError
            # contributes no markers.
            pass

    # Cluster-vs-rest markers computed separately inside each subclass.
    marker_genes_b = {}
    for i in adata_ref.obs["subclass"].cat.categories:
        try:
            adata_tmp = adata_ds[adata_ds.obs["subclass"] == i].copy()
            sc.tl.rank_genes_groups(adata_tmp,
                                    groupby="cluster",
                                    method="wilcoxon",
                                    tie_correct=True)

            result = adata_tmp.uns['rank_genes_groups']
            groups = result['names'].dtype.names
            tmp = {group: pd.DataFrame({key: result[key][group] for key in ['names', 'pvals_adj', 'logfoldchanges']}) for group in groups}

            marker_genes_b.update(tmp)
        except ZeroDivisionError:
            # Best effort, as above.
            pass

    # Top 10 ranked genes per cluster from the within-class comparison ...
    de_genes_a = []
    for i,j in marker_genes_a.items():
        de_genes_a.extend(j.iloc[:10, :].loc[:, "names"].to_list())
    de_genes_a = np.unique(de_genes_a)

    # ... and top 5 per cluster from the within-subclass comparison.
    de_genes_b = []
    for i,j in marker_genes_b.items():
        de_genes_b.extend(j.iloc[:5, :].loc[:, "names"].to_list())
    de_genes_b = np.unique(de_genes_b)

    # The final feature set is the union of both lists, one gene per row, no header.
    de_genes = np.union1d(de_genes_a, de_genes_b)
    os.makedirs(os.path.dirname(de_genes_path), exist_ok=True)
    pd.DataFrame(index=de_genes).to_csv(de_genes_path, header=None)

### Build combined dataset on a subset of highly informative features

In [None]:
# Collect every external study plus the SEA-AD A9 dataset, restricted to the
# informative genes cached in output/de_genes.csv.
external_adata = []
external_datasets = glob.glob(os.path.join(pwd, "input", "*_external_singleomeCR6.2023-10-12.h5ad"))
external_datasets.extend([os.path.join(pwd, "input", "SEAAD_A9_RNAseq_final-nuclei.2023-07-19.h5ad")])

de_genes = pd.read_csv(os.path.join(pwd, "output", "de_genes.csv"), header=None)
de_genes = de_genes[0].to_list()

for k in external_datasets:
    file_name = os.path.basename(k)
    print(file_name)
    tmp = sc.read_h5ad(k)
    tmp = tmp[:, de_genes].copy()
    # Keep only raw UMI counts, stored in X.
    tmp.X = tmp.layers["UMIs"].copy()
    del tmp.layers["UMIs"]
    # Decide on the file name only, so a working directory whose path happens to
    # contain "external" cannot mislabel the Allen dataset.
    if "external" in file_name:
        tmp.obs["Source"] = file_name.replace("PFC_", "").replace("_external_singleomeCR6.2023-10-12.h5ad", "")
    else:
        tmp.obs["Source"] = "Allen"

    external_adata.append(tmp)

first = external_adata.pop()
external_adata = first.concatenate(external_adata, index_unique=None)

# Remove NaNs from obs: drop columns that are entirely empty, otherwise fill
# with 0.0 (float columns) or "" (everything else).
for i in external_adata.obs.columns[external_adata.obs.isna().sum(axis=0) > 0]:
    if not external_adata.obs[i].notna().any():
        print("Dropping no-value column " + i)
        external_adata.obs.drop([i], axis=1, inplace=True)

    else:
        replace_with = ""

        # .iloc[0] takes the first non-null value by position; plain [0] on a
        # label-indexed Series relies on a deprecated positional fallback.
        first_value = external_adata.obs.loc[external_adata.obs[i].notna(), i].iloc[0]
        if isinstance(first_value, (np.float64, np.float32)):
            replace_with = 0.0

        # Categoricals cannot take a fill value outside their categories.
        if isinstance(external_adata.obs[i].dtype, pd.CategoricalDtype):
            external_adata.obs[i] = external_adata.obs[i].astype("object")

        print("Replacing NaNs with " + str(replace_with) + " for " + i + " with dtype " + str(type(external_adata.obs.loc[external_adata.obs[i].notna(), i].iloc[0])))
        external_adata.obs.loc[external_adata.obs[i].isna(), i] = replace_with

        # Columns that hold Python bools next to the "" filler are stored as strings.
        remaining = external_adata.obs.loc[(external_adata.obs[i].notna()) & (external_adata.obs[i] != ""), i]
        if len(remaining) > 0 and isinstance(remaining.iloc[0], bool):
            external_adata.obs[i] = external_adata.obs[i].astype("str")

# Force string dtype on these columns and on any column containing "Reference".
external_adata.obs["Donor ID"] = external_adata.obs["Donor ID"].astype("str")
external_adata.obs["Age at Death"] = external_adata.obs["Age at Death"].astype("str")
for i in external_adata.obs.columns[(external_adata.obs == "Reference").sum(axis=0) > 0]:
    external_adata.obs[i] = external_adata.obs[i].astype("str")

external_adata.write(os.path.join(pwd, "output", "PFC_external_SEAAD_singleomeCR6_UMIs_only_de_genes.2023-10-12.h5ad"))

### Load the dataset and map the original author study names

In [None]:
# Reload the combined dataset written by the previous cell.
combined_path = os.path.join(pwd, "output", "PFC_external_SEAAD_singleomeCR6_UMIs_only_de_genes.2023-10-12.h5ad")
external_adata = sc.read_h5ad(combined_path)

# Both ID columns start as a copy of the current cell names; the per-study
# sections below rewrite them.
for id_column in ("Original Cell ID", "Original Cell ID Alternative"):
    external_adata.obs[id_column] = external_adata.obs_names.copy()

### Mathys_2023
# ROSMAP_clinical.csv - https://www.synapse.org/#!Synapse:syn3191087
# projid_and_subject_to_cellid_ending.tsv - Personal communication, Tain Luquez
# Mathys_2023_FASTQ_batch_crosswalk.csv - Personal communication, Sudhagar Babu

# Original study name format: [Batch]_[Barcode]-[Batch Number]
# Original study name format (R): [Barcode]-[Batch Number]-[Dataset Number]
# Example: SM_Last16_AAACCCATCCGTAGTA-1
# Example (R): AAACCCATCCGTAGTA-1-1

# Clinical table indexed by its first column; only "individualID" is used below.
ROSMAP_metadata = pd.read_csv(os.path.join(pwd, "external", "Mathys_2023", "ROSMAP_clinical.csv"), index_col=0)

# Crosswalk indexed by its second column. individualID is attached by index
# (left join), "cellid_ending" is split on "_" into num_1 / num_2, and rows
# whose num_2 equals "2" are dropped.
tain_batch = pd.read_csv(os.path.join(pwd, "external", "Mathys_2023", "projid_and_subject_to_cellid_ending.tsv"), sep="\t", index_col=1)
tain_batch = tain_batch.merge(ROSMAP_metadata.loc[:, ["individualID"]], how="left", left_index=True, right_index=True)
tain_batch["num_1"] = [i.split("_")[0] for i in tain_batch["cellid_ending"]]
tain_batch["num_2"] = [i.split("_")[1] for i in tain_batch["cellid_ending"]]
tain_batch = tain_batch.loc[tain_batch["num_2"] != "2", :].copy()

# FASTQ/batch crosswalk: prefix batches with "SM_", rename "SM_Last_16" to
# "SM_Last16", then left-join the num_1 / num_2 suffixes on
# (individualID, batch_annotation == num_2). The result is indexed by library_id.
author_batch = pd.read_csv(os.path.join(pwd, "external", "Mathys_2023", "Mathys_2023_FASTQ_batch_crosswalk.csv"), index_col=0)
author_batch["batch"] = "SM_" + author_batch["batch"]
author_batch["batch"] = author_batch["batch"].replace(
    {"SM_Last_16": "SM_Last16"},
)
author_batch["batch_annotation"] = author_batch["batch_annotation"].astype("str")
author_batch = author_batch.loc[:, ["individualID", "library_id", "library_prep", "batch", "batch_annotation"]].drop_duplicates().merge(tain_batch.loc[:, ["individualID", "subject", "num_1", "num_2"]], left_on=["individualID", "batch_annotation"], right_on=["individualID", "num_2"], how="left")
author_batch = author_batch.drop("batch_annotation", axis=1)
author_batch.index = author_batch["library_id"].copy()

# Rewrite "<barcode>-<rest>" as "<rest>_<barcode>-", substitute the author batch
# name for the library key, then append num_1.
# NOTE(review): the dict keys are library_id values but are compared against
# obs["library_prep"] -- confirm both use the same identifiers.
# NOTE(review): str.replace is called without regex=...; whether the key is
# treated as a regular expression depends on the pandas version.
external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2023", "Original Cell ID"] = [re.sub("^([ATGC]+)-(.*)$", "\\2_\\1-", i) for i in external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2023", "Original Cell ID"]]
for i,j in author_batch["batch"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"].str.replace(i, j)
for i,j in author_batch["num_1"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] + j

# Alternative (R-style) ID: keep only the barcode, then append "-num_1" and "-num_2".
external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2023", "Original Cell ID Alternative"] = [re.sub("^([ATGC]+)-(.*)$", "\\1", i) for i in external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2023", "Original Cell ID Alternative"]]
for i,j in author_batch["num_1"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID Alternative"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID Alternative"] + "-" + j
for i,j in author_batch["num_2"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID Alternative"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID Alternative"] + "-" + j
    
### Green_2023
# cell-annotation.csv - https://www.synapse.org/#!Synapse:syn51218314
# ROSMAP_snRNAseq_demultiplexed_ID_mapping.csv - https://www.synapse.org/#!Synapse:syn34572333
# Original study name format: [Batch ID]_[Barcode]-1
# Example: 190403-B4-A_AAACCCATCCGTAGTA-1

# Keep annotation rows whose index starts with "MAP", strip the "_<barcode>-1"
# part of the index to get the batch prefix, and build the matching
# "B6-<individualID>-alone_Merged" key for those rows.
author_annotation = pd.read_csv(os.path.join(pwd, "external", "Green_2023", "cell-annotation.csv"), index_col=0)
author_annotation = author_annotation.loc[author_annotation.index.str.startswith("MAP"), :]
author_annotation["libraryBatch_fix"] = [re.sub("_[ATGC]+-1", "", i) for i in author_annotation.index]
author_annotation = author_annotation.loc[:, ["individualID", "libraryBatch_fix"]].drop_duplicates()
author_annotation["new_batch"] = "B6-" + author_annotation["individualID"] + "-alone_Merged"

# Build the "new_batch" key from libraryBatch by dropping its leading
# "<digits>-" and appending "_Merged"; "B6_Merged" is renamed to "B6-A_Merged".
author_batch = pd.read_csv(os.path.join(pwd, "external", "Green_2023", "ROSMAP_snRNAseq_demultiplexed_ID_mapping.csv"))
author_batch = author_batch.loc[:, ["libraryBatch", "individualID"]].drop_duplicates()
author_batch["new_batch"] = [re.sub("^[0-9]+-", "", i) + "_Merged" for i in author_batch["libraryBatch"]]
author_batch["new_batch"] = author_batch["new_batch"].replace(
    {
        "B6_Merged": "B6-A_Merged"
    }
)
author_batch = author_batch.reset_index(drop=True)

# Where the annotation supplies a corrected batch prefix for a key, it
# overrides libraryBatch. The table ends up indexed by new_batch.
author_batch = author_batch.loc[:, ["libraryBatch", "new_batch"]].drop_duplicates().merge(author_annotation.loc[:, ["new_batch", "libraryBatch_fix"]], left_on="new_batch", right_on="new_batch", how="left")
author_batch.loc[~author_batch["libraryBatch_fix"].isna(), "libraryBatch"] = author_batch.loc[~author_batch["libraryBatch_fix"].isna(), "libraryBatch_fix"]
author_batch = author_batch.drop("libraryBatch_fix", axis=1)
author_batch.index = author_batch["new_batch"].copy()

# Rewrite "<barcode>-<rest>" as "<rest>_<barcode>-1", then swap the new_batch
# key for the author's libraryBatch in cells whose library_prep equals the key.
external_adata.obs.loc[external_adata.obs["Source"] == "Green_2023", "Original Cell ID"] = [re.sub("^([ATGC]+)-(.*)$", "\\2_\\1-1", i) for i in external_adata.obs.loc[external_adata.obs["Source"] == "Green_2023", "Original Cell ID"]]
for i,j in author_batch["libraryBatch"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"].str.replace(i, j)

# This study has no alternative ID format.
external_adata.obs.loc[external_adata.obs["Source"] == "Green_2023", "Original Cell ID Alternative"] = ""

### Yang_2022
# Original study name format: [Barcode]-[Donor ID]
# Example: AAACCCATCCGTAGTA-AD 4

# One row per (library_prep, Donor ID) pair seen in this study, indexed by library_prep.
author_batch = external_adata.obs.loc[external_adata.obs["Source"] == "Yang_2022", ["library_prep", "Donor ID"]].drop_duplicates().reset_index(drop=True)
author_batch.index = author_batch["library_prep"].copy()

# Replace the library_prep text inside each cell ID with the donor ID.
for i,j in author_batch["Donor ID"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"].str.replace(i, j)

# This study has no alternative ID format.
external_adata.obs.loc[external_adata.obs["Source"] == "Yang_2022", "Original Cell ID Alternative"] = ""

### Cain_2022
# Original study name format: [Batch ID]_[Sex]_[Barcode]
# Example: MFC_B2_16_Cdx4_pAD1_M_GAACATCCACAGACTT
# Build the author batch name from library_prep ("Cog" -> "Cdx", "Path" -> "pAD",
# "-" -> "_") and append "_F" / "_M" taken from the Sex column.
author_batch = external_adata.obs.loc[external_adata.obs["Source"] == "Cain_2022", ["library_prep", "Sex"]].drop_duplicates().reset_index(drop=True)
author_batch["new_batch"] = author_batch["library_prep"].str.replace("Cog", "Cdx").str.replace("Path", "pAD").str.replace("-", "_")
author_batch["Sex"] = author_batch["Sex"].replace(
    {
        "Female": "F",
        "Male": "M"
    }
)
author_batch["new_batch"] = author_batch["new_batch"] + "_" + author_batch["Sex"].astype("str")
author_batch.index = author_batch["library_prep"].copy()

# Rewrite "<barcode>-<rest>" as "<rest>_<barcode>" ...
external_adata.obs.loc[external_adata.obs["Source"] == "Cain_2022", "Original Cell ID"] = [re.sub("^([ATGC]+)-(.*)$", "\\2_\\1", i) for i in external_adata.obs.loc[external_adata.obs["Source"] == "Cain_2022", "Original Cell ID"]]

# ... then replace the library_prep text with the author batch name.
for i,j in author_batch["new_batch"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"].str.replace(i, j)

# This study has no alternative ID format.
external_adata.obs.loc[external_adata.obs["Source"] == "Cain_2022", "Original Cell ID Alternative"] = ""

### Morabito_2021
# donor_to_cellid_ending.tsv - Personal communication, Tain Luquez
# Original study name format: [Barcode]-[Batch Number]
# Example: AAACGCTAGGGTACGT-5

# Crosswalk from donor to the author's cell ID ending, indexed by donor.
author_batch = pd.read_csv(os.path.join(pwd, "external", "Morabito_2021", "donor_to_cellid_ending.tsv"), sep="\t")
author_batch.index = author_batch["donor"].copy()

# Replace the library_prep text in each cell ID with the (stringified) ending.
# NOTE(review): the keys are "donor" values compared against obs["library_prep"]
# -- confirm these coincide for this study.
for i,j in author_batch["cellid_ending"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"].str.replace(i, str(j))

# This study has no alternative ID format.
external_adata.obs.loc[external_adata.obs["Source"] == "Morabito_2021", "Original Cell ID Alternative"] = ""

### Leng_2021
# Original study name format: [Donor ID]_[Barcode]
# Example: SFG1_AAACGCTAGGGTACGT

# One row per (library_prep, Donor ID) pair, with the donor ID prefixed by
# "SFG" and the table indexed by library_prep.
author_batch = external_adata.obs.loc[external_adata.obs["Source"] == "Leng_2021", ["library_prep", "Donor ID"]].drop_duplicates().reset_index(drop=True)
author_batch["Donor ID"] = "SFG" + author_batch["Donor ID"].astype("str")
author_batch.index = author_batch["library_prep"].copy()

# Rewrite "<barcode>-<rest>" as "<rest>_<barcode>" ...
external_adata.obs.loc[external_adata.obs["Source"] == "Leng_2021", "Original Cell ID"] = [re.sub("^([ATGC]+)-(.*)$", "\\2_\\1", i) for i in external_adata.obs.loc[external_adata.obs["Source"] == "Leng_2021", "Original Cell ID"]]

# ... then replace the library_prep text with the "SFG<donor>" label.
for i,j in author_batch["Donor ID"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"].str.replace(i, j)

# This study has no alternative ID format.
external_adata.obs.loc[external_adata.obs["Source"] == "Leng_2021", "Original Cell ID Alternative"] = ""

### Lau_2020
# Original study name format: [Barcode]-1_[Donor ID]
# Example: AAACGCTAGCGATGGT-1_AD1

# "Original Cell ID" is left as the current cell name; only the alternative ID is blanked.
external_adata.obs.loc[external_adata.obs["Source"] == "Lau_2020", "Original Cell ID Alternative"] = ""

### Zhou_2020
# Original study name format: [Barcode]-1_[Donor ID]
# Example: AAACCTGGTCCGTGAC-1_AD11

# Collapse "TWCC-<a>-<b>-lib1" in the cell name to "1_<b>" (the second captured field).
external_adata.obs.loc[external_adata.obs["Source"] == "Zhou_2020", "Original Cell ID"] = [re.sub("TWCC-([^-]+)-([^-]+)-lib1", "1_\\2", i) for i in external_adata.obs.loc[external_adata.obs["Source"] == "Zhou_2020", "Original Cell ID"]]
external_adata.obs.loc[external_adata.obs["Source"] == "Zhou_2020", "Original Cell ID Alternative"] = ""

### Olah_2020
# Original study name format: [Barcode]-[Donor ID]
# Example: AAACCTGAGCTAGTTC-Microglia_MO_AD8

# "Original Cell ID" is left as the current cell name; only the alternative ID is blanked.
external_adata.obs.loc[external_adata.obs["Source"] == "Olah_2020", "Original Cell ID Alternative"] = ""

### Mathys_2019
# projid_to_cellid_ending.tsv - Personal communication, Tain Luquez
# ROSMAP_clinical.csv - https://www.synapse.org/#!Synapse:syn3191087
# Original study name format: [Barcode].[Batch Number]
# Example: AAAGATGAGCCAGAAC.41

# Clinical table indexed by its first column; only "individualID" is used below.
ROSMAP_metadata = pd.read_csv(os.path.join(pwd, "external", "Mathys_2019", "ROSMAP_clinical.csv"), index_col=0)

# Crosswalk indexed by its second column, with individualID attached by index (left join).
tain_batch = pd.read_csv(os.path.join(pwd, "external", "Mathys_2019", "projid_to_cellid_ending.tsv"), sep="\t", index_col=1)
tain_batch = tain_batch.merge(ROSMAP_metadata.loc[:, ["individualID"]], how="left", left_index=True, right_index=True)

# One row per (library_prep, Donor ID) pair in this study, joined to the
# crosswalk on Donor ID == individualID and indexed by library_prep.
author_batch = external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2019", ["library_prep", "Donor ID"]].drop_duplicates().reset_index(drop=True)
author_batch = author_batch.merge(tain_batch, left_on="Donor ID", right_on="individualID", how="left")
author_batch.index = author_batch["library_prep"].copy()

# Rewrite "<barcode>-<rest>" as "<barcode>.<rest>" ...
external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2019", "Original Cell ID"] = [re.sub("^([ATGC]+)-(.*)$", "\\1.\\2", i) for i in external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2019", "Original Cell ID"]]

# ... then replace the library_prep text with the (stringified) author ending.
# NOTE(review): donors without a crosswalk match get NaN from the left merge,
# which str() turns into "nan" -- confirm every donor is matched.
for i,j in author_batch["cellid_ending"].to_dict().items():
    external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"] = external_adata.obs.loc[external_adata.obs["library_prep"] == i, "Original Cell ID"].str.replace(i, str(j))

# This study has no alternative ID format.
external_adata.obs.loc[external_adata.obs["Source"] == "Mathys_2019", "Original Cell ID Alternative"] = ""


### Train the model to integrate it

In [None]:
# scVI hyper-parameters for the model fitted on all cells.
model_args = dict(n_layers=2, n_latent=20, dispersion="gene-batch")

# Counts are read from X (layer=None); "Source" is the batch and
# "library_prep" an additional categorical covariate.
scvi.model.SCVI.setup_anndata(
    external_adata,
    layer=None,
    batch_key="Source",
    categorical_covariate_keys=["library_prep"],
)

# Reuse a saved model when one exists at this location; otherwise train and save it.
full_model_dir = os.path.join(pwd, "output", "External_Data_Models", "Full")
if os.path.exists(full_model_dir):
    model = scvi.model.SCVI.load(full_model_dir, external_adata)
else:
    model = scvi.model.SCVI(external_adata, **model_args)
    model.train(max_epochs=500, early_stopping=True, use_gpu=True)
    model.save(full_model_dir)

### Import results from iterative scANVI

In [None]:
# One results CSV per sub-directory of external/iterative_scANVI.
results = glob.glob(os.path.join(pwd, "external", "iterative_scANVI", "*", "iterative_scANVI_results.2023-11-06.csv"))

# Supertype labels that have to exist as categories before they can be assigned.
extra_supertypes = ["L5 ET_1", "Endo_1", "Endo_3"]

for i in results:
    result = pd.read_csv(i, index_col=0)
    # Only cells whose original class was "Unknown" take the scANVI calls.
    result = result.loc[result["class"] == "Unknown", :].copy()
    external_adata.obs.loc[result.index, "Class"] = result.loc[:, "class_scANVI"].replace(
        {
            "exc": "Neuronal: Glutamatergic",
            "inh": "Neuronal: GABAergic",
            "glia": "Non-neuronal and Non-neural"
        }
    )
    external_adata.obs.loc[result.index, "Class confidence"] = result.loc[:, "class_conf_scANVI"]
    external_adata.obs.loc[result.index, "Subclass"] = result.loc[:, "subclass_scANVI"].replace(
        {
            "Lamp5_Lhx6": "Lamp5 Lhx6",
            "Astro": "Astrocyte",
            "Oligo": "Oligodendrocyte",
            "Endo": "Endothelial",
            "Micro-PVM": "Microglia-PVM"
        }
    )
    external_adata.obs.loc[result.index, "Subclass confidence"] = result.loc[:, "subclass_conf_scANVI"]

    # Register only the labels that are still missing. add_categories raises if
    # ANY requested label already exists, so passing the full list could leave
    # the others unregistered. A missing or non-categorical column needs no
    # registration and is left alone.
    if "Supertype (non-expanded)" in external_adata.obs.columns:
        supertype_column = external_adata.obs["Supertype (non-expanded)"]
        if isinstance(supertype_column.dtype, pd.CategoricalDtype):
            missing_supertypes = [label for label in extra_supertypes if label not in supertype_column.cat.categories]
            if missing_supertypes:
                external_adata.obs["Supertype (non-expanded)"] = supertype_column.cat.add_categories(missing_supertypes)

    external_adata.obs.loc[result.index, "Supertype (non-expanded)"] = result.loc[:, "supertype_scANVI"]
    external_adata.obs.loc[result.index, "Supertype confidence"] = result.loc[:, "supertype_conf_scANVI"]

### Compute a low dimensional representation and cluster the cells

In [None]:
# Latent space from the trained scVI model, then neighbors, UMAP and Leiden
# clustering on that latent space; each step logs a timestamp.
external_adata.obsm["X_scVI"] = model.get_latent_representation()
print(f"{datetime.now()} -- Computed latent representation")

rsc.pp.neighbors(external_adata, use_rep="X_scVI")
print(f"{datetime.now()} -- Computed neighbors")

rsc.tl.umap(external_adata, min_dist=0.3)
print(f"{datetime.now()} -- Computed UMAP")

rsc.tl.leiden(external_adata, resolution=5)
print(f"{datetime.now()} -- Computed clustering")

# fraction=1 keeps every cell.
sc.pp.subsample(external_adata, fraction=1)

### Perform QC on the clusters 

In [None]:
# Fraction of each Class label within every Leiden cluster.
class_fractions = external_adata.obs.loc[:, ["leiden", "Class"]].groupby("leiden").value_counts(sort=False, normalize=True).reset_index()
non_neuronal_rows = class_fractions.loc[class_fractions["Class"] == "Non-neuronal and Non-neural"]
non_neuronal_rows.columns = ["leiden", "Class", "Fraction Non-neuronal"]

# A cluster counts as neuronal when less than 30% of its cells are non-neuronal.
neuronal_clusters = non_neuronal_rows.loc[non_neuronal_rows["Fraction Non-neuronal"] < 0.3, "leiden"].to_list()

external_adata.obs["Neuronal Cluster"] = external_adata.obs["leiden"].isin(neuronal_clusters).astype("category")

# Per-metric cut-offs, applied to cluster means. "gt" flags clusters whose mean
# is above the cut-off, "lt" flags those below it.
qc_glia_thresh = [0.20, 1000, 0.0325]
qc_neuron_thresh = [0.20, 3000, 0.0325]
qc_vars = ["Doublet score", "Genes detected", "Fraction mitochondrial UMIs"]
qc_dir = ["gt", "lt", "gt"]
for l,k in enumerate(qc_vars):
    plt.rcParams["figure.figsize"] = (8,8)
    flag_column = "cluster_" + k + "_flag"

    cluster_means = external_adata.obs.loc[:, ["leiden", k]].groupby("leiden").mean().loc[:, k].to_dict()
    groups = []
    for cluster, mean_value in cluster_means.items():
        # Neuronal and non-neuronal clusters use their own cut-off.
        threshold = qc_neuron_thresh[l] if cluster in neuronal_clusters else qc_glia_thresh[l]
        if qc_dir[l] == "gt":
            flagged = mean_value > threshold
        else:
            flagged = mean_value < threshold
        if flagged:
            groups.append(cluster)
    print(groups)

    # Store the flag as the strings "True" / "False" so it can be used as a palette key.
    external_adata.obs[flag_column] = "False"
    external_adata.obs.loc[external_adata.obs["leiden"].isin(groups), flag_column] = "True"

    # Three views per metric: flagged cluster labels, the flag, and the raw metric.
    sc.pl.umap(
        external_adata,
        color=["leiden"],
        legend_loc="on data",
        frameon=False,
        groups=groups,
        na_in_legend=False,
        sort_order=False
    )
    sc.pl.umap(
        external_adata,
        color=flag_column,
        frameon=False,
        palette={"True": "red", "False": "lightgrey"},
        sort_order=False
    )
    sc.pl.umap(
        external_adata,
        color=k,
        frameon=False,
        cmap="YlGnBu",
        sort_order=False
    )
    

### Remove low quality cells and re-compute low dimensional representation

In [None]:
# Keep only cells for which none of the "*_flag" columns equals "True".
is_flag_column = external_adata.obs.columns.str.endswith("_flag")
flags_raised = (external_adata.obs.loc[:, is_flag_column] == "True").sum(axis=1)
external_adata = external_adata[flags_raised == 0, :].copy()

# Recompute the graph, embedding and clustering on the retained cells.
rsc.pp.neighbors(external_adata, use_rep="X_scVI")
print(f"{datetime.now()} -- Computed neighbors")

rsc.tl.umap(external_adata, min_dist=0.3)
print(f"{datetime.now()} -- Computed UMAP")

rsc.tl.leiden(external_adata, resolution=1.5)
print(f"{datetime.now()} -- Computed clustering")

### Associate Subclass and Supertype colors with the AnnData

In [None]:
# Label -> colour lookups for subclasses and supertypes, taken from the same CSV.
color_table = pd.read_csv(os.path.join("input", "cluster_order_and_colors.csv"))

subclass_colors = color_table.set_index("subclass_label")["subclass_color"].to_dict()
supertype_colors = color_table.set_index("cluster_label")["cluster_color"].to_dict()

sc.pl.umap(external_adata, color=["Subclass"], legend_loc="on data", palette=subclass_colors)
sc.pl.umap(external_adata, color=["Supertype"], legend_loc="on data", palette=supertype_colors)

### Subset by neighborhood for additional QC

### Save final mapping results and QC for neuronal subclasses

In [None]:
# Cells split by whether their Class is the non-neuronal label.
not_non_neuronal = external_adata.obs["Class"] != "Non-neuronal and Non-neural"
is_non_neuronal = external_adata.obs["Class"] == "Non-neuronal and Non-neural"

# One metadata CSV per source for everything that is not non-neuronal.
for i in external_adata.obs["Source"].cat.categories:
    print(i)
    source_metadata = external_adata.obs.loc[not_non_neuronal & (external_adata.obs["Source"] == i), :]
    print(source_metadata.shape)
    source_metadata.to_csv(os.path.join(pwd, "output", "PFC_" + i + "_external_metadata_keepers_neurons.2023-11-16.csv"))

# Non-neuronal cells from all sources go into a single intermediate CSV.
external_adata.obs.loc[is_non_neuronal, :].to_csv(os.path.join(pwd, "output", "PFC_external_non-neurons_intermediate_results.2023-11-16.csv"))


### Rebuild non-neuronal AnnData files with all genes to identify novel populations

In [None]:
# Re-read every dataset, this time keeping ALL genes but only the cells listed
# in the non-neuronal intermediate results.
external_adata = []
external_datasets = glob.glob(os.path.join(pwd, "input", "*_external_singleomeCR6.2023-10-12.h5ad"))
external_datasets.extend([os.path.join(pwd, "input", "SEAAD_A9_RNAseq_final-nuclei.2023-07-19.h5ad")])

non_neuronal_cells = pd.read_csv(os.path.join(pwd, "output", "PFC_external_non-neurons_intermediate_results.2023-11-16.csv"), index_col=0)

for k in external_datasets:
    file_name = os.path.basename(k)
    print(file_name)
    tmp = sc.read_h5ad(k)
    tmp = tmp[np.intersect1d(tmp.obs_names, non_neuronal_cells.index), :].copy()
    # Keep only raw UMI counts, stored in X.
    tmp.X = tmp.layers["UMIs"].copy()
    del tmp.layers["UMIs"]
    # Decide on the file name only, so a working directory whose path happens to
    # contain "external" cannot mislabel the Allen dataset.
    if "external" in file_name:
        tmp.obs["Source"] = file_name.replace("PFC_", "").replace("_external_singleomeCR6.2023-10-12.h5ad", "")
    else:
        tmp.obs["Source"] = "Allen"

    external_adata.append(tmp)

first = external_adata.pop()
external_adata = first.concatenate(external_adata, index_unique=None)

# Remove NaNs from obs: drop columns that are entirely empty, otherwise fill
# with 0.0 (float columns) or "" (everything else).
for i in external_adata.obs.columns[external_adata.obs.isna().sum(axis=0) > 0]:
    if not external_adata.obs[i].notna().any():
        print("Dropping no-value column " + i)
        external_adata.obs.drop([i], axis=1, inplace=True)

    else:
        replace_with = ""

        # .iloc[0] takes the first non-null value by position; plain [0] on a
        # label-indexed Series relies on a deprecated positional fallback.
        first_value = external_adata.obs.loc[external_adata.obs[i].notna(), i].iloc[0]
        if isinstance(first_value, (np.float64, np.float32)):
            replace_with = 0.0

        # Categoricals cannot take a fill value outside their categories.
        if isinstance(external_adata.obs[i].dtype, pd.CategoricalDtype):
            external_adata.obs[i] = external_adata.obs[i].astype("object")

        # Boolean columns are converted to their string form.
        if external_adata.obs[i].dtype == "bool" or external_adata.obs[i].dtype == "boolean":
            external_adata.obs[i] = external_adata.obs[i].astype("object")
            external_adata.obs[i] = [str(l) for l in external_adata.obs[i]]

        print("Replacing NaNs with " + str(replace_with) + " for " + i + " with dtype " + str(type(external_adata.obs.loc[external_adata.obs[i].notna(), i].iloc[0])))
        external_adata.obs.loc[external_adata.obs[i].isna(), i] = replace_with

        # Columns that hold Python bools next to the "" filler are stored as strings.
        remaining = external_adata.obs.loc[(external_adata.obs[i].notna()) & (external_adata.obs[i] != ""), i]
        if len(remaining) > 0 and isinstance(remaining.iloc[0], bool):
            external_adata.obs[i] = external_adata.obs[i].astype("str")

# Force string dtype on these columns and on any column containing "Reference".
external_adata.obs["Donor ID"] = external_adata.obs["Donor ID"].astype("str")
external_adata.obs["Age at Death"] = external_adata.obs["Age at Death"].astype("str")
for i in external_adata.obs.columns[(external_adata.obs == "Reference").sum(axis=0) > 0]:
    external_adata.obs[i] = external_adata.obs[i].astype("str")

# write() takes ONE path; the directory and file name must be joined first,
# otherwise the file name is passed as the compression argument.
external_adata.write(os.path.join(pwd, "output", "PFC_external_SEAAD_singleomeCR6_UMIs_only_non-neuronal.2023-11-20.h5ad"))

### Load the non-neuronal data and mapping results

In [None]:
# read_h5ad() takes ONE path; the directory and file name must be joined first,
# otherwise the file name is passed as the `backed` argument.
non_neuronal_path = os.path.join(pwd, "output", "PFC_external_SEAAD_singleomeCR6_UMIs_only_non-neuronal.2023-11-20.h5ad")
external_adata = sc.read_h5ad(non_neuronal_path)

# Cells whose Class is still empty take their mapping results from the
# intermediate non-neuronal table loaded in the previous cell.
mapping_columns = ["Class", "Class confidence", "Subclass", "Subclass confidence", "Supertype (non-expanded)",  "Supertype confidence"]
needs_mapping = external_adata.obs["Class"] == ""
external_adata.obs.loc[needs_mapping, mapping_columns] = non_neuronal_cells.loc[needs_mapping, mapping_columns].copy()
external_adata.obs["Subclass"] = external_adata.obs["Subclass"].cat.remove_unused_categories()

### Identify novel populations in non-neuronal neighborhoods

In [None]:
# Mark every cell as used, and take a working copy of the label columns that
# the per-subclass loop below writes its corrections into.
external_adata.obs["Used in analysis"] = True
to_update = external_adata.obs.loc[:, ["Used in analysis", "Subclass", "Supertype", "Supertype (non-expanded)"]].copy()
for j in ["Oligodendrocyte", "OPC", "Astrocyte", "Microglia-PVM", "VLMC", "Endothelial"]:
    print(j)
    # Subset the data, compute normalization
    sub = external_adata[external_adata.obs["Subclass"] == j, :].copy()
    sub.layers["UMIs"] = sub.X.copy()
    sc.pp.normalize_total(sub, 1e5)
    sc.pp.log1p(sub)

    if os.path.exists(os.path.join(pwd, "output", "External_Data_Models", j.replace("/", " "))) == False:
        adata_ref = sub[sub.obs["Source"] == "Allen", :].copy()
        groupby = "Supertype"
        n_top_genes = 2000
        n_downsample_ref = 1000
        n_ref_genes = 500

        # Define model genes to use
        markers = []
        try:
            sc.pp.highly_variable_genes(adata_ref, flavor="seurat_v3", n_top_genes=n_top_genes, layer="UMIs")
        except:
            sc.pp.highly_variable_genes(adata_ref, min_mean=1, min_disp=0.5)    
        markers = adata_ref.var[adata_ref.var.highly_variable == True].index.to_list()
        
        ref_counts = adata_ref.obs[groupby].value_counts()
        adata_ref = adata_ref[~(adata_ref.obs[groupby].isin(ref_counts[ref_counts < 15].index))]
        
        if np.setdiff1d(adata_ref.obs[groupby].cat.categories, "Unknown").shape[0] > 1:
            
            cells = []
            for i in adata_ref.obs[groupby].cat.categories:
                tmp_cells = adata_ref[adata_ref.obs[groupby] == i].obs_names.to_list()
                if len(tmp_cells) > n_downsample_ref:
                    cells = cells + random.sample(tmp_cells, k=n_downsample_ref)
                else:
                    cells.extend(tmp_cells)
            adata_ref = adata_ref[cells]
            
            sc.tl.rank_genes_groups(adata_ref, method="wilcoxon", tie_correct=True, groupby=groupby, pts=True)
    
            result = adata_ref.uns['rank_genes_groups']
            groups = result['names'].dtype.names
            marker_genes = {group: pd.DataFrame({key: result[key][group] for key in ['names', 'pvals_adj', 'logfoldchanges']}) for group in groups}
    
            for group in groups:
                marker_genes[group]['pts'] = result['pts'][group][result['names'][group]].to_list()
                marker_genes[group]['pts_rest'] = result['pts_rest'][group][result['names'][group]].to_list()
                marker_genes[group].index = marker_genes[group].names
                marker_genes[group].drop(columns=['names'], inplace=True)
                tmp_genes = marker_genes[group].copy()
                tmp_genes = tmp_genes[tmp_genes.pvals_adj < 0.05]
                tmp_genes.sort_values(by="logfoldchanges", axis=0, inplace=True, ascending=False)
                markers.extend(tmp_genes.head(n_ref_genes).index.to_list())
        else:
            warnings.warn(groupby + " contains only one label. Differentially expressed genes were NOT included in the model")
    
        markers = np.unique(markers)
        del adata_ref
        
        # Train the subclass models
        model_args = {
            "n_layers": 2,
            "n_latent": 20,
            "dispersion": "gene-batch",
        }
        sub_sub = sub[:, markers].copy()
        scvi.model.SCVI.setup_anndata(
            sub_sub,
            layer="UMIs",
            batch_key="Source",
            categorical_covariate_keys=["library_prep"]
        )
        subclass_model = scvi.model.SCVI(sub_sub, **model_args)
        subclass_model.train(max_epochs=500, early_stopping=True, use_gpu=True)
        subclass_model.save(os.path.join(pwd, "output", "External_Data_Models", j.replace("/", " ")))
    else:
        markers = torch.load(os.path.join(pwd, "output", "External_Data_Models", j.replace("/", " "), "model.pt"))
        markers = markers["var_names"]
        sub_sub = sub[:, markers].copy()
        scvi.model.SCVI.setup_anndata(
            sub_sub,
            layer="UMIs",
            batch_key="Source",
            categorical_covariate_keys=["library_prep"]
        )
        subclass_model = scvi.model.SCVI.load(os.path.join(pwd, "output", "External_Data_Models", j.replace("/", " ")), sub_sub)
        
    del sub_sub
    sub.obsm["X_scVI"] = subclass_model.get_latent_representation()
    del sub.layers["UMIs"]
    print(str(datetime.now()) + " -- Computed latent representation")
    rsc.pp.neighbors(sub, use_rep="X_scVI", n_neighbors=200)
    print(str(datetime.now()) + " -- Computed neighbors")
    rsc.tl.umap(sub, min_dist=0.3)
    print(str(datetime.now()) + " -- Computed low dimensional embedding")
    rsc.tl.leiden(sub, resolution=1)
    print(str(datetime.now()) + " -- Computed clustering")

    # Add low abundance and SEA-AD specific cell types.
    # Each branch writes the corrected label twice: into `to_update` (rows addressed by cell
    # name through sub.obs_names) and into `sub.obs` (the table coloured in the UMAPs below).
    # NOTE(review): the hard-coded leiden cluster ids presumably come from one specific
    # clustering run -- confirm they still match if the model or clustering is regenerated.
    if j == "Oligodendrocyte":
        # Fix calling of low abundance Oligo_3
        signature_score = sc.get.obs_df(
            sub,
            ["FRY", "MDGA2", "CNTN1", "ERBB4", "SGCZ"],
        )
        # Z-score every gene across the cells in `sub`, then average the z-scores per cell.
        signature_score = ((signature_score - signature_score.mean(axis=0)) / signature_score.std(axis=0)).mean(axis=1)
        # Cells with a mean z-score above 1.5 are relabelled Oligo_3.
        to_update.loc[sub.obs_names[signature_score > 1.5], "Supertype (non-expanded)"] = "Oligo_3"
        sub.obs.loc[signature_score > 1.5, "Supertype (non-expanded)"] = "Oligo_3"
        
    elif j == "OPC":
        # Fix calling of low abundance OPC_1
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["1", "13", "180", "212"])], "Supertype (non-expanded)"] = "OPC_1"
        sub.obs.loc[sub.obs["leiden"].isin(["1", "13", "180", "212"]), "Supertype (non-expanded)"] = "OPC_1"

        # Add SEA-AD specific populations
        signature_score = sc.get.obs_df(
            sub,
            ["ARHGEF3", "TPST1", "SLC24A2", "RAB3IP", "CAMK2D"],
        )
        # Per-cell mean of per-gene z-scores (same scoring as the other branches).
        signature_score = ((signature_score - signature_score.mean(axis=0)) / signature_score.std(axis=0)).mean(axis=1)
        # The new label must be registered as a category before it can be assigned.
        to_update["Supertype (non-expanded)"] = to_update["Supertype (non-expanded)"].cat.add_categories(["OPC_2_2-SEAAD"])
        # This branch uses a lower threshold (2/3) than the 1.5 used elsewhere.
        to_update.loc[sub.obs_names[signature_score > (2/3)], "Supertype (non-expanded)"] = "OPC_2_2-SEAAD"
        sub.obs["Supertype (non-expanded)"] = sub.obs["Supertype (non-expanded)"].cat.add_categories(["OPC_2_2-SEAAD"])
        sub.obs.loc[signature_score > (2/3), "Supertype (non-expanded)"] = "OPC_2_2-SEAAD"
        
    elif j == "Astrocyte":
        # Add SEA-AD specific populations
        signature_score = sc.get.obs_df(
            sub,
            ["HILPDA", "CD44", "ARHGEF3", "TPST1", "SERPINA3"],
        )
        # Per-cell mean of per-gene z-scores.
        signature_score = ((signature_score - signature_score.mean(axis=0)) / signature_score.std(axis=0)).mean(axis=1)
        to_update["Supertype (non-expanded)"] = to_update["Supertype (non-expanded)"].cat.add_categories(["Astro_6-SEAAD"])
        to_update.loc[sub.obs_names[signature_score > 1.5], "Supertype (non-expanded)"] = "Astro_6-SEAAD"
        sub.obs["Supertype (non-expanded)"] = sub.obs["Supertype (non-expanded)"].cat.add_categories(["Astro_6-SEAAD"])
        sub.obs.loc[signature_score > 1.5, "Supertype (non-expanded)"] = "Astro_6-SEAAD"

        # Fix calling of Astro_3
        signature_score = sc.get.obs_df(
            sub,
            ["LINC00609", "AC012405.1", "L3MBTL4", "ADAMTSL3", "DCLK1"],
        )
        signature_score = ((signature_score - signature_score.mean(axis=0)) / signature_score.std(axis=0)).mean(axis=1)
        # Cells currently labelled Astro_3 (evaluated after the Astro_6-SEAAD relabelling above)
        # that score below 1.5 and are not in leiden cluster 15 are moved to Astro_2.
        selected_cells = (signature_score < 1.5) & (sub.obs["leiden"] != "15") & (sub.obs["Supertype (non-expanded)"] == "Astro_3")
        to_update.loc[sub.obs_names[selected_cells], "Supertype (non-expanded)"] = "Astro_2"
        sub.obs.loc[selected_cells, "Supertype (non-expanded)"] = "Astro_2"
        
    elif j == "Microglia-PVM":
        # Fix calling of low abundance Micro-PVM_1
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["11"])], "Supertype (non-expanded)"] = "Micro-PVM_1"
        sub.obs.loc[sub.obs["leiden"].isin(["11"]), "Supertype (non-expanded)"] = "Micro-PVM_1"
        
        # Add SEA-AD specific populations
        # Labels here are assigned purely by leiden cluster membership; to_update first, then sub.obs.
        to_update["Supertype (non-expanded)"] = to_update["Supertype (non-expanded)"].cat.add_categories(["Monocyte", "Micro-PVM_2_1-SEAAD", "Micro-PVM_2_3-SEAAD", "Micro-PVM_3-SEAAD", "Micro-PVM_4-SEAAD"])
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["3"])], "Supertype (non-expanded)"] = "Monocyte"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["14"])], "Supertype (non-expanded)"] = "Micro-PVM_2_1-SEAAD"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["6"])], "Supertype (non-expanded)"] = "Micro-PVM_3-SEAAD"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["10"])], "Supertype (non-expanded)"] = "Micro-PVM_4-SEAAD"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["12", "17"])], "Supertype (non-expanded)"] = "Micro-PVM_2_3-SEAAD"


        sub.obs["Supertype (non-expanded)"] = sub.obs["Supertype (non-expanded)"].cat.add_categories(["Monocyte", "Micro-PVM_2_1-SEAAD", "Micro-PVM_2_3-SEAAD", "Micro-PVM_3-SEAAD", "Micro-PVM_4-SEAAD"])
        sub.obs.loc[sub.obs["leiden"].isin(["3"]), "Supertype (non-expanded)"] = "Monocyte"
        sub.obs.loc[sub.obs["leiden"].isin(["14"]), "Supertype (non-expanded)"] = "Micro-PVM_2_1-SEAAD"
        sub.obs.loc[sub.obs["leiden"].isin(["6"]), "Supertype (non-expanded)"] = "Micro-PVM_3-SEAAD"
        sub.obs.loc[sub.obs["leiden"].isin(["10"]), "Supertype (non-expanded)"] = "Micro-PVM_4-SEAAD"
        sub.obs.loc[sub.obs["leiden"].isin(["12", "17"]), "Supertype (non-expanded)"] = "Micro-PVM_2_3-SEAAD"

    elif j == "VLMC":        
        # Add SEA-AD specific populations
        to_update["Supertype (non-expanded)"] = to_update["Supertype (non-expanded)"].cat.add_categories(["SMC-SEAAD", "VLMC_2-SEAAD", "VLMC_3-SEAAD", "Pericyte_1", "Pericyte_2-SEAAD"])
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["12"])], "Supertype (non-expanded)"] = "SMC-SEAAD"
        # NOTE(review): the next line repeats the SMC-SEAAD assignment above verbatim (no extra effect).
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["12"])], "Supertype (non-expanded)"] = "SMC-SEAAD"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["7"])], "Supertype (non-expanded)"] = "VLMC_2-SEAAD"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["4"])], "Supertype (non-expanded)"] = "VLMC_3-SEAAD"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["5", "9"])], "Supertype (non-expanded)"] = "Pericyte_2-SEAAD"
        # NOTE(review): this "VLMC_2" test reads sub.obs BEFORE sub.obs is relabelled, while the
        # matching sub.obs update at the end of this branch runs AFTER the leiden-based relabels.
        # Cells originally called VLMC_2 that sit in clusters 12/7/4/5/9 therefore end up as
        # Pericyte_1 in to_update but keep the cluster-based label in sub.obs -- confirm which
        # outcome is intended.
        to_update.loc[sub.obs_names[sub.obs["Supertype (non-expanded)"] == "VLMC_2"], "Supertype (non-expanded)"] = "Pericyte_1"

        sub.obs["Supertype (non-expanded)"] = sub.obs["Supertype (non-expanded)"].cat.add_categories(["SMC-SEAAD", "VLMC_2-SEAAD", "VLMC_3-SEAAD", "Pericyte_1", "Pericyte_2-SEAAD"])
        sub.obs.loc[sub.obs["leiden"].isin(["12"]), "Supertype (non-expanded)"] = "SMC-SEAAD"
        sub.obs.loc[sub.obs["leiden"].isin(["7"]), "Supertype (non-expanded)"] = "VLMC_2-SEAAD"
        sub.obs.loc[sub.obs["leiden"].isin(["4"]), "Supertype (non-expanded)"] = "VLMC_3-SEAAD"
        sub.obs.loc[sub.obs["leiden"].isin(["5", "9"]), "Supertype (non-expanded)"] = "Pericyte_2-SEAAD"
        # Remaining cells still labelled VLMC_2 at this point are renamed Pericyte_1.
        sub.obs.loc[sub.obs["Supertype (non-expanded)"] == "VLMC_2", "Supertype (non-expanded)"] = "Pericyte_1"

    elif j == "Endothelial":        
        # Add SEA-AD specific populations
        # Every cell of this subclass is first set to Endo_2; the listed leiden clusters then
        # override that default with Endo_1 / Endo_3.
        to_update.loc[sub.obs_names, "Supertype (non-expanded)"] = "Endo_2"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["2"])], "Supertype (non-expanded)"] = "Endo_1"
        to_update.loc[sub.obs_names[sub.obs["leiden"].isin(["1", "5"])], "Supertype (non-expanded)"] = "Endo_3"

        sub.obs.loc[:, "Supertype (non-expanded)"] = "Endo_2"
        sub.obs.loc[sub.obs["leiden"].isin(["2"]), "Supertype (non-expanded)"] = "Endo_1"
        sub.obs.loc[sub.obs["leiden"].isin(["1", "5"]), "Supertype (non-expanded)"] = "Endo_3"

        
    # Diagnostic UMAPs for the current subclass: dataset of origin, labels, QC metrics, clusters.
    plt.rcParams["figure.figsize"] = (8,8)
    sc.pl.umap(sub, color=["Source"], size=1)
    # One extra panel per dataset so each source can be inspected on its own.
    for source_name in ["Allen", "Mathys_2023", "Green_2023"]:
        sc.pl.umap(sub, color=["Source"], groups=[source_name], size=1)
    sc.pl.umap(sub, color=["Subclass", "Supertype (non-expanded)", "Supertype"], legend_loc="on data", ncols=1, size=1)

    # Continuous QC metrics, one panel each (the enumerate index was never used).
    qc_vars = ["Doublet score", "Genes detected", "Fraction mitochondrial UMIs"]
    for qc_var in qc_vars:
        sc.pl.umap(
            sub,
            color=qc_var,
            frameon=False,
            cmap="YlGnBu",
            sort_order=False,
            size=1
        )

    # Cluster ids drawn on the embedding; these are the ids referenced by the label corrections.
    sc.pl.umap(sub, color=["leiden"], legend_loc="on data", ncols=1)

# Copy the corrected labels back onto the full object. Both supertype columns receive the
# same "Supertype (non-expanded)" values from to_update.
corrected_labels = to_update["Supertype (non-expanded)"]
for label_column in ("Supertype (non-expanded)", "Supertype"):
    external_adata.obs[label_column] = corrected_labels.copy()

# Keep only the cells flagged as used in analysis, then mark the survivors explicitly.
keep_mask = to_update["Used in analysis"] == True
external_adata = external_adata[keep_mask].copy()
external_adata.obs["Used in analysis"] = True

### Save final mapping results and QC for non-neuronal subclasses

In [None]:
# Write one metadata CSV per dataset ("Source") for the retained non-neuronal cells.
non_neuron_output_dir = os.path.join(pwd, "output", "PFC_Model")
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(non_neuron_output_dir, exist_ok=True)

for i in external_adata.obs["Source"].cat.categories:
    print(i)
    source_obs = external_adata.obs.loc[(external_adata.obs["Source"] == i), :]
    source_obs.to_csv(os.path.join(non_neuron_output_dir, "PFC_" + i + "_external_metadata_keepers_non-neurons.2023-12-11.csv"))

### Build combined dataset

In [None]:
# Build one AnnData from every external dataset plus the SEA-AD (Allen) nuclei, restricted to
# the genes listed in de_genes.csv, with the "UMIs" layer moved into .X and the neuronal /
# non-neuronal mapping results merged into .obs.
external_adata = []
external_datasets = glob.glob(os.path.join(pwd, "input", "*_external_singleomeCR6.2023-10-12.h5ad"))
external_datasets.extend([os.path.join(pwd, "input", "SEAAD_A9_RNAseq_final-nuclei.2023-07-19.h5ad")])

de_genes = pd.read_csv(os.path.join(pwd, "output", "de_genes.csv"), header=None)
de_genes = de_genes[0].to_list()

# Annotation columns taken from the saved mapping results.
annotation_columns = ["Class", "Class confidence", "Subclass", "Subclass confidence", "Supertype (non-expanded)", "Supertype confidence"]

for k in external_datasets:
    file_name = os.path.basename(k)
    print(file_name)
    tmp = sc.read_h5ad(k)
    tmp = tmp[:, de_genes].copy()
    tmp.X = tmp.layers["UMIs"].copy()
    del tmp.layers["UMIs"]
    # Test the file name only: a parent directory containing "external" must not make the
    # SEA-AD file look like an external dataset.
    if "external" in file_name:
        tmp.obs["Source"] = file_name.replace("PFC_", "").replace("_external_singleomeCR6.2023-10-12.h5ad", "")
    else:
        tmp.obs["Source"] = "Allen"
        # Drop the annotations shipped with the SEA-AD file; they are replaced by the merge below.
        tmp.obs = tmp.obs.drop(["Class", "Class confidence", "Subclass", "Subclass confidence", "Supertype", "Supertype (non-expanded)", "Supertype confidence"], axis=1)

    print(tmp.shape[0])
    source = tmp.obs["Source"].iloc[0]
    neuronal_annotations = pd.read_csv(os.path.join(pwd, "output", "PFC_" + source + "_external_metadata_keepers_neurons.2023-11-16.csv"), index_col=0)
    neuronal_annotations = neuronal_annotations.loc[:, annotation_columns].copy()

    # The non-neuronal CSVs are written under output/PFC_Model by the save cell earlier in this
    # notebook; fall back to output/ (the location this cell used to read) if not found there.
    glial_file = "PFC_" + source + "_external_metadata_keepers_non-neurons.2023-12-11.csv"
    glial_path = os.path.join(pwd, "output", "PFC_Model", glial_file)
    if not os.path.exists(glial_path):
        glial_path = os.path.join(pwd, "output", glial_file)
    glial_annotations = pd.read_csv(glial_path, index_col=0)
    glial_annotations = glial_annotations.loc[:, annotation_columns].copy()
    annotations = pd.concat([neuronal_annotations, glial_annotations], axis=0)

    # Left merge: cells without a saved annotation keep their row and get NaN annotations.
    tmp.obs = tmp.obs.merge(annotations, left_index=True, right_index=True, how="left")

    external_adata.append(tmp)

first = external_adata.pop()
external_adata = first.concatenate(external_adata, index_unique=None)

# Make every obs column NaN-free so the object can be written to h5ad.
for i in external_adata.obs.columns[external_adata.obs.isna().sum(axis=0) > 0]:
    if not external_adata.obs[i].notna().any():
        print("Dropping no-value column " + i)
        external_adata.obs.drop([i], axis=1, inplace=True)
        continue

    replace_with = ""

    # .iloc[0] takes the first non-missing value by position; plain [0] on a string-indexed
    # Series relies on a deprecated positional fallback.
    first_value = external_adata.obs.loc[external_adata.obs[i].notna(), i].iloc[0]
    if isinstance(first_value, (np.float64, np.float32)):
        replace_with = 0.0

    if isinstance(external_adata.obs[i].dtype, pd.CategoricalDtype):
        external_adata.obs[i] = external_adata.obs[i].astype("object")

    if external_adata.obs[i].dtype == "bool" or external_adata.obs[i].dtype == "boolean":
        external_adata.obs[i] = external_adata.obs[i].astype("object")
        external_adata.obs[i] = [str(value) for value in external_adata.obs[i]]

    print("Replacing NaNs with " + str(replace_with) + " for " + i + " with dtype " + str(type(external_adata.obs.loc[external_adata.obs[i].notna(), i].iloc[0])))
    external_adata.obs.loc[external_adata.obs[i].isna(), i] = replace_with

    # Object columns holding Python bools are stored as strings.
    non_empty = external_adata.obs.loc[(external_adata.obs[i].notna()) & (external_adata.obs[i] != ""), i]
    if len(non_empty) > 0 and isinstance(non_empty.iloc[0], bool):
        external_adata.obs[i] = external_adata.obs[i].astype("str")

external_adata.obs["Donor ID"] = external_adata.obs["Donor ID"].astype("str")
external_adata.obs["Age at Death"] = external_adata.obs["Age at Death"].astype("str")
# Any column containing the value "Reference" is stored as strings.
for i in external_adata.obs.columns[(external_adata.obs == "Reference").sum(axis=0) > 0]:
    external_adata.obs[i] = external_adata.obs[i].astype("str")

# Cells that received no annotation in the merge have an empty "Class" after NaN replacement.
external_adata.obs["Used in analysis"] = (external_adata.obs["Class"] != "").astype("str")

# The output location must be a single joined path: the second positional argument of
# AnnData.write is not a path component.
external_adata.write(os.path.join(pwd, "output", "PFC_external_SEAAD_singleomeCR6_UMIs_only_de_genes_final.2023-12-12.h5ad"))

### Obtain representation of the whole dataset

In [None]:
# Load the combined dataset, embed the analysed non-reference cells with the saved "Full" scVI
# model, and compute neighbors and a UMAP on that latent space.
# The file location must be a single joined path: the second positional argument of
# sc.read_h5ad is `backed`, not a path component.
# NOTE(review): this reads "..._UMIs_expanded_de_genes_final..." whereas the cell above writes
# "..._UMIs_only_de_genes_final..."; presumably the expanded file is produced elsewhere -- confirm.
external_adata = sc.read_h5ad(os.path.join(pwd, "output", "PFC_external_SEAAD_singleomeCR6_UMIs_expanded_de_genes_final.2023-12-12.h5ad"))
selected_cells = (external_adata.obs["Cognitive Status"] != "Reference") & (external_adata.obs["Used in analysis"] == "True")

external_adata = external_adata[selected_cells, :].copy()
# The model is loaded against a copy restricted to the gene list saved next to it.
var_names = pd.read_csv(os.path.join(pwd, "output", "External_Data_Models", "Full", "var_names.csv"), header=None).iloc[:, 0].to_list()
sub =  external_adata[:, var_names].copy()
model = scvi.model.SCVI.load(os.path.join(pwd, "output", "External_Data_Models", "Full"), sub)
del sub

external_adata.obsm["X_scVI"] = model.get_latent_representation()
print(str(datetime.now()) + " -- Computed latent representation")
rsc.pp.neighbors(external_adata, use_rep="X_scVI", n_neighbors=50)
print(str(datetime.now()) + " -- Computed neighbors")
rsc.tl.umap(external_adata, init_pos="random")
print(str(datetime.now()) + " -- Computed UMAP")

# fraction=1 keeps every cell; presumably used only to shuffle cell order for plotting -- confirm.
sc.pp.subsample(external_adata, fraction=1)

# Label -> colour lookups for subclasses and supertypes, both taken from one CSV read.
cluster_order_and_colors = pd.read_csv(os.path.join(pwd, "input", "cluster_order_and_colors.csv"))
subclass_colors = cluster_order_and_colors.set_index("subclass_label")["subclass_color"].to_dict()
supertype_colors = cluster_order_and_colors.set_index("cluster_label")["cluster_color"].to_dict()

### Obtain final neighborhood representations and signature scores

In [None]:
# Subclasses grouped into neighborhoods; each neighborhood gets its own scVI model.
neighborhoods = {
    "MGE": ["Sst Chodl", "Sst", "Pvalb", "Chandelier"], # 4
    "CGE": ["Lamp5", "Lamp5 Lhx6", "Pax6", "Sncg", "Vip"], # 5
    "IT": ["L2/3 IT", "L4 IT", "L5 IT", "L6 IT", "L6 IT Car3"], # 5
    "Deep Projecting": ["L5 ET", "L5/6 NP", "L6b", "L6 CT"], # 4
    "Glia Refined": ["Oligodendrocyte", "OPC", "Astrocyte"], # 3
    "Vascular and Immune Refined": ["Endothelial", "VLMC", "Microglia-PVM"] # 3
}

# scVI hyperparameters shared by every neighborhood model.
neighborhood_model_args = {
    "n_layers": 2,
    "n_latent": 20,
    "dispersion": "gene-batch",
}

# neighborhood name -> DataFrame (rows: Source, columns: Supertype) of Spearman correlations
# between each source's signature-score profile and the "Allen" profile.
fingerprint_correlation = {}
for i,j in neighborhoods.items():
    print(i)
    model_dir = os.path.join(pwd, "output", "External_Data_Models", i)
    neighborhood_external_adata = external_adata[external_adata.obs["Subclass"].isin(j)].copy()
    scvi.model.SCVI.setup_anndata(
        neighborhood_external_adata,
        layer=None,
        batch_key="Source",
        categorical_covariate_keys=["library_prep"]
    )
    if not os.path.exists(model_dir):
        # No saved model for this neighborhood yet: train one and save it.
        neighborhood_model = scvi.model.SCVI(neighborhood_external_adata, **neighborhood_model_args)
        print(neighborhood_model)
        print(neighborhood_model.view_anndata_setup())
        neighborhood_model.train(max_epochs=500, early_stopping=True, use_gpu=True)
        neighborhood_model.save(model_dir)
    else:
        var_names_path = os.path.join(model_dir, "var_names.csv")
        if not os.path.exists(var_names_path):
            # Recover the gene list from the saved model of THIS neighborhood; model.pt lives
            # inside the per-neighborhood directory, not directly under External_Data_Models.
            tmp = torch.load(os.path.join(model_dir, "model.pt"))
            pd.DataFrame(tmp["var_names"]).to_csv(var_names_path, index=False, header=False)
        var_names = pd.read_csv(var_names_path, header=None).iloc[:, 0].to_list()
        sub = neighborhood_external_adata[:, var_names].copy()
        neighborhood_model = scvi.model.SCVI.load(model_dir, sub)
        del sub

    neighborhood_external_adata.obsm["X_scVI"] = neighborhood_model.get_latent_representation()
    print(str(datetime.now()) + " -- Computed latent representation")
    rsc.pp.neighbors(neighborhood_external_adata, use_rep="X_scVI", n_neighbors=50)
    print(str(datetime.now()) + " -- Computed neighbors")
    rsc.tl.umap(neighborhood_external_adata)
    print(str(datetime.now()) + " -- Computed UMAP")

    # Supertype labels present in this neighborhood, excluding the empty label.
    supertypes = np.setdiff1d(neighborhood_external_adata.obs["Supertype"], "").tolist()

    # Order matters: genes are ranked on normalised, log-transformed values and the matrix is
    # scaled only afterwards, so the signature scores below are means of scaled expression.
    sc.pp.normalize_total(neighborhood_external_adata, 1e5)
    sc.pp.log1p(neighborhood_external_adata)
    sc.tl.rank_genes_groups(neighborhood_external_adata, groupby="Supertype", groups=supertypes)
    sc.pp.scale(neighborhood_external_adata)
    # Top 10 ranked genes per supertype (one column per supertype).
    signature_genes = pd.DataFrame(neighborhood_external_adata.uns['rank_genes_groups']['names']).head(10)

    fingerprint_correlation[i] = pd.DataFrame(
        np.zeros((len(neighborhood_external_adata.obs["Source"].cat.categories), len(supertypes))),
        index=neighborhood_external_adata.obs["Source"].cat.categories,
        columns=supertypes,
    )

    for z in supertypes:
        # Per-cell score: mean over the signature genes of supertype z.
        neighborhood_external_adata.obs["Signature Score"] = neighborhood_external_adata[:, signature_genes[z]].X.mean(axis=1)
        signature_scores = sc.get.obs_df(neighborhood_external_adata, ["Source", "Supertype", "Signature Score"])
        # Mean score per (Source, Supertype), pivoted to Source x Supertype; missing pairs -> 0.
        signature_scores = signature_scores.groupby(["Source", "Supertype"]).mean().reset_index().pivot(index="Source", columns="Supertype", values="Signature Score").fillna(0)
        for y in neighborhood_external_adata.obs["Source"].cat.categories:
            # Correlate each source's profile across supertypes with the "Allen" profile.
            fingerprint_correlation[i].loc[y, z] = sp_stats.spearmanr(signature_scores.loc["Allen", :], signature_scores.loc[y, :])[0]
