### Load needed libraries

In [None]:
import os
import re
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import colors
%matplotlib inline
import scanpy as sc
import warnings
from datetime import datetime

# Number of parallel jobs scanpy is allowed to use.
sc.settings.n_jobs = 32
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Turn off matplotlib's interactive mode.
plt.ioff()

# Working directory at start-up; the "input" and "output" paths used below
# are all built relative to it.
pwd = os.getcwd()

### Define AnnData preparation function

In [None]:
# obs columns that are cast to categorical and whose missing values are
# replaced by an explicit "Reference" category.
_CATEGORICAL_COLUMNS_TO_FIX = [
    "ch_race___1",
    "ch_race___2",
    "ch_race___3",
    "ch_race___4",
    "ch_race___5",
    "ch_race___6",
    "ch_race___97",
    "ch_raceother",
    "ch_hispanicorlatino",
    "ch_education",
    "adneurochange",
    "thal",
    "braak",
    "cscore",
    "caascore",
    "lewybodydisease",
    "ge_atherosclerosis_id",
    "micro_arteriolosclerosis_id",
    "late_stage",
    "ch_cognitivestatus_binary",
    "primary_studyname",
    "secondary_studyname",
]

# obs columns that are converted to strings, with the value "Reference" for
# every cell whose `reference_cell` flag equals 1.
_NUMERIC_COLUMNS_TO_FIX = [
    "ch_education_years",
    "pmi_date",
    "ap_freshbrainweight",
    "ap_brainph",
    "micro_totalmicroinfarcts",
    "micro_microvascularbrain",
    "ch_lastcasiscore",
    "ch_casi_interval",
    "ch_lastmmsescore",
    "ch_mmse_interval",
    "ch_lastmocascore_2",
    "ch_moca_interval",
]

# Explicit category order for selected categorical columns (raw column names).
_CATEGORY_ORDER = {
    "ch_cognitivestatus_binary": ["Reference", "No dementia", "Dementia"],
    "adneurochange": ["Reference", "Not AD", "Low", "Intermediate", "High"],
    "braak": ["Reference", "Braak 0", "Braak II", "Braak III", "Braak IV", "Braak V", "Braak VI"],
    "thal": ["Reference", "Thal 0", "Thal 1", "Thal 2", "Thal 3", "Thal 4", "Thal 5"],
    "cscore": ["Reference", "Absent", "Sparse", "Moderate", "Frequent"],
    "lewybodydisease": [
        "Reference",
        "Not Identified (olfactory bulb not assessed)",
        "Not Identified (olfactory bulb assessed)",
        "Olfactory bulb only",
        "Amygdala-predominant",
        "Brainstem-predominant",
        "Limbic (Transitional)",
        "Neocortical (Diffuse)",
    ],
    "late_stage": ["Reference", "Unclassifiable", "Not Identified", "LATE Stage 1", "LATE Stage 2", "LATE Stage 3"],
    "ch_apoe_four": ["Reference", "N", "Y"],
}

# obs columns kept in the exported object, in output order (raw column names).
_DISPLAY_COLUMNS = [
    "sample_id",
    "reference_cell",
    "donor_name",
    "organism",
    "roi",
    "sex",
    "gender",
    "age_at_death",
    "ch_race___1",
    "ch_race___2",
    "ch_race___3",
    "ch_race___4",
    "ch_race___5",
    "ch_race___6",
    "ch_race___97",
    "ch_raceother",
    "ch_hispanicorlatino",
    "ch_education",
    "ch_education_years",
    "pmi_date",
    "ap_freshbrainweight",
    "ap_brainph",
    "adneurochange",
    "thal",
    "braak",
    "cscore",
    "caascore",
    "lewybodydisease",
    "micro_totalmicroinfarcts",
    "micro_microvascularbrain",
    "ge_atherosclerosis_id",
    "micro_arteriolosclerosis_id",
    "late_stage",
    "ch_cognitivestatus_binary",
    "ch_lastcasiscore",
    "ch_casi_interval",
    "ch_lastmmsescore",
    "ch_mmse_interval",
    "ch_lastmocascore_2",
    "ch_moca_interval",
    "ch_apoe_four",
    "primary_studyname",
    "secondary_studyname",
    "cell_prep_type",
    "facs_population_plan",
    "rna_amplification",
    "sample_name",
    "sample_quantity_count",
    "expc_cell_capture",
    "method",
    "pcr_cycles",
    "percent_cdna_longer_than_400bp",
    "rna_amplification_pass_fail",
    "amplified_quantity_ng",
    "load_name",
    "library_prep",
    "library_input_ng",
    "r1_index",
    "avg_size_bp",
    "quantification_fmol",
    "library_prep_pass_fail",
    "exp_component_vendor_name",
    "batch_vendor_name",
    "experiment_component_failed",
    "alignment",
    "Genome",
    "ar_id",
    "bc",
    "Estimated_number_of_cells",
    "number_of_reads",
    "sequencing_saturation",
    "GEX_Mean_raw_reads_per_cell",
    "GEX_Q30_bases_in_barcode",
    "GEX_Q30_bases_in_read_2",
    "GEX_Q30_bases_in_UMI",
    "GEX_Percent_duplicates",
    "GEX_Q30_bases_in_sample_index_i1",
    "GEX_Q30_bases_in_sample_index_i2",
    "GEX_Reads_with_TSO",
    "GEX_Sequenced_read_pairs",
    "GEX_Valid_UMIs",
    "GEX_Valid_barcodes",
    "GEX_Reads_mapped_to_genome",
    "GEX_Reads_mapped_confidently_to_genome",
    "GEX_Reads_mapped_confidently_to_intergenic_regions",
    "GEX_Reads_mapped_confidently_to_intronic_regions",
    "GEX_Reads_mapped_confidently_to_exonic_regions",
    "GEX_Reads_mapped_confidently_to_transcriptome",
    "GEX_Reads_mapped_antisense_to_gene",
    "GEX_Fraction_of_transcriptomic_reads_in_cells",
    "GEX_Total_genes_detected",
    "GEX_Median_UMI_counts_per_cell",
    "GEX_Median_genes_per_cell",
    "Feature_linkages_detected",
    "Linked_genes",
    "Linked_peaks",
    "ATAC_Confidently_mapped_read_pairs",
    "ATAC_Fraction_of_genome_in_peaks",
    "ATAC_Fraction_of_high.quality_fragments_in_cells",
    "ATAC_Fraction_of_high.quality_fragments_overlapping_TSS",
    "ATAC_Fraction_of_high.quality_fragments_overlapping_peaks",
    "ATAC_Fraction_of_transposition_events_in_peaks_in_cells",
    "ATAC_Mean_raw_read_pairs_per_cell",
    "ATAC_Median_high.quality_fragments_per_cell",
    "ATAC_Non.nuclear_read_pairs",
    "ATAC_Number_of_peaks",
    "ATAC_Percent_duplicates",
    "ATAC_Q30_bases_in_barcode",
    "ATAC_Q30_bases_in_read_1",
    "ATAC_Q30_bases_in_read_2",
    "ATAC_Q30_bases_in_sample_index_i1",
    "ATAC_Sequenced_read_pairs",
    "ATAC_TSS_enrichment_score",
    "ATAC_Unmapped_read_pairs",
    "ATAC_Valid_barcodes",
    "mapped_reads",
    "unmapped_reads",
    "nonconf_mapped_reads",
    "total.reads",
    "nCount_RNA",
    "nFeature_RNA",
    "doublet_score",
    "fraction_mito",
    "for_analysis",
    "class_conf_scANVI",
    "class_scANVI",
    "subclass_conf_scANVI",
    "subclass_scANVI",
    "supertype_conf_scANVI",
    "supertype_scANVI",
    "supertype_scANVI_leiden",
]

# Raw obs column name -> display name used in the exported object.
_DISPLAY_NAMES = {
    "reference_cell": "Neurotypical reference",
    "donor_name": "Donor ID",
    "organism": "Organism",
    "roi": "Brain Region",
    "sex": "Sex",
    "gender": "Gender",
    "age_at_death": "Age at Death",
    "ch_race___1": "Race (choice=White)",
    "ch_race___2": "Race (choice=Black/ African American)",
    "ch_race___3": "Race (choice=Asian)",
    "ch_race___4": "Race (choice=American Indian/ Alaska Native)",
    "ch_race___5": "Race (choice=Native Hawaiian or Pacific Islander)",
    "ch_race___6": "Race (choice=Unknown or unreported)",
    "ch_race___97": "Race (choice=Other)",
    "ch_raceother": "specify other race",
    "ch_hispanicorlatino": "Hispanic/Latino",
    "ch_education": "Highest level of education",
    "ch_education_years": "Years of education",
    "pmi_date": "PMI",
    "ap_freshbrainweight": "Fresh Brain Weight",
    "ap_brainph": "Brain pH",
    "adneurochange": "Overall AD neuropathological Change",
    "thal": "Thal",
    "braak": "Braak",
    "cscore": "CERAD score",
    "caascore": "Overall CAA Score",
    "lewybodydisease": "Highest Lewy Body Disease",
    "micro_totalmicroinfarcts": "Total Microinfarcts (not observed grossly)",
    "micro_microvascularbrain": "Total microinfarcts in screening sections",
    "ge_atherosclerosis_id": "Atherosclerosis",
    "micro_arteriolosclerosis_id": "Arteriolosclerosis",
    "late_stage": "LATE",
    "ch_cognitivestatus_binary": "Cognitive Status",
    "ch_lastcasiscore": "Last CASI Score",
    "ch_casi_interval": "Interval from last CASI in months",
    "ch_lastmmsescore": "Last MMSE Score",
    "ch_mmse_interval": "Interval from last MMSE in months",
    "ch_lastmocascore_2": "Last MOCA Score",
    "ch_moca_interval": "Interval from last MOCA in months",
    "ch_apoe_four": "APOE4 Status",
    "primary_studyname": "Primary Study Name",
    "secondary_studyname": "Secondary Study Name",
    "Estimated_number_of_cells": "GEX_Estimated_number_of_cells",
    "number_of_reads": "GEX_number_of_reads",
    "sequencing_saturation": "GEX_sequencing_saturation",
    "Feature_linkages_detected": "Multiome_Feature_linkages_detected",
    "Linked_genes": "Multiome_Linked_genes",
    "Linked_peaks": "Multiome_Linked_peaks",
    "ATAC_Fraction_of_high.quality_fragments_in_cells": "ATAC_Fraction_of_high_quality_fragments_in_cells",
    "ATAC_Fraction_of_high.quality_fragments_overlapping_TSS": "ATAC_Fraction_of_high_quality_fragments_overlapping_TSS",
    "ATAC_Fraction_of_high.quality_fragments_overlapping_peaks": "ATAC_Fraction_of_high_quality_fragments_overlapping_peaks",
    "ATAC_Median_high.quality_fragments_per_cell": "ATAC_Median_high_quality_fragments_per_cell",
    "ATAC_Non.nuclear_read_pairs": "ATAC_Non-nuclear_read_pairs",
    "mapped_reads": "Number of mapped reads",
    "unmapped_reads": "Number of unmapped reads",
    "nonconf_mapped_reads": "Number of multimapped reads",
    "total.reads": "Number of reads",
    "nCount_RNA": "Number of UMIs",
    "nFeature_RNA": "Genes detected",
    "doublet_score": "Doublet score",
    "fraction_mito": "Fraction mitochondrial UMIs",
    "for_analysis": "Used in analysis",
    "class_conf_scANVI": "Class confidence",
    "class_scANVI": "Class",
    "subclass_conf_scANVI": "Subclass confidence",
    "subclass_scANVI": "Subclass",
    "supertype_conf_scANVI": "Supertype confidence",
    "supertype_scANVI": "Supertype (non-expanded)",
    "supertype_scANVI_leiden": "Supertype",
}


def _discard_keys(mapping, keys):
    """Delete each of *keys* from *mapping*, ignoring the ones that are absent."""
    for key in keys:
        try:
            del mapping[key]
        except KeyError:
            pass


def _strip_opaque_alpha(color):
    """Turn a ``#RRGGBBff`` hex colour into ``#RRGGBB``; return anything else unchanged."""
    # Anchoring on the full 8-digit form keeps 6-digit colours that merely end
    # in "ff" (e.g. "#0000ff") intact.
    return re.sub(r"^(#[0-9a-fA-F]{6})[fF]{2}$", r"\1", color)


def _label_color_map(dend_order, label_column, color_column):
    """Build a ``{label: color}`` dict from two columns of the cluster table."""
    pairs = dend_order.loc[:, [label_column, color_column]].drop_duplicates()
    return dict(zip(pairs[label_column], pairs[color_column]))


def prepare_anndata(adata, region, modality, tag, metadata_to_color, annotation_color=True):
    """Clean up an AnnData object for public release and write it to disk.

    The object is modified in place: schema entries are added to ``adata.uns``,
    scVI bookkeeping entries are removed, donor metadata in ``adata.obs`` is
    recoded, ``adata.obs`` is reduced to the display columns and renamed, UMAPs
    are drawn with fixed palettes, and the result is written as a
    gzip-compressed ``.h5ad`` file under ``<pwd>/output/<region>/``.

    Args:
        adata: AnnData object to export. It must carry the raw obs columns
            referenced below (e.g. ``exp_component_name``, ``reference_cell``).
        region: Brain region label; used in the title and in the input/output paths.
        modality: Modality label; used in the input directory and output file name.
        tag: Export flavour. When equal to ``"final"`` the class, subclass and
            supertype categories are reordered to follow the cluster order file.
        metadata_to_color: Display names of obs columns that get a UMAP and a
            viridis-based palette (``"Sex"`` uses a fixed pink/dodgerblue palette).
        annotation_color: When truthy, also draw Subclass and Supertype UMAPs
            using the colours from the cluster order file.

    Returns:
        None.
    """
    # `datetime` is the class imported via `from datetime import datetime`.
    print(str(datetime.now()) + " -- Starting export")

    # Add uns elements required by the schema
    adata.uns["title"] = region + ": Seattle Alzheimer's Disease Atlas (SEA-AD)"
    adata.uns["X_normalization"] = "ln(UP10K+1)"
    adata.uns["batch_condition"] = ["Specimen ID"]
    adata.uns["default_embedding"] = "X_umap"

    # Remove unwanted uns/obsm values. Each key is handled independently so a
    # missing one does not prevent the others from being removed.
    _discard_keys(adata.uns, ["_scvi", "subclass_scANVI_colors", "supertype_scANVI_leiden_colors"])
    _discard_keys(adata.obsm, ["_scvi_extra_categoricals", "_scvi_extra_continuous"])

    # add obs elements required by the schema
    adata.obs_names = adata.obs["exp_component_name"].copy()

    # Metadata tweaks for public release
    adata.obs["ch_hispanicorlatino"] = adata.obs["ch_hispanicorlatino"].cat.rename_categories(
        {"": "Unknown"},
    )
    adata.obs["late_stage"] = adata.obs["late_stage"].cat.rename_categories(
        {"Staging Precluded by FTLD with TDP43 or ALS/MND or TDP-43 pathology is unclassifiable": "Unclassifiable"},
    )

    adata.obs["alignment"] = adata.obs["alignment"].astype("object")
    adata.obs.loc[adata.obs["alignment"] == "Cell Ranger 3.0", "alignment"] = "Cell Ranger 6.0"
    adata.obs["alignment"] = adata.obs["alignment"].astype("category")

    # Ages above 89 are reported as "90+". The mask is computed while the
    # column is still numeric, and the column is cast to object before a
    # string is written into it.
    is_90_plus = adata.obs["age_at_death"] > 89
    adata.obs["age_at_death"] = adata.obs["age_at_death"].astype("object")
    adata.obs.loc[is_90_plus, "age_at_death"] = "90+"
    # Only a trailing ".0" is dropped, so values such as 80.05 are not mangled.
    adata.obs["age_at_death"] = [re.sub(r"\.0$", "", str(age)) for age in adata.obs["age_at_death"]]

    adata.obs["ch_lastmocascore_2"] = adata.obs["ch_lastmocascore_2"].str.replace("/30", "")

    # APOE4 carrier flag: "Y"/"N" from the genotype string, "Reference" where
    # the genotype is missing.
    adata.obs["ch_apoe_four"] = adata.obs["ch_apoestatuses"].str.contains("4")
    adata.obs["ch_apoe_four"] = adata.obs["ch_apoe_four"].fillna("Reference")
    adata.obs["ch_apoe_four"] = adata.obs["ch_apoe_four"].astype("category")
    adata.obs["ch_apoe_four"] = adata.obs["ch_apoe_four"].cat.rename_categories({True: "Y", False: "N"})

    for column in _CATEGORICAL_COLUMNS_TO_FIX:
        values = adata.obs[column].astype("category")
        # add_categories raises if the category is already present.
        if "Reference" not in values.cat.categories:
            values = values.cat.add_categories("Reference")
        adata.obs[column] = values.fillna("Reference")

    is_reference = adata.obs["reference_cell"] == 1
    for column in _NUMERIC_COLUMNS_TO_FIX:
        adata.obs[column] = adata.obs[column].astype("object")
        adata.obs.loc[is_reference, column] = "Reference"
        adata.obs[column] = [str(value) for value in adata.obs[column]]

    adata.obs["reference_cell"] = adata.obs["reference_cell"].astype("category")
    adata.obs["reference_cell"] = adata.obs["reference_cell"].cat.rename_categories(
        {
            0: "False",
            1: "True",
        },
    )

    dend_order = pd.read_csv(os.path.join(pwd, "input", region + "_" + modality, "cluster_order_and_colors.csv"))

    if tag == "final":
        # Order the annotation categories as they appear in the cluster table,
        # keeping only the labels actually present in this object.
        for obs_column, label_column in (
            ("class_scANVI", "class_label"),
            ("subclass_scANVI", "subclass_label"),
            ("supertype_scANVI_leiden", "cluster_label"),
        ):
            present = adata.obs[obs_column].cat.categories
            label_order = [label for label in dend_order[label_column].drop_duplicates() if label in present]
            adata.obs[obs_column] = adata.obs[obs_column].cat.reorder_categories(label_order)

    # fraction=1 keeps every cell; presumably used to randomise the cell order
    # before plotting -- TODO confirm.
    sc.pp.subsample(adata, fraction=1)

    for column, category_order in _CATEGORY_ORDER.items():
        adata.obs[column] = adata.obs[column].cat.reorder_categories(category_order)

    # Keep only display metadata
    adata.obs = adata.obs.loc[:, _DISPLAY_COLUMNS]
    adata.obs = adata.obs.rename(_DISPLAY_NAMES, axis=1)

    for column in metadata_to_color:
        adata.obs[column] = adata.obs[column].astype("category")
        categories = adata.obs[column].cat.categories.to_list()
        # One more colour than categories is sampled and the first is skipped.
        cmap = plt.get_cmap("viridis", len(categories) + 1)
        palette = [colors.rgb2hex(cmap(k)) for k in range(1, cmap.N)]
        if column == "Sex":
            palette = ["pink", "dodgerblue"]
        category_colors = dict(zip(categories, palette))

        # Drop any stale palette before plotting with the new one.
        _discard_keys(adata.uns, [column + "_colors"])
        sc.pl.umap(
            adata,
            color=column,
            title="",
            palette=category_colors,
            size=10,
            legend_fontsize=12,
        )

    if annotation_color:
        for obs_column, label_column, color_column in (
            ("Subclass", "subclass_label", "subclass_color"),
            ("Supertype", "cluster_label", "cluster_color"),
        ):
            sc.pl.umap(
                adata,
                color=obs_column,
                title="",
                palette=_label_color_map(dend_order, label_column, color_column),
                size=10,
                legend_fontsize=12,
            )

    # Normalise stored palettes: drop a fully-opaque alpha channel from hex colours.
    for key in list(adata.uns):
        if key.endswith("_colors"):
            adata.uns[key] = [_strip_opaque_alpha(color) for color in adata.uns[key]]

    adata.obs["Supertype"] = adata.obs["Supertype"].cat.rename_categories(
        {
            "VLMC_2": "Pericyte_1",
            "VLMC_2_1-SEAAD": "SMC-SEAAD",
            "VLMC_2_2-SEAAD": "Pericyte_2-SEAAD",
            "Micro-PVM_2_2-SEAAD": "Lymphocyte",
            "Micro-PVM_1_1-SEAAD": "Monocyte"
        },
    )

    # Create the output directory if needed so the write does not fail on a
    # fresh checkout.
    output_dir = os.path.join(pwd, "output", region)
    os.makedirs(output_dir, exist_ok=True)
    output_name = "SEAAD_" + region + "_" + modality + "_" + tag + "-nuclei." + str(datetime.now().date()) + ".h5ad"
    adata.write(os.path.join(output_dir, output_name), compression="gzip")
    

### Prepare and export objects for AWS

In [None]:
# Dataset selection for the annotated ("final") object.
region = "DLPFC"
modality = "RNAseq"
date = "2023-07-12"  # date stamp embedded in the input file name

# Reads <pwd>/input/<region>/final.<date>.h5ad
adata = sc.read_h5ad(
    os.path.join(pwd, "input", region, f"final.{date}.h5ad")
)

In [None]:
# obs columns, given by their display names (i.e. the names they carry after
# prepare_anndata has renamed them), that get a UMAP and a colour palette.
metadata_to_color = [
    "Sex",
    "Cognitive Status",
    "Overall AD neuropathological Change",
    "Braak",
    "Thal",
    "CERAD score",
    "Highest Lewy Body Disease",
    "LATE",
    "APOE4 Status",
]

# Export the object loaded above with tag "final"; annotation_color is left at
# its default (True), so the Subclass and Supertype UMAPs are drawn as well.
prepare_anndata(
    adata=adata,
    region=region,
    modality=modality,
    tag="final",
    metadata_to_color=metadata_to_color
)

In [None]:
# Dataset selection for the "raw" object.
region = "DLPFC"
modality = "RNAseq"
date = "2023-07-12"  # date stamp embedded in the input file name

# Reads <pwd>/input/<region>/raw.<date>.h5ad
adata = sc.read_h5ad(
    os.path.join(pwd, "input", region, f"raw.{date}.h5ad")
)

In [None]:
# Export the object loaded above with tag "all": no metadata palettes are
# requested (empty list) and the Subclass/Supertype UMAPs are skipped
# (annotation_color=False). Because the tag is not "final", the class,
# subclass and supertype categories are not reordered.
prepare_anndata(
    adata=adata,
    region=region,
    modality=modality,
    tag="all",
    metadata_to_color=[],
    annotation_color=False
)