In [None]:
import os
import numpy as np
import re
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline
import scanpy as sc
import warnings
from datetime import datetime

# Working directory of the notebook; all input/output paths are built from it.
pwd = os.getcwd()

# Silence library warnings and turn off matplotlib interactive mode.
warnings.filterwarnings("ignore")
plt.ioff()

# Number of parallel jobs scanpy may use.
sc.settings.n_jobs = 32

### Define AnnData preparation function

In [None]:
def prepare_anndata(adata, region, modality, tag, metadata_to_color, annotation_color=True):
    """Format an AnnData object for public release and write it to an .h5ad file.

    The function works on a copy of ``adata``, so the caller's object is left
    untouched and the function can be called repeatedly on the same input
    (e.g. once with ``tag="all"`` and once with ``tag="final"``).

    Steps performed, in order:
      1. Set the ``uns`` entries ``title``, ``X_normalization``,
         ``batch_condition`` and ``default_embedding``; drop a fixed set of
         ``uns`` / ``obsm`` entries when present.
      2. Adjust donor metadata in ``obs`` (sex, age binning to "90+", APOE4
         flag, "Reference" fill-in for reference cells, ...).
      3. Rename class / subclass / supertype categories and, when
         ``tag == "final"``, order them as listed in
         ``input/<region>_<modality>/cluster_order_and_colors.csv``.
      4. Shuffle the cells (``sc.pp.subsample`` with ``fraction=1``).
      5. Keep a fixed list of ``obs`` columns and rename them to display names.
      6. Plot UMAPs for ``metadata_to_color`` (and "Subclass" when
         ``annotation_color`` is truthy), which populates ``uns["*_colors"]``.
      7. Merge ``input/<region>/donor_name_CPS.csv`` onto ``obs`` by "Donor ID".
      8. Write ``output/<region>/SEAAD_<region>_<modality>_<tag>-nuclei.<date>.h5ad``.

    Parameters
    ----------
    adata
        Object to export; must provide the ``obs`` columns referenced below.
    region : str
        Used in the title, the input directories and the output file name.
    modality : str
        Used in the input directory name and the output file name.
    tag : str
        Appended to the output file name; ``"final"`` additionally triggers
        the category re-ordering described in step 3.
    metadata_to_color : list of str
        Display names (post-rename) of ``obs`` columns to colour with a
        viridis-derived palette ("Sex" gets pink / dodgerblue instead).
    annotation_color : bool, default True
        Whether to plot "Subclass" using the colours from the cluster CSV.

    Returns
    -------
    None
        The result is written to disk.
    """
    # Work on a copy so deletions/renames below do not leak into the caller's
    # object; without this a second call on the same object hits KeyError.
    adata = adata.copy()

    # Add uns elements required by the schema
    adata.uns["title"] = region + ": Seattle Alzheimer's Disease Atlas (SEA-AD)"
    adata.uns["X_normalization"] = "None"
    adata.uns["batch_condition"] = ["Specimen ID"]
    adata.uns["default_embedding"] = "X_umap"

    # Drop internal entries that should not be released; tolerate absent keys.
    uns_to_drop = [
        "_scvi",
        "Doublet_or_LowQuality_colors",
        "donor_name_colors",
        "leiden_1.0_colors",
        "modality_colors",
        "sex_colors",
        "subclass_scANVI_colors",
    ]
    for key in uns_to_drop:
        if key in adata.uns:
            del adata.uns[key]
    if "_scvi_extra_categoricals" in adata.obsm:
        del adata.obsm["_scvi_extra_categoricals"]

    # Metadata tweaks for public release
    adata.obs["sex"] = adata.obs["gender"].copy()
    adata.obs.loc[adata.obs["uwa"] == "UWA 7186", "sex"] = "Male"
    adata.obs.loc[adata.obs["donor_name"] == "H20.33.038", "ap_freshbrainweight"] = "Unavailable"
    # Categorical methods are re-assigned rather than called with inplace=True
    # (that argument was removed from the .cat accessor in pandas 2.0).
    adata.obs["ch_hispanicorlatino"] = adata.obs["ch_hispanicorlatino"].cat.rename_categories(
        {"": "Unknown"}
    )
    adata.obs["late_stage"] = adata.obs["late_stage"].cat.rename_categories(
        {"Staging Precluded by FTLD with TDP43 or ALS/MND or TDP-43 pathology is unclassifiable": "Unclassifiable"}
    )
    adata.obs.loc[adata.obs["donor_name"] == "H20.33.026", "primary_studyname"] = "ADRC Clinical Core"

    # Ages above 89 are binned to "90+"; the column then becomes strings.
    adata.obs.loc[adata.obs["age_at_death"] > 89, "age_at_death"] = "90+"
    adata.obs["age_at_death"] = [str(age).replace(".0", "") for age in adata.obs["age_at_death"]]

    adata.obs["ch_lastmocascore_2"] = adata.obs["ch_lastmocascore_2"].str.replace("/30", "")

    # APOE4 flag: Y/N when a status is available, "Reference" when it is missing.
    apoe_four = adata.obs["ch_apoestatuses"].str.contains("4")
    apoe_four = apoe_four.fillna("Reference").astype("category")
    adata.obs["ch_apoe_four"] = apoe_four.cat.rename_categories({True: "Y", False: "N"})

    categories_to_fix = [
        "ch_race___1",
        "ch_race___2",
        "ch_race___3",
        "ch_race___4",
        "ch_race___5",
        "ch_race___6",
        "ch_race___97",
        "ch_raceother",
        "ch_hispanicorlatino",
        "ch_education",
        "adneurochange",
        "thal",
        "braak",
        "cscore",
        "caascore",
        "lewybodydisease",
        "ge_atherosclerosis_id",
        "micro_arteriolosclerosis_id",
        "late_stage",
        "ch_cognitivestatus_binary",
        "primary_studyname",
        "secondary_studyname"
    ]

    # Missing values in these categorical columns are labelled "Reference".
    for col in categories_to_fix:
        values = adata.obs[col].astype("category")
        if "Reference" not in values.cat.categories:
            values = values.cat.add_categories("Reference")
        adata.obs[col] = values.fillna("Reference")

    numerics_to_fix = [
        "ch_education_years",
        "pmi_date",
        "ap_freshbrainweight",
        "ap_brainph",
        "micro_totalmicroinfarcts",
        "micro_microvascularbrain",
        "ch_lastcasiscore",
        "ch_casi_interval",
        "ch_lastmmsescore",
        "ch_mmse_interval",
        "ch_lastmocascore_2",
        "ch_moca_interval",
    ]

    # Numeric columns become strings, with "Reference" for reference cells.
    is_reference = adata.obs["reference_cell"] == 1
    for col in numerics_to_fix:
        adata.obs[col] = adata.obs[col].astype("object")
        adata.obs.loc[is_reference, col] = "Reference"
        adata.obs[col] = [str(value) for value in adata.obs[col]]

    adata.obs["reference_cell"] = adata.obs["reference_cell"].astype("category")
    adata.obs["reference_cell"] = adata.obs["reference_cell"].cat.rename_categories(
        {
            0: "False",
            1: "True",
        }
    )

    dend_order = pd.read_csv(os.path.join(pwd, "input", region + "_" + modality, "cluster_order_and_colors.csv"))

    adata.obs["class_scANVI"] = adata.obs["class_scANVI"].cat.rename_categories(
        {
            "exc": "Neuronal: Glutamatergic",
            "inh": "Neuronal: GABAergic",
            "glia": "Non-neuronal and Non-neural"
        }
    )

    adata.obs["subclass_scANVI"] = adata.obs["subclass_scANVI"].cat.rename_categories(
        {
            "Lamp5_Lhx6": "Lamp5 Lhx6",
            "Astro": "Astrocyte",
            "Oligo": "Oligodendrocyte",
            "Endo": "Endothelial",
            "Micro-PVM": "Microglia-PVM"
        }
    )

    # Two passes on purpose: the second mapping renames labels produced by the first.
    adata.obs["supertype_scANVI_leiden"] = adata.obs["supertype_scANVI_leiden"].cat.rename_categories(
        {
            "Astro_Unknown_25": "Astro_6-SEAAD",
            "Oligo_Unknown_15": "Oligo_2_1-SEAAD",
            "OPC_Unknown_25": "OPC_2_2-SEAAD",
            "OPC_Unknown_18": "OPC_2_1-SEAAD",
            "VLMC_Unknown_24": "VLMC_2_1-SEAAD",
            "VLMC_Unknown_32": "VLMC_2_2-SEAAD",
            "Micro-PVM_Unknown_0": "Micro-PVM_3-SEAAD",
            "Micro-PVM_Unknown_9": "Micro-PVM_2_3-SEAAD",
            "Micro-PVM_Unknown_10": "Micro-PVM_4-SEAAD",
            "Micro-PVM_Unknown_116": "Micro-PVM_2_1-SEAAD",
            "Micro-PVM_Unknown_135": "Micro-PVM_1_1-SEAAD",
            "Micro-PVM_Unknown_200": "Micro-PVM_2_2-SEAAD",
        }
    )

    adata.obs["supertype_scANVI_leiden"] = adata.obs["supertype_scANVI_leiden"].cat.rename_categories(
        {
            "VLMC_2": "Pericyte_1",
            "VLMC_2_1-SEAAD": "SMC-SEAAD",
            "VLMC_2_2-SEAAD": "Pericyte_2-SEAAD",
            "Micro-PVM_2_2-SEAAD": "Lymphocyte",
            "Micro-PVM_1_1-SEAAD": "Monocyte"
        }
    )

    if tag == "final":
        # Order taxonomy categories as they appear in the cluster CSV,
        # restricted to the labels actually present in each column.
        taxonomy_levels = [
            ("class_scANVI", "class_label"),
            ("subclass_scANVI", "subclass_label"),
            ("supertype_scANVI_leiden", "cluster_label"),
        ]
        for obs_col, dend_col in taxonomy_levels:
            present = adata.obs[obs_col].cat.categories
            ordered = [label for label in dend_order[dend_col].drop_duplicates() if label in present]
            adata.obs[obs_col] = adata.obs[obs_col].cat.reorder_categories(ordered)

    # fraction=1 keeps every cell; the call only shuffles their order.
    sc.pp.subsample(adata, fraction=1)

    category_orders = {
        "ch_cognitivestatus_binary": ["Reference", "No dementia", "Dementia"],
        "adneurochange": ["Reference", "Not AD", "Low", "Intermediate", "High"],
        "braak": ["Reference", "Braak 0", "Braak II", "Braak III", "Braak IV", "Braak V", "Braak VI"],
        "thal": ["Reference", "Thal 0", "Thal 1", "Thal 2", "Thal 3", "Thal 4", "Thal 5"],
        "cscore": ["Reference", 'Absent', 'Sparse', 'Moderate', 'Frequent'],
        "lewybodydisease": ["Reference", 'Not Identified (olfactory bulb not assessed)', 'Not Identified (olfactory bulb assessed)', 'Olfactory bulb only', 'Amygdala-predominant', 'Brainstem-predominant', 'Limbic (Transitional)', 'Neocortical (Diffuse)'],
        "late_stage": ["Reference", 'Unclassifiable', 'Not Identified', 'LATE Stage 1', 'LATE Stage 2', 'LATE Stage 3'],
        "ch_apoe_four": ["Reference", "N", "Y"],
    }
    for col, order in category_orders.items():
        adata.obs[col] = adata.obs[col].cat.reorder_categories(order)

    # Keep only display metadata
    display_columns = [
        "sample_id",
        "reference_cell",
        "donor_name",
        "organism",
        "roi",
        "sex",
        "gender",
        "age_at_death",
        "ch_race___1",
        "ch_race___2",
        "ch_race___3",
        "ch_race___4",
        "ch_race___5",
        "ch_race___6",
        "ch_race___97",
        "ch_raceother",
        "ch_hispanicorlatino",
        "ch_education",
        "ch_education_years",
        "pmi_date",
        "ap_freshbrainweight",
        "ap_brainph",
        "adneurochange",
        "thal",
        "braak",
        "cscore",
        "caascore",
        "lewybodydisease",
        "micro_totalmicroinfarcts",
        "micro_microvascularbrain",
        "ge_atherosclerosis_id",
        "micro_arteriolosclerosis_id",
        "late_stage",
        "ch_cognitivestatus_binary",
        "ch_lastcasiscore",
        "ch_casi_interval",
        "ch_lastmmsescore",
        "ch_mmse_interval",
        "ch_lastmocascore_2",
        "ch_moca_interval",
        "ch_apoe_four",
        "primary_studyname",
        "secondary_studyname",
        "cell_prep_type",
        "facs_population_plan",
        "rna_amplification",
        "sample_name",
        "sample_quantity_count",
        "expc_cell_capture",
        "method",
        "pcr_cycles",
        "percent_cdna_longer_than_400bp",
        "rna_amplification_pass_fail",
        "load_name",
        "library_prep",
        "library_input_ng",
        "r1_index",
        "avg_size_bp",
        "quantification_fmol",
        "library_prep_pass_fail",
        "exp_component_vendor_name",
        "batch_vendor_name",
        "experiment_component_failed",
        "alignment",
        "Genome",
        "ar_id",
        "bc",
        "Estimated_number_of_cells",
        "number_of_reads",
        "GEX_Mean_raw_reads_per_cell",
        "GEX_Q30_bases_in_barcode",
        "GEX_Q30_bases_in_read_2",
        "GEX_Q30_bases_in_UMI",
        "GEX_Percent_duplicates",
        "GEX_Q30_bases_in_sample_index_i1",
        "GEX_Q30_bases_in_sample_index_i2",
        "GEX_Reads_with_TSO",
        "GEX_Sequenced_read_pairs",
        "GEX_Valid_UMIs",
        "GEX_Valid_barcodes",
        "GEX_Reads_mapped_to_genome",
        "GEX_Reads_mapped_confidently_to_genome",
        "GEX_Reads_mapped_confidently_to_intergenic_regions",
        "GEX_Reads_mapped_confidently_to_intronic_regions",
        "GEX_Reads_mapped_confidently_to_exonic_regions",
        "GEX_Reads_mapped_confidently_to_transcriptome",
        "GEX_Reads_mapped_antisense_to_gene",
        "GEX_Fraction_of_transcriptomic_reads_in_cells",
        "GEX_Total_genes_detected",
        "GEX_Median_UMI_counts_per_cell",
        "GEX_Median_genes_per_cell",
        "Feature_linkages_detected",
        "Linked_genes",
        "Linked_peaks",
        "ATAC_Confidently_mapped_read_pairs",
        "ATAC_Fraction_of_genome_in_peaks",
        "ATAC_Fraction_of_high.quality_fragments_in_cells",
        "ATAC_Fraction_of_high.quality_fragments_overlapping_TSS",
        "ATAC_Fraction_of_high.quality_fragments_overlapping_peaks",
        "ATAC_Fraction_of_transposition_events_in_peaks_in_cells",
        "ATAC_Mean_raw_read_pairs_per_cell",
        "ATAC_Median_high.quality_fragments_per_cell",
        "ATAC_Non.nuclear_read_pairs",
        "ATAC_Number_of_peaks",
        "ATAC_Percent_duplicates",
        "ATAC_Q30_bases_in_barcode",
        "ATAC_Q30_bases_in_read_1",
        "ATAC_Q30_bases_in_read_2",
        "ATAC_Q30_bases_in_sample_index_i1",
        "ATAC_Sequenced_read_pairs",
        "ATAC_TSS_enrichment_score",
        "ATAC_Unmapped_read_pairs",
        "ATAC_Valid_barcodes",
        "nCount_RNA",
        "nFeature_RNA",
        "doublet_score",
        "fraction_mito",
        "for_analysis",
        "class_conf_scANVI",
        "class_scANVI",
        "subclass_conf_scANVI",
        "subclass_scANVI",
        "supertype_conf_scANVI",
        "supertype_scANVI",
        "supertype_scANVI_leiden",
        "rna_neighbors_qc_ratio_new",
        "leiden_1.0",
    ]
    adata.obs = adata.obs.loc[:, display_columns]

    # Internal column name -> public display name.
    display_names = {
        "reference_cell": "Neurotypical reference",
        "donor_name": "Donor ID",
        "organism": "Organism",
        "roi": "Brain Region",
        "sex": "Sex",
        "gender": "Gender",
        "age_at_death": "Age at Death",
        "ch_race___1": "Race (choice=White)",
        "ch_race___2": "Race (choice=Black/ African American)",
        "ch_race___3": "Race (choice=Asian)",
        "ch_race___4": "Race (choice=American Indian/ Alaska Native)",
        "ch_race___5": "Race (choice=Native Hawaiian or Pacific Islander)",
        "ch_race___6": "Race (choice=Unknown or unreported)",
        "ch_race___97": "Race (choice=Other)",
        "ch_raceother": "specify other race",
        "ch_hispanicorlatino": "Hispanic/Latino",
        "ch_education": "Highest level of education",
        "ch_education_years": "Years of education",
        "pmi_date": "PMI",
        "ap_freshbrainweight": "Fresh Brain Weight",
        "ap_brainph": "Brain pH",
        "adneurochange": "Overall AD neuropathological Change",
        "thal": "Thal",
        "braak": "Braak",
        "cscore": "CERAD score",
        "caascore": "Overall CAA Score",
        "lewybodydisease": "Highest Lewy Body Disease",
        "micro_totalmicroinfarcts": "Total Microinfarcts (not observed grossly)",
        "micro_microvascularbrain": "Total microinfarcts in screening sections",
        "ge_atherosclerosis_id": "Atherosclerosis",
        "micro_arteriolosclerosis_id": "Arteriolosclerosis",
        "late_stage": "LATE",
        "ch_cognitivestatus_binary": "Cognitive Status",
        "ch_lastcasiscore": "Last CASI Score",
        "ch_casi_interval": "Interval from last CASI in months",
        "ch_lastmmsescore": "Last MMSE Score",
        "ch_mmse_interval": "Interval from last MMSE in months",
        "ch_lastmocascore_2": "Last MOCA Score",
        "ch_moca_interval": "Interval from last MOCA in months",
        "ch_apoe_four": "APOE4 Status",
        "primary_studyname": "Primary Study Name",
        "secondary_studyname": "Secondary Study Name",
        "Estimated_number_of_cells": "GEX_Estimated_number_of_cells",
        "number_of_reads": "GEX_number_of_reads",
        "Feature_linkages_detected": "Multiome_Feature_linkages_detected",
        "Linked_genes": "Multiome_Linked_genes",
        "Linked_peaks": "Multiome_Linked_peaks",
        "ATAC_Fraction_of_high.quality_fragments_in_cells": "ATAC_Fraction_of_high_quality_fragments_in_cells",
        "ATAC_Fraction_of_high.quality_fragments_overlapping_TSS": "ATAC_Fraction_of_high_quality_fragments_overlapping_TSS",
        "ATAC_Fraction_of_high.quality_fragments_overlapping_peaks": "ATAC_Fraction_of_high_quality_fragments_overlapping_peaks",
        "ATAC_Median_high.quality_fragments_per_cell": "ATAC_Median_high_quality_fragments_per_cell",
        "ATAC_Non.nuclear_read_pairs": "ATAC_Non-nuclear_read_pairs",
        "nCount_RNA": "Number of UMIs",
        "nFeature_RNA": "Genes detected",
        "doublet_score": "Doublet score",
        "fraction_mito": "Fraction mitochondrial UMIs",
        "for_analysis": "Used in analysis",
        "class_conf_scANVI": "Class confidence",
        "class_scANVI": "Class",
        "subclass_conf_scANVI": "Subclass confidence",
        "subclass_scANVI": "Subclass",
        "supertype_conf_scANVI": "Supertype confidence",
        "supertype_scANVI": "Supertype (non-expanded)",
        "supertype_scANVI_leiden": "Supertype",
        "rna_neighbors_qc_ratio_new": "RNA Quality Control Score",
        "leiden_1.0": "Quality Control Clusters",
    }
    adata.obs = adata.obs.rename(columns=display_names)

    # Build one palette per requested column and plot it; the plot call
    # receives the palette and repopulates adata.uns["<column>_colors"].
    metadata_colors = {}
    for col in metadata_to_color:
        adata.obs[col] = adata.obs[col].astype("category")
        levels = adata.obs[col].cat.categories.to_list()
        cmap = plt.get_cmap('viridis', len(levels)+1)
        shades = [colors.rgb2hex(cmap(j)) for j in range(cmap.N)]
        shades.pop(0)  # skip the first (darkest) viridis shade
        if col == "Sex":
            shades = ["pink", "dodgerblue"]
        metadata_colors[col] = dict(zip(levels, shades))
        # Remove any stale palette so the one above is used.
        if col + "_colors" in adata.uns:
            del adata.uns[col + "_colors"]
        sc.pl.umap(adata,
            color=col,
            title="",
            palette=metadata_colors[col],
            size=10,
            legend_fontsize=12
        )

    if annotation_color:
        subclass_colors = dend_order.loc[:, ["subclass_label", "subclass_color"]].drop_duplicates()
        subclass_colors.index = subclass_colors["subclass_label"].copy()
        subclass_colors = subclass_colors.drop("subclass_label", axis=1)
        subclass_palette = subclass_colors.to_dict()["subclass_color"]
        sc.pl.umap(adata,
            color="Subclass",
            title="",
            palette=subclass_palette,
            size=10,
            legend_fontsize=12
        )

    # Strip a trailing "ff" alpha channel from #RRGGBBAA colours only; plain
    # "#RRGGBB" values that happen to end in "ff" are left intact.
    for key in adata.uns.keys():
        if key.endswith("_colors"):
            adata.uns[key] = [re.sub("^(#[0-9A-Fa-f]{6})ff$", "\\1", shade) for shade in adata.uns[key]]

    CPS = pd.read_csv(os.path.join(pwd, "input", region, "donor_name_CPS.csv"), index_col=0)
    adata.obs = adata.obs.merge(CPS, left_on="Donor ID", right_index=True, how="left")

    # `datetime` here is the class imported via `from datetime import datetime`.
    export_date = str(datetime.now().date())
    output_name = "SEAAD_" + region + "_" + modality + "_" + tag + "-nuclei." + export_date + ".h5ad"
    adata.write(os.path.join(pwd, "output", region, output_name), compression="gzip")



### Read in snATACseq data and match format with snRNAseq data

In [None]:
region = "MTG"
modality = "ATACseq"  # was misspelled `modalit`, leaving `modality` undefined on a fresh kernel
adata = sc.read_h5ad(os.path.join(pwd, "input", region + "_" + modality, "multivi_AD_3cohorts_RNA_ATAC_Multiome_v2.0_annotation_updated.h5ad"))

# Drop the scVI embedding and the UMI layer from the loaded object
del adata.obsm["X_scVI"]
del adata.layers["UMIs"]
# Remove cells whose method is "10Xv3.1" and keep only features whose
# var["modality"] is "Peaks"
adata = adata[adata.obs["method"] != "10Xv3.1"].copy()
# .copy() must be called; the bare attribute would bind the method to `adata`
adata = adata[:, adata.var["modality"] == "Peaks"].copy()

def _merge_and_coalesce(obs, extra, **merge_keys):
    """Left-merge ``extra`` onto ``obs`` and fill gaps in the shared columns.

    Columns present in both frames come out of the merge as ``<name>_x`` and
    ``<name>_y``. For every such pair the ``_x`` (left) values are kept and
    missing entries are filled from ``_y``. Only the columns ``obs`` had
    originally are returned, in their original order.
    """
    merged = obs.merge(extra, how="left", **merge_keys)
    for left_col in merged.columns[merged.columns.str.endswith("_x")]:
        # Strip only the trailing suffix; str.replace would also rewrite any
        # "_x" occurring inside the column name.
        base = left_col[:-len("_x")]
        right_col = base + "_y"
        if right_col not in merged.columns:
            continue  # column merely ends in "_x"; it is not a merge suffix
        # Cast to object so values outside a categorical's categories can be
        # filled in, and assign the result instead of a chained in-place
        # fillna (which is a no-op under pandas copy-on-write).
        merged[base] = merged[left_col].astype("object").fillna(merged[right_col])
        merged = merged.drop([left_col, right_col], axis=1)
    return merged.loc[:, obs.columns]


# Read in metadata variables that are also present in snRNAseq
input_dir = os.path.join(pwd, "input", region + "_" + modality)
reference_libraries = ['L8AT_210427_01_A05', 'L8AT_210427_01_B05']

ATAC_metadata = pd.read_csv(os.path.join(input_dir, "ATAC_AD_Center_Grant_update.csv"))
ATAC_ref_metadata = pd.read_csv(os.path.join(input_dir, "ATAC_Hu_Ref_Brain_ATX-576/ATAC_LIMS_Hu_Ref_Brain_update.csv"))
# Keep only the two reference libraries, restricted to the columns of the main table
ATAC_ref_metadata = ATAC_ref_metadata.loc[ATAC_ref_metadata["library_prep"].isin(reference_libraries), ATAC_metadata.columns]
ATAC_metadata = pd.concat([ATAC_metadata, ATAC_ref_metadata], axis=0)
to_drop_singleome = [
    "Estimated_bulk_library_complexity",
    "Fragments_flanking_a_single_nucleosome",
    "Fragments_in_nucleosome-free_regions",
    "Pipeline_version",
    "Sample_ID",
    "lib_method",
    "percent_dinucleosome_analyzer",
    "percent_mononucleosome_analyzer",
    "percent_multinucleated_fragments_analyzer",
    "percent_nucleosome_free_analyzer",
    "Sequencing_saturation"
]
to_rename_singleome = {
    "Confidently_mapped_read_pairs": "ATAC_Confidently_mapped_read_pairs",
    "Fraction_of_genome_in_peaks": "ATAC_Fraction_of_genome_in_peaks",
    "Fraction_of_high-quality_fragments_in_cells": "ATAC_Fraction_of_high.quality_fragments_in_cells",
    "Fraction_of_high-quality_fragments_overlapping_TSS": "ATAC_Fraction_of_high.quality_fragments_overlapping_TSS",
    "Fraction_of_high-quality_fragments_overlapping_peaks": "ATAC_Fraction_of_high.quality_fragments_overlapping_peaks",
    "Fraction_of_transposition_events_in_peaks_in_cells": "ATAC_Fraction_of_transposition_events_in_peaks_in_cells",
    "Mean_raw_read_pairs_per_cell": "ATAC_Mean_raw_read_pairs_per_cell",
    "Median_high-quality_fragments_per_cell": "ATAC_Median_high.quality_fragments_per_cell",
    "Non-nuclear_read_pairs": "ATAC_Non.nuclear_read_pairs",
    "Number_of_peaks": "ATAC_Number_of_peaks",
    "Percent_duplicates": "ATAC_Percent_duplicates",
    "Q30_bases_in_barcode": "ATAC_Q30_bases_in_barcode",
    "Q30_bases_in_read_1": "ATAC_Q30_bases_in_read_1",
    "Q30_bases_in_read_2": "ATAC_Q30_bases_in_read_2",
    "Q30_bases_in_sample_index_i1": "ATAC_Q30_bases_in_sample_index_i1",
    "Sequenced_read_pairs": "ATAC_Sequenced_read_pairs",
    "TSS_enrichment_score": "ATAC_TSS_enrichment_score",
    "Unmapped_read_pairs": "ATAC_Unmapped_read_pairs",
    "Valid_barcodes": "ATAC_Valid_barcodes",
}

ATAC_metadata = ATAC_metadata.drop(to_drop_singleome, axis=1)
ATAC_metadata = ATAC_metadata.rename(columns=to_rename_singleome)

ATAC_metadata["alignment"] = "cellranger-arc-2.0.0"

# Fill library-level metadata that is missing from obs
adata.obs = _merge_and_coalesce(adata.obs, ATAC_metadata, left_on="library_prep", right_on="library_prep")

# Fill donor-level metadata that is missing from obs
clinical_metadata = pd.read_spss(os.path.join(input_dir, "NeuropathMetadata_version103_20220415.KT.sav"))
adata.obs = _merge_and_coalesce(adata.obs, clinical_metadata, left_on="external_donor_name", right_on="uwa")

# The merges reset the row index; restore it from the sample identifiers
adata.obs.index = adata.obs["sample_id"].copy()
adata.obs.index.name = "index"

# Flag cells to keep for analysis: QC ratio at most 0.2, not in a low-quality
# cluster identified in the snATACseq notebooks, and with a transferred label
bad_clusters = ['6', '10', '21', '22', '25', '29', '32', '33', '36', '39', '47', '37', '42', '43', '45', '46']
for_analysis = (
    (adata.obs["rna_neighbors_qc_ratio_new"] <= 0.2)
    & ~adata.obs["leiden_1.0"].isin(bad_clusters)
    & adata.obs["label_transfer"].notna()
)
# Stored as the strings "True" / "False"
for_analysis = np.array([str(flag) for flag in for_analysis])
adata.obs["for_analysis"] = for_analysis.copy()

# For cells with method "10xATAC_v1.1", copy the label-transfer columns into
# the scANVI columns
is_atac_only = adata.obs["method"] == "10xATAC_v1.1"
inhibitory_subclasses = ["Lamp5_Lhx6", "Lamp5", "Sncg", "Pax6", "Vip", "Sst Chodl", "Sst", "Pvalb", "Chandelier"]
glial_subclasses = ["Astro", "Oligo", "OPC", "Endo", "VLMC", "Micro-PVM"]

adata.obs.loc[is_atac_only, "subclass_scANVI"] = adata.obs.loc[is_atac_only, "label_transfer"]
adata.obs.loc[is_atac_only, "subclass_conf_scANVI"] = adata.obs.loc[is_atac_only, "subclass_purity_ratio"]
# Class defaults to "exc" and is overridden for the listed inh / glia subclasses
adata.obs.loc[is_atac_only, "class_scANVI"] = "exc"
adata.obs.loc[is_atac_only & adata.obs["subclass_scANVI"].isin(inhibitory_subclasses), "class_scANVI"] = "inh"
adata.obs.loc[is_atac_only & adata.obs["subclass_scANVI"].isin(glial_subclasses), "class_scANVI"] = "glia"
adata.obs.loc[is_atac_only, "class_conf_scANVI"] = adata.obs.loc[is_atac_only, "subclass_purity_ratio"]

# Assign the filled column back; an in-place fillna on a selected column is
# not guaranteed to update the frame (no-op under pandas copy-on-write)
adata.obs["reference_cell"] = adata.obs["reference_cell"].fillna(0)
adata.obs.loc[adata.obs["library_prep"].isin(['L8AT_210427_01_A05', 'L8AT_210427_01_B05']), "reference_cell"] = 1

# Barcode = leading run of A/T/G/C before the first "-" in sample_id
adata.obs["bc"] = [re.sub("^([ATGC]+)-(.*)$", "\\1", sample) for sample in adata.obs["sample_id"]]

adata.var = adata.var.drop("modality", axis=1)

# Row sums of X; flattened because a sparse matrix returns an (n, 1) np.matrix
adata.obs["nCount_RNA"] = np.asarray(adata.X.sum(axis=1)).ravel()

# Reference cells carry their age as "<number> yrs" in the "age" column
adata.obs["age"] = adata.obs["age"].astype("object")
is_reference = adata.obs["reference_cell"] == 1
reference_ages = [np.float32(age.replace(" yrs", "")) for age in adata.obs.loc[is_reference, "age"]]
adata.obs.loc[is_reference, "age_at_death"] = reference_ages

### Prepare and export objects for AWS

In [None]:
# Export every nucleus (tag "all") without generating any UMAP colour palettes.
region, modality = "MTG", "ATACseq"

prepare_anndata(adata, region, modality, "all", [], annotation_color=False)

In [None]:
# Export the "final" object, colouring the donor metadata columns listed below.
region, modality = "MTG", "ATACseq"

metadata_to_color = ["Sex", "Cognitive Status", "Overall AD neuropathological Change"]
metadata_to_color += ["Braak", "Thal", "CERAD score"]
metadata_to_color += ["Highest Lewy Body Disease", "LATE", "APOE4 Status"]

prepare_anndata(adata, region, modality, "final", metadata_to_color)