### Load needed libraries

In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from matplotlib import pyplot as plt
from datetime import datetime
import warnings
from helper_functions import *

# Let scanpy use up to 32 parallel jobs where it supports parallel execution.
sc.settings.n_jobs = 32
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Current working directory; all input/output paths below are built relative to it.
pwd = os.getcwd()

### Define datasets

In [None]:
# Catalog of every external dataset known to this notebook.
# Keys appear to be the publication DOI with "/" replaced by "_"; each entry
# carries the dataset's short name plus "dir", "glob_pattern" and "merge_on"
# settings that are consumed elsewhere (presumably to locate the per-library
# output folders -- TODO confirm against the loader).
ALL_EXTERNAL_DATASETS = {
    "10.1038_s41586-019-1195-2": {
        "name": "Mathys_2019",
        "dir": "",
        "glob_pattern": "D17*",
        "merge_on": "",
    },
    "10.1038_s41467-020-19737-2": {
        "name": "Olah_2020",
        "dir": "",
        "glob_pattern": "Microglia_MO_*",
        "merge_on": "",
    },
    "10.1038_s41591-019-0695-9": {
        "name": "Zhou_2020",
        "dir": "",
        "glob_pattern": "TWCC-*",
        "merge_on": "",
    },
    "10.1073_pnas.2008762117": {
        "name": "Lau_2020",
        "dir": "",
        # NOTE(review): if this pattern is passed to glob/fnmatch, "[AD|NC]" is a
        # character class (any ONE of A, D, |, N, C), not "AD or NC" -- confirm
        # that the broader match is acceptable.
        "glob_pattern": "[AD|NC]*",
        "merge_on": "",
    },
    "10.1038_s41593-020-00764-7": {
        "name": "Leng_2021",
        "dir": "",
        "glob_pattern": "SRR*",
        "merge_on": "",
    },
    "10.1038_s41588-021-00894-z": {
        "name": "Morabito_2021",
        "dir": "",
        "glob_pattern": "Sample-*",
        "merge_on": "",
    },
    "10.1101_2020.12.22.424084": {
        "name": "Cain_2022",
        "dir": "",
        "glob_pattern": "MFC-*",
        "merge_on": "",
    },
    "10.1038_s41586-021-04369-3": {
        "name": "Yang_2022",
        "dir": "",
        "glob_pattern": "SRR*",
        "merge_on": "",
    },
    "10.1101_2023.03.07.531493": {
        "name": "Green_2023",
        "dir": "CellRanger_Results",
        "glob_pattern": "B*",
        "merge_on": "",
    },
    "10.1016_j.cell.2023.08.039": {
        "name": "Mathys_2023",
        "dir": "CellRanger_Results",
        "glob_pattern": "D*",
        "merge_on": "",
    },
}

# Datasets processed in this run, in processing order. Previously the cell
# reassigned `external_datasets` several times and only the last assignment
# (these two datasets) took effect; edit this list to change the selection,
# or use list(ALL_EXTERNAL_DATASETS) to process everything.
SELECTED_DATASET_KEYS = [
    "10.1038_s41588-021-00894-z",  # Morabito_2021
    "10.1073_pnas.2008762117",  # Lau_2020
]

external_datasets = {key: ALL_EXTERNAL_DATASETS[key] for key in SELECTED_DATASET_KEYS}

### Construct joint metadata

In [None]:
# File manifest, all accessed Oct 4th, 2023
#
# Mathys_2019
# https://www.synapse.org/#!Synapse:syn3191087 - ROSMAP_clinical.csv
# https://www.synapse.org/#!Synapse:syn18642936 - snRNAseqPFC_BA10_biospecimen_metadata.csv
# From RADC - dataset_1134_basic_11-29-2023.xlsx
#
# Olah_2020
# https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-19737-2/MediaObjects/41467_2020_19737_MOESM3_ESM.xls
#
# Zhou_2020
# https://static-content.springer.com/esm/art%3A10.1038%2Fs41591-019-0695-9/MediaObjects/41591_2019_695_MOESM2_ESM.xlsx
# From RADC - dataset_1134_basic_11-29-2023.xlsx
#
# Lau_2020
# https://www.pnas.org/doi/suppl/10.1073/pnas.2008762117/suppl_file/pnas.2008762117.sd01.xlsx
# https://www.ncbi.nlm.nih.gov/biosample?LinkName=bioproject_biosample_all&from_uid=662923 - biosample_result.txt
#
# Leng_2021
# https://www.nature.com/articles/s41593-020-00764-7/tables/1 - Table 1.txt
# https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA615180&o=acc_s%3Aa - SraRunTable.txt
# https://www.ncbi.nlm.nih.gov/biosample?LinkName=bioproject_biosample_all&from_uid=615180 - biosample_result.txt
#
# Morabito_2021
# https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-021-00894-z/MediaObjects/41588_2021_894_MOESM4_ESM.xlsx
# https://www.ncbi.nlm.nih.gov/biosample?LinkName=bioproject_biosample_all&from_uid=729525 - biosample_result.txt
# 
# Cain_2022
# https://www.synapse.org/#!Synapse:syn3191087 - ROSMAP_clinical.csv
# https://www.synapse.org/#!Synapse:syn21073536 - ROSMAP_assay_scrnaSeq_metadata.csv
# https://www.synapse.org/#!Synapse:syn21323366 - ROSMAP_biospecimen_metadata.csv
# From RADC - dataset_1134_basic_11-29-2023.xlsx
#
# Yang_2022
# https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-021-04369-3/MediaObjects/41586_2021_4369_MOESM2_ESM.xlsx
# https://www.ncbi.nlm.nih.gov/Traces/study/?acc=PRJNA686798&o=acc_s%3Aa - SraRunTable.txt
# https://www.ncbi.nlm.nih.gov/biosample?LinkName=bioproject_biosample_all&from_uid=686798 - biosample_result.txt
#
# Green_2023
# https://www.synapse.org/#!Synapse:syn3191087 - ROSMAP_clinical.csv
# https://www.synapse.org/#!Synapse:syn34572333 - ROSMAP_snRNAseq_demultiplexed_ID_mapping.csv
# From RADC - dataset_1134_basic_11-29-2023.xlsx
# 
# Mathys_2023
# https://www.synapse.org/#!Synapse:syn3191087 - ROSMAP_clinical.csv
# Mathys, Personal Communication - ROSMAP_snRNAseq_PFC_manifest_fastq_files_v2.csv
# From RADC - dataset_1134_basic_11-29-2023.xlsx

In [None]:
# Common metadata format/specification for every library
# library_prep - (Required) str
# Donor ID - (Required) str
# Brain Region - (Required) literal 'MTG' or 'PFC'
# Method - (Required) literal "3' 10x v2", "3' 10x v3", "3' 10x v3.1", "3' 10x Multiome", or "5' 10x v1"
# RIN - float
# barcode - str

# Common metadata format/specification for every donor
# Donor ID - (Required) str
# Publication - (Required) str
# Primary Study Name - (Required) str
# Age at Death - (Required) int, float or literal '90+'
# Sex - (Required) literal "Male" or "Female"
# Race (choice=White) - literal "Checked" or "Unchecked"
# Race (choice=Black/ African American) - literal "Checked" or "Unchecked"
# Race (choice=Asian) - literal "Checked" or "Unchecked"
# Race (choice=American Indian/ Alaska Native) - literal "Checked" or "Unchecked"
# Race (choice=Native Hawaiian or Pacific Islander) - literal "Checked" or "Unchecked"
# Race (choice=Unknown or unreported) - literal "Checked" or "Unchecked"
# Race (choice=Other) - literal "Checked" or "Unchecked"
# Hispanic/Latino - literal "Yes" or "No"
# Years of education - int or float
# APOE4 Status - (Required) literal 'Y' or 'N'
# PMI - (Required) float
# Fresh Brain Weight - float
# Brain pH - float
# Overall AD neuropathological Change - literal "Not AD", "Low", "Intermediate", or "High"
# Thal - literal "Thal 0", "Thal 1", "Thal 2", "Thal 3", "Thal 4", or "Thal 5"
# Braak - (Required) literal "Braak 0", "Braak I", "Braak II", "Braak III", "Braak IV", "Braak V", or "Braak VI"
# CERAD score - literal 'Absent', 'Sparse', 'Moderate', or 'Frequent'
# Cognitive Status - (Required) literal 'No dementia' or 'Dementia'
# Highest Lewy Body Disease - literal 'Not Identified (olfactory bulb not assessed)', 'Not Identified (olfactory bulb assessed)', 'Olfactory bulb only', 'Amygdala-predominant', 'Brainstem-predominant', 'Limbic (Transitional)', or 'Neocortical (Diffuse)'
# LATE - literal 'Unclassifiable', 'Not Identified', 'LATE Stage 1', 'LATE Stage 2', 'LATE Stage 3'
# Overall CAA Score - literal 'Not identified', 'Mild', 'Moderate', 'Severe'
# Atherosclerosis - literal 'None', 'Mild', 'Moderate', 'Severe'
# Arteriolosclerosis - literal 'None', 'Mild', 'Moderate', 'Severe'

##### Mathys_2019

In [None]:
# Mathys_2019
# Build the per-library and per-donor metadata tables in the common format and
# write them as libraries.csv / donors.csv next to the raw metadata files.
metadata_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Mathys_2019")

# --- Libraries: one row per specimen, linked to its donor --------------------
libraries = pd.read_csv(os.path.join(metadata_dir, "snRNAseqPFC_BA10_biospecimen_metadata.csv"))
libraries = libraries.loc[:, ["specimenID", "individualID"]]
libraries.columns = ["library_prep", "Donor ID"]
libraries["library_prep"] = [i.replace("_S", "-S") for i in libraries["library_prep"]]
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v2"

display(libraries)
libraries.to_csv(os.path.join(metadata_dir, "libraries.csv"))

# --- Donors: ROSMAP IDs joined to the RADC table on projid -------------------
donors = pd.read_csv(os.path.join(metadata_dir, "ROSMAP_clinical.csv"))
RADC = pd.read_excel(os.path.join(metadata_dir, "dataset_1134_basic_11-29-2023.xlsx"), sheet_name="Sheet0")
donors = donors.loc[:, ["projid", "individualID"]].merge(RADC, how="left", left_on="projid", right_on="projid")
donors.index = donors["individualID"].copy()
donors.index.name = ''

# One Checked/Unchecked column per race choice, listed in output order; the
# value is the numeric `race` code that ticks the column.
race_code_by_column = {
    "Race (choice=White)": 1,
    "Race (choice=Black/ African American)": 2,
    "Race (choice=Asian)": 5,
    "Race (choice=American Indian/ Alaska Native)": 3,
    "Race (choice=Native Hawaiian or Pacific Islander)": 4,
    "Race (choice=Unknown or unreported)": 7,
    "Race (choice=Other)": 6,
}
for column, code in race_code_by_column.items():
    donors[column] = np.where(donors["race"] == code, "Checked", "Unchecked")
# A missing race code also counts as unknown. The previous `j == np.nan` test
# could never be true (NaN is not equal to itself), so these rows were skipped.
donors.loc[donors["race"].isna(), "Race (choice=Unknown or unreported)"] = "Checked"

# Derive the two boolean flags before labelling them below.
# NOTE(review): a missing genotype becomes the string "nan" (no "4") and a
# missing cogdx compares False, so both end up as "N" / "No dementia" --
# confirm that is acceptable for donors with unknown values.
donors["apoe_genotype"] = donors["apoe_genotype"].astype("str").str.contains("4")
donors["cogdx"] = donors["cogdx"] > 3

severity_labels = {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"}
category_labels = {
    "msex": {0: "Female", 1: "Male"},
    "spanish": {1: "No", 2: "Yes"},
    # Fixed: the mapping used to be {True: "Y", True: "N"}, which Python
    # collapses to {True: "N"} -- carriers were written as "N" and
    # non-carriers were left as the raw value False.
    "apoe_genotype": {True: "Y", False: "N"},
    "niareagansc": {1.0: "High", 2.0: "Intermediate", 3.0: "Low", 4.0: "Not AD"},
    "braaksc": {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    },
    "ceradsc": {1: "Frequent", 2: "Moderate", 3: "Sparse", 4: "Absent"},
    "cogdx": {True: "Dementia", False: "No dementia"},
    "dlbdx": {
        0.0: "Not Identified (olfactory bulb not assessed)",
        1.0: "Brainstem-predominant",
        2.0: "Limbic (Transitional)",
        3.0: "Neocortical (Diffuse)",
    },
    "tdp_st4": {0.0: "Not Identified", 1.0: "LATE Stage 1", 2.0: "LATE Stage 2", 3.0: "LATE Stage 3"},
    "caa_4gp": {0.0: "Not identified", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "cvda_4gp2": severity_labels,
    "arteriol_scler": severity_labels,
}
# Convert each coded column to a categorical and swap the codes for labels;
# codes without an entry in the mapping are left untouched by rename_categories.
for column, labels in category_labels.items():
    donors[column] = donors[column].astype("category").cat.rename_categories(labels)

# Source column -> common-format column name, in output order.
# NOTE(review): "Atheroslcerosis" is misspelled relative to the common metadata
# specification ("Atherosclerosis"); kept as-is because downstream consumers of
# donors.csv may rely on this spelling -- confirm before renaming.
output_columns = {
    "Study": "Primary Study Name",
    "age_death": "Age at Death",
    "msex": "Sex",
    **{column: column for column in race_code_by_column},
    "spanish": "Hispanic/Latino",
    "educ": "Years of education",
    "apoe_genotype": "APOE4 Status",
    "pmi": "PMI",
    "niareagansc": "Overall AD neuropathological Change",
    "braaksc": "Braak",
    "ceradsc": "CERAD score",
    "cogdx": "Cognitive Status",
    "dlbdx": "Highest Lewy Body Disease",
    "tdp_st4": "LATE",
    "caa_4gp": "Overall CAA Score",
    "cvda_4gp2": "Atheroslcerosis",
    "arteriol_scler": "Arteriolosclerosis",
}
# Keep only donors that have a library in this dataset (one row per library).
donors = donors.loc[libraries["Donor ID"], list(output_columns)]
donors = donors.rename(output_columns, axis=1)
display(donors)
donors.to_csv(os.path.join(metadata_dir, "donors.csv"))


##### Olah_2020

In [None]:
# Olah_2020
# Build the per-library and per-donor metadata tables in the common format and
# write them as libraries.csv / donors.csv next to the raw metadata files.
metadata_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Olah_2020")

ROSMAP = pd.read_csv(os.path.join(metadata_dir, "ROSMAP_clinical.csv"), index_col=0)

# Supplementary donor sheet, extended with every ROSMAP column it does not
# already have (joined on projid against the ROSMAP index).
donors = pd.read_excel(os.path.join(metadata_dir, "41467_2020_19737_MOESM3_ESM.xls"), sheet_name="RUSH", nrows=14)
donors = donors.merge(ROSMAP.loc[:, np.setdiff1d(ROSMAP.columns, donors.columns)], left_on="projid", right_index=True, how="left")

# --- Libraries: one row per sample, linked to its donor ----------------------
libraries = donors.loc[:, ["Sample ID", "individualID"]]
libraries.columns = ["library_prep", "Donor ID"]
libraries["library_prep"] = ["Microglia_MO_" + i for i in libraries["library_prep"]]
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v2"

display(libraries)
libraries.to_csv(os.path.join(metadata_dir, "libraries.csv"))

# --- Donors ------------------------------------------------------------------
donors.index = donors["individualID"].copy()
donors.index.name = ''

# One Checked/Unchecked column per race choice, listed in output order; the
# value is the numeric `race` code that ticks the column.
race_code_by_column = {
    "Race (choice=White)": 1,
    "Race (choice=Black/ African American)": 2,
    "Race (choice=Asian)": 5,
    "Race (choice=American Indian/ Alaska Native)": 3,
    "Race (choice=Native Hawaiian or Pacific Islander)": 4,
    "Race (choice=Unknown or unreported)": 7,
    "Race (choice=Other)": 6,
}
for column, code in race_code_by_column.items():
    donors[column] = np.where(donors["race"] == code, "Checked", "Unchecked")
# A missing race code also counts as unknown. The previous `j == np.nan` test
# could never be true (NaN is not equal to itself), so these rows were skipped.
donors.loc[donors["race"].isna(), "Race (choice=Unknown or unreported)"] = "Checked"

# Derive the two boolean flags before labelling them below.
# NOTE(review): a missing genotype becomes the string "nan" (no "4") and a
# missing cogdx compares False, so both end up as "N" / "No dementia" --
# confirm that is acceptable for donors with unknown values.
donors["apoe_genotype"] = donors["apoe_genotype"].astype("str").str.contains("4")
donors["cogdx"] = donors["cogdx"] > 3

# The column names with stray spaces / misspelling below are the headers as
# they are read from the supplementary spreadsheet and must stay byte-exact.
severity_labels = {"none": "None", "mild": "Mild", "moderate": "Moderate", "severe": "Severe"}
category_labels = {
    "msex": {0: "Female", 1: "Male"},
    "spanish": {1: "No", 2: "Yes"},
    # Fixed: the mapping used to be {True: "Y", True: "N"}, which Python
    # collapses to {True: "N"} -- carriers were written as "N" and
    # non-carriers were left as the raw value False.
    "apoe_genotype": {True: "Y", False: "N"},
    "NIA-AA score": {"intermediate": "Intermediate", "high": "High"},
    "braaksc": {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    },
    "ceradsc": {1: "Frequent", 2: "Moderate", 3: "Sparse", 4: "Absent"},
    "cogdx": {True: "Dementia", False: "No dementia"},
    "DLB diagnosis": {
        "none": "Not Identified (olfactory bulb assessed)",
        "limbic-type": "Limbic (Transitional)",
        "neocortical-type": "Neocortical (Diffuse)",
    },
    "TDP-43 stage": {
        "none": "Not Identified",
        "amygdala-only": "LATE Stage 1",
        "limbic": "LATE Stage 2",
        # Fixed: was "Late Stage 3", which does not match the "LATE Stage N"
        # literals of the common metadata specification.
        "neocortical": "LATE Stage 3",
    },
    "CAA severity": {"none": "Not identified", "mild": "Mild", "moderate": "Moderate", "severe": "Severe"},
    " Atheroslcerosis severity ": severity_labels,
    " Arteriolosclerosis severity": severity_labels,
}
# Convert each coded column to a categorical and swap the codes for labels;
# values without an entry in the mapping are left untouched by rename_categories.
for column, labels in category_labels.items():
    donors[column] = donors[column].astype("category").cat.rename_categories(labels)

# Source column -> common-format column name, in output order.
# NOTE(review): the output name "Atheroslcerosis" is misspelled relative to the
# common metadata specification ("Atherosclerosis"); kept as-is because
# downstream consumers of donors.csv may rely on it -- confirm before renaming.
output_columns = {
    "Study": "Primary Study Name",
    "age_death": "Age at Death",
    "msex": "Sex",
    **{column: column for column in race_code_by_column},
    "spanish": "Hispanic/Latino",
    "educ": "Years of education",
    "apoe_genotype": "APOE4 Status",
    "pmi": "PMI",
    "NIA-AA score": "Overall AD neuropathological Change",
    "braaksc": "Braak",
    "ceradsc": "CERAD score",
    "cogdx": "Cognitive Status",
    "DLB diagnosis": "Highest Lewy Body Disease",
    "TDP-43 stage": "LATE",
    "CAA severity": "Overall CAA Score",
    " Atheroslcerosis severity ": "Atheroslcerosis",
    " Arteriolosclerosis severity": "Arteriolosclerosis",
}
donors = donors.loc[:, list(output_columns)]
donors = donors.rename(output_columns, axis=1)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(metadata_dir, "donors.csv"))



##### Zhou_2020

In [None]:
# Zhou_2020
# Build the per-library and per-donor metadata tables in the common format and
# write them as libraries.csv / donors.csv next to the raw metadata files.
metadata_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Zhou_2020")

# NOTE(review): ROSMAP is read with index_col=0 but then selected by a "projid"
# column, and the donor merge further down joins "Sample Identifier" against the
# index left behind by this column-on-column merge (right_index=True) -- confirm
# that this index really carries the identifier the donors sheet refers to.
ROSMAP = pd.read_csv(os.path.join(metadata_dir, "ROSMAP_clinical.csv"), index_col=0)
RADC = pd.read_excel(os.path.join(metadata_dir, "dataset_1134_basic_11-29-2023.xlsx"), sheet_name="Sheet0")
ROSMAP = ROSMAP.loc[:, ["projid", "individualID"]].merge(RADC, how="left", left_on="projid", right_on="projid")

# Supplementary donor sheet; rows with "-" in the snRNA-seq sample column are
# dropped, then every ROSMAP/RADC column the sheet lacks is attached.
donors = pd.read_excel(os.path.join(metadata_dir, "41591_2019_695_MOESM2_ESM.xlsx"), sheet_name="Rush samples", skiprows=4)
donors = donors.loc[donors["Sample ID in snRNA-seq"] != "-", :]
donors = donors.merge(ROSMAP.loc[:, np.setdiff1d(ROSMAP.columns, donors.columns)], left_on="Sample Identifier", right_index=True, how="left")

# --- Libraries: one row per sample, linked to its donor ----------------------
libraries = donors.loc[:, ["Sample ID in snRNA-seq", "individualID"]]
libraries.columns = ["library_prep", "Donor ID"]
libraries["library_prep"] = ["TWCC-" + i + "-" + i + "-lib1" for i in libraries["library_prep"]]
libraries["Brain Region"] = "PFC"
libraries["Method"] = "5' 10x v1"
display(libraries)
libraries.to_csv(os.path.join(metadata_dir, "libraries.csv"))

# --- Donors ------------------------------------------------------------------
donors.index = donors["individualID"].copy()
donors.index.name = ''

# One Checked/Unchecked column per race choice, listed in output order; the
# value is the numeric `race` code that ticks the column.
race_code_by_column = {
    "Race (choice=White)": 1,
    "Race (choice=Black/ African American)": 2,
    "Race (choice=Asian)": 5,
    "Race (choice=American Indian/ Alaska Native)": 3,
    "Race (choice=Native Hawaiian or Pacific Islander)": 4,
    "Race (choice=Unknown or unreported)": 7,
    "Race (choice=Other)": 6,
}
for column, code in race_code_by_column.items():
    donors[column] = np.where(donors["race"] == code, "Checked", "Unchecked")
# A missing race code also counts as unknown. The previous `j == np.nan` test
# could never be true (NaN is not equal to itself), so these rows were skipped.
donors.loc[donors["race"].isna(), "Race (choice=Unknown or unreported)"] = "Checked"

# Derive the two boolean flags before labelling them below.
# NOTE(review): a missing genotype becomes the string "nan" (no "4") and a
# missing cogdx compares False, so both end up as "N" / "No dementia" --
# confirm that is acceptable for donors with unknown values.
donors["apoe_genotype"] = donors["apoe_genotype"].astype("str").str.contains("4")
donors["cogdx"] = donors["cogdx"] > 3

severity_labels = {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"}
category_labels = {
    "msex": {0: "Female", 1: "Male"},
    "spanish": {1: "No", 2: "Yes"},
    # Fixed: the mapping used to be {True: "Y", True: "N"}, which Python
    # collapses to {True: "N"} -- carriers were written as "N" and
    # non-carriers were left as the raw value False.
    "apoe_genotype": {True: "Y", False: "N"},
    "niareagansc": {1.0: "High", 2.0: "Intermediate", 3.0: "Low", 4.0: "Not AD"},
    "braaksc": {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    },
    "ceradsc": {1: "Frequent", 2: "Moderate", 3: "Sparse", 4: "Absent"},
    "cogdx": {True: "Dementia", False: "No dementia"},
    "dlbdx": {
        0.0: "Not Identified (olfactory bulb not assessed)",
        1.0: "Brainstem-predominant",
        2.0: "Limbic (Transitional)",
        3.0: "Neocortical (Diffuse)",
    },
    "tdp_st4": {0.0: "Not Identified", 1.0: "LATE Stage 1", 2.0: "LATE Stage 2", 3.0: "LATE Stage 3"},
    "caa_4gp": {0.0: "Not identified", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "cvda_4gp2": severity_labels,
    "arteriol_scler": severity_labels,
}
# Convert each coded column to a categorical and swap the codes for labels;
# codes without an entry in the mapping are left untouched by rename_categories.
for column, labels in category_labels.items():
    donors[column] = donors[column].astype("category").cat.rename_categories(labels)

# Source column -> common-format column name, in output order.
# NOTE(review): "Atheroslcerosis" is misspelled relative to the common metadata
# specification ("Atherosclerosis"); kept as-is because downstream consumers of
# donors.csv may rely on this spelling -- confirm before renaming.
output_columns = {
    "Study": "Primary Study Name",
    "age_death": "Age at Death",
    "msex": "Sex",
    **{column: column for column in race_code_by_column},
    "spanish": "Hispanic/Latino",
    "educ": "Years of education",
    "apoe_genotype": "APOE4 Status",
    "pmi": "PMI",
    "niareagansc": "Overall AD neuropathological Change",
    "braaksc": "Braak",
    "ceradsc": "CERAD score",
    "cogdx": "Cognitive Status",
    "dlbdx": "Highest Lewy Body Disease",
    "tdp_st4": "LATE",
    "caa_4gp": "Overall CAA Score",
    "cvda_4gp2": "Atheroslcerosis",
    "arteriol_scler": "Arteriolosclerosis",
}
donors = donors.loc[:, list(output_columns)]
donors = donors.rename(output_columns, axis=1)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(metadata_dir, "donors.csv"))

##### Lau_2020

In [None]:
# Lau_2020
# Build the per-library and per-donor metadata tables in the common format and
# write them as libraries.csv / donors.csv next to the raw metadata files.
metadata_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Lau_2020")

# --- Libraries: the donor ID is reused as the library name -------------------
libraries = pd.read_csv(os.path.join(metadata_dir, "biosample_result.txt"), sep="\t", header=None)
libraries.columns = ["Donor ID", "BioSample"]
libraries = libraries.loc[:, ["Donor ID"]]
libraries["library_prep"] = libraries["Donor ID"].copy()
libraries = libraries.loc[:, ["library_prep", "Donor ID"]].copy()
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v2"

display(libraries)
libraries.to_csv(os.path.join(metadata_dir, "libraries.csv"))

# --- Donors ------------------------------------------------------------------
donors = pd.read_excel(os.path.join(metadata_dir, "pnas.2008762117.sd01.xlsx"), sheet_name="Patient_info")
donors.index = donors["ID"].copy()
donors.index.name = ''

donors["Primary Study Name"] = "SWDBB"

# APOE4 carrier flag: True when the genotype text contains a "4".
# NOTE(review): a missing genotype becomes the string "nan" and is therefore
# labelled "N" -- confirm that is acceptable for donors with unknown genotype.
donors["APOE"] = donors["APOE"].astype("str").str.contains("4")

category_labels = {
    "SEX": {"F": "Female", "M": "Male"},
    # Fixed: the mapping used to be {True: "Y", True: "N"}, which Python
    # collapses to {True: "N"} -- carriers were written as "N" and
    # non-carriers were left as the raw value False.
    "APOE": {True: "Y", False: "N"},
    "Braak tangle stage": {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    },
    "CONDITION": {"AD": "Dementia", "NC": "No dementia"},
}
# Convert each coded column to a categorical and swap the codes for labels;
# values without an entry in the mapping are left untouched by rename_categories.
for column, labels in category_labels.items():
    donors[column] = donors[column].astype("category").cat.rename_categories(labels)

# Source column -> common-format column name, in output order.
output_columns = {
    "Primary Study Name": "Primary Study Name",
    "AGE": "Age at Death",
    "SEX": "Sex",
    "APOE": "APOE4 Status",
    "PMD": "PMI",
    "Braak tangle stage": "Braak",
    "CONDITION": "Cognitive Status",
}
donors = donors.loc[:, list(output_columns)]
donors = donors.rename(output_columns, axis=1)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(metadata_dir, "donors.csv"))


##### Leng_2021

In [None]:
libraries = pd.read_csv(os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Leng_2021", "SraRunTable.txt"))
biosamples = pd.read_csv(os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Leng_2021", "biosample_result.txt"), sep="\t", header=None)
biosamples.columns = ["Donor ID", "BioSample"]
libraries = libraries.merge(biosamples, left_on="BioSample", right_on="BioSample", how="left")
libraries = libraries.loc[:, ["Run", "Donor ID"]]
libraries.columns = ["library_prep", "Donor ID"]
libraries["Donor ID"] = [i.replace("SFG", "") for i in libraries["Donor ID"]]
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v2"

display(libraries)
libraries.to_csv(os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Leng_2021", "libraries.csv"))

leng_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Leng_2021")

# Donor table; participant numbers are stripped of "*" and used as the index.
donors = pd.read_csv(os.path.join(leng_dir, "Table 1.txt"), sep="\t")
donors["Participant no."] = donors["Participant no."].str.replace("*", "", regex=False)
donors.index = donors["Participant no."].copy()
donors.index.name = ''

donors["Sex"] = donors["Sex"].astype("category").cat.rename_categories(
    {
        "F": "Female",
        "M": "Male"
    }
)

# APOE4 status: "Y" when the genotype string contains a "4", otherwise "N".
# The mapping needs both True and False keys; a duplicated True key would
# collapse to a single entry and leave False unlabelled.
# NOTE(review): a missing genotype becomes the string "nan" and is therefore labelled "N".
donors["APOE genotype"] = donors["APOE genotype"].astype("str").str.contains("4")
donors["APOE genotype"] = donors["APOE genotype"].astype("category").cat.rename_categories(
    {
        True: "Y",
        False: "N"
    }
)

donors["Braak stage"] = donors["Braak stage"].astype("category").cat.rename_categories(
    {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    }
)

# Binary cognitive status from the last CDR score: only values above 2 count as dementia.
# NOTE(review): scores of 2 or below, and missing scores, are labelled "No dementia" --
# confirm this cut-off is the intended one.
donors["CDR before death"] = donors["CDR before death"] > 2
donors["CDR before death"] = donors["CDR before death"].astype("category").cat.rename_categories(
    {
        True: "Dementia",
        False: "No dementia"
    }
)

# Keep the columns written to donors.csv and give them their output names.
donors = donors.loc[
    :,
    [
        "Source",
        "Age at death (years)",
        "Sex",
        "APOE genotype",
        "Postmortem interval (h)",
        "Braak stage",
        "CDR before death",
    ]
]
donors = donors.rename(
    columns={
        "Source": "Primary Study Name",
        "Age at death (years)": "Age at Death",
        "APOE genotype": "APOE4 Status",
        "Postmortem interval (h)": "PMI",
        "Braak stage": "Braak",
        "CDR before death": "Cognitive Status",
    }
)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(leng_dir, "donors.csv"))


##### Morabito_2021

In [None]:
morabito_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Morabito_2021")

# The BioSample listing supplies the donor labels; each label doubles as the library name.
biosample_table = pd.read_csv(os.path.join(morabito_dir, "biosample_result.txt"), sep="\t", header=None)
biosample_table.columns = ["Donor ID", "BioSample"]
libraries = biosample_table.loc[:, ["Donor ID"]]
libraries = libraries.assign(library_prep=libraries["Donor ID"].copy())
libraries = libraries.loc[:, ["library_prep", "Donor ID"]].copy()
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v3"

# Donor sheet from the supplementary workbook, restricted to rows whose Assay is "both"
# and indexed by sample ID.
supplementary_table = pd.read_excel(
    os.path.join(morabito_dir, "41588_2021_894_MOESM4_ESM.xlsx"),
    sheet_name="Supplementary Table 1",
    skiprows=1,
)
donors = supplementary_table.loc[supplementary_table["Assay"] == "both", :]
donors.index = donors["Sample.ID"].copy()
donors.index.name = ''

# Bring each library's RIN over from the donor sheet.
libraries = libraries.merge(donors[["RIN"]], left_on="Donor ID", right_index=True, how="left")

display(libraries)
libraries.to_csv(os.path.join(morabito_dir, "libraries.csv"))

donors["Primary Study Name"] = "UCI MIND"

donors["Sex"] = donors["Sex"].astype("category").cat.rename_categories(
    {
        "F": "Female",
        "M": "Male"
    }
)

# APOE4 status: "Y" when the genotype string contains a "4", otherwise "N".
# The mapping needs both True and False keys; a duplicated True key would
# collapse to a single entry and leave False unlabelled.
# NOTE(review): a missing genotype becomes the string "nan" and is therefore labelled "N".
donors["APoE"] = donors["APoE"].astype("str").str.contains("4")
donors["APoE"] = donors["APoE"].astype("category").cat.rename_categories(
    {
        True: "Y",
        False: "N"
    }
)

braak = donors["Tangle.Stage"].astype("category").cat.rename_categories(
    {
        "Stage 0": "Braak 0",
        "Stage 1": "Braak I",
        "Stage 2": "Braak II",
        "Stage 3": "Braak III",
        "Stage 4": "Braak IV",
        "Stage 5": "Braak V",
        "Stage 6": "Braak VI",
    }
)
# Missing stages are filled with "Braak 0". A categorical can only be filled with an
# existing category, so register the label first when no donor carried "Stage 0".
# NOTE(review): confirm that a missing tangle stage really means Braak 0.
if "Braak 0" not in braak.cat.categories:
    braak = braak.cat.add_categories("Braak 0")
donors["Tangle.Stage"] = braak.fillna("Braak 0")

donors["Diagnosis"] = donors["Diagnosis"].astype("category").cat.rename_categories(
    {
        "AD": "Dementia",
        "Control": "No dementia"
    }
)

# Keep the columns written to donors.csv and give them their output names.
donors = donors.loc[
    :,
    [
        "Primary Study Name",
        "Age",
        "Sex",
        "APoE",
        "PMI",
        "Tangle.Stage",
        "Diagnosis",
    ]
]
donors = donors.rename(
    columns={
        "Age": "Age at Death",
        "APoE": "APOE4 Status",
        "Tangle.Stage": "Braak",
        "Diagnosis": "Cognitive Status",
    }
)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Morabito_2021", "donors.csv"))


##### Cain_2022

In [None]:
cain_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Cain_2022")

# Assay metadata restricted to specimens whose ID starts with "MFC".
assay_metadata = pd.read_csv(os.path.join(cain_dir, "ROSMAP_assay_scrnaSeq_metadata.csv"))
is_mfc_specimen = assay_metadata["specimenID"].str.startswith("MFC")
libraries = assay_metadata.loc[is_mfc_specimen, :]
libraries = libraries.loc[:, ["specimenID", "RIN"]]
libraries.columns = ["library_prep", "RIN"]
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v3"

# Specimen -> individual lookup for exactly those libraries.
biospecimens = pd.read_csv(os.path.join(cain_dir, "ROSMAP_biospecimen_metadata.csv"))
in_libraries = biospecimens["specimenID"].isin(libraries["library_prep"])
donors = biospecimens.loc[in_libraries, ["specimenID", "individualID"]]
donors = donors.rename(columns={"individualID": "Donor ID"})

# Attach the donor to each library and fix the output column order.
libraries = libraries.merge(donors, left_on="library_prep", right_on="specimenID", how="left")
library_columns = ["library_prep", "Donor ID", "Brain Region", "Method", "RIN"]
libraries = libraries.loc[:, library_columns]

with pd.option_context("display.max_columns", None):
    display(libraries)

libraries.to_csv(os.path.join(cain_dir, "libraries.csv"))

cain_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Cain_2022")

# ROSMAP ID table (projid <-> individualID) joined to the RADC sheet on projid.
ROSMAP = pd.read_csv(os.path.join(cain_dir, "ROSMAP_clinical.csv"), index_col=0)
RADC = pd.read_excel(os.path.join(cain_dir, "dataset_1134_basic_11-29-2023.xlsx"), sheet_name="Sheet0")
ROSMAP = ROSMAP.loc[:, ["projid", "individualID"]].merge(RADC, how="left", on="projid")

# Add every ROSMAP column the donor table does not already have, matched on the donor ID.
ROSMAP.index = ROSMAP["individualID"].copy()
donors = donors.merge(ROSMAP.loc[:, np.setdiff1d(ROSMAP.columns, donors.columns)], left_on="Donor ID", right_index=True, how="left")
donors.index = donors["individualID"].copy()
donors.index.name = ''

# Expand the single coded `race` column into one Checked/Unchecked column per choice.
race_choice_columns = {
    1: "Race (choice=White)",
    2: "Race (choice=Black/ African American)",
    3: "Race (choice=American Indian/ Alaska Native)",
    4: "Race (choice=Native Hawaiian or Pacific Islander)",
    5: "Race (choice=Asian)",
    6: "Race (choice=Other)",
    7: "Race (choice=Unknown or unreported)",
}
for race_column in race_choice_columns.values():
    donors[race_column] = "Unchecked"
for race_code, race_column in race_choice_columns.items():
    donors.loc[donors["race"] == race_code, race_column] = "Checked"
# A missing code also counts as unknown. NaN never compares equal to anything
# (including np.nan), so it has to be detected with isna().
donors.loc[donors["race"].isna(), "Race (choice=Unknown or unreported)"] = "Checked"

# APOE4 status: True when the genotype string contains a "4".
# NOTE(review): a missing genotype becomes the string "nan" and ends up labelled "N".
donors["apoe_genotype"] = donors["apoe_genotype"].astype("str").str.contains("4")
# Cognitive status: only cogdx values above 3 count as dementia.
# NOTE(review): a missing cogdx compares False and ends up labelled "No dementia".
donors["cogdx"] = donors["cogdx"] > 3

# Coded value -> label for every categorical column.
category_labels = {
    "msex": {0: "Female", 1: "Male"},
    "spanish": {1: "No", 2: "Yes"},
    # Both True and False must be present; a duplicated True key would collapse
    # to one entry and leave False unlabelled.
    "apoe_genotype": {True: "Y", False: "N"},
    "niareagansc": {1.0: "High", 2.0: "Intermediate", 3.0: "Low", 4.0: "Not AD"},
    "braaksc": {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    },
    "ceradsc": {1: "Frequent", 2: "Moderate", 3: "Sparse", 4: "Absent"},
    "cogdx": {True: "Dementia", False: "No dementia"},
    "dlbdx": {
        0.0: "Not Identified (olfactory bulb not assessed)",
        1.0: "Brainstem-predominant",
        2.0: "Limbic (Transitional)",
        3.0: "Neocortical (Diffuse)",
    },
    "tdp_st4": {0.0: "Not Identified", 1.0: "LATE Stage 1", 2.0: "LATE Stage 2", 3.0: "LATE Stage 3"},
    "caa_4gp": {0.0: "Not identified", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "cvda_4gp2": {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "arteriol_scler": {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
}
for column, labels in category_labels.items():
    donors[column] = donors[column].astype("category").cat.rename_categories(labels)

# Keep the columns written to donors.csv, in output order.
donors = donors.loc[
    :,
    [
        "Study",
        "age_death",
        "msex",
        "Race (choice=White)",
        "Race (choice=Black/ African American)",
        "Race (choice=Asian)",
        "Race (choice=American Indian/ Alaska Native)",
        "Race (choice=Native Hawaiian or Pacific Islander)",
        "Race (choice=Unknown or unreported)",
        "Race (choice=Other)",
        "spanish",
        "educ",
        "apoe_genotype",
        "pmi",
        "niareagansc",
        "braaksc",
        "ceradsc",
        "cogdx",
        "dlbdx",
        "tdp_st4",
        "caa_4gp",
        "cvda_4gp2",
        "arteriol_scler",
    ]
]
donors = donors.rename(
    columns={
        "Study": "Primary Study Name",
        "age_death": "Age at Death",
        "msex": "Sex",
        "spanish": "Hispanic/Latino",
        "educ": "Years of education",
        "apoe_genotype": "APOE4 Status",
        "pmi": "PMI",
        "niareagansc": "Overall AD neuropathological Change",
        "braaksc": "Braak",
        "ceradsc": "CERAD score",
        "cogdx": "Cognitive Status",
        "dlbdx": "Highest Lewy Body Disease",
        "tdp_st4": "LATE",
        "caa_4gp": "Overall CAA Score",
        # NOTE(review): "Atheroslcerosis" looks like a misspelling of "Atherosclerosis";
        # left unchanged because code outside this cell may rely on the column name.
        "cvda_4gp2": "Atheroslcerosis",
        "arteriol_scler": "Arteriolosclerosis",
    }
)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(cain_dir, "donors.csv"))

##### Yang_2022

In [None]:
yang_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Yang_2022")

# SRA run table (one row per run) and the two-column BioSample listing.
sra_runs = pd.read_csv(os.path.join(yang_dir, "SraRunTable.txt"))
biosamples = pd.read_csv(os.path.join(yang_dir, "biosample_result.txt"), sep="\t", header=None)
biosamples.columns = ["Donor ID", "BioSample"]

# Attach the donor label to each run via its BioSample accession and keep run + donor only.
libraries = sra_runs.merge(biosamples, on="BioSample", how="left")
libraries = libraries.loc[:, ["Run", "Donor ID"]]
libraries.columns = ["library_prep", "Donor ID"]

# Based on a personal communication with Yang et al
biosample_to_patient_id = {
    "Control 1 Ctx": "NCI 1",
    "Control 2 Ctx": "NCI 6",
    "Control 3 Ctx": "NCI 7",
    "Control 4 Ctx": "NCI 8",
    "AD 1 Ctx": "AD 4",
    "AD 2 Ctx": "AD 5",
    "AD 3 Ctx": "AD 6",
    "AD 4 Ctx": "AD 7",
}
libraries["Donor ID"] = libraries["Donor ID"].replace(biosample_to_patient_id)
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v3"

display(libraries)
libraries.to_csv(os.path.join(yang_dir, "libraries.csv"))

yang_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Yang_2022")

# Donor sheet restricted to superior frontal cortex rows, indexed by patient ID.
donors = pd.read_excel(os.path.join(yang_dir, "41586_2021_4369_MOESM2_ESM.xlsx"), sheet_name="Sheet1")
donors = donors.loc[donors["Brain region profiled"] == "Superior frontal cortex", :].copy()
donors.index = donors["Patient ID"].copy()
donors.index.name = ''

donors["Primary Study Name"] = "Stanford ADRC"

donors["Sex"] = donors["Sex"].astype("category").cat.rename_categories(
    {
        "F": "Female",
        "M": "Male"
    }
)

# APOE4 status: "Y" when the genotype string contains a "4", otherwise "N".
# The mapping needs both True and False keys; a duplicated True key would
# collapse to a single entry and leave False unlabelled.
# NOTE(review): a missing genotype becomes the string "nan" and is therefore labelled "N".
donors["ApoE genotype"] = donors["ApoE genotype"].astype("str").str.contains("4")
donors["ApoE genotype"] = donors["ApoE genotype"].astype("category").cat.rename_categories(
    {
        True: "Y",
        False: "N"
    }
)

# Each Roman-numeral stage maps to the label with the same numeral ("IV" -> "Braak IV",
# "VI" -> "Braak VI"). Entries for stages absent from the data are ignored by
# rename_categories, so listing the full scale is harmless.
donors["Braak"] = donors["Braak"].astype("category").cat.rename_categories(
    {
        "NR": "Braak 0",
        "I": "Braak I",
        "II": "Braak II",
        "III": "Braak III",
        "IV": "Braak IV",
        "V": "Braak V",
        "VI": "Braak VI",
    }
)

donors["CERAD"] = donors["CERAD"].astype("category").cat.rename_categories(
    {
        "NR": "Absent",
        "C": "Frequent",
    }
)

donors["Clinical AD or not"] = donors["Clinical AD or not"].astype("category").cat.rename_categories(
    {
        "AD": "Dementia",
        "No": "No dementia"
    }
)

# Keep the columns written to donors.csv and give them their output names.
donors = donors.loc[
    :,
    [
        "Primary Study Name",
        "Age",
        "Sex",
        "ApoE genotype",
        "PMI (hours)",
        "Overall brain weight",
        "Braak",
        "CERAD",
        "Clinical AD or not",
    ]
]
donors = donors.rename(
    columns={
        "Age": "Age at Death",
        "ApoE genotype": "APOE4 Status",
        "PMI (hours)": "PMI",
        "Overall brain weight": "Fresh Brain Weight",
        "Clinical AD or not": "Cognitive Status",
    }
)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(yang_dir, "donors.csv"))

##### Green_2023

In [None]:
green_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Green_2023")

# Demultiplexing map: one row per (library, cell barcode) with the donor it was assigned to.
libraries = pd.read_csv(os.path.join(green_dir, "ROSMAP_snRNAseq_demultiplexed_ID_mapping.csv"))
libraries.columns = ["library_prep", "barcode", "Donor ID"]

# Build the library name: drop the leading "<digits>-" prefix, append "_Merged",
# and rewrite "B6_" as "B6-A_". pandas' own string methods are used so the cell
# does not depend on `re` being pulled in by a star-import.
libraries["library_prep"] = libraries["library_prep"].str.replace(r"^[0-9]+-", "", regex=True) + "_Merged"
libraries["library_prep"] = libraries["library_prep"].str.replace("B6_", "B6-A_", regex=False)
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v3"
# Remove the "-1" text from the barcodes.
libraries["barcode"] = libraries["barcode"].str.replace("-1", "", regex=False)
libraries = libraries.loc[:, ["library_prep", "Donor ID", "Brain Region", "Method", "barcode"]]

# Show the renamed B6 libraries on their own, then the whole table.
display(libraries.loc[libraries["library_prep"].str.startswith("B6"), :])

display(libraries)
libraries.to_csv(os.path.join(green_dir, "libraries.csv"))

green_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Green_2023")

# ROSMAP ID table (projid <-> individualID) joined to the RADC sheet on projid.
ROSMAP = pd.read_csv(os.path.join(green_dir, "ROSMAP_clinical.csv"), index_col=0)
RADC = pd.read_excel(os.path.join(green_dir, "dataset_1134_basic_11-29-2023.xlsx"), sheet_name="Sheet0")
ROSMAP = ROSMAP.loc[:, ["projid", "individualID"]].merge(RADC, how="left", on="projid")

# One row per donor seen in the libraries table, extended with the ROSMAP columns.
donors = libraries.loc[:, ["Donor ID"]].drop_duplicates()
donors = donors.merge(ROSMAP.loc[:, np.setdiff1d(ROSMAP.columns, donors.columns)], left_on="Donor ID", right_on="individualID", how="left")

donors.index = donors["individualID"].copy()
donors.index.name = ''

# Expand the single coded `race` column into one Checked/Unchecked column per choice.
race_choice_columns = {
    1: "Race (choice=White)",
    2: "Race (choice=Black/ African American)",
    3: "Race (choice=American Indian/ Alaska Native)",
    4: "Race (choice=Native Hawaiian or Pacific Islander)",
    5: "Race (choice=Asian)",
    6: "Race (choice=Other)",
    7: "Race (choice=Unknown or unreported)",
}
for race_column in race_choice_columns.values():
    donors[race_column] = "Unchecked"
for race_code, race_column in race_choice_columns.items():
    donors.loc[donors["race"] == race_code, race_column] = "Checked"
# A missing code also counts as unknown. NaN never compares equal to anything
# (including np.nan), so it has to be detected with isna().
donors.loc[donors["race"].isna(), "Race (choice=Unknown or unreported)"] = "Checked"

# APOE4 status: True when the genotype string contains a "4".
# NOTE(review): a missing genotype becomes the string "nan" and ends up labelled "N".
donors["apoe_genotype"] = donors["apoe_genotype"].astype("str").str.contains("4")
# Cognitive status: only cogdx values above 3 count as dementia.
# NOTE(review): a missing cogdx compares False and ends up labelled "No dementia".
donors["cogdx"] = donors["cogdx"] > 3

# Coded value -> label for every categorical column.
category_labels = {
    "msex": {0: "Female", 1: "Male"},
    "spanish": {1: "No", 2: "Yes"},
    # Both True and False must be present; a duplicated True key would collapse
    # to one entry and leave False unlabelled.
    "apoe_genotype": {True: "Y", False: "N"},
    "niareagansc": {1.0: "High", 2.0: "Intermediate", 3.0: "Low", 4.0: "Not AD"},
    "braaksc": {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    },
    "ceradsc": {1: "Frequent", 2: "Moderate", 3: "Sparse", 4: "Absent"},
    "cogdx": {True: "Dementia", False: "No dementia"},
    "dlbdx": {
        0.0: "Not Identified (olfactory bulb not assessed)",
        1.0: "Brainstem-predominant",
        2.0: "Limbic (Transitional)",
        3.0: "Neocortical (Diffuse)",
    },
    "tdp_st4": {0.0: "Not Identified", 1.0: "LATE Stage 1", 2.0: "LATE Stage 2", 3.0: "LATE Stage 3"},
    "caa_4gp": {0.0: "Not identified", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "cvda_4gp2": {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "arteriol_scler": {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
}
for column, labels in category_labels.items():
    donors[column] = donors[column].astype("category").cat.rename_categories(labels)

# Keep the columns written to donors.csv, in output order.
donors = donors.loc[
    :,
    [
        "Study",
        "age_death",
        "msex",
        "Race (choice=White)",
        "Race (choice=Black/ African American)",
        "Race (choice=Asian)",
        "Race (choice=American Indian/ Alaska Native)",
        "Race (choice=Native Hawaiian or Pacific Islander)",
        "Race (choice=Unknown or unreported)",
        "Race (choice=Other)",
        "spanish",
        "educ",
        "apoe_genotype",
        "pmi",
        "niareagansc",
        "braaksc",
        "ceradsc",
        "cogdx",
        "dlbdx",
        "tdp_st4",
        "caa_4gp",
        "cvda_4gp2",
        "arteriol_scler",
    ]
]
donors = donors.rename(
    columns={
        "Study": "Primary Study Name",
        "age_death": "Age at Death",
        "msex": "Sex",
        "spanish": "Hispanic/Latino",
        "educ": "Years of education",
        "apoe_genotype": "APOE4 Status",
        "pmi": "PMI",
        "niareagansc": "Overall AD neuropathological Change",
        "braaksc": "Braak",
        "ceradsc": "CERAD score",
        "cogdx": "Cognitive Status",
        "dlbdx": "Highest Lewy Body Disease",
        "tdp_st4": "LATE",
        "caa_4gp": "Overall CAA Score",
        # NOTE(review): "Atheroslcerosis" looks like a misspelling of "Atherosclerosis";
        # left unchanged because code outside this cell may rely on the column name.
        "cvda_4gp2": "Atheroslcerosis",
        "arteriol_scler": "Arteriolosclerosis",
    }
)
with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(green_dir, "donors.csv"))

##### Mathys_2023

In [None]:
mathys_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Mathys_2023")

# FASTQ manifest: one row per file, with the individual it belongs to.
libraries = pd.read_csv(os.path.join(mathys_dir, "ROSMAP_snRNAseq_PFC_manifest_fastq_files_v2.csv"))

# Library name = FASTQ file name without its "_S<n>_L<n>_<I|R><1|2>_001.fastq.gz" suffix.
# The trailing double quote is optional so ordinary file names are matched as well as
# names that end in a stray quote; the dots are escaped so they only match literal dots.
fastq_suffix = r'_S[0-9]+_L[0-9]+_[IR][12]_001\.fastq\.gz"?'
libraries["library_prep"] = (
    libraries["path"].map(os.path.basename).str.replace(fastq_suffix, "", regex=True)
)

# Collapse the per-file rows down to one row per (library, donor) pair.
libraries = libraries.loc[:, ["library_prep", "individualID"]].drop_duplicates()
libraries.columns = ["library_prep", "Donor ID"]
libraries["Brain Region"] = "PFC"
libraries["Method"] = "3' 10x v3"

display(libraries)
libraries.to_csv(os.path.join(mathys_dir, "libraries.csv"))

mathys_dir = os.path.join(pwd, "input", "External_AD_singleomeCR6", "metadata", "Mathys_2023")

# ROSMAP ID table (projid <-> individualID) joined to the RADC sheet on projid.
ROSMAP = pd.read_csv(os.path.join(mathys_dir, "ROSMAP_clinical.csv"), index_col=0)
RADC = pd.read_excel(os.path.join(mathys_dir, "dataset_1134_basic_11-29-2023.xlsx"), sheet_name="Sheet0")
ROSMAP = ROSMAP.loc[:, ["projid", "individualID"]].merge(RADC, how="left", on="projid")

# One row per library at this point (donors repeat); duplicates are removed at the end.
donors = libraries.loc[:, ["Donor ID"]]
donors = donors.merge(ROSMAP.loc[:, np.setdiff1d(ROSMAP.columns, donors.columns)], left_on="Donor ID", right_on="individualID", how="left")

donors.index = donors["individualID"].copy()
donors.index.name = ''

# Expand the single coded `race` column into one Checked/Unchecked column per choice.
race_choice_columns = {
    1: "Race (choice=White)",
    2: "Race (choice=Black/ African American)",
    3: "Race (choice=American Indian/ Alaska Native)",
    4: "Race (choice=Native Hawaiian or Pacific Islander)",
    5: "Race (choice=Asian)",
    6: "Race (choice=Other)",
    7: "Race (choice=Unknown or unreported)",
}
for race_column in race_choice_columns.values():
    donors[race_column] = "Unchecked"
for race_code, race_column in race_choice_columns.items():
    donors.loc[donors["race"] == race_code, race_column] = "Checked"
# A missing code also counts as unknown. NaN never compares equal to anything
# (including np.nan), so it has to be detected with isna().
donors.loc[donors["race"].isna(), "Race (choice=Unknown or unreported)"] = "Checked"

# APOE4 status: True when the genotype string contains a "4".
# NOTE(review): a missing genotype becomes the string "nan" and ends up labelled "N".
donors["apoe_genotype"] = donors["apoe_genotype"].astype("str").str.contains("4")
# Cognitive status: only cogdx values above 3 count as dementia.
# NOTE(review): a missing cogdx compares False and ends up labelled "No dementia".
donors["cogdx"] = donors["cogdx"] > 3

# Coded value -> label for every categorical column.
category_labels = {
    "msex": {0: "Female", 1: "Male"},
    "spanish": {1: "No", 2: "Yes"},
    # Both True and False must be present; a duplicated True key would collapse
    # to one entry and leave False unlabelled.
    "apoe_genotype": {True: "Y", False: "N"},
    "niareagansc": {1.0: "High", 2.0: "Intermediate", 3.0: "Low", 4.0: "Not AD"},
    "braaksc": {
        0: "Braak 0",
        1: "Braak I",
        2: "Braak II",
        3: "Braak III",
        4: "Braak IV",
        5: "Braak V",
        6: "Braak VI",
    },
    "ceradsc": {1: "Frequent", 2: "Moderate", 3: "Sparse", 4: "Absent"},
    "cogdx": {True: "Dementia", False: "No dementia"},
    "dlbdx": {
        0.0: "Not Identified (olfactory bulb not assessed)",
        1.0: "Brainstem-predominant",
        2.0: "Limbic (Transitional)",
        3.0: "Neocortical (Diffuse)",
    },
    "tdp_st4": {0.0: "Not Identified", 1.0: "LATE Stage 1", 2.0: "LATE Stage 2", 3.0: "LATE Stage 3"},
    "caa_4gp": {0.0: "Not identified", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "cvda_4gp2": {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
    "arteriol_scler": {0.0: "None", 1.0: "Mild", 2.0: "Moderate", 3.0: "Severe"},
}
for column, labels in category_labels.items():
    donors[column] = donors[column].astype("category").cat.rename_categories(labels)

# Keep the columns written to donors.csv, in output order.
donors = donors.loc[
    :,
    [
        "Study",
        "age_death",
        "msex",
        "Race (choice=White)",
        "Race (choice=Black/ African American)",
        "Race (choice=Asian)",
        "Race (choice=American Indian/ Alaska Native)",
        "Race (choice=Native Hawaiian or Pacific Islander)",
        "Race (choice=Unknown or unreported)",
        "Race (choice=Other)",
        "spanish",
        "educ",
        "apoe_genotype",
        "pmi",
        "niareagansc",
        "braaksc",
        "ceradsc",
        "cogdx",
        "dlbdx",
        "tdp_st4",
        "caa_4gp",
        "cvda_4gp2",
        "arteriol_scler",
    ]
]
donors = donors.rename(
    columns={
        "Study": "Primary Study Name",
        "age_death": "Age at Death",
        "msex": "Sex",
        "spanish": "Hispanic/Latino",
        "educ": "Years of education",
        "apoe_genotype": "APOE4 Status",
        "pmi": "PMI",
        "niareagansc": "Overall AD neuropathological Change",
        "braaksc": "Braak",
        "ceradsc": "CERAD score",
        "cogdx": "Cognitive Status",
        "dlbdx": "Highest Lewy Body Disease",
        "tdp_st4": "LATE",
        "caa_4gp": "Overall CAA Score",
        # NOTE(review): "Atheroslcerosis" looks like a misspelling of "Atherosclerosis";
        # left unchanged because code outside this cell may rely on the column name.
        "cvda_4gp2": "Atheroslcerosis",
        "arteriol_scler": "Arteriolosclerosis",
    }
)

# Collapse the per-library rows to one row per donor. The donor ID lives in the
# index, so de-duplicate on the index: comparing column values alone would also
# merge two different donors whose metadata happen to be identical.
donors = donors.loc[~donors.index.duplicated(), :].copy()

with pd.option_context("display.max_columns", None):
    display(donors)

donors.to_csv(os.path.join(mathys_dir, "donors.csv"))

### Build AnnData Files

In [None]:
# Call build_external_anndata once per dataset key (DOI) in external_datasets.
for doi in external_datasets:
    build_external_anndata(doi=doi, external_datasets=external_datasets)