### Load needed libraries

In [None]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
import warnings
from datetime import datetime

# Working directory of the session; the input/ and tmp/ paths below are built from it
pwd = os.getcwd()

# Silence every warning for the remainder of the session
warnings.filterwarnings(action="ignore")

# Number of jobs scanpy may use for parallel work
sc.settings.n_jobs = 32

### Load the MTG dataset

In [None]:
region = "MTG"
dataset = "RNAseq"
date = "2024-02-13"

# Read the final-nuclei AnnData object for this region/dataset/date from ./input
h5ad_name = f"SEAAD_{region}_{dataset}_final-nuclei.{date}.h5ad"
adata = sc.read_h5ad(os.path.join(pwd, "input", h5ad_name))

# Keep only cells that are not from neurotypical reference donors and whose
# donor is not flagged as severely affected
not_reference = adata.obs["Neurotypical reference"] == "False"
not_severely_affected = adata.obs["Severely Affected Donor"] == "N"
adata = adata[not_reference & not_severely_affected].copy()

# Put the UMIs layer back into X, then drop the layer to reduce memory demands
adata.X = adata.layers["UMIs"].copy()
del adata.layers["UMIs"]

# Format and scale metadata for the differential expression test.
# Each covariate is stored under an identifier-style column name (underscores instead
# of spaces/parentheses); continuous covariates are min-max scaled to [0, 1] and
# categorical covariates are given an explicit category order.
adata.obs["Continuous_Pseudo-progression_Score"] = adata.obs["Continuous Pseudo-progression Score"].copy()

# Age at death: coerce to float32, cut into 5 equal-width bins, then rescale the
# integer bin codes by the largest code so the covariate lies in [0, 1]
adata.obs["Age at Death"] = adata.obs["Age at Death"].astype("object")
adata.obs["Age at Death"] = [np.float32(i) for i in adata.obs["Age at Death"]]
adata.obs["Age_at_Death_binned"] = pd.cut(adata.obs["Age at Death"], bins=5)
adata.obs["Age_at_Death_binned_codes"] = adata.obs["Age_at_Death_binned"].cat.codes
adata.obs["Age_at_Death_binned_codes"] = adata.obs["Age_at_Death_binned_codes"] / adata.obs["Age_at_Death_binned_codes"].max()

# Sex: drop levels left empty by the donor subsetting, order as F then M
adata.obs["Sex"] = adata.obs["Sex"].astype("category")
adata.obs["Sex"] = adata.obs["Sex"].cat.remove_unused_categories()
adata.obs["Sex"] = adata.obs["Sex"].cat.reorder_categories(["F", "M"])

# Race (choice=White): drop empty levels, order as Unchecked then Checked
adata.obs["Race_choice_White"] = adata.obs["Race (choice=White)"].astype("category")
adata.obs["Race_choice_White"] = adata.obs["Race_choice_White"].cat.remove_unused_categories()
adata.obs["Race_choice_White"] = adata.obs["Race_choice_White"].cat.reorder_categories(["Unchecked", "Checked"])

# method is used through .cat directly, so it must already be categorical here
adata.obs["method"] = adata.obs["method"].cat.remove_unused_categories()

# Genes detected: min-max scale into a new column
genes_detected = adata.obs["Genes detected"]
adata.obs["Genes_detected"] = (genes_detected - genes_detected.min()) / (genes_detected.max() - genes_detected.min())

adata.obs["Donor_ID"] = adata.obs["Donor ID"].copy()

adata.obs["Number_of_UMIs"] = adata.obs["Number of UMIs"].copy()

# PMI: min-max scale in place (the original values are overwritten)
pmi = adata.obs["PMI"]
adata.obs["PMI"] = (pmi - pmi.min()) / (pmi.max() - pmi.min())

# APOE4 status: True when the APOE genotype string contains the character "4".
# The boolean column is then made categorical, ordered False < True, and relabeled N / Y.
adata.obs["APOE4_Status"] = adata.obs["APOE Genotype"].str.contains("4")
adata.obs["APOE4_Status"] = adata.obs["APOE4_Status"].astype("category")
adata.obs["APOE4_Status"] = adata.obs["APOE4_Status"].cat.reorder_categories([False, True])
adata.obs["APOE4_Status"] = adata.obs["APOE4_Status"].cat.rename_categories(
    {
        False: "N",
        True: "Y",
    }
)

### Prepare dataset splits to distribute differential expression testing

In [None]:
# Each entry is (output directory under ./tmp, optional filter on the pseudo-progression score).
#   MTG       - every cell of the subclass; these objects were also symlinked to
#               MTG_No_Genes_detected, MTG_PMI, and MTG_APOE4_Status
#   MTG_early - objects used for early tests (score <= 0.55)
#   MTG_late  - objects used for late tests (score > 0.45)
splits = (
    ("MTG", None),
    ("MTG_early", lambda score: score <= 0.55),
    ("MTG_late", lambda score: score > 0.45),
)

for region, score_filter in splits:
    for subclass in adata.obs["Subclass"].cat.categories:
        keep = adata.obs["Subclass"] == subclass
        if score_filter is not None:
            keep = keep & score_filter(adata.obs["Continuous_Pseudo-progression_Score"])

        # A "/" in the subclass name is replaced by a space in the file name
        out_path = os.path.join(pwd, "tmp", region, subclass.replace("/", " ") + ".h5ad")
        adata[keep].write(out_path)