### Load needed libraries

In [None]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
import copy
import re
import anndata as ad
from joblib import parallel_backend
import warnings
from datetime import datetime
from helper_functions import *

# Remember the directory the notebook was launched from; the input/ and
# output/ paths used further down are built relative to it.
pwd = os.getcwd()

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

# Set scanpy's global n_jobs setting to 32.
sc.settings.n_jobs = 32

### Load needed datasets

In [None]:
# Identify which AnnData file to read; the file name is assembled from the
# region, the modality and the release date.
region = "MTG"
dataset = "RNAseq"
date = "2024-02-13"
h5ad_name = f"SEAAD_{region}_{dataset}_final-nuclei.{date}.h5ad"
adata = sc.read_h5ad(os.path.join(pwd, "input", h5ad_name))

# Keep only cells that are not flagged as neurotypical reference and whose
# donor is not flagged as severely affected. Both flags are compared as the
# strings "False" and "N".
# NOTE(review): assumes these obs columns store strings rather than booleans - confirm.
not_reference = adata.obs["Neurotypical reference"] == "False"
not_severely_affected = adata.obs["Severely Affected Donor"] == "N"
adata = adata[not_reference & not_severely_affected].copy()

In [None]:
# Genes passed to build_effect_size_anndata as `blacklisted_genes`: a fixed
# list plus every gene whose name starts with "MT-".
# NOTE(review): presumably sex-linked and mitochondrial genes that the helper
# drops from the results - confirm against helper_functions.
blacklisted_genes = ["MTRNR2L12", "TTTY14", "USP9Y", "NLGN4Y", "UTY", "DDX3Y", "KDM5D", "TTTY10"]
blacklisted_genes.extend(adata.var_names[adata.var_names.str.startswith("MT-")].to_list())

# Each call returns three objects, bound here as effect sizes, p-values and
# standard errors for one set of differential-expression result files.

# Results across the full pseudo-progression score.
effect_sizes, pvalues, std_errors = build_effect_size_anndata(
    results_dir=os.path.join(pwd, "output", "MTG", "Continuous_Pseudo-progression_Score"),
    glob_pattern="*.csv",
    file_pattern="_across_Continuous_Pseudo-progression_Score_DE.csv",
    test="Continuous_Pseudo-progression_Score",
    adata=adata,
    subclass="Subclass",
    celltype="Supertype",
    blacklisted_genes=blacklisted_genes,
)

# Results from the "MTG_early" output directory.
effect_sizes_early, pvalues_early, std_errors_early = build_effect_size_anndata(
    results_dir=os.path.join(pwd, "output", "MTG_early", "Continuous_Pseudo-progression_Score"),
    glob_pattern="*.csv",
    file_pattern="_across_Continuous_Pseudo-progression_Score_DE.csv",
    test="Continuous_Pseudo-progression_Score",
    adata=adata,
    subclass="Subclass",
    celltype="Supertype",
    blacklisted_genes=blacklisted_genes,
)

# Results from the "MTG_late" output directory.
effect_sizes_late, pvalues_late, std_errors_late = build_effect_size_anndata(
    results_dir=os.path.join(pwd, "output", "MTG_late", "Continuous_Pseudo-progression_Score"),
    glob_pattern="*.csv",
    file_pattern="_across_Continuous_Pseudo-progression_Score_DE.csv",
    test="Continuous_Pseudo-progression_Score",
    adata=adata,
    subclass="Subclass",
    celltype="Supertype",
    blacklisted_genes=blacklisted_genes,
)

# Results from the "versus_all" comparison. The standard errors get their own
# name so they no longer overwrite `std_errors_late` from the call above.
effect_sizes_vs_all, pvalues_vs_all, std_errors_vs_all = build_effect_size_anndata(
    results_dir=os.path.join(pwd, "output", "MTG", "versus_all"),
    glob_pattern="*.csv",
    file_pattern="_versus_all_DE.csv",
    test="comparison1",
    adata=adata,
    subclass="Subclass",
    celltype="Supertype",
    blacklisted_genes=blacklisted_genes,
)

### Build mean expression and fraction expressed tables

In [None]:
# Column-name prefix for the per-supertype summaries stored in adata.var.
prefix = "Supertype_"

# For every Supertype category, add two columns to adata.var:
#   <prefix>mean_expression_<supertype>     - per-gene mean of X over its cells
#   <prefix>fraction_expressed_<supertype>  - per-gene fraction of its cells with X > 0
for supertype in adata.obs["Supertype"].cat.categories:
    print(str(datetime.now()) + " -- " + str(supertype))

    # Subset once per supertype and reuse the matrix, instead of building
    # the same AnnData subset three times per iteration.
    supertype_matrix = adata[adata.obs["Supertype"] == supertype].X
    n_cells = supertype_matrix.shape[0]

    adata.var[prefix + "mean_expression_" + str(supertype)] = supertype_matrix.mean(axis=0).T
    adata.var[prefix + "fraction_expressed_" + str(supertype)] = (np.sum(supertype_matrix > 0, axis=0) / n_cells).T

In [None]:
def _var_columns_as_anndata(column_prefix):
    """Wrap the adata.var columns starting with *column_prefix* in an AnnData.

    Rows are the rows of adata.var, restricted and ordered to
    effect_sizes.obs_names; columns are the selected adata.var columns with
    the prefix removed from their names.
    """
    selected_columns = adata.var.columns.str.startswith(column_prefix)
    summary = ad.AnnData(X=adata.var.loc[:, selected_columns])
    summary.var_names = [name.replace(column_prefix, "") for name in summary.var_names]
    # NOTE(review): this assignment appears to replace the column annotation
    # wholesale, so the selected columns must already be in the same order
    # as effect_sizes.var - confirm.
    summary.var = effect_sizes.var.copy()
    return summary[effect_sizes.obs_names, :].copy()


mean_expression = _var_columns_as_anndata(prefix + "mean_expression_")
fraction_expressed = _var_columns_as_anndata(prefix + "fraction_expressed_")

### Build effect size table

In [None]:
# Build a long-format table with one row per (population, gene) pair at three
# taxonomy levels: every Supertype, every unique Subclass and every unique
# Class. Rows are ordered level by level, population by population, and
# within each population by gene in effect_sizes.obs_names order.
#
# The table is assembled from whole matrices rather than filled one cell at
# a time with `.loc`. That avoids millions of single-cell AnnData slices and
# the `float()` call on a 1x1 array, which NumPy 1.25+ deprecates. Numeric
# columns are therefore float64 instead of object dtype.
supertypes = effect_sizes.var["Supertype"]
subclasses = effect_sizes.var["Subclass"]
classes = effect_sizes.var["Class"]
genes = effect_sizes.obs_names

table_columns = ["Gene", "Taxonomy Level", "Population", "Effect size across all of pseudoprogression", "Effect size across early pseudoprogression", "Effect size across late pseudoprogression", "Mean expression (natural log UMIs per 10k plus 1)"]

# One source per numeric column, in the order of table_columns[3:].
value_sources = [effect_sizes, effect_sizes_early, effect_sizes_late, mean_expression]


def _dense_values(annotated, columns=None):
    """Return ``annotated[genes, columns].X`` as a dense genes x columns array.

    With ``columns=None`` every column is kept in its stored order.
    """
    subset = annotated[genes, :] if columns is None else annotated[genes, columns]
    values = subset.X
    if hasattr(values, "toarray"):  # tolerate sparse storage
        values = values.toarray()
    return np.asarray(values).reshape(len(genes), subset.shape[1])


def _group_means(labels, matrices):
    """Average each matrix over the columns sharing a label.

    Returns the sorted unique labels and, per matrix, a genes x groups array.
    Column membership comes positionally from *labels* (taken from
    effect_sizes.var), so every matrix must share that column order.
    """
    label_values = np.asarray(labels)
    groups = np.unique(labels)
    means = [
        np.column_stack([matrix[:, label_values == group].mean(axis=1) for group in groups])
        for matrix in matrices
    ]
    return groups, means


def _level_frame(level, populations, per_source_values):
    """Lay out genes x populations matrices as population-major table rows."""
    populations = np.asarray(populations, dtype=object)
    frame = pd.DataFrame({
        "Gene": np.tile(np.asarray(genes, dtype=object), len(populations)),
        "Taxonomy Level": level,
        "Population": np.repeat(populations, len(genes)),
    })
    for column, values in zip(table_columns[3:], per_source_values):
        # Convert to float64 before rounding, as the per-cell version did
        # via float(); transpose so each population's genes are contiguous.
        frame[column] = np.round(values.astype(np.float64), 2).T.ravel()
    return frame


# Supertype level: columns are looked up by supertype name in every source.
print(datetime.now())
supertype_values = [_dense_values(source, list(supertypes)) for source in value_sources]
level_frames = [_level_frame("Supertype", supertypes, supertype_values)]

# Subclass and Class levels: mean over the member supertype columns.
full_matrices = [_dense_values(source) for source in value_sources]
for level, labels in (("Subclass", subclasses), ("Class", classes)):
    print(datetime.now())
    groups, group_values = _group_means(labels, full_matrices)
    level_frames.append(_level_frame(level, groups, group_values))

effect_size_table = pd.concat(level_frames, ignore_index=True)
print(datetime.now())



In [None]:
# Write the table next to the other outputs. The path is anchored on `pwd`
# like every other write in this notebook, so the destination does not
# depend on the working directory at the time this cell runs.
effect_size_table.to_csv(os.path.join(pwd, "output", "effect_size_table.csv"))

In [None]:
# Persist every table as <pwd>/output/<name>.h5ad. The mapping keeps the
# original write order: effect sizes and p-values for each comparison,
# then the two expression summaries.
h5ad_outputs = {
    "effect_sizes": effect_sizes,
    "pvalues": pvalues,
    "effect_sizes_early": effect_sizes_early,
    "pvalues_early": pvalues_early,
    "effect_sizes_late": effect_sizes_late,
    "pvalues_late": pvalues_late,
    "effect_sizes_vs_all": effect_sizes_vs_all,
    "pvalues_vs_all": pvalues_vs_all,
    "mean_expression": mean_expression,
    "fraction_expressed": fraction_expressed,
}

for output_name, output_table in h5ad_outputs.items():
    output_table.write(os.path.join(pwd, "output", output_name + ".h5ad"))

### Construct Neighborhood and Subclass aggregate metrics

In [None]:
# Boolean masks over the columns of effect_sizes, one per aggregate of
# interest, keyed by consecutive integers. They are passed to
# construct_gene_graph as `vectors` further down.
class_labels = list(effect_sizes.var["Class"])
subclass_labels = list(effect_sizes.var["Subclass"])


def _label_in(labels, accepted):
    """Boolean array: True where the label is one of *accepted*."""
    return np.array([label in accepted for label in labels])


def _label_not_in(labels, rejected):
    """Boolean array: True where the label is none of *rejected*."""
    return np.array([label not in rejected for label in labels])


not_non_neuronal = _label_not_in(class_labels, ["Non-neuronal and non-neural"])
not_neuronal = _label_not_in(class_labels, ["Neuronal: GABAergic", "Neuronal: Glutamatergic"])

vectors = {
    # Inhibitory versus excitatory
    1: not_non_neuronal & _label_in(class_labels, ["Neuronal: GABAergic"]),
    # Excitatory versus inhibitory
    2: not_non_neuronal & _label_in(class_labels, ["Neuronal: Glutamatergic"]),
    # Astrocyte / Oligodendrocyte / OPC subclasses outside the neuronal classes
    3: not_neuronal & _label_in(subclass_labels, ["Astrocyte", "Oligodendrocyte", "OPC"]),
    # Endothelial / VLMC / Microglia-PVM subclasses outside the neuronal classes
    4: not_neuronal & _label_in(subclass_labels, ["Endothelial", "VLMC", "Microglia-PVM"]),
}

# Append one mask per Subclass category under the next free integer key.
for subclass_name in effect_sizes.var["Subclass"].cat.categories:
    following_key = np.max(list(vectors)) + 1
    vectors[following_key] = (effect_sizes.var["Subclass"] == subclass_name).to_numpy()

### Construct the gene dynamic space

In [None]:
# Same column-name prefix that was used when the per-supertype summaries
# were added to adata.var.
prefix = "Supertype_"

# Gather the inputs for construct_gene_graph: the two expression summaries,
# the early and late effect sizes, and the aggregate masks in `vectors`.
gene_graph_inputs = {
    "mean_expression": mean_expression,
    "fraction_expressed": fraction_expressed,
    "effect_sizes_early": effect_sizes_early,
    "effect_sizes_late": effect_sizes_late,
    "prefix": prefix,
    "aggregate_metrics": True,
    "vectors": vectors,
}

gene_dynamic_space = construct_gene_graph(**gene_graph_inputs)

In [None]:
# Save the gene dynamic space alongside the other h5ad outputs.
gene_dynamic_space_path = os.path.join(pwd, "output", "gene_dynamic_space.h5ad")
gene_dynamic_space.write(gene_dynamic_space_path)