# Jeffries Human Brain Aging paper - AUCell UPR scoring
*This script is to be run before the Rmd script, for computing the AUC values with pySCENIC* 

**Author:** Vincent Gardeux

**Date Created:** 2025-11-17
**Date Last Modified:** 2025-11-17

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import anndata as ad
import time
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [2]:
GENE_SIGNATURE_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/UPR_genes_REACTOME_human.txt' # From https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/REACTOME_UNFOLDED_PROTEIN_RESPONSE_UPR.html
GENE_SIGNATURE_SNAME = '90 + 3 UPR genes' # 3 are missing from Jeffries' data
EXPRESSION_H5AD_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pfc.clean.h5ad' # Built from pfc.clean.rds. Downloaded from https://publications.wenglab.org/SomaMut/Jeffries_Yu_BrainAging_2025/

In [3]:
data_genes = pd.read_csv(filepath_or_buffer=GENE_SIGNATURE_FNAME, header=None)[0].tolist()
gs = GeneSignature('REACTOME - UNFOLDED PROTEIN RESPONSE (UPR)', data_genes)
gs

GeneSignature(name='REACTOME - UNFOLDED PROTEIN RESPONSE (UPR)', gene2weight=frozendict.frozendict({'ACADVL': 1.0, 'ADD1': 1.0, 'ARFGAP1': 1.0, 'ASNS': 1.0, 'ATF3': 1.0, 'ATF4': 1.0, 'ATF6': 1.0, 'ATF6B': 1.0, 'ATP6V0D1': 1.0, 'CALR': 1.0, 'CCL2': 1.0, 'CEBPB': 1.0, 'CEBPG': 1.0, 'CREB3': 1.0, 'CREB3L1': 1.0, 'CREB3L2': 1.0, 'CREB3L3': 1.0, 'CREB3L4': 1.0, 'CREBRF': 1.0, 'CTDSP2': 1.0, 'CUL7': 1.0, 'CXCL8': 1.0, 'CXXC1': 1.0, 'DCP2': 1.0, 'DCSTAMP': 1.0, 'DCTN1': 1.0, 'DDIT3': 1.0, 'DDX11': 1.0, 'DIS3': 1.0, 'DNAJB11': 1.0, 'DNAJB9': 1.0, 'DNAJC3': 1.0, 'EDEM1': 1.0, 'EIF2AK3': 1.0, 'EIF2S1': 1.0, 'EIF2S2': 1.0, 'EIF2S3': 1.0, 'ERN1': 1.0, 'EXOSC1': 1.0, 'EXOSC2': 1.0, 'EXOSC3': 1.0, 'EXOSC4': 1.0, 'EXOSC5': 1.0, 'EXOSC6': 1.0, 'EXOSC7': 1.0, 'EXOSC8': 1.0, 'EXOSC9': 1.0, 'EXTL1': 1.0, 'EXTL2': 1.0, 'EXTL3': 1.0, 'FKBP14': 1.0, 'GFPT1': 1.0, 'GOSR2': 1.0, 'GSK3A': 1.0, 'HDGF': 1.0, 'HERPUD1': 1.0, 'HSP90B1': 1.0, 'HSPA5': 1.0, 'HYOU1': 1.0, 'IGFBP1': 1.0, 'KDELR3': 1.0, 'KHSRP': 1.0, '

In [4]:
len(gs)

93

In [5]:
# [Input] Load expression matrix from H5ad file
f_h5ad = ad.read_h5ad(EXPRESSION_H5AD_FNAME)
f_gene_names = f_h5ad.var_names.tolist()  # Gene names
f_cell_names = f_h5ad.obs_names.tolist()   # Cell names
ex_matrix = pd.DataFrame.sparse.from_spmatrix(f_h5ad.X.T, index=f_gene_names, columns=f_cell_names) # 33538 genes x 2663736 cells
ex_matrix

Unnamed: 0,0950_240109_AAACCCAAGACATCCT,0950_240109_AAACCCACACCGTACG,0950_240109_AAACCCAGTAATCAGA,0950_240109_AAACCCATCACTCTTA,0950_240109_AAACGAAAGGTTACCT,0950_240109_AAACGAAAGTATGACA,0950_240109_AAACGAACAGCGATTT,0950_240109_AAACGAATCCATCGTC,0950_240109_AAACGAATCTCATGCC,0950_240109_AAACGCTCAATTGCGT,...,6052_200709_TTTGGTTCATGTACGT,6052_200709_TTTGGTTCATGTCGTA,6052_200709_TTTGGTTGTACCGTCG,6052_200709_TTTGGTTGTATTGACC,6052_200709_TTTGGTTGTCAGTCCG,6052_200709_TTTGGTTTCGACACTA,6052_200709_TTTGGTTTCGCAGTTA,6052_200709_TTTGTTGCAAGCGATG,6052_200709_TTTGTTGCAAGGTCAG,6052_200709_TTTGTTGCAGCCGTTG
MIR1302-2HG,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AL627309.1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AL627309.3,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AL627309.5,0.0,0.0,0.0,0.000000,0.0,0.0,0.711225,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AP006222.2,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AL592183.1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.448062,0.0,0.26238,0.243586,0.0
AC240274.1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.243586,0.0
AC004556.3,0.0,0.0,0.0,0.301522,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AC007325.4,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.248919,0.0,0.00000,0.000000,0.0


In [6]:
# Compute the intersection
intersection = list(set(f_gene_names).intersection(set(data_genes)))
print(intersection)

['TPP1', 'SRPRA', 'IGFBP1', 'HERPUD1', 'XBP1', 'EXOSC3', 'EIF2S1', 'LMNA', 'CEBPB', 'ARFGAP1', 'ERN1', 'CXCL8', 'DCP2', 'SRPRB', 'SSR1', 'DCSTAMP', 'EDEM1', 'KHSRP', 'SEC31A', 'ATP6V0D1', 'EIF2AK3', 'PARN', 'KLHDC3', 'EXOSC7', 'GSK3A', 'CREB3L1', 'GOSR2', 'EIF2S2', 'CREB3L4', 'EXOSC1', 'PREB', 'TLN1', 'EXTL1', 'EXOSC4', 'SULT1A3', 'ACADVL', 'PDIA5', 'DDIT3', 'CCL2', 'ATF3', 'DIS3', 'FKBP14', 'MYDGF', 'CREBRF', 'EXOSC6', 'CXXC1', 'PLA2G4B', 'SERP1', 'ASNS', 'NFYA', 'EXTL2', 'HSP90B1', 'CREB3', 'WIPI1', 'ATF4', 'DNAJB9', 'CUL7', 'DDX11', 'GFPT1', 'EXOSC5', 'EXOSC9', 'MBTPS1', 'SYVN1', 'CREB3L3', 'HYOU1', 'CREB3L2', 'WFS1', 'SHC1', 'HDGF', 'PPP2R5B', 'ATF6B', 'EXOSC8', 'EXOSC2', 'EXTL3', 'DNAJC3', 'ATF6', 'DCTN1', 'KDELR3', 'TATDN2', 'ADD1', 'CEBPG', 'NFYB', 'CTDSP2', 'PDIA6', 'YIF1A', 'NFYC', 'DNAJB11', 'HSPA5', 'CALR', 'ZBTB17']


In [7]:
len(intersection)

90

In [8]:
# Compute the genes that don't overlap in the other direction
difference_reverse = list(set(data_genes).difference(set(f_gene_names)))
print(difference_reverse)

['EIF2S3', 'MBTPS2', 'TSPYL2']


In [9]:
len(difference_reverse)

3

- TSPYL1 TSPYL4 TSPYL5 TSPYL6 are in the matrix. But TSPYL2 is not there.
- EIF2S1 EIF2S2 EIF2S3B are in the matrix. But EIF2S3 is not there.
- MBTPS1 is in the matrix. But MBTPS2 is not there.

So I assume their absence simply means that they are not expressed in the data. 3 genes are thus missing from the dataset (TSPYL2, EIF2S3, MBTPS2).

In [10]:
# Run AUCell
# 1. Rankings (~30mn)
start_time = time.time()

rnk_mtx = create_rankings(ex_matrix.transpose(), seed=42)

print(f"Ranking (AUCell) computing time: {(time.time() - start_time):.2f} seconds")

Ranking (AUCell) computing time: 1738.49 seconds


In [11]:
# 2. Enrichment (~5s)
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
0950_240109_AAACCCAAGACATCCT,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.040437
0950_240109_AAACCCACACCGTACG,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.009242
0950_240109_AAACCCAGTAATCAGA,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.013810
0950_240109_AAACCCATCACTCTTA,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.009175
0950_240109_AAACGAAAGGTTACCT,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.039547
...,...,...
6052_200709_TTTGGTTTCGACACTA,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.013766
6052_200709_TTTGGTTTCGCAGTTA,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.047697
6052_200709_TTTGTTGCAAGCGATG,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.005122
6052_200709_TTTGTTGCAAGGTCAG,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.019590


In [12]:
min(aucs["AUC"])

0.0

In [13]:
max(aucs["AUC"])

0.12424854194706146

In [14]:
# Save DataFrame to TSV file
aucs.to_csv("/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging_90_UPR_AUCell_auc.tsv", sep='\t', index=True)