# Kellis Alzheimer paper - AUCell UPR scoring
*Run this notebook before the Rmd script: it computes the AUCell AUC values with pySCENIC.*

**Author:** Vincent Gardeux

**Date Created:** 22/04/2025
**Date Last Modified:** 22/04/2025

In [1]:
# import dependencies
import pandas as pd
import polars as pl
import numpy as np
import anndata as ad
import time
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [2]:
# Plain-text gene list for the REACTOME "Unfolded Protein Response (UPR)" gene set.
# From https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/REACTOME_UNFOLDED_PROTEIN_RESPONSE_UPR.html
GENE_SIGNATURE_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/UPR_genes_REACTOME_human.txt'
# Short label for the signature (not referenced by the other cells visible in this notebook).
# NOTE(review): the earlier comment here said all genes are present in Martirosyan's data;
# for the dataset loaded in this notebook the overlap check finds 92 of the 93 genes (MYDGF is missing).
GENE_SIGNATURE_SNAME = '93 UPR genes'
# Expression data in h5ad format, downloaded from Synapse.
EXPRESSION_H5AD_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Kellis_2024/Gene Expression (snRNAseq - 10x) processed, multi-region/all_brain_regions_filt_preprocessed_scanpy_fullmatrix.h5ad'

In [3]:
# Read the signature file as a header-less table and keep its first column as a plain list
signature_table = pd.read_csv(GENE_SIGNATURE_FNAME, header=None)
data_genes = signature_table[0].tolist()

# Wrap the gene list in a GeneSignature object, the format expected by AUCell's enrichment step
gs = GeneSignature(
    'REACTOME - UNFOLDED PROTEIN RESPONSE (UPR)',
    data_genes,
)
gs

GeneSignature(name='REACTOME - UNFOLDED PROTEIN RESPONSE (UPR)', gene2weight=frozendict.frozendict({'ACADVL': 1.0, 'ADD1': 1.0, 'ARFGAP1': 1.0, 'ASNS': 1.0, 'ATF3': 1.0, 'ATF4': 1.0, 'ATF6': 1.0, 'ATF6B': 1.0, 'ATP6V0D1': 1.0, 'CALR': 1.0, 'CCL2': 1.0, 'CEBPB': 1.0, 'CEBPG': 1.0, 'CREB3': 1.0, 'CREB3L1': 1.0, 'CREB3L2': 1.0, 'CREB3L3': 1.0, 'CREB3L4': 1.0, 'CREBRF': 1.0, 'CTDSP2': 1.0, 'CUL7': 1.0, 'CXCL8': 1.0, 'CXXC1': 1.0, 'DCP2': 1.0, 'DCSTAMP': 1.0, 'DCTN1': 1.0, 'DDIT3': 1.0, 'DDX11': 1.0, 'DIS3': 1.0, 'DNAJB11': 1.0, 'DNAJB9': 1.0, 'DNAJC3': 1.0, 'EDEM1': 1.0, 'EIF2AK3': 1.0, 'EIF2S1': 1.0, 'EIF2S2': 1.0, 'EIF2S3': 1.0, 'ERN1': 1.0, 'EXOSC1': 1.0, 'EXOSC2': 1.0, 'EXOSC3': 1.0, 'EXOSC4': 1.0, 'EXOSC5': 1.0, 'EXOSC6': 1.0, 'EXOSC7': 1.0, 'EXOSC8': 1.0, 'EXOSC9': 1.0, 'EXTL1': 1.0, 'EXTL2': 1.0, 'EXTL3': 1.0, 'FKBP14': 1.0, 'GFPT1': 1.0, 'GOSR2': 1.0, 'GSK3A': 1.0, 'HDGF': 1.0, 'HERPUD1': 1.0, 'HSP90B1': 1.0, 'HSPA5': 1.0, 'HYOU1': 1.0, 'IGFBP1': 1.0, 'KDELR3': 1.0, 'KHSRP': 1.0, '

In [4]:
# Size of the signature (number of entries held by the GeneSignature object)
len(gs)

93

In [5]:
# [Input] Read the AnnData object from the h5ad file
f_h5ad = ad.read_h5ad(EXPRESSION_H5AD_FNAME)

# Labels for both axes of the expression matrix
f_cell_names = f_h5ad.obs_names.tolist()  # Cell names (AnnData observations)
f_gene_names = f_h5ad.var_names.tolist()  # Gene names (AnnData variables)

# Sparse DataFrame built from the transposed matrix: genes as rows, cells as columns
# (33538 genes x 2663736 cells according to the original author's note)
ex_matrix = pd.DataFrame.sparse.from_spmatrix(
    f_h5ad.X.T,
    index=f_gene_names,
    columns=f_cell_names,
)
ex_matrix

Unnamed: 0,AG_AAACCCACAGATAAAC-1,AG_AAACGAAAGGCCACCT-1,AG_AAACGAACACAAATAG-1,AG_AAACGAATCCACAGGC-1,AG_AAACGCTCAAACACGG-1,AG_AAACGCTCAGAATCGG-1,AG_AAACGCTTCTGTTCAT-1,AG_AAAGGGCAGCTAATGA-1,AG_AAAGGGCTCGCTTGAA-1,AG_AAAGGTACAGACCCGT-1,...,TH_TTTGACTGTGCCTAAT-47,TH_TTTGATCAGCAAATGT-47,TH_TTTGGAGAGCTAGATA-47,TH_TTTGGAGGTCTCCCTA-47,TH_TTTGGAGTCATTTCGT-47,TH_TTTGGTTAGTACAGCG-47,TH_TTTGGTTGTTACAGCT-47,TH_TTTGTTGCACCTCTGT-47,TH_TTTGTTGGTATGCTAC-47,TH_TTTGTTGGTCGGATTT-47
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL645608.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4L,3.0,3.0,9.0,12.0,1.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
MT-ND4,59.0,32.0,200.0,173.0,12.0,5.0,4.0,1.0,88.0,7.0,...,0.0,2.0,2.0,8.0,0.0,11.0,4.0,0.0,4.0,2.0
MT-ND5,6.0,3.0,15.0,21.0,1.0,0.0,0.0,1.0,17.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0
MT-ND6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Compute the intersection
intersection = list(set(f_gene_names).intersection(set(data_genes)))
print(intersection)

['HYOU1', 'ATF3', 'FKBP14', 'PPP2R5B', 'WIPI1', 'EIF2AK3', 'EXOSC6', 'DNAJB11', 'EXOSC9', 'KDELR3', 'CREB3L4', 'SERP1', 'SRPRA', 'CREB3L1', 'TSPYL2', 'DDX11', 'TPP1', 'EXOSC5', 'NFYB', 'CUL7', 'KLHDC3', 'DCP2', 'ADD1', 'ASNS', 'EXOSC2', 'GSK3A', 'IGFBP1', 'CXCL8', 'MBTPS1', 'SSR1', 'CREB3L3', 'XBP1', 'GFPT1', 'SHC1', 'YIF1A', 'HDGF', 'DCSTAMP', 'NFYA', 'NFYC', 'ATF6', 'PARN', 'ATF4', 'CREB3', 'PDIA5', 'DCTN1', 'PREB', 'HSPA5', 'ACADVL', 'PDIA6', 'HSP90B1', 'CREBRF', 'EXOSC3', 'ARFGAP1', 'SEC31A', 'EXOSC7', 'WFS1', 'EXTL3', 'EXTL1', 'MBTPS2', 'TLN1', 'SYVN1', 'KHSRP', 'DDIT3', 'EXOSC4', 'ATP6V0D1', 'CEBPB', 'EIF2S1', 'CCL2', 'CEBPG', 'EIF2S2', 'PLA2G4B', 'CREB3L2', 'DNAJC3', 'EXOSC1', 'TATDN2', 'CXXC1', 'EXOSC8', 'GOSR2', 'SRPRB', 'CTDSP2', 'SULT1A3', 'EXTL2', 'CALR', 'DNAJB9', 'EDEM1', 'LMNA', 'EIF2S3', 'DIS3', 'ATF6B', 'HERPUD1', 'ZBTB17', 'ERN1']


In [7]:
# Number of signature genes that were found among the expression matrix gene names
len(intersection)

92

In [8]:
# Compute the genes that don't overlap in the other direction
difference_reverse = list(set(data_genes).difference(set(f_gene_names)))
print(difference_reverse)

['MYDGF']


One gene (MYDGF) is missing from this dataset, although it is present in the 33538-gene list from the Seurat objects.

In [9]:
# Run AUCell
# 1. Rankings (~1h26)
# Elapsed time is measured with the monotonic perf_counter clock so that a system
# clock adjustment during this long-running step cannot distort the reported duration.
start_time = time.perf_counter()

# ex_matrix is genes x cells, so it is transposed before being passed to create_rankings.
# The seed is fixed (presumably so the ranking is reproducible between runs - confirm in the pySCENIC docs).
rnk_mtx = create_rankings(ex_matrix.transpose(), seed=42)

elapsed_seconds = time.perf_counter() - start_time
print(f"Ranking (AUCell) computing time: {elapsed_seconds:.2f} seconds")

Ranking (AUCell) computing time: 5190.02 seconds


In [14]:
# 2. Enrichment (~5s)
# Score the UPR signature against the per-cell rankings computed in the previous step.
# The displayed result is a DataFrame indexed by (Cell, Regulon) with a single "AUC" column.
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
AG_AAACCCACAGATAAAC-1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.003976
AG_AAACGAAAGGCCACCT-1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.006756
AG_AAACGAACACAAATAG-1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.000000
AG_AAACGAATCCACAGGC-1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.004251
AG_AAACGCTCAAACACGG-1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.023543
...,...,...
TH_TTTGGTTAGTACAGCG-47,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.017856
TH_TTTGGTTGTTACAGCT-47,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.024876
TH_TTTGTTGCACCTCTGT-47,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.011789
TH_TTTGTTGGTATGCTAC-47,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.009135


In [11]:
# Smallest AUC over all cells. The pandas reduction is vectorised (the builtin min()
# walks the Series element by element in Python) and skips NaN values by default.
aucs["AUC"].min()

0.0

In [12]:
# Largest AUC over all cells. The pandas reduction is vectorised (the builtin max()
# walks the Series element by element in Python) and skips NaN values by default.
aucs["AUC"].max()

0.14719873150105708

In [13]:
# Write the AUC table to disk as tab-separated text, keeping the index in the file
aucs.to_csv(
    path_or_buf="/data/gardeux/Neuro_Droso_ND75KD/data/Kellis_92_UPR_AUCell_auc.tsv",
    sep='\t',
    index=True,
)