# GSE243639 - Martirosyan - AUCell UPR scoring
*This script is to be run before the Rmd script, for computing the AUC values with pySCENIC* 

**Author:** Vincent Gardeux

**Date Created:** 04/03/2025
**Date Last Modified:** 04/03/2025

In [1]:
# import dependencies
import pandas as pd
import polars as pl
import numpy as np
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [2]:
GENE_SIGNATURE_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/UPR_genes_REACTOME_human.txt' # From https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/REACTOME_UNFOLDED_PROTEIN_RESPONSE_UPR.html
GENE_SIGNATURE_SNAME = '93 UPR genes' # All are present in Martirosyan's data
EXPRESSION_MTX_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_2024/GSE243639_Filtered_count_table.csv.gz' # From https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE243639

In [3]:
data_genes = pd.read_csv(filepath_or_buffer=GENE_SIGNATURE_FNAME, header=None)[0].tolist()
gs = GeneSignature('REACTOME - UNFOLDED PROTEIN RESPONSE (UPR)', data_genes)
gs

GeneSignature(name='REACTOME - UNFOLDED PROTEIN RESPONSE (UPR)', gene2weight=frozendict.frozendict({'ACADVL': 1.0, 'ADD1': 1.0, 'ARFGAP1': 1.0, 'ASNS': 1.0, 'ATF3': 1.0, 'ATF4': 1.0, 'ATF6': 1.0, 'ATF6B': 1.0, 'ATP6V0D1': 1.0, 'CALR': 1.0, 'CCL2': 1.0, 'CEBPB': 1.0, 'CEBPG': 1.0, 'CREB3': 1.0, 'CREB3L1': 1.0, 'CREB3L2': 1.0, 'CREB3L3': 1.0, 'CREB3L4': 1.0, 'CREBRF': 1.0, 'CTDSP2': 1.0, 'CUL7': 1.0, 'CXCL8': 1.0, 'CXXC1': 1.0, 'DCP2': 1.0, 'DCSTAMP': 1.0, 'DCTN1': 1.0, 'DDIT3': 1.0, 'DDX11': 1.0, 'DIS3': 1.0, 'DNAJB11': 1.0, 'DNAJB9': 1.0, 'DNAJC3': 1.0, 'EDEM1': 1.0, 'EIF2AK3': 1.0, 'EIF2S1': 1.0, 'EIF2S2': 1.0, 'EIF2S3': 1.0, 'ERN1': 1.0, 'EXOSC1': 1.0, 'EXOSC2': 1.0, 'EXOSC3': 1.0, 'EXOSC4': 1.0, 'EXOSC5': 1.0, 'EXOSC6': 1.0, 'EXOSC7': 1.0, 'EXOSC8': 1.0, 'EXOSC9': 1.0, 'EXTL1': 1.0, 'EXTL2': 1.0, 'EXTL3': 1.0, 'FKBP14': 1.0, 'GFPT1': 1.0, 'GOSR2': 1.0, 'GSK3A': 1.0, 'HDGF': 1.0, 'HERPUD1': 1.0, 'HSP90B1': 1.0, 'HSPA5': 1.0, 'HYOU1': 1.0, 'IGFBP1': 1.0, 'KDELR3': 1.0, 'KHSRP': 1.0, '

In [4]:
len(gs)

93

In [5]:
# Open CSV count matrix
m = pl.read_csv(EXPRESSION_MTX_FNAME)

# Extract gene names (first column) and cell names (column headers)
gene_names = m[:, 0].to_list()  # First column as list of strings
cell_names = m.columns[1:]      # Column names (excluding first column)

m

Unnamed: 0_level_0,s.0096_AAACCCAAGTACGAGC.1,s.0096_AAACCCACACAGCGCT.1,s.0096_AAACCCACAGATAAAC.1,s.0096_AAACCCAGTCCGGATC.1,s.0096_AAACCCAGTCTCTCAC.1,s.0096_AAACCCAGTGAATTGA.1,s.0096_AAACCCAGTTTACGAC.1,s.0096_AAACCCATCATGGAGG.1,s.0096_AAACCCATCCGTGCGA.1,s.0096_AAACCCATCGCGAAGA.1,s.0096_AAACCCATCTCCTGTG.1,s.0096_AAACGAACAATCGCGC.1,s.0096_AAACGAACACGGTGTC.1,s.0096_AAACGAATCACATCAG.1,s.0096_AAACGAATCTACCCAC.1,s.0096_AAACGCTAGACGACTG.1,s.0096_AAACGCTCATCGGTTA.1,s.0096_AAACGCTTCACGAACT.1,s.0096_AAACGCTTCGGAGTAG.1,s.0096_AAAGAACAGAGTTGTA.1,s.0096_AAAGAACCATAATCGC.1,s.0096_AAAGGATAGACGATAT.1,s.0096_AAAGGATAGTAAAGCT.1,s.0096_AAAGGATCACGACAGA.1,s.0096_AAAGGATGTAACGGTG.1,s.0096_AAAGGATGTAGCTGTT.1,s.0096_AAAGGATTCACAAGAA.1,s.0096_AAAGGGCAGGCCTTGC.1,s.0096_AAAGGGCCAAGACGAC.1,s.0096_AAAGGGCGTCGTTCAA.1,s.0096_AAAGGTAAGACTTAAG.1,s.0096_AAAGGTAGTATCTTCT.1,s.0096_AAAGGTATCGCGATCG.1,s.0096_AAAGTCCAGAAATGGG.1,s.0096_AAAGTCCAGATACCAA.1,s.0096_AAAGTCCAGCATCTTG.1,…,s.0165_TTTACTGCATTGCTGA.1,s.0165_TTTACTGTCTGGTGGC.1,s.0165_TTTAGTCCAAGCGCAA.1,s.0165_TTTAGTCCAATAGTCC.1,s.0165_TTTAGTCTCCCAACTC.1,s.0165_TTTAGTCTCGGAAGGT.1,s.0165_TTTATGCTCCGTAGTA.1,s.0165_TTTCACAAGGGCCAAT.1,s.0165_TTTCACAAGTCACTGT.1,s.0165_TTTCACACAAAGGCTG.1,s.0165_TTTCAGTAGGATACCG.1,s.0165_TTTCAGTAGGTTTGAA.1,s.0165_TTTCAGTGTCTTCCGT.1,s.0165_TTTCATGAGAGATTCA.1,s.0165_TTTCATGAGCGACTGA.1,s.0165_TTTCATGAGGGACCAT.1,s.0165_TTTCATGCACCAGCCA.1,s.0165_TTTCATGTCGCCTCTA.1,s.0165_TTTCATGTCGGCATTA.1,s.0165_TTTCCTCTCGTTGTAG.1,s.0165_TTTCGATCATCAGCGC.1,s.0165_TTTCGATTCGGTCATA.1,s.0165_TTTGACTAGAGCATAT.1,s.0165_TTTGACTCAATGTCTG.1,s.0165_TTTGACTGTGATTCTG.1,s.0165_TTTGACTTCCATCTCG.1,s.0165_TTTGATCAGGTCCGAA.1,s.0165_TTTGATCCAGAGGTTG.1,s.0165_TTTGGTTAGCGCCTTG.1,s.0165_TTTGGTTCAAACTAAG.1,s.0165_TTTGGTTCATGCCGGT.1,s.0165_TTTGGTTGTAGAGATT.1,s.0165_TTTGGTTTCCGCCTAT.1,s.0165_TTTGGTTTCTCGTTTA.1,s.0165_TTTGTTGCAAGTTCGT.1,s.0165_TTTGTTGCATGTTCAG.1,s.0165_TTTGTTGCATTACTCT.1
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""MIR1302-2HG""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""FAM138A""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""OR4F5""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AL627309.1""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AL627309.3""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""AC233755.2""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AC233755.1""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AC240274.1""",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
"""AC213203.1""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
m.shape

(33537, 83485)

In [7]:
len(cell_names)

83484

In [8]:
len(gene_names)

33537

In [9]:
ex_matrix = pd.DataFrame(m[:, 1:], columns = cell_names, index = gene_names)
ex_matrix

Unnamed: 0,s.0096_AAACCCAAGTACGAGC.1,s.0096_AAACCCACACAGCGCT.1,s.0096_AAACCCACAGATAAAC.1,s.0096_AAACCCAGTCCGGATC.1,s.0096_AAACCCAGTCTCTCAC.1,s.0096_AAACCCAGTGAATTGA.1,s.0096_AAACCCAGTTTACGAC.1,s.0096_AAACCCATCATGGAGG.1,s.0096_AAACCCATCCGTGCGA.1,s.0096_AAACCCATCGCGAAGA.1,...,s.0165_TTTGATCCAGAGGTTG.1,s.0165_TTTGGTTAGCGCCTTG.1,s.0165_TTTGGTTCAAACTAAG.1,s.0165_TTTGGTTCATGCCGGT.1,s.0165_TTTGGTTGTAGAGATT.1,s.0165_TTTGGTTTCCGCCTAT.1,s.0165_TTTGGTTTCTCGTTTA.1,s.0165_TTTGTTGCAAGTTCGT.1,s.0165_TTTGTTGCATGTTCAG.1,s.0165_TTTGTTGCATTACTCT.1
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC233755.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC233755.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC240274.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC213203.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Compute the intersection
intersection = list(set(gene_names).intersection(set(data_genes)))
print(intersection)

['EXOSC5', 'FKBP14', 'CREB3L1', 'ERN1', 'ATF4', 'EXOSC7', 'TSPYL2', 'EIF2AK3', 'PDIA6', 'ADD1', 'CEBPB', 'HSP90B1', 'SERP1', 'EXOSC1', 'CREBRF', 'ACADVL', 'PREB', 'SRPRA', 'EXTL2', 'EXOSC8', 'TPP1', 'GOSR2', 'ATF6', 'WIPI1', 'SULT1A3', 'CCL2', 'SEC31A', 'HYOU1', 'DNAJB11', 'PPP2R5B', 'EXTL1', 'GSK3A', 'PARN', 'ZBTB17', 'KLHDC3', 'EXOSC4', 'EIF2S2', 'DCP2', 'ATF6B', 'DIS3', 'DNAJC3', 'SYVN1', 'NFYC', 'CTDSP2', 'PLA2G4B', 'CREB3L2', 'CXXC1', 'EDEM1', 'KHSRP', 'GFPT1', 'EXOSC6', 'DNAJB9', 'CREB3L4', 'HSPA5', 'SSR1', 'TATDN2', 'PDIA5', 'YIF1A', 'CALR', 'EXOSC3', 'XBP1', 'MYDGF', 'DDIT3', 'EXOSC2', 'CUL7', 'DCSTAMP', 'EIF2S1', 'EIF2S3', 'WFS1', 'SRPRB', 'EXOSC9', 'ARFGAP1', 'EXTL3', 'CREB3', 'DCTN1', 'CREB3L3', 'KDELR3', 'LMNA', 'CXCL8', 'IGFBP1', 'HDGF', 'CEBPG', 'DDX11', 'HERPUD1', 'SHC1', 'ASNS', 'ATF3', 'MBTPS2', 'MBTPS1', 'TLN1', 'NFYB', 'ATP6V0D1', 'NFYA']


In [11]:
len(intersection)

93

In [12]:
# Compute the genes that don't overlap in the other direction
difference_reverse = list(set(data_genes).difference(set(gene_names)))
print(difference_reverse)

[]


No gene is missing from the dataset.

In [13]:
# Run AUCell
# 1. Rankings (~2mn)
rnk_mtx = create_rankings(ex_matrix.transpose(), seed=42)

In [14]:
# 2. Enrichment (~5s)
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
s.0096_AAACCCAAGTACGAGC.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.039318
s.0096_AAACCCACACAGCGCT.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.029841
s.0096_AAACCCACAGATAAAC.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.014863
s.0096_AAACCCAGTCCGGATC.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.014369
s.0096_AAACCCAGTCTCTCAC.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.012946
...,...,...
s.0165_TTTGGTTTCCGCCTAT.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.029302
s.0165_TTTGGTTTCTCGTTTA.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.032412
s.0165_TTTGTTGCAAGTTCGT.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.030181
s.0165_TTTGTTGCATGTTCAG.1,REACTOME - UNFOLDED PROTEIN RESPONSE (UPR),0.019088


In [15]:
min(aucs["AUC"])

0.0

In [16]:
max(aucs["AUC"])

0.18932297176858318

In [17]:
# Save DataFrame to TSV file
aucs.to_csv("/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_93_UPR_AUCell_auc.tsv", sep='\t', index=True)