# GSE243639 - Martirosyan - AUCell Oxphos scoring
*Run this notebook before the Rmd script: it computes the AUCell AUC values with pySCENIC.*

**Author:** Vincent Gardeux

**Date Created:** 27/02/2025
**Date Last Modified:** 27/02/2025

In [1]:
# import dependencies
import pandas as pd
import polars as pl
import numpy as np
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [2]:
# Directory holding the input files referenced below; edit this single line to relocate the data.
DATA_DIR = '/data/gardeux/Neuro_Droso_ND75KD/data'

# KEGG oxidative phosphorylation gene list, from https://www.genome.jp/entry/hsa00190
GENE_SIGNATURE_FNAME = DATA_DIR + '/oxphos_genes_KEGG_hsa.txt'
# Short label for the signature; 1 gene is not present in Martirosyan's data
GENE_SIGNATURE_SNAME = '135 (+1) OXPHOS genes'
# Filtered count table, from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE243639
EXPRESSION_MTX_FNAME = DATA_DIR + '/Martirosyan_2024/GSE243639_Filtered_count_table.csv.gz'

In [3]:
# Read the header-less gene list and keep its first column as a plain Python list
signature_table = pd.read_csv(GENE_SIGNATURE_FNAME, header=None)
data_genes = signature_table[0].tolist()

# Wrap the gene list in a named signature object for AUCell, then display it
gs = GeneSignature('KEGG hsa00190 - Oxidative phosphorylation', data_genes)
gs

GeneSignature(name='KEGG hsa00190 - Oxidative phosphorylation', gene2weight=frozendict.frozendict({'MT-ND1': 1.0, 'MT-ND2': 1.0, 'MT-ND3': 1.0, 'MT-ND4': 1.0, 'MT-ND4L': 1.0, 'MT-ND5': 1.0, 'MT-ND6': 1.0, 'NDUFS1': 1.0, 'NDUFS2': 1.0, 'NDUFS3': 1.0, 'NDUFS4': 1.0, 'NDUFS5': 1.0, 'NDUFS6': 1.0, 'NDUFS7': 1.0, 'NDUFS8': 1.0, 'NDUFV1': 1.0, 'NDUFV2': 1.0, 'NDUFV3': 1.0, 'NDUFA1': 1.0, 'NDUFA2': 1.0, 'NDUFA3': 1.0, 'NDUFA4': 1.0, 'NDUFA4L2': 1.0, 'NDUFA5': 1.0, 'NDUFA6': 1.0, 'NDUFA7': 1.0, 'NDUFA8': 1.0, 'NDUFA9': 1.0, 'NDUFA10': 1.0, 'NDUFAB1': 1.0, 'NDUFA11': 1.0, 'NDUFA12': 1.0, 'NDUFA13': 1.0, 'NDUFB1': 1.0, 'NDUFB2': 1.0, 'NDUFB3': 1.0, 'NDUFB4': 1.0, 'NDUFB5': 1.0, 'NDUFB6': 1.0, 'NDUFB7': 1.0, 'NDUFB8': 1.0, 'NDUFB9': 1.0, 'NDUFB10': 1.0, 'NDUFB11': 1.0, 'NDUFC1': 1.0, 'NDUFC2-KCTD14': 1.0, 'NDUFC2': 1.0, 'SDHA': 1.0, 'SDHB': 1.0, 'SDHC': 1.0, 'SDHD': 1.0, 'UQCRFS1': 1.0, 'MT-CYB': 1.0, 'CYC1': 1.0, 'UQCRC1': 1.0, 'UQCRC2': 1.0, 'UQCRHL': 1.0, 'UQCRH': 1.0, 'UQCRB': 1.0, 'UQCRQ': 1

In [4]:
# Size of the gene signature (number of genes it holds)
len(gs)

136

In [5]:
# Load the gzipped CSV count table with polars
m = pl.read_csv(EXPRESSION_MTX_FNAME)

# Every column header after the first is a cell name ...
cell_names = m.columns[1:]
# ... and the first column holds the gene names (as a list of strings)
gene_names = m.to_series(0).to_list()

m

Unnamed: 0_level_0,s.0096_AAACCCAAGTACGAGC.1,s.0096_AAACCCACACAGCGCT.1,s.0096_AAACCCACAGATAAAC.1,s.0096_AAACCCAGTCCGGATC.1,s.0096_AAACCCAGTCTCTCAC.1,s.0096_AAACCCAGTGAATTGA.1,s.0096_AAACCCAGTTTACGAC.1,s.0096_AAACCCATCATGGAGG.1,s.0096_AAACCCATCCGTGCGA.1,s.0096_AAACCCATCGCGAAGA.1,s.0096_AAACCCATCTCCTGTG.1,s.0096_AAACGAACAATCGCGC.1,s.0096_AAACGAACACGGTGTC.1,s.0096_AAACGAATCACATCAG.1,s.0096_AAACGAATCTACCCAC.1,s.0096_AAACGCTAGACGACTG.1,s.0096_AAACGCTCATCGGTTA.1,s.0096_AAACGCTTCACGAACT.1,s.0096_AAACGCTTCGGAGTAG.1,s.0096_AAAGAACAGAGTTGTA.1,s.0096_AAAGAACCATAATCGC.1,s.0096_AAAGGATAGACGATAT.1,s.0096_AAAGGATAGTAAAGCT.1,s.0096_AAAGGATCACGACAGA.1,s.0096_AAAGGATGTAACGGTG.1,s.0096_AAAGGATGTAGCTGTT.1,s.0096_AAAGGATTCACAAGAA.1,s.0096_AAAGGGCAGGCCTTGC.1,s.0096_AAAGGGCCAAGACGAC.1,s.0096_AAAGGGCGTCGTTCAA.1,s.0096_AAAGGTAAGACTTAAG.1,s.0096_AAAGGTAGTATCTTCT.1,s.0096_AAAGGTATCGCGATCG.1,s.0096_AAAGTCCAGAAATGGG.1,s.0096_AAAGTCCAGATACCAA.1,s.0096_AAAGTCCAGCATCTTG.1,…,s.0165_TTTACTGCATTGCTGA.1,s.0165_TTTACTGTCTGGTGGC.1,s.0165_TTTAGTCCAAGCGCAA.1,s.0165_TTTAGTCCAATAGTCC.1,s.0165_TTTAGTCTCCCAACTC.1,s.0165_TTTAGTCTCGGAAGGT.1,s.0165_TTTATGCTCCGTAGTA.1,s.0165_TTTCACAAGGGCCAAT.1,s.0165_TTTCACAAGTCACTGT.1,s.0165_TTTCACACAAAGGCTG.1,s.0165_TTTCAGTAGGATACCG.1,s.0165_TTTCAGTAGGTTTGAA.1,s.0165_TTTCAGTGTCTTCCGT.1,s.0165_TTTCATGAGAGATTCA.1,s.0165_TTTCATGAGCGACTGA.1,s.0165_TTTCATGAGGGACCAT.1,s.0165_TTTCATGCACCAGCCA.1,s.0165_TTTCATGTCGCCTCTA.1,s.0165_TTTCATGTCGGCATTA.1,s.0165_TTTCCTCTCGTTGTAG.1,s.0165_TTTCGATCATCAGCGC.1,s.0165_TTTCGATTCGGTCATA.1,s.0165_TTTGACTAGAGCATAT.1,s.0165_TTTGACTCAATGTCTG.1,s.0165_TTTGACTGTGATTCTG.1,s.0165_TTTGACTTCCATCTCG.1,s.0165_TTTGATCAGGTCCGAA.1,s.0165_TTTGATCCAGAGGTTG.1,s.0165_TTTGGTTAGCGCCTTG.1,s.0165_TTTGGTTCAAACTAAG.1,s.0165_TTTGGTTCATGCCGGT.1,s.0165_TTTGGTTGTAGAGATT.1,s.0165_TTTGGTTTCCGCCTAT.1,s.0165_TTTGGTTTCTCGTTTA.1,s.0165_TTTGTTGCAAGTTCGT.1,s.0165_TTTGTTGCATGTTCAG.1,s.0165_TTTGTTGCATTACTCT.1
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""MIR1302-2HG""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""FAM138A""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""OR4F5""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AL627309.1""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AL627309.3""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""AC233755.2""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AC233755.1""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"""AC240274.1""",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
"""AC213203.1""",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# (rows, columns) of the raw table: one row per gene, one gene-name column plus one column per cell
m.shape

(33537, 83485)

In [7]:
# Number of cells (table columns minus the leading gene-name column)
len(cell_names)

83484

In [8]:
# Number of genes (one per row of the count table)
len(gene_names)

33537

In [9]:
# Build the genes x cells pandas DataFrame used for the AUCell ranking step.
# The polars count columns are converted to a NumPy array explicitly, instead of
# relying on the pandas constructor to coerce a foreign DataFrame object implicitly.
ex_matrix = pd.DataFrame(
    m[:, 1:].to_numpy(),   # drop the gene-name column, keep the counts only
    index=gene_names,      # rows are labelled by gene
    columns=cell_names,    # columns are labelled by cell
)
ex_matrix

Unnamed: 0,s.0096_AAACCCAAGTACGAGC.1,s.0096_AAACCCACACAGCGCT.1,s.0096_AAACCCACAGATAAAC.1,s.0096_AAACCCAGTCCGGATC.1,s.0096_AAACCCAGTCTCTCAC.1,s.0096_AAACCCAGTGAATTGA.1,s.0096_AAACCCAGTTTACGAC.1,s.0096_AAACCCATCATGGAGG.1,s.0096_AAACCCATCCGTGCGA.1,s.0096_AAACCCATCGCGAAGA.1,...,s.0165_TTTGATCCAGAGGTTG.1,s.0165_TTTGGTTAGCGCCTTG.1,s.0165_TTTGGTTCAAACTAAG.1,s.0165_TTTGGTTCATGCCGGT.1,s.0165_TTTGGTTGTAGAGATT.1,s.0165_TTTGGTTTCCGCCTAT.1,s.0165_TTTGGTTTCTCGTTTA.1,s.0165_TTTGTTGCAAGTTCGT.1,s.0165_TTTGTTGCATGTTCAG.1,s.0165_TTTGTTGCATTACTCT.1
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC233755.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC233755.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC240274.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC213203.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Signature genes that are also present in the count table
intersection = list(set(gene_names) & set(data_genes))
print(intersection)

['UQCRC2', 'ATP6V0E2', 'NDUFS7', 'ATP6V1G1', 'ATP6AP1', 'NDUFA8', 'NDUFA6', 'UQCRB', 'ATP5MC2', 'COX10', 'ATP5MF', 'ATP5F1C', 'NDUFB9', 'COX6C', 'ATP6V0C', 'COX5B', 'NDUFA4L2', 'ATP6V1C2', 'ATP5MG', 'COX7A2', 'NDUFV1', 'NDUFC2', 'NDUFB2', 'UQCRHL', 'COX4I1', 'ATP5F1B', 'MT-ND3', 'MT-ND6', 'COX4I2', 'ATP6V1G2', 'NDUFA9', 'ATP6V1B1', 'ATP5PB', 'COX7C', 'NDUFS1', 'NDUFS8', 'UQCRFS1', 'NDUFA10', 'ATP5PD', 'NDUFB7', 'NDUFA1', 'COX15', 'ATP5ME', 'COX6A2', 'MT-ATP6', 'NDUFB4', 'COX6B1', 'MT-ATP8', 'NDUFB3', 'ATP6V1H', 'ATP5MC3', 'CYCS', 'NDUFB1', 'MT-ND4', 'COX7A2L', 'NDUFV2', 'NDUFA12', 'MT-CYB', 'NDUFA3', 'ATP5PF', 'COX8C', 'COX7B', 'ATP6V0A1', 'NDUFS5', 'NDUFAB1', 'ATP5F1E', 'NDUFA11', 'PPA1', 'ATP5PO', 'ATP6V0A4', 'COX6A1', 'COX5A', 'UQCR11', 'ATP5F1D', 'COX7B2', 'ATP6V0D2', 'COX17', 'NDUFS2', 'ATP6V0E1', 'MT-ND4L', 'NDUFA5', 'UQCRH', 'COX11', 'NDUFS3', 'NDUFB11', 'ATP6V0B', 'MT-CO3', 'UQCRQ', 'ATP6V0A2', 'ATP4B', 'LHPP', 'ATP4A', 'MT-ND1', 'ATP6V1C1', 'MT-CO2', 'CYC1', 'PPA2', 'COX6B2', 

In [11]:
# Number of signature genes found in the count table
len(intersection)

135

In [12]:
# Signature genes that have no matching entry among the count-table gene names
difference_reverse = list(set(data_genes) - set(gene_names))
print(difference_reverse)

['ATP6V1FP2']


This one gene (ATP6V1FP2) is missing from the dataset, so 135 of the 136 signature genes are present in the expression matrix.

In [13]:
# AUCell, step 1 of 2: rankings (~2 min)
# ex_matrix is genes x cells, so it is transposed before being handed to create_rankings.
# NOTE(review): seed=42 presumably makes the ranking reproducible — confirm in the pySCENIC docs.
rnk_mtx = create_rankings(ex_matrix.T, seed=42)

In [14]:
# 2. Enrichment (~5s)
# Scores the signature `gs` against the rankings computed in step 1.
# The displayed result has one row per (Cell, Regulon) pair and a single AUC column.
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
s.0096_AAACCCAAGTACGAGC.1,KEGG hsa00190 - Oxidative phosphorylation,0.075024
s.0096_AAACCCACACAGCGCT.1,KEGG hsa00190 - Oxidative phosphorylation,0.055434
s.0096_AAACCCACAGATAAAC.1,KEGG hsa00190 - Oxidative phosphorylation,0.042580
s.0096_AAACCCAGTCCGGATC.1,KEGG hsa00190 - Oxidative phosphorylation,0.055434
s.0096_AAACCCAGTCTCTCAC.1,KEGG hsa00190 - Oxidative phosphorylation,0.013353
...,...,...
s.0165_TTTGGTTTCCGCCTAT.1,KEGG hsa00190 - Oxidative phosphorylation,0.049551
s.0165_TTTGGTTTCTCGTTTA.1,KEGG hsa00190 - Oxidative phosphorylation,0.117591
s.0165_TTTGTTGCAAGTTCGT.1,KEGG hsa00190 - Oxidative phosphorylation,0.055562
s.0165_TTTGTTGCATGTTCAG.1,KEGG hsa00190 - Oxidative phosphorylation,0.058782


In [15]:
# Smallest AUC over all cells: vectorized pandas reduction instead of the builtin min(),
# which iterates the Series element by element; cast to a plain float for display.
float(aucs["AUC"].min())

0.0

In [16]:
# Largest AUC over all cells: vectorized pandas reduction instead of the builtin max(),
# which iterates the Series element by element; cast to a plain float for display.
float(aucs["AUC"].max())

0.415671724198856

In [17]:
# Write the AUC table, index included, to a tab-separated file
auc_tsv_path = "/data/gardeux/Neuro_Droso_ND75KD/data/Martirosyan_135_Oxphos_AUCell_auc.tsv"
aucs.to_csv(auc_tsv_path, sep='\t', index=True)