# Jeffries Human Brain Aging paper - AUCell Oxphos scoring
*This script is to be run before the Rmd script, for computing the AUC values with pySCENIC* 

**Author:** Vincent Gardeux

**Date Created:** 2025-11-17

**Date Last Modified:** 2025-11-17

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import anndata as ad
import time
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [2]:
GENE_SIGNATURE_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/oxphos_genes_KEGG_hsa.txt' # From https://www.genome.jp/entry/hsa00190
GENE_SIGNATURE_SNAME = '135 (+1) OXPHOS genes' # 1 is not present in Jeffries' data
EXPRESSION_H5AD_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging/pfc.clean.h5ad' # Built from pfc.clean.rds. Downloaded from https://publications.wenglab.org/SomaMut/Jeffries_Yu_BrainAging_2025/

In [3]:
data_genes = pd.read_csv(filepath_or_buffer=GENE_SIGNATURE_FNAME, header=None)[0].tolist()
gs = GeneSignature('KEGG hsa00190 - Oxidative phosphorylation', data_genes)
gs

GeneSignature(name='KEGG hsa00190 - Oxidative phosphorylation', gene2weight=frozendict.frozendict({'MT-ND1': 1.0, 'MT-ND2': 1.0, 'MT-ND3': 1.0, 'MT-ND4': 1.0, 'MT-ND4L': 1.0, 'MT-ND5': 1.0, 'MT-ND6': 1.0, 'NDUFS1': 1.0, 'NDUFS2': 1.0, 'NDUFS3': 1.0, 'NDUFS4': 1.0, 'NDUFS5': 1.0, 'NDUFS6': 1.0, 'NDUFS7': 1.0, 'NDUFS8': 1.0, 'NDUFV1': 1.0, 'NDUFV2': 1.0, 'NDUFV3': 1.0, 'NDUFA1': 1.0, 'NDUFA2': 1.0, 'NDUFA3': 1.0, 'NDUFA4': 1.0, 'NDUFA4L2': 1.0, 'NDUFA5': 1.0, 'NDUFA6': 1.0, 'NDUFA7': 1.0, 'NDUFA8': 1.0, 'NDUFA9': 1.0, 'NDUFA10': 1.0, 'NDUFAB1': 1.0, 'NDUFA11': 1.0, 'NDUFA12': 1.0, 'NDUFA13': 1.0, 'NDUFB1': 1.0, 'NDUFB2': 1.0, 'NDUFB3': 1.0, 'NDUFB4': 1.0, 'NDUFB5': 1.0, 'NDUFB6': 1.0, 'NDUFB7': 1.0, 'NDUFB8': 1.0, 'NDUFB9': 1.0, 'NDUFB10': 1.0, 'NDUFB11': 1.0, 'NDUFC1': 1.0, 'NDUFC2-KCTD14': 1.0, 'NDUFC2': 1.0, 'SDHA': 1.0, 'SDHB': 1.0, 'SDHC': 1.0, 'SDHD': 1.0, 'UQCRFS1': 1.0, 'MT-CYB': 1.0, 'CYC1': 1.0, 'UQCRC1': 1.0, 'UQCRC2': 1.0, 'UQCRHL': 1.0, 'UQCRH': 1.0, 'UQCRB': 1.0, 'UQCRQ': 1

In [4]:
len(gs)

136

In [5]:
# [Input] Load expression matrix from H5ad file
f_h5ad = ad.read_h5ad(EXPRESSION_H5AD_FNAME)
f_gene_names = f_h5ad.var_names.tolist()  # Gene names
f_cell_names = f_h5ad.obs_names.tolist()   # Cell names
ex_matrix = pd.DataFrame.sparse.from_spmatrix(f_h5ad.X.T, index=f_gene_names, columns=f_cell_names) # 33538 genes x 2663736 cells
ex_matrix

Unnamed: 0,0950_240109_AAACCCAAGACATCCT,0950_240109_AAACCCACACCGTACG,0950_240109_AAACCCAGTAATCAGA,0950_240109_AAACCCATCACTCTTA,0950_240109_AAACGAAAGGTTACCT,0950_240109_AAACGAAAGTATGACA,0950_240109_AAACGAACAGCGATTT,0950_240109_AAACGAATCCATCGTC,0950_240109_AAACGAATCTCATGCC,0950_240109_AAACGCTCAATTGCGT,...,6052_200709_TTTGGTTCATGTACGT,6052_200709_TTTGGTTCATGTCGTA,6052_200709_TTTGGTTGTACCGTCG,6052_200709_TTTGGTTGTATTGACC,6052_200709_TTTGGTTGTCAGTCCG,6052_200709_TTTGGTTTCGACACTA,6052_200709_TTTGGTTTCGCAGTTA,6052_200709_TTTGTTGCAAGCGATG,6052_200709_TTTGTTGCAAGGTCAG,6052_200709_TTTGTTGCAGCCGTTG
MIR1302-2HG,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AL627309.1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AL627309.3,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AL627309.5,0.0,0.0,0.0,0.000000,0.0,0.0,0.711225,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AP006222.2,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AL592183.1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.448062,0.0,0.26238,0.243586,0.0
AC240274.1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.243586,0.0
AC004556.3,0.0,0.0,0.0,0.301522,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AC007325.4,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.248919,0.0,0.00000,0.000000,0.0


In [6]:
# Compute the intersection
intersection = list(set(f_gene_names).intersection(set(data_genes)))
print(intersection)

['NDUFA10', 'COX4I1', 'ATP6V1B2', 'PPA1', 'COX4I2', 'NDUFB2', 'ATP5PD', 'NDUFC1', 'NDUFA9', 'ATP4A', 'COX5B', 'ATP5F1C', 'UQCRH', 'NDUFV3', 'NDUFB8', 'NDUFV2', 'ATP12A', 'TCIRG1', 'COX7B2', 'SDHC', 'COX10', 'NDUFS2', 'ATP5F1D', 'NDUFA11', 'ATP5MC3', 'ATP6V0A1', 'NDUFA4', 'ATP6V0A2', 'ATP5PO', 'NDUFA2', 'NDUFB10', 'COX8A', 'NDUFB9', 'ATP6V0C', 'ATP6V1C1', 'ATP6V0D2', 'ATP5PF', 'NDUFA3', 'ATP5ME', 'SDHA', 'NDUFS3', 'ATP5MF', 'ATP6V0A4', 'ATP6V1D', 'NDUFB5', 'NDUFS6', 'UQCRC1', 'NDUFB1', 'COX6A1', 'ATP6V1H', 'NDUFS7', 'ATP6V1B1', 'NDUFB3', 'NDUFS5', 'COX7A2L', 'ATP6V1C2', 'UQCR10', 'ATP5MGL', 'COX17', 'NDUFV1', 'NDUFB6', 'NDUFA13', 'COX6B1', 'COX11', 'COX7A1', 'NDUFS1', 'COX7C', 'COX6B2', 'ATP6V0E2', 'NDUFA12', 'ATP5PB', 'UQCRFS1', 'NDUFA8', 'COX8C', 'ATP5F1A', 'ATP6V1F', 'LHPP', 'UQCRHL', 'UQCRB', 'ATP6V1E1', 'ATP5MC1', 'NDUFS4', 'NDUFB7', 'UQCR11', 'COX15', 'COX7A2', 'ATP5MC2', 'UQCRC2', 'SDHB', 'ATP5F1E', 'ATP6V0D1', 'NDUFA5', 'NDUFS8', 'NDUFA7', 'COX5A', 'ATP6V1A', 'COX6A2', 'ATP6V1G1

In [7]:
len(intersection)

116

In [8]:
# Compute the genes that don't overlap in the other direction
difference_reverse = list(set(data_genes).difference(set(f_gene_names)))
print(difference_reverse)

['NDUFB11', 'MT-ND2', 'MT-CO1', 'ATP6AP1', 'MT-ATP8', 'MT-ND1', 'MT-CO3', 'NDUFC2-KCTD14', 'MT-ND4L', 'MT-ND6', 'MT-CYB', 'MT-ND3', 'MT-CO2', 'MT-ND5', 'MT-ND4', 'NDUFA1', 'ATP6V1FP2', 'ATP6V1G3', 'COX7B', 'MT-ATP6']


In [9]:
len(difference_reverse)

20

- Many ATP6V1 genes are present but not ATP6V1FP2 or ATP6V1G3
- Same for ATP6AP1
- COX7B2 is present but not COX7B
- Many NDUF genes are present (including NDUFC2), but not NDUFB11 NDUFA1 NDUFC2-KCTD14
- All MT- mitochondrial genes are not in the dataset. Probably filtered out specifically?

In [10]:
# Run AUCell
# 1. Rankings (~30mn)
start_time = time.time()

rnk_mtx = create_rankings(ex_matrix.transpose(), seed=42)

print(f"Ranking (AUCell) computing time: {(time.time() - start_time):.2f} seconds")

Ranking (AUCell) computing time: 1812.72 seconds


In [11]:
# 2. Enrichment (~5s)
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
0950_240109_AAACCCAAGACATCCT,KEGG hsa00190 - Oxidative phosphorylation,0.016128
0950_240109_AAACCCACACCGTACG,KEGG hsa00190 - Oxidative phosphorylation,0.024394
0950_240109_AAACCCAGTAATCAGA,KEGG hsa00190 - Oxidative phosphorylation,0.010054
0950_240109_AAACCCATCACTCTTA,KEGG hsa00190 - Oxidative phosphorylation,0.014468
0950_240109_AAACGAAAGGTTACCT,KEGG hsa00190 - Oxidative phosphorylation,0.009764
...,...,...
6052_200709_TTTGGTTTCGACACTA,KEGG hsa00190 - Oxidative phosphorylation,0.029650
6052_200709_TTTGGTTTCGCAGTTA,KEGG hsa00190 - Oxidative phosphorylation,0.023315
6052_200709_TTTGTTGCAAGCGATG,KEGG hsa00190 - Oxidative phosphorylation,0.014004
6052_200709_TTTGTTGCAAGGTCAG,KEGG hsa00190 - Oxidative phosphorylation,0.042326


In [12]:
min(aucs["AUC"])

0.0

In [13]:
max(aucs["AUC"])

0.38616164663294195

In [14]:
# Save DataFrame to TSV file
aucs.to_csv("/data/gardeux/Neuro_Droso_ND75KD/data/Jeffries_2025_HumanBrainAging_116_Oxphos_AUCell_auc.tsv", sep='\t', index=True)