# Kellis Alzheimer paper - AUCell Oxphos scoring
*This script is to be run before the Rmd script, for computing the AUC values with pySCENIC* 

**Author:** Vincent Gardeux

**Date Created:** 22/04/2025
**Date Last Modified:** 22/04/2025

In [1]:
# import dependencies
import pandas as pd
import polars as pl
import numpy as np
import anndata as ad
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [2]:
GENE_SIGNATURE_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/oxphos_genes_KEGG_hsa.txt' # From https://www.genome.jp/entry/hsa00190
GENE_SIGNATURE_SNAME = '135 (+1) OXPHOS genes' # 1 is not present in Martirosyan's data
EXPRESSION_H5AD_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Kellis_2024/Gene Expression (snRNAseq - 10x) processed, multi-region/all_brain_regions_filt_preprocessed_scanpy_fullmatrix.h5ad' # From Synapse

In [3]:
data_genes = pd.read_csv(filepath_or_buffer=GENE_SIGNATURE_FNAME, header=None)[0].tolist()
gs = GeneSignature('KEGG hsa00190 - Oxidative phosphorylation', data_genes)
gs

GeneSignature(name='KEGG hsa00190 - Oxidative phosphorylation', gene2weight=frozendict.frozendict({'MT-ND1': 1.0, 'MT-ND2': 1.0, 'MT-ND3': 1.0, 'MT-ND4': 1.0, 'MT-ND4L': 1.0, 'MT-ND5': 1.0, 'MT-ND6': 1.0, 'NDUFS1': 1.0, 'NDUFS2': 1.0, 'NDUFS3': 1.0, 'NDUFS4': 1.0, 'NDUFS5': 1.0, 'NDUFS6': 1.0, 'NDUFS7': 1.0, 'NDUFS8': 1.0, 'NDUFV1': 1.0, 'NDUFV2': 1.0, 'NDUFV3': 1.0, 'NDUFA1': 1.0, 'NDUFA2': 1.0, 'NDUFA3': 1.0, 'NDUFA4': 1.0, 'NDUFA4L2': 1.0, 'NDUFA5': 1.0, 'NDUFA6': 1.0, 'NDUFA7': 1.0, 'NDUFA8': 1.0, 'NDUFA9': 1.0, 'NDUFA10': 1.0, 'NDUFAB1': 1.0, 'NDUFA11': 1.0, 'NDUFA12': 1.0, 'NDUFA13': 1.0, 'NDUFB1': 1.0, 'NDUFB2': 1.0, 'NDUFB3': 1.0, 'NDUFB4': 1.0, 'NDUFB5': 1.0, 'NDUFB6': 1.0, 'NDUFB7': 1.0, 'NDUFB8': 1.0, 'NDUFB9': 1.0, 'NDUFB10': 1.0, 'NDUFB11': 1.0, 'NDUFC1': 1.0, 'NDUFC2-KCTD14': 1.0, 'NDUFC2': 1.0, 'SDHA': 1.0, 'SDHB': 1.0, 'SDHC': 1.0, 'SDHD': 1.0, 'UQCRFS1': 1.0, 'MT-CYB': 1.0, 'CYC1': 1.0, 'UQCRC1': 1.0, 'UQCRC2': 1.0, 'UQCRHL': 1.0, 'UQCRH': 1.0, 'UQCRB': 1.0, 'UQCRQ': 1

In [4]:
len(gs)

136

In [5]:
# [Input] Load expression matrix from H5ad file
f_h5ad = ad.read_h5ad(EXPRESSION_H5AD_FNAME)
f_gene_names = f_h5ad.var_names.tolist()  # Gene names
f_cell_names = f_h5ad.obs_names.tolist()   # Cell names
ex_matrix = pd.DataFrame.sparse.from_spmatrix(f_h5ad.X.T, index=f_gene_names, columns=f_cell_names) # 33538 genes x 2663736 cells
ex_matrix

Unnamed: 0,AG_AAACCCACAGATAAAC-1,AG_AAACGAAAGGCCACCT-1,AG_AAACGAACACAAATAG-1,AG_AAACGAATCCACAGGC-1,AG_AAACGCTCAAACACGG-1,AG_AAACGCTCAGAATCGG-1,AG_AAACGCTTCTGTTCAT-1,AG_AAAGGGCAGCTAATGA-1,AG_AAAGGGCTCGCTTGAA-1,AG_AAAGGTACAGACCCGT-1,...,TH_TTTGACTGTGCCTAAT-47,TH_TTTGATCAGCAAATGT-47,TH_TTTGGAGAGCTAGATA-47,TH_TTTGGAGGTCTCCCTA-47,TH_TTTGGAGTCATTTCGT-47,TH_TTTGGTTAGTACAGCG-47,TH_TTTGGTTGTTACAGCT-47,TH_TTTGTTGCACCTCTGT-47,TH_TTTGTTGGTATGCTAC-47,TH_TTTGTTGGTCGGATTT-47
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL645608.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4L,3.0,3.0,9.0,12.0,1.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
MT-ND4,59.0,32.0,200.0,173.0,12.0,5.0,4.0,1.0,88.0,7.0,...,0.0,2.0,2.0,8.0,0.0,11.0,4.0,0.0,4.0,2.0
MT-ND5,6.0,3.0,15.0,21.0,1.0,0.0,0.0,1.0,17.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0
MT-ND6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Compute the intersection
intersection = list(set(f_gene_names).intersection(set(data_genes)))
print(intersection)

['COX6B2', 'ATP6V0D2', 'ATP6V1G2', 'NDUFS7', 'MT-CO3', 'COX7B', 'ATP6V1A', 'NDUFA6', 'MT-ATP8', 'UQCRQ', 'MT-ND1', 'COX17', 'COX7A1', 'COX5B', 'NDUFA11', 'ATP6V0B', 'CYC1', 'NDUFB8', 'NDUFA4', 'COX7C', 'NDUFC2-KCTD14', 'UQCR11', 'COX4I1', 'ATP6V1G1', 'ATP6V0A1', 'COX8C', 'ATP6V0C', 'NDUFV3', 'UQCRHL', 'NDUFA12', 'ATP6V1E1', 'UQCR10', 'NDUFC1', 'NDUFA1', 'NDUFB9', 'COX15', 'UQCRH', 'MT-CYB', 'NDUFA2', 'NDUFB7', 'NDUFA4L2', 'NDUFB2', 'CYCS', 'MT-ATP6', 'NDUFC2', 'MT-ND5', 'NDUFB3', 'SDHD', 'LHPP', 'MT-ND2', 'ATP6V1C1', 'ATP6V1E2', 'ATP6V1B2', 'UQCRC2', 'MT-CO1', 'ATP6V0A2', 'MT-ND3', 'COX7A2L', 'UQCRB', 'UQCRFS1', 'COX4I2', 'ATP6V1F', 'ATP6V0D1', 'NDUFV2', 'COX5A', 'COX6C', 'NDUFB1', 'SDHB', 'NDUFS4', 'NDUFB11', 'NDUFA7', 'ATP6V1H', 'NDUFS8', 'ATP6V0E2', 'NDUFA3', 'NDUFB4', 'NDUFB5', 'NDUFB10', 'ATP6V0A4', 'COX7B2', 'SDHA', 'UQCRC1', 'MT-CO2', 'NDUFS3', 'ATP6V1C2', 'SDHC', 'NDUFA5', 'MT-ND6', 'NDUFS2', 'ATP6V0E1', 'ATP4A', 'NDUFS6', 'COX8A', 'NDUFV1', 'ATP6V1D', 'COX7A2', 'PPA1', 'TCIRG1

In [7]:
len(intersection)

119

In [8]:
# Compute the genes that don't overlap in the other direction
difference_reverse = list(set(data_genes).difference(set(f_gene_names)))
print(difference_reverse)

['ATP5F1D', 'ATP5MGL', 'ATP5F1B', 'ATP5PF', 'ATP5MC2', 'ATP5MC1', 'ATP5F1E', 'ATP5PB', 'ATP5MC3', 'ATP5F1A', 'ATP5MG', 'ATP5PO', 'ATP5PD', 'ATP5ME', 'ATP5MF', 'ATP6V1FP2', 'ATP5F1C']


In [9]:
len(difference_reverse)

17

These 17 genes are missing from the dataset. 16/17 are in the non-filtered matrix (Seurat objects), while [ATP6V1FP2] is not in the object at all.

In [10]:
# Run AUCell
# 1. Rankings (long)
rnk_mtx = create_rankings(ex_matrix.transpose(), seed=42)

In [11]:
# 2. Enrichment (~5s)
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
AG_AAACCCACAGATAAAC-1,KEGG hsa00190 - Oxidative phosphorylation,0.080427
AG_AAACGAAAGGCCACCT-1,KEGG hsa00190 - Oxidative phosphorylation,0.082488
AG_AAACGAACACAAATAG-1,KEGG hsa00190 - Oxidative phosphorylation,0.096523
AG_AAACGAATCCACAGGC-1,KEGG hsa00190 - Oxidative phosphorylation,0.084442
AG_AAACGCTCAAACACGG-1,KEGG hsa00190 - Oxidative phosphorylation,0.071588
...,...,...
TH_TTTGGTTAGTACAGCG-47,KEGG hsa00190 - Oxidative phosphorylation,0.075222
TH_TTTGGTTGTTACAGCT-47,KEGG hsa00190 - Oxidative phosphorylation,0.070061
TH_TTTGTTGCACCTCTGT-47,KEGG hsa00190 - Oxidative phosphorylation,0.009709
TH_TTTGTTGGTATGCTAC-47,KEGG hsa00190 - Oxidative phosphorylation,0.030353


In [12]:
min(aucs["AUC"])

0.0

In [13]:
max(aucs["AUC"])

0.32308525947376837

In [14]:
# Save DataFrame to TSV file
aucs.to_csv("/data/gardeux/Neuro_Droso_ND75KD/data/Kellis_119_Oxphos_AUCell_auc.tsv", sep='\t', index=True)