# MouseBrain - Linnarsson - AUCell Oxphos scoring
*Run this notebook before the Rmd script: it computes the AUCell AUC values with pySCENIC.*

**Author:** Vincent Gardeux

**Date Created:** 19/02/2025
**Date Last Modified:** 19/02/2025

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import h5py
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [5]:
# Input files and labels used throughout the notebook.
# KEGG oxidative phosphorylation gene list, from https://www.genome.jp/entry/mmu00190
GENE_SIGNATURE_FNAME = "/data/gardeux/Neuro_Droso_ND75KD/data/oxphos_genes_KEGG_mmu.txt"
# Signature label: 133 genes found, 8 more are not present in Linnarsson's object
GENE_SIGNATURE_SNAME = "133 (+8) OXPHOS genes"
# Loom expression matrix, from http://mousebrain.org/adolescent/downloads.html
EXPRESSION_MTX_FNAME = "/data/gardeux/Neuro_Droso_ND75KD/data/l5_all_rev1.loom"

In [18]:
# Read the header-less signature file and keep its first column as a plain list of gene symbols
signature_table = pd.read_csv(filepath_or_buffer=GENE_SIGNATURE_FNAME, header=None)
data_genes = signature_table[0].tolist()

# Wrap the gene list in a named GeneSignature (shown below with a weight of 1.0 per gene)
gs = GeneSignature('KEGG mmu00190 - Oxidative phosphorylation', data_genes)
gs

GeneSignature(name='KEGG mmu00190 - Oxidative phosphorylation', gene2weight=frozendict.frozendict({'mt-Nd1': 1.0, 'mt-Nd2': 1.0, 'mt-Nd3': 1.0, 'mt-Nd4': 1.0, 'mt-Nd4l': 1.0, 'mt-Nd5': 1.0, 'mt-Nd6': 1.0, 'Ndufs1': 1.0, 'Ndufs2': 1.0, 'Ndufs3': 1.0, 'Ndufs4': 1.0, 'Ndufs5': 1.0, 'Ndufs6': 1.0, 'Ndufs6b': 1.0, 'Ndufs7': 1.0, 'Ndufs8': 1.0, 'Ndufv1': 1.0, 'Ndufv2': 1.0, 'Ndufv3': 1.0, 'Ndufa1': 1.0, 'Ndufa2': 1.0, 'Ndufa3': 1.0, 'Ndufa4': 1.0, 'Ndufa4l2': 1.0, 'Gm19340': 1.0, 'Ndufa5': 1.0, 'Ndufa6': 1.0, 'Ndufa7': 1.0, 'Ndufa8': 1.0, 'Ndufa9': 1.0, 'Ndufa10': 1.0, 'Ndufab1-ps': 1.0, 'Ndufab1': 1.0, 'Ndufa11': 1.0, 'Ndufa12': 1.0, 'Ndufa13': 1.0, 'Ndufb1': 1.0, 'Ndufb2': 1.0, 'Ndufb3': 1.0, 'Ndufb4c': 1.0, 'Ndufb4b': 1.0, 'Ndufb4': 1.0, 'Ndufb5': 1.0, 'Ndufb6': 1.0, 'Ndufb7': 1.0, 'Ndufb8': 1.0, 'Ndufb9': 1.0, 'Ndufb10': 1.0, 'Ndufb11': 1.0, 'Ndufc1': 1.0, 'Ndufc2': 1.0, 'Sdha': 1.0, 'Sdhb': 1.0, 'Sdhc': 1.0, 'Sdhd': 1.0, 'Uqcrfs1': 1.0, 'mt-Cytb': 1.0, 'Cyc1': 1.0, 'Uqcrc1': 1.0, 'Uqcrc

In [19]:
# Size of the gene signature built from the KEGG gene list
len(gs)

141

In [20]:
# Read the expression matrix and its gene / cell labels from the Loom (HDF5) file.
# The `with` block guarantees the file handle is closed even if one of the reads fails.
with h5py.File(EXPRESSION_MTX_FNAME, 'r') as f:
    # Load the whole matrix into memory as a NumPy array
    m = f["/matrix"][:, :]
    # Row labels (gene symbols) and column labels (cell IDs), decoded to str
    gene_names = f["/row_attrs/Gene"].asstr()[:]
    cell_names = f["/col_attrs/CellID"].asstr()[:]

m

array([[21., 28., 28., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       [14., 11.,  7., ...,  2.,  2.,  2.],
       ...,
       [ 0.,  0.,  2., ...,  2.,  3.,  3.],
       [ 0.,  1.,  0., ...,  0.,  0.,  1.],
       [ 1.,  0.,  2., ...,  8., 11.,  3.]], dtype=float32)

In [21]:
# Dimensions of the loaded matrix as (rows, columns); compare with len(gene_names) and len(cell_names)
m.shape

(27998, 160796)

In [22]:
# Preview the cell identifiers read from /col_attrs/CellID
cell_names

array(['10X82_2:TCTCTCACCAGTTA', '10X82_2:TATTATCTACCAGA',
       '10X82_2:TATCCCAGATGGCA', ..., '10X43_2:CTGCAGCTTAGAGA',
       '10X53_7:XXGCGATGGGAGGT', '10X43_2:CCTTAATGGGGCAA'], dtype=object)

In [23]:
# Number of cell identifiers; should equal the number of columns of m
len(cell_names)

160796

In [24]:
# Preview the gene symbols read from /row_attrs/Gene
gene_names

array(['Cbln2', 'Ptchd2', 'P2rx2', ..., 'Mmadhc', 'Med27', 'Psmc6'],
      dtype=object)

In [25]:
# Number of gene symbols; should equal the number of rows of m
len(gene_names)

27998

In [26]:
# Label the raw matrix: one row per gene symbol, one column per cell ID
ex_matrix = pd.DataFrame(data=m, index=gene_names, columns=cell_names)
ex_matrix

Unnamed: 0,10X82_2:TCTCTCACCAGTTA,10X82_2:TATTATCTACCAGA,10X82_2:TATCCCAGATGGCA,10X82_2:ATTACGTATGAATG,10X82_2:ATACGTCAATAAGG,10X82_2:TACAGTCTTCGGTC,10X81_2:CGTAACATTCGACA,10X81_3:TGATGAGATACACA,10X82_2:GCCAGGTAGGACAC,10X81_2:AGATCAGTCCGTAT,...,10X53_7:TATGTCTGAGAGGC,10X43_2:TTCAGTTGCTTGGA,10X43_2:CAAGTCGATCGTGA,10X43_2:ACGAAGCTTCGTAG,10X43_2:TACCATTGGGCAAG,10X43_2:GGTACAACAGTCGT,10X43_2:TAATGATGGGTTAC,10X43_2:CTGCAGCTTAGAGA,10X53_7:XXGCGATGGGAGGT,10X43_2:CCTTAATGGGGCAA
Cbln2,21.0,28.0,28.0,48.0,31.0,29.0,66.0,13.0,22.0,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ptchd2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P2rx2,14.0,11.0,7.0,13.0,11.0,7.0,16.0,7.0,14.0,9.0,...,0.0,3.0,2.0,6.0,8.0,3.0,1.0,2.0,2.0,2.0
Ptger4,2.0,1.0,2.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
Fam19a1,8.0,7.0,9.0,20.0,4.0,11.0,20.0,4.0,16.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BC051537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,2.0,2.0,8.0,0.0,2.0
Gm15518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Mmadhc,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,3.0,1.0,2.0,0.0,0.0,2.0,2.0,3.0,3.0
Med27,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [27]:
# Signature genes that are also present in the expression matrix.
# Sorted so the printed list is identical from run to run
# (the iteration order of a plain set is arbitrary).
intersection = sorted(set(gene_names) & set(data_genes))
print(intersection)

['Ndufa10', 'Ndufb3', 'Ndufa3', 'Uqcrh', 'Ppa2', 'Ndufa12', 'Uqcrfs1', 'Cycs', 'Cox7a1', 'Ndufa2', 'Ndufs6', 'Ndufb10', 'Ndufs5', 'mt-Nd4l', 'Uqcrc2', 'Atp6v1a', 'Atp6v1h', 'Ndufa4', 'Ndufs1', 'Atp5g3', 'Atp6v0d2', 'Uqcrb', 'Atp5a1', 'Atp6v0a2', 'Atp12a', 'Cox6b2', 'Cox7c', 'Ndufb5', 'Ndufa4l2', 'mt-Nd5', 'Ndufb7', 'Atp6v1f', 'Cox6a1', 'Atp5c1', 'Cox10', 'mt-Cytb', 'Ndufa11', 'Atp5h', 'Ndufa8', 'mt-Nd3', 'Ndufv2', 'mt-Co3', 'Atp6v0a1', 'Uqcrc1', 'Ndufa1', 'Atp5f1', 'mt-Co2', 'Atp6v1e1', 'Atp6v1c1', 'Cox4i2', 'Atp6v1c2', 'Ndufa9', 'Ndufa6', 'Ndufb2', 'Ndufb6', 'Cox5a', 'mt-Atp8', 'mt-Nd4', 'Sdha', 'Lhpp', 'Atp6v0b', 'Atp5g1', 'Atp4b', 'Cox7b2', 'Ndufa7', 'Atp6v1d', 'Atp6v0e', 'Atp6v0d1', 'Cox11', 'Cox6b1', 'Atp5l', 'Atp6v1b1', 'Cox17', 'Ndufb11', 'mt-Nd2', 'Cox5b', 'Cox7a2l', 'mt-Nd1', 'Atp5d', 'mt-Co1', 'Ndufs4', 'Atp5o', 'Atp6ap1', 'Cox15', 'Cox8a', 'Atp5g2', 'Sdhb', 'Cox7a2', 'Cox8c', 'Cox4i1', 'Uqcr11', 'Atp6v1g1', 'mt-Atp6', 'Uqcrq', 'Ndufa5', 'Ndufab1', 'Ndufs3', 'Ndufv1', 'Atp6v1

In [28]:
# Number of signature genes found in the expression matrix
len(intersection)

133

In [30]:
# Signature genes that are absent from the expression matrix
# (sorted so the printout is reproducible across runs).
difference_reverse = sorted(set(data_genes) - set(gene_names))
print(difference_reverse)

['Ndufab1-ps', 'Gm11633', 'Ndufb4b', 'Ndufb4c', 'Ndufs6b', 'Gm6293', 'Ndufb1', 'Gm19340']


These 8 signature genes are absent from the Linnarsson dataset (`gene_names`), leaving 133 of the 141 signature genes.

In [31]:
# AUCell, step 1 of 2: rankings (~2mn)
# ex_matrix is genes x cells, so it is transposed to cells x genes before ranking;
# the seed is fixed so that reruns are reproducible
rnk_mtx = create_rankings(ex_matrix.T, seed=42)

In [32]:
# AUCell, step 2 of 2: enrichment (~5s)
# Scores the signature `gs` against the rankings with the library's default settings;
# the result (displayed below) has one AUC value per (Cell, Regulon) pair
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
10X82_2:TCTCTCACCAGTTA,KEGG mmu00190 - Oxidative phosphorylation,0.300892
10X82_2:TATTATCTACCAGA,KEGG mmu00190 - Oxidative phosphorylation,0.278340
10X82_2:TATCCCAGATGGCA,KEGG mmu00190 - Oxidative phosphorylation,0.305011
10X82_2:ATTACGTATGAATG,KEGG mmu00190 - Oxidative phosphorylation,0.341160
10X82_2:ATACGTCAATAAGG,KEGG mmu00190 - Oxidative phosphorylation,0.338276
...,...,...
10X43_2:GGTACAACAGTCGT,KEGG mmu00190 - Oxidative phosphorylation,0.424012
10X43_2:TAATGATGGGTTAC,KEGG mmu00190 - Oxidative phosphorylation,0.415752
10X43_2:CTGCAGCTTAGAGA,KEGG mmu00190 - Oxidative phosphorylation,0.378872
10X53_7:XXGCGATGGGAGGT,KEGG mmu00190 - Oxidative phosphorylation,0.348867


In [33]:
# Lowest AUC over all cells (vectorized pandas reduction, which skips NaN values)
aucs["AUC"].min()

0.034591836734693876

In [34]:
# Highest AUC over all cells (vectorized pandas reduction, which skips NaN values)
aucs["AUC"].max()

0.5976906552094522

In [35]:
# Write the AUC table, index included, to a tab-separated file
AUC_OUTPUT_FNAME = "/data/gardeux/Neuro_Droso_ND75KD/data/Linnarsson_133_Oxphos_AUCell_auc.tsv"
aucs.to_csv(AUC_OUTPUT_FNAME, sep="\t", index=True)