# Fly Cell Atlas - HEAD - AUCell Oxphos scoring
*This script is to be run before the Rmd script, for computing the AUC values with pySCENIC* 

**Author:** Vincent Gardeux

**Date Created:** 13/02/2025
**Date Last Modified:** 13/02/2025

In [5]:
# import dependencies
import pandas as pd
import h5py
from ctxcore.genesig import GeneSignature
from pyscenic.aucell import create_rankings, enrichment

In [8]:
GENE_SIGNATURE_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/Oxphos_genes.xlsx'
GENE_SIGNATURE_SNAME = '68 OXPHOS genes'
EXPRESSION_MTX_FNAME = '/data/gardeux/Neuro_Droso_ND75KD/data/s_fca_biohub_head_10x.loom' # From https://cloud.flycellatlas.org/index.php/s/jtH5fGwLqtiSRSD

In [9]:
data_excel = pd.read_excel(io = GENE_SIGNATURE_FNAME, sheet_name = GENE_SIGNATURE_SNAME)
data_excel

Unnamed: 0,Complex,Complex_Name,Complex_Subunit,Gene_Name,Gene_CG,Gene_Ensembl_ID,Gene_Symbol,Gene_Symbol_clean
0,I,NADH ubiquinone oxidoreductase,34,13 kDa A,CG8680,FBgn0031684,ND-13A,ND-13A
1,I,NADH ubiquinone oxidoreductase,34,13 kDa B,CG6463,FBgn0047038,ND-13B,ND-13B
2,I,NADH ubiquinone oxidoreductase,34,15 kDa,CG11455,FBgn0031228,ND-15,ND-15
3,I,NADH ubiquinone oxidoreductase,34,18 kDa,CG12203,FBgn0031021,ND-18,ND-18
4,I,NADH ubiquinone oxidoreductase,34,19 kDa,CG3683,FBgn0035046,ND-19,ND-19
...,...,...,...,...,...,...,...,...
63,V,F0/F1 ATP synthase,13,F,CG4692,FBgn0035032,ATPsynF,ATPsynF
64,V,F0/F1 ATP synthase,13,G,CG6105,FBgn0010612,ATPsynG,ATPsynG
65,V,F0/F1 ATP synthase,13,Coupling factor 6,CG4412,FBgn0016119,ATPsynCF6,ATPsynCF6
66,V,F0/F1 ATP synthase,13,Lipid-binding protein,CG1746,FBgn0039830,ATPsynC,ATPsynC


In [10]:
#gs = GeneSignature('Flybase - GO - Oxidative phosphorylation', data_excel["Gene_Ensembl_ID"].tolist())
gs = GeneSignature('Flybase - GO - Oxidative phosphorylation', data_excel["Gene_Ensembl_ID"].tolist())
gs

GeneSignature(name='Flybase - GO - Oxidative phosphorylation', gene2weight=frozendict.frozendict({'FBgn0031684': 1.0, 'FBgn0047038': 1.0, 'FBgn0031228': 1.0, 'FBgn0031021': 1.0, 'FBgn0035046': 1.0, 'FBgn0030718': 1.0, 'FBgn0017567': 1.0, 'FBgn0030853': 1.0, 'FBgn0266582': 1.0, 'FBgn0037001': 1.0, 'FBgn0019957': 1.0, 'FBgn0039909': 1.0, 'FBgn0031771': 1.0, 'FBgn0017566': 1.0, 'FBgn0040705': 1.0, 'FBgn0034645': 1.0, 'FBgn0033570': 1.0, 'FBgn0025839': 1.0, 'FBgn0031505': 1.0, 'FBgn0034576': 1.0, 'FBgn0033961': 1.0, 'FBgn0029868': 1.0, 'FBgn0001989': 1.0, 'FBgn0031436': 1.0, 'FBgn0030605': 1.0, 'FBgn0032511': 1.0, 'FBgn0011361': 1.0, 'FBgn0029888': 1.0, 'FBgn0052230': 1.0, 'FBgn0029971': 1.0, 'FBgn0021967': 1.0, 'FBgn0011455': 1.0, 'FBgn0058002': 1.0, 'FBgn0085468': 1.0, 'FBgn0261439': 1.0, 'FBgn0014028': 1.0, 'FBgn0037873': 1.0, 'FBgn0039112': 1.0, 'FBgn0034245': 1.0, 'FBgn0011227': 1.0, 'FBgn0260008': 1.0, 'FBgn0030733': 1.0, 'FBgn0021906': 1.0, 'FBgn0035600': 1.0, 'FBgn0038271': 1.0, 'F

In [12]:
# Open Loom file in reading mode
f = h5py.File(EXPRESSION_MTX_FNAME, 'r')
m = f["/matrix"][:,:]
gene_names = f["/row_attrs/Gene"].asstr()[:]
cell_names = f["/col_attrs/CellID"].asstr()[:]
f.close()

m

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 7.,  1.,  1., ...,  2.,  3.,  1.],
       [19.,  2.,  4., ...,  1.,  2.,  1.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  0.,  2.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [13]:
m.shape

(13056, 100527)

In [14]:
cell_names

array(['AAACCCAGTTACGGAG-d541ae4e__FCA1_MaleFemale_Head',
       'AAACCCATCCGTACGG-d541ae4e__FCA1_MaleFemale_Head',
       'AAACGAAGTATGATCC-d541ae4e__FCA1_MaleFemale_Head', ...,
       'TTTGTTGTCATCGCCT-f7ed0992__FCA15_Female_head_adult',
       'TTTGTTGTCCATTCGC-f7ed0992__FCA15_Female_head_adult',
       'TTTGTTGTCGCTTGCT-f7ed0992__FCA15_Female_head_adult'], dtype=object)

In [15]:
len(cell_names)

100527

In [16]:
gene_names

array(['128up', '14-3-3epsilon', '14-3-3zeta', ..., 'zormin', 'zyd',
       'zye'], dtype=object)

In [17]:
len(gene_names)

13056

In [24]:
# [Input] Fixed gene annotation (for SCENIC feather file compatibility)
genome_data = pd.read_csv("/data/gardeux/Neuro_Droso_ND75KD/data/features.tsv", sep="\t", na_filter=False)

# Update gene names using fixing table
gene_to_ensembl_dict = genome_data.set_index('Name')['Ensembl'].to_dict()

# Check if all gene names are in the dictionary
missing_genes = [gene for gene in gene_names if gene not in gene_to_ensembl_dict]
if missing_genes:
    print(f"Error: The following genes are missing from the annotation table: {len(missing_genes)}")
else:
    # Fix gene names
    f_gene_names_updated = np.array([
        gene_to_ensembl_dict[gene] for gene in gene_names
    ])

    print(f_gene_names_updated)

KeyError: "None of ['Name'] are in the columns"

In [18]:
ex_matrix = pd.DataFrame(m, columns = cell_names, index = gene_names)
ex_matrix

Unnamed: 0,AAACCCAGTTACGGAG-d541ae4e__FCA1_MaleFemale_Head,AAACCCATCCGTACGG-d541ae4e__FCA1_MaleFemale_Head,AAACGAAGTATGATCC-d541ae4e__FCA1_MaleFemale_Head,AAACGAATCACCTACC-d541ae4e__FCA1_MaleFemale_Head,AAACGCTAGAAACCAT-d541ae4e__FCA1_MaleFemale_Head,AAACGCTAGCAATAGT-d541ae4e__FCA1_MaleFemale_Head,AAACGCTCAACTTCTT-d541ae4e__FCA1_MaleFemale_Head,AAACGCTTCAGACCTA-d541ae4e__FCA1_MaleFemale_Head,AAAGAACAGCAACTCT-d541ae4e__FCA1_MaleFemale_Head,AAAGAACCACTTGTGA-d541ae4e__FCA1_MaleFemale_Head,...,TTTGGTTAGTAATTGG-f7ed0992__FCA15_Female_head_adult,TTTGGTTAGTCCTACA-f7ed0992__FCA15_Female_head_adult,TTTGGTTCACATACTG-f7ed0992__FCA15_Female_head_adult,TTTGGTTCATCGGTTA-f7ed0992__FCA15_Female_head_adult,TTTGGTTCATTGACCA-f7ed0992__FCA15_Female_head_adult,TTTGGTTGTACGAAAT-f7ed0992__FCA15_Female_head_adult,TTTGTTGCAACGCATT-f7ed0992__FCA15_Female_head_adult,TTTGTTGTCATCGCCT-f7ed0992__FCA15_Female_head_adult,TTTGTTGTCCATTCGC-f7ed0992__FCA15_Female_head_adult,TTTGTTGTCGCTTGCT-f7ed0992__FCA15_Female_head_adult
128up,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14-3-3epsilon,7.0,1.0,1.0,0.0,1.0,0.0,5.0,2.0,1.0,4.0,...,2.0,1.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,1.0
14-3-3zeta,19.0,2.0,4.0,2.0,3.0,3.0,7.0,1.0,2.0,3.0,...,2.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,1.0
140up,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18SrRNA-Psi:CR41602,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zip,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
zld,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zormin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,2.0
zyd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0


In [15]:
# Compute the intersection
intersection = list(set(gene_names).intersection(set(data_excel["Gene_Ensembl_ID"].tolist())))
print(intersection)

['FBgn0035046', 'FBgn0020235', 'FBgn0016691', 'FBgn0029888', 'FBgn0052230', 'FBgn0040773', 'FBgn0010217', 'FBgn0034877', 'FBgn0040529', 'FBgn0031505', 'FBgn0032511', 'FBgn0017567', 'FBgn0250814', 'FBgn0030853', 'FBgn0037001', 'FBgn0036728', 'FBgn0030718', 'FBgn0033961', 'FBgn0058002', 'FBgn0037873', 'FBgn0021906', 'FBgn0011211', 'FBgn0035032', 'FBgn0031021', 'FBgn0031066', 'FBgn0034576', 'FBgn0038224', 'FBgn0034645', 'FBgn0034245', 'FBgn0039830', 'FBgn0021967', 'FBgn0017566', 'FBgn0040705', 'FBgn0015031', 'FBgn0047038', 'FBgn0025839', 'FBgn0028342', 'FBgn0260008', 'FBgn0031830', 'FBgn0014391', 'FBgn0039909', 'FBgn0029868', 'FBgn0038271', 'FBgn0010612', 'FBgn0035600', 'FBgn0031771', 'FBgn0019957', 'FBgn0039112', 'FBgn0032833', 'FBgn0029971', 'FBgn0085468', 'FBgn0016120', 'FBgn0031436', 'FBgn0033570', 'FBgn0031228', 'FBgn0016119', 'FBgn0019624', 'FBgn0031684', 'FBgn0014028', 'FBgn0011361', 'FBgn0266582', 'FBgn0030605', 'FBgn0011455', 'FBgn0011227', 'FBgn0030733', 'FBgn0261439', 'FBgn0019

In [16]:
len(intersection)

68

In [17]:
# Run AUCell
# 1. Rankings (~2mn)
rnk_mtx = create_rankings(ex_matrix.transpose(), seed=42)

In [18]:
# 2. Enrichment (~5s)
aucs = enrichment(rnk_mtx, gs)
aucs

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC
Cell,Regulon,Unnamed: 2_level_1
AAACCCAAGGTGATAT-1_ctrl,Flybase - GO - Oxidative phosphorylation,0.440722
AAACCCACAAATAGCA-1_ctrl,Flybase - GO - Oxidative phosphorylation,0.138840
AAACCCACAACAAAGT-1_ctrl,Flybase - GO - Oxidative phosphorylation,0.152084
AAACCCACACTCATAG-1_ctrl,Flybase - GO - Oxidative phosphorylation,0.223340
AAACCCACAGAGAGGG-1_ctrl,Flybase - GO - Oxidative phosphorylation,0.360865
...,...,...
TTTGTTGGTTTGGAGG-1_ndkd,Flybase - GO - Oxidative phosphorylation,0.261794
TTTGTTGTCCGTTTCG-1_ndkd,Flybase - GO - Oxidative phosphorylation,0.130940
TTTGTTGTCCTTCAGC-1_ndkd,Flybase - GO - Oxidative phosphorylation,0.112057
TTTGTTGTCGAACGCC-1_ndkd,Flybase - GO - Oxidative phosphorylation,0.108359


In [19]:
min(aucs["AUC"])

0.011020197552705292

In [20]:
max(aucs["AUC"])

0.7085974740773502

In [21]:
# Save DataFrame to TSV file
aucs.to_csv("./data/Pan_neuro_integrated_68_Oxphos_AUCell_auc.tsv", sep='\t', index=True)