In [62]:
import pandas as pd
import numpy as np
import scanpy as sc

from scipy.stats import zscore
from scipy.stats import spearmanr

import mygene

# Spearman coefficients

In [2]:
def spearman_subprocess(hallmark_df_path, adata, name_of_hallmark, species):
    """Correlate pooled, z-scored gene expression with timepoint per subprocess.

    For every non-missing value in the ``Subprocess`` column of the hallmark
    table, the genes listed for it are selected from ``adata`` through
    ``adata.var['gene_symbol']``, z-scored per gene across cells, pooled into
    one long table, and a single Spearman correlation between the timepoint
    code and the z-score is computed.

    Parameters
    ----------
    hallmark_df_path : str or path-like
        CSV file with at least the columns ``Subprocess`` and ``Gene``.
    adata : AnnData-like
        Must expose ``var['gene_symbol']``, ``obs['timepoint']``,
        ``var_names`` and an expression matrix ``X`` (sparse or dense).
    name_of_hallmark : str
        Label written to the ``Hallmark`` column of every result row.
    species : str
        Written to the ``Species`` column. When equal to ``'Mouse'`` the gene
        names are passed through ``str.capitalize`` before matching.

    Returns
    -------
    pandas.DataFrame
        One row per subprocess with the columns ``Hallmark``, ``Subprocess``,
        ``Spearman``, ``p-value``, ``Experiment`` (always ``'Embryo'``) and
        ``Species``.
    """
    result_columns = ['Hallmark', 'Subprocess', 'Spearman', 'p-value',
                      'Experiment', 'Species']

    hallmark_df = pd.read_csv(hallmark_df_path)
    sub_hallmark = [proc for proc in hallmark_df['Subprocess'].unique() if pd.notna(proc)]

    # Collect plain records and build the frame once at the end instead of
    # concatenating onto an (initially empty) DataFrame on every iteration.
    records = []

    for proc in sub_hallmark:
        # Missing gene names are dropped: they cannot be capitalised and must
        # not match missing entries in `gene_symbol`.
        subproc_genes = hallmark_df.loc[hallmark_df['Subprocess'] == proc, 'Gene'].dropna().tolist()

        if species == 'Mouse':
            subproc_genes = [gene.capitalize() for gene in subproc_genes]

        adata_process_i = adata[:, adata.var['gene_symbol'].isin(subproc_genes)].copy()

        # `.toarray()` replaces the deprecated sparse `.A` shorthand; a dense
        # matrix has neither attribute and is used as-is.
        raw_matrix = adata_process_i.X
        if hasattr(raw_matrix, 'toarray'):
            raw_matrix = raw_matrix.toarray()

        # Standardise each gene (column) across cells.
        raw_matrix = zscore(raw_matrix, axis=0, nan_policy='omit')
        zscored_df = pd.DataFrame(
            raw_matrix,
            index=adata_process_i.obs['timepoint'],
            columns=adata_process_i.var_names
        )

        zscored_df_long = zscored_df.reset_index().melt(
            id_vars=zscored_df.index.name,
            var_name='Gene',
            value_name='Zscore'
        )

        # Keep z-scores inside [-5, 5]; NaN values fail the test and are
        # dropped too. `.copy()` makes the column assignment below operate on
        # an independent frame rather than on a slice.
        within_range = zscored_df_long['Zscore'].between(-5, 5)
        zscored_df_long = zscored_df_long.loc[within_range].copy()

        # NOTE(review): the codes follow the category order only if
        # obs['timepoint'] is already an ordered categorical - confirm upstream.
        zscored_df_long['timepoint_numeric'] = zscored_df_long['timepoint'].astype('category').cat.codes

        rho, pval = spearmanr(zscored_df_long['timepoint_numeric'], zscored_df_long['Zscore'])
        records.append([name_of_hallmark, proc, rho, pval, 'Embryo', species])

    return pd.DataFrame(records, columns=result_columns)
        


In [14]:
def spearman_gene(hallmark_df_path, adata, name_of_hallmark, species, sp_boarder = 0.4):
    """Correlate z-scored expression with timepoint for every hallmark gene.

    For each non-missing ``Subprocess`` in the hallmark table, the listed
    genes are selected from ``adata`` through ``adata.var['gene_symbol']``,
    z-scored per gene across cells, and one Spearman correlation between the
    timepoint code and the z-score is computed per gene symbol.

    Parameters
    ----------
    hallmark_df_path : str or path-like
        CSV file with at least the columns ``Subprocess`` and ``Gene``.
    adata : AnnData-like
        Must expose ``var['gene_symbol']``, ``obs['timepoint']`` and an
        expression matrix ``X`` (sparse or dense).
    name_of_hallmark : str
        Label written to the ``Hallmark`` column of every result row.
    species : str
        Written to the ``Species`` column. When equal to ``'Mouse'`` the gene
        names are passed through ``str.capitalize`` before matching.
    sp_boarder : float, default 0.4
        Minimum absolute Spearman coefficient a gene needs to be returned.

    Returns
    -------
    pandas.DataFrame
        Rows with ``abs(Spearman) >= sp_boarder`` and the columns
        ``Hallmark``, ``Subprocess``, ``Gene Symbol``, ``Spearman``,
        ``p-value``, ``Experiment`` (always ``'Embryo'``) and ``Species``.
        The number of returned rows is also printed.
    """
    result_columns = ['Hallmark', 'Subprocess', 'Gene Symbol', 'Spearman', 'p-value',
                      'Experiment', 'Species']

    hallmark_df = pd.read_csv(hallmark_df_path)
    sub_hallmark = [proc for proc in hallmark_df['Subprocess'].unique() if pd.notna(proc)]

    # Collect plain records and build the frame once at the end instead of
    # concatenating onto an (initially empty) DataFrame on every iteration.
    records = []

    for proc in sub_hallmark:
        # Missing gene names are dropped: they cannot be capitalised and must
        # not match missing entries in `gene_symbol`.
        subproc_genes = hallmark_df.loc[hallmark_df['Subprocess'] == proc, 'Gene'].dropna().tolist()

        if species == 'Mouse':
            subproc_genes = [gene.capitalize() for gene in subproc_genes]

        adata_process_i = adata[:, adata.var['gene_symbol'].isin(subproc_genes)].copy()

        # `.toarray()` replaces the deprecated sparse `.A` shorthand; a dense
        # matrix has neither attribute and is used as-is.
        raw_matrix = adata_process_i.X
        if hasattr(raw_matrix, 'toarray'):
            raw_matrix = raw_matrix.toarray()

        # Standardise each gene (column) across cells.
        raw_matrix = zscore(raw_matrix, axis=0, nan_policy='omit')
        zscored_df = pd.DataFrame(
            raw_matrix,
            index=adata_process_i.obs['timepoint'],
            columns=adata_process_i.var['gene_symbol']
        )

        zscored_df_long = zscored_df.reset_index().melt(
            id_vars=zscored_df.index.name,
            var_name='Gene',
            value_name='Zscore'
        )

        # Keep z-scores inside [-5, 5]; NaN values fail the test and are
        # dropped too. `.copy()` makes the column assignment below operate on
        # an independent frame rather than on a slice.
        within_range = zscored_df_long['Zscore'].between(-5, 5)
        zscored_df_long = zscored_df_long.loc[within_range].copy()

        # NOTE(review): the codes follow the category order only if
        # obs['timepoint'] is already an ordered categorical - confirm upstream.
        zscored_df_long['timepoint_numeric'] = zscored_df_long['timepoint'].astype('category').cat.codes

        # Columns sharing a gene symbol end up in the same group here.
        for gene, group in zscored_df_long.groupby('Gene'):
            rho, pval = spearmanr(group['timepoint_numeric'], group['Zscore'])
            records.append((name_of_hallmark, proc, gene, rho, pval, 'Embryo', species))

    spearman_df = pd.DataFrame(records, columns=result_columns)

    # Filter once, after the loop, so the result is defined even when the
    # hallmark table contains no subprocess at all.
    strong_enough = spearman_df['Spearman'].astype(float).abs() >= sp_boarder
    spearman_df_signif = spearman_df[strong_enough]

    print(f'Number of genes with abs(Spearman) >= {sp_boarder} = {len(spearman_df_signif)}')
    return spearman_df_signif

In [30]:
def summary_genes(df_genes, species):
    """Count strongly correlated genes per (hallmark, subprocess) pair.

    Parameters
    ----------
    df_genes : pandas.DataFrame
        Per-gene table with at least the columns ``Hallmark``, ``Subprocess``
        and ``Spearman``.
    species : str
        Value written to the ``Species`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per (hallmark, subprocess) pair that occurs in ``df_genes``,
        with the columns ``Hallmark``, ``Subprocess``, ``n_genes>0.2``,
        ``n_genes<-0.2``, ``n_genes>0.4``, ``n_genes<-0.4``, ``Experiment``
        (always ``'Embryo'``) and ``Species``. Despite the strict signs in the
        column labels the thresholds are inclusive, and genes counted at the
        0.4 level are also counted at the 0.2 level.
    """
    result_columns = ['Hallmark', 'Subprocess', 'n_genes>0.2', 'n_genes<-0.2',
                      'n_genes>0.4', 'n_genes<-0.4', 'Experiment', 'Species']

    records = []
    for hallmark in df_genes['Hallmark'].unique():
        hallmark_rows = df_genes[df_genes['Hallmark'] == hallmark]

        # Only walk the subprocesses that belong to this hallmark, so no
        # all-zero rows are produced for pairs that never occur together.
        for proc in hallmark_rows['Subprocess'].unique():
            rho = hallmark_rows.loc[hallmark_rows['Subprocess'] == proc, 'Spearman']

            records.append([
                hallmark,
                proc,
                int((rho >= 0.2).sum()),
                int((rho <= -0.2).sum()),
                int((rho >= 0.4).sum()),
                int((rho <= -0.4).sum()),
                'Embryo',
                species,
            ])

    # Build the frame once rather than concatenating row by row.
    return pd.DataFrame(records, columns=result_columns)

# Human

In [59]:
# Developmental stages in the order used for the ordered `timepoint` categorical.
desired_order = [
    'Oocyte', 'Pronucleus', 'Zygote', '2C', '4C', '8C', 'E3.0', 'Morula',
    'E4.0', 'E5.0', 'E6.0', 'E7.0', 'E8.0', 'E9.0', 'E10.0',
]

# Expression data plus the portal object that carries the timepoint annotation.
adata_human = sc.read_h5ad('32_human_adata.h5ad')
adata_h_p = sc.read_h5ad('portal_human_v1.5.h5ad')

# Copy the annotation over, then impose the stage order on its categories.
# NOTE(review): presumably both objects share the same cell index - confirm.
adata_human.obs['timepoint'] = adata_h_p.obs['timepoint']
adata_human.obs['timepoint'] = (
    adata_human.obs['timepoint']
    .cat.reorder_categories(desired_order, ordered=True)
)

In [65]:
# Preview the cell identifiers stored in the observation index.
adata_human.obs.index

Index(['ERX3015937_ERX3015937', 'ERX3015939_ERX3015939',
       'ERX3015940_ERX3015940', 'ERX3015941_ERX3015941',
       'ERX3015936_ERX3015936', 'ERX3015943_ERX3015943',
       'ERX3015935_ERX3015935', 'ERX3015945_ERX3015945',
       'ERX3015938_ERX3015938', 'ERX3015947_ERX3015947',
       ...
       'SRX300887_SRX300887', 'SRX300880_SRX300880', 'SRX300893_SRX300893',
       'SRX300878_SRX300878', 'SRX300888_SRX300888', 'SRX300884_SRX300884',
       'SRX300876_SRX300876', 'SRX300882_SRX300882', 'SRX300886_SRX300886',
       'SRX300877_SRX300877'],
      dtype='object', name='index', length=2323)

In [79]:
# Export the human expression matrix (genes as rows, cells as columns after
# the transpose) and the cell metadata as CSV files.
# `.toarray()` replaces the deprecated sparse `.A` shorthand; a dense matrix
# has neither attribute and is used as-is.
embryo_counts = pd.DataFrame(
    adata_human.X.toarray() if hasattr(adata_human.X, 'toarray') else adata_human.X,
    index=adata_human.obs.index,
    columns=adata_human.var.index,
).T

embryo_counts.to_csv("embryo_counts_human.csv")

embryo_meta = adata_human.obs
embryo_meta.to_csv("embryo_meta_human.csv")

In [5]:
# Translate the Ensembl gene IDs in `adata_human.var.index` into gene symbols
# via MyGene.info and store them in `adata_human.var['gene_symbol']`.
mg = mygene.MyGeneInfo()

# Each distinct ID is queried once.
ensembl_ids = list(set(adata_human.var.index.tolist()))

gene_info = mg.querymany(
    ensembl_ids,
    scopes='ensembl.gene',
    fields='symbol',
    species='human',
)

# One row per queried ID: when a query returned several hits, the first is kept.
gene_info_df = pd.DataFrame(gene_info).drop_duplicates(subset=['query'])

# Lookup table: Ensembl ID -> gene symbol.
symbol_mapping = gene_info_df.set_index('query')['symbol']

# IDs without a symbol are labelled "Unknown".
adata_human.var['gene_symbol'] = (
    adata_human.var.index.to_series()
    .map(symbol_mapping)
    .fillna("Unknown")
)


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
36 input query terms found dup hits:	[('ENSG00000276241', 4), ('ENSG00000287326', 2), ('ENSG00000243620', 2), ('ENSG00000284116', 2), ('E
1223 input query terms found no hit:	['ENSG00000286016', 'ENSG00000272515', 'ENSG00000290903', 'ENSG00000232597', 'ENSG00000261039', 'ENS


## Human: subprocesses

In [6]:
# Subprocess-level Spearman correlations for each hallmark gene set (human).
ginst_human_subp = spearman_subprocess(
    hallmark_df_path='genomic_signatures_new.csv',
    adata=adata_human,
    name_of_hallmark='Genomic Instability',
    species='Human',
)

sen_human_subp = spearman_subprocess(
    hallmark_df_path='senescence_data.csv',
    adata=adata_human,
    name_of_hallmark='Senescence',
    species='Human',
)

autoph_human_subp = spearman_subprocess(
    hallmark_df_path='autophagy_signatures.csv',
    adata=adata_human,
    name_of_hallmark='Autophagy',
    species='Human',
)

prot_human_subp = spearman_subprocess(
    hallmark_df_path='proteostasis_signatures.csv',
    adata=adata_human,
    name_of_hallmark='Proteostasis',
    species='Human',
)




## Human: per gene

In [7]:
# Per-gene Spearman correlations for each hallmark gene set (human).
# sp_boarder = 0 keeps every gene, whatever its coefficient.
sen_human_genes = spearman_gene(
    hallmark_df_path='senescence_data.csv',
    adata=adata_human,
    name_of_hallmark='Senescence',
    species='Human',
    sp_boarder=0,
)

autoph_human_genes = spearman_gene(
    hallmark_df_path='autophagy_signatures.csv',
    adata=adata_human,
    name_of_hallmark='Autophagy',
    species='Human',
    sp_boarder=0,
)

prot_human_genes = spearman_gene(
    hallmark_df_path='proteostasis_signatures.csv',
    adata=adata_human,
    name_of_hallmark='Proteostasis',
    species='Human',
    sp_boarder=0,
)

ginst_human_genes = spearman_gene(
    hallmark_df_path='genomic_signatures_new.csv',
    adata=adata_human,
    name_of_hallmark='Genomic Instability',
    species='Human',
    sp_boarder=0,
)




Number of genes with abs(Spearman) >= 0 = 423




Number of genes with abs(Spearman) >= 0 = 768




Number of genes with abs(Spearman) >= 0 = 3369




Number of genes with abs(Spearman) >= 0 = 152


# Mouse

In [75]:
# Developmental stages in the order used for the ordered `timepoint` categorical.
desired_order = [
    'Zygote', '2C', '4C', '8C', '16C',
    'E3.25', 'E3.5', 'E3.75', 'E4.5',
]

# Expression data plus the portal object that carries the timepoint annotation.
adata_mouse = sc.read_h5ad('01_mouse_reprocessed.h5ad')
adata_m_p = sc.read_h5ad('portal_mouse_v1.5.h5ad')

# Copy the annotation over, then impose the stage order on its categories.
# NOTE(review): presumably both objects share the same cell index - confirm.
adata_mouse.obs['timepoint'] = adata_m_p.obs['timepoint']
adata_mouse.obs['timepoint'] = (
    adata_mouse.obs['timepoint']
    .cat.reorder_categories(desired_order, ordered=True)
)


In [78]:
# Export the mouse expression matrix (genes as rows, cells as columns after
# the transpose) and the cell metadata as CSV files.
# `.toarray()` replaces the deprecated sparse `.A` shorthand; a dense matrix
# has neither attribute and is used as-is.
embryo_counts_mouse = pd.DataFrame(
    adata_mouse.X.toarray() if hasattr(adata_mouse.X, 'toarray') else adata_mouse.X,
    index=adata_mouse.obs.index,
    columns=adata_mouse.var.index,
).T
embryo_counts_mouse.to_csv("embryo_counts_mouse.csv")

embryo_meta_mouse = adata_mouse.obs
embryo_meta_mouse.to_csv("embryo_meta_mouse.csv")

## Mouse: subprocesses

In [9]:
# Subprocess-level Spearman correlations for each hallmark gene set (mouse).
ginst_mouse_subp = spearman_subprocess(
    hallmark_df_path='genomic_signatures_new.csv',
    adata=adata_mouse,
    name_of_hallmark='Genomic Instability',
    species='Mouse',
)

sen_mouse_subp = spearman_subprocess(
    hallmark_df_path='senescence_data.csv',
    adata=adata_mouse,
    name_of_hallmark='Senescence',
    species='Mouse',
)

autoph_mouse_subp = spearman_subprocess(
    hallmark_df_path='autophagy_signatures.csv',
    adata=adata_mouse,
    name_of_hallmark='Autophagy',
    species='Mouse',
)

prot_mouse_subp = spearman_subprocess(
    hallmark_df_path='proteostasis_signatures.csv',
    adata=adata_mouse,
    name_of_hallmark='Proteostasis',
    species='Mouse',
)


## Mouse: per gene

In [11]:
# Per-gene Spearman correlations for each hallmark gene set (mouse).
# sp_boarder = 0 keeps every gene, whatever its coefficient.
ginst_mouse_genes = spearman_gene(
    hallmark_df_path='genomic_signatures_new.csv',
    adata=adata_mouse,
    name_of_hallmark='Genomic Instability',
    species='Mouse',
    sp_boarder=0,
)

sen_mouse_genes = spearman_gene(
    hallmark_df_path='senescence_data.csv',
    adata=adata_mouse,
    name_of_hallmark='Senescence',
    species='Mouse',
    sp_boarder=0,
)

autoph_mouse_genes = spearman_gene(
    hallmark_df_path='autophagy_signatures.csv',
    adata=adata_mouse,
    name_of_hallmark='Autophagy',
    species='Mouse',
    sp_boarder=0,
)

prot_mouse_genes = spearman_gene(
    hallmark_df_path='proteostasis_signatures.csv',
    adata=adata_mouse,
    name_of_hallmark='Proteostasis',
    species='Mouse',
    sp_boarder=0,
)


Number of genes with abs(Spearman) >= 0 = 193
Number of genes with abs(Spearman) >= 0 = 382
Number of genes with abs(Spearman) >= 0 = 725
Number of genes with abs(Spearman) >= 0 = 3049


# Summary tables

In [35]:
# Stack the subprocess-level tables of both species (human first, then mouse).
embryo_subprocess = pd.concat(
    objs=[
        sen_human_subp, autoph_human_subp, prot_human_subp, ginst_human_subp,
        sen_mouse_subp, autoph_mouse_subp, prot_mouse_subp, ginst_mouse_subp,
    ],
    ignore_index=True,
)

In [36]:
# Display the combined subprocess-level table.
embryo_subprocess

Unnamed: 0,Hallmark,Subprocess,Spearman,p-value,Experiment,Species
0,Senescence,DNA damage response,0.086816,7.065056e-44,Embryo,Human
1,Senescence,SASP,0.075348,0.000000e+00,Embryo,Human
2,Senescence,SASP_Secreted,0.003562,1.210633e-01,Embryo,Human
3,Senescence,Metabolic adaptations,0.090075,2.222326e-26,Embryo,Human
4,Senescence,Cell cycle arrest,0.089405,8.928841e-83,Embryo,Human
...,...,...,...,...,...,...
111,Genomic Instability,Homologous Recombination,-0.233704,3.425171e-295,Embryo,Mouse
112,Genomic Instability,Transcription-Coupled Repair,-0.001913,8.340411e-01,Embryo,Mouse
113,Genomic Instability,Direct Reversal Repair,-0.123502,7.369909e-22,Embryo,Mouse
114,Genomic Instability,Single-Strand Break Repair,-0.102632,7.094413e-25,Embryo,Mouse


In [37]:
# Write the combined subprocess-level table to CSV without the row index.
embryo_subprocess.to_csv("embryo_subprocess.csv", index=False)

In [25]:
# Stack the per-gene tables of both species (human first, then mouse).
embryo_gene = pd.concat(
    objs=[
        sen_human_genes, autoph_human_genes, prot_human_genes, ginst_human_genes,
        sen_mouse_genes, autoph_mouse_genes, prot_mouse_genes, ginst_mouse_genes,
    ],
    ignore_index=True,
)

In [26]:
# Display the combined per-gene table.
embryo_gene

Unnamed: 0,Hallmark,Subprocess,Gene Symbol,Spearman,p-value,Experiment,Species
0,Senescence,DNA damage response,BAX,0.285208,1.045573e-44,Embryo,Human
1,Senescence,DNA damage response,CCAR2,0.089200,1.673098e-05,Embryo,Human
2,Senescence,DNA damage response,ERCC1,-0.346577,1.974057e-66,Embryo,Human
3,Senescence,DNA damage response,ERCC4,-0.022108,2.878714e-01,Embryo,Human
4,Senescence,DNA damage response,H2AX,0.072243,4.942496e-04,Embryo,Human
...,...,...,...,...,...,...,...
9056,Genomic Instability,Single-Strand Break Repair,Parp1,0.000252,9.910146e-01,Embryo,Mouse
9057,Genomic Instability,Single-Strand Break Repair,Parp2,-0.306012,1.046434e-44,Embryo,Mouse
9058,Genomic Instability,Single-Strand Break Repair,Pnkp,-0.040617,6.908370e-02,Embryo,Mouse
9059,Genomic Instability,Single-Strand Break Repair,Tdp1,0.002271,9.190791e-01,Embryo,Mouse


In [34]:
# Write the combined per-gene table to CSV without the row index.
embryo_gene.to_csv("embryo_gene.csv", index=False)

In [56]:
# Gene counts per subprocess, computed separately for each hallmark and species.
embryo_summary_human_sen = summary_genes(df_genes=sen_human_genes, species='Human')
embryo_summary_human_auto = summary_genes(df_genes=autoph_human_genes, species='Human')
embryo_summary_human_prot = summary_genes(df_genes=prot_human_genes, species='Human')
embryo_summary_human_ginst = summary_genes(df_genes=ginst_human_genes, species='Human')

embryo_summary_mouse_sen = summary_genes(df_genes=sen_mouse_genes, species='Mouse')
embryo_summary_mouse_auto = summary_genes(df_genes=autoph_mouse_genes, species='Mouse')
embryo_summary_mouse_prot = summary_genes(df_genes=prot_mouse_genes, species='Mouse')
embryo_summary_mouse_ginst = summary_genes(df_genes=ginst_mouse_genes, species='Mouse')

# Stack all summaries into one table (human first, then mouse).
embryo_summary = pd.concat(
    objs=[
        embryo_summary_human_sen,
        embryo_summary_human_auto,
        embryo_summary_human_prot,
        embryo_summary_human_ginst,
        embryo_summary_mouse_sen,
        embryo_summary_mouse_auto,
        embryo_summary_mouse_prot,
        embryo_summary_mouse_ginst,
    ],
    ignore_index=True,
)


In [57]:
# Display the combined summary table.
embryo_summary

Unnamed: 0,Hallmark,Subprocess,n_genes>0.2,n_genes<-0.2,n_genes>0.4,n_genes<-0.4,Experiment,Species
0,Senescence,DNA damage response,3,2,1,0,Embryo,Human
1,Senescence,SASP,26,11,11,1,Embryo,Human
2,Senescence,SASP_Secreted,8,11,2,1,Embryo,Human
3,Senescence,Metabolic adaptations,1,1,1,0,Embryo,Human
4,Senescence,Cell cycle arrest,5,1,1,1,Embryo,Human
...,...,...,...,...,...,...,...,...
109,Genomic Instability,Homologous Recombination,0,8,0,4,Embryo,Mouse
110,Genomic Instability,Transcription-Coupled Repair,1,1,0,0,Embryo,Mouse
111,Genomic Instability,Direct Reversal Repair,0,1,0,0,Embryo,Mouse
112,Genomic Instability,Single-Strand Break Repair,0,2,0,0,Embryo,Mouse


In [58]:
# Write the combined summary table to CSV without the row index.
embryo_summary.to_csv("embryo_summary.csv", index=False)