<a href="https://colab.research.google.com/github/Claptar/spatialMPFC/blob/main/DE/mammals/ANOVA/functional_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies and files

## Install dependencies

In [1]:
!python --version

Python 3.10.11


## Imports

In [2]:
import warnings
import json
import requests
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.notebook import tqdm

warnings.simplefilter(action='ignore', category=FutureWarning)

# Global variables

Connect to google drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
dir_path = 'drive/MyDrive/Spatial project/data/'

In [5]:
de_dir_path = 'drive/MyDrive/Spatial project/results/DE/mammals/ANOVA/'

In [6]:
os.listdir(de_dir_path)

['anova_L3.csv',
 'anova_L6.csv',
 'anova_L5.csv',
 'anova_L4.csv',
 'anova_L2.csv',
 'anova_L1.csv',
 'anova_WM.csv',
 'anova_mamm.csv',
 'ttest_mamm.csv',
 'anova_human_sig_genes.csv',
 'genes_clusters.csv',
 'cluster_spectr_anova_6.csv',
 'cluster_spectr_anova_7.csv',
 'cluster_spectr_anova_8.csv',
 'cluster_spectr_anova_9.csv',
 'cluster_spectr_hs_6.csv',
 'cluster_spectr_hs_7.csv',
 'cluster_spectr_hs_8.csv',
 'cluster_spectr_hs_9.csv',
 'ttest_sign_genes.csv',
 'ssGSEA_human_sig_genes.csv',
 'ttest_human_sig_genes.csv']

# GSA

In [7]:
n_clusters = 6
cluster_spectr = pd.read_csv(de_dir_path + f'cluster_spectr_hs_{n_clusters}.csv', index_col=0)
cluster_spectr.columns = ['label']
cluster_spectr.head()

Unnamed: 0,label
MAN1A1,0
COX6B1,0
COX7B,0
COX8A,0
CRHR1,0


In [8]:
class GeneCluster:
    def __init__(self, genes, descr, label, enrichr_id=None, **kwargs):
        self.label = label
        self.genes = genes
        self.descr = descr
        self.enrichr_id = self._get_erichr_id() if enrichr_id is None else enrichr_id
        self.enrichment_res = dict()

    
    def _get_erichr_id(self):
        ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
        genes_str = '\n'.join(self.genes)
        payload = {
            'list': (None, genes_str),
            'description': (None, self.descr)
        }

        response = requests.post(ENRICHR_URL, files=payload)
        if not response.ok:
            raise Exception('Error analyzing gene list')

        data = json.loads(response.text)
        return data['userListId']

    def enrich(self, gene_set_library):
        # get enrichment results
        ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
        query_string = '?userListId=%s&backgroundType=%s'
        response = requests.get(
            ENRICHR_URL + query_string % (self.enrichr_id, gene_set_library)
        )
        if not response.ok:
            raise Exception('Error fetching enrichment results')

        data = json.loads(response.text)
        # convert results to df
        columns = ['Rank', 'Term', 'p-val', 'Z-score', 'Combined score', 'Overlapping genes', 'Adjusted P-value', 'Old p-value', 'Old adjusted p-value']
        results = pd.DataFrame(data[gene_set_library], columns=columns)
        # preprocess df
        results.Term = results.Term.astype(str)
        results[['Term', 'Term GO index']] = results.Term.str.rsplit(' ', n=1, expand=True)
        results['num_overlap_genes'] = results['Overlapping genes'].apply(lambda x: len(x))
        results['neg_log10(p_adj)'] = - np.log10(results['Adjusted P-value'])
        results['cluster_label'] = self.label
        # save to enrichment_res
        self.enrichment_res[gene_set_library] = results

    @staticmethod
    def enrich_geneclusters(geneclusters, gene_set_library):
        for gc in geneclusters:
            gc.enrich(gene_set_library)


In [11]:
gene_clust = list()

for label in tqdm(range(n_clusters)):
    genes = cluster_spectr[cluster_spectr.label == label].index.tolist()
    descr = f'Genes for spectral clustering ({n_clusters} clusters) of human-specific genes. Cluster label: {label}'
    gene_clust.append(GeneCluster(genes, descr, label))

  0%|          | 0/6 [00:00<?, ?it/s]

## GO_Biological_Process_2021

In [12]:
GeneCluster.enrich_geneclusters(gene_clust, 'GO_Biological_Process_2021')

In [13]:
go_process = pd.concat([gc.enrichment_res['GO_Biological_Process_2021'] for gc in gene_clust])
go_process = go_process.set_index(['Term'])
go_process.head()

Unnamed: 0_level_0,Rank,p-val,Z-score,Combined score,Overlapping genes,Adjusted P-value,Old p-value,Old adjusted p-value,Term GO index,num_overlap_genes,neg_log10(p_adj),cluster_label
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
aerobic electron transport chain,1,2.055532e-12,13.565641,365.057992,"[COX8A, NDUFB9, NDUFA8, COX7B, NDUFB8, NDUFB11...",3.087932e-09,0,0,(GO:0019646),16,8.510332,0
mitochondrial ATP synthesis coupled electron transport,2,2.600364e-12,13.31831,355.270838,"[COX8A, NDUFB9, NDUFA8, COX7B, NDUFB8, NDUFB11...",3.087932e-09,0,0,(GO:0042775),16,8.510332,0
mitochondrial ATP synthesis coupled proton transport,3,1.13356e-06,24.454128,334.780616,"[STOML2, ATP5PD, ATP5PB, ATP5F1D, ATP5F1E, ATP...",0.0006730515,0,0,(GO:0042776),6,3.171952,0
"mitochondrial electron transport, cytochrome c to oxygen",4,1.13356e-06,24.454128,334.780616,"[COX8A, COX7B, CYCS, COX5B, COX5A, COX6B1]",0.0006730515,0,0,(GO:0006123),6,3.171952,0
"mitochondrial electron transport, NADH to ubiquinone",5,1.801282e-06,11.611119,153.580416,"[NDUFB9, NDUFA8, NDUFB8, NDUFS8, NDUFB11, NDUF...",0.0008556088,0,0,(GO:0006120),8,3.067725,0


In [14]:
terms = go_process[go_process.Rank < 5].index.unique()
terms.size

22

In [15]:
import plotly.express as px
res_sign = go_process.loc[terms]

fig = px.scatter(res_sign, x="cluster_label", size='num_overlap_genes', color='neg_log10(p_adj)')
fig.update_layout(
    autosize=False,
    width=1300,
    height=700)
fig.show()

## Azimuth Cell Types 2021


In [16]:
GeneCluster.enrich_geneclusters(gene_clust, 'Azimuth_Cell_Types_2021')

In [17]:
go_process = pd.concat([gc.enrichment_res['Azimuth_Cell_Types_2021'] for gc in gene_clust])
go_process = go_process.set_index(['Term'])
go_process.head()

Unnamed: 0_level_0,Rank,p-val,Z-score,Combined score,Overlapping genes,Adjusted P-value,Old p-value,Old adjusted p-value,Term GO index,num_overlap_genes,neg_log10(p_adj),cluster_label
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
RORB+ LAMA4+ Layer 3-5 Glutamatergic Neuron,1,4.4e-05,29.759513,298.215538,"[PLCH1, RORA, IL1RAP, RORB]",0.004846,0,0,CL0000679,4,2.314572,0
Sst+ GABAergic Neuron 3,2,0.001146,19.08656,129.245557,"[NMU, GRIK1, ELFN1]",0.020818,0,0,CL0000617,3,1.681571,0
FEZF2+ RNF144A-AS1+ Layer 5 Glutamatergic Neuron,3,0.001146,19.08656,129.245557,"[GRIK1, CD36, LRRC4C]",0.020818,0,0,CL0000679,3,1.681571,0
Glutamatergic Neuron,4,0.001146,19.08656,129.245557,"[CHN1, PHACTR1, LDB2]",0.020818,0,0,CL0000679,3,1.681571,0
RORB+ CCDC68+ Layer 2-3 Glutamatergic Neuron,5,0.001146,19.08656,129.245557,"[NTNG1, RORB, LRRC4C]",0.020818,0,0,CL0000679,3,1.681571,0


In [18]:
terms = go_process[go_process.Rank < 5].index.unique()
terms.size

23

In [19]:
import plotly.express as px
res_sign = go_process.loc[terms]

fig = px.scatter(res_sign, x="cluster_label", size='num_overlap_genes', color='neg_log10(p_adj)')
fig.update_layout(
    autosize=False,
    width=1100,
    height=700)
fig.show()

## MGI_Mammalian_Phenotype_Level_4_2021

In [20]:
GeneCluster.enrich_geneclusters(gene_clust, 'MGI_Mammalian_Phenotype_Level_4_2021')

In [21]:
go_process = pd.concat([gc.enrichment_res['MGI_Mammalian_Phenotype_Level_4_2021'] for gc in gene_clust])
go_process = go_process.set_index(['Term'])
go_process.head()

Unnamed: 0_level_0,Rank,p-val,Z-score,Combined score,Overlapping genes,Adjusted P-value,Old p-value,Old adjusted p-value,Term GO index,num_overlap_genes,neg_log10(p_adj),cluster_label
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abnormal nervous system electrophysiology,1,1e-06,6.548053,89.170651,"[RGS2, CALB1, GAD1, ARX, CHN1, EPB41L3, GRIK1,...",0.002325,0,0,MP:0002272,12,2.633595,0
abnormal spatial learning,2,4e-06,4.545199,56.400557,"[NRN1, RGS14, GABRA5, ARX, RASGRF1, CAMK2A, CH...",0.003895,0,0,MP:0001463,15,2.409534,0
hyperactivity,3,8e-06,2.42055,28.428373,"[CNTNAP2, NDUFB9, GSK3A, TMEM63B, DGKB, MAST3,...",0.005045,0,0,MP:0001399,35,2.297154,0
abnormal spatial working memory,4,2.3e-05,6.643433,70.994333,"[KCNH3, GRIN2A, BRINP1, DGKB, WWC1, LRRTM1, CA...",0.010901,0,0,MP:0008428,9,1.962523,0
abnormal synapse morphology,5,5e-05,6.914569,68.486452,"[NRN1, NBEA, DGKB, LRRTM1, CAMK2A, ZDHHC8, GAB...",0.017146,0,0,MP:0009538,8,1.76584,0


In [22]:
terms = go_process[go_process.Rank < 5].index.unique()
terms.size

20

In [23]:
import plotly.express as px
res_sign = go_process.loc[terms]

fig = px.scatter(res_sign, x="cluster_label", size='num_overlap_genes', color='neg_log10(p_adj)')
fig.update_layout(
    autosize=False,
    width=950,
    height=700)
fig.show()

## Reactome_2022

In [24]:
GeneCluster.enrich_geneclusters(gene_clust, 'Reactome_2022')

In [25]:
go_process = pd.concat([gc.enrichment_res['Reactome_2022'] for gc in gene_clust])
go_process = go_process.set_index(['Term'])
go_process.head()

Unnamed: 0_level_0,Rank,p-val,Z-score,Combined score,Overlapping genes,Adjusted P-value,Old p-value,Old adjusted p-value,Term GO index,num_overlap_genes,neg_log10(p_adj),cluster_label
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
"Respiratory Electron Transport, ATP Synthesis By Chemiosmotic Coupling, Heat Production By Uncoupling Proteins",1,2.840672e-17,12.703349,483.996414,"[COX8A, NDUFB9, NDUFA8, COX7B, NDUFB8, ATP5PD,...",2.701479e-14,0,0,R-HSA-163200,24,13.568398,0
Citric Acid (TCA) Cycle And Respiratory Electron Transport,2,2.874132e-16,9.291176,332.490424,"[NDUFB9, COX7B, NDUFB8, NDUFB11, NDUFB4, COX5B...",1.36665e-13,0,0,R-HSA-1428517,27,12.864343,0
Respiratory Electron Transport,3,9.857308e-13,11.489387,317.628614,"[COX8A, NDUFB9, NDUFA8, COX7B, NDUFB8, NDUFB11...",3.124767e-10,0,0,R-HSA-611105,18,9.505182,0
Neuronal System,4,1.591677e-10,4.234036,95.524355,"[KCNG1, RASGRF2, RASGRF1, CAMK2A, GRIK1, IL1RA...",3.784211e-08,0,0,R-HSA-112316,32,7.422025,0
Complex I Biogenesis,5,1.548858e-06,9.658199,129.207311,"[NDUFB9, NDUFA8, NDUFB8, NDUFS8, NDUFB11, NDUF...",0.0002945928,0,0,R-HSA-6799198,9,3.530778,0


In [26]:
terms = go_process[go_process.Rank < 5].index.unique()
terms.size

20

In [27]:
import plotly.express as px
res_sign = go_process.loc[terms]

fig = px.scatter(res_sign, x="cluster_label", size='num_overlap_genes', color='neg_log10(p_adj)')
fig.update_layout(
    autosize=False,
    width=1300,
    height=700)
fig.show()

## KEGG_2021_Human

In [28]:
GeneCluster.enrich_geneclusters(gene_clust, 'KEGG_2021_Human')

In [29]:
go_process = pd.concat([gc.enrichment_res['KEGG_2021_Human'] for gc in gene_clust])
go_process = go_process.set_index(['Term'])
go_process.head()

Unnamed: 0_level_0,Rank,p-val,Z-score,Combined score,Overlapping genes,Adjusted P-value,Old p-value,Old adjusted p-value,Term GO index,num_overlap_genes,neg_log10(p_adj),cluster_label
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Diabetic,1,1.310783e-15,7.822437,268.060472,"[NDUFB9, COX7B, NDUFB8, NDUFB11, NDUFB4, CAMK2...",3.342497e-13,0,0,cardiomyopathy,29,12.475929,0
Oxidative,2,1.827309e-14,9.705012,307.002008,"[COX8A, NDUFB9, NDUFA8, COX7B, NDUFB8, ATP5PD,...",2.329819e-12,0,0,phosphorylation,23,11.632678,0
Parkinson,3,4.488161e-14,6.430044,197.62578,"[NDUFB9, COX7B, NDUFB8, NDUFB11, NDUFB4, CAMK2...",3.814937e-12,0,0,disease,30,11.418513,0
Prion,4,2.985199e-12,5.558151,147.498627,"[NDUFB9, COX7B, NDUFB8, NDUFB11, NDUFB4, COX7A...",1.903064e-10,0,0,disease,29,9.720547,0
Huntington,5,9.727454e-12,5.087062,128.987896,"[NDUFB9, COX7B, NDUFB8, NDUFB11, NDUFB4, CLTA,...",4.961002e-10,0,0,disease,30,9.304431,0


In [30]:
terms = go_process[go_process.Rank < 5].index.unique()
terms.size

24

In [31]:
import plotly.express as px
res_sign = go_process.loc[terms]

fig = px.scatter(res_sign, x="cluster_label", size='num_overlap_genes', color='neg_log10(p_adj)')
fig.update_layout(
    autosize=False,
    width=800,
    height=700)
fig.show()