<a href="https://polly.elucidata.io/manage/workspaces?action=open_polly_notebook&amp;source=github&amp;path=ElucidataInc%2Fpolly-python%2Fblob%2Fmain%2FDiscover%2Fmeta_analysis_transcriptomics%2FDifferentially+expressed+pathways+%28GO+and+HPO%29+in+Neuroblastoma.ipynb&amp;kernel=elucidata%2FPython+3.10&amp;machine=medium" target="_parent"><img alt="Open in Polly" src="https://elucidatainc.github.io/PublicAssets/open_polly.svg"/></a>


# Objective
1. Identify datasets on GEO where Neuroblastoma samples are compared with Normal samples.
2. Determine which pathways are differentially expressed in the disease condition compared to the normal condition, as per Gene Ontology and Human Phenotype Ontology.

## Install polly-python

In [2]:
# Shell escape: install the polly-python package imported below (--quiet reduces pip's output).
!sudo pip3 install polly-python --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.22.74 requires PyYAML<5.5,>=3.10, but you have pyyaml 6.0 which is incompatible.[0m
You should consider upgrading via the '/usr/local/bin/python3.10 -m pip install --upgrade pip' command.[0m


## Import libraries and OmixAtlas (OA) configuration

In [3]:
import os
from polly.auth import Polly
from polly.omixatlas import OmixAtlas

# Authenticate with Polly using the refresh token read from the
# POLLY_REFRESH_TOKEN environment variable.
AUTH_TOKEN = os.environ['POLLY_REFRESH_TOKEN']
Polly.auth(AUTH_TOKEN)

# OmixAtlas client object; every query cell below goes through it.
omixatlas = OmixAtlas()

### Params

In [4]:
# Notebook-wide parameters read by the query cells below.
# The key spelling "COMPARISION_NAME" is kept as-is because later cells look it up.
params = dict(
    ATLAS_NAME="gsea_atlas",           # atlas whose datasets/features tables are queried
    COMPARISION_NAME="Neuroblastoma",  # Drug or Disease condition you are interested in
    Pvalue_cutoff=0.05,                # significance cutoff; the pathway queries apply it to padj
)

## Query GSEA atlas

### 1. datasets table

In [5]:
# Preview the datasets table of the configured atlas.
# The schema name is taken from params["ATLAS_NAME"] (as the later analysis
# queries do) instead of being hard-coded, so changing the parameter keeps
# this preview pointed at the same atlas as the rest of the notebook.
q = f"""SELECT src_dataset_id, 
              src_description,
              condition_column,
              condition_control,
              condition_perturbation
       FROM {params['ATLAS_NAME']}.datasets"""
df = omixatlas.query_metadata(q, query_api_version='v2')
print(df.shape)
df.head()

Query execution succeeded (time taken: 1.86 seconds, data scanned: 0.233 MB)
Fetched 4995 rows
(4995, 5)


Unnamed: 0,src_dataset_id,src_description,condition_column,condition_control,condition_perturbation
0,GSE57275_GPL6887,Genome-wide analysis of gene expression in lun...,kw_curated_disease,[Normal],[Tuberculosis]
1,GSE57297_GPL17077,Identification of significant gene regulations...,kw_curated_disease,[Normal],[Breast Neoplasms]
2,GSE57314_GPL6246,Nfatc1 enhances stem cell contribution to squa...,kw_curated_disease,[Normal],"[Carcinoma, Squamous Cell]"
3,GSE57380_GPL17400,Coexistent ARID1A-PIK3CA mutations promote ova...,kw_curated_disease,[Normal],"[Carcinoma, Ovarian Epithelial]"
4,GSE57382_GPL6246,Calcitriol deregulates AR signaling to inhibit...,kw_curated_drug,[none],[calcitriol]


In [18]:
# Number of distinct src_dataset_id values in the datasets table fetched above.
df['src_dataset_id'].nunique()

4953

In [19]:
# Row count per condition_column value in the datasets table fetched above.
df['condition_column'].value_counts()

kw_curated_disease    2686
kw_curated_drug       2309
Name: condition_column, dtype: int64

### 2. features table

In [6]:
# Preview five rows of the features table of the configured atlas to inspect
# its columns. The schema name is taken from params["ATLAS_NAME"] (as the
# later analysis queries do) instead of being hard-coded.
q = f"""SELECT *
       FROM {params['ATLAS_NAME']}.features LIMIT 5"""
df = omixatlas.query_metadata(q, query_api_version='v2')
print(df.shape)
df.head()

Query execution succeeded (time taken: 4.40 seconds, data scanned: 172.106 MB)
Fetched 5 rows
(5, 20)


Unnamed: 0,src_repo,id_key,padj,src_uri,pval,pmid,src_dataset_id,version,es,timestamp_,size,systematicname,data_id,exactsource,name,pathway,nmoreextreme,id,is_current,nes
0,gsea_atlas,kw_row_id,0.985726,polly:data://gsea_atlas/data/GSE16176_GPL570-2...,0.948571,,GSE16176_GPL570-2022-07-28-07-25-30,0,0.241467,1659016129013,8,M26585,_9001,GO:0034235,9001,GOMF_GPI_ANCHOR_BINDING,497,,True,0.558746
1,gsea_atlas,kw_row_id,0.985796,polly:data://gsea_atlas/data/GSE16176_GPL570-2...,0.949275,,GSE16176_GPL570-2022-07-28-07-25-30,0,0.274733,1659016129013,5,M34439,_9002,GO:0046978,9002,GOMF_TAP1_BINDING,523,,True,0.558287
2,gsea_atlas,kw_row_id,0.991797,polly:data://gsea_atlas/data/GSE16176_GPL570-2...,0.965779,,GSE16176_GPL570-2022-07-28-07-25-30,0,0.233708,1659016129013,9,M24446,_9003,GO:0071225,9003,GOBP_CELLULAR_RESPONSE_TO_MURAMYL_DIPEPTIDE,507,,True,0.556963
3,gsea_atlas,kw_row_id,0.99698,polly:data://gsea_atlas/data/GSE16176_GPL570-2...,0.979206,,GSE16176_GPL570-2022-07-28-07-25-30,0,0.195206,1659016129013,16,M18314,_9004,GO:0005092,9004,GOMF_GDP_DISSOCIATION_INHIBITOR_ACTIVITY,517,,True,0.556912
4,gsea_atlas,kw_row_id,0.993506,polly:data://gsea_atlas/data/GSE16176_GPL570-2...,0.970205,,GSE16176_GPL570-2022-07-28-07-25-30,0,0.200938,1659016129013,14,M35479,_9005,HP:0001841,9005,HP_PREAXIAL_FOOT_POLYDACTYLY,520,,True,0.556112


## Get comparisons of interest

In [7]:
# Select the rows of the datasets table whose condition_perturbation
# contains the comparison named in params["COMPARISION_NAME"].
comparisons_query = f"""SELECT 
              dataset_id,
              src_dataset_id, 
              src_description,
              condition_column,
              condition_control,
              condition_perturbation
       FROM 
              {params['ATLAS_NAME']}.datasets
       WHERE
              CONTAINS(condition_perturbation, '{params['COMPARISION_NAME']}')
       """
df = omixatlas.query_metadata(comparisons_query, query_api_version='v2')
print(df.shape)
df.head()

Query execution succeeded (time taken: 1.93 seconds, data scanned: 0.238 MB)
Fetched 13 rows
(13, 6)


Unnamed: 0,dataset_id,src_dataset_id,src_description,condition_column,condition_control,condition_perturbation
0,GSE87782_GPL10787-2022-07-28-07-43-39,GSE87782_GPL10787,MYCN-transformed neuroblasts from TH-MYCN mice...,kw_curated_disease,[Normal],[Neuroblastoma]
1,GSE87783_GPL10787-2022-07-28-07-43-40,GSE87783_GPL10787,Transcriptomic changes in MYC and PRC2 targets...,kw_curated_disease,[Normal],[Neuroblastoma]
2,GSE90711_GPL11154-2022-07-28-07-44-11,GSE90711_GPL11154,Proteomics and transcriptomics of peripheral n...,kw_curated_disease,[Normal],[Neuroblastoma]
3,GSE50959_GPL570-2022-07-28-07-59-34,GSE50959_GPL570,Differential gene expression in neuroblastoma ...,kw_curated_disease,[Normal],[Neuroblastoma]
4,GSE54721_GPL13534-2022-07-28-07-36-21,GSE54721_GPL13534,DNA methylation changes at CpG and non-CpG sit...,kw_curated_disease,[Normal],[Neuroblastoma]


## Query feature level info

After having identified the differential comparisons of interest, we would like to understand which pathways are significantly enriched in a majority of those datasets. To do this, we count, for each pathway, the number of datasets in which it is significantly enriched, and compute its average enrichment score (the `NES` column) across those datasets.

### 1. Enriched Pathways from Gene Ontology Biological Process (GOBP)

In [8]:
pathway_db_name = "GOBP"

# Per-pathway summary restricted to (a) datasets whose condition_perturbation
# contains the comparison of interest, (b) rows with padj below the cutoff and
# (c) pathway names starting with the prefix above. For each pathway the query
# returns the number of such rows and the mean NES over them.
pathway_query = f"""SELECT pathway, COUNT(src_dataset_id) AS dataset_count, 
              SUM(NES)/COUNT(src_dataset_id) AS NES_mean 
       FROM {params['ATLAS_NAME']}.features 
       WHERE src_dataset_id IN 
           (SELECT dataset_id FROM {params['ATLAS_NAME']}.datasets WHERE CONTAINS(condition_perturbation, 
           '{params['COMPARISION_NAME']}')) 
       AND padj < {params['Pvalue_cutoff']!s} AND pathway LIKE '{pathway_db_name}%'
       GROUP BY pathway
       ORDER BY dataset_count DESC"""

df = omixatlas.query_metadata(pathway_query, query_api_version='v2')
print(df.shape)
df.head(20)

Query execution succeeded (time taken: 4.95 seconds, data scanned: 522.356 MB)
Fetched 4459 rows
(4459, 3)


Unnamed: 0,pathway,dataset_count,NES_mean
0,GOBP_DEFENSE_RESPONSE_TO_OTHER_ORGANISM,7,-1.407988
1,GOBP_IMMUNE_RESPONSE,7,-1.030059
2,GOBP_PROTEIN_PROCESSING,7,-1.232059
3,GOBP_RESPONSE_TO_WOUNDING,7,-1.535205
4,GOBP_PROTEIN_MATURATION,7,-1.120485
5,GOBP_WOUND_HEALING,7,-1.623887
6,GOBP_REGULATION_OF_IMMUNE_SYSTEM_PROCESS,7,-0.986747
7,GOBP_PROTEOLYSIS,7,-1.084543
8,GOBP_POSITIVE_REGULATION_OF_IMMUNE_SYSTEM_PROCESS,7,-0.998948
9,GOBP_PROTEIN_DNA_COMPLEX_SUBUNIT_ORGANIZATION,6,1.348849


In [9]:
# Rank the GOBP pathways by metric = dataset_count * NES_mean, largest first,
# and show the top 20.
df = (
    df.assign(metric=df['dataset_count'] * df['NES_mean'])
      .sort_values(by='metric', ascending=False)
)
df.head(20)

Unnamed: 0,pathway,dataset_count,NES_mean,metric
503,GOBP_MEIOSIS_I_CELL_CYCLE_PROCESS,5,2.094224,10.471122
275,GOBP_CENTROMERE_COMPLEX_ASSEMBLY,5,2.078301,10.391504
404,GOBP_POSITIVE_REGULATION_OF_CELL_CYCLE_G2_M_PH...,5,2.070064,10.350321
529,GOBP_MEIOTIC_CHROMOSOME_SEGREGATION,5,2.059274,10.296372
285,GOBP_HOMOLOGOUS_RECOMBINATION,5,1.956347,9.781735
394,GOBP_DENDRITE_EXTENSION,5,1.937409,9.687047
551,GOBP_NEUROTRANSMITTER_SECRETION,5,1.929397,9.646983
591,GOBP_FEMALE_MEIOTIC_NUCLEAR_DIVISION,5,1.916576,9.58288
311,GOBP_CHROMOSOME_ORGANIZATION_INVOLVED_IN_MEIOT...,5,1.888549,9.442746
372,GOBP_REGULATION_OF_DENDRITE_EXTENSION,5,1.881249,9.406247


### 2. Enriched Pathways from Gene Ontology Cellular Component (GOCC)

In [10]:
pathway_db_name = "GOCC"

# Per-pathway summary restricted to (a) datasets whose condition_perturbation
# contains the comparison of interest, (b) rows with padj below the cutoff and
# (c) pathway names starting with the prefix above. For each pathway the query
# returns the number of such rows and the mean NES over them.
pathway_query = f"""SELECT pathway, COUNT(src_dataset_id) AS dataset_count, 
              SUM(NES)/COUNT(src_dataset_id) AS NES_mean 
       FROM {params['ATLAS_NAME']}.features 
       WHERE src_dataset_id IN 
           (SELECT dataset_id FROM {params['ATLAS_NAME']}.datasets WHERE CONTAINS(condition_perturbation, 
           '{params['COMPARISION_NAME']}')) 
       AND padj < {params['Pvalue_cutoff']!s} AND pathway LIKE '{pathway_db_name}%'
       GROUP BY pathway
       ORDER BY dataset_count DESC"""

df = omixatlas.query_metadata(pathway_query, query_api_version='v2')
print(df.shape)
df.head()

Query execution succeeded (time taken: 4.73 seconds, data scanned: 522.356 MB)
Fetched 702 rows
(702, 3)


Unnamed: 0,pathway,dataset_count,NES_mean
0,GOCC_ENDOPLASMIC_RETICULUM_LUMEN,7,-1.569582
1,GOCC_ENDOPLASMIC_RETICULUM,7,-1.229097
2,GOCC_EARLY_ENDOSOME,6,-1.017206
3,GOCC_SPLICEOSOMAL_COMPLEX,6,1.297896
4,GOCC_SECRETORY_GRANULE,6,-1.307329


In [11]:
# Rank the GOCC pathways by metric = dataset_count * NES_mean, largest first,
# and show the top 20.
df = (
    df.assign(metric=df['dataset_count'] * df['NES_mean'])
      .sort_values(by='metric', ascending=False)
)
df.head(20)

Unnamed: 0,pathway,dataset_count,NES_mean,metric
77,GOCC_PRESYNAPTIC_MEMBRANE,5,1.952774,9.763871
94,GOCC_SYNAPTIC_MEMBRANE,5,1.86677,9.333848
62,GOCC_DISTAL_AXON,5,1.865197,9.325986
92,GOCC_PRESYNAPSE,5,1.860195,9.300976
31,GOCC_CHROMOSOME_CENTROMERIC_REGION,6,1.504451,9.026706
24,GOCC_CONDENSED_CHROMOSOME_CENTROMERIC_REGION,6,1.434843,8.609056
5,GOCC_NUCLEAR_CHROMOSOME,6,1.406465,8.438788
147,GOCC_KINESIN_COMPLEX,4,2.090061,8.360244
110,GOCC_NEURON_TO_NEURON_SYNAPSE,4,2.032852,8.131407
149,GOCC_SPLICEOSOMAL_SNRNP_COMPLEX,4,2.014187,8.056746


### 3. Enriched Pathways from Gene Ontology Molecular Function (GOMF)

In [12]:
pathway_db_name = "GOMF"

# Per-pathway summary restricted to (a) datasets whose condition_perturbation
# contains the comparison of interest, (b) rows with padj below the cutoff and
# (c) pathway names starting with the prefix above. For each pathway the query
# returns the number of such rows and the mean NES over them.
pathway_query = f"""SELECT pathway, COUNT(src_dataset_id) AS dataset_count, 
              SUM(NES)/COUNT(src_dataset_id) AS NES_mean 
       FROM {params['ATLAS_NAME']}.features 
       WHERE src_dataset_id IN 
           (SELECT dataset_id FROM {params['ATLAS_NAME']}.datasets WHERE CONTAINS(condition_perturbation, 
           '{params['COMPARISION_NAME']}')) 
       AND padj < {params['Pvalue_cutoff']!s} AND pathway LIKE '{pathway_db_name}%'
       GROUP BY pathway
       ORDER BY dataset_count DESC"""

df = omixatlas.query_metadata(pathway_query, query_api_version='v2')
print(df.shape)
df.head()

Query execution succeeded (time taken: 5.74 seconds, data scanned: 522.356 MB)
Fetched 913 rows
(913, 3)


Unnamed: 0,pathway,dataset_count,NES_mean
0,GOMF_ENDOPEPTIDASE_ACTIVITY,7,-1.16123
1,GOMF_PEPTIDASE_ACTIVITY,7,-1.079669
2,GOMF_CELL_ADHESION_MOLECULE_BINDING,6,-1.235589
3,GOMF_ENZYME_REGULATOR_ACTIVITY,6,-0.846585
4,GOMF_PROTEASE_BINDING,6,-1.434183


In [13]:
# Rank the GOMF pathways by metric = dataset_count * NES_mean, largest first,
# and show the top 20.
df = (
    df.assign(metric=df['dataset_count'] * df['NES_mean'])
      .sort_values(by='metric', ascending=False)
)
df.head(20)

Unnamed: 0,pathway,dataset_count,NES_mean,metric
29,GOMF_MICROTUBULE_MOTOR_ACTIVITY,5,2.00973,10.048648
25,GOMF_TUBULIN_BINDING,5,1.895192,9.475959
49,GOMF_MICROTUBULE_BINDING,5,1.886296,9.43148
21,GOMF_ATP_DEPENDENT_ACTIVITY_ACTING_ON_DNA,6,1.473506,8.841033
7,GOMF_DNA_HELICASE_ACTIVITY,6,1.443724,8.662342
23,GOMF_CATALYTIC_ACTIVITY_ACTING_ON_DNA,6,1.41517,8.491019
55,GOMF_ACETYLCHOLINE_RECEPTOR_ACTIVITY,4,2.002299,8.009197
69,GOMF_DNA_SECONDARY_STRUCTURE_BINDING,4,1.920315,7.681258
27,GOMF_SINGLE_STRANDED_DNA_HELICASE_ACTIVITY,5,1.283016,6.41508
46,GOMF_SINGLE_STRANDED_DNA_BINDING,5,1.274606,6.373028


### 4. Enriched Pathways from Human Phenotype Ontology (HP)

In [14]:
pathway_db_name = "HP"

# Per-pathway summary restricted to (a) datasets whose condition_perturbation
# contains the comparison of interest, (b) rows with padj below the cutoff and
# (c) pathway names starting with the prefix above. For each pathway the query
# returns the number of such rows and the mean NES over them.
pathway_query = f"""SELECT pathway, COUNT(src_dataset_id) AS dataset_count, 
              SUM(NES)/COUNT(src_dataset_id) AS NES_mean 
       FROM {params['ATLAS_NAME']}.features 
       WHERE src_dataset_id IN 
           (SELECT dataset_id FROM {params['ATLAS_NAME']}.datasets WHERE CONTAINS(condition_perturbation, 
           '{params['COMPARISION_NAME']}')) 
       AND padj < {params['Pvalue_cutoff']!s} AND pathway LIKE '{pathway_db_name}%'
       GROUP BY pathway
       ORDER BY dataset_count DESC"""

df = omixatlas.query_metadata(pathway_query, query_api_version='v2')
print(df.shape)
df.head()

Query execution succeeded (time taken: 5.34 seconds, data scanned: 522.356 MB)
Fetched 3169 rows
(3169, 3)


Unnamed: 0,pathway,dataset_count,NES_mean
0,HP_ABNORMAL_BLEEDING,7,-1.337116
1,HP_VASCULAR_SKIN_ABNORMALITY,7,-1.318931
2,HP_GENERALIZED_ABNORMALITY_OF_SKIN,7,-1.204305
3,HP_SUBCUTANEOUS_HEMORRHAGE,7,-1.423481
4,HP_ABNORMAL_VASCULAR_PHYSIOLOGY,6,-1.374245


In [15]:
# Rank the HP pathways by metric = dataset_count * NES_mean, largest first,
# and show the top 20.
df = (
    df.assign(metric=df['dataset_count'] * df['NES_mean'])
      .sort_values(by='metric', ascending=False)
)
df.head(20)

Unnamed: 0,pathway,dataset_count,NES_mean,metric
64,HP_ABNORMALITY_OF_URINE_CATECHOLAMINE_CONCENTR...,5,1.980936,9.904678
111,HP_PROMINENT_DIGIT_PAD,5,1.845298,9.226492
171,HP_OCULOGYRIC_CRISIS,5,1.838789,9.193945
24,HP_SIMPLIFIED_GYRAL_PATTERN,6,1.429893,8.579355
183,HP_BILATERAL_TONIC_CLONIC_SEIZURE,5,1.638029,8.190145
311,HP_EPILEPTIC_ENCEPHALOPATHY,4,1.970116,7.880465
6,HP_SLOPING_FOREHEAD,6,1.269897,7.61938
243,HP_TONIC_SEIZURE,4,1.863954,7.455817
321,HP_ABSENT_THUMB,4,1.858084,7.432337
227,HP_NON_MOTOR_SEIZURE,4,1.787961,7.151844
