<a href="https://polly.elucidata.io/manage/workspaces?action=open_polly_notebook&amp;source=github&amp;path=ElucidataInc%2Fpolly-python%2Fblob%2Fmain%2FDiscover%2Ffind_datasets_of_interest.ipynb&amp;kernel=elucidata%2FPython+3&amp;machine=small" target="_parent"><img alt="Open in Polly" src="https://elucidatainc.github.io/PublicAssets/open_polly.svg"/></a>


## Install polly-python

In [1]:
!sudo pip3 install polly-python 

Looking in indexes: https://pypi.org/simple, http://54.245.179.143:80/
Collecting polly-python
  Downloading https://files.pythonhosted.org/packages/9a/4b/fc1433a5f8f214d2a6ef54d593c0302db7b85b9c5fbc3f316418c5fb8669/polly_python-0.0.6-py3-none-any.whl
Collecting python-magic==0.4.24 (from polly-python)
  Downloading https://files.pythonhosted.org/packages/d3/99/c89223c6547df268596899334ee77b3051f606077317023617b1c43162fb/python_magic-0.4.24-py2.py3-none-any.whl
Collecting chardet==4.0.0 (from polly-python)
[?25l  Downloading https://files.pythonhosted.org/packages/19/c7/fa589626997dd07bd87d9269342ccb74b1720384a4d739a1872bd84fbe68/chardet-4.0.0-py2.py3-none-any.whl (178kB)
[K     |████████████████████████████████| 184kB 7.7MB/s eta 0:00:01
[?25hCollecting six==1.16.0 (from polly-python)
  Downloading https://files.pythonhosted.org/packages/d9/5a/e7c31adbe875f2abbb91bd84cf2dc52d792b5a01506781dbcf25c91daf11/six-1.16.0-py2.py3-none-any.whl
Collecting requests==2.25.1 (from polly-python)

## Import libraries

In [2]:
from polly.omixatlas import OmixAtlas
import pandas as pd
from multiprocessing import Pool

## Authentication with polly

In [3]:
omix_atlas = OmixAtlas(os.environ['POLLY_REFRESH_TOKEN'])

## Define helper functions

In [4]:
def get_matching_datasets_count(match_params):
    """
    Returns the count of matching datasets with given parameteres
    """
    keys = list(match_params)
    q = f"SELECT COUNT(*) FROM geo_files WHERE MATCH_QUERY({keys[0]}, '{params[keys[0]]}') "
    for key in keys:
        q = q + f"AND MATCH_QUERY({key},'{params[key]}') "
    counts = omix_atlas.query_metadata(q).get('aggregations').get('COUNT(*)').get('value')

    return counts

def construct_query(match_params, offset, limit):
    """
    Constructs a query string using the key-value pairs from the input dictionary
    """
    keys = list(match_params)
    q = f"SELECT * FROM geo_files WHERE MATCH_QUERY({keys[0]}, '{params[keys[0]]}') "
    for key in keys:
        q = q + f"AND MATCH_QUERY({key},'{params[key]}') "
    q = q + f" LIMIT {offset}, {limit}"
    return q

def query_small_batch(args):
    """
    Returns a data frame containing dataset level metadata for a small batch of datasets in GEO 
    based on the input search parameters
    """
    q  = construct_query(*args)
    df = omix_atlas.query_metadata(q)
    return df

def _construct_parameters_for_map(*args):
    """
    Returns a list to be used while passing arguments to pool map function
    """
    params = args[0]
    counts = args[1]
    limit  = args[2]
    total_queries = counts//limit + 1
    params_list = [params]*total_queries
    offset_list = list(range(0, counts, limit))
    limit_list  = [limit]*total_queries
    
    return list(zip(params_list, offset_list, limit_list))

def search_geo_datasets(match_params):
    """
    Returns a data frame containing dataset level metadata for all datasets in GEO based on the input search parameters
    """
    counts = get_matching_datasets_count(match_params)
    
    pool = Pool()
    limit = 2000
    query_results = pool.map(query_small_batch, _construct_parameters_for_map(match_params, counts, limit))
    df = pd.concat(query_results)
    
    return df

## Define parameters for search

In [8]:
params = {
    'disease':'neuroendocrine',
    'organism':'homo sapiens',
    'platform': 'RNASeq'
}
params

{'disease': 'neuroendocrine', 'organism': 'homo sapiens', 'platform': 'RNASeq'}

## Search datasets

In [9]:
df = search_geo_datasets(params)

{'query_used': "SELECT COUNT(*) FROM geo_files WHERE MATCH_QUERY(disease, 'neuroendocrine') AND MATCH_QUERY(disease,'neuroendocrine') AND MATCH_QUERY(organism,'homo sapiens') AND MATCH_QUERY(platform,'RNASeq')  LIMIT 100"}
{'query_used': "SELECT * FROM geo_files WHERE MATCH_QUERY(disease, 'neuroendocrine') AND MATCH_QUERY(disease,'neuroendocrine') AND MATCH_QUERY(organism,'homo sapiens') AND MATCH_QUERY(platform,'RNASeq')  LIMIT 0, 2000", 'message': 'Showing 1 - 9 of 9 matching results'}


In [10]:
print(df.shape)
df.head()

(9, 36)


Unnamed: 0,publication_name,tissue,dataset_source,description,organism,year,disease,operation,platform,dataset_id,...,kw_filetype,kw_region,kw_location,kw_timestamp,author,abstract,overall_design,summary,type,experimental_design
0,28864682,[None],GEO,mTOR kinase inhibition effectively decreases p...,[Homo sapiens],2018,"[Gastro-enteropancreatic neuroendocrine tumor,...","{'is_normalized': 'true', 'batch_corrected_var...",RNASeq,GSE102246_GPL15433,...,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642014810453,,,,,,
1,30287662,"[lung, prostate gland]",GEO,PARCB Project: Reprogramming normal human epit...,[Homo sapiens],Mar 27 2019,[Neuroendocrine Tumors],"{'is_normalized': 'true', 'batch_corrected_var...",RNASeq,GSE118207_GPL21290,...,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642015405173,"Owen,N,Witte",This SuperSeries is composed of the SubSeries ...,Refer to individual Series,This SuperSeries is composed of the SubSeries ...,Genome binding/occupancy profiling by high thr...,
2,29915428,"[small intestine, liver, lymph node, rectum, p...",GEO,Expression profile of Gastro-Entero-Pancreatic...,[Homo sapiens],2018,"[Gastro-enteropancreatic neuroendocrine tumor,...","{'is_normalized': 'true', 'batch_corrected_var...",RNASeq,GSE98894_GPL16791,...,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642017949425,,,,,,
3,30315258,"[lymph node, pancreatic islet, liver]",GEO,Transcriptome of human non-functional pancreat...,[Homo sapiens],2018,"[Gastro-enteropancreatic neuroendocrine tumor,...","{'is_normalized': 'true', 'batch_corrected_var...",RNASeq,GSE118014_GPL11154,...,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1642015434539,,,,,,
4,26460041,[prostate gland],GEO,A Basal Stem Cell Signature Identifies Aggress...,[Homo sapiens],2018,"[Prostatic Neoplasms, Carcinoma, Neuroendocrine]","{'is_normalized': 'true', 'batch_corrected_var...",RNASeq,GSE82071_GPL11154,...,gct,us-west-2,https://discover-prod-datalake-v1.s3-us-west-2...,1640216370001,,,,,,{'categorical_variables': {'cell_population': ...


In [63]:
df[['dataset_id','description','disease','tissue','total_num_samples']]

Unnamed: 0,dataset_id,description,disease,tissue,total_num_samples
0,GSE100215_GPL16791,Functional co-operativity of long-noncoding RN...,"[Neuroblastoma, Nijmegen Breakage Syndrome]",[None],11
1,GSE107707_GPL18573,Transcriptional changes associated with resist...,[Neuroblastoma],[None],24
2,GSE139442_GPL18573,miRNA analysis in depolarized neuroblastoma cells,[Neuroblastoma],[None],8
3,GSE136135_GPL20795,Transcriptomic analysis of neuroblastoma cells...,[Neuroblastoma],[None],2
4,GSE36350_GPL9115,"DNMT3B7, an aberrant DNMT3B isoform, suppresse...",[Neuroblastoma],[None],3
...,...,...,...,...,...
78,GSE53695_GPL16791,nELAVL HITS-CLIP in Alzheimer's Disease patients,"[Neuroblastoma, Alzheimer Disease]",[prefrontal cortex],10
79,GSE22785_GPL9128,"Integrated genome, transcriptome and translato...",[Neuroblastoma],"[adrenal gland, bone marrow, brain]",14
80,GSE90683_GPL16791,Heterogeneity of neuroblastoma cell identity r...,[Neuroblastoma],[neural crest],36
81,GSE91377_GPL16791,RNA-Seq comparative analysis of human neurobla...,[Neuroblastoma],[None],4
