<a href="https://polly.elucidata.io/manage/workspaces?action=open_polly_notebook&amp;source=github&amp;path=ElucidataInc%2Fpolly-python%2Fblob%2Fmain%2Fsearch_across_all_omixatlas.ipynb&amp;kernel=elucidata%2FPython+3&amp;machine=medium" target="_parent"><img alt="Open in Polly" src="https://elucidatainc.github.io/PublicAssets/open_polly.svg"/></a>


# Prototype polly-python Notebook with examples showing how to query data across all OmixAtlas

For internal use only

Instructions:
1. Please run all code cells one by one.
2. Enter an SQL query in the example cell to search for the required datasets.

For any support or feedback, kindly reach out to either pawan.verma@elucidata.io or yogesh.lakhotia@elucidata.io


In [2]:
# please do not modify
from IPython.display import display_html
def restartkernel():
    """Display a raw HTML <script> tag that calls Jupyter.notebook.kernel.restart()."""
    restart_script = "<script>Jupyter.notebook.kernel.restart()</script>"
    display_html(restart_script, raw=True)

## Install polly-python and joblib

In [1]:
# Install polly-python and joblib with pip3 (run through sudo).
# NOTE(review): versions are not pinned; the recorded output of this cell shows
# polly-python 0.0.8 and joblib 1.1.0 -- consider pinning for reproducibility.
!sudo pip3 install polly-python joblib

Collecting polly-python
  Downloading https://files.pythonhosted.org/packages/4b/01/053ce8c72b2dc2f6a896a7911eaa516331219e6c16493621c561b6089918/polly_python-0.0.8-py3-none-any.whl
Collecting joblib
[?25l  Downloading https://files.pythonhosted.org/packages/3e/d5/0163eb0cfa0b673aa4fe1cd3ea9d8a81ea0f32e50807b0c295871e4aab2e/joblib-1.1.0-py2.py3-none-any.whl (306kB)
[K     |################################| 307kB 6.7MB/s eta 0:00:01
[?25hCollecting python-magic==0.4.24 (from polly-python)
  Downloading https://files.pythonhosted.org/packages/d3/99/c89223c6547df268596899334ee77b3051f606077317023617b1c43162fb/python_magic-0.4.24-py2.py3-none-any.whl
Collecting boto3>=1.17.73 (from polly-python)
[?25l  Downloading https://files.pythonhosted.org/packages/21/9a/37d4bf80e6492eee474b71dc70c284598ab60c1fe9cffc0e6d3b57e8efec/boto3-1.21.18-py3-none-any.whl (132kB)
[K     |################################| 133kB 99.6MB/s eta 0:00:01
[?25hCollecting certifi==2021.5.30 (from polly-python)
[?25

  Found existing installation: idna 2.8
    Uninstalling idna-2.8:
      Successfully uninstalled idna-2.8
  Found existing installation: requests 2.25.0
    Uninstalling requests-2.25.0:
      Successfully uninstalled requests-2.25.0
  Found existing installation: pytz 2019.3
    Uninstalling pytz-2019.3:
      Successfully uninstalled pytz-2019.3
Successfully installed Deprecated-1.2.13 boto3-1.21.18 botocore-1.24.18 cached-property-1.5.2 certifi-2021.5.30 chardet-4.0.0 cmapPy-4.0.1 elucidatacloudpathlib-0.6.6 h5py-3.1.0 idna-2.10 joblib-1.1.0 polly-python-0.0.8 postpy2-0.0.6 python-magic-0.4.24 pytz-2021.1 requests-2.25.1 retrying-1.3.3 s3transfer-0.5.2 six-1.16.0 urllib3-1.26.6 wrapt-1.14.0
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
# Restart the kernel after the install step -- presumably so the newly installed
# packages are picked up by the imports further down; confirm.
restartkernel() #Pause for a few seconds before the kernel is refreshed

In [1]:
# please do not modify
# Emits a JavaScript snippet that asks the kernel to execute
# `url = '<window.location>'`, i.e. it defines a Python variable named `url`
# holding the address of the page the notebook is open in.
# NOTE(review): `url` is not read by any cell visible here -- confirm who consumes it.
from IPython.display import HTML
HTML('''<script type="text/javascript"> Jupyter.notebook.kernel.execute("url = '" + window.location + "'", {}, {}); </script>''')

In [4]:
from polly.omixatlas import OmixAtlas
from polly.auth import Polly
from joblib import Parallel, delayed
import pandas as pd
import time
import re

## Authenticate your Polly session using the token

In [5]:
import os  # required for os.environ below

# Read the refresh token from the environment (KeyError if the variable is unset)
# and use it to authenticate the Polly session.
AUTH_TOKEN = os.environ['POLLY_REFRESH_TOKEN']
Polly.auth(AUTH_TOKEN)

In [19]:
# Module-level OmixAtlas client; query_oa() calls omixatlas.query_metadata() on it.
omixatlas = OmixAtlas()

## Defining functions for searching across all OmixAtlases (OA)

In [20]:
def query_split(query):
    """
    Split an SQL query that targets several indexes into one query per index.

    Two query shapes are handled:
    * Queries joined with UNION / UNION ALL are split at the UNION keyword.
    * Queries with a comma separated list of indexes after FROM
      (e.g. "SELECT * FROM a.datasets, b.datasets WHERE ...") are expanded into
      one query per index; everything before and after the index list is
      repeated unchanged in every generated query.

    Parameters:
    query (str): input SQL query to search across any given omixatlases

    Returns:
    q_list (list): list of SQL queries where each query queries on a single index only.

    Raises:
    ValueError: if the query has no UNION and no "<name>.<table>s" index after FROM.
    """
    # Replace line breaks with a space (not an empty string) so that tokens on
    # adjacent lines, e.g. "a.datasets\nWHERE", are not glued together.
    query = query.replace("\n", " ")

    # Split on UNION only when it is a whole word (so a value such as 'Nonunion'
    # does not trigger a split) and swallow an optional ALL so that
    # "UNION ALL" does not leave a stray "ALL" at the start of the next query.
    union_parts = re.split(r"\bunion(?:\s+all)?\b", query, flags=re.IGNORECASE)
    if len(union_parts) > 1:
        return [part.strip() for part in union_parts]

    # Capture the comma separated index list that follows FROM, whatever its case.
    # The greedy ".*" extends the match up to the last ".<letters>s" in the query.
    index_match = re.search(r"\bfrom\s+(.*\.[a-z]*s)", query, re.IGNORECASE)
    if index_match is None:
        raise ValueError(
            "Could not find any index of the form '<name>.<table>s' after FROM in the SQL query"
        )

    # Slice by match position rather than str.split so the FROM keyword stays in
    # the prefix regardless of its case and repeated index names cannot confuse it.
    prefix = query[:index_match.start(1)]
    suffix = query[index_match.end(1):]
    index_list = [index.strip() for index in index_match.group(1).split(",")]

    return [f"{prefix}{index}{suffix}" for index in index_list]

def query_oa(query, version):
    """
    Run a single SQL query through the module-level OmixAtlas client.

    Parameters:
    query (string): one SQL query to execute
    version (string): The API version of data infra on which to query

    Returns:
    all_df (dataframe): the value returned by omixatlas.query_metadata for this query.
    """
    return omixatlas.query_metadata(query, query_api_version=version)
    

def parallel_query(query_list, version, n_jobs=4):
    """
    Run every query in query_list through query_oa using a thread-preferring
    joblib pool and concatenate the DataFrames that come back.

    Parameters:
    query_list (list): list of input SQL queries where each query is used to search across a single omixatlas
    version (string): The API version of data infra on which to query
    n_jobs (int): number of joblib workers to use (defaults to 4)

    Returns:
    df (dataframe): concatenation of all DataFrame results; an empty DataFrame
        when there are no queries or none of them returned a DataFrame.
    """
    # Nothing to run: return early instead of starting workers.
    if not query_list:
        return pd.DataFrame()

    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(query_oa)(query, version) for query in query_list
    )

    # Keep only real DataFrames; anything else returned by query_oa is ignored.
    frames = [result for result in results if isinstance(result, pd.DataFrame)]

    # pd.concat raises ValueError on an empty list, so handle that case explicitly.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

def empty_df():
    """
    Build the placeholder dataframe shown when a query yields no data.

    Returns:
    empty_df (dataframe): a one-row dataframe (index 0) whose single column
        "Message" holds the text "No Data to show!".
    """
    # Built directly: DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame({"Message": ["No Data to show!"]}, index=[0])

def query_all_oa(sql, api_ver):
    """
    Entry point for the cross omixatlas querying app.

    The query is first executed as-is. If that raises an error whose repr
    contains 'Different mappings' or 'SYNTAX_ERROR', the query is split into one
    query per index (query_split) and the pieces are run in parallel
    (parallel_query). Any other error, or an error during the fallback itself,
    is reported through `status` instead of being raised.

    Parameters:
    sql (string): Input SQL query
    api_ver (string): The API version of data infra on which to query

    Returns:
    result_df (dataframe): DataFrame containing dataset level metadata of the resulting datasets.
    status (string): String containing messages for the user
    time_elapsed (string): Execution wall-time in seconds
    """

    start_time = time.time()
    result_df = pd.DataFrame()

    try:
        result_df = query_oa(sql, api_ver)
        if result_df.empty:
            status = 'Query successful, No datasets were returned'
        else:
            status = 'Query Successful'

    except Exception as e:
        error_text = repr(e)

        if 'Different mappings' in error_text or 'SYNTAX_ERROR' in error_text:
            # The fallback runs inside an exception handler; guard it so that a
            # failure while splitting or re-querying is reported via `status`
            # rather than escaping and breaking the three-value return contract.
            try:
                query_list = query_split(sql)
                result_df = parallel_query(query_list, api_ver)
            except Exception as fallback_error:
                status = repr(fallback_error)
                result_df = empty_df()
            else:
                if result_df.empty:
                    status = 'Query successful, No datasets were returned'
                    result_df = empty_df()
                else:
                    status = 'Query Successful, but caught an Exception with message: One or more indexes in SQL query cannot be concatenated due to reasons unknown. The RCA is yet to be found'
        else:
            status = error_text
            result_df = empty_df()

    time_elapsed = str("Elapsed time = --- %s seconds ---" % (time.time() - start_time))
    return (result_df, status, time_elapsed)

## Example query

In [52]:
# Example: search four dataset indexes in one statement. The query has no
# placeholders, so a plain (non-f) string is used.
query = """SELECT * from geo.datasets, liveromix_atlas.datasets, pcd.datasets, metabolomics.datasets 
            WHERE disease = 'Carcinoma, Hepatocellular'"""
version = 'v1'

result, status, wall_time = query_all_oa(query, version)

print(wall_time)
print(status)
result

Query execution succeeded
Query execution succeeded
Query execution succeeded
Query execution succeeded
Fetched 16 rows
Fetched 100 rows
Fetched 100 rows
Fetched 100 rows
Elapsed time = --- 12.672008752822876 seconds ---
Query Successful, but caught an Exception with message: One or more indexes in SQL query cannot be concatenated due to reasons unknown. The RCA is yet to be found


Unnamed: 0,file_type,disease,tissue,kw_cell_type,kw_drug,dataset_id,organism,dataset_source,platform,description,...,kw_analysis_type,kw_sample_source,data_required,kw_curated_genetic_mod_type,kw_curated_modified_gene,author,abstract,type,kw_source,kw_study_id
0,gct,"[Breast Neoplasms, Carcinoma, Hepatocellular]",[None],[None],[None],GSE101685_GPL570,[Homo sapiens],GEO,Microarray,Gene expression profile of hepatocellular carc...,...,,,,,,,,,,
1,gct,"[Carcinoma, Hepatocellular]",[liver],[None],[None],GSE101728_GPL21047,[Homo sapiens],GEO,Microarray,Expression profiling of lncRNA and mRNA in Hep...,...,,,,,,,,,,
2,,"[Carcinoma, Hepatocellular, End Stage Liver Di...",[liver],"[T cell, hepatocyte, bone marrow cell, splenoc...",[None],GSE103205_GPL2872,[Mus musculus],GEO,Microarray,Gene expression profiling of hepatocarcinogene...,...,,,,,,,,,,
3,,"[Carcinoma, Hepatocellular]",[liver],[None],[None],GSE102759_GPL20115,[Homo sapiens],GEO,Microarray,Hepatocellular carcinoma: recurrence after liv...,...,,,,,,,,,,
4,gct,"[Carcinoma, Hepatocellular]",[None],[None],[None],GSE105067_GPL10558,[Homo sapiens],GEO,Microarray,Integrative Epigenetic Analysis Reveals Therap...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11,,"[Neoplasms, Carcinoma, Hepatocellular, Cachexi...",[blood plasma],[none],[none],MTBLS105_m_mtbls105_GC_Q_mass_spectrometry,[Homo sapiens],Metabolights,,This study evaluates changes in metabolite lev...,...,mass spectrometry,blood plasma,,,,,,,,
12,,"[Fibrosis, Inflammation, Neoplasms, Carcinoma,...","[liver, urine, serum]",[none],[none],MTBLS225_m_prj129_f_tcdd_28rddr_liv-ser_metabo...,[Mus musculus],Metabolights,,TCDD is an environmental contaminant that elic...,...,mass spectrometry,liver,,,,,,,,
13,,"[Fibrosis, Inflammation, Neoplasms, Carcinoma,...","[liver, urine, serum]",[none],[none],MTBLS225_m_prj129_f_tcdd_28rddr_liv-ser_metabo...,[Mus musculus],Metabolights,,TCDD is an environmental contaminant that elic...,...,mass spectrometry,liver,,,,,,,,
14,,"[Fibrosis, Inflammation, Neoplasms, Carcinoma,...","[liver, urine, serum]",[none],[none],MTBLS225_m_mtbls225_serum_mass_spectrometry,[Mus musculus],Metabolights,,TCDD is an environmental contaminant that elic...,...,mass spectrometry,liver,,,,,,,,
