# Prototype notebook: Querying data across all OmixAtlases
_For internal use only_

### Instructions:

1. Please run all code cells one by one.
2. The user is required to enter an SQL query to search for the required datasets.

## Install polly-python

In [None]:
!pip3 install polly-python --user

In [1]:
import os
import re
import sys
import time
import traceback

import pandas as pd
from polly.omixatlas import OmixAtlas

## Authentication with Polly

In [2]:
# Read the Polly refresh token from the environment so the secret is never
# stored in the notebook; fail early with a clear message when it is missing
# or empty instead of passing an unusable token to the client.
polly_refresh_token = os.environ.get('POLLY_REFRESH_TOKEN')
if not polly_refresh_token:
    raise RuntimeError(
        "Environment variable POLLY_REFRESH_TOKEN is not set; "
        "set it before running this cell."
    )
repo_client = OmixAtlas(polly_refresh_token)

## Get all OmixAtlas

In [3]:
# Ask the client for every OmixAtlas and tabulate the 'data' part of the response.
omixatlas_response = repo_client.get_all_omixatlas()
all_omix_atlas = pd.DataFrame.from_dict(omixatlas_response['data'])

In [4]:
# Display the table of OmixAtlases built in the previous cell.
all_omix_atlas

Unnamed: 0,repo_name,repo_id,indexes,diseases,organisms,sources,datatypes,dataset_count,disease_count,tissue_count,organism_count,cell_line_count,cell_type_count,drug_count,data_type_count,data_source_count,sample_count,normal_sample_count
0,gdc,1623221686703,"{'gct_metadata': 'gdc_gct_metadata', 'h5ad_met...","[leukemia, myeloid, acute, precursor t-cell ly...",[homo sapiens],[gdc],"[mirna expression, transcriptomics, copy numbe...",9869,75,41,1,1,1,1,3,1,5353,0
1,geo,9,"{'gct_metadata': 'geo_gct_metadata', 'h5ad_met...","[normal, neoplasms, breast neoplasms, obesity,...","[homo sapiens, mus musculus, rattus norvegicus...",[geo],[transcriptomics],10000,3028,980,68,4300,780,1379,1,1,1279490,839985
2,enterprise_atlas,1638441282192,{'gct_metadata': 'enterprise_atlas_gct_metadat...,"[breast neoplasms, endometrial neoplasms, endo...","[homo sapiens, mus musculus]","[geo, proprietary]","[transcriptomics, single cell, mutation, mirna]",176,35,25,2,96,40,21,4,2,8952,3927
3,liveromix_atlas,1615965444377,{'gct_metadata': 'liveromix_atlas_gct_metadata...,"[normal, carcinoma, hepatocellular, obesity, n...","[homo sapiens, mus musculus, rattus norvegicus...","[geo, lincs, tcga, metabolomics workbench, met...","[transcriptomics, mutation, metabolomics, sing...",6761,752,60,22,347,310,4704,12,10,143535,110202
4,cbioportal,1623986995264,"{'gct_metadata': 'cbioportal_gct_metadata', 'h...","[prostatic neoplasms, carcinoma, ductal, breas...",[homo sapiens],[cbioportal],"[mutation, copy number variation, fusion, tran...",10000,330,262,1,1,1,1,5,1,58994,0
5,pcd,1622113130397,"{'gct_metadata': 'pcd_gct_metadata', 'h5ad_met...","[none, adenocarcinoma, carcinoma, squamous cel...",[homo sapiens],[pharmacodb],[drug response],10000,133,34,1,1399,1,749,1,1,653500,0
6,lincs,32,"{'gct_metadata': 'lincs_gct_metadata', 'h5ad_m...","[prostatic neoplasms, normal, colorectal neopl...",[homo sapiens],"[geo, geo, lincs]",[transcriptomics],10000,34,21,1,84,1,20347,1,2,1371945,0
7,metabolomics,23,"{'gct_metadata': 'metabolomics_gct_metadata', ...","[normal, neoplasms, diabetes mellitus, obesity...","[homo sapiens, mus musculus, rattus norvegicus...","[metabolomics workbench, metabolights, publica...","[metabolomics, lipidomics, single cell]",1777,214,134,184,24,23,67,3,4,90677,0
8,sc_data_lake,17,"{'gct_metadata': 'sc_data_lake_gct_metadata', ...","[normal, neoplasms, breast neoplasms, melanoma...","[mus musculus, homo sapiens, macaca fascicular...","[geo, humancellatlas, gene expression omnibus ...",[single cell],2915,520,414,5,69,406,80,1,6,207888,0


## Get all dataset level indexes

In [5]:
# Each entry of the 'indexes' column is a mapping of index kind -> index name;
# keep the full mappings and pull out the dataset-level ('files') index of each atlas.
all_indexes = [*all_omix_atlas['indexes']]
dataset_index = [index_map['files'] for index_map in all_indexes]

## Searching across all OmixAtlases (OA)

In [7]:
def query_split(query):

    """
    Split one multi-index SQL query into one query per index.

    The comma-separated index names listed after FROM are located, and the
    query is rebuilt once per index with everything before and after the
    FROM list left unchanged.

    Parameters:
    query (str): input SQL query whose FROM clause lists one or more indexes
        separated by commas. The list may be followed by WHERE, GROUP BY,
        ORDER BY, HAVING, LIMIT, or nothing at all.

    Returns:
    q_list (list): list of SQL queries, in the order the indexes were listed,
        where each query queries on a single index only.

    Raises:
    ValueError: if no FROM clause naming at least one index can be found.

    """

    # Lazily capture the index list up to the FIRST clause keyword (or the end
    # of the query). DOTALL lets the FROM list span several lines; the lazy
    # match keeps a "where" inside a later string literal from being used as
    # the terminator.
    from_clause = re.search(
        r"\bfrom\s+(.+?)"
        r"(?=\s+(?:where|group\s+by|order\s+by|having|limit)\b|\s*;?\s*$)",
        query,
        re.IGNORECASE | re.DOTALL,
    )
    if from_clause is None:
        raise ValueError(f"Could not find a FROM clause in query: {query!r}")

    index_list = [name.strip() for name in from_clause.group(1).split(',')]
    index_list = [name for name in index_list if name]
    if not index_list:
        raise ValueError(f"No index names found in FROM clause of query: {query!r}")

    # Slice by match position rather than splitting the query on the index
    # names: an index name that also appears elsewhere in the query (e.g. in
    # the WHERE clause) must not change where the query is cut.
    head = query[:from_clause.start(1)]
    tail = query[from_clause.end(1):]

    q_list = [f"{head}{index}{tail}" for index in index_list]

    return q_list
    

def for_loop(query_list):

    """
    Run each SQL query in turn and stack the results into a single table.

    Parameters:
    query_list (list): list of input SQL queries; each one is passed as-is to
        ``repo_client.query_metadata``.

    Returns:
    df_result (dataframe): Row-wise concatenation of every query result that
        is not ``None``. Row labels of the individual results are kept, so
        labels may repeat across results. An empty DataFrame is returned when
        ``query_list`` is empty or every query returned ``None``.

    """

    df_list = []

    for query in query_list:
        single_df = repo_client.query_metadata(query)
        if single_df is not None:
            df_list.append(single_df)

    # pd.concat raises ValueError on an empty list, so return an empty table
    # instead of crashing when there is nothing to merge.
    if not df_list:
        return pd.DataFrame()

    df_result = pd.concat(df_list) #Merge all results into a single table

    return df_result

def multi_index(query):

    """
    Run a single SQL query, unchanged, through the shared Polly client.

    Parameters:
    query (str): input SQL query, passed as-is to ``repo_client.query_metadata``.

    Returns:
    Whatever ``repo_client.query_metadata`` returns for the query
    (presumably a DataFrame or None -- confirm against the polly-python docs).

    """

    return repo_client.query_metadata(query)

## Input SQL query

#### List of available dataset indexes

1. gdc_files
2. geo_files
3. enterprise_atlas_files
4. liveromix_atlas_files
5. cbioportal_files
6. pcd_files
7. lincs_files
8. metabolomics_files
9. sc_data_lake_files

In [8]:
if __name__ == "__main__":

    sql = "SELECT * from pcd_files, geo_files WHERE disease = 'Carcinoma, Hepatocellular' LIMIT 2000"

    # perf_counter is a monotonic clock meant for measuring elapsed time;
    # unlike time.time() it is not affected by system clock adjustments.
    start_time = time.perf_counter()
    try:
        # First try the query exactly as written, across all listed indexes at once.
        result_df = multi_index(sql)
    except Exception as e:
        # The combined query failed: report why, then fall back to running one
        # query per index and merging the individual results.
        print('Caught this Exception: ' + repr(e))
        query_list = query_split(sql)
        result_df = for_loop(query_list)

    print("--- %s seconds ---" % (time.perf_counter() - start_time))

Caught this error: VerificationException()
{'query_used': "SELECT * from pcd_files WHERE disease = 'Carcinoma, Hepatocellular' LIMIT 2000", 'message': 'Showing 1 - 603 of 603 matching results'}
{'query_used': "SELECT * from geo_files WHERE disease = 'Carcinoma, Hepatocellular' LIMIT 2000", 'message': 'Showing 1 - 942 of 942 matching results'}
--- 45.4629647731781 seconds ---


In [25]:
# Display the query results produced by the previous cell.
result_df

Unnamed: 0,dataset_source,dataset_id,description,kw_cell_line,organism,year,operation,kw_source,is_public,data_repository,...,kw_strain,file_type,source_process,processing,drug,author,abstract,overall_design,summary,type
0,PharmacoDB,17-AAG_haematopoietic_and_lymphoid_tissue_GDSC...,dose response data for 17-AAG drug and cell li...,"[BC-1, Jurkat, GR-ST, AMO1, Ku812, K-562, WSU-...",Homo sapiens,2018,"{'is_normalized': 'false', 'normalized_type': ...",GDSC1000,true,PharmacoDB,...,,,,,,,,,,
1,PharmacoDB,16-beta-bromoandrosterone_haematopoietic_and_l...,dose response data for 16-beta-bromoandrostero...,"[Jurkat, MHH-CALL-4, JK-1, HPB-ALL, Ku812, KMS...",Homo sapiens,2018,"{'is_normalized': 'false', 'normalized_type': ...",CTRPv2,true,PharmacoDB,...,,,,,,,,,,
2,PharmacoDB,5-FU_haematopoietic_and_lymphoid_tissue_CTRPv2...,dose response data for 5-FU drug and cell line...,"[Jurkat, MHH-CALL-4, JK-1, HPB-ALL, Ku812, KMS...",Homo sapiens,2018,"{'is_normalized': 'false', 'normalized_type': ...",CTRPv2,true,PharmacoDB,...,,,,,,,,,,
3,PharmacoDB,ABT-199_haematopoietic_and_lymphoid_tissue_CTR...,dose response data for ABT-199 drug and cell l...,"[MHH-CALL-4, JK-1, HPB-ALL, AMO1, ME1, RPMI-82...",Homo sapiens,2018,"{'is_normalized': 'false', 'normalized_type': ...",CTRPv2,true,PharmacoDB,...,,,,,,,,,,
4,PharmacoDB,AP-24534_haematopoietic_and_lymphoid_tissue_GD...,dose response data for AP-24534 drug and cell ...,"[SCC-3, BC-1, GR-ST, AMO1, Ku812, K-562, WSU-N...",Homo sapiens,2018,"{'is_normalized': 'false', 'normalized_type': ...",GDSC1000,true,PharmacoDB,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,GEO,GSE82177_GPL11154,Human liver RNA-seq data corresponding to unin...,[None],[Homo sapiens],2018,"{'is_normalized': 'true', 'batch_corrected_var...",,true,geo,...,[None],,,,,,,,,
938,GEO,GSE84186_GPL11154,Single-nucleotide-resolution mapping of HBV pr...,"[Hep-G2/2.2.15, HepAD38]",[Homo sapiens],2018,"{'is_normalized': 'true', 'batch_corrected_var...",,true,geo,...,[None],,,,,,,,,
939,GEO,GSE87240_GPL11154,S-adenosyl-methionine treatment selectively bl...,[None],[Homo sapiens],2018,"{'is_normalized': 'true', 'batch_corrected_var...",,true,geo,...,[None],,,,,,,,,
940,GEO,GSE85427_GPL13112,Insulin-like Growth Factor Binding Protein-7 (...,[None],[Mus musculus],Oct 01 2019,"{'is_normalized': 'true', 'batch_corrected_var...",,true,geo,...,[C57BL/6],,,,,"Yidong,,Chen",Insulin like growth factor binding protein-7 (...,Three replicates of RNAs derived from WT and I...,Insulin like growth factor binding protein-7 (...,Expression profiling by high throughput sequen...
