# bio.tools querying

## Overview

## Code

In [25]:
import sys
import json
import requests
import argparse
import yaml
import biotools_parse as bp
import pandas as pd

# The call to get all bio.tools entries through the api is: https://bio.tools/api/tool/?format=json.
# Unfortunately, the response is limited to n tools. To get the whole set, the next pages must be retrieve.
# use "next" in the response to get succesive entries.
 
base_call = "https://bio.tools/api/tool/?format=json"


def make_request(URL):
    try:
        response = requests.get(URL)
    except:
        print('Could not make the request')
        return
    else:
        response = json.loads(response.text)
        return(response)

def build_url(next_page, filters):
    call_template = "https://bio.tools/api/tool/?{attributes}{next_page}&format=json"
    if next_page:
        next_page = "&%s"%next_page
    else:
        next_page = ""
    attributes = '&'.join(filters)    
    url = call_template.format(next_page=next_page, attributes=attributes)
    return(url)


def get_all_pages(filters):
    res = []
    next_page = "page=1"
    print('Starting making the requests...')
    while next_page:
        response = make_request(build_url(next_page, filters))
        print("Requesting: " + build_url(next_page, filters), end='\r')
        res = res + response["list"]
        next_page = response["next"]
        if next_page:
            next_page = next_page[1:]
    print('\nRequests finished.')
    return(res)

def save_result(out_path, result):
    with open(out_path, 'w') as out:
        json.dump(result, out)
    print('Result saved as ' + out_path )
    

colnames_general = ['name', 'description', 'type', 'topic', 'input', 'output']
colnames_detailed = [ 'name', 'description', 'version', 'type', 'topic', 'links', 'publication', 'download', 'inst_instr', 'test', 'src', 'os', 'input', 'output', 'dependencies', 'documentation', 'license', 'termsUse', 'contribPolicy', 'authors', 'repository']
def results_to_table(result, colnames):
    tools = bp.biotoolsToolsGenerator(result).instances
    # Load features into table
    colnames_features = colnames
    df_dict = dict()
    for name in colnames_features:
        df_dict[name] = []

    for tool in tools:
        for field in colnames_features:
            df_dict[field].append(tool.__dict__.get(field))

    df_features = pd.DataFrame.from_dict(df_dict)
    return(df_features)


def results_to_table_detailed(result):
    tools = bp.biotoolsToolsGenerator(result).instances
    # Load features into table
    colnames_features = [ 'name', 'description', 'version', 'type', 'topic', 'links', 'publication', 'download', 'inst_instr', 'test', 'src', 'os', 'input', 'output', 'dependencies', 'documentation', 'license', 'termsUse', 'contribPolicy', 'authors', 'repository']
    df_dict = dict()
    for name in colnames_features:
        df_dict[name] = []

    for tool in tools:
        for field in colnames_features:
            df_dict[field].append(tool.__dict__.get(field))

    df_features = pd.DataFrame.from_dict(df_dict)
    return(df_features)


def parse_zooma_results(input_file):
    with open(input_file, 'r') as inp:
        zooma_terms_df = pd.read_csv(input_file)
        terms = zooma_terms_df['iri']
        terms = [term.split('http://edamontology.org/')[1] for term in terms if type(term) == str]
    return(terms)

def build_filter(term):
    filters_template =  {'topic': 'topicID="%s"', 'format':'dataFormatID="%s"', 'operation':'operationID="%s"', 'data':'dataTypeID="%s"'}
    if 'topic' in term:
        filters = filters_template['topic']%(term)
    elif 'data' in term:
        filters = filters_template['data']%(term)
    elif 'operation' in term:
        filters = filters_template['operation']%(term)
    elif 'format' in term:
        filters = filters_template['format']%(term)
    return(filters)     

def query_for_EDAM_terms(terms):
    '''
    Takes a list of EDAM terms and does a query for each
    Return a dictionary of results of the form: {<term>: <dataframe of results>}
    '''
    results = dict()
    results_detailed = dict()
    # Iteratre through terms
    for term in terms:
        # Avoid duplicated queries
        if term not in results.keys():
            # Building the filters using the EDAM terms
            filter_ = [build_filter(term)]
            # Do the query
            result = get_all_pages(filter_)
            # Put result in table
            result_df = results_to_table(result, colnames_general)
            result_detailed_df = results_to_table(result, colnames_detailed)
            # Put results table in dictionary with the results for the other terms
            results[term] = result_df
            results_detailed[term] = result_detailed_df
            return(results, results_detailed)

The attributes chosen to filter the query must be passed to the `get_all_pages` function in a list. Each attribute must be a string of the form `name=signalp`. This string will be put as a parameter in the query URL as it is. Please, follow the [bio.tools API reference](https://biotools.readthedocs.io/en/latest/api_reference.html) to construct this parameters successfully.

### Simple Query Example 1:

The following is an example of a query of tools annotted with the EDAM topic "Sequence composition, complexity and repeats" and with the word "game"  in their description. 

In [3]:
sample_filters = ['topic="Sequence composition, complexity and repeats"', 'description="game"']
filters = sample_filters
result = get_all_pages(filters)

Starting making the requests...
Requesting: https://bio.tools/api/tool/?topic="Sequence composition, complexity and repeats"&description="game"&page=1&format=json
Requests finished.


In [4]:
print(result)

[{'elixirCommunity': [], 'collectionID': ['EMBOSS'], 'accessibility': None, 'community': None, 'topic': [{'term': 'Sequence composition, complexity and repeats', 'uri': 'http://edamontology.org/topic_0157'}], 'owner': 'EMBOSS', 'cost': 'Free of charge', 'relation': [], 'download': [{'url': 'http://emboss.open-bio.org/html/adm/ch01s01.html', 'note': None, 'version': None, 'type': 'Source code'}, {'url': 'http://emboss.open-bio.org/html/adm/ch01s01.html', 'note': None, 'version': None, 'type': 'Binaries'}], 'validated': 1, 'publication': [{'doi': '10.1016/S0168-9525(00)02024-2', 'note': None, 'version': None, 'pmid': None, 'type': ['Primary'], 'pmcid': None, 'metadata': {'title': 'EMBOSS: The European Molecular Biology Open Software Suite', 'abstract': '', 'citationCount': 4828, 'authors': [{'name': 'Rice P.'}, {'name': 'Longden L.'}, {'name': 'Bleasby A.'}], 'date': '2000-06-01T00:00:00Z', 'journal': 'Trends in Genetics'}}, {'doi': '10.1017/CBO9781139151399', 'note': None, 'version': No

To save the rusult, use the `save_result(out_path, result)` funtion. `out_path` is the path where the `result` will be saved.

In [8]:
save_result('run_3.json', result)

Result saved as run_3.json


In [9]:
df_feaures = results_to_table(result)

###  Simple Query Example 2:

In [10]:
sample_filters = ['operation="Formatting"']
filters = sample_filters
result = get_all_pages(filters)

print("Number of tools found: %d"%len(result))

Starting making the requests...
Requesting: https://bio.tools/api/tool/?operation="Formatting"&page=39&format=json
Requests finished.
Number of tools found: 387


## Discovering tools with EDAM terms mapped with ZOOMA.

### Step 1: parsing zooma results

ZOOMA reslts must be in a csv format.The colunm names, in this order, must be: `keyword`,`iri`,`label` and `confidence`. An example of a valid: 

|keyword            |iri                                   |label                      |confidence|
|-------------------|--------------------------------------|---------------------------|----------|
|Ontology annotation|http://edamontology.org/operation_0226|Annotation                 |MEDIUM    |
|Ontology annotation|                                      |                           |          |
|semantic annotation|                                      |                           |          |
|semantic annotation|http://edamontology.org/operation_3778|Text annotation            |MEDIUM    |
|Text mining        |http://edamontology.org/topic_0218    |Natural language processing|GOOD      |

Raw: 

```
keyword,iri,label,confidence
Ontology annotation,http://edamontology.org/operation_0226,Annotation,MEDIUM
Ontology annotation,,,
semantic annotation,,,
semantic annotation,http://edamontology.org/operation_3778,Text annotation,MEDIUM
Text mining,http://edamontology.org/topic_0218,Natural language processing,GOOD
```

The function `parse_zooma_results` parses this csv and returns a list of terms suitable for the querying.

#### Example:
For a mapping of keyword related with ETL against EDAM, we got the ZOOMA results in the path `ontology_annotation_EDAM_curated_csv.csv`.

In [23]:
terms_file='ontology_annotation_EDAM_curated_csv.csv'
ETL_edam_terms = parse_zooma_results(terms_file)
print(ETL_edam_terms)

['operation_0226', 'operation_3778', 'topic_0218', 'topic_3474', 'topic_0218']


### Step 2: doing the query. 

The function `query_for_EDAM_terms` does the quering for the previously obtained list of EDAM terms. It resturns two kind of outputs: general and detailed. Each is a dictionary of the form: `{<term>: <dataframe of results>}`

#### Example
For the previous terms:

In [20]:
ETL_results_general, ETL_results_detailed  = query_for_EDAM_terms(ETL_edam_terms)

Starting making the requests...
Requesting: https://bio.tools/api/tool/?operationID="operation_0226"&page=14&format=json
Requests finished.


In [21]:
ETL_results_general

{'operation_0226':                       name                                        description  \
 0                   treeio  Base classes and functions for parsing and exp...   
 1    workflow4metabolomics  First fully open-source and collaborative onli...   
 2              globalfungi  User interface to data from high-throughput se...   
 3                    amtdb  This is the place where you can find an update...   
 4            causalbuilder  The causalBuilder is a prototype web applicati...   
 ..                     ...                                                ...   
 140             metdisease  App for Cytoscape, the bioinformatics network ...   
 141                   magi  Publicly available web application to explore ...   
 142                  npact  Computational and graphical representation too...   
 143                   cogs  Collections of Clusters of Orthologous Genes p...   
 144                  marky  It is able to annotate biomedical texts using ...  