Skip to content

Commit

Permalink
Merge pull request #41 from DLHub-Argonne/start-helpers
Browse files Browse the repository at this point in the history
Add search functionality to Client
  • Loading branch information
WardLT committed Feb 21, 2019
2 parents 22d7fe8 + 5d31c6b commit 6c30c8c
Show file tree
Hide file tree
Showing 11 changed files with 610 additions and 83 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ env:
- KERAS_VERSION=2.1.3
- KERAS_VERSION=2.2.2
before_install:
- openssl aes-256-cbc -K $encrypted_2be200cd136c_key -iv $encrypted_2be200cd136c_iv
- openssl aes-256-cbc -K $encrypted_ac5a6dd41c09_key -iv $encrypted_ac5a6dd41c09_iv
-in test-files/DLHub_Client_tokens.json.enc -out test-files/DLHub_Client_tokens.json
-d
- mkdir -p ~/.dlhub/credentials
Expand Down
256 changes: 191 additions & 65 deletions dlhub_sdk/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@
import os
from tempfile import mkstemp

from globus_sdk.base import BaseClient, slash_join
import jsonpickle
import mdf_forge
import mdf_toolbox
import pandas as pd
import requests
from globus_sdk.base import BaseClient, slash_join
from mdf_toolbox import login, logout
from mdf_toolbox.search_helper import SEARCH_LIMIT

from dlhub_sdk.config import DLHUB_SERVICE_ADDRESS, CLIENT_ID, SEARCH_INDEX
from dlhub_sdk.config import DLHUB_SERVICE_ADDRESS, CLIENT_ID
from dlhub_sdk.utils.schemas import validate_against_dlhub_schema
from dlhub_sdk.utils.search import DLHubSearchHelper, get_method_details, filter_latest


# Directory for authentication tokens
_token_dir = os.path.expanduser("~/.dlhub/credentials")


class DLHubClient(BaseClient):
Expand All @@ -24,7 +28,7 @@ class DLHubClient(BaseClient):
and, if not, get new credentials and save them for later use.
For cases where disk access is unacceptable, you can create the client by creating an authorizer
following the
`tutorial for the Globus SDK <https://globus-sdk-python.readthedocs.io/en/stable/tutorial/>_
`tutorial for the Globus SDK <https://globus-sdk-python.readthedocs.io/en/stable/tutorial/>`_
and providing that authorizer to the initializer (e.g., ``DLHubClient(auth)``)"""

def __init__(self, dlh_authorizer=None, search_client=None, http_timeout=None,
Expand All @@ -46,112 +50,130 @@ def __init__(self, dlh_authorizer=None, search_client=None, http_timeout=None,
Keyword arguments are the same as for BaseClient.
"""
if force_login or not dlh_authorizer or not search_client:
auth_res = mdf_toolbox.login(services=["search", "dlhub"], app_name="DLHub_Client",
client_id=CLIENT_ID, clear_old_tokens=force_login,
token_dir=os.path.expanduser("~/.dlhub/credentials"))

auth_res = login(services=["search", "dlhub"], app_name="DLHub_Client",
client_id=CLIENT_ID, clear_old_tokens=force_login,
token_dir=_token_dir)
dlh_authorizer = auth_res["dlhub"]
search_client = auth_res["search"]
# Unused variable, will be used in future
self.__forge_client = mdf_forge.Forge(index=SEARCH_INDEX, services=[], # noqa: F841
clients={"search": search_client})
self._search_client = auth_res["search"]

super(DLHubClient, self).__init__("DLHub", environment='dlhub', authorizer=dlh_authorizer,
http_timeout=http_timeout, base_url=DLHUB_SERVICE_ADDRESS,
**kwargs)

@classmethod
def login(cls, force=False, **kwargs):
"""Create a DLHubClient with credentials
def logout(self):
"""Remove credentials from your local system"""
logout()

Either uses the credentials already saved on the system or, if no credentials are present
or ``force=True``, runs a login procedure to get new credentials
@property
def query(self):
"""Access a query of the DLHub Search repository"""
return DLHubSearchHelper(search_client=self._search_client)

Keyword arguments are passed to the DLHubClient constructor
def get_username(self):
"""Get the username associated with the current credentials"""

Args:
force (bool): Whether to force a login to get new credentials.
Returns:
(DLHubClient) A client complete with proper credentials.
"""
auth_res = mdf_toolbox.login(services=["search", "dlhub"], app_name="DLHub_Client",
token_dir=os.path.expanduser("~/.dlhub/credentials"),
clear_old_tokens=force)
return DLHubClient(dlh_authorizer=auth_res["dlhub"], search_client=auth_res["search"],
**kwargs)
res = self.get('/namespaces')
return res.data['namespace']

def _get_servables(self):
def get_servables(self, only_latest_version=True):
"""Get all of the servables available in the service
Args:
only_latest_version (bool): Whether to only return the latest version of each servable
Returns:
(pd.DataFrame) Summary of all the models available in the service
"""

r = self.get("servables")
return pd.DataFrame(r.data)

def get_servables(self):
"""Get all of the servables available in the service
This is for backwards compatibility. Previous demos relied on this function
prior to it being made an internal function.
Returns:
(pd.DataFrame) Summary of all the models available in the service
([list]) Complete metadata for all servables found in DLHub
"""

return self._get_servables()
# Get all of the servables
results, info = self.query.match_field('dlhub.type', 'servable')\
.add_sort('dlhub.owner', ascending=True).add_sort('dlhub.name', ascending=False)\
.add_sort('dlhub.publication_date', ascending=False).search(info=True)
if info['total_query_matches'] > SEARCH_LIMIT:
raise RuntimeError('DLHub contains more servables than we can return in one entry. '
'DLHub SDK needs to be updated.')

if only_latest_version:
# Sort out only the most recent versions (they come first in the sorted list)
names = set()
output = []
for r in results:
name = r['dlhub']['shorthand_name']
if name not in names:
names.add(name)
output.append(r)
results = output

return results

def list_servables(self):
"""Get a list of the servables available in the service
Returns:
(pd.DataFrame) Summary of all the models available in the service
[string]: List of all servable names in username/servable_name format
"""
df_tmp = self._get_servables()
return df_tmp[['dlhub_name']]

servables = self.get_servables(only_latest_version=True)
return [x['dlhub']['shorthand_name'] for x in servables]

def get_task_status(self, task_id):
"""Get the status of a DLHub task.
Args:
task_id (string): UUID of the task
Returns:
(dict) status block containing "status" key.
dict: status block containing "status" key.
"""

r = self.get("{task_id}/status".format(task_id=task_id))
return r.json()

def describe_servable(self, author, name):
"""
Get the description for a certain servable
def describe_servable(self, owner, name):
"""Get the description for a certain servable
Args:
author (string): Username of the owner of the servable
owner (string): Username of the owner of the servable
name (string): Name of the servable
Returns:
(pd.DataFrame) Summary of the servable
dict: Summary of the servable
"""

df_tmp = self._get_servables()
# Create a query for a single servable
query = self.query.match_servable(name)\
.match_owner(owner).add_sort("dlhub.publication_date", False)\
.search(limit=1)

# Downselect to more useful information
df_tmp = df_tmp[['name', 'description', 'input', 'output', 'author', 'status']]
# Raise error if servable is not found
if len(query) == 0:
raise AttributeError('No such servable: {}/{}'.format(owner, name))
return query[0]

# Get the desired servable
serv = df_tmp.query('name={name} AND author={author}'.format(name=name, author=author))
return serv.iloc[0]
def describe_methods(self, owner, name, method=None):
"""Get the description for the method(s) of a certain servable
Args:
owner (string): Username of the owner of the servable
name (string): Name of the servable
method (string): Optional: Name of the method
Returns:
dict: Description of a certain method if ``method`` provided, all methods
if the method name was not provided.
"""

metadata = self.describe_servable(owner, name)
return get_method_details(metadata, method)

def run(self, name, inputs, input_type='python'):
"""Invoke a DLHub servable
Args:
name (string): DLHub name of the model of the form <user>/<model_name>
name (string): DLHub name of the servable of the form <user>/<servable_name>
inputs: Data to be used as input to the function. Can be a string of file paths or URLs
input_type (string): How to send the data to DLHub. Can be "python" (which pickles
the data), "json" (which uses JSON to serialize the data), or "files" (which
sends the data as files).
Returns:
Reply from the service
Results of running the servable
"""
servable_path = 'servables/{name}/run'.format(name=name)

Expand Down Expand Up @@ -180,12 +202,12 @@ def publish_servable(self, model):
If this servable has not been published before, it will be assigned a unique identifier.
If it has been published before (DLHub detects if it has an identifier), then DLHub
will update the model to the new version.
will update the servable to the new version.
Args:
model (BaseMetadataModel): Servable to be submitted
Returns:
(string) Task ID of this submission, used for checking for success
(string): Task ID of this submission, used for checking for success
"""

# Get the metadata
Expand Down Expand Up @@ -232,7 +254,7 @@ def publish_repository(self, repository):
Args:
repository (string): Repository to publish
Returns:
(string) Task ID of this submission, used for checking for success
(string): Task ID of this submission, used for checking for success
"""

# Publish to DLHub
Expand All @@ -241,3 +263,107 @@ def publish_repository(self, repository):

task_id = response.data['task_id']
return task_id

def search(self, query, advanced=False, limit=None, only_latest=True):
    """Run a query against the DLHub servable library

    Unless ``advanced=True``, the query is used as a simple plaintext search of all
    servable metadata. With ``advanced=True``, the query may address any of the indexed
    metadata fields using the advanced syntax described in the
    `Globus Search documentation <https://docs.globus.org/api/search/search/#query_syntax>`_.

    Args:
        query (string): Query to be performed
        advanced (bool): Whether to perform an advanced query
        limit (int): Maximum number of entries to request from the search
        only_latest (bool): Whether to pass the matches through ``filter_latest`` so
            that only the latest version of each servable is returned
    Returns:
        ([dict]): All records matching the search query
    """
    matches = self.query.search(query, advanced=advanced, limit=limit)
    if not only_latest:
        return matches
    return filter_latest(matches)

def search_by_servable(self, servable_name=None, owner=None, version=None,
                       only_latest=True, limit=None, get_info=False):
    """Search by the ownership, name, or version of a servable

    At least one of ``servable_name``, ``owner``, or ``version`` must be provided.

    Args:
        servable_name (str): The name of the servable. **Default**: ``None``, to match
            all servable names.
        owner (str): The name of the owner of the servable. **Default**: ``None``,
            to match all owners.
        version (int): Model version, which corresponds to the date when the
            servable was published. **Default**: ``None``, to match all versions.
        only_latest (bool): When ``True``, will only return the latest version
            of each servable. When ``False``, will return all matching versions.
            **Default**: ``True``.
        limit (int): The maximum number of results to return.
            **Default:** ``None``, for no limit.
        get_info (bool): If ``False``, search will return a list of the results.
            If ``True``, search will return a tuple containing the results list
            and other information about the query.
            **Default:** ``False``.
    Returns:
        If ``get_info`` is ``False``, *list*: The search results.
        If ``get_info`` is ``True``, *tuple*: The search results,
        and a dictionary of query information.
    Raises:
        ValueError: If none of ``servable_name``, ``owner``, or ``version`` is provided.
    """
    if not (servable_name or owner or version):
        # The message names the actual keyword arguments of this method; the version
        # is only forwarded to the query helper under the name ``publication_date``.
        raise ValueError("One of 'servable_name', 'owner', or 'version' is required.")

    # Perform the query, always requesting the query information so that it
    # can be handed back when ``get_info`` is set
    results, info = self.query.match_servable(
        servable_name=servable_name, owner=owner, publication_date=version
    ).search(limit=limit, info=True)

    # Keep only the latest version of each servable
    if only_latest:
        results = filter_latest(results)

    return (results, info) if get_info else results

def search_by_authors(self, authors, match_all=True, limit=None, only_latest=True):
    """Find servables that list certain people as authors.

    The authors of a servable are not necessarily its owners: they are generally the
    people who developed the functionality behind it (e.g., the creators of the
    machine learning model used in a servable).
    To search by ownership instead, see :meth:`search_by_servable`

    Args:
        authors (str or list of str): The authors to match. Names must be in
            "Family Name, Given Name" format
        match_all (bool): If ``True``, will require all authors be on any results.
            If ``False``, will only require one author to be in results.
            **Default**: ``True``.
        limit (int): The maximum number of results to return.
            **Default:** ``None``, for no limit.
        only_latest (bool): When ``True``, will only return the latest version
            of each servable. When ``False``, will return all matching versions.
            **Default**: ``True``.
    Returns:
        [dict]: List of servables from the desired authors
    """
    author_query = self.query.match_authors(authors, match_all=match_all)
    found = author_query.search(limit=limit)
    if only_latest:
        found = filter_latest(found)
    return found

def search_by_related_doi(self, doi, limit=None, only_latest=True):
    """Find the servables associated with a certain publication

    Args:
        doi (string): DOI of related paper
        limit (int): Maximum number of results to return
        only_latest (bool): Whether to return only the most recent version of the model
    Returns:
        [dict]: List of servables from the requested paper
    """
    doi_query = self.query.match_doi(doi)
    found = doi_query.search(limit=limit)
    if only_latest:
        found = filter_latest(found)
    return found

0 comments on commit 6c30c8c

Please sign in to comment.