Skip to content

Commit

Permalink
adds pagination to the search api and to the cli
Browse files Browse the repository at this point in the history
Fix #58
  • Loading branch information
Adrien Oyono committed Feb 12, 2019
1 parent 1751bf2 commit f441878
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 46 deletions.
100 changes: 80 additions & 20 deletions eodag/api/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
logger = logging.getLogger('eodag.core')


# pagination defaults
DEFAULT_PAGE = 1
DEFAULT_ITEMS_PER_PAGE = 10


class EODataAccessGateway(object):
"""An API for downloading a wide variety of geospatial products originating from different types of systems.
Expand Down Expand Up @@ -135,7 +140,8 @@ def available_providers(self):
"""Gives the list of the available providers"""
return tuple(self.providers_config.keys())

def search(self, product_type, **kwargs):
def search(self, product_type, page=DEFAULT_PAGE, max_results=0, items_per_page=DEFAULT_ITEMS_PER_PAGE,
start=0, stop=1, return_all=False, **kwargs):
"""Look for products matching criteria in known providers.
The default behaviour is to look for products in the provider with the highest priority. If the search gives
Expand All @@ -151,26 +157,50 @@ def search(self, product_type, **kwargs):
:param product_type: The product type to search
:type product_type: str or unicode
:param page: The page number to return (default: 1)
:type page: int
:param start: On which page to start on the providers (default: 0)
:type start: int
:param stop: On which page to stop on the providers (default: 1)
:type stop: int
:param max_results: The maximum number of results to return. If this number is reached for the preferred
provider, the search stops there. Otherwise, it continues on the other providers
supporting the requested product type, even if ``partial_support`` is de-activated for
the preferred provider and the subsequent providers (default: 0, meaning this constraint
is relaxed => search continuation is driven by the ``partial_support`` mechanism)
:type max_results: int
:param items_per_page: The number of results that must appear in one single page (default: 10)
:type items_per_page: int
:param dict kwargs: some other criteria that will be used to do the search
:returns: A collection of EO products matching the criteria
:rtype: :class:`~eodag.api.search_result.SearchResult`
:returns: A collection of EO products matching the criteria, the current returned page, the total number of
results found and the number of items per page
:rtype: tuple[:class:`~eodag.api.search_result.SearchResult`, int, int, int]
.. note::
The search interfaces, which are implemented as plugins, are required to return a list as a result of their
processing. This requirement is enforced here.
"""
results = SearchResult([])
# How many results are left to retrieve
rest = max_results
if items_per_page is None:
items_per_page = DEFAULT_ITEMS_PER_PAGE
if page is None:
page = DEFAULT_PAGE
for plugin in self._plugins_manager.get_search_plugins(product_type):
logger.info("Searching product type '%s' on provider: %s", product_type, plugin.provider)
logger.debug('Using plugin class for search: %s', plugin.__class__.__name__)
auth = self._plugins_manager.get_auth_plugin(product_type, plugin.provider)
try:
res = plugin.query(product_type, auth=auth, **kwargs)
# Configure the number of items per page
getattr(plugin.config, 'pagination', {})['items_per_page'] = items_per_page
res = plugin.query(product_type, auth=auth, max_results=rest, start=start, stop=stop, **kwargs)
logger.info("Found %s result(s) on provider '%s'", len(res), plugin.provider)
if not isinstance(res, list):
raise PluginImplementationError(
'The query function of a Search plugin must return a list of results, got {} '
'instead'.format(type(res)))

# Filter and attach to each eoproduct in the result the plugin capable of downloading it (this
# is done to enable the eo_product to download itself doing: eo_product.download())
# The filtering is done by keeping only those eo_products that intersects the search extent (if
Expand All @@ -182,29 +212,59 @@ def search(self, product_type, **kwargs):
if eo_product.search_intersection is not None:
download_plugin = self._plugins_manager.get_download_plugin(eo_product)
eo_product.register_downloader(download_plugin, auth)
results.append(eo_product)
# Decide if we should go on with the search using the other search plugins. This happens in
# 2 cases:
# 1. The currently used plugin is the preferred one (the first), and it returned no result
# 2. The currently used plugin supports the product_type partially
if not plugin.config.products[product_type].get('partial_support', False):
if plugin.provider == self.get_preferred_provider()[0] and len(res) == 0:
logger.info(
"No result from preferred provider: '%s'. Search continues on other providers "
"supporting the product type: '%s'", plugin.provider, product_type)
continue
break
logger.info(
"Detected partial support for product type '%s' on provider '%s'. Search continues on "
"other providers supporting it.", product_type, plugin.provider)

# Decide if we should go on with the search using the other search plugins.
# There are two ways of driving the continuation of the search:
# 1. Through the specification of a maximum number of results to be reached. In this case, the
# partial_support mechanism is de-activated and we crawl the providers until we reach the maximum
# or we run out of providers
# 2. Through the partial_support mechanism, when max_results == 0 (the default). In that case, the following
# rules make the search continue on other providers:
# 1. The currently used plugin is the preferred one (the first), and it returned no result
# 2. The currently used plugin supports the product_type partially
if max_results > 0:
current_results_count = len(res)
if current_results_count < rest:
logger.info("The requested results number is not yet reached: %s over %s => search continues",
len(results) + current_results_count, max_results)
new_rest = rest - current_results_count
if new_rest <= 0:
# Take only what remained to complete the request and stop searching
results.extend(res[:rest])
break
# modify the maximum number of results we will request from the other providers
rest = new_rest
# Take what we got and continue to the other providers
results.extend(res)
else:
if not plugin.config.products[product_type].get('partial_support', False):
if plugin.provider == self.get_preferred_provider()[0] and len(res) == 0:
logger.info(
"No result from preferred provider: '%s'. Search continues on other providers "
"supporting the product type: '%s'", plugin.provider, product_type)
continue
# Take the results and stop searching
results.extend(res)
break
logger.info(
"Detected partial support for product type '%s' on provider '%s'. Search continues on "
"other providers supporting it.", product_type, plugin.provider)
# Take the current results and keep searching
results.extend(res)
except Exception:
import traceback as tb
logger.info("No result from provider '%s' due to an error during search. Raise verbosity of log "
"messages for details", plugin.provider)
logger.info('Search continues on other providers supporting the product type')
logger.debug('Error while searching on interface %s:\n %s.', plugin, tb.format_exc())
logger.debug('Ignoring it')
return results
# Paginate the result
start = (page - 1) * items_per_page
stop = start + items_per_page
if return_all:
start, stop = 0, -2
items_per_page = len(results)
return SearchResult(results[start:stop]), page, len(results), items_per_page

def crunch(self, results, **kwargs):
"""Apply the filters given through the keyword arguments to the results
Expand Down
6 changes: 4 additions & 2 deletions eodag/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ def search_crunch(ctx, **kwargs):
'startTimeFromAscendingNode': None,
'completionTimeFromAscendingNode': None,
'cloudCover': kwargs.pop('cloudcover'),
'return_all': True,
}
if start_date:
criteria['startTimeFromAscendingNode'] = start_date.isoformat()
Expand All @@ -119,8 +120,9 @@ def search_crunch(ctx, **kwargs):
gateway = EODataAccessGateway(user_conf_file_path=conf_file)

# Search
results = gateway.search(producttype, **criteria)
click.echo("Found {} products with product type '{}': {}".format(len(results), producttype, results))
results, page, total, page_size = gateway.search(producttype, **criteria)
click.echo("Found {} overall products with product type '{}'".format(total, producttype))
click.echo("Returned page {} of {} products: {}".format(page, page_size, results))

# Crunch !
crunch_args = {
Expand Down
30 changes: 18 additions & 12 deletions eodag/plugins/search/qssearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ class QueryStringSearch(Search):
- *count_endpoint*: (optional) The endpoint for counting the number of items satisfying a request
- *items_per_page*: (optional) The maximum number of items a page can have (default: 10)
- **free_text_search_operations**: (optional) A tree structure of the form::
<search-param>: # e.g: $search
Expand Down Expand Up @@ -98,8 +100,8 @@ class QueryStringSearch(Search):
*"acquisition.endViewingDate:lte:1543922280.0"* if the search was done with the value of
``completionTimeFromAscendingNode`` being ``2018-12-04T12:18:00``. What happened is that
``{completionTimeFromAscendingNode#timestamp}`` was replaced with the timestamp of the value of
``completionTimeFromAscendingNode``. This example shows all there is to know about the semantics of the query string
formatting introduced by this plugin: any eodag search parameter can be referenced in the query string
``completionTimeFromAscendingNode``. This example shows all there is to know about the semantics of the query
string formatting introduced by this plugin: any eodag search parameter can be referenced in the query string
with an additional optional conversion function that is separated from it by a ``#`` (see
:func:`~eodag.utils.format_metadata` for further details on the available converters). Note that for the values
in the ``free_text_search_operations`` configuration parameter follow the same rule.
Expand All @@ -120,21 +122,20 @@ def __init__(self, provider, config):
self.config.pagination.setdefault('items_per_page', self.DEFAULT_ITEMS_PER_PAGE)
if self.config.pagination['items_per_page'] == 0:
self.config['items_per_page'] = self.DEFAULT_ITEMS_PER_PAGE
self.config.pagination.setdefault('count_url_base', '')
self.search_urls = []
self.query_params = dict()
self.query_string = ''

def query(self, product_type, cached=False, *args, **kwargs):
def query(self, product_type, cached=False, start=0, stop=1, max_results=0, *args, **kwargs):
if not cached:
self.rollback()
provider_product_type = self.map_product_type(product_type, *args, **kwargs)
keywords = {k: v for k, v in kwargs.items() if k != 'auth'}
qp, qs = self.build_query_string(product_type, productType=provider_product_type, *args, **keywords)
self.query_params = qp
self.query_string = qs
self.search_urls = self.collect_search_urls(product_type, *args, **kwargs)
provider_results = self.do_search(*args, **kwargs)
self.search_urls = self.collect_search_urls(max_results=max_results, productType=product_type, *args, **kwargs)
provider_results = self.do_search(start=start, stop=stop, *args, **kwargs)
eo_products = self.normalize_results(provider_results, product_type, provider_product_type, *args, **kwargs)
return eo_products

Expand Down Expand Up @@ -206,19 +207,21 @@ def get_queryables(self):
if len(val) == 2
}

def collect_search_urls(self, *args, **kwargs):
def collect_search_urls(self, max_results=0, *args, **kwargs):
urls = []
for collection in self.get_collections(*args, **kwargs):
search_endpoint = self.config.api_endpoint.rstrip('/').format(collection=collection)
count_endpoint = self.config.pagination.get('count_endpoint', '').format(collection=collection)
if count_endpoint:
count_url = '{}?{}'.format(count_endpoint, self.query_string)
max_page, items_per_page = self.count_hits(count_url)
max_page, items_per_page = self.count_hits(count_url, max_results=max_results,
result_type=self.config.result_type)
else: # First do one request querying only one element (lightweight request to schedule the pagination)
next_url_tpl = self.config.pagination['next_page_url_tpl']
count_url = next_url_tpl.format(url=search_endpoint, search=self.query_string,
items_per_page=1, page=1)
max_page, items_per_page = self.count_hits(count_url, result_type=self.config.result_type)
items_per_page=1, page=1, skip=0)
max_page, items_per_page = self.count_hits(count_url, max_results=max_results,
result_type=self.config.result_type)
for page in range(1, max_page + 1):
next_url = self.config.pagination['next_page_url_tpl'].format(
url=search_endpoint,
Expand Down Expand Up @@ -270,10 +273,10 @@ def normalize_results(self, results, *args, **kwargs):
for result in results
]

def count_hits(self, count_url, result_type='json'):
def count_hits(self, count_url, max_results=0, result_type='json'):
# Handle a very annoying special case :'(
url = count_url.replace('$format=json&', '')
response = self._request(
# Handle a very annoying special case :'(
url,
info_message='Sending count request: {}'.format(url),
exception_message='Skipping error while counting results for {} {} instance:'.format(
Expand All @@ -293,6 +296,9 @@ def count_hits(self, count_url, result_type='json'):
total_results = path_parsed.find(count_results)[0].value
else: # interpret the result as a raw int
total_results = int(count_results)
# Limit the number of results if the user requested a specific number of total results
if 0 < max_results < total_results:
total_results = max_results
items_per_page = self.config.pagination['items_per_page']
max_page, rest = divmod(total_results, items_per_page)
if rest != 0:
Expand Down
23 changes: 11 additions & 12 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,39 +115,38 @@ def test_eodag_search_bbox_valid(self, SatImagesAPI):
api_obj = SatImagesAPI.return_value
api_obj.search.assert_called_once_with(
product_type, startTimeFromAscendingNode=None, completionTimeFromAscendingNode=None,
cloudCover=None, geometry={'lonmin': 1, 'latmin': 43, 'lonmax': 2, 'latmax': 44})
cloudCover=None, geometry={'lonmin': 1, 'latmin': 43, 'lonmax': 2, 'latmax': 44}, return_all=True)

@mock.patch('eodag.cli.EODataAccessGateway', autospec=True)
def test_eodag_search_storage_arg(self, SatImagesAPI):
"""Calling eodag search with specified result filename without .geojson extension"""
with self.user_conf() as conf_file:
self.runner.invoke(eodag, ['search', '--conf', conf_file, '-p', 'whatever', '--storage', 'results'])
api_obj = SatImagesAPI.return_value
api_obj.serialize.assert_called_with(api_obj.search.return_value, filename='results.geojson')
api_obj.search.return_value = (mock.MagicMock(),) * 4
self.runner.invoke(eodag, ['search', '--conf', conf_file, '-p', 'whatever', '--storage', 'results'])
api_obj.serialize.assert_called_with(api_obj.search.return_value[0], filename='results.geojson')

@mock.patch('eodag.cli.EODataAccessGateway', autospec=True)
def test_eodag_search_with_cruncher(self, SatImagesAPI):
"""Calling eodag search with --cruncher arg should call crunch method of search result"""
with self.user_conf() as conf_file:
api_obj = SatImagesAPI.return_value
api_obj.search.return_value = (mock.MagicMock(),) * 4

product_type = 'whatever'
cruncher = 'FilterLatestIntersect'
criteria = dict(startTimeFromAscendingNode=None, completionTimeFromAscendingNode=None,
geometry=None, cloudCover=None)
result = self.runner.invoke(eodag, ['search', '-f', conf_file, '-p', product_type, '--cruncher', cruncher])
geometry=None, cloudCover=None, return_all=True)
self.runner.invoke(eodag, ['search', '-f', conf_file, '-p', product_type, '--cruncher', cruncher])

api_obj = SatImagesAPI.return_value
search_results = api_obj.search.return_value
search_results = api_obj.search.return_value[0]
crunch_results = api_obj.crunch.return_value

# Assertions
SatImagesAPI.assert_called_once_with(user_conf_file_path=conf_file)
api_obj.search.assert_called_once_with(product_type, **criteria)
api_obj.crunch.assert_called_once_with(search_results, search_criteria=criteria, **{})
api_obj.crunch.assert_called_once_with(search_results, search_criteria=criteria)
api_obj.serialize.assert_called_with(crunch_results, filename='search_results.geojson')
self.assertEqual(
result.output,
'\n'.join(("Found 0 products with product type '{}': {}".format(product_type, search_results),
"Results stored at '{!r}'\n".format(api_obj.serialize.return_value))))

# Call with a cruncher taking arguments
cruncher = 'FilterOverlap'
Expand Down

0 comments on commit f441878

Please sign in to comment.