Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/azul/lib/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ def json_element_strings(vs: AnyJSON) -> Iterable[str]:
return map(json_str, json_sequence(vs))


def json_element_primitives(vs: AnyJSON) -> Iterable[PrimitiveJSON]:
return map(json_primitive, json_sequence(vs))


def json_sequence_of_mappings(vs: AnyJSON) -> JSONs:
vs = json_sequence(vs)
assert json_elements_are_mappings(vs)
Expand Down Expand Up @@ -132,6 +136,11 @@ def json_items_are_sequences_of_mappings(vs: AnyJSON) -> TypeGuard[Mapping[str,
return True


def json_primitive(v: AnyJSON) -> PrimitiveJSON:
assert v is None or isinstance(v, (str, int, float, bool)), type(v)
return v


def json_dict(v: AnyMutableJSON) -> MutableJSON:
assert isinstance(v, dict), type(v)
return v
Expand Down
104 changes: 55 additions & 49 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from collections.abc import (
Iterable,
Mapping,
Sequence,
)
import csv
from datetime import (
Expand Down Expand Up @@ -449,10 +450,6 @@
file_name: str | None


def tuple_or_none(v):
return v if v is None else tuple(v)


@attrs.frozen(kw_only=True)
class ManifestPartition(SerializableAttrs):
"""
Expand Down Expand Up @@ -1275,6 +1272,53 @@
"""
page_size = 500

def _paginate_hits(self,
request_factory: Callable[[SortKey | None], Search]
) -> Iterable[Hit]:
"""
Yield all hits in every page of OpenSearch hits in responses to
requests that use client-side paging.

:param request_factory: A callable that returns a prepared OpenSearch
request for the given search-after key, with the
appropriate filters and sorting applied. The
returned request should yield one page worth of
hits, starting at the first page (if the argument
is None), or the hit right after the hit with
given search-after key
"""
search_after = None
while True:
request = request_factory(search_after)
response = request.execute()
if response.hits:
hit = None
for hit in response.hits:
yield hit
assert hit is not None
search_after = self._search_after(hit)
else:
break

def _paginate_hits_sorted(self,
request: Search,
sort: Sequence[str]
) -> Iterable[Hit]:
"""
Wrapper around :meth:`_paginate_hits` for simple cases where the request
does not require any additional setup between pages
"""
request = request.extra(size=self.page_size)
request = request.sort(*sort)

def request_factory(search_after: SortKey | None) -> Search:
if search_after is None:
return request
else:
return request.extra(search_after=search_after)

return self._paginate_hits(request_factory)

def _create_paged_request(self, search_after: SortKey | None) -> Search:
pagination = Pagination(sort='entryId',
order='asc',
Expand All @@ -1298,8 +1342,7 @@
return request

def _search_after(self, hit: Hit) -> SortKey:
a, b = hit.meta.sort
return a, b
return sort_key_from_json(list(hit.meta.sort))


class PagedManifestGenerator(ClientSidePagingManifestGenerator):
Expand Down Expand Up @@ -1777,7 +1820,8 @@
Bundles = dict[FQID, Bundle]


class PFBManifestGenerator(FileBasedManifestGenerator):
class PFBManifestGenerator(FileBasedManifestGenerator,
ClientSidePagingManifestGenerator):

@classmethod
def format(cls) -> ManifestFormat:
Expand Down Expand Up @@ -1805,10 +1849,8 @@

def _all_docs_sorted(self) -> Iterable[JSON]:
request = self._create_request(self.entity_type)
request = request.params(preserve_order=True).sort('entity_id.keyword')
for hit in request.scan():
doc = self._hit_to_doc(hit)
yield doc
hits = self._paginate_hits_sorted(request, sort=['entity_id.keyword'])
return map(self._hit_to_doc, hits)

def create_file(self) -> tuple[str, str | None]:
transformers = self.service.transformer_types(self.catalog)
Expand Down Expand Up @@ -1902,34 +1944,6 @@
hub_id: str
replica_ids: list[str]

def _paginate_hits(self,
request_factory: Callable[[SortKey | None], Search]
) -> Iterable[Hit]:
"""
Yield all hits in every page of OpenSearch hits in responses to
requests that use client-side paging.

:param request_factory: A callable that returns a prepared OpenSearch
request for the given search-after key, with the
appropriate filters and sorting applied. The
returned request should yield one page worth of
hits, starting at the first page (if the argument
is None), or the hit right after the hit with
given search-after key
"""
search_after = None
while True:
request = request_factory(search_after)
response = request.execute()
if response.hits:
hit = None
for hit in response.hits:
yield hit
assert hit is not None
search_after = self._search_after(hit)
else:
break

def _list_replica_keys(self) -> Iterable[ReplicaKeys]:
for hit in self._paginate_hits(self._create_paged_request):
document_ids = [
Expand Down Expand Up @@ -1975,7 +1989,6 @@
{'terms': {'hub_ids.keyword': list(hub_ids)}},
{'terms': {'entity_id.keyword': list(replica_ids)}}
]))
request = request.extra(size=self.page_size)

# `_id` is currently the only index field that is unique to each replica
# document (and thus results in an unambiguous total ordering). However,
Expand All @@ -1987,15 +2000,8 @@
# FIXME: ES DeprecationWarning for using _id as sort key
# https://github.com/DataBiosphere/azul/issues/7290
#
request = request.sort('entity_id.keyword', '_id')

def request_factory(search_after: SortKey | None) -> Search:
if search_after is None:
return request
else:
return request.extra(search_after=search_after)

return self._paginate_hits(request_factory)
sort = ['entity_id.keyword', '_id']
return self._paginate_hits_sorted(request, sort)


class JSONLVerbatimManifestGenerator(PagedManifestGenerator,
Expand Down
8 changes: 3 additions & 5 deletions src/azul/service/query_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,7 @@
JSONs,
MutableJSON,
PrimitiveJSON,
json_list,
json_str,
json_element_primitives,
)
from azul.opensearch import (
OpenSearchClientFactory,
Expand Down Expand Up @@ -461,12 +460,11 @@ def process_response(self, response: Response) -> MutableJSON:
return response.to_dict()


type SortKey = tuple[PrimitiveJSON, str]
type SortKey = tuple[PrimitiveJSON, ...]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we want to remove the size constraint.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed at PL to try using a type union to accommodate both the 1-element case and the 2-element while keeping the type as narrow as possible.



def sort_key_from_json(s: AnyJSON) -> SortKey:
a, b = json_list(s)
return a, json_str(b)
return tuple(json_element_primitives(s))


def sort_key_to_json(s: SortKey) -> AnyJSON:
Expand Down
Loading