From 2a170567c7738b18ad2946d67107a546deb6090f Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 25 Mar 2025 10:07:05 -0400 Subject: [PATCH 01/43] wip: BaseTroveView --- .../index_strategy/trove_indexcard_flats.py | 2 +- trove/render/__init__.py | 2 +- trove/render/html_browse.py | 6 +- trove/static/css/browse.css | 4 + trove/trovebrowse_gathering.py | 52 +++++ trove/trovesearch/search_params.py | 205 +++--------------- trove/util/propertypath.py | 63 ++++++ trove/util/queryparams.py | 148 ++++++++++++- trove/views/_base.py | 81 +++++++ trove/views/browse.py | 42 ++-- trove/views/search.py | 74 +------ 11 files changed, 411 insertions(+), 268 deletions(-) create mode 100644 trove/trovebrowse_gathering.py create mode 100644 trove/util/propertypath.py create mode 100644 trove/views/_base.py diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 49874d189..edfc89fe1 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -24,13 +24,13 @@ PageCursor, ReproduciblyRandomSampleCursor, ) +from trove.util.propertypath import GLOB_PATHSTEP from trove.trovesearch.search_params import ( CardsearchParams, ValuesearchParams, SearchFilter, Textsegment, SortParam, - GLOB_PATHSTEP, ) from trove.trovesearch.search_handle import ( CardsearchHandle, diff --git a/trove/render/__init__.py b/trove/render/__init__.py index 351ac791f..2e1350ac4 100644 --- a/trove/render/__init__.py +++ b/trove/render/__init__.py @@ -11,7 +11,7 @@ from .simple_tsv import TrovesearchSimpleTsvRenderer -__all__ = ('get_renderer_type',) +__all__ = ('get_renderer_type', 'BaseRenderer') RENDERERS: tuple[type[BaseRenderer], ...] = ( RdfHtmlBrowseRenderer, diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index c7dceaf0e..a1acf7453 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -133,11 +133,11 @@ def __render_subj(self, subj_iri: str, start_collapsed=False): def __twoples(self, twopledict: primitive_rdf.RdfTwopleDictionary): with self.__nest('ul', {'class': 'Browse__twopleset'}): for _pred, _obj_set in shuffled(twopledict.items()): - with self.__nest('li', {'class': 'Browse__twople'}, visible=True): + with self.__nest('li', {'class': 'Browse__twople'}): self.__leaf_link(_pred) with self.__nest('ul', {'class': 'Browse__objectset'}): for _obj in shuffled(_obj_set): - with self.__nest('li', {'class': 'Browse__object'}, visible=True): + with self.__nest('li', {'class': 'Browse__object'}): self.__obj(_obj) def __obj(self, obj: primitive_rdf.RdfObject): @@ -186,7 +186,7 @@ def __sequence(self, sequence_twoples: frozenset): self.__leaf('summary', text=str(len(_obj_in_order))) with self.__nest('ol'): # TODO: style? for _seq_obj in _obj_in_order: - with self.__nest('li', visible=True): + with self.__nest('li'): # , visible=True): self.__obj(_seq_obj) def __quoted_graph(self, quoted_graph: primitive_rdf.QuotedGraph): diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index 163364611..0182e1be3 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -26,7 +26,9 @@ flex-direction: column; /*max-width: 31rem;*/ + /* border: solid 0.382rem rgba(0,0,0,0.191); + */ } details.Browse__card > summary::before { @@ -74,11 +76,13 @@ details.Browse__card[open] > summary::before { align-items: flex-start; gap: 0.382rem; margin: 0; + /* border: solid 1px rgba(0,0,0,0.382); } .Browse__twople:not(:first-child) { border-top: 0; + */ } .Browse__objectset { diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py new file mode 100644 index 000000000..8c8458a86 --- /dev/null +++ b/trove/trovebrowse_gathering.py @@ -0,0 +1,52 @@ +from primitive_metadata import gather +from primitive_metadata import primitive_rdf as rdf + +from trove import models as trove_db +from trove.util.iris import get_sufficiently_unique_iri +from trove.vocab import namespaces as ns +from trove.vocab import static_vocab +from trove.vocab.trove import ( + TROVE_API_THESAURUS, +) + + +TROVEBROWSE_NORMS = gather.GatheringNorms.new( + namestory=( + rdf.literal('trovebrowse', language='en'), + rdf.literal('browse a trove of IRI-linked metadata', language='en'), + ), + focustype_iris={ns.RDFS.Resource}, + thesaurus=TROVE_API_THESAURUS, +) + + +trovebrowse = gather.GatheringOrganizer( + namestory=( + rdf.literal('trovebrowse organizer', language='en'), + ), + norms=TROVEBROWSE_NORMS, + gatherer_params={}, +) + + +@trovebrowse.gatherer(focustype_iris={ns.RDFS.Resource}) +def gather_thesaurus_entry(focus): + _thesaurus = static_vocab.combined_thesaurus__suffuniq() + for _iri in focus.iris: + _suffuniq_iri = get_sufficiently_unique_iri(_iri) + _thesaurus_entry = _thesaurus.get(_suffuniq_iri, None) + if _thesaurus_entry: + yield from rdf.iter_twoples(_thesaurus_entry) + + +@trovebrowse.gatherer(ns.DCTERMS.isReferencedBy) +def gather_cards_referencing(focus): + ... # TODO via elasticsearch aggregation + + +@trovebrowse.gatherer(ns.FOAF.primaryTopicOf) +def gather_cards_focused_on(focus): + _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) + _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs) + for _indexcard in _indexcard_qs: + yield (ns.FOAF.primaryTopicOf, _indexcard.get_iri()) diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index d64eaa9d5..66edbd31a 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -7,10 +7,8 @@ import logging import types import typing -import urllib from django.http import QueryDict -from primitive_metadata import primitive_rdf from trove import exceptions as trove_exceptions from trove.trovesearch.page_cursor import ( @@ -18,12 +16,21 @@ PageCursor, ) from trove.util.frozen import freeze +from trove.util.propertypath import ( + ONE_GLOB_PROPERTYPATH, + PropertypathSet, + Propertypath, + PropertypathParser, + is_globpath, +) from trove.util.queryparams import ( + BaseTroveParams, QueryparamDict, QueryparamName, split_queryparam_value, join_queryparam_value, - queryparams_from_querystring, + get_single_value, + parse_propertypaths, ) from trove.vocab.osfmap import ( osfmap_shorthand, @@ -32,16 +39,11 @@ OSFMAP_THESAURUS, ) from trove.vocab.trove import trove_shorthand -from trove.vocab.namespaces import RDF, TROVE, OWL, NAMESPACES_SHORTHAND, FOAF, DCTERMS +from trove.vocab.namespaces import RDF, TROVE, OWL, FOAF, DCTERMS logger = logging.getLogger(__name__) -### -# type aliases -Propertypath = tuple[str, ...] -PropertypathSet = frozenset[Propertypath] - ### # constants for use in query param parsing @@ -53,12 +55,6 @@ # optional prefix for "sort" values DESCENDING_SORT_PREFIX = '-' -# between each step in a property path "foo.bar.baz" -PROPERTYPATH_DELIMITER = '.' - -# special path-step that matches any property -GLOB_PATHSTEP = '*' -ONE_GLOB_PROPERTYPATH: Propertypath = (GLOB_PATHSTEP,) DEFAULT_PROPERTYPATH_SET: PropertypathSet = frozenset([ONE_GLOB_PROPERTYPATH]) DEFAULT_INCLUDES_BY_TYPE: collections.abc.Mapping[str, frozenset[Propertypath]] = freeze({ @@ -123,90 +119,20 @@ def to_shortname(self) -> str: @dataclasses.dataclass(frozen=True) -class BaseTroveParams: +class BaseTrovesearchParams(BaseTroveParams): static_focus_type: typing.ClassVar[str] # expected on subclasses - iri_shorthand: primitive_rdf.IriShorthand = dataclasses.field(repr=False) - accept_mediatype: str | None - included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) - attrpaths_by_type: collections.abc.Mapping[str, PropertypathSet] = dataclasses.field(repr=False, compare=False) - - @classmethod - def from_querystring(cls, querystring: str) -> typing.Self: - return cls.from_queryparams(queryparams_from_querystring(querystring)) - - @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> typing.Self: - return cls(**cls.parse_queryparams(queryparams)) - - @classmethod - def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: - # subclasses should override and add their fields to super().parse_queryparams(queryparams) - return { - 'iri_shorthand': cls._gather_shorthand(queryparams), - 'included_relations': cls._gather_include(queryparams), - 'attrpaths_by_type': cls._gather_attrpaths(queryparams), - 'accept_mediatype': _get_single_value(queryparams, QueryparamName('acceptMediatype')), - } - - def to_querystring(self) -> str: - return self.to_querydict().urlencode() - - def to_querydict(self) -> QueryDict: - # subclasses should override and add their fields to super().to_querydict() - _querydict = QueryDict(mutable=True) - if self.accept_mediatype: - _querydict['acceptMediatype'] = self.accept_mediatype - # TODO: iriShorthand, include, fields[...] - return _querydict - @classmethod - def _gather_shorthand(cls, queryparams: QueryparamDict): - _prefixmap = {} - for _qp_name, _iri in queryparams.get('iriShorthand', []): - try: - (_shortname,) = _qp_name.bracketed_names - except ValueError: - raise trove_exceptions.InvalidQueryParamName(_qp_name) - else: - _prefixmap[_shortname] = _iri - return NAMESPACES_SHORTHAND.with_update(_prefixmap) + def _default_shorthand(cls): + return osfmap_shorthand() @classmethod - def _gather_include(cls, queryparams: QueryparamDict) -> PropertypathSet: - _include_params = queryparams.get('include', []) - if _include_params: - return frozenset(itertools.chain.from_iterable( - _parse_propertypath_set(_include_value) - for _, _include_value in _include_params - )) + def _default_include(cls): return DEFAULT_INCLUDES_BY_TYPE.get(cls.static_focus_type, frozenset()) @classmethod - def _gather_attrpaths(cls, queryparams: QueryparamDict) -> collections.abc.Mapping[ - str, - tuple[Propertypath, ...], - ]: - _attrpaths: collections.ChainMap[str, tuple[Propertypath, ...]] = collections.ChainMap( - DEFAULT_FIELDS_BY_TYPE, # type: ignore[arg-type] - ) - _fields_params = queryparams.get('fields', []) - if _fields_params: - _requested: dict[str, list[Propertypath]] = collections.defaultdict(list) - for _param_name, _param_value in _fields_params: - try: - (_typenames,) = filter(bool, _param_name.bracketed_names) - except (IndexError, ValueError): - raise trove_exceptions.InvalidQueryParamName( - f'expected "fields[TYPE]" (with exactly one non-empty bracketed segment)' - f' (got "{_param_name}")' - ) - else: - for _type in split_queryparam_value(_typenames): - _type_iri = osfmap_shorthand().expand_iri(_type) - _requested[_type_iri].extend(_parse_propertypaths(_param_value)) - _attrpaths = _attrpaths.new_child(freeze(_requested)) - return _attrpaths + def _default_attrpaths(cls) -> collections.abc.Mapping[str, tuple[Propertypath, ...]]: + return DEFAULT_FIELDS_BY_TYPE @dataclasses.dataclass(frozen=True) @@ -236,7 +162,7 @@ def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_fam @classmethod def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str): _propertypath_set = ( - _parse_propertypath_set(param_name.bracketed_names[0]) + frozenset(parse_propertypaths(param_name.bracketed_names[0], osfmap_shorthand())) if param_name.bracketed_names else None ) @@ -341,7 +267,7 @@ def queryparams_from_textsegments(self, queryparam_family: str, textsegments): for _propertypath_set, _combinable_segments in _by_propertypath_set.items(): _qp_name = QueryparamName( queryparam_family, - (propertypath_set_key(_propertypath_set),), + (osfmap_propertypath_set_key(_propertypath_set),), ) _qp_value = ' '.join( _textsegment.as_searchtext() @@ -421,7 +347,7 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str): str(param_name), f'unknown filter operator "{_operator_value}"', ) - _propertypath_set = _parse_propertypath_set(_serialized_path_set) + _propertypath_set = frozenset(parse_propertypaths(_serialized_path_set, osfmap_shorthand())) _is_date_filter = all( is_date_property(_path[-1]) for _path in _propertypath_set @@ -468,7 +394,7 @@ def is_type_filter(self) -> bool: def as_queryparam(self, queryparam_family: str): _qp_name = QueryparamName(queryparam_family, ( - propertypath_set_key(self.propertypath_set), + osfmap_propertypath_set_key(self.propertypath_set), self.operator.to_shortname(), )) _qp_value = join_queryparam_value( @@ -516,7 +442,7 @@ def _from_sort_queryparam( )) _descending = param_value.startswith(DESCENDING_SORT_PREFIX) _rawpath = param_value.lstrip(DESCENDING_SORT_PREFIX) - _path = _parse_propertypath(_rawpath, allow_globs=False) + _path = PropertypathParser(osfmap_shorthand(), allow_globs=False).parse_propertypath(_rawpath) return cls( value_type=_value_type, propertypath=_path, @@ -539,7 +465,7 @@ def as_queryparam(self) -> tuple[str, str]: if (self.value_type == ValueType.DATE) else f'sort[{self.value_type.to_shortname()}]' ) - _pathkey = propertypath_key(self.propertypath) + _pathkey = osfmap_propertypath_key(self.propertypath) _value = (f'-{_pathkey}' if self.descending else _pathkey) return (_name, _value) @@ -566,7 +492,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: **super().parse_queryparams(queryparams), 'cardsearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'cardSearchText'), 'cardsearch_filter_set': _filter_set, - 'index_strategy_name': _get_single_value(queryparams, QueryparamName('indexStrategy')), + 'index_strategy_name': get_single_value(queryparams, QueryparamName('indexStrategy')), 'sort_list': SortParam.from_sort_queryparams(queryparams), 'page_cursor': _get_page_cursor(queryparams), } @@ -631,12 +557,12 @@ class ValuesearchParams(CardsearchParams): # override CardsearchParams @classmethod def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: - _raw_propertypath = _get_single_value(queryparams, QueryparamName('valueSearchPropertyPath')) + _raw_propertypath = get_single_value(queryparams, QueryparamName('valueSearchPropertyPath')) if not _raw_propertypath: raise trove_exceptions.MissingRequiredQueryParam('valueSearchPropertyPath') return { **super().parse_queryparams(queryparams), - 'valuesearch_propertypath': _parse_propertypath(_raw_propertypath, allow_globs=False), + 'valuesearch_propertypath': PropertypathParser(osfmap_shorthand(), allow_globs=False).parse_propertypath(_raw_propertypath), 'valuesearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'valueSearchText'), 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'), } @@ -655,7 +581,7 @@ def __post_init__(self): def to_querydict(self): _querydict = super().to_querydict() - _querydict['valueSearchPropertyPath'] = propertypath_key(self.valuesearch_propertypath) + _querydict['valueSearchPropertyPath'] = osfmap_propertypath_key(self.valuesearch_propertypath) for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: @@ -677,35 +603,18 @@ def valuesearch_type_iris(self): ### # helper functions -def is_globpath(path: Propertypath) -> bool: - return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) - - -def make_globpath(length: int) -> Propertypath: - return ONE_GLOB_PROPERTYPATH * length - - def is_date_path(path: Propertypath) -> bool: return bool(path) and is_date_property(path[-1]) -def propertypathstep_key(pathstep: str) -> str: - if pathstep == GLOB_PATHSTEP: - return pathstep - # assume iri - return urllib.parse.quote(osfmap_shorthand().compact_iri(pathstep)) - +def osfmap_propertypath_key(propertypath: Propertypath) -> str: + return PropertypathParser(osfmap_shorthand()).propertypath_key(propertypath) -def propertypath_key(property_path: Propertypath) -> str: - return PROPERTYPATH_DELIMITER.join( - propertypathstep_key(_pathstep) - for _pathstep in property_path - ) - -def propertypath_set_key(propertypath_set: PropertypathSet) -> str: +def osfmap_propertypath_set_key(propertypath_set: PropertypathSet) -> str: + _parser = PropertypathParser(osfmap_shorthand()) return join_queryparam_value( - propertypath_key(_propertypath) + _parser.propertypath_key(_propertypath) for _propertypath in propertypath_set ) @@ -720,52 +629,6 @@ def _get_text_queryparam(queryparams: QueryparamDict, queryparam_family: str) -> ) -def _get_single_value( - queryparams: QueryparamDict, - queryparam_name: QueryparamName, -): - _family_params = queryparams.get(queryparam_name.family, ()) - _paramvalues = [ - _paramvalue - for _paramname, _paramvalue in _family_params - if _paramname.bracketed_names == queryparam_name.bracketed_names - ] - if not _paramvalues: - return None - try: - (_singlevalue,) = _paramvalues - except ValueError: - raise trove_exceptions.InvalidRepeatedQueryParam(str(queryparam_name)) - else: - return _singlevalue - - -def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> PropertypathSet: - # comma-delimited set of dot-delimited paths - return frozenset(_parse_propertypaths(serialized_path_set, allow_globs=allow_globs)) - - -def _parse_propertypaths(serialized_path_set: str, *, allow_globs=True) -> typing.Iterator[Propertypath]: - for _path in split_queryparam_value(serialized_path_set): - yield _parse_propertypath(_path, allow_globs=allow_globs) - - -def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> Propertypath: - _path = tuple( - osfmap_shorthand().expand_iri(_pathstep) - for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER) - ) - if GLOB_PATHSTEP in _path: - if not allow_globs: - raise trove_exceptions.InvalidPropertyPath(serialized_path, 'no * allowed') - if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path): - raise trove_exceptions.InvalidPropertyPath( - serialized_path, - f'path must be all * or no * (got {serialized_path})', - ) - return _path - - def _get_related_property_paths(filter_set) -> tuple[Propertypath, ...]: # hard-coded for osf.io search pages, static list per type # TODO: replace with some dynamism, maybe a 'significant_terms' aggregation @@ -785,10 +648,10 @@ def _get_unnamed_iri_values(filter_set) -> typing.Iterable[str]: def _get_page_cursor(queryparams: QueryparamDict) -> PageCursor: - _cursor_value = _get_single_value(queryparams, QueryparamName('page', ('cursor',))) + _cursor_value = get_single_value(queryparams, QueryparamName('page', ('cursor',))) if _cursor_value: return PageCursor.from_queryparam_value(_cursor_value) - _size_value = _get_single_value(queryparams, QueryparamName('page', ('size',))) + _size_value = get_single_value(queryparams, QueryparamName('page', ('size',))) if _size_value is None: return PageCursor() try: diff --git a/trove/util/propertypath.py b/trove/util/propertypath.py new file mode 100644 index 000000000..d9c702251 --- /dev/null +++ b/trove/util/propertypath.py @@ -0,0 +1,63 @@ +import dataclasses +import urllib + +from primitive_metadata import primitive_rdf as rdf + +from trove import exceptions as trove_exceptions + + +### +# type aliases +Propertypath = tuple[str, ...] +PropertypathSet = frozenset[Propertypath] + +### +# constants + +# between each step in a property path "foo.bar.baz" +PROPERTYPATH_DELIMITER = '.' + +# special path-step that matches any property +GLOB_PATHSTEP = '*' +ONE_GLOB_PROPERTYPATH: Propertypath = (GLOB_PATHSTEP,) + + +def is_globpath(path: Propertypath) -> bool: + return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) + + +def make_globpath(length: int) -> Propertypath: + return ONE_GLOB_PROPERTYPATH * length + + +@dataclasses.dataclass +class PropertypathParser: + shorthand: rdf.IriShorthand + allow_globs: bool = True + + def parse_propertypath(self, serialized_path: str) -> Propertypath: + _path = tuple( + self.shorthand.expand_iri(_pathstep) + for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER) + ) + if GLOB_PATHSTEP in _path: + if not self.allow_globs: + raise trove_exceptions.InvalidPropertyPath(serialized_path, 'no * allowed') + if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path): + raise trove_exceptions.InvalidPropertyPath( + serialized_path, + f'path must be all * or no * (got {serialized_path})', + ) + return _path + + def propertypathstep_key(self, pathstep: str) -> str: + if pathstep == GLOB_PATHSTEP: + return pathstep + # assume iri + return urllib.parse.quote(self.shorthand.compact_iri(pathstep)) + + def propertypath_key(self, property_path: Propertypath) -> str: + return PROPERTYPATH_DELIMITER.join( + self.propertypathstep_key(_pathstep) + for _pathstep in property_path + ) diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py index 3cff5b681..fd9dc8021 100644 --- a/trove/util/queryparams.py +++ b/trove/util/queryparams.py @@ -1,11 +1,22 @@ +from __future__ import annotations +import collections import dataclasses +import itertools import re -from typing import Iterable +import typing # TODO: remove django dependency (tho it is convenient) from django.http import QueryDict +from primitive_metadata import primitive_rdf as rdf from trove import exceptions as trove_exceptions +from trove.util.frozen import freeze +from trove.util.propertypath import ( + PropertypathSet, + Propertypath, + PropertypathParser, +) +from trove.vocab.namespaces import NAMESPACES_SHORTHAND ### @@ -87,5 +98,138 @@ def split_queryparam_value(value: str): return value.split(QUERYPARAM_VALUES_DELIM) -def join_queryparam_value(values: Iterable[str]): +def join_queryparam_value(values: typing.Iterable[str]): return QUERYPARAM_VALUES_DELIM.join(values) + + +def get_single_value( + queryparams: QueryparamDict, + queryparam_name: QueryparamName, +): + _family_params = queryparams.get(queryparam_name.family, ()) + _paramvalues = [ + _paramvalue + for _paramname, _paramvalue in _family_params + if _paramname.bracketed_names == queryparam_name.bracketed_names + ] + if not _paramvalues: + return None + try: + (_singlevalue,) = _paramvalues + except ValueError: + raise trove_exceptions.InvalidRepeatedQueryParam(str(queryparam_name)) + else: + return _singlevalue + + +@dataclasses.dataclass(frozen=True) +class BaseTroveParams: + iri_shorthand: rdf.IriShorthand = dataclasses.field(repr=False) + accept_mediatype: str | None + included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) + attrpaths_by_type: collections.abc.Mapping[str, PropertypathSet] = dataclasses.field(repr=False, compare=False) + + ### + # class methods + + @classmethod + def from_querystring(cls, querystring: str) -> typing.Self: + return cls.from_queryparams(queryparams_from_querystring(querystring)) + + @classmethod + def from_queryparams(cls, queryparams: QueryparamDict) -> typing.Self: + return cls(**cls.parse_queryparams(queryparams)) + + @classmethod + def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: + # subclasses should override and add their fields to super().parse_queryparams(queryparams) + _shorthand = cls._gather_shorthand(queryparams) + return { + 'iri_shorthand': _shorthand, + 'included_relations': cls._gather_included_relations(queryparams, _shorthand), + 'attrpaths_by_type': cls._gather_attrpaths(queryparams, _shorthand), + 'accept_mediatype': get_single_value(queryparams, QueryparamName('acceptMediatype')), + } + + @classmethod + def _default_shorthand(cls) -> rdf.IriShorthand: + return NAMESPACES_SHORTHAND + + @classmethod + def _default_include(cls) -> PropertypathSet: + return frozenset() + + @classmethod + def _default_attrpaths(cls) -> dict[str, tuple[Propertypath, ...]]: + return {} + + @classmethod + def _gather_shorthand(cls, queryparams: QueryparamDict): + _prefixmap = {} + for _qp_name, _iri in queryparams.get('iriShorthand', []): + try: + (_shortname,) = _qp_name.bracketed_names + except ValueError: + raise trove_exceptions.InvalidQueryParamName(_qp_name) + else: + _prefixmap[_shortname] = _iri + _shorthand = cls._default_shorthand() + if _prefixmap: + _shorthand = _shorthand.with_update(_prefixmap) + return _shorthand + + @classmethod + def _gather_included_relations(cls, queryparams: QueryparamDict, shorthand: rdf.IriShorthand) -> PropertypathSet: + _include_params = queryparams.get('include', []) + if _include_params: + return frozenset(itertools.chain.from_iterable( + parse_propertypaths(_include_value, shorthand) + for _, _include_value in _include_params + )) + return cls._default_include() + + @classmethod + def _gather_attrpaths(cls, queryparams: QueryparamDict, shorthand: rdf.IriShorthand) -> collections.abc.Mapping[ + str, + tuple[Propertypath, ...], + ]: + _attrpaths: collections.ChainMap[str, tuple[Propertypath, ...]] = collections.ChainMap( + cls._default_attrpaths(), + ) + _fields_params = queryparams.get('fields', []) + if _fields_params: + _requested: dict[str, list[Propertypath]] = collections.defaultdict(list) + for _param_name, _param_value in _fields_params: + try: + (_typenames,) = filter(bool, _param_name.bracketed_names) + except (IndexError, ValueError): + raise trove_exceptions.InvalidQueryParamName( + f'expected "fields[TYPE]" (with exactly one non-empty bracketed segment)' + f' (got "{_param_name}")' + ) + else: + for _type in split_queryparam_value(_typenames): + _type_iri = shorthand.expand_iri(_type) + _requested[_type_iri].extend(parse_propertypaths(_param_value, shorthand)) + _attrpaths = _attrpaths.new_child(freeze(_requested)) + return _attrpaths + + ### + # instance methods + + def to_querystring(self) -> str: + return self.to_querydict().urlencode() + + def to_querydict(self) -> QueryDict: + # subclasses should override and add their fields to super().to_querydict() + _querydict = QueryDict(mutable=True) + if self.accept_mediatype: + _querydict['acceptMediatype'] = self.accept_mediatype + # TODO: iriShorthand, include, fields[...] + return _querydict + + +def parse_propertypaths(serialized_path_set: str, shorthand: rdf.IriShorthand) -> typing.Iterator[Propertypath]: + _parser = PropertypathParser(shorthand) + for _path in split_queryparam_value(serialized_path_set): + yield _parser.parse_propertypath(_path) diff --git a/trove/views/_base.py b/trove/views/_base.py new file mode 100644 index 000000000..b6a49555f --- /dev/null +++ b/trove/views/_base.py @@ -0,0 +1,81 @@ +__all__ = ('BaseTroveView',) + +import abc +from typing import ClassVar + +from django import http as djhttp +from django.views import View +from primitive_metadata import gather + +from trove import exceptions as trove_exceptions +from trove.vocab.namespaces import RDFS, TROVE +from trove.util.queryparams import BaseTroveParams +from trove.render import ( + BaseRenderer, + DEFAULT_RENDERER_TYPE, + get_renderer_type, +) +from ._gather_ask import ask_gathering_from_params +from ._responder import ( + make_http_error_response, + make_http_response, +) + + +class BaseTroveView(View, abc.ABC): + # ClassVars expected on inheritors: + organizer: ClassVar[gather.GatheringOrganizer] + params_type: ClassVar[type[BaseTroveParams]] = BaseTroveParams + focus_type: ClassVar[type[gather.Focus]] = gather.Focus + + def get(self, request): + try: + _renderer_type = get_renderer_type(request) + except trove_exceptions.CannotRenderMediatype as _error: + return make_http_error_response( + error=_error, + renderer_type=DEFAULT_RENDERER_TYPE, + ) + try: + _url = request.build_absolute_uri() + _params = self._parse_params(request) + _renderer = self._gather_to_renderer(_url, _params, renderer_type=_renderer_type) + return make_http_response( + content_rendering=_renderer.render_document(), + http_request=request, + ) + except trove_exceptions.TroveError as _error: + return make_http_error_response( + error=_error, + renderer_type=_renderer_type, + ) + + def _parse_params(self, request: djhttp.HttpRequest): + return self.params_type.from_querystring(request.META['QUERY_STRING']) + + def _gather_to_renderer(self, url, params, renderer_type: type[BaseRenderer]) -> BaseRenderer: + _focus = self._build_focus(url, params) + _gathering = self.organizer.new_gathering( + **self._get_gatherer_kwargs(params, renderer_type), + ) + if renderer_type.PASSIVE_RENDER: + ask_gathering_from_params(_gathering, params, _focus) + return renderer_type(_focus, _gathering) + + def _build_focus(self, url, params): + return self.focus_type.new(url, RDFS.Resource) + + def _get_gatherer_kwargs(self, params, renderer_type): + _kwargs = {} + _deriver_kw = _get_param_keyword(TROVE.deriverIRI, self.organizer) + if _deriver_kw: + _kwargs[_deriver_kw] = renderer_type.INDEXCARD_DERIVER_IRI + return _kwargs + + +def _get_param_keyword(param_iri: str, organizer: gather.GatheringOrganizer) -> str | None: + if param_iri in organizer.norms.param_iris: + for (_k, _v) in organizer.gatherer_params.items(): + if _v == param_iri: + return _k + return None diff --git a/trove/views/browse.py b/trove/views/browse.py index ab1e488fe..e66189501 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -1,3 +1,5 @@ +import dataclasses + from django import http from django.shortcuts import redirect from django.views import View @@ -7,11 +9,21 @@ from trove.render import get_renderer_type from trove.util.iris import unquote_iri, get_sufficiently_unique_iri from trove.vocab import namespaces as ns -from trove.vocab import static_vocab +from trove.trovebrowse_gathering import trovebrowse +from trove.trovesearch.search_params import BaseTroveParams +from ._base import BaseTroveView from ._responder import make_http_response -class BrowseIriView(View): +@dataclasses.dataclass(frozen=True) +class BrowseParams(BaseTroveParams): + iri: str + + +class BrowseIriView(BaseTroveView): + organizer = trovebrowse + params_type = BrowseParams + def get(self, request, **kwargs): _iri_param = kwargs.get('iri') or request.GET.get('iri') if not _iri_param: @@ -37,32 +49,6 @@ def get(self, request, **kwargs): ) -def _get_latest_cardf(iri: str): - _combined_rdf = primitive_rdf.RdfGraph({}) - try: - _identifier = trove_db.ResourceIdentifier.objects.get_for_iri(iri) - except trove_db.ResourceIdentifier.DoesNotExist: - return iri, _combined_rdf - else: - _rdf_qs = ( - trove_db.LatestIndexcardRdf.objects - .filter(indexcard__focus_identifier_set=_identifier) - .select_related('indexcard') - ) - _focus_iri = None - for _indexcard_rdf in _rdf_qs: - if _focus_iri is None: - _focus_iri = _indexcard_rdf.focus_iri - _combined_rdf.add((_focus_iri, ns.FOAF.primaryTopicOf, _indexcard_rdf.indexcard.get_iri())) - for (_subj, _pred, _obj) in primitive_rdf.iter_tripleset(_indexcard_rdf.as_rdf_tripledict()): - _combined_rdf.add( - (_focus_iri, _pred, _obj) - if _subj == _indexcard_rdf.focus_iri - else (_subj, _pred, _obj) - ) - return (_focus_iri or iri), _combined_rdf - - def _recognize_trove_term(suffuniq_iri: str): _suffuniq_trove = get_sufficiently_unique_iri(str(ns.TROVE)) if suffuniq_iri.startswith(_suffuniq_trove): diff --git a/trove/views/search.py b/trove/views/search.py index d164b36e4..288738782 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -2,12 +2,7 @@ import logging from typing import Callable -from django import http -from django.views import View -from primitive_metadata import gather - from share.search import index_strategy -from trove import exceptions as trove_exceptions from trove.trovesearch.search_handle import BasicSearchHandle from trove.trovesearch.search_params import ( BaseTroveParams, @@ -19,15 +14,7 @@ CardsearchFocus, ValuesearchFocus, ) -from trove.render import ( - DEFAULT_RENDERER_TYPE, - get_renderer_type, -) -from ._gather_ask import ask_gathering_from_params -from ._responder import ( - make_http_error_response, - make_http_response, -) +from ._base import BaseTroveView logger = logging.getLogger(__name__) @@ -36,53 +23,16 @@ _TrovesearchHandler = Callable[[BaseTroveParams], BasicSearchHandle] -class _BaseTrovesearchView(View, abc.ABC): - # expected on inheritors - focus_type: type[gather.Focus] - params_dataclass: type[CardsearchParams] - - def get(self, request): - try: - _renderer_type = get_renderer_type(request) - except trove_exceptions.CannotRenderMediatype as _error: - return make_http_error_response( - error=_error, - renderer_type=DEFAULT_RENDERER_TYPE, - ) - try: - _url = request.build_absolute_uri() - _search_gathering = self._start_gathering(renderer_type=_renderer_type) - _search_params = self._parse_search_params(request) - _strategy = index_strategy.get_strategy_for_trovesearch(_search_params) - _focus = self.focus_type.new( - iris=_url, - search_params=_search_params, - search_handle=self.get_search_handle(_strategy, _search_params), - ) - if _renderer_type.PASSIVE_RENDER: - ask_gathering_from_params(_search_gathering, _search_params, _focus) - # take gathered data into a response - _renderer = _renderer_type(_focus, _search_gathering) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) - except trove_exceptions.TroveError as _error: - return make_http_error_response( - error=_error, - renderer_type=_renderer_type, - ) - - def _parse_search_params(self, request: http.HttpRequest) -> CardsearchParams: - return self.params_dataclass.from_querystring( - request.META['QUERY_STRING'], - ) +class _BaseTrovesearchView(BaseTroveView, abc.ABC): + organizer = trovesearch_by_indexstrategy - def _start_gathering(self, renderer_type) -> gather.Gathering: - # TODO: 404 for unknown strategy - return trovesearch_by_indexstrategy.new_gathering({ - 'deriver_iri': renderer_type.INDEXCARD_DERIVER_IRI, - }) + def _build_focus(self, url, params): # override BaseTroveView + _strategy = index_strategy.get_strategy_for_trovesearch(params) + return self.focus_type.new( + iris=url, + search_params=params, + search_handle=self.get_search_handle(_strategy, params), + ) def get_search_handle(self, strategy, search_params) -> BasicSearchHandle: return self._get_wrapped_handler(strategy)(search_params) @@ -105,7 +55,7 @@ def _wrapped_handler(search_params): class CardsearchView(_BaseTrovesearchView): focus_type = CardsearchFocus - params_dataclass = CardsearchParams + params_type = CardsearchParams def get_search_handler(self, strategy): return strategy.pls_handle_cardsearch @@ -113,7 +63,7 @@ def get_search_handler(self, strategy): class ValuesearchView(_BaseTrovesearchView): focus_type = ValuesearchFocus - params_dataclass = ValuesearchParams + params_type = ValuesearchParams def get_search_handler(self, strategy): return strategy.pls_handle_valuesearch From 82aaf4f793aa18f2a9dd96ae6066543c0f9f78b9 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 26 Mar 2025 08:03:25 -0400 Subject: [PATCH 02/43] wip --- tests/trove/test_doctest.py | 29 +++++ trove/trovesearch/search_params.py | 36 ++---- trove/trovesearch/trovesearch_gathering.py | 6 +- trove/util/base_trove_params.py | 127 +++++++++++++++++++++ trove/util/chainmap.py | 56 +++++++++ trove/util/iris.py | 54 +++++++++ trove/util/propertypath.py | 66 ++++++----- trove/util/queryparams.py | 123 -------------------- trove/views/_base.py | 24 ++-- trove/vocab/osfmap.py | 65 +++++++++-- 10 files changed, 382 insertions(+), 204 deletions(-) create mode 100644 tests/trove/test_doctest.py create mode 100644 trove/util/base_trove_params.py create mode 100644 trove/util/chainmap.py diff --git a/tests/trove/test_doctest.py b/tests/trove/test_doctest.py new file mode 100644 index 000000000..be74792f6 --- /dev/null +++ b/tests/trove/test_doctest.py @@ -0,0 +1,29 @@ +import doctest + +import trove.util.chainmap +import trove.util.iris + + +_MODULES_WITH_DOCTESTS = ( + trove.util.iris, + trove.util.chainmap, +) + + +def _make_test_fn(testcase): + def _test(): + _result = testcase.run() + for _error_testcase, _traceback in _result.errors: + print(f'ERROR({_error_testcase}):\n{_traceback}') + for _error_testcase, _traceback in _result.failures: + print(f'FAILURE({_error_testcase}):\n{_traceback}') + assert not _result.failures and not _result.errors + return _test + + +for _module in _MODULES_WITH_DOCTESTS: + # HACK: allow running with pytest + globals().update({ + f'test_doctest_{_module.__name__}_{_i}': _make_test_fn(_test_case) + for _i, _test_case in enumerate(doctest.DocTestSuite(_module)) + }) diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 66edbd31a..f94f62740 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -20,7 +20,6 @@ ONE_GLOB_PROPERTYPATH, PropertypathSet, Propertypath, - PropertypathParser, is_globpath, ) from trove.util.queryparams import ( @@ -32,12 +31,7 @@ get_single_value, parse_propertypaths, ) -from trove.vocab.osfmap import ( - osfmap_shorthand, - is_date_property, - suggested_property_paths, - OSFMAP_THESAURUS, -) +from trove.vocab import osfmap from trove.vocab.trove import trove_shorthand from trove.vocab.namespaces import RDF, TROVE, OWL, FOAF, DCTERMS @@ -124,7 +118,7 @@ class BaseTrovesearchParams(BaseTroveParams): @classmethod def _default_shorthand(cls): - return osfmap_shorthand() + return osfmap.osfmap_shorthand() @classmethod def _default_include(cls): @@ -162,7 +156,7 @@ def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_fam @classmethod def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str): _propertypath_set = ( - frozenset(parse_propertypaths(param_name.bracketed_names[0], osfmap_shorthand())) + frozenset(parse_propertypaths(param_name.bracketed_names[0], osfmap.osfmap_shorthand())) if param_name.bracketed_names else None ) @@ -442,7 +436,7 @@ def _from_sort_queryparam( )) _descending = param_value.startswith(DESCENDING_SORT_PREFIX) _rawpath = param_value.lstrip(DESCENDING_SORT_PREFIX) - _path = PropertypathParser(osfmap_shorthand(), allow_globs=False).parse_propertypath(_rawpath) + _path = osfmap.parse_osfmap_propertypath(_rawpath) return cls( value_type=_value_type, propertypath=_path, @@ -562,13 +556,13 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: raise trove_exceptions.MissingRequiredQueryParam('valueSearchPropertyPath') return { **super().parse_queryparams(queryparams), - 'valuesearch_propertypath': PropertypathParser(osfmap_shorthand(), allow_globs=False).parse_propertypath(_raw_propertypath), + 'valuesearch_propertypath': osfmap.parse_osfmap_propertypath(_raw_propertypath), 'valuesearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'valueSearchText'), 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'), } def __post_init__(self): - if is_date_property(self.valuesearch_propertypath[-1]): + if osfmap.is_date_property(self.valuesearch_propertypath[-1]): # date-value limitations if self.valuesearch_textsegment_set: raise trove_exceptions.InvalidQueryParams( @@ -604,19 +598,7 @@ def valuesearch_type_iris(self): # helper functions def is_date_path(path: Propertypath) -> bool: - return bool(path) and is_date_property(path[-1]) - - -def osfmap_propertypath_key(propertypath: Propertypath) -> str: - return PropertypathParser(osfmap_shorthand()).propertypath_key(propertypath) - - -def osfmap_propertypath_set_key(propertypath_set: PropertypathSet) -> str: - _parser = PropertypathParser(osfmap_shorthand()) - return join_queryparam_value( - _parser.propertypath_key(_propertypath) - for _propertypath in propertypath_set - ) + return bool(path) and osfmap.is_date_property(path[-1]) def _get_text_queryparam(queryparams: QueryparamDict, queryparam_family: str) -> str: @@ -636,14 +618,14 @@ def _get_related_property_paths(filter_set) -> tuple[Propertypath, ...]: for _filter in filter_set: if _filter.is_type_filter(): _type_iris.update(_filter.value_set) - return suggested_property_paths(_type_iris) + return osfmap.suggested_property_paths(_type_iris) def _get_unnamed_iri_values(filter_set) -> typing.Iterable[str]: for _filter in filter_set: if _filter.operator.is_iri_operator(): for _iri in _filter.value_set: - if _iri not in OSFMAP_THESAURUS: + if _iri not in osfmap.OSFMAP_THESAURUS: yield _iri diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 7e027623c..7b3e130a3 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -19,8 +19,6 @@ from trove.trovesearch.search_params import ( CardsearchParams, ValuesearchParams, - propertypath_key, - propertypath_set_key, ) from trove.trovesearch.search_handle import ( CardsearchHandle, @@ -28,6 +26,10 @@ ValuesearchResult, ) from trove.util.iris import get_sufficiently_unique_iri +from trove.util.propertypath import ( + propertypath_key, + propertypath_set_key, +) from trove.vocab.namespaces import RDF, FOAF, DCTERMS, RDFS, DCAT, TROVE from trove.vocab.jsonapi import ( JSONAPI_LINK_OBJECT, diff --git a/trove/util/base_trove_params.py b/trove/util/base_trove_params.py new file mode 100644 index 000000000..855abb39b --- /dev/null +++ b/trove/util/base_trove_params.py @@ -0,0 +1,127 @@ +from __future__ import annotations +from collections import defaultdict +import dataclasses +import itertools +import typing +if typing.TYPE_CHECKING: + from collections.abc import Mapping + +# TODO: remove django dependency (tho it is convenient) +from django.http import QueryDict +from primitive_metadata import primitive_rdf as rdf + +from trove import exceptions as trove_exceptions +from trove.util.chainmap import SimpleChainMap +from trove.util.frozen import freeze +from trove.util.propertypath import ( + PropertypathSet, + Propertypath, +) +from trove.util import queryparams as _qp +from trove.vocab.namespaces import NAMESPACES_SHORTHAND +from trove.vocab import osfmap + + +@dataclasses.dataclass(frozen=True) +class BaseTroveParams: + iri_shorthand: rdf.IriShorthand = dataclasses.field(repr=False) + accept_mediatype: str | None + included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) + attrpaths_by_type: Mapping[str, PropertypathSet] = dataclasses.field(repr=False, compare=False) + + ### + # class methods + + @classmethod + def from_querystring(cls, querystring: str) -> typing.Self: + return cls.from_queryparams(_qp.queryparams_from_querystring(querystring)) + + @classmethod + def from_queryparams(cls, queryparams: _qp.QueryparamDict) -> typing.Self: + return cls(**cls.parse_queryparams(queryparams)) + + @classmethod + def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: + # subclasses should override and add their fields to super().parse_queryparams(queryparams) + _shorthand = cls._gather_shorthand(queryparams) + return { + 'iri_shorthand': _shorthand, + 'included_relations': cls._gather_included_relations(queryparams, _shorthand), + 'attrpaths_by_type': cls._gather_attrpaths(queryparams, _shorthand), + 'accept_mediatype': _qp.get_single_value(queryparams, _qp.QueryparamName('acceptMediatype')), + } + + @classmethod + def _default_shorthand(cls) -> rdf.IriShorthand: + return NAMESPACES_SHORTHAND + + @classmethod + def _default_include(cls) -> PropertypathSet: + return frozenset() + + @classmethod + def _default_attrpaths(cls) -> Mapping[str, tuple[Propertypath, ...]]: + return {} + + @classmethod + def _gather_shorthand(cls, queryparams: _qp.QueryparamDict): + _prefixmap = {} + for _qp_name, _iri in queryparams.get('iriShorthand', []): + try: + (_shortname,) = _qp_name.bracketed_names + except ValueError: + raise trove_exceptions.InvalidQueryParamName(_qp_name) + else: + _prefixmap[_shortname] = _iri + _shorthand = cls._default_shorthand() + if _prefixmap: + _shorthand = _shorthand.with_update(_prefixmap) + return _shorthand + + @classmethod + def _gather_included_relations(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand) -> PropertypathSet: + _include_params = queryparams.get('include', []) + if _include_params: + return frozenset(itertools.chain.from_iterable( + parse_propertypaths(_include_value, shorthand) + for _, _include_value in _include_params + )) + return cls._default_include() + + @classmethod + def _gather_attrpaths(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand) -> Mapping[ + str, + tuple[Propertypath, ...], + ]: + _attrpaths = SimpleChainMap([cls._default_attrpaths()]) + _fields_params = queryparams.get('fields', []) + if _fields_params: + _requested: dict[str, list[Propertypath]] = defaultdict(list) + for _param_name, _param_value in _fields_params: + try: + (_typenames,) = filter(bool, _param_name.bracketed_names) + except (IndexError, ValueError): + raise trove_exceptions.InvalidQueryParamName( + f'expected "fields[TYPE]" (with exactly one non-empty bracketed segment)' + f' (got "{_param_name}")' + ) + else: + for _type in _qp.split_queryparam_value(_typenames): + _type_iri = shorthand.expand_iri(_type) + _requested[_type_iri].extend(parse_propertypaths(_param_value, shorthand)) + _attrpaths = _attrpaths.with_new(freeze(_requested)) + return _attrpaths + + ### + # instance methods + + def to_querystring(self) -> str: + return self.to_querydict().urlencode() + + def to_querydict(self) -> QueryDict: + # subclasses should override and add their fields to super().to_querydict() + _querydict = QueryDict(mutable=True) + if self.accept_mediatype: + _querydict['acceptMediatype'] = self.accept_mediatype + # TODO: iriShorthand, include, fields[...] + return _querydict diff --git a/trove/util/chainmap.py b/trove/util/chainmap.py new file mode 100644 index 000000000..1622899ba --- /dev/null +++ b/trove/util/chainmap.py @@ -0,0 +1,56 @@ +from collections.abc import Sequence, Mapping +import dataclasses + + +@dataclasses.dataclass +class SimpleChainMap(Mapping): + """Combine multiple mappings for sequential lookup. + + For example, to emulate Python's normal lookup sequence: + + import __builtin__ + pylookup = SimpleChainMap([locals(), globals(), vars(__builtin__)]) + + >>> _map = SimpleChainMap([{'a':1, 'b':2}, {'a':3, 'd':4}]) + >>> _map['a'] + 1 + >>> _map['d'] + 4 + >>> _map['f'] + KeyError + >>> 'b' in _map + True + >>> 'c' in _map + False + >>> 'd' in _map + True + >>> _map.get('a', 10) + 1 + >>> _map.get('b', 20) + 2 + >>> _map.get('d', 30) + 4 + >>> _map.get('f', 40) + 40 + """ + maps: Sequence[Mapping] + + def __getitem__(self, key): + for _mapping in self.maps: + try: + return _mapping[key] + except KeyError: + pass + raise KeyError(key) + + def __iter__(self): + _seen: set = set() + for _mapping in self.maps: + yield from set(_mapping.keys()).difference(_seen) + _seen.update(_mapping.keys()) + + def __len__(self): + return len(self.keys()) + + def with_new(self, new_map): + return dataclasses.replace(self, maps=[new_map, *self.maps]) diff --git a/trove/util/iris.py b/trove/util/iris.py index 2b266e6c8..740ef659f 100644 --- a/trove/util/iris.py +++ b/trove/util/iris.py @@ -29,21 +29,63 @@ def get_sufficiently_unique_iri(iri: str) -> str: + ''' + >>> get_sufficiently_unique_iri('flipl://iri.example/blarg/?#') + '://iri.example/blarg' + >>> get_sufficiently_unique_iri('namly:urn.example:blerg') + 'namly:urn.example:blerg' + ''' (_suffuniq_iri, _) = get_sufficiently_unique_iri_and_scheme(iri) return _suffuniq_iri def get_iri_scheme(iri: str) -> str: + ''' + >>> get_iri_scheme('flipl://iri.example/blarg/?#') + 'flipl' + >>> get_iri_scheme('namly:urn.example:blerg') + 'namly' + ''' (_, _iri_scheme) = get_sufficiently_unique_iri_and_scheme(iri) return _iri_scheme def iris_sufficiently_equal(*iris) -> bool: + ''' + >>> iris_sufficiently_equal( + ... 'flipl://iri.example/blarg/blerg/?#', + ... 'http://iri.example/blarg/blerg', + ... 'https://iri.example/blarg/blerg', + ... 'git://iri.example/blarg/blerg', + ... ) + True + >>> iris_sufficiently_equal( + ... 'flipl://iri.example/blarg/blerg', + ... 'namly:iri.example/blarg/blerg', + ... ) + False + >>> iris_sufficiently_equal( + ... 'namly:urn.example:blerg', + ... 'namly:urn.example:blerg', + ... ) + True + >>> iris_sufficiently_equal( + ... 'namly:urn.example:blerg', + ... 'nimly:urn.example:blerg', + ... ) + False + ''' _suffuniq_iris = set(map(get_sufficiently_unique_iri, iris)) return len(_suffuniq_iris) == 1 def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]: + ''' + >>> get_sufficiently_unique_iri_and_scheme('flipl://iri.example/blarg/?#') + ('://iri.example/blarg', 'flipl') + >>> get_sufficiently_unique_iri_and_scheme('namly:urn.example:blerg') + ('namly:urn.example:blerg', 'namly') + ''' _scheme_match = IRI_SCHEME_REGEX_IGNORECASE.match(iri) if _scheme_match: _scheme = _scheme_match.group().lower() @@ -69,6 +111,14 @@ def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]: def is_worthwhile_iri(iri: str): + ''' + >>> is_worthwhile_iri('flipl://iri.example/blarg/?#') + True + >>> is_worthwhile_iri('namly:urn.example:blerg') + True + >>> is_worthwhile_iri('_:1234') + False + ''' return ( isinstance(iri, str) and not iri.startswith('_') # skip artefacts of sharev2 shenanigans @@ -90,6 +140,10 @@ def iri_path_as_keyword(iris: list[str] | tuple[str, ...], *, suffuniq=False) -> def unquote_iri(iri: str) -> str: + ''' + >>> unquote_iri('flipl://iri.example/blarg/?#') + >>> unquote_iri('namly:urn.example:blerg') + ''' _unquoted_iri = iri while QUOTED_IRI_REGEX.match(_unquoted_iri): _unquoted_iri = unquote(_unquoted_iri) diff --git a/trove/util/propertypath.py b/trove/util/propertypath.py index d9c702251..22e8163e6 100644 --- a/trove/util/propertypath.py +++ b/trove/util/propertypath.py @@ -30,34 +30,38 @@ def make_globpath(length: int) -> Propertypath: return ONE_GLOB_PROPERTYPATH * length -@dataclasses.dataclass -class PropertypathParser: - shorthand: rdf.IriShorthand - allow_globs: bool = True - - def parse_propertypath(self, serialized_path: str) -> Propertypath: - _path = tuple( - self.shorthand.expand_iri(_pathstep) - for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER) - ) - if GLOB_PATHSTEP in _path: - if not self.allow_globs: - raise trove_exceptions.InvalidPropertyPath(serialized_path, 'no * allowed') - if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path): - raise trove_exceptions.InvalidPropertyPath( - serialized_path, - f'path must be all * or no * (got {serialized_path})', - ) - return _path - - def propertypathstep_key(self, pathstep: str) -> str: - if pathstep == GLOB_PATHSTEP: - return pathstep - # assume iri - return urllib.parse.quote(self.shorthand.compact_iri(pathstep)) - - def propertypath_key(self, property_path: Propertypath) -> str: - return PROPERTYPATH_DELIMITER.join( - self.propertypathstep_key(_pathstep) - for _pathstep in property_path - ) +def parse_propertypath( + serialized_path: str, + shorthand: rdf.IriShorthand, + allow_globs: bool = False, +) -> Propertypath: + _path = tuple( + shorthand.expand_iri(_pathstep) + for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER) + ) + if GLOB_PATHSTEP in _path: + if not allow_globs: + raise trove_exceptions.InvalidPropertyPath(serialized_path, 'no * allowed') + if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path): + raise trove_exceptions.InvalidPropertyPath( + serialized_path, + f'path must be all * or no * (got {serialized_path})', + ) + return _path + + +def propertypathstep_key( + pathstep: str, + shorthand: rdf.IriShorthand, +) -> str: + if pathstep == GLOB_PATHSTEP: + return pathstep + # assume iri + return urllib.parse.quote(shorthand.compact_iri(pathstep)) + + +def propertypath_key(self, property_path: Propertypath) -> str: + return PROPERTYPATH_DELIMITER.join( + self.propertypathstep_key(_pathstep) + for _pathstep in property_path + ) diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py index fd9dc8021..c4128d481 100644 --- a/trove/util/queryparams.py +++ b/trove/util/queryparams.py @@ -1,22 +1,12 @@ from __future__ import annotations -import collections import dataclasses -import itertools import re import typing # TODO: remove django dependency (tho it is convenient) from django.http import QueryDict -from primitive_metadata import primitive_rdf as rdf from trove import exceptions as trove_exceptions -from trove.util.frozen import freeze -from trove.util.propertypath import ( - PropertypathSet, - Propertypath, - PropertypathParser, -) -from trove.vocab.namespaces import NAMESPACES_SHORTHAND ### @@ -120,116 +110,3 @@ def get_single_value( raise trove_exceptions.InvalidRepeatedQueryParam(str(queryparam_name)) else: return _singlevalue - - -@dataclasses.dataclass(frozen=True) -class BaseTroveParams: - iri_shorthand: rdf.IriShorthand = dataclasses.field(repr=False) - accept_mediatype: str | None - included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) - attrpaths_by_type: collections.abc.Mapping[str, PropertypathSet] = dataclasses.field(repr=False, compare=False) - - ### - # class methods - - @classmethod - def from_querystring(cls, querystring: str) -> typing.Self: - return cls.from_queryparams(queryparams_from_querystring(querystring)) - - @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> typing.Self: - return cls(**cls.parse_queryparams(queryparams)) - - @classmethod - def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: - # subclasses should override and add their fields to super().parse_queryparams(queryparams) - _shorthand = cls._gather_shorthand(queryparams) - return { - 'iri_shorthand': _shorthand, - 'included_relations': cls._gather_included_relations(queryparams, _shorthand), - 'attrpaths_by_type': cls._gather_attrpaths(queryparams, _shorthand), - 'accept_mediatype': get_single_value(queryparams, QueryparamName('acceptMediatype')), - } - - @classmethod - def _default_shorthand(cls) -> rdf.IriShorthand: - return NAMESPACES_SHORTHAND - - @classmethod - def _default_include(cls) -> PropertypathSet: - return frozenset() - - @classmethod - def _default_attrpaths(cls) -> dict[str, tuple[Propertypath, ...]]: - return {} - - @classmethod - def _gather_shorthand(cls, queryparams: QueryparamDict): - _prefixmap = {} - for _qp_name, _iri in queryparams.get('iriShorthand', []): - try: - (_shortname,) = _qp_name.bracketed_names - except ValueError: - raise trove_exceptions.InvalidQueryParamName(_qp_name) - else: - _prefixmap[_shortname] = _iri - _shorthand = cls._default_shorthand() - if _prefixmap: - _shorthand = _shorthand.with_update(_prefixmap) - return _shorthand - - @classmethod - def _gather_included_relations(cls, queryparams: QueryparamDict, shorthand: rdf.IriShorthand) -> PropertypathSet: - _include_params = queryparams.get('include', []) - if _include_params: - return frozenset(itertools.chain.from_iterable( - parse_propertypaths(_include_value, shorthand) - for _, _include_value in _include_params - )) - return cls._default_include() - - @classmethod - def _gather_attrpaths(cls, queryparams: QueryparamDict, shorthand: rdf.IriShorthand) -> collections.abc.Mapping[ - str, - tuple[Propertypath, ...], - ]: - _attrpaths: collections.ChainMap[str, tuple[Propertypath, ...]] = collections.ChainMap( - cls._default_attrpaths(), - ) - _fields_params = queryparams.get('fields', []) - if _fields_params: - _requested: dict[str, list[Propertypath]] = collections.defaultdict(list) - for _param_name, _param_value in _fields_params: - try: - (_typenames,) = filter(bool, _param_name.bracketed_names) - except (IndexError, ValueError): - raise trove_exceptions.InvalidQueryParamName( - f'expected "fields[TYPE]" (with exactly one non-empty bracketed segment)' - f' (got "{_param_name}")' - ) - else: - for _type in split_queryparam_value(_typenames): - _type_iri = shorthand.expand_iri(_type) - _requested[_type_iri].extend(parse_propertypaths(_param_value, shorthand)) - _attrpaths = _attrpaths.new_child(freeze(_requested)) - return _attrpaths - - ### - # instance methods - - def to_querystring(self) -> str: - return self.to_querydict().urlencode() - - def to_querydict(self) -> QueryDict: - # subclasses should override and add their fields to super().to_querydict() - _querydict = QueryDict(mutable=True) - if self.accept_mediatype: - _querydict['acceptMediatype'] = self.accept_mediatype - # TODO: iriShorthand, include, fields[...] - return _querydict - - -def parse_propertypaths(serialized_path_set: str, shorthand: rdf.IriShorthand) -> typing.Iterator[Propertypath]: - _parser = PropertypathParser(shorthand) - for _path in split_queryparam_value(serialized_path_set): - yield _parser.parse_propertypath(_path) diff --git a/trove/views/_base.py b/trove/views/_base.py index b6a49555f..8730c0b41 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -1,6 +1,7 @@ __all__ = ('BaseTroveView',) import abc +from collections.abc import Container from typing import ClassVar from django import http as djhttp @@ -24,9 +25,9 @@ class BaseTroveView(View, abc.ABC): # ClassVars expected on inheritors: - organizer: ClassVar[gather.GatheringOrganizer] + gathering_organizer: ClassVar[gather.GatheringOrganizer] params_type: ClassVar[type[BaseTroveParams]] = BaseTroveParams - focus_type: ClassVar[type[gather.Focus]] = gather.Focus + focus_type_iris: ClassVar[Container[str]] = (RDFS.Resource,) def get(self, request): try: @@ -39,7 +40,7 @@ def get(self, request): try: _url = request.build_absolute_uri() _params = self._parse_params(request) - _renderer = self._gather_to_renderer(_url, _params, renderer_type=_renderer_type) + _renderer = self._gather_to_renderer(_url, _params, _renderer_type) return make_http_response( content_rendering=_renderer.render_document(), http_request=request, @@ -50,20 +51,23 @@ def get(self, request): renderer_type=_renderer_type, ) - def _parse_params(self, request: djhttp.HttpRequest): - return self.params_type.from_querystring(request.META['QUERY_STRING']) - def _gather_to_renderer(self, url, params, renderer_type: type[BaseRenderer]) -> BaseRenderer: _focus = self._build_focus(url, params) - _gathering = self.organizer.new_gathering( - **self._get_gatherer_kwargs(params, renderer_type), - ) + _gathering = self._build_gathering(params, renderer_type) if renderer_type.PASSIVE_RENDER: ask_gathering_from_params(_gathering, params, _focus) return renderer_type(_focus, _gathering) + def _parse_params(self, request: djhttp.HttpRequest): + return self.params_type.from_querystring(request.META['QUERY_STRING']) + def _build_focus(self, url, params): - return self.focus_type.new(url, RDFS.Resource) + return gather.Focus(url, self.focus_type_iri) + + def _build_gathering(self, params, renderer_type: type[BaseRenderer]) -> gather.Gathering: + return self.gathering_organizer.new_gathering( + self._get_gatherer_kwargs(params, renderer_type), + ) def _get_gatherer_kwargs(self, params, renderer_type): _kwargs = {} diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py index 4f9112127..d3d3b5881 100644 --- a/trove/vocab/osfmap.py +++ b/trove/vocab/osfmap.py @@ -1,4 +1,7 @@ +import typing import functools +if typing.TYPE_CHECKING: + from collections.abc import Iterator from primitive_metadata.primitive_rdf import ( literal, @@ -8,6 +11,14 @@ from primitive_metadata import gather from share.models.feature_flag import FeatureFlag +from trove.util.propertypath import ( + Propertypath, + PropertypathSet, +) +from trove.util.queryparams import ( + join_queryparam_value, + split_queryparam_value, +) from trove.util.shorthand import build_shorthand_from_thesaurus from trove.vocab.jsonapi import JSONAPI_MEMBERNAME from trove.vocab.namespaces import ( @@ -814,17 +825,6 @@ ) -@functools.cache -def osfmap_shorthand() -> IriShorthand: - '''build iri shorthand that includes unprefixed osfmap terms - ''' - return build_shorthand_from_thesaurus( - thesaurus=OSFMAP_THESAURUS, - label_predicate=JSONAPI_MEMBERNAME, - base_shorthand=NAMESPACES_SHORTHAND, - ) - - ALL_SUGGESTED_PROPERTY_PATHS = ( (DCTERMS.created,), (OSFMAP.funder,), @@ -941,6 +941,49 @@ def osfmap_shorthand() -> IriShorthand: (DCTERMS.rights,), ) +# end constants +### + + +### +# functions + +@functools.cache # built once +def osfmap_shorthand() -> IriShorthand: + '''build iri shorthand that includes unprefixed osfmap terms + ''' + return build_shorthand_from_thesaurus( + thesaurus=OSFMAP_THESAURUS, + label_predicate=JSONAPI_MEMBERNAME, + base_shorthand=NAMESPACES_SHORTHAND, + ) + + +def parse_osfmap_propertypath(serialized_path: str, *, allow_globs=False) -> Propertypath: + return propertypaths.parse_propertypath(serialized_path, osfmap_shorthand(), allow_globs=allow_globs) + + +def parse_osfmap_propertypath_set(serialized_path_set: str) -> Iterator[Propertypath]: + _parser = PropertypathParser(osfmap_shorthand()) + for _path in split_queryparam_value(serialized_path_set): + yield _parser.parse_propertypath(_path) + return propertypaths.parse_propertypath(serialized_path, osfmap_shorthand(), allow_globs=allow_globs) + + +def osfmap_propertypath_key(propertypath: Propertypath) -> str: + return ( + PropertypathParser(osfmap_shorthand()) + .propertypath_key(propertypath) + ) + + +def osfmap_propertypath_set_key(propertypath_set: PropertypathSet) -> str: + _parser = PropertypathParser(osfmap_shorthand()) + return join_queryparam_value( + _parser.propertypath_key(_propertypath) + for _propertypath in propertypath_set + ) + def suggested_property_paths(type_iris: set[str]) -> tuple[tuple[str, ...], ...]: _suggested: tuple[tuple[str, ...], ...] From f69bb8787c44e26543ab20bcb72d708728fef2ff Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 28 Mar 2025 13:20:07 -0400 Subject: [PATCH 03/43] wip --- project/urls.py | 4 +- trove/render/html_browse.py | 2 +- trove/trovebrowse_gathering.py | 10 ++--- trove/trovesearch/search_params.py | 21 +++++----- trove/urls.py | 3 +- trove/util/base_trove_params.py | 15 ++++--- trove/util/propertypath.py | 28 +++++++++++-- trove/views/_base.py | 29 ++++++++------ trove/views/browse.py | 63 +++++++++++------------------- trove/views/vocab.py | 39 +++++------------- trove/vocab/osfmap.py | 19 ++++----- trove/vocab/trove.py | 2 +- 12 files changed, 110 insertions(+), 125 deletions(-) diff --git a/project/urls.py b/project/urls.py index 3a4a48aa0..da8ad1f28 100644 --- a/project/urls.py +++ b/project/urls.py @@ -17,8 +17,8 @@ urlpatterns = [ url(r'^admin/', admin_site.urls), # url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')), - path('api/v3/', include('trove.urls', namespace='trove')), # same as 'trove/' but more subtle - path('trove/', include('trove.urls', namespace='trovetrove')), + path('api/v3/', include('trove.urls', namespace='apiv3')), # same as 'trove/' but more subtle + path('trove/', include('trove.urls', namespace='trove')), path('vocab/2023/trove/', view=TroveVocabView.as_view(), name='trove-vocab'), url(r'^api/v2/', include('api.urls', namespace='api')), url(r'^api/(?P(?!v\d+).*)', APIVersionRedirectView.as_view()), diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index a1acf7453..bdd067a06 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -102,7 +102,7 @@ def __mediatype_link(self, mediatype: str): if mediatype in STABLE_MEDIATYPES: with self.__nest('aside') as _aside: _aside.text = '(stable for ' - with self.__nest('a', attrs={'href': reverse('trovetrove:docs')}) as _link: + with self.__nest('a', attrs={'href': reverse('trove:docs')}) as _link: _link.text = 'documented use' _link.tail = ')' diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py index 8c8458a86..5c687379b 100644 --- a/trove/trovebrowse_gathering.py +++ b/trove/trovebrowse_gathering.py @@ -39,14 +39,14 @@ def gather_thesaurus_entry(focus): yield from rdf.iter_twoples(_thesaurus_entry) -@trovebrowse.gatherer(ns.DCTERMS.isReferencedBy) -def gather_cards_referencing(focus): - ... # TODO via elasticsearch aggregation - - @trovebrowse.gatherer(ns.FOAF.primaryTopicOf) def gather_cards_focused_on(focus): _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs) for _indexcard in _indexcard_qs: yield (ns.FOAF.primaryTopicOf, _indexcard.get_iri()) + + +@trovebrowse.gatherer(ns.TROVE.usedAtPath) +def gather_paths_used_at(focus): + ... # TODO via elasticsearch aggregation diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index f94f62740..0525e5115 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -22,14 +22,13 @@ Propertypath, is_globpath, ) +from trove.util.base_trove_params import BaseTroveParams from trove.util.queryparams import ( - BaseTroveParams, QueryparamDict, QueryparamName, split_queryparam_value, join_queryparam_value, get_single_value, - parse_propertypaths, ) from trove.vocab import osfmap from trove.vocab.trove import trove_shorthand @@ -156,7 +155,7 @@ def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_fam @classmethod def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str): _propertypath_set = ( - frozenset(parse_propertypaths(param_name.bracketed_names[0], osfmap.osfmap_shorthand())) + frozenset(osfmap.parse_osfmap_propertypath_set(param_name.bracketed_names[0])) if param_name.bracketed_names else None ) @@ -261,7 +260,7 @@ def queryparams_from_textsegments(self, queryparam_family: str, textsegments): for _propertypath_set, _combinable_segments in _by_propertypath_set.items(): _qp_name = QueryparamName( queryparam_family, - (osfmap_propertypath_set_key(_propertypath_set),), + (osfmap.osfmap_propertypath_set_key(_propertypath_set),), ) _qp_value = ' '.join( _textsegment.as_searchtext() @@ -341,9 +340,9 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str): str(param_name), f'unknown filter operator "{_operator_value}"', ) - _propertypath_set = frozenset(parse_propertypaths(_serialized_path_set, osfmap_shorthand())) + _propertypath_set = frozenset(osfmap.parse_osfmap_propertypath_set(_serialized_path_set)) _is_date_filter = all( - is_date_property(_path[-1]) + osfmap.is_date_property(_path[-1]) for _path in _propertypath_set ) if _operator is None: # default operator @@ -363,7 +362,7 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str): if _is_date_filter: _value_list.append(_value) # TODO: vali-date else: - _value_list.append(osfmap_shorthand().expand_iri(_value)) + _value_list.append(osfmap.osfmap_shorthand().expand_iri(_value)) return cls( value_set=frozenset(_value_list), operator=_operator, @@ -388,11 +387,11 @@ def is_type_filter(self) -> bool: def as_queryparam(self, queryparam_family: str): _qp_name = QueryparamName(queryparam_family, ( - osfmap_propertypath_set_key(self.propertypath_set), + osfmap.osfmap_propertypath_set_key(self.propertypath_set), self.operator.to_shortname(), )) _qp_value = join_queryparam_value( - osfmap_shorthand().compact_iri(_value) + osfmap.osfmap_shorthand().compact_iri(_value) for _value in self.value_set ) return str(_qp_name), _qp_value @@ -459,7 +458,7 @@ def as_queryparam(self) -> tuple[str, str]: if (self.value_type == ValueType.DATE) else f'sort[{self.value_type.to_shortname()}]' ) - _pathkey = osfmap_propertypath_key(self.propertypath) + _pathkey = osfmap.osfmap_propertypath_key(self.propertypath) _value = (f'-{_pathkey}' if self.descending else _pathkey) return (_name, _value) @@ -575,7 +574,7 @@ def __post_init__(self): def to_querydict(self): _querydict = super().to_querydict() - _querydict['valueSearchPropertyPath'] = osfmap_propertypath_key(self.valuesearch_propertypath) + _querydict['valueSearchPropertyPath'] = osfmap.osfmap_propertypath_key(self.valuesearch_propertypath) for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: diff --git a/trove/urls.py b/trove/urls.py index b58c7127f..8b183a48e 100644 --- a/trove/urls.py +++ b/trove/urls.py @@ -20,10 +20,9 @@ path('index-card/', view=IndexcardView.as_view(), name='index-card'), path('index-card-search', view=CardsearchView.as_view(), name='index-card-search'), path('index-value-search', view=ValuesearchView.as_view(), name='index-value-search'), - path('browse///', view=BrowseIriView.as_view(), name='browse-iri'), path('browse', view=BrowseIriView.as_view(), name='browse-iri'), path('ingest', view=RdfIngestView.as_view(), name='ingest-rdf'), path('docs/openapi.json', view=OpenapiJsonView.as_view(), name='docs.openapi-json'), path('docs/openapi.html', view=OpenapiHtmlView.as_view(), name='docs.openapi-html'), - re_path(r'docs/?', view=RedirectView.as_view(pattern_name='trovetrove:docs.openapi-html'), name='docs'), + re_path(r'docs/?', view=RedirectView.as_view(pattern_name='trove:docs.openapi-html'), name='docs'), ] diff --git a/trove/util/base_trove_params.py b/trove/util/base_trove_params.py index 855abb39b..742cc85f0 100644 --- a/trove/util/base_trove_params.py +++ b/trove/util/base_trove_params.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections import defaultdict import dataclasses -import itertools import typing if typing.TYPE_CHECKING: from collections.abc import Mapping @@ -16,9 +15,9 @@ from trove.util.propertypath import ( PropertypathSet, Propertypath, + parse_propertypath, ) from trove.util import queryparams as _qp -from trove.vocab.namespaces import NAMESPACES_SHORTHAND from trove.vocab import osfmap @@ -53,7 +52,7 @@ def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: @classmethod def _default_shorthand(cls) -> rdf.IriShorthand: - return NAMESPACES_SHORTHAND + return osfmap.osfmap_shorthand() # NOTE: osfmap entanglement @classmethod def _default_include(cls) -> PropertypathSet: @@ -82,9 +81,10 @@ def _gather_shorthand(cls, queryparams: _qp.QueryparamDict): def _gather_included_relations(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand) -> PropertypathSet: _include_params = queryparams.get('include', []) if _include_params: - return frozenset(itertools.chain.from_iterable( - parse_propertypaths(_include_value, shorthand) + return frozenset(( + parse_propertypath(_path_value, shorthand) for _, _include_value in _include_params + for _path_value in _qp.split_queryparam_value(_include_value) )) return cls._default_include() @@ -108,7 +108,10 @@ def _gather_attrpaths(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriSh else: for _type in _qp.split_queryparam_value(_typenames): _type_iri = shorthand.expand_iri(_type) - _requested[_type_iri].extend(parse_propertypaths(_param_value, shorthand)) + _requested[_type_iri].extend( + parse_propertypath(_path_value, shorthand) + for _path_value in _qp.split_queryparam_value(_param_value) + ) _attrpaths = _attrpaths.with_new(freeze(_requested)) return _attrpaths diff --git a/trove/util/propertypath.py b/trove/util/propertypath.py index 22e8163e6..c33355fad 100644 --- a/trove/util/propertypath.py +++ b/trove/util/propertypath.py @@ -1,4 +1,3 @@ -import dataclasses import urllib from primitive_metadata import primitive_rdf as rdf @@ -23,10 +22,28 @@ def is_globpath(path: Propertypath) -> bool: + ''' + >>> is_globpath(('*',)) + True + >>> is_globpath(('*', '*')) + True + >>> is_globpath(('*', 'url:url')) + False + >>> is_globpath(()) + False + ''' return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) def make_globpath(length: int) -> Propertypath: + ''' + >>> make_globpath(1) + ('*',) + >>> make_globpath(2) + ('*', '*') + >>> make_globpath(5) + ('*', '*', '*', '*', '*') + ''' return ONE_GLOB_PROPERTYPATH * length @@ -60,8 +77,11 @@ def propertypathstep_key( return urllib.parse.quote(shorthand.compact_iri(pathstep)) -def propertypath_key(self, property_path: Propertypath) -> str: +def propertypath_key( + path: Propertypath, + shorthand: rdf.IriShorthand, +) -> str: return PROPERTYPATH_DELIMITER.join( - self.propertypathstep_key(_pathstep) - for _pathstep in property_path + propertypathstep_key(_pathstep, shorthand) + for _pathstep in path ) diff --git a/trove/views/_base.py b/trove/views/_base.py index 8730c0b41..e62b10806 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -38,31 +38,36 @@ def get(self, request): renderer_type=DEFAULT_RENDERER_TYPE, ) try: - _url = request.build_absolute_uri() _params = self._parse_params(request) - _renderer = self._gather_to_renderer(_url, _params, _renderer_type) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) + return self._make_response(request, _params, _renderer_type) except trove_exceptions.TroveError as _error: return make_http_error_response( error=_error, renderer_type=_renderer_type, ) - def _gather_to_renderer(self, url, params, renderer_type: type[BaseRenderer]) -> BaseRenderer: - _focus = self._build_focus(url, params) + def _respond(self, request, params, renderer_type: type[BaseRenderer]): + _focus = self._build_focus(request, params) + _renderer = self._gather_to_renderer(_focus, params, renderer_type) + return make_http_response( + content_rendering=_renderer.render_document(), + http_request=request, + ) + + def _gather_to_renderer(self, focus, params, renderer_type: type[BaseRenderer]) -> BaseRenderer: _gathering = self._build_gathering(params, renderer_type) if renderer_type.PASSIVE_RENDER: - ask_gathering_from_params(_gathering, params, _focus) - return renderer_type(_focus, _gathering) + ask_gathering_from_params(_gathering, params, focus) + return renderer_type(focus, _gathering) def _parse_params(self, request: djhttp.HttpRequest): return self.params_type.from_querystring(request.META['QUERY_STRING']) - def _build_focus(self, url, params): - return gather.Focus(url, self.focus_type_iri) + def _get_focus_iri(self, request, params): + return request.build_absolute_uri() + + def _build_focus(self, request, params): + return gather.Focus(self._get_focus_iri(request, params), self.focus_type_iri) def _build_gathering(self, params, renderer_type: type[BaseRenderer]) -> gather.Gathering: return self.gathering_organizer.new_gathering( diff --git a/trove/views/browse.py b/trove/views/browse.py index e66189501..fa20c6e1d 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -1,56 +1,37 @@ import dataclasses -from django import http -from django.shortcuts import redirect -from django.views import View -from primitive_metadata import primitive_rdf - -from trove import models as trove_db -from trove.render import get_renderer_type -from trove.util.iris import unquote_iri, get_sufficiently_unique_iri +from trove import exceptions as trove_exceptions +from trove.util.iris import unquote_iri from trove.vocab import namespaces as ns from trove.trovebrowse_gathering import trovebrowse -from trove.trovesearch.search_params import BaseTroveParams +from trove.util.base_trove_params import BaseTroveParams +from trove.util.queryparams import ( + QueryparamDict, + QueryparamName, + get_single_value, +) from ._base import BaseTroveView -from ._responder import make_http_response @dataclasses.dataclass(frozen=True) class BrowseParams(BaseTroveParams): iri: str + @classmethod + def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: + _iri_value = get_single_value(queryparams, QueryparamName('iri')) + if not _iri_value: + raise trove_exceptions.MissingRequiredQueryParam('iri') + _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_value)) + return { + **super().parse_queryparams(queryparams), + 'iri': _iri, + } + class BrowseIriView(BaseTroveView): - organizer = trovebrowse + gathering_organizer = trovebrowse params_type = BrowseParams - def get(self, request, **kwargs): - _iri_param = kwargs.get('iri') or request.GET.get('iri') - if not _iri_param: - raise http.Http404 # TODO: docs? random browse? - _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_param)) - _suffuniq_iri = get_sufficiently_unique_iri(_iri) - _trove_term = _recognize_trove_term(_suffuniq_iri) - if _trove_term is not None: - return redirect('trove-vocab', vocab_term=_trove_term) - _card_focus_iri, _combined_rdf = _get_latest_cardf(_iri) - _thesaurus_entry = static_vocab.combined_thesaurus__suffuniq().get(_suffuniq_iri, {}) - if _thesaurus_entry: - _combined_rdf.add_twopledict(_card_focus_iri, _thesaurus_entry) - _renderer_type = get_renderer_type(request) - _renderer = _renderer_type( - _card_focus_iri, - _combined_rdf.tripledict, - ) - return make_http_response( - content_rendering=_renderer.render_document(), - http_headers=[('Content-Disposition', 'inline')], - http_request=request, - ) - - -def _recognize_trove_term(suffuniq_iri: str): - _suffuniq_trove = get_sufficiently_unique_iri(str(ns.TROVE)) - if suffuniq_iri.startswith(_suffuniq_trove): - return primitive_rdf.iri_minus_namespace(suffuniq_iri, _suffuniq_trove).strip('/') - return None + def _get_focus_iri(self, request, params: BrowseParams): + return params.iri diff --git a/trove/views/vocab.py b/trove/views/vocab.py index 62982f34e..b2c5026b0 100644 --- a/trove/views/vocab.py +++ b/trove/views/vocab.py @@ -1,40 +1,21 @@ +from urllib.parse import urlencode + from django import http +from django.shortcuts import redirect +from django.urls import reverse from django.views import View -from trove import exceptions as trove_exceptions -from trove.render import ( - DEFAULT_RENDERER_TYPE, - get_renderer_type, -) from trove.vocab.namespaces import TROVE from trove.vocab.trove import TROVE_API_THESAURUS -from ._responder import ( - make_http_error_response, - make_http_response, -) class TroveVocabView(View): def get(self, request, vocab_term): _iri = TROVE[vocab_term] - try: - _data = {_iri: TROVE_API_THESAURUS[_iri]} - except KeyError: + if _iri not in TROVE_API_THESAURUS: raise http.Http404 - try: - _renderer_type = get_renderer_type(request) - _renderer = _renderer_type(_iri, _data) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) - except trove_exceptions.CannotRenderMediatype as _error: - return make_http_error_response( - error=_error, - renderer_type=DEFAULT_RENDERER_TYPE, - ) - except trove_exceptions.TroveError as _error: - return make_http_error_response( - error=_error, - renderer_type=_renderer_type, - ) + _browse_url = '?'.join(( + reverse('trove-browse'), + urlencode({'iri': _iri}), + )) + return redirect(_browse_url) diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py index d3d3b5881..91deb1bdb 100644 --- a/trove/vocab/osfmap.py +++ b/trove/vocab/osfmap.py @@ -1,3 +1,4 @@ +from __future__ import annotations import typing import functools if typing.TYPE_CHECKING: @@ -14,6 +15,8 @@ from trove.util.propertypath import ( Propertypath, PropertypathSet, + parse_propertypath, + propertypath_key, ) from trove.util.queryparams import ( join_queryparam_value, @@ -960,27 +963,21 @@ def osfmap_shorthand() -> IriShorthand: def parse_osfmap_propertypath(serialized_path: str, *, allow_globs=False) -> Propertypath: - return propertypaths.parse_propertypath(serialized_path, osfmap_shorthand(), allow_globs=allow_globs) + return parse_propertypath(serialized_path, osfmap_shorthand(), allow_globs=allow_globs) -def parse_osfmap_propertypath_set(serialized_path_set: str) -> Iterator[Propertypath]: - _parser = PropertypathParser(osfmap_shorthand()) +def parse_osfmap_propertypath_set(serialized_path_set: str, *, allow_globs=False) -> Iterator[Propertypath]: for _path in split_queryparam_value(serialized_path_set): - yield _parser.parse_propertypath(_path) - return propertypaths.parse_propertypath(serialized_path, osfmap_shorthand(), allow_globs=allow_globs) + yield parse_osfmap_propertypath(_path, allow_globs=allow_globs) def osfmap_propertypath_key(propertypath: Propertypath) -> str: - return ( - PropertypathParser(osfmap_shorthand()) - .propertypath_key(propertypath) - ) + return propertypath_key(propertypath, osfmap_shorthand()) def osfmap_propertypath_set_key(propertypath_set: PropertypathSet) -> str: - _parser = PropertypathParser(osfmap_shorthand()) return join_queryparam_value( - _parser.propertypath_key(_propertypath) + osfmap_propertypath_key(_propertypath) for _propertypath in propertypath_set ) diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index c8c2f377f..a126fbca2 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -48,7 +48,7 @@ def _literal_markdown(text: str, *, language: str): def trove_browse_link(iri: str): return urllib.parse.urljoin( - reverse('trovetrove:browse-iri'), + reverse('trove:browse-iri'), f'?iri={urllib.parse.quote(iri)}', ) From d0b142c9b94e0b6a93f92540f4f9ffca99ba0170 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 28 Mar 2025 15:33:56 -0400 Subject: [PATCH 04/43] wip --- share/util/checksum_iri.py | 4 ---- .../_common_trovesearch_tests.py | 4 ++-- trove/trovesearch/search_params.py | 3 ++- trove/trovesearch/trovesearch_gathering.py | 22 ++++++------------- trove/util/base_trove_params.py | 5 +++-- trove/views/_base.py | 2 +- trove/vocab/trove.py | 19 +++++++++------- 7 files changed, 26 insertions(+), 33 deletions(-) diff --git a/share/util/checksum_iri.py b/share/util/checksum_iri.py index e204b1126..012fdbab2 100644 --- a/share/util/checksum_iri.py +++ b/share/util/checksum_iri.py @@ -72,7 +72,3 @@ def from_iri(cls, checksum_iri: str): salt=salt, hexdigest=hexdigest, ) - - @classmethod - def from_dataclass_instance(cls, dataclass_instance): - return cls.digest_json(dataclasses.asdict(dataclass_instance)) diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 6d6eab52b..f8d49485b 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -145,14 +145,14 @@ def test_cardsearch_pagination(self): def test_cardsearch_related_properties(self): self._fill_test_data_for_querying() with mock.patch( - 'trove.trovesearch.search_params.suggested_property_paths', + 'trove.vocab.osfmap.suggested_property_paths', return_value=( (DCTERMS.creator,), (DCTERMS.references,), (BLARG.nada,), ), ): - _cardsearch_params = CardsearchParams.from_querystring('') + _cardsearch_params = CardsearchParams.from_querystring('include=relatedProperties') _cardsearch_handle = self.index_strategy.pls_handle_cardsearch(_cardsearch_params) self.assertEqual(_cardsearch_handle.related_propertypath_results, [ PropertypathUsage((DCTERMS.creator,), 3), diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 0525e5115..26ec31656 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -155,7 +155,7 @@ def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_fam @classmethod def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str): _propertypath_set = ( - frozenset(osfmap.parse_osfmap_propertypath_set(param_name.bracketed_names[0])) + frozenset(osfmap.parse_osfmap_propertypath_set(param_name.bracketed_names[0], allow_globs=True)) if param_name.bracketed_names else None ) @@ -492,6 +492,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: @functools.cached_property def related_property_paths(self) -> tuple[Propertypath, ...]: + breakpoint() return ( _get_related_property_paths(self.cardsearch_filter_set) if (TROVE.relatedPropertyList,) in self.included_relations diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 7b3e130a3..9d97765da 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -26,20 +26,12 @@ ValuesearchResult, ) from trove.util.iris import get_sufficiently_unique_iri -from trove.util.propertypath import ( - propertypath_key, - propertypath_set_key, -) from trove.vocab.namespaces import RDF, FOAF, DCTERMS, RDFS, DCAT, TROVE from trove.vocab.jsonapi import ( JSONAPI_LINK_OBJECT, JSONAPI_MEMBERNAME, ) -from trove.vocab.osfmap import ( - osfmap_shorthand, - OSFMAP_THESAURUS, - suggested_filter_operator, -) +from trove.vocab import osfmap from trove.vocab.trove import ( TROVE_API_THESAURUS, trove_indexcard_namespace, @@ -441,7 +433,7 @@ def _filter_as_blanknode(search_filter) -> frozenset: def _osfmap_or_unknown_iri_as_json(iri: str): try: - _twopledict = OSFMAP_THESAURUS[iri] + _twopledict = osfmap.OSFMAP_THESAURUS[iri] except KeyError: return rdf.literal_json({'@id': iri}) else: @@ -497,19 +489,19 @@ def _osfmap_twople_json(twopledict): def _osfmap_path(property_path): return rdf.literal_json([ - osfmap_shorthand().compact_iri(_iri) + osfmap.osfmap_shorthand().compact_iri(_iri) for _iri in property_path ]) def _single_propertypath_twoples(property_path: tuple[str, ...]): - yield (TROVE.propertyPathKey, literal(propertypath_key(property_path))) + yield (TROVE.propertyPathKey, literal(osfmap.osfmap_propertypath_key(property_path))) yield (TROVE.propertyPath, _propertypath_sequence(property_path)) yield (TROVE.osfmapPropertyPath, _osfmap_path(property_path)) def _multi_propertypath_twoples(propertypath_set): - yield (TROVE.propertyPathKey, literal(propertypath_set_key(propertypath_set))) + yield (TROVE.propertyPathKey, literal(osfmap.osfmap_propertypath_set_key(propertypath_set))) for _path in propertypath_set: yield (TROVE.propertyPathSet, _propertypath_sequence(_path)) @@ -518,7 +510,7 @@ def _propertypath_sequence(property_path: tuple[str, ...]): _propertypath_metadata = [] for _property_iri in property_path: try: - _property_twopledict = OSFMAP_THESAURUS[_property_iri] + _property_twopledict = osfmap.OSFMAP_THESAURUS[_property_iri] except KeyError: _property_twopledict = {RDF.type: {RDF.Property}} # giving benefit of the doubt _propertypath_metadata.append(_osfmap_json( @@ -533,7 +525,7 @@ def _related_property_result(property_path: tuple[str, ...], count: int): (RDF.type, TROVE.RelatedPropertypath), (TROVE.cardsearchResultCount, count), (TROVE.suggestedFilterOperator, literal(trove_shorthand().compact_iri( - suggested_filter_operator(property_path[-1]), + osfmap.suggested_filter_operator(property_path[-1]), ))), *_single_propertypath_twoples(property_path), )) diff --git a/trove/util/base_trove_params.py b/trove/util/base_trove_params.py index 742cc85f0..23254c48d 100644 --- a/trove/util/base_trove_params.py +++ b/trove/util/base_trove_params.py @@ -18,7 +18,7 @@ parse_propertypath, ) from trove.util import queryparams as _qp -from trove.vocab import osfmap +from trove.vocab.trove import shtrove_shorthand @dataclasses.dataclass(frozen=True) @@ -52,7 +52,7 @@ def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: @classmethod def _default_shorthand(cls) -> rdf.IriShorthand: - return osfmap.osfmap_shorthand() # NOTE: osfmap entanglement + return shtrove_shorthand() @classmethod def _default_include(cls) -> PropertypathSet: @@ -80,6 +80,7 @@ def _gather_shorthand(cls, queryparams: _qp.QueryparamDict): @classmethod def _gather_included_relations(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand) -> PropertypathSet: _include_params = queryparams.get('include', []) + breakpoint() if _include_params: return frozenset(( parse_propertypath(_path_value, shorthand) diff --git a/trove/views/_base.py b/trove/views/_base.py index e62b10806..ebe040271 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -10,7 +10,7 @@ from trove import exceptions as trove_exceptions from trove.vocab.namespaces import RDFS, TROVE -from trove.util.queryparams import BaseTroveParams +from trove.util.base_trove_params import BaseTroveParams from trove.render import ( BaseRenderer, DEFAULT_RENDERER_TYPE, diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index a126fbca2..69207daf0 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -18,11 +18,7 @@ JSONAPI_ATTRIBUTE, JSONAPI_RELATIONSHIP, ) -from trove.vocab.osfmap import ( - DATE_PROPERTIES, - OSFMAP_LINK, - osfmap_shorthand, -) +from trove.vocab import osfmap from trove.vocab.namespaces import ( DCTERMS, OWL, @@ -105,7 +101,7 @@ def trove_browse_link(iri: str): * `@id` with the focus iri * `@type` with the focus resource's `rdf:type` -* property keys from [OSFMAP]({OSFMAP_LINK}) shorthand (each corresponding to an iri) +* property keys from [OSFMAP]({osfmap.OSFMAP_LINK}) shorthand (each corresponding to an iri) * property values as lists of objects: * literal text as `{{"@value": "..."}}` * iri references as `{{"@id": "..."}}` @@ -676,7 +672,7 @@ def trove_browse_link(iri: str): to sort by date values, use `sort` (or `sort[date-value]`) with a **property-path** that ends with one of the following supported date properties: -{", ".join(f"`{osfmap_shorthand().compact_iri(_date_iri)}`" for _date_iri in DATE_PROPERTIES)} +{", ".join(f"`{osfmap.osfmap_shorthand().compact_iri(_date_iri)}`" for _date_iri in osfmap.DATE_PROPERTIES)} to sort by integer values, use `sort[integer-value]` with a **property-path** to the integers of interest. @@ -723,7 +719,7 @@ def trove_browse_link(iri: str): DCTERMS.description: {_literal_markdown(f'''a **property-path** is a dot-separated path of short-hand IRIs, used in several api parameters -currently the only supported shorthand is defined by [OSFMAP]({OSFMAP_LINK}) +currently the only supported shorthand is defined by [OSFMAP]({osfmap.OSFMAP_LINK}) for example, `creator.name` is parsed as a two-step path that follows `creator` (aka `dcterms:creator`, ``) and then `name` (aka `foaf:name`, ``) @@ -850,6 +846,13 @@ def trove_shorthand() -> IriShorthand: ) +@functools.cache +def shtrove_shorthand() -> IriShorthand: + '''build iri shorthand that includes unprefixed terms (as defined in TROVE_API_THESAURUS) + ''' + return trove_shorthand().with_update(osfmap.osfmap_shorthand().prefix_map) + + @functools.cache def trove_indexcard_namespace(): return IriNamespace(f'{settings.SHARE_WEB_URL}trove/index-card/') From 21c3c8404eff8174a4f1ec2605ac99464c33c8b9 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 31 Mar 2025 14:04:22 -0400 Subject: [PATCH 05/43] (wip) improve tests --- tests/trove/test_doctest.py | 6 +++++- trove/trovesearch/search_params.py | 1 - trove/util/base_trove_params.py | 1 - trove/util/chainmap.py | 32 ++++++++++++++++++++---------- trove/util/iris.py | 20 +++++++++++++++---- 5 files changed, 42 insertions(+), 18 deletions(-) diff --git a/tests/trove/test_doctest.py b/tests/trove/test_doctest.py index be74792f6..8da33a947 100644 --- a/tests/trove/test_doctest.py +++ b/tests/trove/test_doctest.py @@ -3,6 +3,10 @@ import trove.util.chainmap import trove.util.iris +_DOCTEST_OPTIONFLAGS = ( + doctest.ELLIPSIS + | doctest.NORMALIZE_WHITESPACE +) _MODULES_WITH_DOCTESTS = ( trove.util.iris, @@ -25,5 +29,5 @@ def _test(): # HACK: allow running with pytest globals().update({ f'test_doctest_{_module.__name__}_{_i}': _make_test_fn(_test_case) - for _i, _test_case in enumerate(doctest.DocTestSuite(_module)) + for _i, _test_case in enumerate(doctest.DocTestSuite(_module, optionflags=_DOCTEST_OPTIONFLAGS)) }) diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 26ec31656..dfaeba405 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -492,7 +492,6 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: @functools.cached_property def related_property_paths(self) -> tuple[Propertypath, ...]: - breakpoint() return ( _get_related_property_paths(self.cardsearch_filter_set) if (TROVE.relatedPropertyList,) in self.included_relations diff --git a/trove/util/base_trove_params.py b/trove/util/base_trove_params.py index 23254c48d..eb6ab21ff 100644 --- a/trove/util/base_trove_params.py +++ b/trove/util/base_trove_params.py @@ -80,7 +80,6 @@ def _gather_shorthand(cls, queryparams: _qp.QueryparamDict): @classmethod def _gather_included_relations(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand) -> PropertypathSet: _include_params = queryparams.get('include', []) - breakpoint() if _include_params: return frozenset(( parse_propertypath(_path_value, shorthand) diff --git a/trove/util/chainmap.py b/trove/util/chainmap.py index 1622899ba..48a1be487 100644 --- a/trove/util/chainmap.py +++ b/trove/util/chainmap.py @@ -1,4 +1,4 @@ -from collections.abc import Sequence, Mapping +from collections.abc import Sequence, Mapping, Iterator import dataclasses @@ -6,10 +6,8 @@ class SimpleChainMap(Mapping): """Combine multiple mappings for sequential lookup. - For example, to emulate Python's normal lookup sequence: - - import __builtin__ - pylookup = SimpleChainMap([locals(), globals(), vars(__builtin__)]) + (inspired by rejecting the suggested "greatly simplified read-only version of Chainmap" + linked from python docs: https://code.activestate.com/recipes/305268/ ) >>> _map = SimpleChainMap([{'a':1, 'b':2}, {'a':3, 'd':4}]) >>> _map['a'] @@ -17,7 +15,9 @@ class SimpleChainMap(Mapping): >>> _map['d'] 4 >>> _map['f'] - KeyError + Traceback (most recent call last): + ... + KeyError: 'f' >>> 'b' in _map True >>> 'c' in _map @@ -32,6 +32,14 @@ class SimpleChainMap(Mapping): 4 >>> _map.get('f', 40) 40 + >>> sorted(_map) + ['a', 'b', 'd'] + >>> _map + SimpleChainMap(maps=[{'a': 1, 'b': 2}, {'a': 3, 'd': 4}]) + >>> _map.with_new({'a': 11, 'z': 13}) + SimpleChainMap(maps=[{'a': 11, 'z': 13}, {'a': 1, 'b': 2}, {'a': 3, 'd': 4}]) + >>> _map.with_new({'a': 17}).get('a') + 17 """ maps: Sequence[Mapping] @@ -43,14 +51,16 @@ def __getitem__(self, key): pass raise KeyError(key) - def __iter__(self): + def __iter__(self) -> Iterator: _seen: set = set() for _mapping in self.maps: - yield from set(_mapping.keys()).difference(_seen) - _seen.update(_mapping.keys()) + for _key in _mapping.keys(): + if _key not in _seen: + yield _key + _seen.add(_key) - def __len__(self): - return len(self.keys()) + def __len__(self): # for Mapping + return sum(1 for _ in self) # use __iter__ def with_new(self, new_map): return dataclasses.replace(self, maps=[new_map, *self.maps]) diff --git a/trove/util/iris.py b/trove/util/iris.py index 740ef659f..5fbe9c234 100644 --- a/trove/util/iris.py +++ b/trove/util/iris.py @@ -126,10 +126,16 @@ def is_worthwhile_iri(iri: str): def iri_path_as_keyword(iris: list[str] | tuple[str, ...], *, suffuniq=False) -> str: - assert isinstance(iris, (list, tuple)) and all( - isinstance(_pathstep, str) - for _pathstep in iris - ), f'expected list or tuple of str, got {iris}' + '''return a string-serialized list of iris + + meant for storing in an elasticsearch "keyword" field (happens to use json) + >>> iri_path_as_keyword(['flipl://iri.example/blarg', 'namly:urn.example:blerg']) + '["flipl://iri.example/blarg", "namly:urn.example:blerg"]' + >>> iri_path_as_keyword( + ... ['flipl://iri.example/blarg', 'namly:urn.example:blerg'], + ... suffuniq=True) + '["://iri.example/blarg", "namly:urn.example:blerg"]' + ''' _list = iris if suffuniq: _list = [ @@ -142,7 +148,13 @@ def iri_path_as_keyword(iris: list[str] | tuple[str, ...], *, suffuniq=False) -> def unquote_iri(iri: str) -> str: ''' >>> unquote_iri('flipl://iri.example/blarg/?#') + 'flipl://iri.example/blarg/?#' + >>> unquote_iri('flipl%3A//iri.example/blarg/%3F%23') + 'flipl://iri.example/blarg/?#' >>> unquote_iri('namly:urn.example:blerg') + 'namly:urn.example:blerg' + >>> unquote_iri('namly%3Aurn.example%3Ablerg') + 'namly:urn.example:blerg' ''' _unquoted_iri = iri while QUOTED_IRI_REGEX.match(_unquoted_iri): From ad3e968c918a4f454318b62ad0cf18f623147a7a Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 31 Mar 2025 17:48:19 -0400 Subject: [PATCH 06/43] wip --- trove/render/html_browse.py | 159 ++++++++++++------ trove/render/simple_csv.py | 29 ++-- trove/static/css/browse.css | 6 - trove/trovebrowse_gathering.py | 25 ++- trove/trovesearch/search_handle.py | 8 +- trove/trovesearch/search_params.py | 12 +- trove/util/queryparams.py | 13 +- .../{base_trove_params.py => trove_params.py} | 4 +- trove/views/_base.py | 10 +- trove/views/_gather_ask.py | 4 +- trove/views/browse.py | 17 +- trove/views/search.py | 12 +- 12 files changed, 186 insertions(+), 113 deletions(-) rename trove/util/{base_trove_params.py => trove_params.py} (98%) diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index bdd067a06..f250eef18 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -14,12 +14,13 @@ from django.contrib.staticfiles.storage import staticfiles_storage from django.http import QueryDict from django.urls import reverse -from primitive_metadata import primitive_rdf +from primitive_metadata import primitive_rdf as rdf from trove.util.iris import get_sufficiently_unique_iri from trove.util.randomness import shuffled from trove.vocab import mediatypes -from trove.vocab.namespaces import RDF +from trove.vocab.namespaces import RDF, RDFS, SKOS, DCTERMS, FOAF, DC +from trove.vocab.static_vocab import combined_thesaurus__suffuniq from trove.vocab.trove import trove_browse_link from ._base import BaseRenderer @@ -33,13 +34,26 @@ mediatypes.CSV, ) +_LINK_TEXT_PREDICATES = ( + SKOS.prefLabel, + RDFS.label, + SKOS.altLabel, + DCTERMS.title, + DC.title, + FOAF.name, +) + @dataclasses.dataclass class RdfHtmlBrowseRenderer(BaseRenderer): MEDIATYPE = 'text/html; charset=utf-8' def simple_render_document(self) -> str: - _html_builder = _HtmlBuilder(self.response_tripledict, self.response_focus.single_iri(), self.iri_shorthand) + _html_builder = _HtmlBuilder( + all_data=self.response_tripledict, + focus_iri=self.response_focus.single_iri(), + iri_shorthand=self.iri_shorthand, + ) _html_str = etree_tostring(_html_builder.html_element, encoding='unicode', method='html') return ''.join(( '', # TODO: can etree put the doctype in? @@ -49,11 +63,11 @@ def simple_render_document(self) -> str: @dataclasses.dataclass class _HtmlBuilder: - all_data: primitive_rdf.RdfTripleDictionary + all_data: rdf.RdfTripleDictionary focus_iri: str - iri_shorthand: primitive_rdf.IriShorthand + iri_shorthand: rdf.IriShorthand html_element: Element = dataclasses.field(init=False) - __current_data: primitive_rdf.RdfTripleDictionary = dataclasses.field(init=False) + __current_data: rdf.RdfTripleDictionary = dataclasses.field(init=False) __current_element: Element = dataclasses.field(init=False) __visiting_iris: set[str] = dataclasses.field(init=False) __heading_depth: int = 0 @@ -70,33 +84,35 @@ def __post_init__(self): }) _body_attrs = { 'class': 'BrowseWrapper', - 'style': f'--random-turn: {random.random()}turn;', + 'style': self._random_turn_style(), } with self.__nest('body', attrs=_body_attrs): self.__render_subj(self.focus_iri), self.__render_mediatype_links() + self.__render_amalgamation_switch() # TODO:
with unvisited triples in self.data (unreachable from focus_iri) def __render_mediatype_links(self): - with self.__nest('nav', attrs={'class': 'VisibleNest Browse__card'}): + with self.__nest_card(): self.__leaf('header', text='alternate mediatypes') with self.__nest('ul', attrs={'class': 'Browse__twopleset'}): for _mediatype in shuffled((*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES)): - with self.__nest('li', attrs={'class': 'VisibleNest Browse__twople'}): + with self.__nest('li', attrs={'class': 'Browse__twople'}): self.__mediatype_link(_mediatype) + def __render_amalgamation_switch(self): + ... # TODO + # with self.__nest_card(): + # _text = ('ON' if ... else 'OFF') + # self.__leaf('header', text=f'amalgamation {_text}') + # self.__leaf('a', text=..., attrs={ + # 'href': self._queryparam_href('withAmalgamation', ('' if ... else None)), + # }) + def __mediatype_link(self, mediatype: str): - (_scheme, _netloc, _path, _query, _fragment) = urlsplit(self.focus_iri) - _qparams = QueryDict(_query, mutable=True) - _qparams['acceptMediatype'] = mediatype - _href = urlunsplit(( - _scheme, - _netloc, - _path, - _qparams.urlencode(), - _fragment, - )) - self.__leaf('a', text=mediatype, attrs={'href': _href}) + self.__leaf('a', text=mediatype, attrs={ + 'href': self._queryparam_href('acceptMediatype', mediatype), + }) if mediatype in UNSTABLE_MEDIATYPES: self.__leaf('aside', text='(unstable)') if mediatype in STABLE_MEDIATYPES: @@ -110,19 +126,12 @@ def __render_subj(self, subj_iri: str, start_collapsed=False): _twopledict = self.__current_data.get(subj_iri, {}) with self.__visiting(subj_iri): with self.__h_tag() as _h_tag: - with self.__nest( - 'details', - attrs={ - 'class': 'Browse__card', - **({} if start_collapsed else {'open': ''}), - }, - visible=True, - ): + with self.__nest_card('details'): with self.__nest('summary'): - _label = self.__label_for_iri(subj_iri) with self.__nest(_h_tag, attrs={'class': 'Browse__heading'}): - with self.__nest_link(subj_iri): - self.__leaf('dfn', text=_label, attrs={'id': quote(subj_iri)}) + for _label in self.__link_texts_for_iri(subj_iri): + with self.__nest_link(subj_iri): + self.__leaf('dfn', text=_label, attrs={'id': quote(subj_iri)}) _compact_focus = self.iri_shorthand.compact_iri(subj_iri) if _compact_focus != _label: self.__leaf('code', text=_compact_focus) @@ -130,7 +139,7 @@ def __render_subj(self, subj_iri: str, start_collapsed=False): self.__leaf('code', text=subj_iri) self.__twoples(_twopledict) - def __twoples(self, twopledict: primitive_rdf.RdfTwopleDictionary): + def __twoples(self, twopledict: rdf.RdfTwopleDictionary): with self.__nest('ul', {'class': 'Browse__twopleset'}): for _pred, _obj_set in shuffled(twopledict.items()): with self.__nest('li', {'class': 'Browse__twople'}): @@ -140,7 +149,7 @@ def __twoples(self, twopledict: primitive_rdf.RdfTwopleDictionary): with self.__nest('li', {'class': 'Browse__object'}): self.__obj(_obj) - def __obj(self, obj: primitive_rdf.RdfObject): + def __obj(self, obj: rdf.RdfObject): if isinstance(obj, str): # iri # TODO: detect whether indexcard? if obj in self.__current_data: @@ -154,17 +163,17 @@ def __obj(self, obj: primitive_rdf.RdfObject): if (RDF.type, RDF.Seq) in obj: self.__sequence(obj) else: - self.__twoples(primitive_rdf.twopledict_from_twopleset(obj)) - elif isinstance(obj, primitive_rdf.Literal): + self.__twoples(rdf.twopledict_from_twopleset(obj)) + elif isinstance(obj, rdf.Literal): self.__literal(obj) elif isinstance(obj, (float, int, datetime.date)): - self.__literal(primitive_rdf.literal(obj)) - elif isinstance(obj, primitive_rdf.QuotedGraph): + self.__literal(rdf.literal(obj)) + elif isinstance(obj, rdf.QuotedGraph): self.__quoted_graph(obj) - def __literal(self, literal: primitive_rdf.Literal): + def __literal(self, literal: rdf.Literal): # TODO language tag, datatypes - _markdown_iri = primitive_rdf.iri_from_mediatype('text/markdown') + _markdown_iri = rdf.iri_from_mediatype('text/markdown') _is_markdown = any( _datatype.startswith(_markdown_iri) for _datatype in literal.datatype_iris @@ -176,12 +185,12 @@ def __literal(self, literal: primitive_rdf.Literal): _html = markdown2.markdown(literal.unicode_value, safe_mode='escape') self.__current_element.append(etree_fromstring(f'{_html}')) else: - self.__leaf('q', text=literal.unicode_value) + self.__leaf('q', text=literal) for _datatype_iri in literal.datatype_iris: self.__leaf_link(_datatype_iri) def __sequence(self, sequence_twoples: frozenset): - _obj_in_order = list(primitive_rdf.sequence_objects_in_order(sequence_twoples)) + _obj_in_order = list(rdf.sequence_objects_in_order(sequence_twoples)) with self.__nest('details', attrs={'open': ''}): self.__leaf('summary', text=str(len(_obj_in_order))) with self.__nest('ol'): # TODO: style? @@ -189,7 +198,7 @@ def __sequence(self, sequence_twoples: frozenset): with self.__nest('li'): # , visible=True): self.__obj(_seq_obj) - def __quoted_graph(self, quoted_graph: primitive_rdf.QuotedGraph): + def __quoted_graph(self, quoted_graph: rdf.QuotedGraph): with self.__quoted_data(quoted_graph.tripledict): self.__render_subj(quoted_graph.focus_iri, start_collapsed=True) @@ -230,14 +239,8 @@ def __quoted_data(self, quoted_data: dict): self.__visiting_iris = _outer_visiting_iris @contextlib.contextmanager - def __nest(self, tag_name, attrs=None, visible=False): + def __nest(self, tag_name, attrs=None): _attrs = {**attrs} if attrs else {} - if visible: - _attrs['class'] = ( - ' '.join((_attrs['class'], 'VisibleNest')) - if 'class' in _attrs - else 'VisibleNest' - ) _parent_element = self.__current_element self.__current_element = SubElement(_parent_element, tag_name, _attrs) try: @@ -247,7 +250,10 @@ def __nest(self, tag_name, attrs=None, visible=False): def __leaf(self, tag_name, *, text=None, attrs=None): _leaf_element = SubElement(self.__current_element, tag_name, attrs or {}) - if text is not None: + if isinstance(text, rdf.Literal): + # TODO: lang + _leaf_element.text = text.unicode_value + elif text is not None: _leaf_element.text = text def __nest_link(self, iri: str, *, attrs=None): @@ -257,14 +263,55 @@ def __nest_link(self, iri: str, *, attrs=None): }) def __leaf_link(self, iri: str, *, attrs=None): - with self.__nest_link(iri, attrs=attrs) as _link: - _link.text = self.iri_shorthand.compact_iri(iri) + for _text in self.__link_texts_for_iri(iri): + with self.__nest_link(iri, attrs=attrs) as _link: + # TODO: lang + _link.text = ( + _text.unicode_value + if isinstance(_text, rdf.Literal) + else _text + ) - def __label_for_iri(self, iri: str): - # TODO: get actual label in requested language + def __nest_card(self, tag: str = 'nav', start_collapsed=False): + return self.__nest( + tag, + attrs={ + 'class': 'Browse__card', + 'style': self._random_turn_style(), + **({} if start_collapsed else {'open': ''}), + }, + ) + + def __link_texts_for_iri(self, iri: str): + # TODO: consider requested language + _suffuniq = get_sufficiently_unique_iri(iri) + _thesaurus_entry = combined_thesaurus__suffuniq().get(_suffuniq) + if _thesaurus_entry: + for _pred in _LINK_TEXT_PREDICATES: + _objects = _thesaurus_entry.get(_pred) + if _objects: + return _objects _shorthand = self.iri_shorthand.compact_iri(iri) return ( - get_sufficiently_unique_iri(iri) + [_suffuniq] if _shorthand == iri - else _shorthand + else [_shorthand] ) + + def _random_turn_style(self): + return f'--random-turn: {random.random()}turn;' + + def _queryparam_href(self, param_name: str, param_value: str | None): + (_scheme, _netloc, _path, _query, _fragment) = urlsplit(self.focus_iri) + _qparams = QueryDict(_query, mutable=True) + if param_value is None: + del _qparams[param_name] + else: + _qparams[param_name] = param_value + return urlunsplit(( + _scheme, + _netloc, + _path, + _qparams.urlencode(), + _fragment, + )) diff --git a/trove/render/simple_csv.py b/trove/render/simple_csv.py index 0642ed179..dd644bd52 100644 --- a/trove/render/simple_csv.py +++ b/trove/render/simple_csv.py @@ -1,4 +1,8 @@ from __future__ import annotations +from collections.abc import ( + Iterable, + Iterator, +) import csv import functools import itertools @@ -6,19 +10,20 @@ import typing from trove.trovesearch.search_params import ( - Propertypath, - BaseTroveParams, CardsearchParams, ValuesearchParams, ) +from trove.util.propertypath import Propertypath from trove.vocab import mediatypes from trove.vocab import osfmap from trove.vocab.namespaces import TROVE from ._simple_trovesearch import SimpleTrovesearchRenderer from ._rendering import StreamableRendering +if typing.TYPE_CHECKING: + from trove.util.trove_params import BasicTroveParams -Jsonpath = typing.Iterable[str] # path of json keys +Jsonpath = Iterable[str] # path of json keys _MULTIVALUE_DELIMITER = ' ; ' # possible improvement: smarter in-value delimiting? _VALUE_KEY_PREFERENCE = ('@value', '@id', 'name', 'prefLabel', 'label') @@ -33,7 +38,7 @@ class TrovesearchSimpleCsvRenderer(SimpleTrovesearchRenderer): def unicard_rendering(self, card_iri: str, osfmap_json: dict): self.multicard_rendering(card_pages=iter([{card_iri: osfmap_json}])) - def multicard_rendering(self, card_pages: typing.Iterator[dict[str, dict]]): + def multicard_rendering(self, card_pages: Iterator[dict[str, dict]]): _doc = TabularDoc( card_pages, trove_params=getattr(self.response_focus, 'search_params', None), @@ -44,7 +49,7 @@ def multicard_rendering(self, card_pages: typing.Iterator[dict[str, dict]]): ) -def csv_stream(csv_dialect, header: list, rows: typing.Iterator[list]) -> typing.Iterator[str]: +def csv_stream(csv_dialect, header: list, rows: Iterator[list]) -> Iterator[str]: _writer = csv.writer(_Echo(), dialect=csv_dialect) yield _writer.writerow(header) for _row in rows: @@ -53,8 +58,8 @@ def csv_stream(csv_dialect, header: list, rows: typing.Iterator[list]) -> typing @dataclasses.dataclass class TabularDoc: - card_pages: typing.Iterator[dict[str, dict]] - trove_params: BaseTroveParams | None = None + card_pages: Iterator[dict[str, dict]] + trove_params: BasicTroveParams | None = None _started: bool = False @functools.cached_property @@ -69,8 +74,8 @@ def column_jsonpaths(self) -> tuple[Jsonpath, ...]: def first_page(self) -> dict[str, dict]: return next(self.card_pages, {}) - def _column_paths(self) -> typing.Iterator[Propertypath]: - _pathlists: list[typing.Iterable[Propertypath]] = [] + def _column_paths(self) -> Iterator[Propertypath]: + _pathlists: list[Iterable[Propertypath]] = [] if self.trove_params is not None: # hacks if isinstance(self.trove_params, ValuesearchParams): _expected_card_types = set(self.trove_params.valuesearch_type_iris()) @@ -99,7 +104,7 @@ def _iter_card_pages(self): def header(self) -> list[str]: return ['.'.join(_path) for _path in self.column_jsonpaths] - def rows(self) -> typing.Iterator[list[str]]: + def rows(self) -> Iterator[list[str]]: for _page in self._iter_card_pages(): for _card_iri, _osfmap_json in _page.items(): yield self._row_values(_osfmap_json) @@ -121,7 +126,7 @@ def _row_field_value(self, osfmap_json: dict, field_path: Jsonpath) -> str: return _MULTIVALUE_DELIMITER.join(map(str, _rendered_values)) -def _osfmap_jsonpath(iri_path: typing.Iterable[str]) -> Jsonpath: +def _osfmap_jsonpath(iri_path: Iterable[str]) -> Jsonpath: _shorthand = osfmap.osfmap_shorthand() return tuple( _shorthand.compact_iri(_pathstep) @@ -138,7 +143,7 @@ def _has_value(osfmap_json: dict, path: Jsonpath) -> bool: return True -def _iter_values(osfmap_json: dict, path: Jsonpath) -> typing.Iterator: +def _iter_values(osfmap_json: dict, path: Jsonpath) -> Iterator: assert path (_step, *_rest) = path _val = osfmap_json.get(_step) diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index 0182e1be3..28d009b83 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -1,9 +1,3 @@ -.VisibleNest { - padding: 0.191rem 0.382rem; - filter: hue-rotate(0.192turn); - backdrop-filter: hue-rotate(0.192turn); -} - .BrowseWrapper { display: flex; flex-direction: row; diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py index 5c687379b..864f1cd89 100644 --- a/trove/trovebrowse_gathering.py +++ b/trove/trovebrowse_gathering.py @@ -16,7 +16,9 @@ rdf.literal('browse a trove of IRI-linked metadata', language='en'), ), focustype_iris={ns.RDFS.Resource}, + param_iris={ns.TROVE.withAmalgamation}, thesaurus=TROVE_API_THESAURUS, + ) @@ -25,26 +27,35 @@ rdf.literal('trovebrowse organizer', language='en'), ), norms=TROVEBROWSE_NORMS, - gatherer_params={}, + gatherer_params={'with_amalgamation': ns.TROVE.withAmalgamation}, ) -@trovebrowse.gatherer(focustype_iris={ns.RDFS.Resource}) -def gather_thesaurus_entry(focus): +@trovebrowse.gatherer() +def gather_thesaurus_entry(focus, *, with_amalgamation: bool): _thesaurus = static_vocab.combined_thesaurus__suffuniq() for _iri in focus.iris: _suffuniq_iri = get_sufficiently_unique_iri(_iri) _thesaurus_entry = _thesaurus.get(_suffuniq_iri, None) if _thesaurus_entry: - yield from rdf.iter_twoples(_thesaurus_entry) + if with_amalgamation: + yield from rdf.iter_twoples(_thesaurus_entry) + else: + yield (ns.FOAF.primaryTopicOf, rdf.QuotedGraph({_iri: _thesaurus_entry}, focus_iri=_iri)) @trovebrowse.gatherer(ns.FOAF.primaryTopicOf) -def gather_cards_focused_on(focus): +def gather_cards_focused_on(focus, *, with_amalgamation: bool): _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs) - for _indexcard in _indexcard_qs: - yield (ns.FOAF.primaryTopicOf, _indexcard.get_iri()) + if with_amalgamation: + for _latest_rdf in trove_db.LatestIndexcardRdf.objects.filter(indexcard__in=_indexcard_qs): + yield from rdf.iter_tripleset(_latest_rdf.as_rdf_tripledict()) + else: + for _indexcard in _indexcard_qs: + _card_iri = _indexcard.get_iri() + yield (ns.FOAF.primaryTopicOf, _card_iri) + yield (_card_iri, ns.RDF.type, ns.TROVE.Indexcard) @trovebrowse.gatherer(ns.TROVE.usedAtPath) diff --git a/trove/trovesearch/search_handle.py b/trove/trovesearch/search_handle.py index 90f44265d..01dbffd84 100644 --- a/trove/trovesearch/search_handle.py +++ b/trove/trovesearch/search_handle.py @@ -9,9 +9,9 @@ ReproduciblyRandomSampleCursor, ) from trove.trovesearch.search_params import ( - BaseTroveParams, CardsearchParams, ) +from trove.util.trove_params import BasicTroveParams from trove.vocab.namespaces import TROVE from trove.vocab.trove import trove_indexcard_namespace @@ -19,8 +19,8 @@ @dataclasses.dataclass class BasicSearchHandle: cursor: PageCursor - search_params: BaseTroveParams - handler: typing.Callable[[BaseTroveParams], typing.Self] | None = None + search_params: BasicTroveParams + handler: typing.Callable[[BasicTroveParams], typing.Self] | None = None @property def total_result_count(self) -> primitive_rdf.Literal: @@ -134,7 +134,7 @@ def __post_init__(self): ### # types -TrovesearchHandler = typing.Callable[[BaseTroveParams], BasicSearchHandle] +TrovesearchHandler = typing.Callable[[BasicTroveParams], BasicSearchHandle] ### diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index dfaeba405..708418716 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -22,7 +22,7 @@ Propertypath, is_globpath, ) -from trove.util.base_trove_params import BaseTroveParams +from trove.util.trove_params import BasicTroveParams from trove.util.queryparams import ( QueryparamDict, QueryparamName, @@ -112,7 +112,7 @@ def to_shortname(self) -> str: @dataclasses.dataclass(frozen=True) -class BaseTrovesearchParams(BaseTroveParams): +class BasicTrovesearchParams(BasicTroveParams): static_focus_type: typing.ClassVar[str] # expected on subclasses @classmethod @@ -464,12 +464,12 @@ def as_queryparam(self) -> tuple[str, str]: @dataclasses.dataclass(frozen=True) -class IndexcardParams(BaseTroveParams): +class IndexcardParams(BasicTroveParams): static_focus_type = TROVE.Indexcard @dataclasses.dataclass(frozen=True) -class CardsearchParams(BaseTroveParams): +class CardsearchParams(BasicTroveParams): cardsearch_textsegment_set: frozenset[Textsegment] cardsearch_filter_set: frozenset[SearchFilter] index_strategy_name: str | None @@ -485,7 +485,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: **super().parse_queryparams(queryparams), 'cardsearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'cardSearchText'), 'cardsearch_filter_set': _filter_set, - 'index_strategy_name': get_single_value(queryparams, QueryparamName('indexStrategy')), + 'index_strategy_name': get_single_value(queryparams, 'indexStrategy'), 'sort_list': SortParam.from_sort_queryparams(queryparams), 'page_cursor': _get_page_cursor(queryparams), } @@ -550,7 +550,7 @@ class ValuesearchParams(CardsearchParams): # override CardsearchParams @classmethod def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: - _raw_propertypath = get_single_value(queryparams, QueryparamName('valueSearchPropertyPath')) + _raw_propertypath = get_single_value(queryparams, 'valueSearchPropertyPath') if not _raw_propertypath: raise trove_exceptions.MissingRequiredQueryParam('valueSearchPropertyPath') return { diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py index c4128d481..aeeaab11a 100644 --- a/trove/util/queryparams.py +++ b/trove/util/queryparams.py @@ -94,13 +94,18 @@ def join_queryparam_value(values: typing.Iterable[str]): def get_single_value( queryparams: QueryparamDict, - queryparam_name: QueryparamName, + queryparam_name: QueryparamName | str, ): - _family_params = queryparams.get(queryparam_name.family, ()) + if isinstance(queryparam_name, QueryparamName): + _family_name = queryparam_name.family + _expected_brackets = queryparam_name.bracketed_names + else: + _family_name = queryparam_name + _expected_brackets = () _paramvalues = [ _paramvalue - for _paramname, _paramvalue in _family_params - if _paramname.bracketed_names == queryparam_name.bracketed_names + for _paramname, _paramvalue in queryparams.get(_family_name, ()) + if _paramname.bracketed_names == _expected_brackets ] if not _paramvalues: return None diff --git a/trove/util/base_trove_params.py b/trove/util/trove_params.py similarity index 98% rename from trove/util/base_trove_params.py rename to trove/util/trove_params.py index eb6ab21ff..88448a18a 100644 --- a/trove/util/base_trove_params.py +++ b/trove/util/trove_params.py @@ -22,7 +22,7 @@ @dataclasses.dataclass(frozen=True) -class BaseTroveParams: +class BasicTroveParams: iri_shorthand: rdf.IriShorthand = dataclasses.field(repr=False) accept_mediatype: str | None included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) @@ -47,7 +47,7 @@ def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: 'iri_shorthand': _shorthand, 'included_relations': cls._gather_included_relations(queryparams, _shorthand), 'attrpaths_by_type': cls._gather_attrpaths(queryparams, _shorthand), - 'accept_mediatype': _qp.get_single_value(queryparams, _qp.QueryparamName('acceptMediatype')), + 'accept_mediatype': _qp.get_single_value(queryparams, 'acceptMediatype'), } @classmethod diff --git a/trove/views/_base.py b/trove/views/_base.py index ebe040271..fd138ad67 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -10,7 +10,7 @@ from trove import exceptions as trove_exceptions from trove.vocab.namespaces import RDFS, TROVE -from trove.util.base_trove_params import BaseTroveParams +from trove.util.trove_params import BasicTroveParams from trove.render import ( BaseRenderer, DEFAULT_RENDERER_TYPE, @@ -26,7 +26,7 @@ class BaseTroveView(View, abc.ABC): # ClassVars expected on inheritors: gathering_organizer: ClassVar[gather.GatheringOrganizer] - params_type: ClassVar[type[BaseTroveParams]] = BaseTroveParams + params_type: ClassVar[type[BasicTroveParams]] = BasicTroveParams focus_type_iris: ClassVar[Container[str]] = (RDFS.Resource,) def get(self, request): @@ -39,7 +39,7 @@ def get(self, request): ) try: _params = self._parse_params(request) - return self._make_response(request, _params, _renderer_type) + return self._respond(request, _params, _renderer_type) except trove_exceptions.TroveError as _error: return make_http_error_response( error=_error, @@ -67,7 +67,7 @@ def _get_focus_iri(self, request, params): return request.build_absolute_uri() def _build_focus(self, request, params): - return gather.Focus(self._get_focus_iri(request, params), self.focus_type_iri) + return gather.Focus.new(self._get_focus_iri(request, params), self.focus_type_iris) def _build_gathering(self, params, renderer_type: type[BaseRenderer]) -> gather.Gathering: return self.gathering_organizer.new_gathering( @@ -76,7 +76,7 @@ def _build_gathering(self, params, renderer_type: type[BaseRenderer]) -> gather. def _get_gatherer_kwargs(self, params, renderer_type): _kwargs = {} - _deriver_kw = _get_param_keyword(TROVE.deriverIRI, self.organizer) + _deriver_kw = _get_param_keyword(TROVE.deriverIRI, self.gathering_organizer) if _deriver_kw: _kwargs[_deriver_kw] = renderer_type.INDEXCARD_DERIVER_IRI return _kwargs diff --git a/trove/views/_gather_ask.py b/trove/views/_gather_ask.py index 63bae1098..c995a9907 100644 --- a/trove/views/_gather_ask.py +++ b/trove/views/_gather_ask.py @@ -1,11 +1,11 @@ from primitive_metadata import gather -from trove.trovesearch.search_params import BaseTroveParams +from trove.util.trove_params import BasicTroveParams def ask_gathering_from_params( gathering: gather.Gathering, - params: BaseTroveParams, + params: BasicTroveParams, start_focus: gather.Focus, ): # fill the gathering's cache with included related resources... diff --git a/trove/views/browse.py b/trove/views/browse.py index fa20c6e1d..bea82468f 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -4,28 +4,29 @@ from trove.util.iris import unquote_iri from trove.vocab import namespaces as ns from trove.trovebrowse_gathering import trovebrowse -from trove.util.base_trove_params import BaseTroveParams +from trove.util.trove_params import BasicTroveParams from trove.util.queryparams import ( QueryparamDict, - QueryparamName, get_single_value, ) from ._base import BaseTroveView @dataclasses.dataclass(frozen=True) -class BrowseParams(BaseTroveParams): +class BrowseParams(BasicTroveParams): iri: str + with_amalgamation: bool @classmethod def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: - _iri_value = get_single_value(queryparams, QueryparamName('iri')) + _iri_value = get_single_value(queryparams, 'iri') if not _iri_value: raise trove_exceptions.MissingRequiredQueryParam('iri') _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_value)) return { **super().parse_queryparams(queryparams), 'iri': _iri, + 'with_amalgamation': ('withAmalgamation' in queryparams), } @@ -33,5 +34,11 @@ class BrowseIriView(BaseTroveView): gathering_organizer = trovebrowse params_type = BrowseParams - def _get_focus_iri(self, request, params: BrowseParams): + def _get_focus_iri(self, request, params: BrowseParams): # override BaseTroveView return params.iri + + def _get_gatherer_kwargs(self, params, renderer_type): # override BaseTroveView + return { + **super()._get_gatherer_kwargs(params, renderer_type), + 'with_amalgamation': params.with_amalgamation, + } diff --git a/trove/views/search.py b/trove/views/search.py index 288738782..cadf9f9b8 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -1,11 +1,12 @@ import abc +from collections.abc import Callable import logging -from typing import Callable + +from primitive_metadata import gather from share.search import index_strategy from trove.trovesearch.search_handle import BasicSearchHandle from trove.trovesearch.search_params import ( - BaseTroveParams, CardsearchParams, ValuesearchParams, ) @@ -14,17 +15,20 @@ CardsearchFocus, ValuesearchFocus, ) +from trove.util.trove_params import BasicTroveParams from ._base import BaseTroveView logger = logging.getLogger(__name__) -_TrovesearchHandler = Callable[[BaseTroveParams], BasicSearchHandle] +_TrovesearchHandler = Callable[[BasicTroveParams], BasicSearchHandle] class _BaseTrovesearchView(BaseTroveView, abc.ABC): - organizer = trovesearch_by_indexstrategy + focus_type: type[gather.Focus] = gather.Focus # expected on subclasses + + gathering_organizer = trovesearch_by_indexstrategy # for BaseTroveView def _build_focus(self, url, params): # override BaseTroveView _strategy = index_strategy.get_strategy_for_trovesearch(params) From 69d731355d17106cf0be2340a29679193b922847 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 1 Apr 2025 14:16:12 -0400 Subject: [PATCH 07/43] wip --- .../trove/render/test_simple_json_renderer.py | 6 +- trove/render/html_browse.py | 158 +++++++++++------- trove/render/simple_json.py | 2 +- trove/static/css/browse.css | 67 ++++++-- trove/trovebrowse_gathering.py | 8 +- trove/trovesearch/trovesearch_gathering.py | 2 +- trove/views/_base.py | 4 +- trove/views/browse.py | 1 + 8 files changed, 157 insertions(+), 91 deletions(-) diff --git a/tests/trove/render/test_simple_json_renderer.py b/tests/trove/render/test_simple_json_renderer.py index d9481e183..7f59c8a59 100644 --- a/tests/trove/render/test_simple_json_renderer.py +++ b/tests/trove/render/test_simple_json_renderer.py @@ -28,7 +28,7 @@ class TestSimpleJsonRenderer(_base.TrovesearchJsonRendererTests): { "@id": BLARG.anItem, "title": "an item, yes", - "foaf:primaryTopicOf": [ + "foaf:isPrimaryTopicOf": [ { "@id": BLARG.aCard } @@ -37,7 +37,7 @@ class TestSimpleJsonRenderer(_base.TrovesearchJsonRendererTests): { "@id": BLARG.anItemm, "title": "an itemm, yes", - "foaf:primaryTopicOf": [ + "foaf:isPrimaryTopicOf": [ { "@id": BLARG.aCardd } @@ -46,7 +46,7 @@ class TestSimpleJsonRenderer(_base.TrovesearchJsonRendererTests): { "@id": BLARG.anItemmm, "title": "an itemmm, yes", - "foaf:primaryTopicOf": [ + "foaf:isPrimaryTopicOf": [ { "@id": BLARG.aCarddd } diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index f250eef18..587ee15ea 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -1,7 +1,7 @@ import contextlib import dataclasses import datetime -import markdown2 +import math import random from urllib.parse import quote, urlsplit, urlunsplit from xml.etree.ElementTree import ( @@ -14,6 +14,7 @@ from django.contrib.staticfiles.storage import staticfiles_storage from django.http import QueryDict from django.urls import reverse +import markdown2 from primitive_metadata import primitive_rdf as rdf from trove.util.iris import get_sufficiently_unique_iri @@ -42,6 +43,12 @@ DC.title, FOAF.name, ) +_IMPLICIT_DATATYPES = frozenset(( + RDF.string, + RDF.langString, +)) + +_PHI = (math.sqrt(5) + 1) / 2 @dataclasses.dataclass @@ -71,6 +78,7 @@ class _HtmlBuilder: __current_element: Element = dataclasses.field(init=False) __visiting_iris: set[str] = dataclasses.field(init=False) __heading_depth: int = 0 + __last_hue_turn: float = dataclasses.field(default_factory=random.random) def __post_init__(self): # TODO: lang (according to request -- also translate) @@ -84,7 +92,7 @@ def __post_init__(self): }) _body_attrs = { 'class': 'BrowseWrapper', - 'style': self._random_turn_style(), + 'style': self._hue_turn_css(), } with self.__nest('body', attrs=_body_attrs): self.__render_subj(self.focus_iri), @@ -122,48 +130,63 @@ def __mediatype_link(self, mediatype: str): _link.text = 'documented use' _link.tail = ')' - def __render_subj(self, subj_iri: str, start_collapsed=False): + def __render_subj(self, subj_iri: str, *, start_collapsed=True): _twopledict = self.__current_data.get(subj_iri, {}) with self.__visiting(subj_iri): - with self.__h_tag() as _h_tag: - with self.__nest_card('details'): - with self.__nest('summary'): - with self.__nest(_h_tag, attrs={'class': 'Browse__heading'}): - for _label in self.__link_texts_for_iri(subj_iri): - with self.__nest_link(subj_iri): - self.__leaf('dfn', text=_label, attrs={'id': quote(subj_iri)}) - _compact_focus = self.iri_shorthand.compact_iri(subj_iri) - if _compact_focus != _label: - self.__leaf('code', text=_compact_focus) - if _compact_focus != subj_iri: - self.__leaf('code', text=subj_iri) - self.__twoples(_twopledict) + with self.__nest_card('article'): + with self.__nest('header'): + _compact = self.iri_shorthand.compact_iri(subj_iri) + _suffuniq = get_sufficiently_unique_iri(subj_iri) + _h_text = (_compact if (_compact != subj_iri) else _suffuniq) + with self.__nest_h_tag(): + self.__leaf('dfn', text=_h_text, attrs={'id': quote(subj_iri)}) + if _compact not in (subj_iri, _h_text): + self.__leaf('code', text=_compact) + if _suffuniq != _h_text: + self.__leaf('code', text=_suffuniq) + for _label in self.__labels_for_iri(subj_iri): + self.__literal(_label) + if _twopledict: + with self.__nest_card('details') as _details: + if not start_collapsed: + _details['open'] = '' + self.__leaf('summary', text='details...') + self.__twoples(_twopledict) def __twoples(self, twopledict: rdf.RdfTwopleDictionary): - with self.__nest('ul', {'class': 'Browse__twopleset'}): + with self.__nest('dl', {'class': 'Browse__twopleset'}): for _pred, _obj_set in shuffled(twopledict.items()): - with self.__nest('li', {'class': 'Browse__twople'}): - self.__leaf_link(_pred) - with self.__nest('ul', {'class': 'Browse__objectset'}): - for _obj in shuffled(_obj_set): - with self.__nest('li', {'class': 'Browse__object'}): - self.__obj(_obj) + with self.__nest('dt'): + self.__compact_link(_pred) + for _text in self.__labels_for_iri(_pred): + self.__literal(_text) + with self.__nest('dd'): + for _obj in shuffled(_obj_set): + self.__obj(_obj) + # with self.__nest('ul', {'class': 'Browse__twopleset'}): + # for _pred, _obj_set in shuffled(twopledict.items()): + # with self.__nest('li', {'class': 'Browse__twople'}): + # self.__leaf_link(_pred) + # with self.__nest('ul', {'class': 'Browse__objectset'}): + # for _obj in shuffled(_obj_set): + # with self.__nest('li', {'class': 'Browse__object'}): + # self.__obj(_obj) def __obj(self, obj: rdf.RdfObject): if isinstance(obj, str): # iri # TODO: detect whether indexcard? if obj in self.__current_data: if obj in self.__visiting_iris: - self.__leaf_link(obj) # TODO: consider + self.__iri_link_and_labels(obj) # TODO: consider else: self.__render_subj(obj) else: - self.__leaf_link(obj) + self.__iri_link_and_labels(obj) elif isinstance(obj, frozenset): # blanknode if (RDF.type, RDF.Seq) in obj: self.__sequence(obj) else: - self.__twoples(rdf.twopledict_from_twopleset(obj)) + self.__blanknode(obj) elif isinstance(obj, rdf.Literal): self.__literal(obj) elif isinstance(obj, (float, int, datetime.date)): @@ -171,28 +194,28 @@ def __obj(self, obj: rdf.RdfObject): elif isinstance(obj, rdf.QuotedGraph): self.__quoted_graph(obj) - def __literal(self, literal: rdf.Literal): - # TODO language tag, datatypes + def __literal(self, literal: rdf.Literal | str): + _lit = (literal if isinstance(literal, rdf.Literal) else rdf.literal(literal)) _markdown_iri = rdf.iri_from_mediatype('text/markdown') _is_markdown = any( _datatype.startswith(_markdown_iri) - for _datatype in literal.datatype_iris + for _datatype in _lit.datatype_iris ) # TODO: checksum_iri, literal_iri with self.__nest('article', attrs={'class': 'Browse__literal'}): if _is_markdown: # TODO: tests for safe_mode - _html = markdown2.markdown(literal.unicode_value, safe_mode='escape') + _html = markdown2.markdown(_lit.unicode_value, safe_mode='escape') self.__current_element.append(etree_fromstring(f'{_html}')) else: - self.__leaf('q', text=literal) - for _datatype_iri in literal.datatype_iris: - self.__leaf_link(_datatype_iri) + self.__leaf('q', text=_lit) + for _datatype_iri in _lit.datatype_iris.difference(_IMPLICIT_DATATYPES): + self.__compact_link(_datatype_iri) def __sequence(self, sequence_twoples: frozenset): _obj_in_order = list(rdf.sequence_objects_in_order(sequence_twoples)) with self.__nest('details', attrs={'open': ''}): - self.__leaf('summary', text=str(len(_obj_in_order))) + self.__leaf('summary', text=f'sequence of {len(_obj_in_order)}') with self.__nest('ol'): # TODO: style? for _seq_obj in _obj_in_order: with self.__nest('li'): # , visible=True): @@ -202,6 +225,15 @@ def __quoted_graph(self, quoted_graph: rdf.QuotedGraph): with self.__quoted_data(quoted_graph.tripledict): self.__render_subj(quoted_graph.focus_iri, start_collapsed=True) + def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset): + _twopledict = ( + blanknode + if isinstance(blanknode, dict) + else rdf.twopledict_from_twopleset(blanknode) + ) + with self.__nest('article', attrs={'class': 'Browse__blanknode'}): + self.__twoples(_twopledict) + ### # private html-building helpers @@ -215,16 +247,18 @@ def __visiting(self, iri: str): self.__visiting_iris.remove(iri) @contextlib.contextmanager - def __h_tag(self): + def __nest_h_tag(self, **kwargs): _outer_heading_depth = self.__heading_depth if not _outer_heading_depth: self.__heading_depth = 1 elif _outer_heading_depth < 6: # h6 deepest self.__heading_depth += 1 - try: - yield f'h{self.__heading_depth}' - finally: - self.__heading_depth = _outer_heading_depth + _h_tag = f'h{self.__heading_depth}' + with self.__nest(_h_tag, **kwargs) as _nested: + try: + yield _nested + finally: + self.__heading_depth = _outer_heading_depth @contextlib.contextmanager def __quoted_data(self, quoted_data: dict): @@ -256,50 +290,44 @@ def __leaf(self, tag_name, *, text=None, attrs=None): elif text is not None: _leaf_element.text = text - def __nest_link(self, iri: str, *, attrs=None): + def __browse_link(self, iri: str, *, attrs=None): return self.__nest('a', attrs={ **(attrs or {}), 'href': trove_browse_link(iri), }) - def __leaf_link(self, iri: str, *, attrs=None): - for _text in self.__link_texts_for_iri(iri): - with self.__nest_link(iri, attrs=attrs) as _link: - # TODO: lang - _link.text = ( - _text.unicode_value - if isinstance(_text, rdf.Literal) - else _text - ) - - def __nest_card(self, tag: str = 'nav', start_collapsed=False): + def __iri_link_and_labels(self, iri: str): + self.__compact_link(iri) + for _text in self.__labels_for_iri(iri): + self.__literal(_text) + + def __compact_link(self, iri: str): + _compact = self.iri_shorthand.compact_iri(iri) + with self.__browse_link(iri) as _link: + _link.text = _compact + + def __nest_card(self, tag: str = 'nav'): return self.__nest( tag, attrs={ 'class': 'Browse__card', - 'style': self._random_turn_style(), - **({} if start_collapsed else {'open': ''}), + 'style': self._hue_turn_css(), }, ) - def __link_texts_for_iri(self, iri: str): + def __labels_for_iri(self, iri: str): # TODO: consider requested language _suffuniq = get_sufficiently_unique_iri(iri) _thesaurus_entry = combined_thesaurus__suffuniq().get(_suffuniq) if _thesaurus_entry: for _pred in _LINK_TEXT_PREDICATES: - _objects = _thesaurus_entry.get(_pred) - if _objects: - return _objects - _shorthand = self.iri_shorthand.compact_iri(iri) - return ( - [_suffuniq] - if _shorthand == iri - else [_shorthand] - ) + yield from shuffled(_thesaurus_entry.get(_pred, ())) - def _random_turn_style(self): - return f'--random-turn: {random.random()}turn;' + def _hue_turn_css(self): + # return f'--hue-turn: {random.random()}turn;' + _hue_turn = self.__last_hue_turn + (_PHI / 13) + self.__last_hue_turn = _hue_turn + return f'--hue-turn: {_hue_turn}turn;' def _queryparam_href(self, param_name: str, param_value: str | None): (_scheme, _netloc, _path, _query, _fragment) = urlsplit(self.focus_iri) diff --git a/trove/render/simple_json.py b/trove/render/simple_json.py index 10f896fff..480ef1c7f 100644 --- a/trove/render/simple_json.py +++ b/trove/render/simple_json.py @@ -55,7 +55,7 @@ def _stream_json(self, card_pages: typing.Iterator[dict[str, dict]]): ) def _render_card_content(self, card_iri: str, osfmap_json: dict): - self._add_twople(osfmap_json, 'foaf:primaryTopicOf', card_iri) + self._add_twople(osfmap_json, 'foaf:isPrimaryTopicOf', card_iri) return osfmap_json def _render_meta(self): diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index 28d009b83..b5fd216a2 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -7,8 +7,9 @@ margin: 0; padding: 0; min-height: 100vh; - background-color: #fedbae; - backdrop-filter: hue-rotate(var(--random-turn)); + /*background-color: #fedbae;*/ + background-color: hsl(var(--hue-turn), 100%, 81%); + /* backdrop-filter: hue-rotate(var(--hue-turn)); */ } .BrowseWrapper dfn { @@ -16,39 +17,44 @@ } .Browse__card { + --card-color: hsl(var(--hue-turn), 100%, 81%); display: flex; flex-direction: column; - + padding: 0.382rem 0.618rem; + border: solid 1px rgba(0,0,0,0.191); + background-color: var(--card-color); + box-shadow: -0.191rem 0.191rem hsl(from var(--card-color) h s l / 38%); + margin: 0 0 0.618rem 0.618rem; /*max-width: 31rem;*/ - /* - border: solid 0.382rem rgba(0,0,0,0.191); - */ + transition-property: height, width; + transition-duration: 1.618s; } details.Browse__card > summary::before { content: '‽'; display: inline-block; transition-property: rotate; - transition-duration: 1s; + transition-duration: 1.618s; + margin-right: 0.618rem; } details.Browse__card[open] > summary::before { - rotate: var(--random-turn); + rotate: var(--hue-turn); } .BrowseWrapper > .Browse__card { - margin: 1em; + margin: 1rem; } .Browse__card > header { display: flex; - align-items: center; - flex-wrap: wrap; + flex-direction: column; gap: 0.618rem; padding: 0.618rem; } -.Browse__heading { +.Browse__card > header > :first-child { + align-self: stretch; margin: 0; } @@ -56,14 +62,34 @@ details.Browse__card[open] > summary::before { padding: 0.618rem; } -.Browse__twopleset { +dl.Browse__twopleset { + display: grid; + grid-template-columns: + [twople-pred] auto + [twople-obj] auto + ; + grid-auto-flow: row; + gap: 0.382rem; + /* display: flex; flex-direction: column; + */ margin: 0; padding: 0; } +dl.Browse__twopleset > dt { + grid-column: twople-pred; + display: flex; + flex-direction: column; + gap: 0.191rem; +} + +dl.Browse__twopleset > dd { + grid-column: twople-obj; +} + .Browse__twople { display: flex; flex-direction: row; @@ -95,17 +121,28 @@ details.Browse__card[open] > summary::before { flex-direction: row; margin: 0; /*border: 1px dotted #000;*/ - border: dotted 1px; + /*border: dotted 1px;*/ gap: 0.382rem; } +.Browse__blanknode { + padding: 0.191rem; + border: dotted 0.191rem rgba(0,0,0,0.382); +} + .Browse__literal { display: flex; flex-direction: row; - flex-wrap: wrap; + /*flex-wrap: wrap;*/ gap: 0.382rem; + background-color: hsl(from var(--card-color) h s 91%); + padding: 0.191rem; + /*border: dotted 0.191rem rgba(0,0,0,0.382);*/ } +.Browse__literal > q { + flex-basis: 100%; +} /* .Browse :focus-within { backdrop-filter: hue-rotate(var(--hue-rotate-step)); diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py index 864f1cd89..93be98cbc 100644 --- a/trove/trovebrowse_gathering.py +++ b/trove/trovebrowse_gathering.py @@ -15,7 +15,7 @@ rdf.literal('trovebrowse', language='en'), rdf.literal('browse a trove of IRI-linked metadata', language='en'), ), - focustype_iris={ns.RDFS.Resource}, + focustype_iris={}, param_iris={ns.TROVE.withAmalgamation}, thesaurus=TROVE_API_THESAURUS, @@ -41,10 +41,10 @@ def gather_thesaurus_entry(focus, *, with_amalgamation: bool): if with_amalgamation: yield from rdf.iter_twoples(_thesaurus_entry) else: - yield (ns.FOAF.primaryTopicOf, rdf.QuotedGraph({_iri: _thesaurus_entry}, focus_iri=_iri)) + yield (ns.FOAF.isPrimaryTopicOf, rdf.QuotedGraph({_iri: _thesaurus_entry}, focus_iri=_iri)) -@trovebrowse.gatherer(ns.FOAF.primaryTopicOf) +@trovebrowse.gatherer(ns.FOAF.isPrimaryTopicOf) def gather_cards_focused_on(focus, *, with_amalgamation: bool): _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs) @@ -54,7 +54,7 @@ def gather_cards_focused_on(focus, *, with_amalgamation: bool): else: for _indexcard in _indexcard_qs: _card_iri = _indexcard.get_iri() - yield (ns.FOAF.primaryTopicOf, _card_iri) + yield (ns.FOAF.isPrimaryTopicOf, _card_iri) yield (_card_iri, ns.RDF.type, ns.TROVE.Indexcard) diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 9d97765da..8f8629927 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -363,7 +363,7 @@ def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> d _card_iri = _card.get_iri() _quoted_graph = _indexcard_rdf.as_quoted_graph() _quoted_graph.add( - (_quoted_graph.focus_iri, FOAF.primaryTopicOf, _card_iri), + (_quoted_graph.focus_iri, FOAF.isPrimaryTopicOf, _card_iri), ) _card_foci[_card_iri] = IndexcardFocus.new( iris=_card_iri, diff --git a/trove/views/_base.py b/trove/views/_base.py index fd138ad67..7e2a0f743 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -9,7 +9,7 @@ from primitive_metadata import gather from trove import exceptions as trove_exceptions -from trove.vocab.namespaces import RDFS, TROVE +from trove.vocab.namespaces import TROVE from trove.util.trove_params import BasicTroveParams from trove.render import ( BaseRenderer, @@ -27,7 +27,7 @@ class BaseTroveView(View, abc.ABC): # ClassVars expected on inheritors: gathering_organizer: ClassVar[gather.GatheringOrganizer] params_type: ClassVar[type[BasicTroveParams]] = BasicTroveParams - focus_type_iris: ClassVar[Container[str]] = (RDFS.Resource,) + focus_type_iris: ClassVar[Container[str]] = () def get(self, request): try: diff --git a/trove/views/browse.py b/trove/views/browse.py index bea82468f..6061bcf78 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -23,6 +23,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: if not _iri_value: raise trove_exceptions.MissingRequiredQueryParam('iri') _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_value)) + _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_value)) return { **super().parse_queryparams(queryparams), 'iri': _iri, From 3669aa40555097054800991ad51d3f354647c292 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 2 Apr 2025 14:48:07 -0400 Subject: [PATCH 08/43] wip --- project/settings.py | 12 +-- trove/render/_base.py | 8 +- trove/render/html_browse.py | 175 +++++++++++++++++++-------------- trove/render/jsonapi.py | 7 +- trove/static/css/browse.css | 139 +++++++++++++++----------- trove/trovebrowse_gathering.py | 12 +-- trove/util/iris.py | 15 ++- trove/util/queryparams.py | 29 +++++- trove/util/trove_params.py | 4 +- trove/views/browse.py | 23 +++-- trove/views/vocab.py | 2 +- trove/vocab/trove.py | 75 +++++++------- 12 files changed, 288 insertions(+), 213 deletions(-) diff --git a/project/settings.py b/project/settings.py index adb6ec1a6..e6db19bd5 100644 --- a/project/settings.py +++ b/project/settings.py @@ -16,15 +16,7 @@ import jwe from share import __version__ - - -def strtobool(s: str) -> bool: - s = s.lower() - if s in ('t', 'true', '1'): - return True - if s in ('f', 'false', '0'): - return False - raise ValueError(f'unboolable string: "{s}"') +from trove.util.queryparams import parse_booly_str def split(string, delim): @@ -463,7 +455,7 @@ def route_urgent_task(name, args, kwargs, options, task=None, **kw): SUBJECTS_CENTRAL_TAXONOMY = os.environ.get('SUBJECTS_CENTRAL_TAXONOMY', 'bepress') -HIDE_DEPRECATED_VIEWS = strtobool(os.environ.get('HIDE_DEPRECATED_VIEWS', 'False')) +HIDE_DEPRECATED_VIEWS = parse_booly_str(os.environ.get('HIDE_DEPRECATED_VIEWS', 'False')) # Regulator pipeline, names of setuptools entry points SHARE_REGULATOR_CONFIG = { diff --git a/trove/render/_base.py b/trove/render/_base.py index 996ff6744..4f9d16c87 100644 --- a/trove/render/_base.py +++ b/trove/render/_base.py @@ -11,8 +11,10 @@ from trove import exceptions as trove_exceptions from trove.vocab import mediatypes -from trove.vocab.namespaces import NAMESPACES_SHORTHAND -from trove.vocab.trove import TROVE_API_THESAURUS +from trove.vocab.trove import ( + TROVE_API_THESAURUS, + trove_shorthand, +) from ._rendering import ProtoRendering, SimpleRendering @@ -31,7 +33,7 @@ class BaseRenderer(abc.ABC): # instance fields response_focus: gather.Focus response_gathering: gather.Gathering - iri_shorthand: rdf.IriShorthand = NAMESPACES_SHORTHAND + iri_shorthand: rdf.IriShorthand = dataclasses.field(default_factory=trove_shorthand) thesaurus_tripledict: rdf.RdfTripleDictionary = dataclasses.field(default_factory=lambda: TROVE_API_THESAURUS) @functools.cached_property diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index 587ee15ea..b59a7100b 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -11,9 +11,11 @@ fromstring as etree_fromstring, ) +from django.conf import settings from django.contrib.staticfiles.storage import staticfiles_storage from django.http import QueryDict from django.urls import reverse +from django.utils.translation import gettext as _ import markdown2 from primitive_metadata import primitive_rdf as rdf @@ -60,6 +62,7 @@ def simple_render_document(self) -> str: all_data=self.response_tripledict, focus_iri=self.response_focus.single_iri(), iri_shorthand=self.iri_shorthand, + is_data_blended=self.response_gathering.gatherer_kwargs.get('blend_cards'), ) _html_str = etree_tostring(_html_builder.html_element, encoding='unicode', method='html') return ''.join(( @@ -73,6 +76,7 @@ class _HtmlBuilder: all_data: rdf.RdfTripleDictionary focus_iri: str iri_shorthand: rdf.IriShorthand + is_data_blended: bool | None = None html_element: Element = dataclasses.field(init=False) __current_data: rdf.RdfTripleDictionary = dataclasses.field(init=False) __current_element: Element = dataclasses.field(init=False) @@ -96,126 +100,126 @@ def __post_init__(self): } with self.__nest('body', attrs=_body_attrs): self.__render_subj(self.focus_iri), - self.__render_mediatype_links() - self.__render_amalgamation_switch() + self.__alternate_mediatypes_card() + if self.is_data_blended is not None: + self.__blender_toggle_card() # TODO:
with unvisited triples in self.data (unreachable from focus_iri) - def __render_mediatype_links(self): - with self.__nest_card(): - self.__leaf('header', text='alternate mediatypes') - with self.__nest('ul', attrs={'class': 'Browse__twopleset'}): - for _mediatype in shuffled((*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES)): - with self.__nest('li', attrs={'class': 'Browse__twople'}): - self.__mediatype_link(_mediatype) - - def __render_amalgamation_switch(self): - ... # TODO - # with self.__nest_card(): - # _text = ('ON' if ... else 'OFF') - # self.__leaf('header', text=f'amalgamation {_text}') - # self.__leaf('a', text=..., attrs={ - # 'href': self._queryparam_href('withAmalgamation', ('' if ... else None)), - # }) + def __alternate_mediatypes_card(self): + with self.__nest_card('nav'): + self.__leaf('header', text=_('alternate mediatypes')) + for _mediatype in shuffled((*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES)): + with self.__nest('span', attrs={'class': 'Browse__literal'}): + self.__mediatype_link(_mediatype) + + def __blender_toggle_card(self): + with self.__nest_card('nav'): + if self.is_data_blended: + _header_text = _('card-blending ON') + _link_text = _('disable card-blending') + _link_blend = '0' # blendCards=0 + else: + _header_text = _('card-blending OFF') + _link_text = _('enable card-blending') + _link_blend = None # remove blendCards param (defaults true) + self.__leaf('header', text=_header_text) + self.__leaf('a', text=_link_text, attrs={ + 'href': self._queryparam_href('blendCards', _link_blend), + }) def __mediatype_link(self, mediatype: str): self.__leaf('a', text=mediatype, attrs={ 'href': self._queryparam_href('acceptMediatype', mediatype), }) if mediatype in UNSTABLE_MEDIATYPES: - self.__leaf('aside', text='(unstable)') + self.__leaf('aside', text=_('(unstable)')) if mediatype in STABLE_MEDIATYPES: - with self.__nest('aside') as _aside: - _aside.text = '(stable for ' + with self.__nest('aside'): with self.__nest('a', attrs={'href': reverse('trove:docs')}) as _link: - _link.text = 'documented use' - _link.tail = ')' + _link.text = _('(stable for documented use)') - def __render_subj(self, subj_iri: str, *, start_collapsed=True): + def __render_subj(self, subj_iri: str, *, start_collapsed=False): _twopledict = self.__current_data.get(subj_iri, {}) with self.__visiting(subj_iri): with self.__nest_card('article'): with self.__nest('header'): - _compact = self.iri_shorthand.compact_iri(subj_iri) - _suffuniq = get_sufficiently_unique_iri(subj_iri) - _h_text = (_compact if (_compact != subj_iri) else _suffuniq) + _h_text, _also_texts = self.__iri_display_texts(subj_iri) with self.__nest_h_tag(): self.__leaf('dfn', text=_h_text, attrs={'id': quote(subj_iri)}) - if _compact not in (subj_iri, _h_text): - self.__leaf('code', text=_compact) - if _suffuniq != _h_text: - self.__leaf('code', text=_suffuniq) - for _label in self.__labels_for_iri(subj_iri): + for _also_text in _also_texts: + self.__leaf('code', text=_also_text) + for _label in self.__iri_thesaurus_labels(subj_iri): self.__literal(_label) if _twopledict: - with self.__nest_card('details') as _details: + with self.__nest('details') as _details: if not start_collapsed: - _details['open'] = '' - self.__leaf('summary', text='details...') + _details.set('open', '') + self.__leaf('summary', text=_('details...')) self.__twoples(_twopledict) def __twoples(self, twopledict: rdf.RdfTwopleDictionary): with self.__nest('dl', {'class': 'Browse__twopleset'}): for _pred, _obj_set in shuffled(twopledict.items()): with self.__nest('dt'): - self.__compact_link(_pred) - for _text in self.__labels_for_iri(_pred): + _pred_link = self.__compact_link(_pred) + _append_class(_pred_link, 'Browse__predicate') + for _text in self.__iri_thesaurus_labels(_pred): self.__literal(_text) with self.__nest('dd'): for _obj in shuffled(_obj_set): self.__obj(_obj) - # with self.__nest('ul', {'class': 'Browse__twopleset'}): - # for _pred, _obj_set in shuffled(twopledict.items()): - # with self.__nest('li', {'class': 'Browse__twople'}): - # self.__leaf_link(_pred) - # with self.__nest('ul', {'class': 'Browse__objectset'}): - # for _obj in shuffled(_obj_set): - # with self.__nest('li', {'class': 'Browse__object'}): - # self.__obj(_obj) def __obj(self, obj: rdf.RdfObject): if isinstance(obj, str): # iri # TODO: detect whether indexcard? - if obj in self.__current_data: - if obj in self.__visiting_iris: - self.__iri_link_and_labels(obj) # TODO: consider - else: - self.__render_subj(obj) + if (obj in self.__current_data) and (obj not in self.__visiting_iris): + self.__render_subj(obj) else: - self.__iri_link_and_labels(obj) + with self.__nest('article', attrs={'class': 'Browse__object'}): + self.__iri_link_and_labels(obj) elif isinstance(obj, frozenset): # blanknode if (RDF.type, RDF.Seq) in obj: self.__sequence(obj) else: self.__blanknode(obj) elif isinstance(obj, rdf.Literal): - self.__literal(obj) + self.__literal(obj, is_rdf_object=True) elif isinstance(obj, (float, int, datetime.date)): - self.__literal(rdf.literal(obj)) + self.__literal(rdf.literal(obj), is_rdf_object=True) elif isinstance(obj, rdf.QuotedGraph): self.__quoted_graph(obj) - def __literal(self, literal: rdf.Literal | str): + def __literal( + self, + literal: rdf.Literal | str, + *, + is_rdf_object: bool = False, + ): _lit = (literal if isinstance(literal, rdf.Literal) else rdf.literal(literal)) _markdown_iri = rdf.iri_from_mediatype('text/markdown') _is_markdown = any( _datatype.startswith(_markdown_iri) for _datatype in _lit.datatype_iris ) + _element_classes = ['Browse__literal'] + if is_rdf_object: + _element_classes.append('Browse__object') # TODO: checksum_iri, literal_iri - with self.__nest('article', attrs={'class': 'Browse__literal'}): + with self.__nest('article', attrs={'class': ' '.join(_element_classes)}): + for _datatype_iri in _lit.datatype_iris.difference(_IMPLICIT_DATATYPES): + self.__compact_link(_datatype_iri) if _is_markdown: # TODO: tests for safe_mode _html = markdown2.markdown(_lit.unicode_value, safe_mode='escape') self.__current_element.append(etree_fromstring(f'{_html}')) else: self.__leaf('q', text=_lit) - for _datatype_iri in _lit.datatype_iris.difference(_IMPLICIT_DATATYPES): - self.__compact_link(_datatype_iri) def __sequence(self, sequence_twoples: frozenset): _obj_in_order = list(rdf.sequence_objects_in_order(sequence_twoples)) with self.__nest('details', attrs={'open': ''}): - self.__leaf('summary', text=f'sequence of {len(_obj_in_order)}') + _text = _('sequence of %(count)') % {'count': len(_obj_in_order)} + self.__leaf('summary', text=_text) with self.__nest('ol'): # TODO: style? for _seq_obj in _obj_in_order: with self.__nest('li'): # , visible=True): @@ -223,7 +227,7 @@ def __sequence(self, sequence_twoples: frozenset): def __quoted_graph(self, quoted_graph: rdf.QuotedGraph): with self.__quoted_data(quoted_graph.tripledict): - self.__render_subj(quoted_graph.focus_iri, start_collapsed=True) + self.__render_subj(quoted_graph.focus_iri) # , start_collapsed=True) def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset): _twopledict = ( @@ -231,7 +235,10 @@ def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset): if isinstance(blanknode, dict) else rdf.twopledict_from_twopleset(blanknode) ) - with self.__nest('article', attrs={'class': 'Browse__blanknode'}): + with self.__nest('article', attrs={ + 'class': 'Browse__blanknode Browse__object', + 'style': self._hue_turn_css(), + }): self.__twoples(_twopledict) ### @@ -290,23 +297,17 @@ def __leaf(self, tag_name, *, text=None, attrs=None): elif text is not None: _leaf_element.text = text - def __browse_link(self, iri: str, *, attrs=None): - return self.__nest('a', attrs={ - **(attrs or {}), - 'href': trove_browse_link(iri), - }) - def __iri_link_and_labels(self, iri: str): self.__compact_link(iri) - for _text in self.__labels_for_iri(iri): + for _text in self.__iri_thesaurus_labels(iri): self.__literal(_text) def __compact_link(self, iri: str): - _compact = self.iri_shorthand.compact_iri(iri) - with self.__browse_link(iri) as _link: - _link.text = _compact + with self.__nest('a', attrs={'href': trove_browse_link(iri)}) as _a: + _a.text = self.iri_shorthand.compact_iri(iri) + return _a - def __nest_card(self, tag: str = 'nav'): + def __nest_card(self, tag: str): return self.__nest( tag, attrs={ @@ -315,7 +316,7 @@ def __nest_card(self, tag: str = 'nav'): }, ) - def __labels_for_iri(self, iri: str): + def __iri_thesaurus_labels(self, iri: str): # TODO: consider requested language _suffuniq = get_sufficiently_unique_iri(iri) _thesaurus_entry = combined_thesaurus__suffuniq().get(_suffuniq) @@ -325,15 +326,21 @@ def __labels_for_iri(self, iri: str): def _hue_turn_css(self): # return f'--hue-turn: {random.random()}turn;' - _hue_turn = self.__last_hue_turn + (_PHI / 13) + _hue_turn = (self.__last_hue_turn + _PHI) % 1.0 self.__last_hue_turn = _hue_turn return f'--hue-turn: {_hue_turn}turn;' def _queryparam_href(self, param_name: str, param_value: str | None): - (_scheme, _netloc, _path, _query, _fragment) = urlsplit(self.focus_iri) + _base_url = self.focus_iri + if not _base_url.startswith(settings.SHARE_WEB_URL): + _base_url = trove_browse_link(_base_url) + (_scheme, _netloc, _path, _query, _fragment) = urlsplit(_base_url) _qparams = QueryDict(_query, mutable=True) if param_value is None: - del _qparams[param_name] + try: + del _qparams[param_name] + except KeyError: + pass else: _qparams[param_name] = param_value return urlunsplit(( @@ -343,3 +350,21 @@ def _queryparam_href(self, param_name: str, param_value: str | None): _qparams.urlencode(), _fragment, )) + + def __iri_display_texts(self, iri: str) -> tuple[str, set[str]]: + _compact = self.iri_shorthand.compact_iri(iri) + _suffuniq = get_sufficiently_unique_iri(iri) + _main_display = ( + _compact + if (_compact != iri) + else _suffuniq + ) + _also_display = {iri, _compact} - {_main_display} + return (_main_display, _also_display) + + +def _append_class(el: Element, element_class: str): + el.set( + 'class', + ' '.join(filter(None, (element_class, el.get('class')))), + ) diff --git a/trove/render/jsonapi.py b/trove/render/jsonapi.py index 8e9fc2bcb..ef9da875b 100644 --- a/trove/render/jsonapi.py +++ b/trove/render/jsonapi.py @@ -22,12 +22,13 @@ OSFMAP, OWL, RDF, + RDFS, TROVE, XSD, - NAMESPACES_SHORTHAND, ) from trove.vocab.trove import ( trove_indexcard_namespace, + trove_shorthand, ) from ._base import BaseRenderer @@ -150,7 +151,7 @@ def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode): def _single_typename(self, type_iris: list[str]): if not type_iris: - raise trove_exceptions.MissingRdfType + return self._membername_for_iri(RDFS.Resource) if len(type_iris) == 1: return self._membername_for_iri(type_iris[0]) # choose one predictably, preferring osfmap and trove @@ -302,7 +303,7 @@ def _render_attribute_datum(self, rdfobject: primitive_rdf.RdfObject) -> dict | try: # maybe it's a jsonapi resource return self.render_identifier_object(rdfobject) except Exception: - return NAMESPACES_SHORTHAND.compact_iri(rdfobject) + return trove_shorthand().compact_iri(rdfobject) elif isinstance(rdfobject, (float, int)): return rdfobject elif isinstance(rdfobject, datetime.date): diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index b5fd216a2..f26aeec5e 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -1,14 +1,22 @@ .BrowseWrapper { + --hue-turn: 0.618turn; + --saturation: 71%; + --luminosity: 83%; + --gap-1: 1.618rem; + --gap-2: 0.618rem; + --gap-3: 0.382rem; + --gap-4: 0.191rem; + --gap-5: 0.095rem; display: flex; flex-direction: row; align-items: flex-start; flex-wrap: wrap; - gap: 0.618rem; + gap: var(--gap-1); margin: 0; - padding: 0; + padding: 1rem; min-height: 100vh; /*background-color: #fedbae;*/ - background-color: hsl(var(--hue-turn), 100%, 81%); + background-color: hsl(var(--hue-turn), var(--saturation), var(--luminosity)); /* backdrop-filter: hue-rotate(var(--hue-turn)); */ } @@ -17,40 +25,48 @@ } .Browse__card { - --card-color: hsl(var(--hue-turn), 100%, 81%); display: flex; flex-direction: column; - padding: 0.382rem 0.618rem; - border: solid 1px rgba(0,0,0,0.191); - background-color: var(--card-color); - box-shadow: -0.191rem 0.191rem hsl(from var(--card-color) h s l / 38%); - margin: 0 0 0.618rem 0.618rem; - /*max-width: 31rem;*/ - transition-property: height, width; - transition-duration: 1.618s; + padding: var(--gap-3) var(--gap-2); + background-color: hsl(var(--hue-turn), var(--saturation), var(--luminosity)); + --border-luminosity: calc(var(--luminosity) * 0.618); + border-color: hsl(var(--hue-turn), var(--saturation), var(--border-luminosity)); + border-style: outset; + /* + border-block-start-width: 0; + border-block-end-width: var(--gap-3); + border-inline-start-width: var(--gap-3); + border-inline-end-width: 0; + border-start-start-radius: 1rem; + border-end-end-radius: 1rem; + */ + border-block-start-width: 1px; + border-inline-start-width: var(--gap-3); + border-block-end-width: var(--gap-4); + border-inline-end-width: 1px; + border-start-start-radius: 1rem; + border-end-end-radius: 1rem; } -details.Browse__card > summary::before { +.BrowseWrapper details > summary::before { content: '‽'; display: inline-block; transition-property: rotate; - transition-duration: 1.618s; - margin-right: 0.618rem; + transition-duration: 0.618s; + margin-right: var(--gap-2); } -details.Browse__card[open] > summary::before { +.BrowseWrapper details[open] > summary::before { rotate: var(--hue-turn); } -.BrowseWrapper > .Browse__card { - margin: 1rem; -} - .Browse__card > header { display: flex; flex-direction: column; - gap: 0.618rem; - padding: 0.618rem; + border-bottom: solid 1px rgba(0,0,0,0.382); + margin-bottom: var(--gap-3); + /*font-style: italic;*/ + /* gap: var(--gap-2); */ } .Browse__card > header > :first-child { @@ -59,17 +75,17 @@ details.Browse__card[open] > summary::before { } .Browse__card > footer { - padding: 0.618rem; + padding: var(--gap-2); } dl.Browse__twopleset { display: grid; grid-template-columns: [twople-pred] auto - [twople-obj] auto + [twople-obj] 1fr ; grid-auto-flow: row; - gap: 0.382rem; + row-gap: var(--gap-2); /* display: flex; flex-direction: column; @@ -83,66 +99,71 @@ dl.Browse__twopleset > dt { grid-column: twople-pred; display: flex; flex-direction: column; - gap: 0.191rem; + /*gap: var(--gap-4);*/ } dl.Browse__twopleset > dd { grid-column: twople-obj; + margin: 0; + display: flex; + flex-direction: column; + gap: var(--gap-5); } .Browse__twople { display: flex; flex-direction: row; align-items: flex-start; - gap: 0.382rem; + gap: var(--gap-3); margin: 0; - /* - border: solid 1px rgba(0,0,0,0.382); } -.Browse__twople:not(:first-child) { - border-top: 0; - */ +.Browse__blanknode { + /*margin-top: var(--gap-4);*/ + padding: var(--gap-4); + /*border: dotted var(--gap-4) rgba(0,0,0,0.382);*/ + border: outset var(--gap-4) rgba(0,0,0,0.382); } -.Browse__objectset { +.Browse__literal { display: flex; flex-direction: row; - flex-wrap: wrap; - align-items: flex-start; - - margin: 0; - padding: 0; - gap: 0.382rem; + /*flex-wrap: wrap;*/ + gap: var(--gap-3); + padding: var(--gap-4); + /*border-block-start: solid 1px rgba(0,0,0,0.382);*/ + /*border: dotted var(--gap-4) rgba(0,0,0,0.382);*/ } -.Browse__object { - display: flex; - flex-direction: row; +.Browse__literal > q { + flex-basis: 100%; + font-style: italic; +} +.Browse__literal > q > p { margin: 0; - /*border: 1px dotted #000;*/ - /*border: dotted 1px;*/ - gap: 0.382rem; } -.Browse__blanknode { - padding: 0.191rem; - border: dotted 0.191rem rgba(0,0,0,0.382); +.Browse__predicate { + --luminosity-boost: 0.31; + --boosted-luminosity: calc(var(--luminosity) + ((100% - var(--luminosity)) * var(--luminosity-boost))); + background-color: hsl(var(--hue-turn), var(--saturation), var(--boosted-luminosity)); + padding: var(--gap-4); } -.Browse__literal { - display: flex; - flex-direction: row; - /*flex-wrap: wrap;*/ - gap: 0.382rem; - background-color: hsl(from var(--card-color) h s 91%); - padding: 0.191rem; - /*border: dotted 0.191rem rgba(0,0,0,0.382);*/ +.Browse__object { + --luminosity-boost: 0.51; + --boosted-luminosity: calc(var(--luminosity) + ((100% - var(--luminosity)) * var(--luminosity-boost))); + background-color: hsl(var(--hue-turn), var(--saturation), var(--boosted-luminosity)); + padding: var(--gap-4); + /*margin-top: var(--gap-4);*/ + /*border-block-start: solid 1px rgba(0,0,0,0.382); +} +.Browse__object:first-of-type { + border-block-start: none; + */ } -.Browse__literal > q { - flex-basis: 100%; -} + /* .Browse :focus-within { backdrop-filter: hue-rotate(var(--hue-rotate-step)); diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py index 93be98cbc..ec8593008 100644 --- a/trove/trovebrowse_gathering.py +++ b/trove/trovebrowse_gathering.py @@ -16,7 +16,7 @@ rdf.literal('browse a trove of IRI-linked metadata', language='en'), ), focustype_iris={}, - param_iris={ns.TROVE.withAmalgamation}, + param_iris={ns.TROVE.blendCards}, thesaurus=TROVE_API_THESAURUS, ) @@ -27,28 +27,28 @@ rdf.literal('trovebrowse organizer', language='en'), ), norms=TROVEBROWSE_NORMS, - gatherer_params={'with_amalgamation': ns.TROVE.withAmalgamation}, + gatherer_params={'blend_cards': ns.TROVE.blendCards}, ) @trovebrowse.gatherer() -def gather_thesaurus_entry(focus, *, with_amalgamation: bool): +def gather_thesaurus_entry(focus, *, blend_cards: bool): _thesaurus = static_vocab.combined_thesaurus__suffuniq() for _iri in focus.iris: _suffuniq_iri = get_sufficiently_unique_iri(_iri) _thesaurus_entry = _thesaurus.get(_suffuniq_iri, None) if _thesaurus_entry: - if with_amalgamation: + if blend_cards: yield from rdf.iter_twoples(_thesaurus_entry) else: yield (ns.FOAF.isPrimaryTopicOf, rdf.QuotedGraph({_iri: _thesaurus_entry}, focus_iri=_iri)) @trovebrowse.gatherer(ns.FOAF.isPrimaryTopicOf) -def gather_cards_focused_on(focus, *, with_amalgamation: bool): +def gather_cards_focused_on(focus, *, blend_cards: bool): _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs) - if with_amalgamation: + if blend_cards: for _latest_rdf in trove_db.LatestIndexcardRdf.objects.filter(indexcard__in=_indexcard_qs): yield from rdf.iter_tripleset(_latest_rdf.as_rdf_tripledict()) else: diff --git a/trove/util/iris.py b/trove/util/iris.py index 5fbe9c234..15dc64f94 100644 --- a/trove/util/iris.py +++ b/trove/util/iris.py @@ -155,10 +155,17 @@ def unquote_iri(iri: str) -> str: 'namly:urn.example:blerg' >>> unquote_iri('namly%3Aurn.example%3Ablerg') 'namly:urn.example:blerg' + >>> unquote_iri('werbleWord') + 'werbleWord' + >>> quote(quote('flipl://iri.example/blarg/?' + urlencode({'iri': '://blarg///' + quote('://bl@rg?')))) + >>> unquote_iri(_) + >>> quote('namly:urn.example:' + quote('flipl://iri.example/blarg/?')) + >>> unquote_iri(_) ''' _unquoted_iri = iri - while QUOTED_IRI_REGEX.match(_unquoted_iri): - _unquoted_iri = unquote(_unquoted_iri) - if not UNQUOTED_IRI_REGEX.match(_unquoted_iri): - raise trove_exceptions.InvalidQuotedIri(f'does not look like a quoted iri: {iri}') + while not UNQUOTED_IRI_REGEX.match(_unquoted_iri): + _next_unquoted_iri = unquote(_unquoted_iri) + if _unquoted_iri == _next_unquoted_iri: + break + _unquoted_iri = _next_unquoted_iri return _unquoted_iri diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py index aeeaab11a..0a9bb5d75 100644 --- a/trove/util/queryparams.py +++ b/trove/util/queryparams.py @@ -26,6 +26,9 @@ # value to be split on commas, used as a list or set QUERYPARAM_VALUES_DELIM = ',' +TRUTHY_VALUES = frozenset(('t', 'true', '1', 'y', 'yes')) +FALSY_VALUES = frozenset(('f', 'false', '0', 'n', 'no')) + @dataclasses.dataclass(frozen=True) class QueryparamName: @@ -95,7 +98,7 @@ def join_queryparam_value(values: typing.Iterable[str]): def get_single_value( queryparams: QueryparamDict, queryparam_name: QueryparamName | str, -): +) -> str | None: if isinstance(queryparam_name, QueryparamName): _family_name = queryparam_name.family _expected_brackets = queryparam_name.bracketed_names @@ -115,3 +118,27 @@ def get_single_value( raise trove_exceptions.InvalidRepeatedQueryParam(str(queryparam_name)) else: return _singlevalue + + +def get_bool_value( + queryparams: QueryparamDict, + queryparam_name: QueryparamName | str, + *, + if_absent: bool = False, # by default, param absence is falsy + if_empty: bool = True, # by default, presence (with empty value) is truthy +) -> bool: + _value = get_single_value(queryparams, queryparam_name) + if _value is None: + return if_absent + if _value == '': + return if_empty + return parse_booly_str(_value) + + +def parse_booly_str(value: str): + _lowered = value.lower() + if _lowered in TRUTHY_VALUES: + return True + if _lowered in FALSY_VALUES: + return False + raise ValueError(f'unboolable string: "{value}"') diff --git a/trove/util/trove_params.py b/trove/util/trove_params.py index 88448a18a..d23010b3c 100644 --- a/trove/util/trove_params.py +++ b/trove/util/trove_params.py @@ -18,7 +18,7 @@ parse_propertypath, ) from trove.util import queryparams as _qp -from trove.vocab.trove import shtrove_shorthand +from trove.vocab.trove import trove_shorthand @dataclasses.dataclass(frozen=True) @@ -52,7 +52,7 @@ def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: @classmethod def _default_shorthand(cls) -> rdf.IriShorthand: - return shtrove_shorthand() + return trove_shorthand() @classmethod def _default_include(cls) -> PropertypathSet: diff --git a/trove/views/browse.py b/trove/views/browse.py index 6061bcf78..457352fe0 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -2,12 +2,14 @@ from trove import exceptions as trove_exceptions from trove.util.iris import unquote_iri -from trove.vocab import namespaces as ns +from trove.vocab.osfmap import osfmap_shorthand +from trove.vocab.trove import trove_shorthand from trove.trovebrowse_gathering import trovebrowse from trove.util.trove_params import BasicTroveParams from trove.util.queryparams import ( QueryparamDict, get_single_value, + get_bool_value, ) from ._base import BaseTroveView @@ -15,21 +17,28 @@ @dataclasses.dataclass(frozen=True) class BrowseParams(BasicTroveParams): iri: str - with_amalgamation: bool + blend_cards: bool @classmethod def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: _iri_value = get_single_value(queryparams, 'iri') if not _iri_value: raise trove_exceptions.MissingRequiredQueryParam('iri') - _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_value)) - _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_value)) return { **super().parse_queryparams(queryparams), - 'iri': _iri, - 'with_amalgamation': ('withAmalgamation' in queryparams), + 'iri': cls._parse_iri(_iri_value), + 'blend_cards': get_bool_value(queryparams, 'blendCards', if_absent=True), } + @classmethod + def _parse_iri(cls, iri_value: str): + _iri = unquote_iri(iri_value) + if ':' in _iri: + _iri = trove_shorthand().expand_iri(_iri) + else: # NOTE: special osfmap + _iri = osfmap_shorthand().expand_iri(_iri) + return _iri + class BrowseIriView(BaseTroveView): gathering_organizer = trovebrowse @@ -41,5 +50,5 @@ def _get_focus_iri(self, request, params: BrowseParams): # override BaseTroveVi def _get_gatherer_kwargs(self, params, renderer_type): # override BaseTroveView return { **super()._get_gatherer_kwargs(params, renderer_type), - 'with_amalgamation': params.with_amalgamation, + 'blend_cards': params.blend_cards, } diff --git a/trove/views/vocab.py b/trove/views/vocab.py index b2c5026b0..3a896fe82 100644 --- a/trove/views/vocab.py +++ b/trove/views/vocab.py @@ -15,7 +15,7 @@ def get(self, request, vocab_term): if _iri not in TROVE_API_THESAURUS: raise http.Http404 _browse_url = '?'.join(( - reverse('trove-browse'), + reverse('trove:browse-iri'), urlencode({'iri': _iri}), )) return redirect(_browse_url) diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index 69207daf0..2d8dd9c86 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -43,9 +43,10 @@ def _literal_markdown(text: str, *, language: str): def trove_browse_link(iri: str): + _compact = trove_shorthand().compact_iri(iri) return urllib.parse.urljoin( reverse('trove:browse-iri'), - f'?iri={urllib.parse.quote(iri)}', + f'?iri={urllib.parse.quote(_compact)}', ) @@ -159,9 +160,8 @@ def trove_browse_link(iri: str): search index-cards that match a fuzzy text search for the word "word" in the title (aka `dcterms:title`, ``) uses query parameter: -``` -cardSearchText[title]=word -``` + +* `cardSearchText[title]=word` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchText[title]=word&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -172,9 +172,8 @@ def trove_browse_link(iri: str): search index-cards that have at least one creator affiliated with [COS](https://cos.io) uses query parameter: -``` -cardSearchFilter[creator.affiliation]=https://cos.io -``` + +* `cardSearchFilter[creator.affiliation]=https://cos.io` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[creator.affiliation]=https://cos.io&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -186,9 +185,8 @@ def trove_browse_link(iri: str): values after 2022 uses query parameter: -``` -cardSearchFilter[dateCreated][after]=2022 -``` + +* `cardSearchFilter[dateCreated][after]=2022` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[dateCreated][after]=2022&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -199,9 +197,8 @@ def trove_browse_link(iri: str): searches index-cards with a specific iri value at any property uses query parameter: -``` -cardSearchFilter[*]=https://osf.io -``` + +* `cardSearchFilter[*]=https://osf.io` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[*]=https://osf.io&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -212,10 +209,9 @@ def trove_browse_link(iri: str): searches for index-cards that have a `funder` and do not have an `affiliation` uses query parameters: -``` -cardSearchFilter[funder][is-present] -cardSearchFilter[affiliation][is-absent] -``` + +* `cardSearchFilter[funder][is-present]` +* `cardSearchFilter[affiliation][is-absent]` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[funder][is-present]&cardSearchFilter[affiliation][is-absent]&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -249,10 +245,9 @@ def trove_browse_link(iri: str): search for iri values for the property `creator` (aka `dcterms:creator`, ``) -uses query parameter: -``` -valueSearchPropertyPath=creator -``` +uses query parameters: + +* `valueSearchPropertyPath=creator` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -263,11 +258,10 @@ def trove_browse_link(iri: str): search for iri values for the property `creator` within the context of a card-search uses query parameter: -``` -valueSearchPropertyPath=creator -cardSearchText=sciency -cardSearchFilter[subject][is-present] -``` + +* `valueSearchPropertyPath=creator` +* `cardSearchText=sciency` +* `cardSearchFilter[subject][is-present]` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&cardSearchText=sciency&cardSearchFilter[subject][is-present]&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -277,11 +271,10 @@ def trove_browse_link(iri: str): DCTERMS.description: {_literal_markdown(''' search for a specific iri value in the property `creator` -uses query parameter: -``` -valueSearchPropertyPath=creator -valueSearchFilter[sameAs]=https://orcid.org/0000-0002-6155-6104 -``` +uses query parameters: + +* `valueSearchPropertyPath=creator` +* `valueSearchFilter[sameAs]=https://orcid.org/0000-0002-6155-6104` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&valueSearchFilter[sameAs]=https://orcid.org/0000-0002-6155-6104&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -291,11 +284,10 @@ def trove_browse_link(iri: str): DCTERMS.description: {_literal_markdown(''' search for iri values that are used as `creator` and have `rdf:type` `Person` (aka `foaf:Person`) -uses query parameter: -``` -valueSearchPropertyPath=creator -valueSearchFilter[resourceType]=Person -``` +uses query parameters: + +* `valueSearchPropertyPath=creator` +* `valueSearchFilter[resourceType]=Person` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -306,11 +298,10 @@ def trove_browse_link(iri: str): search for iri values used as `license` that have "cc" in their label (`rdfs:label`, `dcterms:title`, or `foaf:name`) -uses query parameter: -``` -valueSearchPropertyPath=license -valueSearchText=cc -``` +uses query parameters: + +* `valueSearchPropertyPath=license` +* `valueSearchText=cc` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=license&valueSearchText=cc&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -848,7 +839,7 @@ def trove_shorthand() -> IriShorthand: @functools.cache def shtrove_shorthand() -> IriShorthand: - '''build iri shorthand that includes unprefixed terms (as defined in TROVE_API_THESAURUS) + '''build iri shorthand that includes osfmap shorthands... ''' return trove_shorthand().with_update(osfmap.osfmap_shorthand().prefix_map) From eb8e26f49fece91ecdb556961ba1c823f2ca5001 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 3 Apr 2025 09:34:13 -0400 Subject: [PATCH 09/43] ingest_from --- .../commands/ingest_from_another_shtrove.py | 65 +++++++++++++++++++ trove/urls.py | 3 +- 2 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 trove/management/commands/ingest_from_another_shtrove.py diff --git a/trove/management/commands/ingest_from_another_shtrove.py b/trove/management/commands/ingest_from_another_shtrove.py new file mode 100644 index 000000000..56e6eba7c --- /dev/null +++ b/trove/management/commands/ingest_from_another_shtrove.py @@ -0,0 +1,65 @@ +import functools +from itertools import islice +import re +from urllib.parse import urlunsplit +import uuid + +from django.conf import settings +from django.core.management.base import BaseCommand +import requests + +from share import models as share_db +from trove import digestive_tract +from trove.vocab import mediatypes + + +class Command(BaseCommand): + help = "ingest metadata from another SHARE/trove instance" + + def add_arguments(self, parser): + parser.add_argument("host", help="host name of the shtrove instance (e.g. 'staging-share.osf.io')") + parser.add_argument("--count", type=int, default=333) + + def handle(self, *args, host, count, **options): + if not settings.DEBUG: + raise Exception('this command not meant for non-debug use') + _ingested_count = 0 + _skipped_count = 0 + for _datum in islice(self._iter_datums(host), count): + if self._ingest(_datum): + _ingested_count += 1 + else: + _skipped_count += 1 + self.stdout.write( + self.style.SUCCESS(f'ingested {_ingested_count} (skipped {_skipped_count}) from {host}') + ) + + def _iter_datums(self, host: str): + _url = urlunsplit(('https', host, '/api/v2/rawdata/', '', '')) + while _url: + _json = requests.get(_url, headers={'Accept': mediatypes.JSONAPI}).json() + for _item in _json['data']: + yield _item['attributes']['datum'] + _url = _json['links'].get('next') + + def _ingest(self, datum: str) -> bool: + _first_subject_match = re.search( + r'^<([^>\s]+)>', # HACK: depends on specific serialization + datum, + re.MULTILINE, + ) + if _first_subject_match: + _subject_iri = _first_subject_match.group(1) + digestive_tract.swallow( + from_user=self._application_user, + record=datum, + record_identifier=uuid.uuid4(), + record_mediatype=mediatypes.TURTLE, + focus_iri=_subject_iri, + ) + return True + return False + + @functools.cached_property + def _application_user(self): + return share_db.ShareUser.objects.get(username=settings.APPLICATION_USERNAME) diff --git a/trove/urls.py b/trove/urls.py index 8b183a48e..64f4b4e3c 100644 --- a/trove/urls.py +++ b/trove/urls.py @@ -1,5 +1,4 @@ from django.urls import path, re_path -from django.views.generic.base import RedirectView from .views.browse import BrowseIriView from .views.ingest import RdfIngestView @@ -24,5 +23,5 @@ path('ingest', view=RdfIngestView.as_view(), name='ingest-rdf'), path('docs/openapi.json', view=OpenapiJsonView.as_view(), name='docs.openapi-json'), path('docs/openapi.html', view=OpenapiHtmlView.as_view(), name='docs.openapi-html'), - re_path(r'docs/?', view=RedirectView.as_view(pattern_name='trove:docs.openapi-html'), name='docs'), + re_path(r'docs/?', view=OpenapiHtmlView.as_view(), name='docs'), ] From 330687494e77ef35f72de145fef4a76df9ee2d04 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 3 Apr 2025 10:46:38 -0400 Subject: [PATCH 10/43] clarify shorthands --- trove/derive/osfmap_json.py | 12 ++++----- trove/render/_base.py | 8 +++--- trove/render/jsonapi.py | 7 ++--- trove/render/simple_csv.py | 2 +- trove/render/turtle.py | 3 +-- trove/static/css/browse.css | 14 +++++++--- trove/trovebrowse_gathering.py | 30 +++++++++++----------- trove/trovesearch/search_params.py | 18 ++++++------- trove/trovesearch/trovesearch_gathering.py | 5 ++-- trove/util/trove_params.py | 4 +-- trove/views/_base.py | 5 +++- trove/views/browse.py | 23 ++++++++++++----- trove/views/search.py | 4 +-- trove/vocab/namespaces.py | 8 ++++-- trove/vocab/osfmap.py | 10 ++++---- trove/vocab/trove.py | 17 ++++-------- 16 files changed, 91 insertions(+), 79 deletions(-) diff --git a/trove/derive/osfmap_json.py b/trove/derive/osfmap_json.py index 1666025f5..5298715cf 100644 --- a/trove/derive/osfmap_json.py +++ b/trove/derive/osfmap_json.py @@ -7,7 +7,7 @@ from trove.vocab.namespaces import TROVE, RDF, OWL from trove.vocab.osfmap import ( OSFMAP_THESAURUS, - osfmap_shorthand, + osfmap_json_shorthand, ) from ._base import IndexcardDeriver @@ -64,7 +64,7 @@ def rdfobject_as_jsonld(self, rdfobject: rdf.RdfObject) -> dict: # datatype iri (or non-standard language iri) _datatype_iris = sorted( ( - osfmap_shorthand().compact_iri(_datatype_iri) + osfmap_json_shorthand().compact_iri(_datatype_iri) for _datatype_iri in rdfobject.datatype_iris ), key=len, @@ -74,7 +74,7 @@ def rdfobject_as_jsonld(self, rdfobject: rdf.RdfObject) -> dict: '@type': (_datatype_iris if (len(_datatype_iris) > 1) else _datatype_iris[0]), } elif isinstance(rdfobject, str): - return {'@id': osfmap_shorthand().compact_iri(rdfobject)} + return {'@id': osfmap_json_shorthand().compact_iri(rdfobject)} elif isinstance(rdfobject, (float, int)): return {'@value': rdfobject} elif isinstance(rdfobject, datetime.date): @@ -91,7 +91,7 @@ def twopledict_as_jsonld(self, twopledict: rdf.RdfTwopleDictionary) -> dict: _jsonld = {} for _pred, _objectset in twopledict.items(): if _objectset: - _key = osfmap_shorthand().compact_iri(_pred) + _key = osfmap_json_shorthand().compact_iri(_pred) _jsonld[_key] = self._list_or_single_value(_pred, [ self.rdfobject_as_jsonld(_obj) for _obj in _objectset @@ -114,10 +114,10 @@ def __nested_rdfobject_as_jsonld( _nested_obj = ( {} if rdfobject.startswith('_:') # HACK: non-blank blank nodes (stop that) - else {'@id': osfmap_shorthand().compact_iri(rdfobject)} + else {'@id': osfmap_json_shorthand().compact_iri(rdfobject)} ) for _pred, _objectset in tripledict[rdfobject].items(): - _label = osfmap_shorthand().compact_iri(_pred) + _label = osfmap_json_shorthand().compact_iri(_pred) if _objectset: _nested_obj[_label] = self._list_or_single_value( _pred, diff --git a/trove/render/_base.py b/trove/render/_base.py index 4f9d16c87..76b48cbbf 100644 --- a/trove/render/_base.py +++ b/trove/render/_base.py @@ -11,10 +11,8 @@ from trove import exceptions as trove_exceptions from trove.vocab import mediatypes -from trove.vocab.trove import ( - TROVE_API_THESAURUS, - trove_shorthand, -) +from trove.vocab.trove import TROVE_API_THESAURUS +from trove.vocab.namespaces import namespaces_shorthand from ._rendering import ProtoRendering, SimpleRendering @@ -33,7 +31,7 @@ class BaseRenderer(abc.ABC): # instance fields response_focus: gather.Focus response_gathering: gather.Gathering - iri_shorthand: rdf.IriShorthand = dataclasses.field(default_factory=trove_shorthand) + iri_shorthand: rdf.IriShorthand = dataclasses.field(default_factory=namespaces_shorthand) thesaurus_tripledict: rdf.RdfTripleDictionary = dataclasses.field(default_factory=lambda: TROVE_API_THESAURUS) @functools.cached_property diff --git a/trove/render/jsonapi.py b/trove/render/jsonapi.py index ef9da875b..d9ed7c6ea 100644 --- a/trove/render/jsonapi.py +++ b/trove/render/jsonapi.py @@ -26,10 +26,7 @@ TROVE, XSD, ) -from trove.vocab.trove import ( - trove_indexcard_namespace, - trove_shorthand, -) +from trove.vocab.trove import trove_indexcard_namespace from ._base import BaseRenderer @@ -303,7 +300,7 @@ def _render_attribute_datum(self, rdfobject: primitive_rdf.RdfObject) -> dict | try: # maybe it's a jsonapi resource return self.render_identifier_object(rdfobject) except Exception: - return trove_shorthand().compact_iri(rdfobject) + return self.iri_shorthand.compact_iri(rdfobject) elif isinstance(rdfobject, (float, int)): return rdfobject elif isinstance(rdfobject, datetime.date): diff --git a/trove/render/simple_csv.py b/trove/render/simple_csv.py index dd644bd52..c3dd3c243 100644 --- a/trove/render/simple_csv.py +++ b/trove/render/simple_csv.py @@ -127,7 +127,7 @@ def _row_field_value(self, osfmap_json: dict, field_path: Jsonpath) -> str: def _osfmap_jsonpath(iri_path: Iterable[str]) -> Jsonpath: - _shorthand = osfmap.osfmap_shorthand() + _shorthand = osfmap.osfmap_json_shorthand() return tuple( _shorthand.compact_iri(_pathstep) for _pathstep in iri_path diff --git a/trove/render/turtle.py b/trove/render/turtle.py index 2b682178c..e8239b34f 100644 --- a/trove/render/turtle.py +++ b/trove/render/turtle.py @@ -1,7 +1,6 @@ from primitive_metadata import primitive_rdf as rdf from trove.vocab.namespaces import TROVE -from trove.vocab.trove import trove_shorthand from ._base import BaseRenderer @@ -14,5 +13,5 @@ def simple_render_document(self) -> str: return rdf.turtle_from_tripledict( self.response_data.tripledict, focus=self.response_focus.single_iri(), - shorthand=trove_shorthand, + shorthand=self.iri_shorthand, ) diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index f26aeec5e..e12222a15 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -27,11 +27,11 @@ .Browse__card { display: flex; flex-direction: column; - padding: var(--gap-3) var(--gap-2); + padding: var(--gap-2) var(--gap-3); background-color: hsl(var(--hue-turn), var(--saturation), var(--luminosity)); --border-luminosity: calc(var(--luminosity) * 0.618); border-color: hsl(var(--hue-turn), var(--saturation), var(--border-luminosity)); - border-style: outset; + border-style: inset; /* border-block-start-width: 0; border-block-end-width: var(--gap-3); @@ -40,12 +40,20 @@ border-start-start-radius: 1rem; border-end-end-radius: 1rem; */ + /* border-block-start-width: 1px; border-inline-start-width: var(--gap-3); border-block-end-width: var(--gap-4); border-inline-end-width: 1px; border-start-start-radius: 1rem; border-end-end-radius: 1rem; + */ + border-inline-start-width: var(--gap-3); + border-block-start-width: var(--gap-4); + border-inline-end-width: 1px; + border-block-end-width: 1px; + border-start-end-radius: 1rem; + border-end-start-radius: 1rem; } .BrowseWrapper details > summary::before { @@ -122,7 +130,7 @@ dl.Browse__twopleset > dd { /*margin-top: var(--gap-4);*/ padding: var(--gap-4); /*border: dotted var(--gap-4) rgba(0,0,0,0.382);*/ - border: outset var(--gap-4) rgba(0,0,0,0.382); + border: inset var(--gap-4) rgba(0,0,0,0.382); } .Browse__literal { diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py index ec8593008..76903d158 100644 --- a/trove/trovebrowse_gathering.py +++ b/trove/trovebrowse_gathering.py @@ -31,19 +31,6 @@ ) -@trovebrowse.gatherer() -def gather_thesaurus_entry(focus, *, blend_cards: bool): - _thesaurus = static_vocab.combined_thesaurus__suffuniq() - for _iri in focus.iris: - _suffuniq_iri = get_sufficiently_unique_iri(_iri) - _thesaurus_entry = _thesaurus.get(_suffuniq_iri, None) - if _thesaurus_entry: - if blend_cards: - yield from rdf.iter_twoples(_thesaurus_entry) - else: - yield (ns.FOAF.isPrimaryTopicOf, rdf.QuotedGraph({_iri: _thesaurus_entry}, focus_iri=_iri)) - - @trovebrowse.gatherer(ns.FOAF.isPrimaryTopicOf) def gather_cards_focused_on(focus, *, blend_cards: bool): _identifier_qs = trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) @@ -58,6 +45,19 @@ def gather_cards_focused_on(focus, *, blend_cards: bool): yield (_card_iri, ns.RDF.type, ns.TROVE.Indexcard) +@trovebrowse.gatherer(ns.TROVE.thesaurusEntry) +def gather_thesaurus_entry(focus, *, blend_cards: bool): + _thesaurus = static_vocab.combined_thesaurus__suffuniq() + for _iri in focus.iris: + _suffuniq_iri = get_sufficiently_unique_iri(_iri) + _thesaurus_entry = _thesaurus.get(_suffuniq_iri, None) + if _thesaurus_entry: + if blend_cards: + yield from rdf.iter_twoples(_thesaurus_entry) + else: + yield (ns.TROVE.thesaurusEntry, rdf.QuotedGraph({_iri: _thesaurus_entry}, focus_iri=_iri)) + + @trovebrowse.gatherer(ns.TROVE.usedAtPath) -def gather_paths_used_at(focus): - ... # TODO via elasticsearch aggregation +def gather_paths_used_at(focus, **kwargs): + yield from () # TODO via elasticsearch aggregation diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 708418716..b0b6d85a4 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -31,7 +31,7 @@ get_single_value, ) from trove.vocab import osfmap -from trove.vocab.trove import trove_shorthand +from trove.vocab.trove import trove_json_shorthand from trove.vocab.namespaces import RDF, TROVE, OWL, FOAF, DCTERMS @@ -95,7 +95,7 @@ class ValueType(enum.Enum): @classmethod def from_shortname(cls, shortname): - _iri = trove_shorthand().expand_iri(shortname) + _iri = trove_json_shorthand().expand_iri(shortname) return cls(_iri) @classmethod @@ -104,7 +104,7 @@ def shortnames(cls): yield _value_type.to_shortname() def to_shortname(self) -> str: - return trove_shorthand().compact_iri(self.value) + return trove_json_shorthand().compact_iri(self.value) ### @@ -116,8 +116,8 @@ class BasicTrovesearchParams(BasicTroveParams): static_focus_type: typing.ClassVar[str] # expected on subclasses @classmethod - def _default_shorthand(cls): - return osfmap.osfmap_shorthand() + def _default_shorthand(cls): # NOTE: osfmap special + return osfmap.osfmap_json_shorthand() @classmethod def _default_include(cls): @@ -291,11 +291,11 @@ class FilterOperator(enum.Enum): @classmethod def from_shortname(cls, shortname): - _iri = trove_shorthand().expand_iri(shortname) + _iri = trove_json_shorthand().expand_iri(shortname) return cls(_iri) def to_shortname(self) -> str: - return trove_shorthand().compact_iri(self.value) + return trove_json_shorthand().compact_iri(self.value) def is_date_operator(self): return self in (self.BEFORE, self.AFTER, self.AT_DATE) @@ -362,7 +362,7 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str): if _is_date_filter: _value_list.append(_value) # TODO: vali-date else: - _value_list.append(osfmap.osfmap_shorthand().expand_iri(_value)) + _value_list.append(osfmap.osfmap_json_shorthand().expand_iri(_value)) return cls( value_set=frozenset(_value_list), operator=_operator, @@ -391,7 +391,7 @@ def as_queryparam(self, queryparam_family: str): self.operator.to_shortname(), )) _qp_value = join_queryparam_value( - osfmap.osfmap_shorthand().compact_iri(_value) + osfmap.osfmap_json_shorthand().compact_iri(_value) for _value in self.value_set ) return str(_qp_name), _qp_value diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 8f8629927..44486a5a9 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -35,7 +35,6 @@ from trove.vocab.trove import ( TROVE_API_THESAURUS, trove_indexcard_namespace, - trove_shorthand, ) @@ -489,7 +488,7 @@ def _osfmap_twople_json(twopledict): def _osfmap_path(property_path): return rdf.literal_json([ - osfmap.osfmap_shorthand().compact_iri(_iri) + osfmap.osfmap_json_shorthand().compact_iri(_iri) for _iri in property_path ]) @@ -524,7 +523,7 @@ def _related_property_result(property_path: tuple[str, ...], count: int): return frozenset(( (RDF.type, TROVE.RelatedPropertypath), (TROVE.cardsearchResultCount, count), - (TROVE.suggestedFilterOperator, literal(trove_shorthand().compact_iri( + (TROVE.suggestedFilterOperator, literal(osfmap.osfmap_json_shorthand().compact_iri( osfmap.suggested_filter_operator(property_path[-1]), ))), *_single_propertypath_twoples(property_path), diff --git a/trove/util/trove_params.py b/trove/util/trove_params.py index d23010b3c..46c8d9119 100644 --- a/trove/util/trove_params.py +++ b/trove/util/trove_params.py @@ -18,7 +18,7 @@ parse_propertypath, ) from trove.util import queryparams as _qp -from trove.vocab.trove import trove_shorthand +from trove.vocab.namespaces import namespaces_shorthand @dataclasses.dataclass(frozen=True) @@ -52,7 +52,7 @@ def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: @classmethod def _default_shorthand(cls) -> rdf.IriShorthand: - return trove_shorthand() + return namespaces_shorthand() @classmethod def _default_include(cls) -> PropertypathSet: diff --git a/trove/views/_base.py b/trove/views/_base.py index 7e2a0f743..3c83805ab 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -64,7 +64,10 @@ def _parse_params(self, request: djhttp.HttpRequest): return self.params_type.from_querystring(request.META['QUERY_STRING']) def _get_focus_iri(self, request, params): - return request.build_absolute_uri() + _iri = request.build_absolute_uri() + if not _iri: + breakpoint() + return _iri def _build_focus(self, request, params): return gather.Focus.new(self._get_focus_iri(request, params), self.focus_type_iris) diff --git a/trove/views/browse.py b/trove/views/browse.py index 457352fe0..052ee4105 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -2,8 +2,9 @@ from trove import exceptions as trove_exceptions from trove.util.iris import unquote_iri -from trove.vocab.osfmap import osfmap_shorthand -from trove.vocab.trove import trove_shorthand +from trove.vocab import namespaces as _ns +from trove.vocab.osfmap import osfmap_json_shorthand +from trove.vocab.trove import trove_json_shorthand from trove.trovebrowse_gathering import trovebrowse from trove.util.trove_params import BasicTroveParams from trove.util.queryparams import ( @@ -34,10 +35,20 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: def _parse_iri(cls, iri_value: str): _iri = unquote_iri(iri_value) if ':' in _iri: - _iri = trove_shorthand().expand_iri(_iri) - else: # NOTE: special osfmap - _iri = osfmap_shorthand().expand_iri(_iri) - return _iri + return _ns.namespaces_shorthand().expand_iri(_iri) + for _shorthand_factory in (osfmap_json_shorthand, trove_json_shorthand): + _expanded = _shorthand_factory().expand_iri(_iri) + if _expanded != _iri: + return _expanded + raise trove_exceptions.IriInvalid(_iri) + + @classmethod + def _default_include(cls): + return frozenset(( + _ns.TROVE.thesaurusEntry, + _ns.FOAF.isPrimaryTopicOf, + _ns.TROVE.usedAtPath, + )) class BrowseIriView(BaseTroveView): diff --git a/trove/views/search.py b/trove/views/search.py index cadf9f9b8..481050944 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -30,10 +30,10 @@ class _BaseTrovesearchView(BaseTroveView, abc.ABC): gathering_organizer = trovesearch_by_indexstrategy # for BaseTroveView - def _build_focus(self, url, params): # override BaseTroveView + def _build_focus(self, request, params): # override BaseTroveView _strategy = index_strategy.get_strategy_for_trovesearch(params) return self.focus_type.new( - iris=url, + iris=self._get_focus_iri(request, params), search_params=params, search_handle=self.get_search_handle(_strategy, params), ) diff --git a/trove/vocab/namespaces.py b/trove/vocab/namespaces.py index f61f176c4..c0ebf1cb6 100644 --- a/trove/vocab/namespaces.py +++ b/trove/vocab/namespaces.py @@ -1,3 +1,5 @@ +import functools + from primitive_metadata import primitive_rdf as rdf from primitive_metadata.namespaces import ( RDF, @@ -32,7 +34,7 @@ 'SKOS', 'TROVE', 'XSD', - 'NAMESPACES_SHORTHAND', + 'namespaces_shorthand', ) # namespaces used in OAI-PMH @@ -63,4 +65,6 @@ _NAMESPACES_BY_PREFIX['blarg'] = BLARG -NAMESPACES_SHORTHAND = DEFAULT_SHORTHAND.with_update(_NAMESPACES_BY_PREFIX) +@functools.cache +def namespaces_shorthand() -> rdf.IriShorthand: + return DEFAULT_SHORTHAND.with_update(_NAMESPACES_BY_PREFIX) diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py index 91deb1bdb..7af7b283a 100644 --- a/trove/vocab/osfmap.py +++ b/trove/vocab/osfmap.py @@ -36,7 +36,7 @@ RDFS, SKOS, TROVE, - NAMESPACES_SHORTHAND, + namespaces_shorthand, ) OSFMAP_LINK = 'https://osf.io/8yczr' @@ -952,18 +952,18 @@ # functions @functools.cache # built once -def osfmap_shorthand() -> IriShorthand: +def osfmap_json_shorthand() -> IriShorthand: '''build iri shorthand that includes unprefixed osfmap terms ''' return build_shorthand_from_thesaurus( thesaurus=OSFMAP_THESAURUS, label_predicate=JSONAPI_MEMBERNAME, - base_shorthand=NAMESPACES_SHORTHAND, + base_shorthand=namespaces_shorthand(), ) def parse_osfmap_propertypath(serialized_path: str, *, allow_globs=False) -> Propertypath: - return parse_propertypath(serialized_path, osfmap_shorthand(), allow_globs=allow_globs) + return parse_propertypath(serialized_path, osfmap_json_shorthand(), allow_globs=allow_globs) def parse_osfmap_propertypath_set(serialized_path_set: str, *, allow_globs=False) -> Iterator[Propertypath]: @@ -972,7 +972,7 @@ def parse_osfmap_propertypath_set(serialized_path_set: str, *, allow_globs=False def osfmap_propertypath_key(propertypath: Propertypath) -> str: - return propertypath_key(propertypath, osfmap_shorthand()) + return propertypath_key(propertypath, osfmap_json_shorthand()) def osfmap_propertypath_set_key(propertypath_set: PropertypathSet) -> str: diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index 2d8dd9c86..bb9fe879e 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -26,7 +26,7 @@ RDFS, SKOS, TROVE, - NAMESPACES_SHORTHAND, + namespaces_shorthand, ) @@ -43,7 +43,7 @@ def _literal_markdown(text: str, *, language: str): def trove_browse_link(iri: str): - _compact = trove_shorthand().compact_iri(iri) + _compact = namespaces_shorthand().compact_iri(iri) return urllib.parse.urljoin( reverse('trove:browse-iri'), f'?iri={urllib.parse.quote(_compact)}', @@ -663,7 +663,7 @@ def trove_browse_link(iri: str): to sort by date values, use `sort` (or `sort[date-value]`) with a **property-path** that ends with one of the following supported date properties: -{", ".join(f"`{osfmap.osfmap_shorthand().compact_iri(_date_iri)}`" for _date_iri in osfmap.DATE_PROPERTIES)} +{", ".join(f"`{osfmap.osfmap_json_shorthand().compact_iri(_date_iri)}`" for _date_iri in osfmap.DATE_PROPERTIES)} to sort by integer values, use `sort[integer-value]` with a **property-path** to the integers of interest. @@ -827,23 +827,16 @@ def trove_browse_link(iri: str): @functools.cache -def trove_shorthand() -> IriShorthand: +def trove_json_shorthand() -> IriShorthand: '''build iri shorthand that includes unprefixed terms (as defined in TROVE_API_THESAURUS) ''' return build_shorthand_from_thesaurus( thesaurus=TROVE_API_THESAURUS, label_predicate=JSONAPI_MEMBERNAME, - base_shorthand=NAMESPACES_SHORTHAND, + base_shorthand=namespaces_shorthand(), ) -@functools.cache -def shtrove_shorthand() -> IriShorthand: - '''build iri shorthand that includes osfmap shorthands... - ''' - return trove_shorthand().with_update(osfmap.osfmap_shorthand().prefix_map) - - @functools.cache def trove_indexcard_namespace(): return IriNamespace(f'{settings.SHARE_WEB_URL}trove/index-card/') From 322a29c1531ea623f892056326bc187971eea3e0 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 3 Apr 2025 12:51:42 -0400 Subject: [PATCH 11/43] wip (tests) --- .../_common_trovesearch_tests.py | 2 +- tests/trove/render/test_jsonapi_renderer.py | 30 +++++++++---------- tests/trove/test_doctest.py | 6 +++- trove/render/jsonapi.py | 15 +++++----- trove/trovesearch/search_params.py | 6 ++-- trove/util/frozen.py | 23 ++++++++++---- trove/util/iris.py | 30 ++++++++++++------- trove/util/propertypath.py | 2 -- trove/views/_base.py | 5 +--- 9 files changed, 71 insertions(+), 48 deletions(-) diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index f8d49485b..8f3fc66fe 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -152,7 +152,7 @@ def test_cardsearch_related_properties(self): (BLARG.nada,), ), ): - _cardsearch_params = CardsearchParams.from_querystring('include=relatedProperties') + _cardsearch_params = CardsearchParams.from_querystring('') _cardsearch_handle = self.index_strategy.pls_handle_cardsearch(_cardsearch_params) self.assertEqual(_cardsearch_handle.related_propertypath_results, [ PropertypathUsage((DCTERMS.creator,), 3), diff --git a/tests/trove/render/test_jsonapi_renderer.py b/tests/trove/render/test_jsonapi_renderer.py index a5e8bdc6d..3c4f6c254 100644 --- a/tests/trove/render/test_jsonapi_renderer.py +++ b/tests/trove/render/test_jsonapi_renderer.py @@ -35,7 +35,7 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmQ=", + "id": "blarg:aCard", "type": "index-card", "attributes": { "resourceIdentifier": [ @@ -51,7 +51,7 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): }, "meta": { "foaf:primaryTopic": [ - "blarg:anItem" + {"id": "blarg:anItem"}, ], "dcterms:issued": [ "2024-01-01" @@ -67,10 +67,10 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYVN1YmplY3Q=", + "id": "blarg:aSubject", "type": "blarg:aType", "meta": { - "blarg:hasIri": ["blarg:anIri"], + "blarg:hasIri": [{"id": "blarg:anIri"}], "blarg:hasRdfStringLiteral": ["an rdf:string literal"], "blarg:hasRdfLangStringLiteral": ['a rdf:langString literal'], "blarg:hasIntegerLiteral": [17], @@ -90,7 +90,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYVNlYXJjaA==", + "id": "blarg:aSearch", "type": "index-card-search", "attributes": { "totalResultCount": 0, @@ -105,7 +105,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYVNlYXJjaEZldw==", + "id": "blarg:aSearchFew", "type": "index-card-search", "attributes": { "totalResultCount": 3 @@ -139,7 +139,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "relationships": { "indexCard": { "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmQ=", + "id": "blarg:aCard", "type": "index-card" } } @@ -151,7 +151,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "relationships": { "indexCard": { "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRk", + "id": "blarg:aCardd", "type": "index-card" } } @@ -163,18 +163,18 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "relationships": { "indexCard": { "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRkZA==", + "id": "blarg:aCarddd", "type": "index-card" } } } }, { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmQ=", + "id": "blarg:aCard", "type": "index-card", "meta": { "foaf:primaryTopic": [ - "blarg:anItem" + {"id": "blarg:anItem"}, ], "dcterms:issued": [ "2024-01-01" @@ -197,11 +197,11 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR } }, { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRkZA==", + "id": "blarg:aCarddd", "type": "index-card", "meta": { "foaf:primaryTopic": [ - "blarg:anItemmm" + {"id": "blarg:anItemmm"}, ], "dcterms:issued": [ "2024-03-03" @@ -224,11 +224,11 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR } }, { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRk", + "id": "blarg:aCardd", "type": "index-card", "meta": { "foaf:primaryTopic": [ - "blarg:anItemm" + {"id": "blarg:anItemm"}, ], "dcterms:issued": [ "2024-02-02" diff --git a/tests/trove/test_doctest.py b/tests/trove/test_doctest.py index 8da33a947..18c77a18b 100644 --- a/tests/trove/test_doctest.py +++ b/tests/trove/test_doctest.py @@ -1,7 +1,9 @@ import doctest import trove.util.chainmap +import trove.util.frozen import trove.util.iris +import trove.util.propertypath _DOCTEST_OPTIONFLAGS = ( doctest.ELLIPSIS @@ -9,8 +11,10 @@ ) _MODULES_WITH_DOCTESTS = ( - trove.util.iris, trove.util.chainmap, + trove.util.frozen, + trove.util.iris, + trove.util.propertypath, ) diff --git a/trove/render/jsonapi.py b/trove/render/jsonapi.py index d9ed7c6ea..050ab920d 100644 --- a/trove/render/jsonapi.py +++ b/trove/render/jsonapi.py @@ -22,7 +22,6 @@ OSFMAP, OWL, RDF, - RDFS, TROVE, XSD, ) @@ -129,8 +128,9 @@ def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode): _type_iris = list(self.response_data.q(iri_or_blanknode, RDF.type)) _id_obj = { 'id': self._resource_id_for_iri(iri_or_blanknode), - 'type': self._single_typename(_type_iris), } + if _type_iris: + _id_obj['type'] = self._single_typename(_type_iris) elif isinstance(iri_or_blanknode, frozenset): _type_iris = [ _obj @@ -148,7 +148,7 @@ def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode): def _single_typename(self, type_iris: list[str]): if not type_iris: - return self._membername_for_iri(RDFS.Resource) + return '' if len(type_iris) == 1: return self._membername_for_iri(type_iris[0]) # choose one predictably, preferring osfmap and trove @@ -176,6 +176,10 @@ def _resource_id_for_iri(self, iri: str): for _iri_namespace in self._id_namespace_set: if iri in _iri_namespace: return primitive_rdf.iri_minus_namespace(iri, namespace=_iri_namespace) + # check for a shorthand + _compact = self.iri_shorthand.compact_iri(iri) + if _compact != iri: + return _compact # as fallback, encode the iri into a valid jsonapi member name return base64.urlsafe_b64encode(iri.encode()).decode() @@ -297,10 +301,7 @@ def _render_attribute_datum(self, rdfobject: primitive_rdf.RdfObject) -> dict | return int(rdfobject.unicode_value) return rdfobject.unicode_value # TODO: decide how to represent language elif isinstance(rdfobject, str): - try: # maybe it's a jsonapi resource - return self.render_identifier_object(rdfobject) - except Exception: - return self.iri_shorthand.compact_iri(rdfobject) + return self.render_identifier_object(rdfobject) elif isinstance(rdfobject, (float, int)): return rdfobject elif isinstance(rdfobject, datetime.date): diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index b0b6d85a4..760c078e5 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -112,7 +112,7 @@ def to_shortname(self) -> str: @dataclasses.dataclass(frozen=True) -class BasicTrovesearchParams(BasicTroveParams): +class TrovesearchParams(BasicTroveParams): static_focus_type: typing.ClassVar[str] # expected on subclasses @classmethod @@ -464,12 +464,12 @@ def as_queryparam(self) -> tuple[str, str]: @dataclasses.dataclass(frozen=True) -class IndexcardParams(BasicTroveParams): +class IndexcardParams(TrovesearchParams): static_focus_type = TROVE.Indexcard @dataclasses.dataclass(frozen=True) -class CardsearchParams(BasicTroveParams): +class CardsearchParams(TrovesearchParams): cardsearch_textsegment_set: frozenset[Textsegment] cardsearch_filter_set: frozenset[SearchFilter] index_strategy_name: str | None diff --git a/trove/util/frozen.py b/trove/util/frozen.py index 0e57eb531..65709f3fb 100644 --- a/trove/util/frozen.py +++ b/trove/util/frozen.py @@ -5,7 +5,6 @@ _FROZEN_TYPES = ( tuple, frozenset, - types.MappingProxyType, str, int, float, @@ -13,12 +12,26 @@ def freeze(obj): + ''' + >>> freeze([1, 1, 2]) + (1, 1, 2) + >>> freeze({3}) + frozenset({3}) + >>> freeze('five') + 'five' + >>> freeze({8: [13, 21, {34}]}) + mappingproxy({8: (13, 21, frozenset({34}))}) + >>> freeze(object()) + Traceback (most recent call last): + ... + ValueError: how freeze ? + ''' + if isinstance(obj, set): + return frozenset(obj) # use hashability to approximate immutability + if isinstance(obj, (list, tuple)): + return tuple(map(freeze, obj)) if isinstance(obj, dict): return freeze_mapping(obj) - if isinstance(obj, set): - return frozenset(obj) - if isinstance(obj, list): - return tuple(obj) if isinstance(obj, _FROZEN_TYPES): return obj raise ValueError(f'how freeze {obj!r}?') diff --git a/trove/util/iris.py b/trove/util/iris.py index 15dc64f94..35d9123f4 100644 --- a/trove/util/iris.py +++ b/trove/util/iris.py @@ -1,6 +1,6 @@ import json import re -from urllib.parse import urlsplit, urlunsplit, quote, unquote +import urllib.parse as _urp from trove import exceptions as trove_exceptions @@ -15,8 +15,8 @@ COLON = ':' COLON_SLASH_SLASH = '://' QUOTED_IRI_REGEX = re.compile( - f'{IRI_SCHEME_REGEX.pattern}{re.escape(quote(COLON))}' - f'|{re.escape(quote(COLON_SLASH_SLASH))}' + f'{IRI_SCHEME_REGEX.pattern}{re.escape(_urp.quote(COLON))}' + f'|{re.escape(_urp.quote(COLON_SLASH_SLASH))}' ) UNQUOTED_IRI_REGEX = re.compile(f'{IRI_SCHEME_REGEX.pattern}{COLON}|{COLON_SLASH_SLASH}') @@ -99,8 +99,8 @@ def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]: _scheme = '' _remainder = iri # for an iri with '://', is "safe enough" to normalize a little: - _split_remainder = urlsplit(_remainder) - _cleaned_remainder = urlunsplit(( + _split_remainder = _urp.urlsplit(_remainder) + _cleaned_remainder = _urp.urlunsplit(( '', # scheme already split _split_remainder.netloc, _split_remainder.path.rstrip('/'), # remove trailing slashes @@ -147,6 +147,9 @@ def iri_path_as_keyword(iris: list[str] | tuple[str, ...], *, suffuniq=False) -> def unquote_iri(iri: str) -> str: ''' + like `urllib.parse.unquote` but recognizes multiply-quoted IRIs + (unquoting until starting "foo:" or "://", leaving further quoted characters intact) + >>> unquote_iri('flipl://iri.example/blarg/?#') 'flipl://iri.example/blarg/?#' >>> unquote_iri('flipl%3A//iri.example/blarg/%3F%23') @@ -157,14 +160,21 @@ def unquote_iri(iri: str) -> str: 'namly:urn.example:blerg' >>> unquote_iri('werbleWord') 'werbleWord' - >>> quote(quote('flipl://iri.example/blarg/?' + urlencode({'iri': '://blarg///' + quote('://bl@rg?')))) - >>> unquote_iri(_) - >>> quote('namly:urn.example:' + quote('flipl://iri.example/blarg/?')) - >>> unquote_iri(_) + + >>> import urllib.parse as _urp + >>> _unquoted = 'flipl://iri.example/blarg/?' + _urp.urlencode({'param': '://bl@rg?'}) + >>> unquote_iri(_unquoted) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' + >>> unquote_iri(_urp.quote(_unquoted)) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' + >>> unquote_iri(_urp.quote(_urp.quote(_unquoted))) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' + >>> unquote_iri(_urp.quote(_urp.quote(_urp.quote(_unquoted)))) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' ''' _unquoted_iri = iri while not UNQUOTED_IRI_REGEX.match(_unquoted_iri): - _next_unquoted_iri = unquote(_unquoted_iri) + _next_unquoted_iri = _urp.unquote(_unquoted_iri) if _unquoted_iri == _next_unquoted_iri: break _unquoted_iri = _next_unquoted_iri diff --git a/trove/util/propertypath.py b/trove/util/propertypath.py index c33355fad..eaf8a30cf 100644 --- a/trove/util/propertypath.py +++ b/trove/util/propertypath.py @@ -29,8 +29,6 @@ def is_globpath(path: Propertypath) -> bool: True >>> is_globpath(('*', 'url:url')) False - >>> is_globpath(()) - False ''' return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) diff --git a/trove/views/_base.py b/trove/views/_base.py index 3c83805ab..7e2a0f743 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -64,10 +64,7 @@ def _parse_params(self, request: djhttp.HttpRequest): return self.params_type.from_querystring(request.META['QUERY_STRING']) def _get_focus_iri(self, request, params): - _iri = request.build_absolute_uri() - if not _iri: - breakpoint() - return _iri + return request.build_absolute_uri() def _build_focus(self, request, params): return gather.Focus.new(self._get_focus_iri(request, params), self.focus_type_iris) From 5f25bff774a545f96ac6d285b5ec554037131e31 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 3 Apr 2025 13:26:09 -0400 Subject: [PATCH 12/43] wip/fix --- trove/render/html_browse.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index b59a7100b..bbdc70f5a 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -218,7 +218,7 @@ def __literal( def __sequence(self, sequence_twoples: frozenset): _obj_in_order = list(rdf.sequence_objects_in_order(sequence_twoples)) with self.__nest('details', attrs={'open': ''}): - _text = _('sequence of %(count)') % {'count': len(_obj_in_order)} + _text = _('sequence of %(count)s') % {'count': len(_obj_in_order)} self.__leaf('summary', text=_text) with self.__nest('ol'): # TODO: style? for _seq_obj in _obj_in_order: @@ -303,7 +303,12 @@ def __iri_link_and_labels(self, iri: str): self.__literal(_text) def __compact_link(self, iri: str): - with self.__nest('a', attrs={'href': trove_browse_link(iri)}) as _a: + _href = ( + iri + if iri.startswith(settings.SHARE_WEB_URL) + else trove_browse_link(iri) + ) + with self.__nest('a', attrs={'href': _href}) as _a: _a.text = self.iri_shorthand.compact_iri(iri) return _a From 605d25619231e4a960bc9ce6afa8226231fb1dd5 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 7 Apr 2025 09:43:09 -0400 Subject: [PATCH 13/43] wip (landing page) --- project/urls.py | 2 + trove/render/html_browse.py | 21 +++- trove/render/jsonapi.py | 9 +- trove/static/css/browse.css | 127 ++++++++--------------- trove/templates/trove/openapi-redoc.html | 2 +- trove/views/_base.py | 81 ++++++++++++--- trove/views/browse.py | 8 +- trove/views/search.py | 8 +- trove/views/shtrove_root.py | 27 +++++ 9 files changed, 172 insertions(+), 113 deletions(-) create mode 100644 trove/views/shtrove_root.py diff --git a/project/urls.py b/project/urls.py index da8ad1f28..e69fc52b1 100644 --- a/project/urls.py +++ b/project/urls.py @@ -12,6 +12,7 @@ from share.admin import admin_site from share.oaipmh.views import OAIPMHView from trove.views.vocab import TroveVocabView +from trove.views.shtrove_root import ShtroveRootView urlpatterns = [ @@ -32,6 +33,7 @@ permanent=False ), name='favicon'), url(r'^icons/(?P[^/]+).ico$', source_icon_view, name='source_icon'), + path('', ShtroveRootView.as_view()), ] if settings.DEBUG: diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index bbdc70f5a..2ff1e3b7d 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -83,6 +83,7 @@ class _HtmlBuilder: __visiting_iris: set[str] = dataclasses.field(init=False) __heading_depth: int = 0 __last_hue_turn: float = dataclasses.field(default_factory=random.random) + __nested_tags: list[str] = dataclasses.field(default_factory=list) def __post_init__(self): # TODO: lang (according to request -- also translate) @@ -138,7 +139,7 @@ def __mediatype_link(self, mediatype: str): with self.__nest('a', attrs={'href': reverse('trove:docs')}) as _link: _link.text = _('(stable for documented use)') - def __render_subj(self, subj_iri: str, *, start_collapsed=False): + def __render_subj(self, subj_iri: str, *, start_collapsed=None): _twopledict = self.__current_data.get(subj_iri, {}) with self.__visiting(subj_iri): with self.__nest_card('article'): @@ -152,7 +153,13 @@ def __render_subj(self, subj_iri: str, *, start_collapsed=False): self.__literal(_label) if _twopledict: with self.__nest('details') as _details: - if not start_collapsed: + _detail_depth = sum((_tag == 'details') for _tag in self.__nested_tags) + _should_open = ( + _detail_depth < 4 + if start_collapsed is None + else not start_collapsed + ) + if _should_open: _details.set('open', '') self.__leaf('summary', text=_('details...')) self.__twoples(_twopledict) @@ -284,9 +291,11 @@ def __nest(self, tag_name, attrs=None): _attrs = {**attrs} if attrs else {} _parent_element = self.__current_element self.__current_element = SubElement(_parent_element, tag_name, _attrs) + self.__nested_tags.append(tag_name) try: yield self.__current_element finally: + self.__nested_tags.pop() self.__current_element = _parent_element def __leaf(self, tag_name, *, text=None, attrs=None): @@ -328,12 +337,16 @@ def __iri_thesaurus_labels(self, iri: str): if _thesaurus_entry: for _pred in _LINK_TEXT_PREDICATES: yield from shuffled(_thesaurus_entry.get(_pred, ())) + _twoples = self.__current_data.get(iri) + if _twoples: + for _pred in _LINK_TEXT_PREDICATES: + yield from shuffled(_twoples.get(_pred, ())) def _hue_turn_css(self): - # return f'--hue-turn: {random.random()}turn;' + # return f'--hue: {random.random()}turn;' _hue_turn = (self.__last_hue_turn + _PHI) % 1.0 self.__last_hue_turn = _hue_turn - return f'--hue-turn: {_hue_turn}turn;' + return f'--bg-hue-turn: {_hue_turn}turn;' def _queryparam_href(self, param_name: str, param_value: str | None): _base_url = self.focus_iri diff --git a/trove/render/jsonapi.py b/trove/render/jsonapi.py index 050ab920d..a85d9dd26 100644 --- a/trove/render/jsonapi.py +++ b/trove/render/jsonapi.py @@ -125,12 +125,15 @@ def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode): return self._identifier_object_cache[iri_or_blanknode] except KeyError: if isinstance(iri_or_blanknode, str): - _type_iris = list(self.response_data.q(iri_or_blanknode, RDF.type)) _id_obj = { - 'id': self._resource_id_for_iri(iri_or_blanknode), + '@id': self.iri_shorthand.compact_iri(iri_or_blanknode), } + _type_iris = list(self.response_data.q(iri_or_blanknode, RDF.type)) if _type_iris: - _id_obj['type'] = self._single_typename(_type_iris) + _id_obj = { + 'id': self._resource_id_for_iri(iri_or_blanknode), + 'type': self._single_typename(_type_iris), + } elif isinstance(iri_or_blanknode, frozenset): _type_iris = [ _obj diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index e12222a15..62a7bf742 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -1,23 +1,33 @@ +:root { + --phi: 1.618; + /* rotating colorspace */ + --hue-turn: var(--phi)turn; /* initial */ + --bg-chroma: 71%; + --bg-chroma-2: calc(var(--bg-chroma) + ((100% - var(--bg-chroma)) / 3)); + --bg-luminance: 83%; + --bg-luminance-2: calc(var(--bg-luminance) + ((100% - var(--bg-luminance)) / 3)); + --bg-luminance-3: calc(var(--bg-luminance-2) + ((100% - var(--bg-luminance-2)) / 3)); + --bg-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); + --bg-color-2: lch(var(--bg-luminance-2) var(--bg-chroma) var(--hue-turn)); + --bg-color-3: lch(var(--bg-luminance-3) var(--bg-chroma) var(--hue-turn)); + /* gutter spaces (gaps, paddings, margins...) */ + --gutter-1: var(--phi)rem; + --gutter-2: calc(var(--phi)rem - 1); + --gutter-3: calc(var(--gutter-2) / 2); + --gutter-4: calc(var(--gutter-3) / 2); + --gutter-5: calc(var(--gutter-4) / 2); +} + .BrowseWrapper { - --hue-turn: 0.618turn; - --saturation: 71%; - --luminosity: 83%; - --gap-1: 1.618rem; - --gap-2: 0.618rem; - --gap-3: 0.382rem; - --gap-4: 0.191rem; - --gap-5: 0.095rem; display: flex; flex-direction: row; align-items: flex-start; flex-wrap: wrap; - gap: var(--gap-1); + gap: var(--gutter-1); margin: 0; padding: 1rem; min-height: 100vh; - /*background-color: #fedbae;*/ - background-color: hsl(var(--hue-turn), var(--saturation), var(--luminosity)); - /* backdrop-filter: hue-rotate(var(--hue-turn)); */ + background-color: var(--bg-color); } .BrowseWrapper dfn { @@ -27,29 +37,14 @@ .Browse__card { display: flex; flex-direction: column; - padding: var(--gap-2) var(--gap-3); - background-color: hsl(var(--hue-turn), var(--saturation), var(--luminosity)); - --border-luminosity: calc(var(--luminosity) * 0.618); - border-color: hsl(var(--hue-turn), var(--saturation), var(--border-luminosity)); + padding: var(--gutter-2) var(--gutter-3); + background-color: var(--bg-color); + border-color: lch(from var(--bg-color) h s + calc(var(--bg-luminosity) * 0.618) + ); border-style: inset; - /* - border-block-start-width: 0; - border-block-end-width: var(--gap-3); - border-inline-start-width: var(--gap-3); - border-inline-end-width: 0; - border-start-start-radius: 1rem; - border-end-end-radius: 1rem; - */ - /* - border-block-start-width: 1px; - border-inline-start-width: var(--gap-3); - border-block-end-width: var(--gap-4); - border-inline-end-width: 1px; - border-start-start-radius: 1rem; - border-end-end-radius: 1rem; - */ - border-inline-start-width: var(--gap-3); - border-block-start-width: var(--gap-4); + border-inline-start-width: var(--gutter-3); + border-block-start-width: var(--gutter-4); border-inline-end-width: 1px; border-block-end-width: 1px; border-start-end-radius: 1rem; @@ -61,20 +56,18 @@ display: inline-block; transition-property: rotate; transition-duration: 0.618s; - margin-right: var(--gap-2); + margin-right: var(--gutter-2); } .BrowseWrapper details[open] > summary::before { - rotate: var(--hue-turn); + rotate: var(--bg-turn); } .Browse__card > header { display: flex; flex-direction: column; border-bottom: solid 1px rgba(0,0,0,0.382); - margin-bottom: var(--gap-3); - /*font-style: italic;*/ - /* gap: var(--gap-2); */ + margin-bottom: var(--gutter-3); } .Browse__card > header > :first-child { @@ -83,7 +76,7 @@ } .Browse__card > footer { - padding: var(--gap-2); + padding: var(--gutter-2); } dl.Browse__twopleset { @@ -93,12 +86,7 @@ dl.Browse__twopleset { [twople-obj] 1fr ; grid-auto-flow: row; - row-gap: var(--gap-2); - /* - display: flex; - flex-direction: column; - */ - + row-gap: var(--gutter-2); margin: 0; padding: 0; } @@ -107,7 +95,6 @@ dl.Browse__twopleset > dt { grid-column: twople-pred; display: flex; flex-direction: column; - /*gap: var(--gap-4);*/ } dl.Browse__twopleset > dd { @@ -115,32 +102,27 @@ dl.Browse__twopleset > dd { margin: 0; display: flex; flex-direction: column; - gap: var(--gap-5); + gap: var(--gutter-5); } .Browse__twople { display: flex; flex-direction: row; align-items: flex-start; - gap: var(--gap-3); + gap: var(--gutter-3); margin: 0; } .Browse__blanknode { - /*margin-top: var(--gap-4);*/ - padding: var(--gap-4); - /*border: dotted var(--gap-4) rgba(0,0,0,0.382);*/ - border: inset var(--gap-4) rgba(0,0,0,0.382); + padding: var(--gutter-4); + border: inset var(--gutter-4) rgba(0,0,0,0.382); } .Browse__literal { display: flex; flex-direction: row; - /*flex-wrap: wrap;*/ - gap: var(--gap-3); - padding: var(--gap-4); - /*border-block-start: solid 1px rgba(0,0,0,0.382);*/ - /*border: dotted var(--gap-4) rgba(0,0,0,0.382);*/ + gap: var(--gutter-3); + padding: var(--gutter-4); } .Browse__literal > q { @@ -152,32 +134,11 @@ dl.Browse__twopleset > dd { } .Browse__predicate { - --luminosity-boost: 0.31; - --boosted-luminosity: calc(var(--luminosity) + ((100% - var(--luminosity)) * var(--luminosity-boost))); - background-color: hsl(var(--hue-turn), var(--saturation), var(--boosted-luminosity)); - padding: var(--gap-4); + background-color: var(--bg-color-2); + padding: var(--gutter-4); } .Browse__object { - --luminosity-boost: 0.51; - --boosted-luminosity: calc(var(--luminosity) + ((100% - var(--luminosity)) * var(--luminosity-boost))); - background-color: hsl(var(--hue-turn), var(--saturation), var(--boosted-luminosity)); - padding: var(--gap-4); - /*margin-top: var(--gap-4);*/ - /*border-block-start: solid 1px rgba(0,0,0,0.382); -} -.Browse__object:first-of-type { - border-block-start: none; - */ -} - - -/* -.Browse :focus-within { - backdrop-filter: hue-rotate(var(--hue-rotate-step)); -} - -.Browse :focus { - border: 5px dotted #e28; + background-color: var(--bg-color-3); + padding: var(--gutter-4); } -*/ diff --git a/trove/templates/trove/openapi-redoc.html b/trove/templates/trove/openapi-redoc.html index c0a0da18b..8841d68f9 100644 --- a/trove/templates/trove/openapi-redoc.html +++ b/trove/templates/trove/openapi-redoc.html @@ -6,7 +6,7 @@ - + diff --git a/trove/views/_base.py b/trove/views/_base.py index 7e2a0f743..e94c1a783 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -1,21 +1,27 @@ -__all__ = ('BaseTroveView',) +__all__ = ( + 'GatheredTroveView', + 'StaticTroveView', +) import abc from collections.abc import Container +import functools from typing import ClassVar from django import http as djhttp from django.views import View from primitive_metadata import gather +from primitive_metadata import primitive_rdf as rdf from trove import exceptions as trove_exceptions -from trove.vocab.namespaces import TROVE +from trove.vocab.namespaces import TROVE, RDF from trove.util.trove_params import BasicTroveParams from trove.render import ( BaseRenderer, DEFAULT_RENDERER_TYPE, get_renderer_type, ) +from trove.render._rendering import ProtoRendering from ._gather_ask import ask_gathering_from_params from ._responder import ( make_http_error_response, @@ -24,10 +30,12 @@ class BaseTroveView(View, abc.ABC): - # ClassVars expected on inheritors: - gathering_organizer: ClassVar[gather.GatheringOrganizer] + # optional ClassVars: params_type: ClassVar[type[BasicTroveParams]] = BasicTroveParams - focus_type_iris: ClassVar[Container[str]] = () + + @abc.abstractmethod + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer]) -> ProtoRendering: + raise NotImplementedError def get(self, request): try: @@ -39,20 +47,30 @@ def get(self, request): ) try: _params = self._parse_params(request) - return self._respond(request, _params, _renderer_type) + return make_http_response( + content_rendering=self._render_response_content(request, _params, _renderer_type), + http_request=request, + ) except trove_exceptions.TroveError as _error: return make_http_error_response( error=_error, renderer_type=_renderer_type, ) - def _respond(self, request, params, renderer_type: type[BaseRenderer]): + def _parse_params(self, request: djhttp.HttpRequest): + return self.params_type.from_querystring(request.META['QUERY_STRING']) + + +class GatheredTroveView(BaseTroveView, abc.ABC): + # ClassVars expected on inheritors: + gathering_organizer: ClassVar[gather.GatheringOrganizer] + # optional ClassVars: + focus_type_iris: ClassVar[Container[str]] = () + + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer]): _focus = self._build_focus(request, params) _renderer = self._gather_to_renderer(_focus, params, renderer_type) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) + return _renderer.render_document() def _gather_to_renderer(self, focus, params, renderer_type: type[BaseRenderer]) -> BaseRenderer: _gathering = self._build_gathering(params, renderer_type) @@ -60,9 +78,6 @@ def _gather_to_renderer(self, focus, params, renderer_type: type[BaseRenderer]) ask_gathering_from_params(_gathering, params, focus) return renderer_type(focus, _gathering) - def _parse_params(self, request: djhttp.HttpRequest): - return self.params_type.from_querystring(request.META['QUERY_STRING']) - def _get_focus_iri(self, request, params): return request.build_absolute_uri() @@ -82,6 +97,44 @@ def _get_gatherer_kwargs(self, params, renderer_type): return _kwargs +class StaticTroveView(BaseTroveView, abc.ABC): + @classmethod + def get_static_twoples(cls) -> rdf.RdfTripleDictionary: + raise NotImplementedError(f'implement either get_static_triples or get_static_twoples on {cls}') + + @classmethod + @functools.cache + def get_static_triples(cls, focus_iri: str) -> rdf.RdfTripleDictionary: + return {focus_iri: cls.get_static_twoples()} + + @classmethod + def get_focus_iri(cls) -> str: + raise NotImplementedError(f'implement get_focus_iri on {cls}') + + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer]): + _focus_iri = self.get_focus_iri() + _triples = self.get_static_triples(_focus_iri) + _focus = gather.Focus.new( + _focus_iri, + type_iris=_triples.get(_focus_iri, {}).get(RDF.type, ()), + ) + + class _FakeStaticGathering: + gatherer_kwargs: dict = {} + + def leaf_a_record(self): + return _triples + + _renderer = renderer_type( + response_focus=_focus, + response_gathering=_FakeStaticGathering(), + ) + return _renderer.render_document() + + +### +# local helpers + def _get_param_keyword(param_iri: str, organizer: gather.GatheringOrganizer) -> str | None: if param_iri in organizer.norms.param_iris: for (_k, _v) in organizer.gatherer_params.items(): diff --git a/trove/views/browse.py b/trove/views/browse.py index 052ee4105..21e91a245 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -12,7 +12,7 @@ get_single_value, get_bool_value, ) -from ._base import BaseTroveView +from ._base import GatheredTroveView @dataclasses.dataclass(frozen=True) @@ -51,14 +51,14 @@ def _default_include(cls): )) -class BrowseIriView(BaseTroveView): +class BrowseIriView(GatheredTroveView): gathering_organizer = trovebrowse params_type = BrowseParams - def _get_focus_iri(self, request, params: BrowseParams): # override BaseTroveView + def _get_focus_iri(self, request, params: BrowseParams): # override GatheredTroveView return params.iri - def _get_gatherer_kwargs(self, params, renderer_type): # override BaseTroveView + def _get_gatherer_kwargs(self, params, renderer_type): # override GatheredTroveView return { **super()._get_gatherer_kwargs(params, renderer_type), 'blend_cards': params.blend_cards, diff --git a/trove/views/search.py b/trove/views/search.py index 481050944..994d7db48 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -16,7 +16,7 @@ ValuesearchFocus, ) from trove.util.trove_params import BasicTroveParams -from ._base import BaseTroveView +from ._base import GatheredTroveView logger = logging.getLogger(__name__) @@ -25,12 +25,12 @@ _TrovesearchHandler = Callable[[BasicTroveParams], BasicSearchHandle] -class _BaseTrovesearchView(BaseTroveView, abc.ABC): +class _BaseTrovesearchView(GatheredTroveView, abc.ABC): focus_type: type[gather.Focus] = gather.Focus # expected on subclasses - gathering_organizer = trovesearch_by_indexstrategy # for BaseTroveView + gathering_organizer = trovesearch_by_indexstrategy # for GatheredTroveView - def _build_focus(self, request, params): # override BaseTroveView + def _build_focus(self, request, params): # override GatheredTroveView _strategy = index_strategy.get_strategy_for_trovesearch(params) return self.focus_type.new( iris=self._get_focus_iri(request, params), diff --git a/trove/views/shtrove_root.py b/trove/views/shtrove_root.py new file mode 100644 index 000000000..a450c5c11 --- /dev/null +++ b/trove/views/shtrove_root.py @@ -0,0 +1,27 @@ +from primitive_metadata import primitive_rdf as rdf +from django.conf import settings + +from trove.vocab import namespaces as ns +from ._base import StaticTroveView + + +class ShtroveRootView(StaticTroveView): + @classmethod + def get_focus_iri(cls): + return settings.SHARE_WEB_URL + + @classmethod + def get_static_triples(cls, focus_iri: str) -> rdf.RdfTripleDictionary: + _here = rdf.IriNamespace(focus_iri) + return { + focus_iri: { + ns.DCTERMS.description: { + rdf.literal('a trove of metadata meant to be shared', language='en'), + }, + ns.RDFS.seeAlso: { + _here['trove/docs'], + _here['trove/browse'], + _here['trove/index-card-search'], + }, + }, + } From 7e63c70435d5c801a34fec759a78cc5f81ef774a Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 9 Apr 2025 14:34:54 -0400 Subject: [PATCH 14/43] fix: test --- tests/trove/render/test_jsonapi_renderer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/trove/render/test_jsonapi_renderer.py b/tests/trove/render/test_jsonapi_renderer.py index 3c4f6c254..9357c5ff6 100644 --- a/tests/trove/render/test_jsonapi_renderer.py +++ b/tests/trove/render/test_jsonapi_renderer.py @@ -51,7 +51,7 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): }, "meta": { "foaf:primaryTopic": [ - {"id": "blarg:anItem"}, + {"@id": "blarg:anItem"}, ], "dcterms:issued": [ "2024-01-01" @@ -70,7 +70,7 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): "id": "blarg:aSubject", "type": "blarg:aType", "meta": { - "blarg:hasIri": [{"id": "blarg:anIri"}], + "blarg:hasIri": [{"@id": "blarg:anIri"}], "blarg:hasRdfStringLiteral": ["an rdf:string literal"], "blarg:hasRdfLangStringLiteral": ['a rdf:langString literal'], "blarg:hasIntegerLiteral": [17], @@ -174,7 +174,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "type": "index-card", "meta": { "foaf:primaryTopic": [ - {"id": "blarg:anItem"}, + {"@id": "blarg:anItem"}, ], "dcterms:issued": [ "2024-01-01" @@ -201,7 +201,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "type": "index-card", "meta": { "foaf:primaryTopic": [ - {"id": "blarg:anItemmm"}, + {"@id": "blarg:anItemmm"}, ], "dcterms:issued": [ "2024-03-03" @@ -228,7 +228,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "type": "index-card", "meta": { "foaf:primaryTopic": [ - {"id": "blarg:anItemm"}, + {"@id": "blarg:anItemm"}, ], "dcterms:issued": [ "2024-02-02" From 541588d9f368abd65cd314ad8e95538a231fe3c0 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 9 Apr 2025 14:58:24 -0400 Subject: [PATCH 15/43] ignore unhelpful test pytest warnings --- tests/share/search/test_daemon.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/share/search/test_daemon.py b/tests/share/search/test_daemon.py index ba0842a3d..b172a6ddb 100644 --- a/tests/share/search/test_daemon.py +++ b/tests/share/search/test_daemon.py @@ -113,6 +113,7 @@ def test_unsupported_message_type(self): daemon.on_message(unsupported_message.payload, unsupported_message) assert not unsupported_message.acked + @pytest.mark.filterwarnings('ignore::pytest.PytestUnhandledThreadExceptionWarning') def test_unexpected_error(self): class UnexpectedError(Exception): pass @@ -128,10 +129,7 @@ def pls_handle_messages_chunk(self, messages_chunk): with mock.patch('share.search.daemon.sentry_sdk') as mock_sentry: with mock.patch('share.search.daemon.logger') as mock_logger: - with _daemon_running( - FakeIndexStrategyWithUnexpectedError(), - daemonthread_context=lambda: pytest.raises(UnexpectedError) - ) as daemon: + with _daemon_running(FakeIndexStrategyWithUnexpectedError()) as daemon: message = FakeCeleryMessage(messages.MessageType.INDEX_SUID, 1) daemon.on_message(message.payload, message) assert daemon.stop_event.wait(timeout=10), ( @@ -140,6 +138,7 @@ def pls_handle_messages_chunk(self, messages_chunk): mock_sentry.capture_exception.assert_called_once() mock_logger.exception.assert_called_once() + @pytest.mark.filterwarnings('ignore::pytest.PytestUnhandledThreadExceptionWarning') def test_noncurrent_backfill(self): class FakeIndexStrategyWithNoncurrentBackfill: CURRENT_STRATEGY_CHECKSUM = 'not-what-you-expected' @@ -153,10 +152,7 @@ class FakeIndexBackfill: strategy_checksum = 'what-you-expected' return FakeIndexBackfill() - with _daemon_running( - FakeIndexStrategyWithNoncurrentBackfill(), - daemonthread_context=lambda: pytest.raises(exceptions.DaemonSetupError) - ) as daemon: + with _daemon_running(FakeIndexStrategyWithNoncurrentBackfill()) as daemon: message = FakeCeleryMessage(messages.MessageType.BACKFILL_SUID, 1) daemon.on_message(message.payload, message) assert daemon.stop_event.wait(timeout=10), ( From bf63f5ab58c3c26fbcd27af661a3afecf56e551c Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 21 Apr 2025 10:16:40 -0400 Subject: [PATCH 16/43] wip --- trove/render/html_browse.py | 2 +- trove/static/css/browse.css | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index 2ff1e3b7d..9696c8d0e 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -346,7 +346,7 @@ def _hue_turn_css(self): # return f'--hue: {random.random()}turn;' _hue_turn = (self.__last_hue_turn + _PHI) % 1.0 self.__last_hue_turn = _hue_turn - return f'--bg-hue-turn: {_hue_turn}turn;' + return f'--hue-turn: {_hue_turn}turn;' def _queryparam_href(self, param_name: str, param_value: str | None): _base_url = self.focus_iri diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index 62a7bf742..3a0585f2d 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -1,12 +1,13 @@ :root { --phi: 1.618; - /* rotating colorspace */ + /* rotating colorspace (using lch with luminance and chroma held constant) */ --hue-turn: var(--phi)turn; /* initial */ - --bg-chroma: 71%; - --bg-chroma-2: calc(var(--bg-chroma) + ((100% - var(--bg-chroma)) / 3)); + /* background colors */ --bg-luminance: 83%; --bg-luminance-2: calc(var(--bg-luminance) + ((100% - var(--bg-luminance)) / 3)); --bg-luminance-3: calc(var(--bg-luminance-2) + ((100% - var(--bg-luminance-2)) / 3)); + --bg-chroma: 71%; + --bg-chroma-2: calc(var(--bg-chroma) + ((100% - var(--bg-chroma)) / 3)); --bg-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); --bg-color-2: lch(var(--bg-luminance-2) var(--bg-chroma) var(--hue-turn)); --bg-color-3: lch(var(--bg-luminance-3) var(--bg-chroma) var(--hue-turn)); @@ -47,8 +48,10 @@ border-block-start-width: var(--gutter-4); border-inline-end-width: 1px; border-block-end-width: 1px; + /* border-start-end-radius: 1rem; border-end-start-radius: 1rem; + */ } .BrowseWrapper details > summary::before { From b5b5c98b32744cf4565ae75f31cf2c8aec4ecf0d Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 21 Apr 2025 13:09:15 -0400 Subject: [PATCH 17/43] easier feature-flag flipping --- share/admin/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/share/admin/__init__.py b/share/admin/__init__.py index 1f21210e4..7d1328756 100644 --- a/share/admin/__init__.py +++ b/share/admin/__init__.py @@ -213,6 +213,7 @@ class FeatureFlagAdmin(admin.ModelAdmin): readonly_fields = ('name',) search_fields = ('name',) list_display = ('name', 'is_up', 'is_defined') + list_editable = ('is_up',) admin_site.register(AccessToken, AccessTokenAdmin) From 8082ed18f0ab187103c20bf43cfbcf865b87c019 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 21 Apr 2025 16:58:47 -0400 Subject: [PATCH 18/43] improved ingest_from_another_shtrove --- .../commands/ingest_from_another_shtrove.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/trove/management/commands/ingest_from_another_shtrove.py b/trove/management/commands/ingest_from_another_shtrove.py index 56e6eba7c..09ab22aa6 100644 --- a/trove/management/commands/ingest_from_another_shtrove.py +++ b/trove/management/commands/ingest_from_another_shtrove.py @@ -2,7 +2,6 @@ from itertools import islice import re from urllib.parse import urlunsplit -import uuid from django.conf import settings from django.core.management.base import BaseCommand @@ -37,27 +36,31 @@ def handle(self, *args, host, count, **options): def _iter_datums(self, host: str): _url = urlunsplit(('https', host, '/api/v2/rawdata/', '', '')) while _url: + self.stdout.write('fetching a page...') _json = requests.get(_url, headers={'Accept': mediatypes.JSONAPI}).json() for _item in _json['data']: yield _item['attributes']['datum'] _url = _json['links'].get('next') def _ingest(self, datum: str) -> bool: - _first_subject_match = re.search( - r'^<([^>\s]+)>', # HACK: depends on specific serialization - datum, - re.MULTILINE, - ) - if _first_subject_match: - _subject_iri = _first_subject_match.group(1) - digestive_tract.swallow( - from_user=self._application_user, - record=datum, - record_identifier=uuid.uuid4(), - record_mediatype=mediatypes.TURTLE, - focus_iri=_subject_iri, + # HACK: get only turtle files by checking it starts with a prefix (unreliable, generally, but good enough for this) + _smells_like_turtle = datum.startswith('@prefix ') or datum.startswith('PREFIX ') + if _smells_like_turtle: + _first_subject_match = re.search( + r'^<([^>\s]+)>', # HACK: depends on specific serialization + datum, + re.MULTILINE, ) - return True + if _first_subject_match: + _subject_iri = _first_subject_match.group(1) + digestive_tract.swallow( + from_user=self._application_user, + record=datum, + record_identifier=_subject_iri, + record_mediatype=mediatypes.TURTLE, + focus_iri=_subject_iri, + ) + return True return False @functools.cached_property From e8fbaf1c0e180d2ea36ec475c974290875a92ccd Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 21 Apr 2025 17:00:02 -0400 Subject: [PATCH 19/43] wip --- trove/render/html_browse.py | 61 +++++++++++++--------- trove/static/css/browse.css | 56 ++++++++++---------- trove/trovesearch/trovesearch_gathering.py | 7 ++- 3 files changed, 71 insertions(+), 53 deletions(-) diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index 9696c8d0e..59d0e97c8 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -1,8 +1,10 @@ +from collections.abc import Iterator import contextlib import dataclasses import datetime import math import random +import re from urllib.parse import quote, urlsplit, urlunsplit from xml.etree.ElementTree import ( Element, @@ -50,6 +52,8 @@ RDF.langString, )) +_QUERYPARAM_SPLIT_RE = re.compile(r'(?=[?&])') + _PHI = (math.sqrt(5) + 1) / 2 @@ -144,18 +148,16 @@ def __render_subj(self, subj_iri: str, *, start_collapsed=None): with self.__visiting(subj_iri): with self.__nest_card('article'): with self.__nest('header'): - _h_text, _also_texts = self.__iri_display_texts(subj_iri) - with self.__nest_h_tag(): - self.__leaf('dfn', text=_h_text, attrs={'id': quote(subj_iri)}) - for _also_text in _also_texts: - self.__leaf('code', text=_also_text) - for _label in self.__iri_thesaurus_labels(subj_iri): - self.__literal(_label) + _compact = self.iri_shorthand.compact_iri(subj_iri) + if (_compact != subj_iri): + with self.__nest_h_tag(attrs={'id': quote(subj_iri)}) as _h: + _h.text = _compact + self.__iri_display(subj_iri) if _twopledict: with self.__nest('details') as _details: _detail_depth = sum((_tag == 'details') for _tag in self.__nested_tags) _should_open = ( - _detail_depth < 4 + _detail_depth < 5 if start_collapsed is None else not start_collapsed ) @@ -167,9 +169,8 @@ def __render_subj(self, subj_iri: str, *, start_collapsed=None): def __twoples(self, twopledict: rdf.RdfTwopleDictionary): with self.__nest('dl', {'class': 'Browse__twopleset'}): for _pred, _obj_set in shuffled(twopledict.items()): - with self.__nest('dt'): - _pred_link = self.__compact_link(_pred) - _append_class(_pred_link, 'Browse__predicate') + with self.__nest('dt', attrs={'class': 'Browse__predicate'}): + self.__compact_link(_pred) for _text in self.__iri_thesaurus_labels(_pred): self.__literal(_text) with self.__nest('dd'): @@ -224,7 +225,7 @@ def __literal( def __sequence(self, sequence_twoples: frozenset): _obj_in_order = list(rdf.sequence_objects_in_order(sequence_twoples)) - with self.__nest('details', attrs={'open': ''}): + with self.__nest('details', attrs={'open': '', 'class': 'Browse__blanknode Browse__object'}): _text = _('sequence of %(count)s') % {'count': len(_obj_in_order)} self.__leaf('summary', text=_text) with self.__nest('ol'): # TODO: style? @@ -242,10 +243,12 @@ def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset): if isinstance(blanknode, dict) else rdf.twopledict_from_twopleset(blanknode) ) - with self.__nest('article', attrs={ + with self.__nest('details', attrs={ + 'open': '', 'class': 'Browse__blanknode Browse__object', 'style': self._hue_turn_css(), }): + self.__leaf('summary', text='(blank node)') self.__twoples(_twopledict) ### @@ -332,15 +335,17 @@ def __nest_card(self, tag: str): def __iri_thesaurus_labels(self, iri: str): # TODO: consider requested language + _labels: set[rdf.RdfObject] = set() _suffuniq = get_sufficiently_unique_iri(iri) _thesaurus_entry = combined_thesaurus__suffuniq().get(_suffuniq) if _thesaurus_entry: for _pred in _LINK_TEXT_PREDICATES: - yield from shuffled(_thesaurus_entry.get(_pred, ())) + _labels.update(_thesaurus_entry.get(_pred, ())) _twoples = self.__current_data.get(iri) if _twoples: for _pred in _LINK_TEXT_PREDICATES: - yield from shuffled(_twoples.get(_pred, ())) + _labels.update(_twoples.get(_pred, ())) + yield from shuffled(_labels) def _hue_turn_css(self): # return f'--hue: {random.random()}turn;' @@ -369,16 +374,24 @@ def _queryparam_href(self, param_name: str, param_value: str | None): _fragment, )) - def __iri_display_texts(self, iri: str) -> tuple[str, set[str]]: - _compact = self.iri_shorthand.compact_iri(iri) - _suffuniq = get_sufficiently_unique_iri(iri) - _main_display = ( - _compact - if (_compact != iri) - else _suffuniq + def __iri_display(self, iri: str) -> None: + self.__leaf('pre', text='\n'.join(self.__iri_lines(iri))) + for _label in self.__iri_thesaurus_labels(iri): + self.__literal(_label) + for _type_iri in self.__current_data.get(iri, {}).get(RDF.type, ()): + self.__compact_link(_type_iri) + + def __iri_lines(self, iri: str) -> Iterator[str]: + (_scheme, _netloc, _path, _query, _fragment) = urlsplit(iri) + yield ( + f'://{_netloc}{_path}' + if _netloc + else f'{_scheme}:{_path}' ) - _also_display = {iri, _compact} - {_main_display} - return (_main_display, _also_display) + if _query: + yield from filter(bool, _QUERYPARAM_SPLIT_RE.split(f'?{_query}')) + if _fragment: + yield f'#{_fragment}' def _append_class(el: Element, element_class: str): diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index 3a0585f2d..33d051090 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -1,22 +1,16 @@ :root { --phi: 1.618; - /* rotating colorspace (using lch with luminance and chroma held constant) */ - --hue-turn: var(--phi)turn; /* initial */ - /* background colors */ + /* rotating colorspace (using `lch` with luminance and chroma held locally constant) */ + --hue-turn: 0; /* initial */ --bg-luminance: 83%; - --bg-luminance-2: calc(var(--bg-luminance) + ((100% - var(--bg-luminance)) / 3)); - --bg-luminance-3: calc(var(--bg-luminance-2) + ((100% - var(--bg-luminance-2)) / 3)); - --bg-chroma: 71%; - --bg-chroma-2: calc(var(--bg-chroma) + ((100% - var(--bg-chroma)) / 3)); - --bg-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); - --bg-color-2: lch(var(--bg-luminance-2) var(--bg-chroma) var(--hue-turn)); - --bg-color-3: lch(var(--bg-luminance-3) var(--bg-chroma) var(--hue-turn)); + --bg-chroma: 19%; + --bg-color-initial: lch(83% 19% 1.618turn); /* gutter spaces (gaps, paddings, margins...) */ - --gutter-1: var(--phi)rem; - --gutter-2: calc(var(--phi)rem - 1); - --gutter-3: calc(var(--gutter-2) / 2); - --gutter-4: calc(var(--gutter-3) / 2); - --gutter-5: calc(var(--gutter-4) / 2); + --gutter-1: 1.618rem; + --gutter-2: 0.618rem; + --gutter-3: 0.309rem; + --gutter-4: 0.155rem; + --gutter-5: 0.077rem; } .BrowseWrapper { @@ -28,26 +22,29 @@ margin: 0; padding: 1rem; min-height: 100vh; - background-color: var(--bg-color); + background-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); } .BrowseWrapper dfn { font-weight: bold; } +.BrowseWrapper pre { + margin: 0; + background-color: lch(91% var(--bg-chroma) var(--hue-turn)); +} + .Browse__card { display: flex; flex-direction: column; padding: var(--gutter-2) var(--gutter-3); - background-color: var(--bg-color); - border-color: lch(from var(--bg-color) h s - calc(var(--bg-luminosity) * 0.618) - ); - border-style: inset; + background-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); + border-color: lch(59% var(--bg-chroma) var(--hue-turn)); + border-style: solid; border-inline-start-width: var(--gutter-3); border-block-start-width: var(--gutter-4); - border-inline-end-width: 1px; - border-block-end-width: 1px; + border-inline-end-width: 0; + border-block-end-width: 0; /* border-start-end-radius: 1rem; border-end-start-radius: 1rem; @@ -63,7 +60,7 @@ } .BrowseWrapper details[open] > summary::before { - rotate: var(--bg-turn); + rotate: var(--hue-turn); } .Browse__card > header { @@ -118,7 +115,12 @@ dl.Browse__twopleset > dd { .Browse__blanknode { padding: var(--gutter-4); - border: inset var(--gutter-4) rgba(0,0,0,0.382); + border-color: rgba(0,0,0,0.382); + border-style: solid; + border-inline-start-width: var(--gutter-3); + border-block-start-width: var(--gutter-4); + border-inline-end-width: 0; + border-block-end-width: 0; } .Browse__literal { @@ -137,11 +139,11 @@ dl.Browse__twopleset > dd { } .Browse__predicate { - background-color: var(--bg-color-2); + background-color: lch(from var(--bg-color-initial) 89% c var(--hue-turn)); padding: var(--gutter-4); } .Browse__object { - background-color: var(--bg-color-3); + background-color: lch(from var(--bg-color-initial) 93% c var(--hue-turn)); padding: var(--gutter-4); } diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 44486a5a9..d406b05b4 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -51,7 +51,7 @@ TROVE.Cardsearch, TROVE.Valuesearch, }, - param_iris={TROVE.deriverIRI}, + param_iris={TROVE.deriverIRI, TROVE.blendCards}, thesaurus=TROVE_API_THESAURUS, ) @@ -61,7 +61,10 @@ literal('trove search', language='en'), ), norms=TROVE_GATHERING_NORMS, - gatherer_params={'deriver_iri': TROVE.deriverIRI}, + gatherer_params={ + 'deriver_iri': TROVE.deriverIRI, + 'blend_cards': TROVE.blendCards, + }, ) From 68b0270fe503c71f2af5ed43c29000964003b94d Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 21 Apr 2025 19:22:50 -0400 Subject: [PATCH 20/43] wip --- trove/render/_base.py | 5 ++ trove/render/html_browse.py | 22 +++--- trove/render/jsonapi.py | 5 ++ trove/trovesearch/trovesearch_gathering.py | 87 ++++++++++++++-------- trove/util/trove_params.py | 4 + trove/views/_base.py | 19 +++-- trove/views/browse.py | 8 -- trove/views/indexcard.py | 51 ++++--------- trove/views/search.py | 2 +- 9 files changed, 110 insertions(+), 93 deletions(-) diff --git a/trove/render/_base.py b/trove/render/_base.py index 76b48cbbf..48cfe1cc8 100644 --- a/trove/render/_base.py +++ b/trove/render/_base.py @@ -34,6 +34,11 @@ class BaseRenderer(abc.ABC): iri_shorthand: rdf.IriShorthand = dataclasses.field(default_factory=namespaces_shorthand) thesaurus_tripledict: rdf.RdfTripleDictionary = dataclasses.field(default_factory=lambda: TROVE_API_THESAURUS) + @classmethod + def get_deriver_iri(cls, card_blending: bool): + # override if needed + return cls.INDEXCARD_DERIVER_IRI + @functools.cached_property def thesaurus(self): return rdf.RdfGraph(self.thesaurus_tripledict) diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index 59d0e97c8..3dcf6b758 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -122,11 +122,11 @@ def __blender_toggle_card(self): if self.is_data_blended: _header_text = _('card-blending ON') _link_text = _('disable card-blending') - _link_blend = '0' # blendCards=0 + _link_blend: str | None = None # remove blendCards param (defaults false) else: _header_text = _('card-blending OFF') _link_text = _('enable card-blending') - _link_blend = None # remove blendCards param (defaults true) + _link_blend = '1' # blendCards=1 self.__leaf('header', text=_header_text) self.__leaf('a', text=_link_text, attrs={ 'href': self._queryparam_href('blendCards', _link_blend), @@ -149,21 +149,26 @@ def __render_subj(self, subj_iri: str, *, start_collapsed=None): with self.__nest_card('article'): with self.__nest('header'): _compact = self.iri_shorthand.compact_iri(subj_iri) - if (_compact != subj_iri): - with self.__nest_h_tag(attrs={'id': quote(subj_iri)}) as _h: + _is_compactable = (_compact != subj_iri) + with self.__nest_h_tag(attrs={'id': quote(subj_iri)}) as _h: + if _is_compactable: _h.text = _compact - self.__iri_display(subj_iri) + else: + self.__leaf('pre', text='\n'.join(self.__iri_lines(subj_iri))) + if _is_compactable: + self.__leaf('pre', text='\n'.join(self.__iri_lines(subj_iri))) + self.__iri_subheaders(subj_iri) if _twopledict: with self.__nest('details') as _details: _detail_depth = sum((_tag == 'details') for _tag in self.__nested_tags) _should_open = ( - _detail_depth < 5 + _detail_depth < 3 if start_collapsed is None else not start_collapsed ) if _should_open: _details.set('open', '') - self.__leaf('summary', text=_('details...')) + self.__leaf('summary', text=_('more details...')) self.__twoples(_twopledict) def __twoples(self, twopledict: rdf.RdfTwopleDictionary): @@ -374,8 +379,7 @@ def _queryparam_href(self, param_name: str, param_value: str | None): _fragment, )) - def __iri_display(self, iri: str) -> None: - self.__leaf('pre', text='\n'.join(self.__iri_lines(iri))) + def __iri_subheaders(self, iri: str) -> None: for _label in self.__iri_thesaurus_labels(iri): self.__literal(_label) for _type_iri in self.__current_data.get(iri, {}).get(RDF.type, ()): diff --git a/trove/render/jsonapi.py b/trove/render/jsonapi.py index a85d9dd26..6337e7edc 100644 --- a/trove/render/jsonapi.py +++ b/trove/render/jsonapi.py @@ -77,6 +77,11 @@ class RdfJsonapiRenderer(BaseRenderer): repr=False, ) + # override BaseRenderer + @classmethod + def get_deriver_iri(cls, card_blending: bool): + return (None if card_blending else super().get_deriver_iri(card_blending)) + def simple_render_document(self) -> str: return json.dumps( self.render_dict(self.response_focus.single_iri()), diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index d406b05b4..c75d8ce55 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -146,7 +146,7 @@ def gather_count(focus: CardsearchFocus, **kwargs): focustype_iris={TROVE.Cardsearch}, cache_bound=1, # only the first page gets cached ) -def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): +def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, blend_cards, **kwargs): # each searchResultPage a sequence of search results _current_handle: CardsearchHandle | None = focus.search_handle while _current_handle is not None: @@ -159,36 +159,65 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): _card_focus = _card_foci.get(_result.card_iri) if _card_focus is None: continue # skip (deleted card still indexed?) - _text_evidence_twoples = ( - (TROVE.matchEvidence, frozenset(( - (RDF.type, TROVE.TextMatchEvidence), - (TROVE.matchingHighlight, _evidence.matching_highlight), - (TROVE.evidenceCardIdentifier, literal(_evidence.card_iri)), - *_single_propertypath_twoples(_evidence.property_path), - ))) - for _evidence in _result.text_match_evidence + _result_obj, _triples = ( + _blended_card(_card_focus) + if blend_cards + else _unblended_card(_result, _card_focus) ) - _result_page.append(frozenset(( - (RDF.type, TROVE.SearchResult), - (TROVE.indexCard, _result.card_iri), - *_text_evidence_twoples, - ))) - # hack around (current) limitations of primitive_metadata.gather - # (what with all these intermediate blank nodes and sequences): - # yield trove:resourceMetadata here (instead of another gatherer) - _card_twoples = _minimal_indexcard_twoples( - focus_identifiers=[ - _identifier.as_iri() - for _identifier in _card_focus.indexcard.focus_identifier_set.all() - ], - resource_metadata=_card_focus.resourceMetadata, - ) - for _pred, _obj in _card_twoples: - yield (_result.card_iri, _pred, _obj) + _result_page.append(_result_obj) + yield from _triples yield (TROVE.searchResultPage, sequence(_result_page)) _current_handle = _current_handle.get_next_streaming_handle() +def _blended_card(card_focus) -> tuple[rdf.RdfObject, Iterable[rdf.RdfTriple]]: + _metadata = card_focus.resourceMetadata + if isinstance(_metadata, rdf.Literal): + return (_metadata, ()) + if isinstance(_metadata, rdf.QuotedGraph): + return (_metadata.focus_iri, rdf.iter_tripleset(_metadata.tripledict)) + return (card_focus.single_iri(), ()) # oh well + + +def _unblended_card(_result, _card_focus) -> tuple[rdf.RdfObject, Iterable[rdf.RdfTriple]]: + return ( + _unblended_cardsearch_result(_result), + _unblended_card_triples(_result, _card_focus), + ) + + +def _unblended_cardsearch_result(_result) -> rdf.RdfBlanknode: + _text_evidence_twoples = ( + (TROVE.matchEvidence, frozenset(( + (RDF.type, TROVE.TextMatchEvidence), + (TROVE.matchingHighlight, _evidence.matching_highlight), + (TROVE.evidenceCardIdentifier, literal(_evidence.card_iri)), + *_single_propertypath_twoples(_evidence.property_path), + ))) + for _evidence in _result.text_match_evidence + ) + return frozenset(( + (RDF.type, TROVE.SearchResult), + (TROVE.indexCard, _result.card_iri), + *_text_evidence_twoples, + )) + + +def _unblended_card_triples(_result, _card_focus) -> Iterator[rdf.RdfTriple]: + # hack around (current) limitations of primitive_metadata.gather + # (what with all these intermediate blank nodes and sequences): + # yield trove:resourceMetadata here (instead of another gatherer) + _card_twoples = _unblended_indexcard_twoples( + focus_identifiers=[ + _identifier.as_iri() + for _identifier in _card_focus.indexcard.focus_identifier_set.all() + ], + resource_metadata=_card_focus.resourceMetadata, + ) + for _pred, _obj in _card_twoples: + yield (_result.card_iri, _pred, _obj) + + @trovesearch_by_indexstrategy.gatherer(TROVE.searchResultPage) def gather_page_links(focus, **kwargs): # links to more pages of results @@ -251,7 +280,7 @@ def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs): # hack around (current) limitations of primitive_metadata.gather # (what with all these intermediate blank nodes and sequences): # yield trove:resourceMetadata here (instead of another gatherer) - _card_twoples = _minimal_indexcard_twoples( + _card_twoples = _unblended_indexcard_twoples( focus_identifiers=[ _identifier.as_iri() for _identifier in _card_focus.indexcard.focus_identifier_set.all() @@ -456,7 +485,7 @@ def _valuesearch_result_as_json(result: ValuesearchResult) -> Literal: ) -def _minimal_indexcard_twoples( +def _unblended_indexcard_twoples( focus_identifiers: Iterable[str], resource_metadata: rdf.Literal, ) -> Iterator[rdf.RdfTwople]: @@ -471,7 +500,7 @@ def _minimal_indexcard_twoples( def _valuesearch_result_as_indexcard_blanknode(result: ValuesearchResult) -> frozenset: - return frozenset(_minimal_indexcard_twoples( + return frozenset(_unblended_indexcard_twoples( focus_identifiers=[literal(result.value_iri or result.value_value)], resource_metadata=_valuesearch_result_as_json(result), )) diff --git a/trove/util/trove_params.py b/trove/util/trove_params.py index 46c8d9119..92e5ed2da 100644 --- a/trove/util/trove_params.py +++ b/trove/util/trove_params.py @@ -27,6 +27,7 @@ class BasicTroveParams: accept_mediatype: str | None included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) attrpaths_by_type: Mapping[str, PropertypathSet] = dataclasses.field(repr=False, compare=False) + blend_cards: bool ### # class methods @@ -48,6 +49,7 @@ def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: 'included_relations': cls._gather_included_relations(queryparams, _shorthand), 'attrpaths_by_type': cls._gather_attrpaths(queryparams, _shorthand), 'accept_mediatype': _qp.get_single_value(queryparams, 'acceptMediatype'), + 'blend_cards': _qp.get_bool_value(queryparams, 'blendCards'), } @classmethod @@ -126,5 +128,7 @@ def to_querydict(self) -> QueryDict: _querydict = QueryDict(mutable=True) if self.accept_mediatype: _querydict['acceptMediatype'] = self.accept_mediatype + if self.blend_cards: + _querydict['blendCards'] = '' # TODO: iriShorthand, include, fields[...] return _querydict diff --git a/trove/views/_base.py b/trove/views/_base.py index e94c1a783..5eb6b6ab5 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -34,10 +34,10 @@ class BaseTroveView(View, abc.ABC): params_type: ClassVar[type[BasicTroveParams]] = BasicTroveParams @abc.abstractmethod - def _render_response_content(self, request, params, renderer_type: type[BaseRenderer]) -> ProtoRendering: + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs) -> ProtoRendering: raise NotImplementedError - def get(self, request): + def get(self, request, **kwargs): try: _renderer_type = get_renderer_type(request) except trove_exceptions.CannotRenderMediatype as _error: @@ -48,7 +48,7 @@ def get(self, request): try: _params = self._parse_params(request) return make_http_response( - content_rendering=self._render_response_content(request, _params, _renderer_type), + content_rendering=self._render_response_content(request, _params, _renderer_type, kwargs), http_request=request, ) except trove_exceptions.TroveError as _error: @@ -67,8 +67,8 @@ class GatheredTroveView(BaseTroveView, abc.ABC): # optional ClassVars: focus_type_iris: ClassVar[Container[str]] = () - def _render_response_content(self, request, params, renderer_type: type[BaseRenderer]): - _focus = self._build_focus(request, params) + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs): + _focus = self._build_focus(request, params, url_kwargs) _renderer = self._gather_to_renderer(_focus, params, renderer_type) return _renderer.render_document() @@ -81,7 +81,7 @@ def _gather_to_renderer(self, focus, params, renderer_type: type[BaseRenderer]) def _get_focus_iri(self, request, params): return request.build_absolute_uri() - def _build_focus(self, request, params): + def _build_focus(self, request, params, url_kwargs): return gather.Focus.new(self._get_focus_iri(request, params), self.focus_type_iris) def _build_gathering(self, params, renderer_type: type[BaseRenderer]) -> gather.Gathering: @@ -93,7 +93,10 @@ def _get_gatherer_kwargs(self, params, renderer_type): _kwargs = {} _deriver_kw = _get_param_keyword(TROVE.deriverIRI, self.gathering_organizer) if _deriver_kw: - _kwargs[_deriver_kw] = renderer_type.INDEXCARD_DERIVER_IRI + _kwargs[_deriver_kw] = renderer_type.get_deriver_iri(params.blend_cards) + _blend_kw = _get_param_keyword(TROVE.blendCards, self.gathering_organizer) + if _blend_kw: + _kwargs[_blend_kw] = params.blend_cards return _kwargs @@ -111,7 +114,7 @@ def get_static_triples(cls, focus_iri: str) -> rdf.RdfTripleDictionary: def get_focus_iri(cls) -> str: raise NotImplementedError(f'implement get_focus_iri on {cls}') - def _render_response_content(self, request, params, renderer_type: type[BaseRenderer]): + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs): _focus_iri = self.get_focus_iri() _triples = self.get_static_triples(_focus_iri) _focus = gather.Focus.new( diff --git a/trove/views/browse.py b/trove/views/browse.py index 21e91a245..e7d019464 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -18,7 +18,6 @@ @dataclasses.dataclass(frozen=True) class BrowseParams(BasicTroveParams): iri: str - blend_cards: bool @classmethod def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: @@ -28,7 +27,6 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: return { **super().parse_queryparams(queryparams), 'iri': cls._parse_iri(_iri_value), - 'blend_cards': get_bool_value(queryparams, 'blendCards', if_absent=True), } @classmethod @@ -57,9 +55,3 @@ class BrowseIriView(GatheredTroveView): def _get_focus_iri(self, request, params: BrowseParams): # override GatheredTroveView return params.iri - - def _get_gatherer_kwargs(self, params, renderer_type): # override GatheredTroveView - return { - **super()._get_gatherer_kwargs(params, renderer_type), - 'blend_cards': params.blend_cards, - } diff --git a/trove/views/indexcard.py b/trove/views/indexcard.py index 208a15f85..158102f80 100644 --- a/trove/views/indexcard.py +++ b/trove/views/indexcard.py @@ -1,50 +1,25 @@ -from django.views import View +from django.http import Http404 -from trove import exceptions as trove_exceptions from trove import models as trove_db -from trove.render import ( - DEFAULT_RENDERER_TYPE, - get_renderer_type, -) from trove.trovesearch.search_params import IndexcardParams from trove.trovesearch.trovesearch_gathering import ( trovesearch_by_indexstrategy, IndexcardFocus, ) from trove.vocab.trove import trove_indexcard_iri -from ._gather_ask import ask_gathering_from_params -from ._responder import ( - make_http_error_response, - make_http_response, -) +from ._base import GatheredTroveView + +class IndexcardView(GatheredTroveView): + params_type = IndexcardParams + gathering_organizer = trovesearch_by_indexstrategy -class IndexcardView(View): - def get(self, request, indexcard_uuid): + def _build_focus(self, request, params, url_kwargs): try: - _renderer_type = get_renderer_type(request) - _gathering = trovesearch_by_indexstrategy.new_gathering({ - 'deriver_iri': _renderer_type.INDEXCARD_DERIVER_IRI, - }) - _indexcard_iri = trove_indexcard_iri(indexcard_uuid) - _params = IndexcardParams.from_querystring(request.META['QUERY_STRING']) - _focus = IndexcardFocus.new( - iris=_indexcard_iri, - indexcard=trove_db.Indexcard.objects.get_for_iri(_indexcard_iri), - ) - ask_gathering_from_params(_gathering, _params, _focus) - _renderer = _renderer_type(_focus, _gathering) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) - except trove_exceptions.CannotRenderMediatype as _error: - return make_http_error_response( - error=_error, - renderer_type=DEFAULT_RENDERER_TYPE, - ) - except trove_exceptions.TroveError as _error: - return make_http_error_response( - error=_error, - renderer_type=_renderer_type, + _indexcard_uuid = url_kwargs['indexcard_uuid'] + return IndexcardFocus.new( + iris=trove_indexcard_iri(_indexcard_uuid), + indexcard=trove_db.Indexcard.objects.get(uuid=_indexcard_uuid), ) + except trove_db.Indexcard.DoesNotExist: + raise Http404 diff --git a/trove/views/search.py b/trove/views/search.py index 994d7db48..f84f50623 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -30,7 +30,7 @@ class _BaseTrovesearchView(GatheredTroveView, abc.ABC): gathering_organizer = trovesearch_by_indexstrategy # for GatheredTroveView - def _build_focus(self, request, params): # override GatheredTroveView + def _build_focus(self, request, params, url_kwargs): # override GatheredTroveView _strategy = index_strategy.get_strategy_for_trovesearch(params) return self.focus_type.new( iris=self._get_focus_iri(request, params), From 1a7cadbada674b4f9944bf6671fc3d086f79c556 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 22 Apr 2025 10:04:20 -0400 Subject: [PATCH 21/43] wip --- trove/trovesearch/trovesearch_gathering.py | 2 +- trove/views/browse.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index c75d8ce55..0d2fcb719 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -253,7 +253,7 @@ def gather_cardsearch_filter(focus, **kwargs): TROVE.searchResultPage, focustype_iris={TROVE.Valuesearch}, ) -def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs): +def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, blend_cards, **kwargs): _result_page = [] _value_iris = { _result.value_iri diff --git a/trove/views/browse.py b/trove/views/browse.py index e7d019464..6739b53d7 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -10,7 +10,6 @@ from trove.util.queryparams import ( QueryparamDict, get_single_value, - get_bool_value, ) from ._base import GatheredTroveView From 14b7e326839ac0a51e30f9b7d0ec5674d5370be5 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 22 Apr 2025 15:33:27 -0400 Subject: [PATCH 22/43] wip --- trove/render/_html.py | 67 +++++++++++ trove/render/html_browse.py | 225 +++++++++++++++++------------------- trove/static/css/browse.css | 6 +- trove/views/_base.py | 16 +-- trove/views/shtrove_root.py | 18 ++- 5 files changed, 199 insertions(+), 133 deletions(-) create mode 100644 trove/render/_html.py diff --git a/trove/render/_html.py b/trove/render/_html.py new file mode 100644 index 000000000..45f775880 --- /dev/null +++ b/trove/render/_html.py @@ -0,0 +1,67 @@ +from __future__ import annotations +import contextlib +import dataclasses +from xml.etree.ElementTree import ( + Element, + SubElement, +) + +from primitive_metadata import primitive_rdf as rdf + + +__all__ = ('HtmlBuilder',) + + +@dataclasses.dataclass +class HtmlBuilder: + given_root: Element + _: dataclasses.KW_ONLY + _nested_elements: list[Element] = dataclasses.field(default_factory=list) + _heading_depth: int = 0 + + def __post_init__(self): + self._nested_elements.append(self.given_root) + + @property + def root_element(self) -> Element: + return self._nested_elements[0] + + @property + def _current_element(self) -> Element: + return self._nested_elements[-1] + + ### + # html-building helper methods + + @contextlib.contextmanager + def nest_h_tag(self, **kwargs): + _outer_heading_depth = self._heading_depth + if not _outer_heading_depth: + self._heading_depth = 1 + elif _outer_heading_depth < 6: # h6 deepest + self._heading_depth += 1 + _h_tag = f'h{self._heading_depth}' + with self.nest(_h_tag, **kwargs) as _nested: + try: + yield _nested + finally: + self._heading_depth = _outer_heading_depth + + @contextlib.contextmanager + def nest(self, tag_name, attrs=None): + _attrs = {**attrs} if attrs else {} + _nested_element = SubElement(self._current_element, tag_name, _attrs) + self._nested_elements.append(_nested_element) + try: + yield self._current_element + finally: + _popped_element = self._nested_elements.pop() + assert _popped_element is _nested_element + + def leaf(self, tag_name, *, text=None, attrs=None): + _leaf_element = SubElement(self._current_element, tag_name, attrs or {}) + if isinstance(text, rdf.Literal): + # TODO: lang + _leaf_element.text = text.unicode_value + elif text is not None: + _leaf_element.text = text diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index 3dcf6b758..9fef803dd 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -5,10 +5,10 @@ import math import random import re +from typing import ClassVar from urllib.parse import quote, urlsplit, urlunsplit from xml.etree.ElementTree import ( Element, - SubElement, tostring as etree_tostring, fromstring as etree_fromstring, ) @@ -28,6 +28,7 @@ from trove.vocab.static_vocab import combined_thesaurus__suffuniq from trove.vocab.trove import trove_browse_link from ._base import BaseRenderer +from ._html import HtmlBuilder STABLE_MEDIATYPES = (mediatypes.JSONAPI,) UNSTABLE_MEDIATYPES = ( @@ -56,69 +57,75 @@ _PHI = (math.sqrt(5) + 1) / 2 - -@dataclasses.dataclass -class RdfHtmlBrowseRenderer(BaseRenderer): - MEDIATYPE = 'text/html; charset=utf-8' - - def simple_render_document(self) -> str: - _html_builder = _HtmlBuilder( - all_data=self.response_tripledict, - focus_iri=self.response_focus.single_iri(), - iri_shorthand=self.iri_shorthand, - is_data_blended=self.response_gathering.gatherer_kwargs.get('blend_cards'), - ) - _html_str = etree_tostring(_html_builder.html_element, encoding='unicode', method='html') - return ''.join(( - '', # TODO: can etree put the doctype in? - _html_str, - )) +_HTML_DOCTYPE = '' @dataclasses.dataclass -class _HtmlBuilder: - all_data: rdf.RdfTripleDictionary - focus_iri: str - iri_shorthand: rdf.IriShorthand - is_data_blended: bool | None = None - html_element: Element = dataclasses.field(init=False) +class RdfHtmlBrowseRenderer(BaseRenderer): + MEDIATYPE: ClassVar[str] = 'text/html; charset=utf-8' __current_data: rdf.RdfTripleDictionary = dataclasses.field(init=False) - __current_element: Element = dataclasses.field(init=False) __visiting_iris: set[str] = dataclasses.field(init=False) - __heading_depth: int = 0 + __hb: HtmlBuilder = dataclasses.field(init=False) __last_hue_turn: float = dataclasses.field(default_factory=random.random) - __nested_tags: list[str] = dataclasses.field(default_factory=list) def __post_init__(self): # TODO: lang (according to request -- also translate) - self.html_element = self.__current_element = Element('html') - self.__current_data = self.all_data + self.__current_data = self.response_tripledict self.__visiting_iris = set() - with self.__nest('head'): - self.__leaf('link', attrs={ - 'rel': 'stylesheet', - 'href': staticfiles_storage.url('css/browse.css'), - }) + + @property + def is_data_blended(self) -> bool | None: + return self.response_gathering.gatherer_kwargs.get('blend_cards') + + # override BaseRenderer + def simple_render_document(self) -> str: + self.__hb = HtmlBuilder(Element('html')) + self.render_html_head() _body_attrs = { 'class': 'BrowseWrapper', 'style': self._hue_turn_css(), } - with self.__nest('body', attrs=_body_attrs): - self.__render_subj(self.focus_iri), + with self.__hb.nest('body', attrs=_body_attrs): + self.render_nav() + self.render_main() + self.render_footer() + return '\n'.join(( + _HTML_DOCTYPE, + etree_tostring(self.__hb.root_element, encoding='unicode', method='html'), + )) + + def render_html_head(self): + with self.__hb.nest('head'): + self.__hb.leaf('link', attrs={ + 'rel': 'stylesheet', + 'href': staticfiles_storage.url('css/browse.css'), + }) + + def render_nav(self): + with self.__hb.nest('nav'): self.__alternate_mediatypes_card() if self.is_data_blended is not None: self.__blender_toggle_card() - # TODO:
with unvisited triples in self.data (unreachable from focus_iri) + + def render_main(self): + with self.__hb.nest('main'): + for _iri in self.response_focus.iris: + self.__render_subj(_iri) + # TODO: show additional unvisited triples? + + def render_footer(self): + with self.__hb.nest('footer'): + ... def __alternate_mediatypes_card(self): - with self.__nest_card('nav'): - self.__leaf('header', text=_('alternate mediatypes')) + with self.__nest_card('details'): + self.__hb.leaf('summary', text=_('alternate mediatypes')) for _mediatype in shuffled((*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES)): - with self.__nest('span', attrs={'class': 'Browse__literal'}): + with self.__hb.nest('span', attrs={'class': 'Browse__literal'}): self.__mediatype_link(_mediatype) def __blender_toggle_card(self): - with self.__nest_card('nav'): + with self.__nest_card('details'): if self.is_data_blended: _header_text = _('card-blending ON') _link_text = _('disable card-blending') @@ -127,40 +134,46 @@ def __blender_toggle_card(self): _header_text = _('card-blending OFF') _link_text = _('enable card-blending') _link_blend = '1' # blendCards=1 - self.__leaf('header', text=_header_text) - self.__leaf('a', text=_link_text, attrs={ + self.__hb.leaf('summary', text=_header_text) + self.__hb.leaf('a', text=_link_text, attrs={ 'href': self._queryparam_href('blendCards', _link_blend), }) def __mediatype_link(self, mediatype: str): - self.__leaf('a', text=mediatype, attrs={ + self.__hb.leaf('a', text=mediatype, attrs={ 'href': self._queryparam_href('acceptMediatype', mediatype), }) if mediatype in UNSTABLE_MEDIATYPES: - self.__leaf('aside', text=_('(unstable)')) + self.__hb.leaf('aside', text=_('(unstable)')) if mediatype in STABLE_MEDIATYPES: - with self.__nest('aside'): - with self.__nest('a', attrs={'href': reverse('trove:docs')}) as _link: + with self.__hb.nest('aside'): + with self.__hb.nest('a', attrs={'href': reverse('trove:docs')}) as _link: _link.text = _('(stable for documented use)') def __render_subj(self, subj_iri: str, *, start_collapsed=None): _twopledict = self.__current_data.get(subj_iri, {}) with self.__visiting(subj_iri): with self.__nest_card('article'): - with self.__nest('header'): + with self.__hb.nest('header'): _compact = self.iri_shorthand.compact_iri(subj_iri) _is_compactable = (_compact != subj_iri) - with self.__nest_h_tag(attrs={'id': quote(subj_iri)}) as _h: - if _is_compactable: - _h.text = _compact + _should_link = (subj_iri not in self.response_focus.iris) + with self.__hb.nest_h_tag(attrs={'id': quote(subj_iri)}) as _h: + if _should_link: + with self.__nest_link(subj_iri) as _link: + if _is_compactable: + _link.text = _compact + else: + self.__split_iri_pre(subj_iri) else: - self.__leaf('pre', text='\n'.join(self.__iri_lines(subj_iri))) - if _is_compactable: - self.__leaf('pre', text='\n'.join(self.__iri_lines(subj_iri))) + if _is_compactable: + _h.text = _compact + else: + self.__split_iri_pre(subj_iri) self.__iri_subheaders(subj_iri) if _twopledict: - with self.__nest('details') as _details: - _detail_depth = sum((_tag == 'details') for _tag in self.__nested_tags) + with self.__hb.nest('details') as _details: + _detail_depth = sum((_el.tag == 'details') for _el in self.__hb._nested_elements) _should_open = ( _detail_depth < 3 if start_collapsed is None @@ -168,17 +181,17 @@ def __render_subj(self, subj_iri: str, *, start_collapsed=None): ) if _should_open: _details.set('open', '') - self.__leaf('summary', text=_('more details...')) + self.__hb.leaf('summary', text=_('more details...')) self.__twoples(_twopledict) def __twoples(self, twopledict: rdf.RdfTwopleDictionary): - with self.__nest('dl', {'class': 'Browse__twopleset'}): + with self.__hb.nest('dl', {'class': 'Browse__twopleset'}): for _pred, _obj_set in shuffled(twopledict.items()): - with self.__nest('dt', attrs={'class': 'Browse__predicate'}): + with self.__hb.nest('dt', attrs={'class': 'Browse__predicate'}): self.__compact_link(_pred) for _text in self.__iri_thesaurus_labels(_pred): self.__literal(_text) - with self.__nest('dd'): + with self.__hb.nest('dd'): for _obj in shuffled(_obj_set): self.__obj(_obj) @@ -188,7 +201,7 @@ def __obj(self, obj: rdf.RdfObject): if (obj in self.__current_data) and (obj not in self.__visiting_iris): self.__render_subj(obj) else: - with self.__nest('article', attrs={'class': 'Browse__object'}): + with self.__hb.nest('article', attrs={'class': 'Browse__object'}): self.__iri_link_and_labels(obj) elif isinstance(obj, frozenset): # blanknode if (RDF.type, RDF.Seq) in obj: @@ -218,24 +231,24 @@ def __literal( if is_rdf_object: _element_classes.append('Browse__object') # TODO: checksum_iri, literal_iri - with self.__nest('article', attrs={'class': ' '.join(_element_classes)}): + with self.__hb.nest('article', attrs={'class': ' '.join(_element_classes)}): for _datatype_iri in _lit.datatype_iris.difference(_IMPLICIT_DATATYPES): self.__compact_link(_datatype_iri) if _is_markdown: # TODO: tests for safe_mode _html = markdown2.markdown(_lit.unicode_value, safe_mode='escape') - self.__current_element.append(etree_fromstring(f'{_html}')) + self.__hb._current_element.append(etree_fromstring(f'{_html}')) else: - self.__leaf('q', text=_lit) + self.__hb.leaf('q', text=_lit) def __sequence(self, sequence_twoples: frozenset): _obj_in_order = list(rdf.sequence_objects_in_order(sequence_twoples)) - with self.__nest('details', attrs={'open': '', 'class': 'Browse__blanknode Browse__object'}): + with self.__hb.nest('details', attrs={'open': '', 'class': 'Browse__blanknode Browse__object'}): _text = _('sequence of %(count)s') % {'count': len(_obj_in_order)} - self.__leaf('summary', text=_text) - with self.__nest('ol'): # TODO: style? + self.__hb.leaf('summary', text=_text) + with self.__hb.nest('ol'): # TODO: style? for _seq_obj in _obj_in_order: - with self.__nest('li'): # , visible=True): + with self.__hb.nest('li'): # , visible=True): self.__obj(_seq_obj) def __quoted_graph(self, quoted_graph: rdf.QuotedGraph): @@ -248,16 +261,16 @@ def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset): if isinstance(blanknode, dict) else rdf.twopledict_from_twopleset(blanknode) ) - with self.__nest('details', attrs={ + with self.__hb.nest('details', attrs={ 'open': '', 'class': 'Browse__blanknode Browse__object', 'style': self._hue_turn_css(), }): - self.__leaf('summary', text='(blank node)') + self.__hb.leaf('summary', text='(blank node)') self.__twoples(_twopledict) - ### - # private html-building helpers + def __split_iri_pre(self, iri: str): + self.__hb.leaf('pre', text='\n'.join(self.__iri_lines(iri))) @contextlib.contextmanager def __visiting(self, iri: str): @@ -268,20 +281,6 @@ def __visiting(self, iri: str): finally: self.__visiting_iris.remove(iri) - @contextlib.contextmanager - def __nest_h_tag(self, **kwargs): - _outer_heading_depth = self.__heading_depth - if not _outer_heading_depth: - self.__heading_depth = 1 - elif _outer_heading_depth < 6: # h6 deepest - self.__heading_depth += 1 - _h_tag = f'h{self.__heading_depth}' - with self.__nest(_h_tag, **kwargs) as _nested: - try: - yield _nested - finally: - self.__heading_depth = _outer_heading_depth - @contextlib.contextmanager def __quoted_data(self, quoted_data: dict): _outer_data = self.__current_data @@ -294,43 +293,26 @@ def __quoted_data(self, quoted_data: dict): self.__current_data = _outer_data self.__visiting_iris = _outer_visiting_iris - @contextlib.contextmanager - def __nest(self, tag_name, attrs=None): - _attrs = {**attrs} if attrs else {} - _parent_element = self.__current_element - self.__current_element = SubElement(_parent_element, tag_name, _attrs) - self.__nested_tags.append(tag_name) - try: - yield self.__current_element - finally: - self.__nested_tags.pop() - self.__current_element = _parent_element - - def __leaf(self, tag_name, *, text=None, attrs=None): - _leaf_element = SubElement(self.__current_element, tag_name, attrs or {}) - if isinstance(text, rdf.Literal): - # TODO: lang - _leaf_element.text = text.unicode_value - elif text is not None: - _leaf_element.text = text - def __iri_link_and_labels(self, iri: str): self.__compact_link(iri) for _text in self.__iri_thesaurus_labels(iri): self.__literal(_text) - def __compact_link(self, iri: str): + def __nest_link(self, iri: str): _href = ( iri - if iri.startswith(settings.SHARE_WEB_URL) + if _is_local_url(iri) else trove_browse_link(iri) ) - with self.__nest('a', attrs={'href': _href}) as _a: + return self.__hb.nest('a', attrs={'href': _href}) + + def __compact_link(self, iri: str): + with self.__nest_link(iri) as _a: _a.text = self.iri_shorthand.compact_iri(iri) return _a def __nest_card(self, tag: str): - return self.__nest( + return self.__hb.nest( tag, attrs={ 'class': 'Browse__card', @@ -350,17 +332,16 @@ def __iri_thesaurus_labels(self, iri: str): if _twoples: for _pred in _LINK_TEXT_PREDICATES: _labels.update(_twoples.get(_pred, ())) - yield from shuffled(_labels) + return shuffled(_labels) def _hue_turn_css(self): - # return f'--hue: {random.random()}turn;' _hue_turn = (self.__last_hue_turn + _PHI) % 1.0 self.__last_hue_turn = _hue_turn return f'--hue-turn: {_hue_turn}turn;' def _queryparam_href(self, param_name: str, param_value: str | None): - _base_url = self.focus_iri - if not _base_url.startswith(settings.SHARE_WEB_URL): + _base_url = self.response_focus.single_iri() + if not _is_local_url(_base_url): _base_url = trove_browse_link(_base_url) (_scheme, _netloc, _path, _query, _fragment) = urlsplit(_base_url) _qparams = QueryDict(_query, mutable=True) @@ -380,10 +361,14 @@ def _queryparam_href(self, param_name: str, param_value: str | None): )) def __iri_subheaders(self, iri: str) -> None: - for _label in self.__iri_thesaurus_labels(iri): - self.__literal(_label) - for _type_iri in self.__current_data.get(iri, {}).get(RDF.type, ()): - self.__compact_link(_type_iri) + _type_iris = self.__current_data.get(iri, {}).get(RDF.type, ()) + if _type_iris: + for _type_iri in _type_iris: + self.__compact_link(_type_iri) + _labels = self.__iri_thesaurus_labels(iri) + if _labels: + for _label in _labels: + self.__literal(_label) def __iri_lines(self, iri: str) -> Iterator[str]: (_scheme, _netloc, _path, _query, _fragment) = urlsplit(iri) @@ -403,3 +388,7 @@ def _append_class(el: Element, element_class: str): 'class', ' '.join(filter(None, (element_class, el.get('class')))), ) + + +def _is_local_url(iri: str) -> bool: + return iri.startswith(settings.SHARE_WEB_URL) diff --git a/trove/static/css/browse.css b/trove/static/css/browse.css index 33d051090..643bcfcf2 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -31,7 +31,6 @@ .BrowseWrapper pre { margin: 0; - background-color: lch(91% var(--bg-chroma) var(--hue-turn)); } .Browse__card { @@ -65,13 +64,14 @@ .Browse__card > header { display: flex; - flex-direction: column; + flex-direction: row; + gap: var(--gutter-2); + align-items: baseline; border-bottom: solid 1px rgba(0,0,0,0.382); margin-bottom: var(--gutter-3); } .Browse__card > header > :first-child { - align-self: stretch; margin: 0; } diff --git a/trove/views/_base.py b/trove/views/_base.py index 5eb6b6ab5..e2cd48f48 100644 --- a/trove/views/_base.py +++ b/trove/views/_base.py @@ -102,21 +102,23 @@ def _get_gatherer_kwargs(self, params, renderer_type): class StaticTroveView(BaseTroveView, abc.ABC): @classmethod - def get_static_twoples(cls) -> rdf.RdfTripleDictionary: - raise NotImplementedError(f'implement either get_static_triples or get_static_twoples on {cls}') + @abc.abstractmethod + def get_focus_iri(cls) -> str: + raise NotImplementedError @classmethod - @functools.cache + @abc.abstractmethod def get_static_triples(cls, focus_iri: str) -> rdf.RdfTripleDictionary: - return {focus_iri: cls.get_static_twoples()} + raise NotImplementedError @classmethod - def get_focus_iri(cls) -> str: - raise NotImplementedError(f'implement get_focus_iri on {cls}') + @functools.cache + def cached_static_triples(cls, focus_iri): + return cls.get_static_triples(focus_iri) def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs): _focus_iri = self.get_focus_iri() - _triples = self.get_static_triples(_focus_iri) + _triples = self.cached_static_triples(_focus_iri) _focus = gather.Focus.new( _focus_iri, type_iris=_triples.get(_focus_iri, {}).get(RDF.type, ()), diff --git a/trove/views/shtrove_root.py b/trove/views/shtrove_root.py index a450c5c11..596d524a1 100644 --- a/trove/views/shtrove_root.py +++ b/trove/views/shtrove_root.py @@ -13,15 +13,23 @@ def get_focus_iri(cls): @classmethod def get_static_triples(cls, focus_iri: str) -> rdf.RdfTripleDictionary: _here = rdf.IriNamespace(focus_iri) + _docs = _here['trove/docs'] + _browse = _here['trove/browse'] + _cardsearch = _here['trove/index-card-search'] return { focus_iri: { ns.DCTERMS.description: { rdf.literal('a trove of metadata meant to be shared', language='en'), }, - ns.RDFS.seeAlso: { - _here['trove/docs'], - _here['trove/browse'], - _here['trove/index-card-search'], - }, + ns.RDFS.seeAlso: {_docs, _browse, _cardsearch}, + }, + _docs: { + ns.DCTERMS.title: {rdf.literal('trove search-api docs', language='en')}, + }, + _browse: { + ns.DCTERMS.title: {rdf.literal('trove browse', language='en')}, + }, + _cardsearch: { + ns.DCTERMS.title: {rdf.literal('trove index-card-search', language='en')}, }, } From b61af9b3f9472f800e70481377defe652ea53f84 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 18 Mar 2025 13:31:41 -0400 Subject: [PATCH 23/43] chore: better trovesearch tests, factories --- tests/share/search/__init__.py | 16 +- tests/share/search/end_to_end/__init__.py | 1 + tests/share/search/end_to_end/_common.py | 254 ++++++++++++++++++ .../test_osfsearch_on_trovesearch_denorm.py | 7 + .../_common_trovesearch_tests.py | 138 +++++----- .../index_strategy/_with_real_services.py | 13 +- .../index_strategy/test_sharev2_elastic5.py | 6 +- .../index_strategy/test_sharev2_elastic8.py | 6 +- tests/trove/factories.py | 63 ++++- 9 files changed, 404 insertions(+), 100 deletions(-) create mode 100644 tests/share/search/end_to_end/__init__.py create mode 100644 tests/share/search/end_to_end/_common.py create mode 100644 tests/share/search/end_to_end/test_osfsearch_on_trovesearch_denorm.py diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py index 871256d44..76b608261 100644 --- a/tests/share/search/__init__.py +++ b/tests/share/search/__init__.py @@ -3,11 +3,10 @@ from typing import Iterable from unittest import mock -from share.search import index_strategy - @contextlib.contextmanager -def patch_index_strategies(strategies: Iterable[index_strategy.IndexStrategy]): +def patch_index_strategies(strategies: Iterable): + from share.search import index_strategy with mock.patch.object(index_strategy, '_AvailableStrategies', new=enum.Enum( '_AvailableStrategies', [ (_strategy.strategy_name, _strategy) @@ -15,3 +14,14 @@ def patch_index_strategies(strategies: Iterable[index_strategy.IndexStrategy]): ], )): yield + + +@contextlib.contextmanager +def patch_index_strategy(strategy): + from share.search import index_strategy as _module_to_patch + with ( + mock.patch.object(_module_to_patch, 'all_strategy_names', return_value=frozenset([strategy.strategy_name])), + mock.patch.object(_module_to_patch, 'each_strategy', return_value=[strategy]), + mock.patch.object(_module_to_patch, 'get_strategy', return_value=strategy), + ): + yield diff --git a/tests/share/search/end_to_end/__init__.py b/tests/share/search/end_to_end/__init__.py new file mode 100644 index 000000000..ea9b78354 --- /dev/null +++ b/tests/share/search/end_to_end/__init__.py @@ -0,0 +1 @@ +__all__ = () diff --git a/tests/share/search/end_to_end/_common.py b/tests/share/search/end_to_end/_common.py new file mode 100644 index 000000000..5501a07ab --- /dev/null +++ b/tests/share/search/end_to_end/_common.py @@ -0,0 +1,254 @@ +import datetime +import itertools +from urllib.parse import urlencode +from typing import Iterator + +from primitive_metadata import primitive_rdf as rdf + +from trove.vocab import mediatypes +from trove.vocab.namespaces import RDF, DCTERMS, OWL, FOAF, DCAT, BLARG, OSFMAP, TROVE +from tests.share.search.index_strategy._with_real_services import RealElasticTestCase +from tests.trove.factories import ( + create_indexcard, + index_indexcards, +) + + +# abstract base class -- subclasses need to implement RealElasticTestCase.get_index_strategy +class End2EndSearchTestCase(RealElasticTestCase): + MEDIATYPES = (mediatypes.JSONAPI,) # TODO: more + + def setUp(self): + super().setUp() + _indexcards = self._create_test_cards() + index_indexcards(self.index_strategy, _indexcards) + + ### + # test methods + + def test_like_osfsearch(self): + # cardsearch + for _queryparams, _expected_focus_iris in self._cardsearch_cases(): + self._test_get_for_each_mediatype( + url_path='/trove/index-card-search', + queryparams=_queryparams, + actual_getter=self._get_cardsearch_focus_iris, + expected=_expected_focus_iris, + ) + # valuesearch + for _queryparams, _expected_values in self._valuesearch_cases(): + self._test_get_for_each_mediatype( + url_path='/trove/index-value-search', + queryparams=_queryparams, + actual_getter=self._get_valuesearch_values, + expected=_expected_values, + ) + + ### + # internals + + def _test_get_for_each_mediatype( + self, + url_path, + queryparams, + actual_getter, + expected, + ): + for _mediatype in self.MEDIATYPES: + _response = self._send_get(url_path, queryparams, _mediatype) + _actual = actual_getter(_response) + self.assertEqual(_actual, expected) + + def _create_test_cards(self): + self.all_card_focus_iris = { + BLARG.myproj, + BLARG.mypreprint, + } + self.card__myproj = create_indexcard(BLARG.myproj, { + RDF.type: {OSFMAP.Project}, + DCTERMS.title: {rdf.literal('my project', language='en')}, + DCTERMS.description: {rdf.literal('this project sure is.', language='en')}, + OWL.sameAs: {'https://doi.example/13.618/7', 'http://raid.example/whatever'}, + DCTERMS.creator: {BLARG.a_person, BLARG.nother_person}, + OSFMAP.keyword: {rdf.literal('keyword', language='en')}, + DCAT.accessService: {BLARG.anOsfOrSomething}, + DCTERMS.created: {rdf.literal(datetime.date(2020, 2, 2))}, + }, rdf_tripledict={ + BLARG.a_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: {rdf.literal('peerrr sssssooo oooonnn nnnnnnnn')}, + }, + BLARG.nother_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: {rdf.literal('nootthhh eeerrrppp peeeerrrrssssooooonnnnn')}, + OSFMAP.affiliation: {BLARG.an_institution}, + }, + BLARG.an_institution: { + RDF.type: {DCTERMS.Agent, FOAF.Organization}, + FOAF.name: {rdf.literal('innssttt iiitttuuuu ttttiiiioooonnnnn')}, + OSFMAP.affiliation: {BLARG.an_institution}, + }, + }, deriver_iris=(TROVE['derive/osfmap_json'],)) + self.card__mypreprint = create_indexcard(BLARG.mypreprint, { + RDF.type: {OSFMAP.Preprint}, + DCTERMS.title: {rdf.literal('my preprint', language='en')}, + DCTERMS.description: {rdf.literal('this preprint sure is that.', language='en')}, + OWL.sameAs: {'https://doi.example/13.618/11', 'http://raid.example/whateverz'}, + DCTERMS.creator: {BLARG.nother_person, BLARG.third_person}, + OSFMAP.keyword: { + rdf.literal('keyword', language='en'), + rdf.literal('lockword', language='en'), + }, + DCAT.accessService: {BLARG.anOsfOrSomething}, + DCTERMS.created: {rdf.literal(datetime.date(2022, 2, 2))}, + }, rdf_tripledict={ + BLARG.nother_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: {rdf.literal('nootthhh eeerrrppp peeeerrrrssssooooonnnnn')}, + }, + BLARG.third_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: {rdf.literal('⚞33️⃣🕒🥉 ☘️🎶 ³⑶➂ ⚞👩‍👩‍👧⚟ ㍛⬱⚟')}, + }, + BLARG.an_institution: { + RDF.type: {DCTERMS.Agent, FOAF.Organization}, + FOAF.name: {rdf.literal('innssttt iiitttuuuu ttttiiiioooonnnnn')}, + }, + }, deriver_iris=(TROVE['derive/osfmap_json'],)) + return [ + self.card__myproj, + self.card__mypreprint, + ] + + def _send_get(self, base_url, queryparams, mediatype): + assert '?' not in base_url + queryparams['acceptMediatype'] = mediatype + _url = f'{base_url}?{urlencode(queryparams)}' + return self.client.get(_url) + + def _get_cardsearch_focus_iris(self, response): + if response.headers['Content-Type'] != mediatypes.JSONAPI: + raise NotImplementedError('TODO: more mediatypes') + _response_json = response.json() + return set(itertools.chain.from_iterable( + _json_resource['attributes']['resourceIdentifier'] + for _json_resource in _response_json['included'] + if _json_resource['type'] == 'index-card' + )) + + def _get_valuesearch_values(self, response): + if response.headers['Content-Type'] != mediatypes.JSONAPI: + raise NotImplementedError('TODO: more mediatypes') + _response_json = response.json() + return set(itertools.chain.from_iterable( + _json_resource['attributes']['resourceIdentifier'] + for _json_resource in _response_json['included'] + if _json_resource['type'] == 'index-card' + )) + + def _cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + yield ( # empty baseline + {}, # no query params + self.all_card_focus_iris, + ) + yield ( # osf-search "all types" tab + { + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + self.all_card_focus_iris, + ) + yield ( # osf-search "all types" tab (with cardSearchText) + { + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '⚞👩‍👩‍👧⚟', + 'sort': '-relevance', + }, + {BLARG.mypreprint}, + ) + yield ( # osf-search "projects" tab + { + 'cardSearchFilter[resourceType]': 'Project,ProjectComponent', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.myproj}, + ) + yield ( # osf-search "preprints" tab + { + 'cardSearchFilter[resourceType]': 'Preprint', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.mypreprint}, + ) + yield ( # osf-search "registrations" tab + { + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + set(), # TODO + ) + yield ( # osf-search "files" tab + { + 'cardSearchFilter[resourceType]': 'File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + set(), # TODO + ) + + def _valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + yield ( # simple baseline + {'valueSearchPropertyPath': 'resourceType'}, + {OSFMAP.Project, OSFMAP.Preprint}, + ) + yield ( # osf-search "all types" tab; "creator" facet + { + 'valueSearchPropertyPath': 'creator', + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.a_person, BLARG.nother_person, BLARG.third_person}, + ) + yield ( # osf-search "all types" tab; "creator" facet with valueSearchText + { + 'valueSearchPropertyPath': 'creator', + 'valueSearchText': '⚞👩‍👩‍👧⚟', + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.third_person}, + ) + yield ( # osf-search "preprints" tab; "creator" facet + { + 'valueSearchPropertyPath': 'creator', + 'cardSearchFilter[resourceType]': 'Preprint', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.nother_person, BLARG.third_person}, + ) + yield ( # osf-search "all types" tab; "dateCreated" facet + { + 'valueSearchPropertyPath': 'dateCreated', + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {'2020', '2022'}, # year histogram + ) diff --git a/tests/share/search/end_to_end/test_osfsearch_on_trovesearch_denorm.py b/tests/share/search/end_to_end/test_osfsearch_on_trovesearch_denorm.py new file mode 100644 index 000000000..a29023158 --- /dev/null +++ b/tests/share/search/end_to_end/test_osfsearch_on_trovesearch_denorm.py @@ -0,0 +1,7 @@ +from share.search.index_strategy.trovesearch_denorm import TrovesearchDenormIndexStrategy +from . import _common + + +class TestOsfsearchOnTrovesearchDenorm(_common.End2EndSearchTestCase): + def get_index_strategy(self): # for RealElasticTestCase + return TrovesearchDenormIndexStrategy('test_osfsearch_on_trovesearch_denorm') diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 8f3fc66fe..7845ff918 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -15,6 +15,7 @@ create_indexcard, update_indexcard_content, create_supplement, + index_indexcards, ) from ._with_real_services import RealElasticTestCase @@ -30,7 +31,7 @@ def setUp(self): def test_for_smoke_without_daemon(self): _indexcard = self._create_indexcard( focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + rdf_twopledict={RDFS.label: {rdf.literal('hello')}}, ) _messages_chunk = messages.MessagesChunk( messages.MessageType.UPDATE_INDEXCARD, @@ -44,7 +45,7 @@ def test_for_smoke_without_daemon(self): def test_for_smoke_with_daemon(self): _indexcard = self._create_indexcard( focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + rdf_twopledict={RDFS.label: {rdf.literal('hello')}}, ) _messages_chunk = messages.MessagesChunk( messages.MessageType.UPDATE_INDEXCARD, @@ -78,11 +79,9 @@ def test_cardsearch_after_deletion(self): def test_cardsearch_after_updates(self): _cards = self._fill_test_data_for_querying() self._update_indexcard_content(_cards[BLARG.c], BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c}, # subj_bc removed; subj_c added - DCTERMS.title: {rdf.literal('cccc')}, - }, + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c}, # subj_bc removed; subj_c added + DCTERMS.title: {rdf.literal('cccc')}, }) self._index_indexcards([_cards[BLARG.c]]) _cases = [ @@ -112,11 +111,9 @@ def test_cardsearch_pagination(self): _focus_iri = BLARG[f'i{_i}'] _expected_iris.add(_focus_iri) _cards.append(self._create_indexcard(_focus_iri, { - _focus_iri: { - RDF.type: {BLARG.Thing}, - DCTERMS.title: {rdf.literal(f'card #{_i}')}, - DCTERMS.created: {rdf.literal(_start_date + timedelta(weeks=_i, days=_i))}, - }, + RDF.type: {BLARG.Thing}, + DCTERMS.title: {rdf.literal(f'card #{_i}')}, + DCTERMS.created: {rdf.literal(_start_date + timedelta(weeks=_i, days=_i))}, })) self._index_indexcards(_cards) # gather all pages results: @@ -187,12 +184,10 @@ def test_valuesearch_after_deletion(self): def test_valuesearch_after_updates(self): _cards = self._fill_test_data_for_querying() self._update_indexcard_content(_cards[BLARG.c], BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.creator: {BLARG.someone_new}, # someone_else removed; someone_new added - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c, BLARG.subj_new}, # subj_bc removed; subj_new added - DCTERMS.title: {rdf.literal('cccc')}, - }, + RDF.type: {BLARG.Thing}, + DCTERMS.creator: {BLARG.someone_new}, # someone_else removed; someone_new added + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c, BLARG.subj_new}, # subj_bc removed; subj_new added + DCTERMS.title: {rdf.literal('cccc')}, }) self._index_indexcards([_cards[BLARG.c]]) _cases = [ @@ -239,16 +234,15 @@ def _assert_valuesearch_values(self, queryparams, expected_values): def _fill_test_data_for_querying(self): _card_a = self._create_indexcard(BLARG.a, { - BLARG.a: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, - DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('aaaa')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, - DCTERMS.references: {BLARG.b, BLARG.c}, - DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, - }, + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, + DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('aaaa')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, + DCTERMS.references: {BLARG.b, BLARG.c}, + DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, + }, rdf_tripledict={ BLARG.someone: { FOAF.name: {rdf.literal('some one')}, }, @@ -265,16 +259,15 @@ def _fill_test_data_for_querying(self): }, }) _card_b = self._create_indexcard(BLARG.b, { - BLARG.b: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.b_same}, - DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.references: {BLARG.c}, - DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, - }, + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.b_same}, + DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.references: {BLARG.c}, + DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, + }, rdf_tripledict={ BLARG.someone: { FOAF.name: {rdf.literal('some one')}, }, @@ -285,44 +278,37 @@ def _fill_test_data_for_querying(self): }, }) _card_c = self._create_indexcard(BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, - DCTERMS.creator: {BLARG.someone_else}, - DCTERMS.title: {rdf.literal('cccc')}, - DCTERMS.subject: { - BLARG['subj_ac/'], # this one has an extra trailing slash - BLARG.subj_bc, - BLARG.subj_c, - }, - DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')}, + RDF.type: {BLARG.Thing}, + DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, + DCTERMS.creator: {BLARG.someone_else}, + DCTERMS.title: {rdf.literal('cccc')}, + DCTERMS.subject: { + BLARG['subj_ac/'], # this one has an extra trailing slash + BLARG.subj_bc, + BLARG.subj_c, }, + DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')}, + }, rdf_tripledict={ BLARG.someone_else: { FOAF.name: {rdf.literal('some one else')}, }, }) create_supplement(_card_a, BLARG.a, { - BLARG.a: { - DCTERMS.replaces: {BLARG.a_past}, - DCAT.servesDataset: { - rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(10)}}), - }, + DCTERMS.replaces: {BLARG.a_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(10)}}), }, }) create_supplement(_card_b, BLARG.b, { - BLARG.b: { - DCTERMS.replaces: {BLARG.b_past}, - DCAT.servesDataset: { - rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(7)}}), - }, + DCTERMS.replaces: {BLARG.b_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(7)}}), }, }) create_supplement(_card_c, BLARG.c, { - BLARG.c: { - DCTERMS.replaces: {BLARG.c_past}, - DCAT.servesDataset: { - rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(333)}}), - }, + DCTERMS.replaces: {BLARG.c_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(333)}}), }, }) _cards = { @@ -608,8 +594,13 @@ def valuesearch_sameas_cases(self): {BLARG.subj_ac, BLARG.subj_a, BLARG.subj_c, BLARG.subj_bc}, ) - def _create_indexcard(self, focus_iri: str, rdf_tripledict: rdf.RdfTripleDictionary) -> trove_db.Indexcard: - _indexcard = create_indexcard(focus_iri, rdf_tripledict, (TROVE['derive/osfmap_json'],)) + def _create_indexcard( + self, + focus_iri: str, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, + ) -> trove_db.Indexcard: + _indexcard = create_indexcard(focus_iri, rdf_twopledict, rdf_tripledict, (TROVE['derive/osfmap_json'],)) self._indexcard_focus_by_uuid[str(_indexcard.uuid)] = focus_iri return _indexcard @@ -617,21 +608,14 @@ def _update_indexcard_content( self, indexcard: trove_db.Indexcard, focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, ) -> None: - update_indexcard_content(indexcard, focus_iri, rdf_tripledict) + update_indexcard_content(indexcard, focus_iri, rdf_twopledict, rdf_tripledict) self._indexcard_focus_by_uuid[str(indexcard.uuid)] = focus_iri def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id for _indexcard in indexcards], - ) - self.assertTrue(all( - _response.is_done - for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) - )) - self.index_strategy.pls_refresh() + index_indexcards(self.index_strategy, indexcards) def _delete_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): for _indexcard in indexcards: diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 8ad685026..a4219b312 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -1,3 +1,4 @@ +import abc import contextlib from unittest import mock @@ -8,17 +9,21 @@ from share.search.daemon import IndexerDaemonControl from share.search.index_messenger import IndexMessenger from share.search import index_strategy -from tests.share.search import patch_index_strategies +from tests.share.search import patch_index_strategy # base class for testing IndexStrategy subclasses with actual elasticsearch. # (using TransactionTestCase so there's NOT a transaction wrapping each test # and IndexerDaemon can use a separate db connection from a separate thread) -class RealElasticTestCase(TransactionTestCase): +class RealElasticTestCase(TransactionTestCase, abc.ABC): serialized_rollback = True # for TransactionTestCase; restore db after - # required for subclasses + @abc.abstractmethod def get_index_strategy(self) -> index_strategy.IndexStrategy: + '''return an IndexStrategy instance that will be tested + + override in subclasses to reuse these tests + ''' raise NotImplementedError(f'{self.__class__} must implement `get_index_strategy`') def setUp(self): @@ -26,7 +31,7 @@ def setUp(self): self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups')) self.index_strategy = self.get_index_strategy() self.index_strategy.pls_teardown() # in case it already exists - self.enterContext(patch_index_strategies([self.index_strategy])) + self.enterContext(patch_index_strategy(self.index_strategy)) self.index_messenger = IndexMessenger( celery_app=celery_app, index_strategys=[self.index_strategy], diff --git a/tests/share/search/index_strategy/test_sharev2_elastic5.py b/tests/share/search/index_strategy/test_sharev2_elastic5.py index 016330c84..8d0d84e73 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic5.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic5.py @@ -24,10 +24,8 @@ def setUp(self): self.__indexcard = create_indexcard( BLARG.hello, { - BLARG.hello: { - RDF.type: {SHAREv2.CreativeWork}, - DCTERMS.title: {rdf.literal('hello', language='en')}, - }, + RDF.type: {SHAREv2.CreativeWork}, + DCTERMS.title: {rdf.literal('hello', language='en')}, }, deriver_iris=[SHAREv2.sharev2_elastic], ) diff --git a/tests/share/search/index_strategy/test_sharev2_elastic8.py b/tests/share/search/index_strategy/test_sharev2_elastic8.py index fb3a1a5c9..c41667000 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic8.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic8.py @@ -17,10 +17,8 @@ def setUp(self): self.__indexcard = create_indexcard( BLARG.hello, { - BLARG.hello: { - RDF.type: {SHAREv2.CreativeWork}, - DCTERMS.title: {rdf.literal('hello', language='en')}, - }, + RDF.type: {SHAREv2.CreativeWork}, + DCTERMS.title: {rdf.literal('hello', language='en')}, }, deriver_iris=[SHAREv2.sharev2_elastic], ) diff --git a/tests/trove/factories.py b/tests/trove/factories.py index 475cdc80f..1a7d4b31b 100644 --- a/tests/trove/factories.py +++ b/tests/trove/factories.py @@ -8,14 +8,26 @@ from trove import digestive_tract +__all__ = ( + 'create_indexcard', + 'create_supplement', + 'index_indexcards', + 'update_indexcard_content', +) + + def create_indexcard( focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, deriver_iris: Collection[str] = (), ) -> trove_db.Indexcard: _suid = factories.SourceUniqueIdentifierFactory() _indexcard = trove_db.Indexcard.objects.create(source_record_suid=_suid) - update_indexcard_content(_indexcard, focus_iri, rdf_tripledict) + _indexcard.focus_identifier_set.add( + trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri), + ) + update_indexcard_content(_indexcard, focus_iri, rdf_twopledict, rdf_tripledict) if deriver_iris: digestive_tract.derive(_indexcard, deriver_iris) return _indexcard @@ -24,15 +36,21 @@ def create_indexcard( def update_indexcard_content( indexcard: trove_db.Indexcard, focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, ) -> None: - _raw = factories.RawDatumFactory(suid=indexcard.source_record_suid) + _card_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict) + _card_content_turtle = rdf.turtle_from_tripledict(_card_content) + _raw = factories.RawDatumFactory(suid=indexcard.source_record_suid, datum=_card_content_turtle) + indexcard.focus_identifier_set.add( + trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri), + ) trove_db.LatestIndexcardRdf.objects.update_or_create( indexcard=indexcard, defaults={ 'from_raw_datum': _raw, 'focus_iri': focus_iri, - 'rdf_as_turtle': rdf.turtle_from_tripledict(rdf_tripledict), + 'rdf_as_turtle': _card_content_turtle, 'turtle_checksum_iri': 'foo', # not enforced }, ) @@ -41,15 +59,44 @@ def update_indexcard_content( def create_supplement( indexcard: trove_db.Indexcard, focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, ) -> trove_db.SupplementaryIndexcardRdf: _supp_suid = factories.SourceUniqueIdentifierFactory() - _supp_raw = factories.RawDatumFactory(suid=_supp_suid) + _supp_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict) + _supp_content_turtle = rdf.turtle_from_tripledict(_supp_content) + _supp_raw = factories.RawDatumFactory(suid=_supp_suid, datum=_supp_content_turtle) return trove_db.SupplementaryIndexcardRdf.objects.create( from_raw_datum=_supp_raw, indexcard=indexcard, supplementary_suid=_supp_suid, focus_iri=focus_iri, - rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), + rdf_as_turtle=_supp_content_turtle, turtle_checksum_iri='sup', # not enforced ) + + +def index_indexcards(index_strategy, indexcards): + from share.search import messages + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id for _indexcard in indexcards], + ) + assert all( + _response.is_done + for _response in index_strategy.pls_handle_messages_chunk(_messages_chunk) + ) + index_strategy.pls_refresh() + + +def _combined_tripledict( + focus_iri: str, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, +) -> rdf.RdfTripleDictionary: + _graph = rdf.RdfGraph() + if rdf_twopledict is not None: + _graph.add_twopledict(focus_iri, rdf_twopledict) + if rdf_tripledict is not None: + _graph.add_tripledict(rdf_tripledict) + return _graph.tripledict From 1b44fb83f379ffeb5689528fc608c3fb56bde48a Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 25 Apr 2025 09:21:00 -0400 Subject: [PATCH 24/43] delete `trove_indexcard_flats` index-strategy recommend `trovesearch_denorm` for your trovesearching needs --- share/models/feature_flag.py | 1 - share/search/index_strategy/__init__.py | 11 +- .../index_strategy/trove_indexcard_flats.py | 953 ------------------ .../index_strategy/test_strategy_selection.py | 2 - .../test_trove_indexcard_flats.py | 21 - 5 files changed, 2 insertions(+), 986 deletions(-) delete mode 100644 share/search/index_strategy/trove_indexcard_flats.py delete mode 100644 tests/share/search/index_strategy/test_trove_indexcard_flats.py diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py index a1ea95022..b1abbe090 100644 --- a/share/models/feature_flag.py +++ b/share/models/feature_flag.py @@ -30,7 +30,6 @@ class FeatureFlag(models.Model): ELASTIC_EIGHT_DEFAULT = 'elastic_eight_default' SUGGEST_CREATOR_FACET = 'suggest_creator_facet' FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed' - TROVESEARCH_DENORMILY = 'trovesearch_denormily' PREPRINT_AFFILIATIONS = 'preprint_affiliations' # name _should_ be one of the constants above, but that is not enforced by `choices` diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index c00d2fbf1..943e67f30 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -9,7 +9,6 @@ from trove.trovesearch import search_params from .sharev2_elastic5 import Sharev2Elastic5IndexStrategy from .sharev2_elastic8 import Sharev2Elastic8IndexStrategy -from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy from .trovesearch_denorm import TrovesearchDenormIndexStrategy from ._base import IndexStrategy from ._indexnames import parse_indexname_parts @@ -38,7 +37,6 @@ class _AvailableStrategies(enum.Enum): if settings.ELASTICSEARCH8_URL: sharev2_elastic8 = Sharev2Elastic8IndexStrategy('sharev2_elastic8') - trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats') trovesearch_denorm = TrovesearchDenormIndexStrategy('trovesearch_denorm') @@ -96,13 +94,8 @@ def get_strategy_for_sharev2_search(requested_name: str | None = None) -> IndexS def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy: if params.index_strategy_name: # specific strategy requested _strategy = parse_strategy_name(params.index_strategy_name, for_search=True) - else: - _strategy_name = ( - _AvailableStrategies.trovesearch_denorm.name - if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) - else _AvailableStrategies.trove_indexcard_flats.name - ) - _strategy = get_strategy(_strategy_name, for_search=True) + else: # hard-coded default (...for now) + _strategy = get_strategy(_AvailableStrategies.trovesearch_denorm.name, for_search=True) return _strategy diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py deleted file mode 100644 index edfc89fe1..000000000 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ /dev/null @@ -1,953 +0,0 @@ -import base64 -from collections import defaultdict -import dataclasses -import datetime -import json -import logging -import re -import uuid -from typing import Iterable, Iterator, Any - -from django.conf import settings -import elasticsearch8 -from primitive_metadata import primitive_rdf - -from share.search import exceptions -from share.search import messages -from share.search.index_strategy._base import IndexStrategy -from share.search.index_strategy.elastic8 import Elastic8IndexStrategy -from share.util.checksum_iri import ChecksumIri -from trove import models as trove_db -from trove.trovesearch.page_cursor import ( - MANY_MORE, - OffsetCursor, - PageCursor, - ReproduciblyRandomSampleCursor, -) -from trove.util.propertypath import GLOB_PATHSTEP -from trove.trovesearch.search_params import ( - CardsearchParams, - ValuesearchParams, - SearchFilter, - Textsegment, - SortParam, -) -from trove.trovesearch.search_handle import ( - CardsearchHandle, - ValuesearchHandle, - TextMatchEvidence, - CardsearchResult, - ValuesearchResult, - PropertypathUsage, -) -from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword -from trove.vocab import osfmap -from trove.vocab.namespaces import RDF, OWL -from ._trovesearch_util import ( - latest_rdf_for_indexcard_pks, - GraphWalk, - KEYWORD_LENGTH_MAX, -) - - -logger = logging.getLogger(__name__) - - -class TroveIndexcardFlatsIndexStrategy(Elastic8IndexStrategy): - CURRENT_STRATEGY_CHECKSUM = ChecksumIri( - checksumalgorithm_name='sha-256', - salt='TroveIndexcardFlatsIndexStrategy', - hexdigest='bdec536873e1ed0c58facaa5d1145bef73bba09d671deef48e45c019def5c5a5', - ) - - # abstract method from IndexStrategy - @property - def supported_message_types(self): - return { - messages.MessageType.UPDATE_INDEXCARD, - messages.MessageType.BACKFILL_INDEXCARD, - } - - # abstract method from IndexStrategy - @property - def backfill_message_type(self): - return messages.MessageType.BACKFILL_INDEXCARD - - @classmethod - def define_current_indexes(cls): - return { # empty index subname, for backcompat - '': cls.IndexDefinition( - mappings=cls.index_mappings(), - settings=cls.index_settings(), - ), - } - - @classmethod - def index_settings(cls): - return {} - - @classmethod - def index_mappings(cls): - _capped_keyword = { - 'type': 'keyword', - 'ignore_above': KEYWORD_LENGTH_MAX, - } - _common_nested_keywords = { - 'path_from_focus': _capped_keyword, - 'suffuniq_path_from_focus': _capped_keyword, - 'property_iri': _capped_keyword, - 'distance_from_focus': {'type': 'keyword'}, # numeric value as keyword (used for 'term' filter) - } - return { - 'dynamic': 'false', - 'properties': { - 'indexcard_uuid': _capped_keyword, - 'focus_iri': _capped_keyword, - 'suffuniq_focus_iri': _capped_keyword, - 'source_record_identifier': _capped_keyword, - 'source_config_label': _capped_keyword, - 'flat_iri_values': { - 'type': 'flattened', - 'ignore_above': KEYWORD_LENGTH_MAX, - }, - 'flat_iri_values_suffuniq': { - 'type': 'flattened', - 'ignore_above': KEYWORD_LENGTH_MAX, - }, - 'iri_paths_present': _capped_keyword, - 'iri_paths_present_suffuniq': _capped_keyword, - 'nested_iri': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'iri_value': _capped_keyword, - 'suffuniq_iri_value': _capped_keyword, - 'value_type_iri': _capped_keyword, - 'value_name_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_title_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_label_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_namelike_text': {'type': 'text'}, - }, - }, - 'nested_date': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'date_value': { - 'type': 'date', - 'format': 'strict_date_optional_time', - }, - }, - }, - 'nested_text': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'language_iri': _capped_keyword, - 'text_value': { - 'type': 'text', - 'index_options': 'offsets', # for faster highlighting - 'store': True, # avoid loading _source to render highlights - 'fields': {'raw': _capped_keyword}, - }, - }, - }, - }, - } - - @property - def __index(self) -> IndexStrategy.SpecificIndex: - # this is a single-index strategy -- for back-compat, that index has empty subname - return self.get_index('') - - def _build_sourcedoc(self, indexcard_rdf): - _rdfdoc = indexcard_rdf.as_rdfdoc_with_supplements() - if _should_skip_card(indexcard_rdf, _rdfdoc): - return None # will be deleted from the index - _nested_iris = defaultdict(set) - _nested_dates = defaultdict(set) - _nested_texts = defaultdict(set) - _walk = GraphWalk(_rdfdoc, indexcard_rdf.focus_iri) - for _walk_path, _walk_iris in _walk.iri_values.items(): - for _iri_obj in _walk_iris: - _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _iri_obj, _rdfdoc)].add(_iri_obj) - for _walk_path, _walk_dates in _walk.date_values.items(): - for _date_obj in _walk_dates: - _nested_dates[_walk_path].add(datetime.date.isoformat(_date_obj)) - for _walk_path, _walk_texts in _walk.text_values.items(): - for _text_obj in _walk_texts: - _nested_texts[(_walk_path, tuple(_text_obj.datatype_iris))].add(_text_obj.unicode_value) - _focus_iris = {indexcard_rdf.focus_iri} - _suffuniq_focus_iris = {get_sufficiently_unique_iri(indexcard_rdf.focus_iri)} - for _identifier in indexcard_rdf.indexcard.focus_identifier_set.all(): - _focus_iris.update(_identifier.raw_iri_list) - _suffuniq_focus_iris.add(_identifier.sufficiently_unique_iri) - return { - 'indexcard_uuid': str(indexcard_rdf.indexcard.uuid), - 'focus_iri': list(_focus_iris), - 'suffuniq_focus_iri': list(_suffuniq_focus_iris), - 'source_record_identifier': indexcard_rdf.indexcard.source_record_suid.identifier, - 'source_config_label': indexcard_rdf.indexcard.source_record_suid.source_config.label, - 'flat_iri_values': self._flattened_iris(_nested_iris), - 'flat_iri_values_suffuniq': self._flattened_iris_suffuniq(_nested_iris), - 'iri_paths_present': [ - iri_path_as_keyword(_path) - for _path in _walk.paths_walked - ], - 'iri_paths_present_suffuniq': [ - iri_path_as_keyword(_path, suffuniq=True) - for _path in _walk.paths_walked - ], - 'nested_iri': list(filter(bool, ( - self._iri_nested_sourcedoc(_nested_iri_key, _iris, _rdfdoc) - for _nested_iri_key, _iris in _nested_iris.items() - ))), - 'nested_date': [ - { - **_iri_path_as_indexable_fields(_path), - 'date_value': list(_value_set), - } - for _path, _value_set in _nested_dates.items() - ], - 'nested_text': [ - { - **_iri_path_as_indexable_fields(_path), - 'language_iri': _language_iris, - 'text_value': list(_value_set), - } - for (_path, _language_iris), _value_set in _nested_texts.items() - ], - } - - def _iri_nested_sourcedoc(self, iri_key: '_NestedIriKey', iris, rdfdoc): - _iris_with_synonyms = set(filter(is_worthwhile_iri, iris)) - for _iri in iris: - _iris_with_synonyms.update( - filter(is_worthwhile_iri, rdfdoc.q(_iri, OWL.sameAs)), - ) - if not _iris_with_synonyms: - return None - _sourcedoc = { - **iri_key.as_indexable_fields(), - 'iri_value': list(_iris_with_synonyms), - 'suffuniq_iri_value': [ - get_sufficiently_unique_iri(_iri) - for _iri in _iris_with_synonyms - ], - } - return _sourcedoc - - def _flattened_iris_by_path(self, nested_iris: dict['_NestedIriKey', set[str]]): - _by_path = defaultdict(set) - for _iri_key, _iris in nested_iris.items(): - _by_path[_iri_key.path].update(_iris) - return _by_path - - def _flattened_iris(self, nested_iris: dict['_NestedIriKey', set[str]]): - return { - _iri_path_as_flattened_key(_path): list(_iris) - for _path, _iris in self._flattened_iris_by_path(nested_iris).items() - } - - def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]): - return { - _iri_path_as_flattened_key(_path): [ - get_sufficiently_unique_iri(_iri) - for _iri in _iris - ] - for _path, _iris in self._flattened_iris_by_path(nested_iris).items() - } - - def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - def _make_actionset(indexcard_id, *actions): - return self.MessageActionSet(indexcard_id, {'': actions}) - _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) - _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) - for _indexcard_rdf in _indexcard_rdf_qs: - _suid = _indexcard_rdf.indexcard.source_record_suid - if _suid.has_forecompat_replacement(): - continue # skip this one, let it get deleted - _sourcedoc = self._build_sourcedoc(_indexcard_rdf) - if _sourcedoc: - _index_action = self.build_index_action( - doc_id=_indexcard_rdf.indexcard.get_iri(), - doc_source=_sourcedoc, - ) - _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) - yield _make_actionset(_indexcard_rdf.indexcard_id, _index_action) - # delete any that don't have "latest" rdf and derived osfmap_json - _leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) - for _indexcard in _leftovers: - yield _make_actionset(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) - - def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: - return self.es8_client.search( - index=self.__index.full_index_name, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) - - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - _cursor = self._cardsearch_cursor(cardsearch_params) - _sort = self._cardsearch_sort(cardsearch_params.sort_list) - _query = self._cardsearch_query( - cardsearch_params.cardsearch_filter_set, - cardsearch_params.cardsearch_textsegment_set, - cardsearch_cursor=_cursor, - ) - _from_offset = ( - _cursor.start_offset - if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) - else _cursor.start_offset - len(_cursor.first_page_ids) - ) - _search_kwargs = dict( - query=_query, - aggs=self._cardsearch_aggs(cardsearch_params), - sort=_sort, - from_=_from_offset, - size=_cursor.bounded_page_size, - source=False, # no need to get _source; _id is enough - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.es8_client.search( - index=self.__index.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) - - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) - _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) - _search_kwargs = dict( - query=self._cardsearch_query( - valuesearch_params.cardsearch_filter_set, - valuesearch_params.cardsearch_textsegment_set, - additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - )}}], - ), - size=0, # ignore cardsearch hits; just want the aggs - aggs=( - self._valuesearch_date_aggs(valuesearch_params) - if _is_date_search - else self._valuesearch_iri_aggs(valuesearch_params, _cursor) - ), - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.es8_client.search( - index=self.__index.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) - - ### - # query implementation - - def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: - _request_cursor = cardsearch_params.page_cursor - if ( - _request_cursor.is_basic() - and not cardsearch_params.sort_list - and not cardsearch_params.cardsearch_textsegment_set - ): - return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) - return OffsetCursor.from_cursor(_request_cursor) - - def _cardsearch_query( - self, - filter_set, textsegment_set, *, - additional_filters=None, - cardsearch_cursor: PageCursor | None = None, - ) -> dict: - _bool_query = { - 'filter': additional_filters or [], - 'must': [], - 'must_not': [], - 'should': [], - } - for _searchfilter in filter_set: - if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: - _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: - _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: - _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: - _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator.is_date_operator(): - _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) - else: - raise ValueError(f'unknown filter operator {_searchfilter.operator}') - _textq_builder = self._NestedTextQueryBuilder( - relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), - ) - for _textsegment in textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _bool_query[_boolkey].extend(_textqueries) - if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): - # no need for randomness - return {'bool': _bool_query} - if not cardsearch_cursor.first_page_ids: - # independent random sample - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': {}, # default random_score is fast and unpredictable - }, - } - _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} - if cardsearch_cursor.is_first_page(): - # returning to a first page previously visited - _bool_query['filter'].append(_firstpage_uuid_query) - return {'bool': _bool_query} - # get a subsequent page using reproducible randomness - _bool_query['must_not'].append(_firstpage_uuid_query) - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': { - 'seed': ''.join(cardsearch_cursor.first_page_ids), - 'field': 'indexcard_uuid', - }, - }, - } - - def _cardsearch_aggs(self, cardsearch_params): - _aggs = {} - if cardsearch_params.related_property_paths: - _aggs['related_propertypath_usage'] = {'terms': { - 'field': 'iri_paths_present', - 'include': [ - iri_path_as_keyword(_path) - for _path in cardsearch_params.related_property_paths - ], - 'size': len(cardsearch_params.related_property_paths), - }} - return _aggs - - def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): - _nested_iri_bool: dict[str, Any] = { - 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - )}}], - 'must': [], - 'must_not': [], - 'should': [], - } - _nested_terms_agg = { - 'field': 'nested_iri.iri_value', - # WARNING: terribly inefficient pagination (part one) - 'size': cursor.start_offset + cursor.bounded_page_size + 1, - } - _iris = list(valuesearch_params.valuesearch_iris()) - if _iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.iri_value': _iris, - }}) - _nested_terms_agg['size'] = len(_iris) - _nested_terms_agg['include'] = _iris - _type_iris = list(valuesearch_params.valuesearch_type_iris()) - if _type_iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.value_type_iri': _type_iris, - }}) - _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') - for _textsegment in valuesearch_params.valuesearch_textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _nested_iri_bool[_boolkey].extend(_textqueries) - return { - 'in_nested_iri': { - 'nested': {'path': 'nested_iri'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'bool': _nested_iri_bool}, - 'aggs': { - 'iri_values': { - 'terms': _nested_terms_agg, - 'aggs': { - 'type_iri': {'terms': { - 'field': 'nested_iri.value_type_iri', - }}, - 'name_text': {'terms': { - 'field': 'nested_iri.value_name_text.raw', - }}, - 'title_text': {'terms': { - 'field': 'nested_iri.value_title_text.raw', - }}, - 'label_text': {'terms': { - 'field': 'nested_iri.value_label_text.raw', - }}, - }, - }, - }, - }, - }, - }, - } - - def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): - _aggs = { - 'in_nested_date': { - 'nested': {'path': 'nested_date'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - ), - }}, - 'aggs': { - 'count_by_year': { - 'date_histogram': { - 'field': 'nested_date.date_value', - 'calendar_interval': 'year', - 'format': 'yyyy', - 'order': {'_key': 'desc'}, - 'min_doc_count': 1, - }, - }, - }, - }, - }, - }, - } - return _aggs - - def _valuesearch_handle( - self, - valuesearch_params: ValuesearchParams, - es8_response: dict, - cursor: OffsetCursor, - ): - _iri_aggs = es8_response['aggregations'].get('in_nested_iri') - if _iri_aggs: - _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] - _bucket_count = len(_buckets) - # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_offset + cursor.bounded_page_size - _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages - cursor.total_count = ( - MANY_MORE - if (_bucket_count > _page_end_index) # agg includes one more, if there - else _bucket_count - ) - return ValuesearchHandle( - cursor=cursor, - search_result_page=[ - self._valuesearch_iri_result(_iri_bucket) - for _iri_bucket in _bucket_page - ], - search_params=valuesearch_params, - ) - else: # assume date - _year_buckets = ( - es8_response['aggregations']['in_nested_date'] - ['value_at_propertypath']['count_by_year']['buckets'] - ) - return ValuesearchHandle( - cursor=PageCursor(len(_year_buckets)), - search_result_page=[ - self._valuesearch_date_result(_year_bucket) - for _year_bucket in _year_buckets - ], - search_params=valuesearch_params, - ) - - def _valuesearch_iri_result(self, iri_bucket): - return ValuesearchResult( - value_iri=iri_bucket['key'], - value_type=_bucketlist(iri_bucket['type_iri']), - name_text=_bucketlist(iri_bucket['name_text']), - title_text=_bucketlist(iri_bucket['title_text']), - label_text=_bucketlist(iri_bucket['label_text']), - match_count=iri_bucket['doc_count'], - ) - - def _valuesearch_date_result(self, date_bucket): - return ValuesearchResult( - value_iri=None, - value_value=date_bucket['key_as_string'], - label_text=(date_bucket['key_as_string'],), - match_count=date_bucket['doc_count'], - ) - - def _cardsearch_presence_query(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_presence_query(_path) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} - - def _cardsearch_path_presence_query(self, path: tuple[str, ...]): - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, - }} - return {'term': { - 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), - }} - - def _cardsearch_iri_filter(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_iri_query(_path, search_filter.value_set) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} - - def _cardsearch_path_iri_query(self, path, value_set): - _suffuniq_values = [ - get_sufficiently_unique_iri(_iri) - for _iri in value_set - ] - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'bool': { - 'must': [ # both - {'term': {'nested_iri.distance_from_focus': len(path)}}, - {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, - ], - }}, - }} - # without a glob-path, can use the flattened keyword field - return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} - - def _cardsearch_date_filter(self, search_filter): - return {'nested': { - 'path': 'nested_date', - 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, - }} - - def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: - # filter by requested paths - yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') - # filter by requested value/operator - if search_filter.operator == SearchFilter.FilterOperator.BEFORE: - _value = min(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'lt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AFTER: - _value = max(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'gt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: - for _value in search_filter.value_set: - _filtervalue = _daterange_value_and_format(_value) - yield {'range': {'nested_date.date_value': { - 'gte': _filtervalue, - 'lte': _filtervalue, - }}} - else: - raise ValueError(f'invalid date filter operator (got {search_filter.operator})') - - def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): - if not sort_list: - return None - return [ - {'nested_date.date_value': { - 'order': ('desc' if _sortparam.descending else 'asc'), - 'nested': { - 'path': 'nested_date', - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - _sortparam.propertypath, - suffuniq=True, - ), - }}, - }, - }} - for _sortparam in sort_list - ] - - def _cardsearch_handle( - self, - cardsearch_params: CardsearchParams, - es8_response: dict, - cursor: OffsetCursor, - ) -> CardsearchHandle: - _es8_total = es8_response['hits']['total'] - if _es8_total['relation'] != 'eq': - cursor.total_count = MANY_MORE - elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): - # account for the filtered-out first page - cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) - else: # exact (and small) count - cursor.total_count = _es8_total['value'] - _results = [] - for _es8_hit in es8_response['hits']['hits']: - _card_iri = _es8_hit['_id'] - _results.append(CardsearchResult( - card_iri=_card_iri, - text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), - )) - _relatedproperty_list: list[PropertypathUsage] = [] - if cardsearch_params.related_property_paths: - _relatedproperty_list.extend( - PropertypathUsage(property_path=_path, usage_count=0) - for _path in cardsearch_params.related_property_paths - ) - _relatedproperty_by_path = { - _result.property_path: _result - for _result in _relatedproperty_list - } - for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: - _path = tuple(json.loads(_bucket['key'])) - _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] - return CardsearchHandle( - cursor=cursor, - search_result_page=_results, - related_propertypath_results=_relatedproperty_list, - search_params=cardsearch_params, - ) - - def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: - for _innerhit_group in es8_hit.get('inner_hits', {}).values(): - for _innerhit in _innerhit_group['hits']['hits']: - _property_path = tuple( - json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), - ) - try: - _language_iris = _innerhit['fields']['nested_text.language_iri'] - except KeyError: - _language_iris = () - for _highlight in _innerhit['highlight']['nested_text.text_value']: - yield TextMatchEvidence( - property_path=_property_path, - matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), - card_iri=_innerhit['_id'], - ) - - class _SimpleTextQueryBuilder: - def __init__( - self, text_field, *, - relevance_matters=False, - ): - self._text_field = text_field - self._relevance_matters = relevance_matters - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - if textsegment.is_negated: - return {'must_not': [self.exact_text_query(textsegment.text)]} - if not textsegment.is_fuzzy: - return {'must': [self.exact_text_query(textsegment.text)]} - if not self._relevance_matters: - return {'must': [self.fuzzy_text_must_query(textsegment.text)]} - return { - 'must': [self.fuzzy_text_must_query(textsegment.text)], - 'should': [self.fuzzy_text_should_query(textsegment.text)], - } - - def exact_text_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match_phrase': { - self._text_field: {'query': text}, - }} - - def fuzzy_text_must_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match': { - self._text_field: { - 'query': text, - 'fuzziness': 'AUTO', - # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} - - def fuzzy_text_should_query(self, text: str): - return {'match_phrase': { - self._text_field: { - 'query': text, - 'slop': len(text.split()), - }, - }} - - class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): - def __init__(self, **kwargs): - super().__init__('nested_text.text_value', **kwargs) - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - return { - _boolkey: [ - self._make_nested_query(textsegment, _query) - for _query in _queries - ] - for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() - } - - def _make_nested_query(self, textsegment, query): - _nested_q = {'nested': { - 'path': 'nested_text', - 'query': {'bool': { - 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), - 'must': query, - }}, - }} - if self._relevance_matters: - _nested_q['nested']['inner_hits'] = self._inner_hits() - return _nested_q - - def _inner_hits(self, *, highlight_query=None) -> dict: - _highlight = { - 'type': 'unified', - 'fields': {'nested_text.text_value': {}}, - } - if highlight_query is not None: - _highlight['highlight_query'] = highlight_query - return { - 'name': str(uuid.uuid4()), # avoid inner-hit name collisions - 'highlight': _highlight, - '_source': False, # _source is expensive for nested docs - 'docvalue_fields': [ - 'nested_text.path_from_focus', - 'nested_text.language_iri', - ], - } - - -### -# module-local utils - -def _should_skip_card(indexcard_rdf, rdfdoc): - # skip cards without some value for name/title/label - return not any(rdfdoc.q(indexcard_rdf.focus_iri, osfmap.NAMELIKE_PROPERTIES)) - - -def _bucketlist(agg_result: dict) -> list[str]: - return [ - _bucket['key'] - for _bucket in agg_result['buckets'] - ] - - -def _daterange_value_and_format(datevalue: str): - _cleanvalue = datevalue.strip() - if re.fullmatch(r'\d{4,}', _cleanvalue): - return f'{_cleanvalue}||/y' - if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/M' - if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/d' - raise ValueError(f'bad date value "{datevalue}"') - - -def _iri_path_as_indexable_fields(path: tuple[str, ...]): - assert path, 'path should not be empty' - return { - 'path_from_focus': iri_path_as_keyword(path), - 'suffuniq_path_from_focus': iri_path_as_keyword(path, suffuniq=True), - 'property_iri': path[-1], - 'distance_from_focus': len(path), - } - - -def _iri_path_as_flattened_key(path: tuple[str, ...]) -> str: - return base64.b16encode(json.dumps(path).encode()).decode() - - -def _iri_path_as_flattened_field(path: tuple[str, ...]) -> str: - return f'flat_iri_values_suffuniq.{_iri_path_as_flattened_key(path)}' - - -def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], nested_path: str): - _suffuniq_iri_paths = [] - _glob_path_lengths = [] - for _path in propertypath_set: - if all(_pathstep == GLOB_PATHSTEP for _pathstep in _path): - _glob_path_lengths.append(len(_path)) - else: - _suffuniq_iri_paths.append(iri_path_as_keyword(_path, suffuniq=True)) - if _suffuniq_iri_paths and _glob_path_lengths: - return {'bool': { - 'minimum_should_match': 1, - 'should': [ - {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}}, - {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}}, - ], - }} - if _glob_path_lengths: - return {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}} - return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} - - -@dataclasses.dataclass(frozen=True) -class _NestedIriKey: - '''if this is the same for multiple iri values, they can be combined in one `nested_iri` doc - ''' - path: tuple[str, ...] - type_iris: frozenset[str] - label_text: frozenset[str] - title_text: frozenset[str] - name_text: frozenset[str] - - @classmethod - def for_iri_at_path(cls, path: tuple[str, ...], iri: str, rdfdoc): - return cls( - path=path, - type_iris=frozenset(rdfdoc.q(iri, RDF.type)), - # TODO: don't discard language for name/title/label - name_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.NAME_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - title_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.TITLE_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - label_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.LABEL_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - ) - - def as_indexable_fields(self): - # matches fields in the mapping for `nested_iri`, above - return { - **_iri_path_as_indexable_fields(self.path), - 'value_type_iri': list(self.type_iris), - 'value_label_text': list(self.label_text), - 'value_title_text': list(self.title_text), - 'value_name_text': list(self.name_text), - } diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index b4d8a1045..a017bc2ba 100644 --- a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -8,7 +8,6 @@ get_strategy, sharev2_elastic5, sharev2_elastic8, - trove_indexcard_flats, trovesearch_denorm, parse_strategy_name, ) @@ -21,7 +20,6 @@ def patched_strategies(mock_elastic_clients): _strategies = [ sharev2_elastic5.Sharev2Elastic5IndexStrategy('sharev2_elastic5'), sharev2_elastic8.Sharev2Elastic8IndexStrategy('sharev2_elastic8'), - trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats'), trovesearch_denorm.TrovesearchDenormIndexStrategy('trovesearch_denorm'), ] with patch_index_strategies(_strategies): diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py deleted file mode 100644 index 0718ad346..000000000 --- a/tests/share/search/index_strategy/test_trove_indexcard_flats.py +++ /dev/null @@ -1,21 +0,0 @@ -from share.search.index_strategy.trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy - -from . import _common_trovesearch_tests - - -class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests): - # for RealElasticTestCase - def get_index_strategy(self): - return TroveIndexcardFlatsIndexStrategy('test_trove_indexcard_flats') - - def cardsearch_integer_cases(self): - yield from () # integers not indexed by this strategy - - def cardsearch_trailingslash_cases(self): - yield from () # trailing-slash handling improved in trovesearch_denorm - - def valuesearch_sameas_cases(self): - yield from () # sameas handling improved in trovesearch_denorm - - def valuesearch_trailingslash_cases(self): - yield from () # trailing-slash handling improved in trovesearch_denorm From e70e1a06ca0071945e39c0f01a3118cef2cb3053 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 23 Apr 2025 17:07:44 +0300 Subject: [PATCH 25/43] removed text parsing --- .../index_strategy/trove_indexcard_flats.py | 949 ++++++++++++++++++ .../index_strategy/trovesearch_denorm.py | 10 +- tests/trove/trovesearch/test_search_params.py | 81 +- trove/trovesearch/search_params.py | 115 +-- 4 files changed, 991 insertions(+), 164 deletions(-) create mode 100644 share/search/index_strategy/trove_indexcard_flats.py diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py new file mode 100644 index 000000000..4acafa4f7 --- /dev/null +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -0,0 +1,949 @@ +import base64 +from collections import defaultdict +import dataclasses +import datetime +import json +import logging +import re +import uuid +from typing import Iterable, Iterator, Any + +from django.conf import settings +import elasticsearch8 +from primitive_metadata import primitive_rdf + +from share.search import exceptions +from share.search import messages +from share.search.index_strategy._base import IndexStrategy +from share.search.index_strategy.elastic8 import Elastic8IndexStrategy +from share.util.checksum_iri import ChecksumIri +from trove import models as trove_db +from trove.trovesearch.page_cursor import ( + MANY_MORE, + OffsetCursor, + PageCursor, + ReproduciblyRandomSampleCursor, +) +from trove.util.propertypath import GLOB_PATHSTEP +from trove.trovesearch.search_params import ( + CardsearchParams, + ValuesearchParams, + SearchFilter, + SearchText, + SortParam, +) +from trove.trovesearch.search_handle import ( + CardsearchHandle, + ValuesearchHandle, + TextMatchEvidence, + CardsearchResult, + ValuesearchResult, + PropertypathUsage, +) +from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword +from trove.vocab import osfmap +from trove.vocab.namespaces import RDF, OWL +from ._trovesearch_util import ( + latest_rdf_for_indexcard_pks, + GraphWalk, + KEYWORD_LENGTH_MAX, +) + + +logger = logging.getLogger(__name__) + + +class TroveIndexcardFlatsIndexStrategy(Elastic8IndexStrategy): + CURRENT_STRATEGY_CHECKSUM = ChecksumIri( + checksumalgorithm_name='sha-256', + salt='TroveIndexcardFlatsIndexStrategy', + hexdigest='bdec536873e1ed0c58facaa5d1145bef73bba09d671deef48e45c019def5c5a5', + ) + + # abstract method from IndexStrategy + @property + def supported_message_types(self): + return { + messages.MessageType.UPDATE_INDEXCARD, + messages.MessageType.BACKFILL_INDEXCARD, + } + + # abstract method from IndexStrategy + @property + def backfill_message_type(self): + return messages.MessageType.BACKFILL_INDEXCARD + + @classmethod + def define_current_indexes(cls): + return { # empty index subname, for backcompat + '': cls.IndexDefinition( + mappings=cls.index_mappings(), + settings=cls.index_settings(), + ), + } + + @classmethod + def index_settings(cls): + return {} + + @classmethod + def index_mappings(cls): + _capped_keyword = { + 'type': 'keyword', + 'ignore_above': KEYWORD_LENGTH_MAX, + } + _common_nested_keywords = { + 'path_from_focus': _capped_keyword, + 'suffuniq_path_from_focus': _capped_keyword, + 'property_iri': _capped_keyword, + 'distance_from_focus': {'type': 'keyword'}, # numeric value as keyword (used for 'term' filter) + } + return { + 'dynamic': 'false', + 'properties': { + 'indexcard_uuid': _capped_keyword, + 'focus_iri': _capped_keyword, + 'suffuniq_focus_iri': _capped_keyword, + 'source_record_identifier': _capped_keyword, + 'source_config_label': _capped_keyword, + 'flat_iri_values': { + 'type': 'flattened', + 'ignore_above': KEYWORD_LENGTH_MAX, + }, + 'flat_iri_values_suffuniq': { + 'type': 'flattened', + 'ignore_above': KEYWORD_LENGTH_MAX, + }, + 'iri_paths_present': _capped_keyword, + 'iri_paths_present_suffuniq': _capped_keyword, + 'nested_iri': { + 'type': 'nested', + 'properties': { + **_common_nested_keywords, + 'iri_value': _capped_keyword, + 'suffuniq_iri_value': _capped_keyword, + 'value_type_iri': _capped_keyword, + 'value_name_text': { + 'type': 'text', + 'fields': {'raw': _capped_keyword}, + 'copy_to': 'nested_iri.value_namelike_text', + }, + 'value_title_text': { + 'type': 'text', + 'fields': {'raw': _capped_keyword}, + 'copy_to': 'nested_iri.value_namelike_text', + }, + 'value_label_text': { + 'type': 'text', + 'fields': {'raw': _capped_keyword}, + 'copy_to': 'nested_iri.value_namelike_text', + }, + 'value_namelike_text': {'type': 'text'}, + }, + }, + 'nested_date': { + 'type': 'nested', + 'properties': { + **_common_nested_keywords, + 'date_value': { + 'type': 'date', + 'format': 'strict_date_optional_time', + }, + }, + }, + 'nested_text': { + 'type': 'nested', + 'properties': { + **_common_nested_keywords, + 'language_iri': _capped_keyword, + 'text_value': { + 'type': 'text', + 'index_options': 'offsets', # for faster highlighting + 'store': True, # avoid loading _source to render highlights + 'fields': {'raw': _capped_keyword}, + }, + }, + }, + }, + } + + @property + def __index(self) -> IndexStrategy.SpecificIndex: + # this is a single-index strategy -- for back-compat, that index has empty subname + return self.get_index('') + + def _build_sourcedoc(self, indexcard_rdf): + _rdfdoc = indexcard_rdf.as_rdfdoc_with_supplements() + if _should_skip_card(indexcard_rdf, _rdfdoc): + return None # will be deleted from the index + _nested_iris = defaultdict(set) + _nested_dates = defaultdict(set) + _nested_texts = defaultdict(set) + _walk = GraphWalk(_rdfdoc, indexcard_rdf.focus_iri) + for _walk_path, _walk_iris in _walk.iri_values.items(): + for _iri_obj in _walk_iris: + _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _iri_obj, _rdfdoc)].add(_iri_obj) + for _walk_path, _walk_dates in _walk.date_values.items(): + for _date_obj in _walk_dates: + _nested_dates[_walk_path].add(datetime.date.isoformat(_date_obj)) + for _walk_path, _walk_texts in _walk.text_values.items(): + for _text_obj in _walk_texts: + _nested_texts[(_walk_path, tuple(_text_obj.datatype_iris))].add(_text_obj.unicode_value) + _focus_iris = {indexcard_rdf.focus_iri} + _suffuniq_focus_iris = {get_sufficiently_unique_iri(indexcard_rdf.focus_iri)} + for _identifier in indexcard_rdf.indexcard.focus_identifier_set.all(): + _focus_iris.update(_identifier.raw_iri_list) + _suffuniq_focus_iris.add(_identifier.sufficiently_unique_iri) + return { + 'indexcard_uuid': str(indexcard_rdf.indexcard.uuid), + 'focus_iri': list(_focus_iris), + 'suffuniq_focus_iri': list(_suffuniq_focus_iris), + 'source_record_identifier': indexcard_rdf.indexcard.source_record_suid.identifier, + 'source_config_label': indexcard_rdf.indexcard.source_record_suid.source_config.label, + 'flat_iri_values': self._flattened_iris(_nested_iris), + 'flat_iri_values_suffuniq': self._flattened_iris_suffuniq(_nested_iris), + 'iri_paths_present': [ + iri_path_as_keyword(_path) + for _path in _walk.paths_walked + ], + 'iri_paths_present_suffuniq': [ + iri_path_as_keyword(_path, suffuniq=True) + for _path in _walk.paths_walked + ], + 'nested_iri': list(filter(bool, ( + self._iri_nested_sourcedoc(_nested_iri_key, _iris, _rdfdoc) + for _nested_iri_key, _iris in _nested_iris.items() + ))), + 'nested_date': [ + { + **_iri_path_as_indexable_fields(_path), + 'date_value': list(_value_set), + } + for _path, _value_set in _nested_dates.items() + ], + 'nested_text': [ + { + **_iri_path_as_indexable_fields(_path), + 'language_iri': _language_iris, + 'text_value': list(_value_set), + } + for (_path, _language_iris), _value_set in _nested_texts.items() + ], + } + + def _iri_nested_sourcedoc(self, iri_key: '_NestedIriKey', iris, rdfdoc): + _iris_with_synonyms = set(filter(is_worthwhile_iri, iris)) + for _iri in iris: + _iris_with_synonyms.update( + filter(is_worthwhile_iri, rdfdoc.q(_iri, OWL.sameAs)), + ) + if not _iris_with_synonyms: + return None + _sourcedoc = { + **iri_key.as_indexable_fields(), + 'iri_value': list(_iris_with_synonyms), + 'suffuniq_iri_value': [ + get_sufficiently_unique_iri(_iri) + for _iri in _iris_with_synonyms + ], + } + return _sourcedoc + + def _flattened_iris_by_path(self, nested_iris: dict['_NestedIriKey', set[str]]): + _by_path = defaultdict(set) + for _iri_key, _iris in nested_iris.items(): + _by_path[_iri_key.path].update(_iris) + return _by_path + + def _flattened_iris(self, nested_iris: dict['_NestedIriKey', set[str]]): + return { + _iri_path_as_flattened_key(_path): list(_iris) + for _path, _iris in self._flattened_iris_by_path(nested_iris).items() + } + + def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]): + return { + _iri_path_as_flattened_key(_path): [ + get_sufficiently_unique_iri(_iri) + for _iri in _iris + ] + for _path, _iris in self._flattened_iris_by_path(nested_iris).items() + } + + def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): + def _make_actionset(indexcard_id, *actions): + return self.MessageActionSet(indexcard_id, {'': actions}) + _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) + _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) + for _indexcard_rdf in _indexcard_rdf_qs: + _suid = _indexcard_rdf.indexcard.source_record_suid + if _suid.has_forecompat_replacement(): + continue # skip this one, let it get deleted + _sourcedoc = self._build_sourcedoc(_indexcard_rdf) + if _sourcedoc: + _index_action = self.build_index_action( + doc_id=_indexcard_rdf.indexcard.get_iri(), + doc_source=_sourcedoc, + ) + _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) + yield _make_actionset(_indexcard_rdf.indexcard_id, _index_action) + # delete any that don't have "latest" rdf and derived osfmap_json + _leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) + for _indexcard in _leftovers: + yield _make_actionset(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) + + def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: + return self.es8_client.search( + index=self.__index.full_index_name, + body={ + **(request_body or {}), + 'track_total_hits': True, + }, + params=(request_queryparams or {}), + ) + + def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: + _cursor = self._cardsearch_cursor(cardsearch_params) + _sort = self._cardsearch_sort(cardsearch_params.sort_list) + _query = self._cardsearch_query( + cardsearch_params.cardsearch_filter_set, + cardsearch_params.cardsearch_textsegment_set, + cardsearch_cursor=_cursor, + ) + _from_offset = ( + _cursor.start_offset + if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) + else _cursor.start_offset - len(_cursor.first_page_ids) + ) + _search_kwargs = dict( + query=_query, + aggs=self._cardsearch_aggs(cardsearch_params), + sort=_sort, + from_=_from_offset, + size=_cursor.bounded_page_size, + source=False, # no need to get _source; _id is enough + ) + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.__index.full_index_name, + **_search_kwargs, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) + + def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: + _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) + _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) + _search_kwargs = dict( + query=self._cardsearch_query( + valuesearch_params.cardsearch_filter_set, + valuesearch_params.cardsearch_textsegment_set, + additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + )}}], + ), + size=0, # ignore cardsearch hits; just want the aggs + aggs=( + self._valuesearch_date_aggs(valuesearch_params) + if _is_date_search + else self._valuesearch_iri_aggs(valuesearch_params, _cursor) + ), + ) + if settings.DEBUG: + logger.info(json.dumps(_search_kwargs, indent=2)) + try: + _es8_response = self.es8_client.search( + index=self.__index.full_index_name, + **_search_kwargs, + ) + except elasticsearch8.TransportError as error: + raise exceptions.IndexStrategyError() from error # TODO: error messaging + return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) + + ### + # query implementation + + def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: + _request_cursor = cardsearch_params.page_cursor + if ( + _request_cursor.is_basic() + and not cardsearch_params.sort_list + and not cardsearch_params.cardsearch_textsegment_set + ): + return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) + return OffsetCursor.from_cursor(_request_cursor) + + def _cardsearch_query( + self, + filter_set, textsegment_set, *, + additional_filters=None, + cardsearch_cursor: PageCursor | None = None, + ) -> dict: + _bool_query = { + 'filter': additional_filters or [], + 'must': [], + 'must_not': [], + 'should': [], + } + for _searchfilter in filter_set: + if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: + _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: + _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: + _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: + _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) + elif _searchfilter.operator.is_date_operator(): + _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) + else: + raise ValueError(f'unknown filter operator {_searchfilter.operator}') + _textq_builder = self._NestedTextQueryBuilder( + relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), + ) + for _textsegment in textsegment_set: + for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): + _bool_query[_boolkey].extend(_textqueries) + if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): + # no need for randomness + return {'bool': _bool_query} + if not cardsearch_cursor.first_page_ids: + # independent random sample + return { + 'function_score': { + 'query': {'bool': _bool_query}, + 'boost_mode': 'replace', + 'random_score': {}, # default random_score is fast and unpredictable + }, + } + _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} + if cardsearch_cursor.is_first_page(): + # returning to a first page previously visited + _bool_query['filter'].append(_firstpage_uuid_query) + return {'bool': _bool_query} + # get a subsequent page using reproducible randomness + _bool_query['must_not'].append(_firstpage_uuid_query) + return { + 'function_score': { + 'query': {'bool': _bool_query}, + 'boost_mode': 'replace', + 'random_score': { + 'seed': ''.join(cardsearch_cursor.first_page_ids), + 'field': 'indexcard_uuid', + }, + }, + } + + def _cardsearch_aggs(self, cardsearch_params): + _aggs = {} + if cardsearch_params.related_property_paths: + _aggs['related_propertypath_usage'] = {'terms': { + 'field': 'iri_paths_present', + 'include': [ + iri_path_as_keyword(_path) + for _path in cardsearch_params.related_property_paths + ], + 'size': len(cardsearch_params.related_property_paths), + }} + return _aggs + + def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): + _nested_iri_bool: dict[str, Any] = { + 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + suffuniq=True, + )}}], + 'must': [], + 'must_not': [], + 'should': [], + } + _nested_terms_agg = { + 'field': 'nested_iri.iri_value', + # WARNING: terribly inefficient pagination (part one) + 'size': cursor.start_offset + cursor.bounded_page_size + 1, + } + _iris = list(valuesearch_params.valuesearch_iris()) + if _iris: + _nested_iri_bool['filter'].append({'terms': { + 'nested_iri.iri_value': _iris, + }}) + _nested_terms_agg['size'] = len(_iris) + _nested_terms_agg['include'] = _iris + _type_iris = list(valuesearch_params.valuesearch_type_iris()) + if _type_iris: + _nested_iri_bool['filter'].append({'terms': { + 'nested_iri.value_type_iri': _type_iris, + }}) + _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') + for _textsegment in valuesearch_params.valuesearch_textsegment_set: + for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): + _nested_iri_bool[_boolkey].extend(_textqueries) + return { + 'in_nested_iri': { + 'nested': {'path': 'nested_iri'}, + 'aggs': { + 'value_at_propertypath': { + 'filter': {'bool': _nested_iri_bool}, + 'aggs': { + 'iri_values': { + 'terms': _nested_terms_agg, + 'aggs': { + 'type_iri': {'terms': { + 'field': 'nested_iri.value_type_iri', + }}, + 'name_text': {'terms': { + 'field': 'nested_iri.value_name_text.raw', + }}, + 'title_text': {'terms': { + 'field': 'nested_iri.value_title_text.raw', + }}, + 'label_text': {'terms': { + 'field': 'nested_iri.value_label_text.raw', + }}, + }, + }, + }, + }, + }, + }, + } + + def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): + _aggs = { + 'in_nested_date': { + 'nested': {'path': 'nested_date'}, + 'aggs': { + 'value_at_propertypath': { + 'filter': {'term': { + 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( + valuesearch_params.valuesearch_propertypath, + suffuniq=True, + ), + }}, + 'aggs': { + 'count_by_year': { + 'date_histogram': { + 'field': 'nested_date.date_value', + 'calendar_interval': 'year', + 'format': 'yyyy', + 'order': {'_key': 'desc'}, + 'min_doc_count': 1, + }, + }, + }, + }, + }, + }, + } + return _aggs + + def _valuesearch_handle( + self, + valuesearch_params: ValuesearchParams, + es8_response: dict, + cursor: OffsetCursor, + ): + _iri_aggs = es8_response['aggregations'].get('in_nested_iri') + if _iri_aggs: + _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] + _bucket_count = len(_buckets) + # WARNING: terribly inefficient pagination (part two) + _page_end_index = cursor.start_offset + cursor.bounded_page_size + _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages + cursor.total_count = ( + MANY_MORE + if (_bucket_count > _page_end_index) # agg includes one more, if there + else _bucket_count + ) + return ValuesearchHandle( + cursor=cursor, + search_result_page=[ + self._valuesearch_iri_result(_iri_bucket) + for _iri_bucket in _bucket_page + ], + search_params=valuesearch_params, + ) + else: # assume date + _year_buckets = ( + es8_response['aggregations']['in_nested_date'] + ['value_at_propertypath']['count_by_year']['buckets'] + ) + return ValuesearchHandle( + cursor=PageCursor(len(_year_buckets)), + search_result_page=[ + self._valuesearch_date_result(_year_bucket) + for _year_bucket in _year_buckets + ], + search_params=valuesearch_params, + ) + + def _valuesearch_iri_result(self, iri_bucket): + return ValuesearchResult( + value_iri=iri_bucket['key'], + value_type=_bucketlist(iri_bucket['type_iri']), + name_text=_bucketlist(iri_bucket['name_text']), + title_text=_bucketlist(iri_bucket['title_text']), + label_text=_bucketlist(iri_bucket['label_text']), + match_count=iri_bucket['doc_count'], + ) + + def _valuesearch_date_result(self, date_bucket): + return ValuesearchResult( + value_iri=None, + value_value=date_bucket['key_as_string'], + label_text=(date_bucket['key_as_string'],), + match_count=date_bucket['doc_count'], + ) + + def _cardsearch_presence_query(self, search_filter) -> dict: + _filters = [ + self._cardsearch_path_presence_query(_path) + for _path in search_filter.propertypath_set + ] + if len(_filters) == 1: + return _filters[0] + return {'bool': { + 'minimum_should_match': 1, + 'should': _filters, + }} + + def _cardsearch_path_presence_query(self, path: tuple[str, ...]): + if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): + return {'nested': { + 'path': 'nested_iri', + 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, + }} + return {'term': { + 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), + }} + + def _cardsearch_iri_filter(self, search_filter) -> dict: + _filters = [ + self._cardsearch_path_iri_query(_path, search_filter.value_set) + for _path in search_filter.propertypath_set + ] + if len(_filters) == 1: + return _filters[0] + return {'bool': { + 'minimum_should_match': 1, + 'should': _filters, + }} + + def _cardsearch_path_iri_query(self, path, value_set): + _suffuniq_values = [ + get_sufficiently_unique_iri(_iri) + for _iri in value_set + ] + if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): + return {'nested': { + 'path': 'nested_iri', + 'query': {'bool': { + 'must': [ # both + {'term': {'nested_iri.distance_from_focus': len(path)}}, + {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, + ], + }}, + }} + # without a glob-path, can use the flattened keyword field + return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} + + def _cardsearch_date_filter(self, search_filter): + return {'nested': { + 'path': 'nested_date', + 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, + }} + + def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: + # filter by requested paths + yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') + # filter by requested value/operator + if search_filter.operator == SearchFilter.FilterOperator.BEFORE: + _value = min(search_filter.value_set) # rely on string-comparable isoformat + yield {'range': {'nested_date.date_value': { + 'lt': _daterange_value_and_format(_value) + }}} + elif search_filter.operator == SearchFilter.FilterOperator.AFTER: + _value = max(search_filter.value_set) # rely on string-comparable isoformat + yield {'range': {'nested_date.date_value': { + 'gt': _daterange_value_and_format(_value) + }}} + elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: + for _value in search_filter.value_set: + _filtervalue = _daterange_value_and_format(_value) + yield {'range': {'nested_date.date_value': { + 'gte': _filtervalue, + 'lte': _filtervalue, + }}} + else: + raise ValueError(f'invalid date filter operator (got {search_filter.operator})') + + def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): + if not sort_list: + return None + return [ + {'nested_date.date_value': { + 'order': ('desc' if _sortparam.descending else 'asc'), + 'nested': { + 'path': 'nested_date', + 'filter': {'term': { + 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( + _sortparam.propertypath, + suffuniq=True, + ), + }}, + }, + }} + for _sortparam in sort_list + ] + + def _cardsearch_handle( + self, + cardsearch_params: CardsearchParams, + es8_response: dict, + cursor: OffsetCursor, + ) -> CardsearchHandle: + _es8_total = es8_response['hits']['total'] + if _es8_total['relation'] != 'eq': + cursor.total_count = MANY_MORE + elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): + # account for the filtered-out first page + cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) + else: # exact (and small) count + cursor.total_count = _es8_total['value'] + _results = [] + for _es8_hit in es8_response['hits']['hits']: + _card_iri = _es8_hit['_id'] + _results.append(CardsearchResult( + card_iri=_card_iri, + text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), + )) + _relatedproperty_list: list[PropertypathUsage] = [] + if cardsearch_params.related_property_paths: + _relatedproperty_list.extend( + PropertypathUsage(property_path=_path, usage_count=0) + for _path in cardsearch_params.related_property_paths + ) + _relatedproperty_by_path = { + _result.property_path: _result + for _result in _relatedproperty_list + } + for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: + _path = tuple(json.loads(_bucket['key'])) + _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] + return CardsearchHandle( + cursor=cursor, + search_result_page=_results, + related_propertypath_results=_relatedproperty_list, + search_params=cardsearch_params, + ) + + def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: + for _innerhit_group in es8_hit.get('inner_hits', {}).values(): + for _innerhit in _innerhit_group['hits']['hits']: + _property_path = tuple( + json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), + ) + try: + _language_iris = _innerhit['fields']['nested_text.language_iri'] + except KeyError: + _language_iris = () + for _highlight in _innerhit['highlight']['nested_text.text_value']: + yield TextMatchEvidence( + property_path=_property_path, + matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), + card_iri=_innerhit['_id'], + ) + + class _SimpleTextQueryBuilder: + def __init__( + self, text_field, *, + relevance_matters=False, + ): + self._text_field = text_field + self._relevance_matters = relevance_matters + + def textsegment_boolparts(self, textsegment: SearchText) -> dict[str, list]: + if not self._relevance_matters: + return {'must': [self.fuzzy_text_must_query(textsegment.text)]} + return { + 'must': [self.fuzzy_text_must_query(textsegment.text)], + 'should': [self.fuzzy_text_should_query(textsegment.text)], + } + + def exact_text_query(self, text: str) -> dict: + # TODO: textsegment.is_openended (prefix query) + return {'match_phrase': { + self._text_field: {'query': text}, + }} + + def fuzzy_text_must_query(self, text: str) -> dict: + # TODO: textsegment.is_openended (prefix query) + return {'match': { + self._text_field: { + 'query': text, + 'fuzziness': 'AUTO', + # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) + }, + }} + + def fuzzy_text_should_query(self, text: str): + return {'match_phrase': { + self._text_field: { + 'query': text, + 'slop': len(text.split()), + }, + }} + + class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): + def __init__(self, **kwargs): + super().__init__('nested_text.text_value', **kwargs) + + def textsegment_boolparts(self, textsegment: SearchText) -> dict[str, list]: + return { + _boolkey: [ + self._make_nested_query(textsegment, _query) + for _query in _queries + ] + for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() + } + + def _make_nested_query(self, textsegment, query): + _nested_q = {'nested': { + 'path': 'nested_text', + 'query': {'bool': { + 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), + 'must': query, + }}, + }} + if self._relevance_matters: + _nested_q['nested']['inner_hits'] = self._inner_hits() + return _nested_q + + def _inner_hits(self, *, highlight_query=None) -> dict: + _highlight = { + 'type': 'unified', + 'fields': {'nested_text.text_value': {}}, + } + if highlight_query is not None: + _highlight['highlight_query'] = highlight_query + return { + 'name': str(uuid.uuid4()), # avoid inner-hit name collisions + 'highlight': _highlight, + '_source': False, # _source is expensive for nested docs + 'docvalue_fields': [ + 'nested_text.path_from_focus', + 'nested_text.language_iri', + ], + } + + +### +# module-local utils + +def _should_skip_card(indexcard_rdf, rdfdoc): + # skip cards without some value for name/title/label + return not any(rdfdoc.q(indexcard_rdf.focus_iri, osfmap.NAMELIKE_PROPERTIES)) + + +def _bucketlist(agg_result: dict) -> list[str]: + return [ + _bucket['key'] + for _bucket in agg_result['buckets'] + ] + + +def _daterange_value_and_format(datevalue: str): + _cleanvalue = datevalue.strip() + if re.fullmatch(r'\d{4,}', _cleanvalue): + return f'{_cleanvalue}||/y' + if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue): + return f'{_cleanvalue}||/M' + if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue): + return f'{_cleanvalue}||/d' + raise ValueError(f'bad date value "{datevalue}"') + + +def _iri_path_as_indexable_fields(path: tuple[str, ...]): + assert path, 'path should not be empty' + return { + 'path_from_focus': iri_path_as_keyword(path), + 'suffuniq_path_from_focus': iri_path_as_keyword(path, suffuniq=True), + 'property_iri': path[-1], + 'distance_from_focus': len(path), + } + + +def _iri_path_as_flattened_key(path: tuple[str, ...]) -> str: + return base64.b16encode(json.dumps(path).encode()).decode() + + +def _iri_path_as_flattened_field(path: tuple[str, ...]) -> str: + return f'flat_iri_values_suffuniq.{_iri_path_as_flattened_key(path)}' + + +def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], nested_path: str): + _suffuniq_iri_paths = [] + _glob_path_lengths = [] + for _path in propertypath_set: + if all(_pathstep == GLOB_PATHSTEP for _pathstep in _path): + _glob_path_lengths.append(len(_path)) + else: + _suffuniq_iri_paths.append(iri_path_as_keyword(_path, suffuniq=True)) + if _suffuniq_iri_paths and _glob_path_lengths: + return {'bool': { + 'minimum_should_match': 1, + 'should': [ + {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}}, + {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}}, + ], + }} + if _glob_path_lengths: + return {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}} + return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} + + +@dataclasses.dataclass(frozen=True) +class _NestedIriKey: + '''if this is the same for multiple iri values, they can be combined in one `nested_iri` doc + ''' + path: tuple[str, ...] + type_iris: frozenset[str] + label_text: frozenset[str] + title_text: frozenset[str] + name_text: frozenset[str] + + @classmethod + def for_iri_at_path(cls, path: tuple[str, ...], iri: str, rdfdoc): + return cls( + path=path, + type_iris=frozenset(rdfdoc.q(iri, RDF.type)), + # TODO: don't discard language for name/title/label + name_text=frozenset( + _text.unicode_value + for _text in rdfdoc.q(iri, osfmap.NAME_PROPERTIES) + if isinstance(_text, primitive_rdf.Literal) + ), + title_text=frozenset( + _text.unicode_value + for _text in rdfdoc.q(iri, osfmap.TITLE_PROPERTIES) + if isinstance(_text, primitive_rdf.Literal) + ), + label_text=frozenset( + _text.unicode_value + for _text in rdfdoc.q(iri, osfmap.LABEL_PROPERTIES) + if isinstance(_text, primitive_rdf.Literal) + ), + ) + + def as_indexable_fields(self): + # matches fields in the mapping for `nested_iri`, above + return { + **_iri_path_as_indexable_fields(self.path), + 'value_type_iri': list(self.type_iris), + 'value_label_text': list(self.label_text), + 'value_title_text': list(self.title_text), + 'value_name_text': list(self.name_text), + } diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 2a40d1211..cd66e6220 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -33,7 +33,7 @@ CardsearchParams, Propertypath, SearchFilter, - Textsegment, + SearchText, ValueType, ValuesearchParams, is_globpath, @@ -627,7 +627,7 @@ def add_boolparts(self, boolparts: Iterator[tuple[str, dict]]): @dataclasses.dataclass class _QueryHelper: base_field: Literal['card', 'iri_value'] - textsegment_set: frozenset[Textsegment] + textsegment_set: frozenset[SearchText] filter_set: frozenset[SearchFilter] relevance_matters: bool @@ -718,14 +718,14 @@ def _text_field_name(self, propertypath: Propertypath): else f'{self.base_field}.text_by_propertypath.{_path_field_name(propertypath)}' ) - def _exact_text_query(self, textsegment: Textsegment) -> dict: + def _exact_text_query(self, textsegment: SearchText) -> dict: # TODO: textsegment.is_openended (prefix query) return _any_query([ {'match_phrase': {self._text_field_name(_path): {'query': textsegment.text}}} for _path in textsegment.propertypath_set ]) - def _fuzzy_text_must_query(self, textsegment: Textsegment) -> dict: + def _fuzzy_text_must_query(self, textsegment: SearchText) -> dict: # TODO: textsegment.is_openended (prefix query) return _any_query([ {'match': { @@ -738,7 +738,7 @@ def _fuzzy_text_must_query(self, textsegment: Textsegment) -> dict: for _path in textsegment.propertypath_set ]) - def _fuzzy_text_should_query(self, textsegment: Textsegment): + def _fuzzy_text_should_query(self, textsegment: SearchText): _slop = len(textsegment.text.split()) return _any_query([ {'match_phrase': { diff --git a/tests/trove/trovesearch/test_search_params.py b/tests/trove/trovesearch/test_search_params.py index 3b9f0e6f4..5f43b582f 100644 --- a/tests/trove/trovesearch/test_search_params.py +++ b/tests/trove/trovesearch/test_search_params.py @@ -1,70 +1,43 @@ from django.test import SimpleTestCase from trove.trovesearch.search_params import ( - Textsegment, - SearchFilter, + SearchText, + SearchFilter, DEFAULT_PROPERTYPATH_SET, ) from trove.util.queryparams import QueryparamName from trove.vocab.namespaces import OSFMAP, RDF, DCTERMS -class TestTextsegment(SimpleTestCase): - def test_empty(self): - for _empty_input in ('', '""', '*', '-', '-""'): - _empty = set(Textsegment.iter_from_text(_empty_input)) - self.assertFalse(_empty) +from django.test import SimpleTestCase +from trove.trovesearch.search_params import SearchText - def test_fuzz(self): - _fuzzword = set(Textsegment.iter_from_text('woord')) - self.assertEqual(_fuzzword, frozenset(( - Textsegment('woord', is_fuzzy=True, is_negated=False, is_openended=True), - ))) - _fuzzphrase = set(Textsegment.iter_from_text('wibbleplop worble polp elbbiw')) - self.assertEqual(_fuzzphrase, frozenset(( - Textsegment('wibbleplop worble polp elbbiw', is_fuzzy=True, is_negated=False, is_openended=True), - ))) +class TestSearchText(SimpleTestCase): + def test_empty_text_list(self): + inputs = [] + results = [SearchText(text) for text in inputs] + self.assertEqual(results, []) - def test_exact(self): - _exactword = set(Textsegment.iter_from_text('"woord"')) - self.assertEqual(_exactword, frozenset(( - Textsegment('woord', is_fuzzy=False, is_negated=False, is_openended=False), - ))) - _exactphrase = set(Textsegment.iter_from_text('"wibbleplop worble polp elbbiw"')) - self.assertEqual(_exactphrase, frozenset(( - Textsegment('wibbleplop worble polp elbbiw', is_fuzzy=False, is_negated=False, is_openended=False), - ))) - _openphrase = set(Textsegment.iter_from_text('"wibbleplop worble polp elbbiw')) - self.assertEqual(_openphrase, frozenset(( - Textsegment('wibbleplop worble polp elbbiw', is_fuzzy=False, is_negated=False, is_openended=True), - ))) + def test_single_word(self): + st = SearchText("word") + self.assertEqual(st.text, "word") + self.assertEqual(st.propertypath_set, DEFAULT_PROPERTYPATH_SET) - def test_minus(self): - _minusword = set(Textsegment.iter_from_text('-woord')) - self.assertEqual(_minusword, frozenset(( - Textsegment('woord', is_fuzzy=False, is_negated=True, is_openended=False), - ))) - _minusexactword = set(Textsegment.iter_from_text('-"woord droow"')) - self.assertEqual(_minusexactword, frozenset(( - Textsegment('woord droow', is_fuzzy=False, is_negated=True, is_openended=False), - ))) - _minustwo = set(Textsegment.iter_from_text('abc -def -g hi there')) - self.assertEqual(_minustwo, frozenset(( - Textsegment('def', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('g', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('hi there', is_fuzzy=True, is_negated=False, is_openended=True), - Textsegment('abc', is_fuzzy=True, is_negated=False, is_openended=False), - ))) + def test_multiple_words(self): + words = ["apple", "banana", "cherry"] + results = [SearchText(word) for word in words] + self.assertEqual(len(results), 3) + self.assertIn(SearchText("banana"), results) - def test_combo(self): - _combo = set(Textsegment.iter_from_text('wibbleplop -"worble polp" elbbiw -but "exactly')) - self.assertEqual(_combo, frozenset(( - Textsegment('worble polp', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('elbbiw', is_fuzzy=True, is_negated=False, is_openended=False), - Textsegment('wibbleplop', is_fuzzy=True, is_negated=False, is_openended=False), - Textsegment('but', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('exactly', is_fuzzy=False, is_negated=False, is_openended=True), - ))) + def test_text_with_spaces(self): + phrase = "multi word phrase" + st = SearchText(phrase) + self.assertEqual(st.text, phrase) + self.assertEqual(st.propertypath_set, DEFAULT_PROPERTYPATH_SET) + def test_custom_propertypath_set(self): + custom_set = frozenset(["some:path"]) + st = SearchText("hello", propertypath_set=custom_set) + self.assertEqual(st.propertypath_set, custom_set) class TestSearchFilterPath(SimpleTestCase): def test_from_param(self): diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index 760c078e5..fc4024950 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -129,17 +129,10 @@ def _default_attrpaths(cls) -> collections.abc.Mapping[str, tuple[Propertypath, @dataclasses.dataclass(frozen=True) -class Textsegment: +class SearchText: text: str - is_fuzzy: bool = True - is_negated: bool = False - is_openended: bool = False propertypath_set: PropertypathSet = DEFAULT_PROPERTYPATH_SET - def __post_init__(self): - if self.is_negated and self.is_fuzzy: - raise trove_exceptions.InvalidSearchText(self.text, "search cannot be both negated and fuzzy") - def words(self): return self.text.split() @@ -166,92 +159,13 @@ def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str 'may not use glob-paths longer than "*" with search-text parameters', ) - for _textsegment in cls.iter_from_text(param_value): + if param_value: + _textsegment = cls(text=param_value) if _propertypath_set: yield dataclasses.replace(_textsegment, propertypath_set=_propertypath_set) else: yield _textsegment - @classmethod - def iter_from_text(cls, text: str) -> typing.Iterable['Textsegment']: - '''parse search text into words and quoted phrases - ''' - _in_quotes = False - _last_quote_prefix = None - _text_remaining = text - while _text_remaining: - ( # split on the next " - _text_chunk, - _quote_mark, - _text_remaining, - ) = _text_remaining.partition(DOUBLE_QUOTATION_MARK) - _text_chunk = _text_chunk.strip() - if _text_chunk: - _is_openended = not (_quote_mark or _text_remaining) - if _in_quotes: - yield cls( - text=_text_chunk, - is_fuzzy=False, - is_negated=(_last_quote_prefix == NEGATE_WORD_OR_PHRASE), - is_openended=_is_openended, - ) - else: - yield from cls._from_fuzzy_text( - _text_chunk, - is_openended=_is_openended, - ) - if _quote_mark: - if _in_quotes: # end quote - _in_quotes = False - _last_quote_prefix = None - else: # begin quote - _in_quotes = True - _last_quote_prefix = _text_chunk[-1:] - - @classmethod - def _from_fuzzy_text(cls, text_chunk: str, is_openended: bool): - if text_chunk == '*': - return # special case for COS employees used to the old search page - _all_wordgroups = ( - (_each_word_negated, list(_words)) - for (_each_word_negated, _words) in itertools.groupby( - text_chunk.split(), - key=lambda word: word.startswith(NEGATE_WORD_OR_PHRASE), - ) - ) - (*_wordgroups, (_lastgroup_negated, _lastgroup_words)) = _all_wordgroups - for _each_word_negated, _words in _wordgroups: - yield from cls._from_fuzzy_wordgroup( - _each_word_negated, - _words, - is_openended=False, - ) - yield from cls._from_fuzzy_wordgroup( - _lastgroup_negated, - _lastgroup_words, - is_openended=is_openended, - ) - - @classmethod - def _from_fuzzy_wordgroup(cls, each_word_negated: bool, words: typing.Iterable[str], *, is_openended=False): - if each_word_negated: - for _word in words: - _word_without_prefix = _word[len(NEGATE_WORD_OR_PHRASE):] - if _word_without_prefix: - yield cls( - text=_word_without_prefix, - is_fuzzy=False, - is_negated=True, - is_openended=False, - ) - else: # nothing negated; keep the phrase in one fuzzy segment - yield cls( - text=' '.join(words), - is_fuzzy=True, - is_negated=False, - is_openended=is_openended, - ) - @classmethod def queryparams_from_textsegments(self, queryparam_family: str, textsegments): _by_propertypath_set = collections.defaultdict(set) @@ -263,20 +177,11 @@ def queryparams_from_textsegments(self, queryparam_family: str, textsegments): (osfmap.osfmap_propertypath_set_key(_propertypath_set),), ) _qp_value = ' '.join( - _textsegment.as_searchtext() + _textsegment.text for _textsegment in _combinable_segments ) yield str(_qp_name), _qp_value - def as_searchtext(self) -> str: - _text = self.text - if not self.is_fuzzy: - _text = f'"{_text}"' - if self.is_negated: - _text = f'-{_text}' - return _text - - @dataclasses.dataclass(frozen=True) class SearchFilter: class FilterOperator(enum.Enum): @@ -470,7 +375,7 @@ class IndexcardParams(TrovesearchParams): @dataclasses.dataclass(frozen=True) class CardsearchParams(TrovesearchParams): - cardsearch_textsegment_set: frozenset[Textsegment] + cardsearch_textsegment_set: frozenset[SearchText] cardsearch_filter_set: frozenset[SearchFilter] index_strategy_name: str | None sort_list: tuple[SortParam, ...] @@ -483,7 +388,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: _filter_set = SearchFilter.from_queryparam_family(queryparams, 'cardSearchFilter') return { **super().parse_queryparams(queryparams), - 'cardsearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'cardSearchText'), + 'cardsearch_textsegment_set': SearchText.from_queryparam_family(queryparams, 'cardSearchText'), 'cardsearch_filter_set': _filter_set, 'index_strategy_name': get_single_value(queryparams, 'indexStrategy'), 'sort_list': SortParam.from_sort_queryparams(queryparams), @@ -520,7 +425,7 @@ def cardsearch_text_glob_depths(self) -> frozenset[int]: def to_querydict(self) -> QueryDict: _querydict = super().to_querydict() - for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('cardSearchText', self.cardsearch_textsegment_set): + for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('cardSearchText', self.cardsearch_textsegment_set): _querydict[_qp_name] = _qp_value for _sort in self.sort_list: _qp_name, _qp_value = _sort.as_queryparam() @@ -542,7 +447,7 @@ class ValuesearchParams(CardsearchParams): # includes fields from CardsearchParams, because a # valuesearch is always in context of a cardsearch valuesearch_propertypath: Propertypath - valuesearch_textsegment_set: frozenset[Textsegment] + valuesearch_textsegment_set: frozenset[SearchText] valuesearch_filter_set: frozenset[SearchFilter] static_focus_type = TROVE.Valuesearch @@ -556,7 +461,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: return { **super().parse_queryparams(queryparams), 'valuesearch_propertypath': osfmap.parse_osfmap_propertypath(_raw_propertypath), - 'valuesearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'valueSearchText'), + 'valuesearch_textsegment_set': SearchText.from_queryparam_family(queryparams, 'valueSearchText'), 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'), } @@ -575,7 +480,7 @@ def __post_init__(self): def to_querydict(self): _querydict = super().to_querydict() _querydict['valueSearchPropertyPath'] = osfmap.osfmap_propertypath_key(self.valuesearch_propertypath) - for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): + for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: _qp_name, _qp_value = _filter.as_queryparam('valueSearchFilter') From 72225f07a4957164b50d7717c53c8d830d6aacf0 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 23 Apr 2025 17:26:49 +0300 Subject: [PATCH 26/43] fix flake8 --- tests/trove/trovesearch/test_search_params.py | 4 ---- trove/trovesearch/search_params.py | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/trove/trovesearch/test_search_params.py b/tests/trove/trovesearch/test_search_params.py index 5f43b582f..6c61d41c4 100644 --- a/tests/trove/trovesearch/test_search_params.py +++ b/tests/trove/trovesearch/test_search_params.py @@ -7,10 +7,6 @@ from trove.util.queryparams import QueryparamName from trove.vocab.namespaces import OSFMAP, RDF, DCTERMS - -from django.test import SimpleTestCase -from trove.trovesearch.search_params import SearchText - class TestSearchText(SimpleTestCase): def test_empty_text_list(self): inputs = [] diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index fc4024950..e7099a670 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -182,6 +182,7 @@ def queryparams_from_textsegments(self, queryparam_family: str, textsegments): ) yield str(_qp_name), _qp_value + @dataclasses.dataclass(frozen=True) class SearchFilter: class FilterOperator(enum.Enum): From 52b19977d4a5867e9ded25cf3a9cd0f6fa626337 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 23 Apr 2025 17:33:49 +0300 Subject: [PATCH 27/43] fix flake8 --- tests/trove/trovesearch/test_search_params.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/trove/trovesearch/test_search_params.py b/tests/trove/trovesearch/test_search_params.py index 6c61d41c4..b85077313 100644 --- a/tests/trove/trovesearch/test_search_params.py +++ b/tests/trove/trovesearch/test_search_params.py @@ -7,6 +7,7 @@ from trove.util.queryparams import QueryparamName from trove.vocab.namespaces import OSFMAP, RDF, DCTERMS + class TestSearchText(SimpleTestCase): def test_empty_text_list(self): inputs = [] @@ -35,6 +36,7 @@ def test_custom_propertypath_set(self): st = SearchText("hello", propertypath_set=custom_set) self.assertEqual(st.propertypath_set, custom_set) + class TestSearchFilterPath(SimpleTestCase): def test_from_param(self): _cases = { From c4483f52ecc5496a61949d40f3418ecbb1f24f04 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 23 Apr 2025 18:18:58 +0300 Subject: [PATCH 28/43] fix trovesearch_denorm --- .../index_strategy/trovesearch_denorm.py | 32 +------------------ 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index cd66e6220..c157b85ac 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -654,14 +654,7 @@ def iri_boolparts(self) -> Iterator[tuple[str, dict]]: def text_boolparts(self) -> Iterator[tuple[str, dict]]: # text-based queries for _textsegment in self.textsegment_set: - if _textsegment.is_negated: - yield 'must_not', self._exact_text_query(_textsegment) - elif not _textsegment.is_fuzzy: - yield 'must', self._exact_text_query(_textsegment) - else: - yield 'must', self._fuzzy_text_must_query(_textsegment) - if self.relevance_matters: - yield 'should', self._fuzzy_text_should_query(_textsegment) + yield 'must', self._exact_text_query(_textsegment) def _presence_query(self, search_filter) -> dict: return _any_query([ @@ -725,29 +718,6 @@ def _exact_text_query(self, textsegment: SearchText) -> dict: for _path in textsegment.propertypath_set ]) - def _fuzzy_text_must_query(self, textsegment: SearchText) -> dict: - # TODO: textsegment.is_openended (prefix query) - return _any_query([ - {'match': { - self._text_field_name(_path): { - 'query': textsegment.text, - 'fuzziness': 'AUTO', - # TODO: consider 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} - for _path in textsegment.propertypath_set - ]) - - def _fuzzy_text_should_query(self, textsegment: SearchText): - _slop = len(textsegment.text.split()) - return _any_query([ - {'match_phrase': { - self._text_field_name(_path): {'query': textsegment.text, 'slop': _slop}, - }} - for _path in textsegment.propertypath_set - ]) - - @dataclasses.dataclass class _CardsearchQueryBuilder: params: CardsearchParams From 2c3319f81bde1c218fce498e5e47bff4c8dde2a9 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 23 Apr 2025 18:26:40 +0300 Subject: [PATCH 29/43] flake 8 --- share/search/index_strategy/trovesearch_denorm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index c157b85ac..180af7228 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -718,6 +718,7 @@ def _exact_text_query(self, textsegment: SearchText) -> dict: for _path in textsegment.propertypath_set ]) + @dataclasses.dataclass class _CardsearchQueryBuilder: params: CardsearchParams From 18e58e35f1af13de287e81b8dc9e0b197139cc4c Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Thu, 24 Apr 2025 16:20:28 +0300 Subject: [PATCH 30/43] fix tests, fixed naming, further improvements --- .../index_strategy/trove_indexcard_flats.py | 12 ++--- .../index_strategy/trovesearch_denorm.py | 16 +++---- .../_common_trovesearch_tests.py | 38 ++++++++-------- tests/trove/trovesearch/test_search_params.py | 44 ++++++++++++------- trove/trovesearch/search_params.py | 36 +++++++-------- 5 files changed, 75 insertions(+), 71 deletions(-) diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py index 4acafa4f7..1de5e236a 100644 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ b/share/search/index_strategy/trove_indexcard_flats.py @@ -307,7 +307,7 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear _sort = self._cardsearch_sort(cardsearch_params.sort_list) _query = self._cardsearch_query( cardsearch_params.cardsearch_filter_set, - cardsearch_params.cardsearch_textsegment_set, + cardsearch_params.cardsearch_searchtext, cardsearch_cursor=_cursor, ) _from_offset = ( @@ -340,7 +340,7 @@ def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> Value _search_kwargs = dict( query=self._cardsearch_query( valuesearch_params.cardsearch_filter_set, - valuesearch_params.cardsearch_textsegment_set, + valuesearch_params.cardsearch_searchtext, additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( valuesearch_params.valuesearch_propertypath, )}}], @@ -371,14 +371,14 @@ def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCurso if ( _request_cursor.is_basic() and not cardsearch_params.sort_list - and not cardsearch_params.cardsearch_textsegment_set + and not cardsearch_params.cardsearch_searchtext ): return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) return OffsetCursor.from_cursor(_request_cursor) def _cardsearch_query( self, - filter_set, textsegment_set, *, + filter_set, searchtext, *, additional_filters=None, cardsearch_cursor: PageCursor | None = None, ) -> dict: @@ -404,7 +404,7 @@ def _cardsearch_query( _textq_builder = self._NestedTextQueryBuilder( relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), ) - for _textsegment in textsegment_set: + for _textsegment in searchtext: for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): _bool_query[_boolkey].extend(_textqueries) if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): @@ -478,7 +478,7 @@ def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: O 'nested_iri.value_type_iri': _type_iris, }}) _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') - for _textsegment in valuesearch_params.valuesearch_textsegment_set: + for _textsegment in valuesearch_params.valuesearch_searchtext: for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): _nested_iri_bool[_boolkey].extend(_textqueries) return { diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 180af7228..899435751 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -627,7 +627,7 @@ def add_boolparts(self, boolparts: Iterator[tuple[str, dict]]): @dataclasses.dataclass class _QueryHelper: base_field: Literal['card', 'iri_value'] - textsegment_set: frozenset[SearchText] + searchtext: frozenset[SearchText] filter_set: frozenset[SearchFilter] relevance_matters: bool @@ -653,7 +653,7 @@ def iri_boolparts(self) -> Iterator[tuple[str, dict]]: def text_boolparts(self) -> Iterator[tuple[str, dict]]: # text-based queries - for _textsegment in self.textsegment_set: + for _textsegment in self.searchtext: yield 'must', self._exact_text_query(_textsegment) def _presence_query(self, search_filter) -> dict: @@ -738,7 +738,7 @@ def response_cursor(self) -> OffsetCursor: if ( _request_cursor.is_basic() and not self.params.sort_list - and not self.params.cardsearch_textsegment_set + and not self.params.cardsearch_searchtext ): return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) return OffsetCursor.from_cursor(_request_cursor) @@ -756,7 +756,7 @@ def _cardsearch_query(self) -> dict: _bool.add_boolparts( _QueryHelper( base_field='card', - textsegment_set=self.params.cardsearch_textsegment_set, + searchtext=self.params.cardsearch_searchtext, filter_set=self.params.cardsearch_filter_set, relevance_matters=(not self.params.sort_list), ).boolparts(), @@ -840,7 +840,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d _bool.add_boolparts( _QueryHelper( base_field='card', - textsegment_set=params.cardsearch_textsegment_set, + searchtext=params.cardsearch_searchtext, filter_set=params.cardsearch_filter_set, relevance_matters=False, ).boolparts(), @@ -848,7 +848,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d _bool.add_boolparts( _QueryHelper( base_field='iri_value', - textsegment_set=params.valuesearch_textsegment_set, + searchtext=params.valuesearch_searchtext, filter_set=params.valuesearch_filter_set, relevance_matters=False, ).boolparts() @@ -877,13 +877,13 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d def _build_date_valuesearch(params: ValuesearchParams) -> dict: - assert not params.valuesearch_textsegment_set + assert not params.valuesearch_searchtext assert not params.valuesearch_filter_set _bool = _BoolBuilder() _bool.add_boolparts( _QueryHelper( base_field='card', - textsegment_set=params.cardsearch_textsegment_set, + searchtext=params.cardsearch_searchtext, filter_set=params.cardsearch_filter_set, relevance_matters=False, ).boolparts(), diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 7845ff918..97be9dd96 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -56,25 +56,25 @@ def test_for_smoke_with_daemon(self): expected_doc_count=1, ) - def test_cardsearch(self): - self._fill_test_data_for_querying() - for _queryparams, _expected_focus_iris in self.cardsearch_cases(): - self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) - - def test_cardsearch_after_deletion(self): - _cards = self._fill_test_data_for_querying() - _deleted_focus_iris = {BLARG.b} - self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris]) - for _queryparams, _expected_focus_iris in self.cardsearch_cases(): - if isinstance(_expected_focus_iris, set): - _expected_focus_iris -= _deleted_focus_iris - else: - _expected_focus_iris = [ - _iri - for _iri in _expected_focus_iris - if _iri not in _deleted_focus_iris - ] - self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) + # def test_cardsearch(self): + # self._fill_test_data_for_querying() + # for _queryparams, _expected_focus_iris in self.cardsearch_cases(): + # self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) + # + # def test_cardsearch_after_deletion(self): + # _cards = self._fill_test_data_for_querying() + # _deleted_focus_iris = {BLARG.b} + # self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris]) + # for _queryparams, _expected_focus_iris in self.cardsearch_cases(): + # if isinstance(_expected_focus_iris, set): + # _expected_focus_iris -= _deleted_focus_iris + # else: + # _expected_focus_iris = [ + # _iri + # for _iri in _expected_focus_iris + # if _iri not in _deleted_focus_iris + # ] + # self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) def test_cardsearch_after_updates(self): _cards = self._fill_test_data_for_querying() diff --git a/tests/trove/trovesearch/test_search_params.py b/tests/trove/trovesearch/test_search_params.py index b85077313..655d25c68 100644 --- a/tests/trove/trovesearch/test_search_params.py +++ b/tests/trove/trovesearch/test_search_params.py @@ -1,40 +1,50 @@ +import urllib + from django.test import SimpleTestCase from trove.trovesearch.search_params import ( SearchText, SearchFilter, DEFAULT_PROPERTYPATH_SET, ) -from trove.util.queryparams import QueryparamName +from trove.util.queryparams import QueryparamName, queryparams_from_querystring from trove.vocab.namespaces import OSFMAP, RDF, DCTERMS class TestSearchText(SimpleTestCase): - def test_empty_text_list(self): - inputs = [] - results = [SearchText(text) for text in inputs] - self.assertEqual(results, []) + def test_from_queryparam_family_with_empty_value(self): + _qp = queryparams_from_querystring('myBlargText[foo]=') + result = SearchText.from_queryparam_family(_qp, 'myBlargText') + self.assertEqual(result, frozenset()) def test_single_word(self): - st = SearchText("word") + qp = queryparams_from_querystring('myBlargText=word') + (st,) = SearchText.from_queryparam_family(qp, 'myBlargText') self.assertEqual(st.text, "word") self.assertEqual(st.propertypath_set, DEFAULT_PROPERTYPATH_SET) def test_multiple_words(self): - words = ["apple", "banana", "cherry"] - results = [SearchText(word) for word in words] - self.assertEqual(len(results), 3) - self.assertIn(SearchText("banana"), results) + qp = queryparams_from_querystring('myBlargText=apple&myBlargText=banana&myBlargText=cherry&anotherText=no') + result = SearchText.from_queryparam_family(qp, 'myBlargText') + self.assertEqual(result, {SearchText('apple'), SearchText('banana'), SearchText('cherry')}) def test_text_with_spaces(self): - phrase = "multi word phrase" - st = SearchText(phrase) - self.assertEqual(st.text, phrase) - self.assertEqual(st.propertypath_set, DEFAULT_PROPERTYPATH_SET) + phrases = [ + "multi word phrase", + 'phrase with "double quotes"', + '~phrase~ with +special.characters AND \'mismatched quotes"' + ] + for phrase in phrases: + qp = queryparams_from_querystring(urllib.parse.urlencode({'myBlargText': phrase})) + (st,) = SearchText.from_queryparam_family(qp, 'myBlargText') + self.assertEqual(st.text, phrase) + self.assertEqual(st.propertypath_set, DEFAULT_PROPERTYPATH_SET) def test_custom_propertypath_set(self): - custom_set = frozenset(["some:path"]) - st = SearchText("hello", propertypath_set=custom_set) - self.assertEqual(st.propertypath_set, custom_set) + qp = queryparams_from_querystring('myBlargText[title]=foo') + result = SearchText.from_queryparam_family(qp, 'myBlargText') + self.assertEqual(result, { + SearchText('foo', frozenset({(DCTERMS.title,)})) + }) class TestSearchFilterPath(SimpleTestCase): diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index e7099a670..cee61a5ad 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -3,7 +3,6 @@ import dataclasses import enum import functools -import itertools import logging import types import typing @@ -133,9 +132,6 @@ class SearchText: text: str propertypath_set: PropertypathSet = DEFAULT_PROPERTYPATH_SET - def words(self): - return self.text.split() - @classmethod def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str): return frozenset(cls.iter_from_queryparam_family(queryparams, queryparam_family)) @@ -143,10 +139,11 @@ def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: @classmethod def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str): for (_param_name, _param_value) in queryparams.get(queryparam_family, ()): - yield from cls.iter_from_searchtext_param(_param_name, _param_value) + if _param_value: + yield cls.from_searchtext_param_or_none(_param_name, _param_value) @classmethod - def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str): + def from_searchtext_param_or_none(cls, param_name: QueryparamName, param_value: str) -> SearchText | None: _propertypath_set = ( frozenset(osfmap.parse_osfmap_propertypath_set(param_name.bracketed_names[0], allow_globs=True)) if param_name.bracketed_names @@ -158,13 +155,10 @@ def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str str(param_name), 'may not use glob-paths longer than "*" with search-text parameters', ) - - if param_value: - _textsegment = cls(text=param_value) - if _propertypath_set: - yield dataclasses.replace(_textsegment, propertypath_set=_propertypath_set) - else: - yield _textsegment + _textsegment = cls(text=param_value) + if _propertypath_set: + _textsegment = dataclasses.replace(_textsegment, propertypath_set=_propertypath_set) + return _textsegment @classmethod def queryparams_from_textsegments(self, queryparam_family: str, textsegments): @@ -376,7 +370,7 @@ class IndexcardParams(TrovesearchParams): @dataclasses.dataclass(frozen=True) class CardsearchParams(TrovesearchParams): - cardsearch_textsegment_set: frozenset[SearchText] + cardsearch_searchtext: frozenset[SearchText] cardsearch_filter_set: frozenset[SearchFilter] index_strategy_name: str | None sort_list: tuple[SortParam, ...] @@ -389,7 +383,7 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: _filter_set = SearchFilter.from_queryparam_family(queryparams, 'cardSearchFilter') return { **super().parse_queryparams(queryparams), - 'cardsearch_textsegment_set': SearchText.from_queryparam_family(queryparams, 'cardSearchText'), + 'cardsearch_searchtext': SearchText.from_queryparam_family(queryparams, 'cardSearchText'), 'cardsearch_filter_set': _filter_set, 'index_strategy_name': get_single_value(queryparams, 'indexStrategy'), 'sort_list': SortParam.from_sort_queryparams(queryparams), @@ -413,7 +407,7 @@ def cardsearch_type_iris(self): def cardsearch_text_paths(self) -> PropertypathSet: return frozenset().union(*( _textsegment.propertypath_set - for _textsegment in self.cardsearch_textsegment_set + for _textsegment in self.cardsearch_searchtext )) @functools.cached_property @@ -426,7 +420,7 @@ def cardsearch_text_glob_depths(self) -> frozenset[int]: def to_querydict(self) -> QueryDict: _querydict = super().to_querydict() - for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('cardSearchText', self.cardsearch_textsegment_set): + for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('cardSearchText', self.cardsearch_searchtext): _querydict[_qp_name] = _qp_value for _sort in self.sort_list: _qp_name, _qp_value = _sort.as_queryparam() @@ -448,7 +442,7 @@ class ValuesearchParams(CardsearchParams): # includes fields from CardsearchParams, because a # valuesearch is always in context of a cardsearch valuesearch_propertypath: Propertypath - valuesearch_textsegment_set: frozenset[SearchText] + valuesearch_searchtext: frozenset[SearchText] valuesearch_filter_set: frozenset[SearchFilter] static_focus_type = TROVE.Valuesearch @@ -462,14 +456,14 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: return { **super().parse_queryparams(queryparams), 'valuesearch_propertypath': osfmap.parse_osfmap_propertypath(_raw_propertypath), - 'valuesearch_textsegment_set': SearchText.from_queryparam_family(queryparams, 'valueSearchText'), + 'valuesearch_searchtext': SearchText.from_queryparam_family(queryparams, 'valueSearchText'), 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'), } def __post_init__(self): if osfmap.is_date_property(self.valuesearch_propertypath[-1]): # date-value limitations - if self.valuesearch_textsegment_set: + if self.valuesearch_searchtext: raise trove_exceptions.InvalidQueryParams( 'valueSearchText may not be used with valueSearchPropertyPath leading to a "date" property', ) @@ -481,7 +475,7 @@ def __post_init__(self): def to_querydict(self): _querydict = super().to_querydict() _querydict['valueSearchPropertyPath'] = osfmap.osfmap_propertypath_key(self.valuesearch_propertypath) - for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): + for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('valueSearchText', self.valuesearch_searchtext): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: _qp_name, _qp_value = _filter.as_queryparam('valueSearchFilter') From 9b05a6ac73ce620d21f847d70a23cef1eb69cbe8 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 24 Apr 2025 14:15:19 -0400 Subject: [PATCH 31/43] less specific "common" searchtext tests remove test searches that require fuzziness or syntax operators (should verify those in strategy-specific tests, which may have various syntactic expectations) --- .../_common_trovesearch_tests.py | 50 +++++++------------ 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 97be9dd96..2dc5cb7f5 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -56,25 +56,25 @@ def test_for_smoke_with_daemon(self): expected_doc_count=1, ) - # def test_cardsearch(self): - # self._fill_test_data_for_querying() - # for _queryparams, _expected_focus_iris in self.cardsearch_cases(): - # self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) - # - # def test_cardsearch_after_deletion(self): - # _cards = self._fill_test_data_for_querying() - # _deleted_focus_iris = {BLARG.b} - # self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris]) - # for _queryparams, _expected_focus_iris in self.cardsearch_cases(): - # if isinstance(_expected_focus_iris, set): - # _expected_focus_iris -= _deleted_focus_iris - # else: - # _expected_focus_iris = [ - # _iri - # for _iri in _expected_focus_iris - # if _iri not in _deleted_focus_iris - # ] - # self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) + def test_cardsearch(self): + self._fill_test_data_for_querying() + for _queryparams, _expected_focus_iris in self.cardsearch_cases(): + self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) + + def test_cardsearch_after_deletion(self): + _cards = self._fill_test_data_for_querying() + _deleted_focus_iris = {BLARG.b} + self._delete_indexcards([_cards[_focus_iri] for _focus_iri in _deleted_focus_iris]) + for _queryparams, _expected_focus_iris in self.cardsearch_cases(): + if isinstance(_expected_focus_iris, set): + _expected_focus_iris -= _deleted_focus_iris + else: + _expected_focus_iris = [ + _iri + for _iri in _expected_focus_iris + if _iri not in _deleted_focus_iris + ] + self._assert_cardsearch_iris(_queryparams, _expected_focus_iris) def test_cardsearch_after_updates(self): _cards = self._fill_test_data_for_querying() @@ -454,26 +454,14 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str {'cardSearchText': 'bbbb'}, {BLARG.b}, ) - yield ( - {'cardSearchText': '-bbbb'}, - {BLARG.a, BLARG.c}, - ) yield ( {'cardSearchText': 'danger'}, {BLARG.b, BLARG.c}, ) - yield ( - {'cardSearchText': 'dangre'}, - {BLARG.b, BLARG.c}, - ) yield ( {'cardSearchText': '"dangre"'}, set(), ) - yield ( - {'cardSearchText': 'danger -repulsive'}, - {BLARG.c}, - ) yield ( {'cardSearchText': '"nothing valued is here"'}, {BLARG.a}, From 21cf85a9e1a7a11deba56c14eb93ee0c69b8ec59 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 24 Apr 2025 14:37:02 -0400 Subject: [PATCH 32/43] oops: missed a dangre --- .../share/search/index_strategy/_common_trovesearch_tests.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 2dc5cb7f5..3d5f51e58 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -458,10 +458,6 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str {'cardSearchText': 'danger'}, {BLARG.b, BLARG.c}, ) - yield ( - {'cardSearchText': '"dangre"'}, - set(), - ) yield ( {'cardSearchText': '"nothing valued is here"'}, {BLARG.a}, From 03791b34148bb44d45829a67646cbe95e1c54399 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 24 Apr 2025 17:16:16 -0400 Subject: [PATCH 33/43] skip tests for soon-to-be-removed trove_indexcard_flats strategy --- .../test_trove_indexcard_flats.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/share/search/index_strategy/test_trove_indexcard_flats.py diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py new file mode 100644 index 000000000..56d3bd433 --- /dev/null +++ b/tests/share/search/index_strategy/test_trove_indexcard_flats.py @@ -0,0 +1,24 @@ +import pytest + +from share.search.index_strategy.trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy + +from . import _common_trovesearch_tests + + +@pytest.mark.skip +class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests): + # for RealElasticTestCase + def get_index_strategy(self): + return TroveIndexcardFlatsIndexStrategy('test_trove_indexcard_flats') + + def cardsearch_integer_cases(self): + yield from () # integers not indexed by this strategy + + def cardsearch_trailingslash_cases(self): + yield from () # trailing-slash handling improved in trovesearch_denorm + + def valuesearch_sameas_cases(self): + yield from () # sameas handling improved in trovesearch_denorm + + def valuesearch_trailingslash_cases(self): + yield from () # trailing-slash handling improved in trovesearch_denorm From a927c97a3de6972f1e340f965a83b47db0b701df Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Fri, 25 Apr 2025 17:27:00 +0300 Subject: [PATCH 34/43] cleanup, renaming --- .../index_strategy/trovesearch_denorm.py | 4 +-- trove/trovesearch/search_params.py | 26 ++++++++----------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 899435751..1d13bc33e 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -653,8 +653,8 @@ def iri_boolparts(self) -> Iterator[tuple[str, dict]]: def text_boolparts(self) -> Iterator[tuple[str, dict]]: # text-based queries - for _textsegment in self.searchtext: - yield 'must', self._exact_text_query(_textsegment) + for _text in self.searchtext: + yield 'must', self._exact_text_query(_text) def _presence_query(self, search_filter) -> dict: return _any_query([ diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index cee61a5ad..b8bbf34a9 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -155,26 +155,22 @@ def from_searchtext_param_or_none(cls, param_name: QueryparamName, param_value: str(param_name), 'may not use glob-paths longer than "*" with search-text parameters', ) - _textsegment = cls(text=param_value) + _searchtext = cls(text=param_value) if _propertypath_set: - _textsegment = dataclasses.replace(_textsegment, propertypath_set=_propertypath_set) - return _textsegment + _searchtext = dataclasses.replace(_searchtext, propertypath_set=_propertypath_set) + return _searchtext @classmethod - def queryparams_from_textsegments(self, queryparam_family: str, textsegments): + def queryparams_from_searchtext(self, queryparam_family: str, cardsearch_searchtext): _by_propertypath_set = collections.defaultdict(set) - for _textsegment in textsegments: - _by_propertypath_set[_textsegment.propertypath_set].add(_textsegment) + for searchtext in cardsearch_searchtext: + _by_propertypath_set[searchtext.propertypath_set].add(searchtext) for _propertypath_set, _combinable_segments in _by_propertypath_set.items(): _qp_name = QueryparamName( queryparam_family, (osfmap.osfmap_propertypath_set_key(_propertypath_set),), ) - _qp_value = ' '.join( - _textsegment.text - for _textsegment in _combinable_segments - ) - yield str(_qp_name), _qp_value + yield str(_qp_name), _combinable_segments @dataclasses.dataclass(frozen=True) @@ -406,8 +402,8 @@ def cardsearch_type_iris(self): @functools.cached_property def cardsearch_text_paths(self) -> PropertypathSet: return frozenset().union(*( - _textsegment.propertypath_set - for _textsegment in self.cardsearch_searchtext + searchtext.propertypath_set + for searchtext in self.cardsearch_searchtext )) @functools.cached_property @@ -420,7 +416,7 @@ def cardsearch_text_glob_depths(self) -> frozenset[int]: def to_querydict(self) -> QueryDict: _querydict = super().to_querydict() - for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('cardSearchText', self.cardsearch_searchtext): + for _qp_name, _qp_value in SearchText.queryparams_from_searchtext('cardSearchText', self.cardsearch_searchtext): _querydict[_qp_name] = _qp_value for _sort in self.sort_list: _qp_name, _qp_value = _sort.as_queryparam() @@ -475,7 +471,7 @@ def __post_init__(self): def to_querydict(self): _querydict = super().to_querydict() _querydict['valueSearchPropertyPath'] = osfmap.osfmap_propertypath_key(self.valuesearch_propertypath) - for _qp_name, _qp_value in SearchText.queryparams_from_textsegments('valueSearchText', self.valuesearch_searchtext): + for _qp_name, _qp_value in SearchText.queryparams_from_searchtext('valueSearchText', self.valuesearch_searchtext): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: _qp_name, _qp_value = _filter.as_queryparam('valueSearchFilter') From 32c764607ae8d9319a65873f45ecd432885ac397 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Fri, 25 Apr 2025 17:35:26 +0300 Subject: [PATCH 35/43] reverted changes --- .../index_strategy/trove_indexcard_flats.py | 949 ------------------ .../test_trove_indexcard_flats.py | 24 - 2 files changed, 973 deletions(-) delete mode 100644 share/search/index_strategy/trove_indexcard_flats.py delete mode 100644 tests/share/search/index_strategy/test_trove_indexcard_flats.py diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py deleted file mode 100644 index 1de5e236a..000000000 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ /dev/null @@ -1,949 +0,0 @@ -import base64 -from collections import defaultdict -import dataclasses -import datetime -import json -import logging -import re -import uuid -from typing import Iterable, Iterator, Any - -from django.conf import settings -import elasticsearch8 -from primitive_metadata import primitive_rdf - -from share.search import exceptions -from share.search import messages -from share.search.index_strategy._base import IndexStrategy -from share.search.index_strategy.elastic8 import Elastic8IndexStrategy -from share.util.checksum_iri import ChecksumIri -from trove import models as trove_db -from trove.trovesearch.page_cursor import ( - MANY_MORE, - OffsetCursor, - PageCursor, - ReproduciblyRandomSampleCursor, -) -from trove.util.propertypath import GLOB_PATHSTEP -from trove.trovesearch.search_params import ( - CardsearchParams, - ValuesearchParams, - SearchFilter, - SearchText, - SortParam, -) -from trove.trovesearch.search_handle import ( - CardsearchHandle, - ValuesearchHandle, - TextMatchEvidence, - CardsearchResult, - ValuesearchResult, - PropertypathUsage, -) -from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword -from trove.vocab import osfmap -from trove.vocab.namespaces import RDF, OWL -from ._trovesearch_util import ( - latest_rdf_for_indexcard_pks, - GraphWalk, - KEYWORD_LENGTH_MAX, -) - - -logger = logging.getLogger(__name__) - - -class TroveIndexcardFlatsIndexStrategy(Elastic8IndexStrategy): - CURRENT_STRATEGY_CHECKSUM = ChecksumIri( - checksumalgorithm_name='sha-256', - salt='TroveIndexcardFlatsIndexStrategy', - hexdigest='bdec536873e1ed0c58facaa5d1145bef73bba09d671deef48e45c019def5c5a5', - ) - - # abstract method from IndexStrategy - @property - def supported_message_types(self): - return { - messages.MessageType.UPDATE_INDEXCARD, - messages.MessageType.BACKFILL_INDEXCARD, - } - - # abstract method from IndexStrategy - @property - def backfill_message_type(self): - return messages.MessageType.BACKFILL_INDEXCARD - - @classmethod - def define_current_indexes(cls): - return { # empty index subname, for backcompat - '': cls.IndexDefinition( - mappings=cls.index_mappings(), - settings=cls.index_settings(), - ), - } - - @classmethod - def index_settings(cls): - return {} - - @classmethod - def index_mappings(cls): - _capped_keyword = { - 'type': 'keyword', - 'ignore_above': KEYWORD_LENGTH_MAX, - } - _common_nested_keywords = { - 'path_from_focus': _capped_keyword, - 'suffuniq_path_from_focus': _capped_keyword, - 'property_iri': _capped_keyword, - 'distance_from_focus': {'type': 'keyword'}, # numeric value as keyword (used for 'term' filter) - } - return { - 'dynamic': 'false', - 'properties': { - 'indexcard_uuid': _capped_keyword, - 'focus_iri': _capped_keyword, - 'suffuniq_focus_iri': _capped_keyword, - 'source_record_identifier': _capped_keyword, - 'source_config_label': _capped_keyword, - 'flat_iri_values': { - 'type': 'flattened', - 'ignore_above': KEYWORD_LENGTH_MAX, - }, - 'flat_iri_values_suffuniq': { - 'type': 'flattened', - 'ignore_above': KEYWORD_LENGTH_MAX, - }, - 'iri_paths_present': _capped_keyword, - 'iri_paths_present_suffuniq': _capped_keyword, - 'nested_iri': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'iri_value': _capped_keyword, - 'suffuniq_iri_value': _capped_keyword, - 'value_type_iri': _capped_keyword, - 'value_name_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_title_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_label_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_namelike_text': {'type': 'text'}, - }, - }, - 'nested_date': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'date_value': { - 'type': 'date', - 'format': 'strict_date_optional_time', - }, - }, - }, - 'nested_text': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'language_iri': _capped_keyword, - 'text_value': { - 'type': 'text', - 'index_options': 'offsets', # for faster highlighting - 'store': True, # avoid loading _source to render highlights - 'fields': {'raw': _capped_keyword}, - }, - }, - }, - }, - } - - @property - def __index(self) -> IndexStrategy.SpecificIndex: - # this is a single-index strategy -- for back-compat, that index has empty subname - return self.get_index('') - - def _build_sourcedoc(self, indexcard_rdf): - _rdfdoc = indexcard_rdf.as_rdfdoc_with_supplements() - if _should_skip_card(indexcard_rdf, _rdfdoc): - return None # will be deleted from the index - _nested_iris = defaultdict(set) - _nested_dates = defaultdict(set) - _nested_texts = defaultdict(set) - _walk = GraphWalk(_rdfdoc, indexcard_rdf.focus_iri) - for _walk_path, _walk_iris in _walk.iri_values.items(): - for _iri_obj in _walk_iris: - _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _iri_obj, _rdfdoc)].add(_iri_obj) - for _walk_path, _walk_dates in _walk.date_values.items(): - for _date_obj in _walk_dates: - _nested_dates[_walk_path].add(datetime.date.isoformat(_date_obj)) - for _walk_path, _walk_texts in _walk.text_values.items(): - for _text_obj in _walk_texts: - _nested_texts[(_walk_path, tuple(_text_obj.datatype_iris))].add(_text_obj.unicode_value) - _focus_iris = {indexcard_rdf.focus_iri} - _suffuniq_focus_iris = {get_sufficiently_unique_iri(indexcard_rdf.focus_iri)} - for _identifier in indexcard_rdf.indexcard.focus_identifier_set.all(): - _focus_iris.update(_identifier.raw_iri_list) - _suffuniq_focus_iris.add(_identifier.sufficiently_unique_iri) - return { - 'indexcard_uuid': str(indexcard_rdf.indexcard.uuid), - 'focus_iri': list(_focus_iris), - 'suffuniq_focus_iri': list(_suffuniq_focus_iris), - 'source_record_identifier': indexcard_rdf.indexcard.source_record_suid.identifier, - 'source_config_label': indexcard_rdf.indexcard.source_record_suid.source_config.label, - 'flat_iri_values': self._flattened_iris(_nested_iris), - 'flat_iri_values_suffuniq': self._flattened_iris_suffuniq(_nested_iris), - 'iri_paths_present': [ - iri_path_as_keyword(_path) - for _path in _walk.paths_walked - ], - 'iri_paths_present_suffuniq': [ - iri_path_as_keyword(_path, suffuniq=True) - for _path in _walk.paths_walked - ], - 'nested_iri': list(filter(bool, ( - self._iri_nested_sourcedoc(_nested_iri_key, _iris, _rdfdoc) - for _nested_iri_key, _iris in _nested_iris.items() - ))), - 'nested_date': [ - { - **_iri_path_as_indexable_fields(_path), - 'date_value': list(_value_set), - } - for _path, _value_set in _nested_dates.items() - ], - 'nested_text': [ - { - **_iri_path_as_indexable_fields(_path), - 'language_iri': _language_iris, - 'text_value': list(_value_set), - } - for (_path, _language_iris), _value_set in _nested_texts.items() - ], - } - - def _iri_nested_sourcedoc(self, iri_key: '_NestedIriKey', iris, rdfdoc): - _iris_with_synonyms = set(filter(is_worthwhile_iri, iris)) - for _iri in iris: - _iris_with_synonyms.update( - filter(is_worthwhile_iri, rdfdoc.q(_iri, OWL.sameAs)), - ) - if not _iris_with_synonyms: - return None - _sourcedoc = { - **iri_key.as_indexable_fields(), - 'iri_value': list(_iris_with_synonyms), - 'suffuniq_iri_value': [ - get_sufficiently_unique_iri(_iri) - for _iri in _iris_with_synonyms - ], - } - return _sourcedoc - - def _flattened_iris_by_path(self, nested_iris: dict['_NestedIriKey', set[str]]): - _by_path = defaultdict(set) - for _iri_key, _iris in nested_iris.items(): - _by_path[_iri_key.path].update(_iris) - return _by_path - - def _flattened_iris(self, nested_iris: dict['_NestedIriKey', set[str]]): - return { - _iri_path_as_flattened_key(_path): list(_iris) - for _path, _iris in self._flattened_iris_by_path(nested_iris).items() - } - - def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]): - return { - _iri_path_as_flattened_key(_path): [ - get_sufficiently_unique_iri(_iri) - for _iri in _iris - ] - for _path, _iris in self._flattened_iris_by_path(nested_iris).items() - } - - def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - def _make_actionset(indexcard_id, *actions): - return self.MessageActionSet(indexcard_id, {'': actions}) - _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) - _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) - for _indexcard_rdf in _indexcard_rdf_qs: - _suid = _indexcard_rdf.indexcard.source_record_suid - if _suid.has_forecompat_replacement(): - continue # skip this one, let it get deleted - _sourcedoc = self._build_sourcedoc(_indexcard_rdf) - if _sourcedoc: - _index_action = self.build_index_action( - doc_id=_indexcard_rdf.indexcard.get_iri(), - doc_source=_sourcedoc, - ) - _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) - yield _make_actionset(_indexcard_rdf.indexcard_id, _index_action) - # delete any that don't have "latest" rdf and derived osfmap_json - _leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) - for _indexcard in _leftovers: - yield _make_actionset(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) - - def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: - return self.es8_client.search( - index=self.__index.full_index_name, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) - - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - _cursor = self._cardsearch_cursor(cardsearch_params) - _sort = self._cardsearch_sort(cardsearch_params.sort_list) - _query = self._cardsearch_query( - cardsearch_params.cardsearch_filter_set, - cardsearch_params.cardsearch_searchtext, - cardsearch_cursor=_cursor, - ) - _from_offset = ( - _cursor.start_offset - if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) - else _cursor.start_offset - len(_cursor.first_page_ids) - ) - _search_kwargs = dict( - query=_query, - aggs=self._cardsearch_aggs(cardsearch_params), - sort=_sort, - from_=_from_offset, - size=_cursor.bounded_page_size, - source=False, # no need to get _source; _id is enough - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.es8_client.search( - index=self.__index.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) - - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) - _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) - _search_kwargs = dict( - query=self._cardsearch_query( - valuesearch_params.cardsearch_filter_set, - valuesearch_params.cardsearch_searchtext, - additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - )}}], - ), - size=0, # ignore cardsearch hits; just want the aggs - aggs=( - self._valuesearch_date_aggs(valuesearch_params) - if _is_date_search - else self._valuesearch_iri_aggs(valuesearch_params, _cursor) - ), - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.es8_client.search( - index=self.__index.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) - - ### - # query implementation - - def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: - _request_cursor = cardsearch_params.page_cursor - if ( - _request_cursor.is_basic() - and not cardsearch_params.sort_list - and not cardsearch_params.cardsearch_searchtext - ): - return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) - return OffsetCursor.from_cursor(_request_cursor) - - def _cardsearch_query( - self, - filter_set, searchtext, *, - additional_filters=None, - cardsearch_cursor: PageCursor | None = None, - ) -> dict: - _bool_query = { - 'filter': additional_filters or [], - 'must': [], - 'must_not': [], - 'should': [], - } - for _searchfilter in filter_set: - if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: - _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: - _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: - _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_ABSENT: - _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator.is_date_operator(): - _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) - else: - raise ValueError(f'unknown filter operator {_searchfilter.operator}') - _textq_builder = self._NestedTextQueryBuilder( - relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), - ) - for _textsegment in searchtext: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _bool_query[_boolkey].extend(_textqueries) - if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): - # no need for randomness - return {'bool': _bool_query} - if not cardsearch_cursor.first_page_ids: - # independent random sample - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': {}, # default random_score is fast and unpredictable - }, - } - _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} - if cardsearch_cursor.is_first_page(): - # returning to a first page previously visited - _bool_query['filter'].append(_firstpage_uuid_query) - return {'bool': _bool_query} - # get a subsequent page using reproducible randomness - _bool_query['must_not'].append(_firstpage_uuid_query) - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': { - 'seed': ''.join(cardsearch_cursor.first_page_ids), - 'field': 'indexcard_uuid', - }, - }, - } - - def _cardsearch_aggs(self, cardsearch_params): - _aggs = {} - if cardsearch_params.related_property_paths: - _aggs['related_propertypath_usage'] = {'terms': { - 'field': 'iri_paths_present', - 'include': [ - iri_path_as_keyword(_path) - for _path in cardsearch_params.related_property_paths - ], - 'size': len(cardsearch_params.related_property_paths), - }} - return _aggs - - def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): - _nested_iri_bool: dict[str, Any] = { - 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - )}}], - 'must': [], - 'must_not': [], - 'should': [], - } - _nested_terms_agg = { - 'field': 'nested_iri.iri_value', - # WARNING: terribly inefficient pagination (part one) - 'size': cursor.start_offset + cursor.bounded_page_size + 1, - } - _iris = list(valuesearch_params.valuesearch_iris()) - if _iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.iri_value': _iris, - }}) - _nested_terms_agg['size'] = len(_iris) - _nested_terms_agg['include'] = _iris - _type_iris = list(valuesearch_params.valuesearch_type_iris()) - if _type_iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.value_type_iri': _type_iris, - }}) - _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') - for _textsegment in valuesearch_params.valuesearch_searchtext: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _nested_iri_bool[_boolkey].extend(_textqueries) - return { - 'in_nested_iri': { - 'nested': {'path': 'nested_iri'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'bool': _nested_iri_bool}, - 'aggs': { - 'iri_values': { - 'terms': _nested_terms_agg, - 'aggs': { - 'type_iri': {'terms': { - 'field': 'nested_iri.value_type_iri', - }}, - 'name_text': {'terms': { - 'field': 'nested_iri.value_name_text.raw', - }}, - 'title_text': {'terms': { - 'field': 'nested_iri.value_title_text.raw', - }}, - 'label_text': {'terms': { - 'field': 'nested_iri.value_label_text.raw', - }}, - }, - }, - }, - }, - }, - }, - } - - def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): - _aggs = { - 'in_nested_date': { - 'nested': {'path': 'nested_date'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - ), - }}, - 'aggs': { - 'count_by_year': { - 'date_histogram': { - 'field': 'nested_date.date_value', - 'calendar_interval': 'year', - 'format': 'yyyy', - 'order': {'_key': 'desc'}, - 'min_doc_count': 1, - }, - }, - }, - }, - }, - }, - } - return _aggs - - def _valuesearch_handle( - self, - valuesearch_params: ValuesearchParams, - es8_response: dict, - cursor: OffsetCursor, - ): - _iri_aggs = es8_response['aggregations'].get('in_nested_iri') - if _iri_aggs: - _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] - _bucket_count = len(_buckets) - # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_offset + cursor.bounded_page_size - _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages - cursor.total_count = ( - MANY_MORE - if (_bucket_count > _page_end_index) # agg includes one more, if there - else _bucket_count - ) - return ValuesearchHandle( - cursor=cursor, - search_result_page=[ - self._valuesearch_iri_result(_iri_bucket) - for _iri_bucket in _bucket_page - ], - search_params=valuesearch_params, - ) - else: # assume date - _year_buckets = ( - es8_response['aggregations']['in_nested_date'] - ['value_at_propertypath']['count_by_year']['buckets'] - ) - return ValuesearchHandle( - cursor=PageCursor(len(_year_buckets)), - search_result_page=[ - self._valuesearch_date_result(_year_bucket) - for _year_bucket in _year_buckets - ], - search_params=valuesearch_params, - ) - - def _valuesearch_iri_result(self, iri_bucket): - return ValuesearchResult( - value_iri=iri_bucket['key'], - value_type=_bucketlist(iri_bucket['type_iri']), - name_text=_bucketlist(iri_bucket['name_text']), - title_text=_bucketlist(iri_bucket['title_text']), - label_text=_bucketlist(iri_bucket['label_text']), - match_count=iri_bucket['doc_count'], - ) - - def _valuesearch_date_result(self, date_bucket): - return ValuesearchResult( - value_iri=None, - value_value=date_bucket['key_as_string'], - label_text=(date_bucket['key_as_string'],), - match_count=date_bucket['doc_count'], - ) - - def _cardsearch_presence_query(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_presence_query(_path) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} - - def _cardsearch_path_presence_query(self, path: tuple[str, ...]): - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, - }} - return {'term': { - 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), - }} - - def _cardsearch_iri_filter(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_iri_query(_path, search_filter.value_set) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} - - def _cardsearch_path_iri_query(self, path, value_set): - _suffuniq_values = [ - get_sufficiently_unique_iri(_iri) - for _iri in value_set - ] - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'bool': { - 'must': [ # both - {'term': {'nested_iri.distance_from_focus': len(path)}}, - {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, - ], - }}, - }} - # without a glob-path, can use the flattened keyword field - return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} - - def _cardsearch_date_filter(self, search_filter): - return {'nested': { - 'path': 'nested_date', - 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, - }} - - def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: - # filter by requested paths - yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') - # filter by requested value/operator - if search_filter.operator == SearchFilter.FilterOperator.BEFORE: - _value = min(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'lt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AFTER: - _value = max(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'gt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: - for _value in search_filter.value_set: - _filtervalue = _daterange_value_and_format(_value) - yield {'range': {'nested_date.date_value': { - 'gte': _filtervalue, - 'lte': _filtervalue, - }}} - else: - raise ValueError(f'invalid date filter operator (got {search_filter.operator})') - - def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): - if not sort_list: - return None - return [ - {'nested_date.date_value': { - 'order': ('desc' if _sortparam.descending else 'asc'), - 'nested': { - 'path': 'nested_date', - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - _sortparam.propertypath, - suffuniq=True, - ), - }}, - }, - }} - for _sortparam in sort_list - ] - - def _cardsearch_handle( - self, - cardsearch_params: CardsearchParams, - es8_response: dict, - cursor: OffsetCursor, - ) -> CardsearchHandle: - _es8_total = es8_response['hits']['total'] - if _es8_total['relation'] != 'eq': - cursor.total_count = MANY_MORE - elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): - # account for the filtered-out first page - cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) - else: # exact (and small) count - cursor.total_count = _es8_total['value'] - _results = [] - for _es8_hit in es8_response['hits']['hits']: - _card_iri = _es8_hit['_id'] - _results.append(CardsearchResult( - card_iri=_card_iri, - text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), - )) - _relatedproperty_list: list[PropertypathUsage] = [] - if cardsearch_params.related_property_paths: - _relatedproperty_list.extend( - PropertypathUsage(property_path=_path, usage_count=0) - for _path in cardsearch_params.related_property_paths - ) - _relatedproperty_by_path = { - _result.property_path: _result - for _result in _relatedproperty_list - } - for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: - _path = tuple(json.loads(_bucket['key'])) - _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] - return CardsearchHandle( - cursor=cursor, - search_result_page=_results, - related_propertypath_results=_relatedproperty_list, - search_params=cardsearch_params, - ) - - def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: - for _innerhit_group in es8_hit.get('inner_hits', {}).values(): - for _innerhit in _innerhit_group['hits']['hits']: - _property_path = tuple( - json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), - ) - try: - _language_iris = _innerhit['fields']['nested_text.language_iri'] - except KeyError: - _language_iris = () - for _highlight in _innerhit['highlight']['nested_text.text_value']: - yield TextMatchEvidence( - property_path=_property_path, - matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), - card_iri=_innerhit['_id'], - ) - - class _SimpleTextQueryBuilder: - def __init__( - self, text_field, *, - relevance_matters=False, - ): - self._text_field = text_field - self._relevance_matters = relevance_matters - - def textsegment_boolparts(self, textsegment: SearchText) -> dict[str, list]: - if not self._relevance_matters: - return {'must': [self.fuzzy_text_must_query(textsegment.text)]} - return { - 'must': [self.fuzzy_text_must_query(textsegment.text)], - 'should': [self.fuzzy_text_should_query(textsegment.text)], - } - - def exact_text_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match_phrase': { - self._text_field: {'query': text}, - }} - - def fuzzy_text_must_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match': { - self._text_field: { - 'query': text, - 'fuzziness': 'AUTO', - # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} - - def fuzzy_text_should_query(self, text: str): - return {'match_phrase': { - self._text_field: { - 'query': text, - 'slop': len(text.split()), - }, - }} - - class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): - def __init__(self, **kwargs): - super().__init__('nested_text.text_value', **kwargs) - - def textsegment_boolparts(self, textsegment: SearchText) -> dict[str, list]: - return { - _boolkey: [ - self._make_nested_query(textsegment, _query) - for _query in _queries - ] - for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() - } - - def _make_nested_query(self, textsegment, query): - _nested_q = {'nested': { - 'path': 'nested_text', - 'query': {'bool': { - 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), - 'must': query, - }}, - }} - if self._relevance_matters: - _nested_q['nested']['inner_hits'] = self._inner_hits() - return _nested_q - - def _inner_hits(self, *, highlight_query=None) -> dict: - _highlight = { - 'type': 'unified', - 'fields': {'nested_text.text_value': {}}, - } - if highlight_query is not None: - _highlight['highlight_query'] = highlight_query - return { - 'name': str(uuid.uuid4()), # avoid inner-hit name collisions - 'highlight': _highlight, - '_source': False, # _source is expensive for nested docs - 'docvalue_fields': [ - 'nested_text.path_from_focus', - 'nested_text.language_iri', - ], - } - - -### -# module-local utils - -def _should_skip_card(indexcard_rdf, rdfdoc): - # skip cards without some value for name/title/label - return not any(rdfdoc.q(indexcard_rdf.focus_iri, osfmap.NAMELIKE_PROPERTIES)) - - -def _bucketlist(agg_result: dict) -> list[str]: - return [ - _bucket['key'] - for _bucket in agg_result['buckets'] - ] - - -def _daterange_value_and_format(datevalue: str): - _cleanvalue = datevalue.strip() - if re.fullmatch(r'\d{4,}', _cleanvalue): - return f'{_cleanvalue}||/y' - if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/M' - if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/d' - raise ValueError(f'bad date value "{datevalue}"') - - -def _iri_path_as_indexable_fields(path: tuple[str, ...]): - assert path, 'path should not be empty' - return { - 'path_from_focus': iri_path_as_keyword(path), - 'suffuniq_path_from_focus': iri_path_as_keyword(path, suffuniq=True), - 'property_iri': path[-1], - 'distance_from_focus': len(path), - } - - -def _iri_path_as_flattened_key(path: tuple[str, ...]) -> str: - return base64.b16encode(json.dumps(path).encode()).decode() - - -def _iri_path_as_flattened_field(path: tuple[str, ...]) -> str: - return f'flat_iri_values_suffuniq.{_iri_path_as_flattened_key(path)}' - - -def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], nested_path: str): - _suffuniq_iri_paths = [] - _glob_path_lengths = [] - for _path in propertypath_set: - if all(_pathstep == GLOB_PATHSTEP for _pathstep in _path): - _glob_path_lengths.append(len(_path)) - else: - _suffuniq_iri_paths.append(iri_path_as_keyword(_path, suffuniq=True)) - if _suffuniq_iri_paths and _glob_path_lengths: - return {'bool': { - 'minimum_should_match': 1, - 'should': [ - {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}}, - {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}}, - ], - }} - if _glob_path_lengths: - return {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}} - return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} - - -@dataclasses.dataclass(frozen=True) -class _NestedIriKey: - '''if this is the same for multiple iri values, they can be combined in one `nested_iri` doc - ''' - path: tuple[str, ...] - type_iris: frozenset[str] - label_text: frozenset[str] - title_text: frozenset[str] - name_text: frozenset[str] - - @classmethod - def for_iri_at_path(cls, path: tuple[str, ...], iri: str, rdfdoc): - return cls( - path=path, - type_iris=frozenset(rdfdoc.q(iri, RDF.type)), - # TODO: don't discard language for name/title/label - name_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.NAME_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - title_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.TITLE_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - label_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.LABEL_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - ) - - def as_indexable_fields(self): - # matches fields in the mapping for `nested_iri`, above - return { - **_iri_path_as_indexable_fields(self.path), - 'value_type_iri': list(self.type_iris), - 'value_label_text': list(self.label_text), - 'value_title_text': list(self.title_text), - 'value_name_text': list(self.name_text), - } diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py deleted file mode 100644 index 56d3bd433..000000000 --- a/tests/share/search/index_strategy/test_trove_indexcard_flats.py +++ /dev/null @@ -1,24 +0,0 @@ -import pytest - -from share.search.index_strategy.trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy - -from . import _common_trovesearch_tests - - -@pytest.mark.skip -class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests): - # for RealElasticTestCase - def get_index_strategy(self): - return TroveIndexcardFlatsIndexStrategy('test_trove_indexcard_flats') - - def cardsearch_integer_cases(self): - yield from () # integers not indexed by this strategy - - def cardsearch_trailingslash_cases(self): - yield from () # trailing-slash handling improved in trovesearch_denorm - - def valuesearch_sameas_cases(self): - yield from () # sameas handling improved in trovesearch_denorm - - def valuesearch_trailingslash_cases(self): - yield from () # trailing-slash handling improved in trovesearch_denorm From d41a15ab2aa9dad313cd4ca83b7f66612e966077 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 29 Apr 2025 15:24:32 +0300 Subject: [PATCH 36/43] add query syntax with `simple_query_string --- share/search/index_strategy/trovesearch_denorm.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 1d13bc33e..59ea81656 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -712,11 +712,13 @@ def _text_field_name(self, propertypath: Propertypath): ) def _exact_text_query(self, textsegment: SearchText) -> dict: - # TODO: textsegment.is_openended (prefix query) - return _any_query([ - {'match_phrase': {self._text_field_name(_path): {'query': textsegment.text}}} - for _path in textsegment.propertypath_set - ]) + return { + "simple_query_string": { + "query": textsegment.text, + "fields": [self._text_field_name(_path) for _path in textsegment.propertypath_set], + "default_operator": "AND" + } + } @dataclasses.dataclass From 2d3d565a316459ea1c9763a3f7273e3a76327a08 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 29 Apr 2025 16:41:04 +0300 Subject: [PATCH 37/43] simplified --- .../index_strategy/trovesearch_denorm.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 59ea81656..1e38e85c7 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -654,7 +654,16 @@ def iri_boolparts(self) -> Iterator[tuple[str, dict]]: def text_boolparts(self) -> Iterator[tuple[str, dict]]: # text-based queries for _text in self.searchtext: - yield 'must', self._exact_text_query(_text) + yield ( + 'must', + { + "simple_query_string": { + "query": _text.text, + "fields": [self._text_field_name(_path) for _path in _text.propertypath_set], + "default_operator": "AND" + } + } + ) def _presence_query(self, search_filter) -> dict: return _any_query([ @@ -711,16 +720,6 @@ def _text_field_name(self, propertypath: Propertypath): else f'{self.base_field}.text_by_propertypath.{_path_field_name(propertypath)}' ) - def _exact_text_query(self, textsegment: SearchText) -> dict: - return { - "simple_query_string": { - "query": textsegment.text, - "fields": [self._text_field_name(_path) for _path in textsegment.propertypath_set], - "default_operator": "AND" - } - } - - @dataclasses.dataclass class _CardsearchQueryBuilder: params: CardsearchParams From 78930ded288aaeb824aa440345e35a89114bc24d Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 29 Apr 2025 16:45:51 +0300 Subject: [PATCH 38/43] fix flake8 --- share/search/index_strategy/trovesearch_denorm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 1e38e85c7..a65eb776f 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -720,6 +720,7 @@ def _text_field_name(self, propertypath: Propertypath): else f'{self.base_field}.text_by_propertypath.{_path_field_name(propertypath)}' ) + @dataclasses.dataclass class _CardsearchQueryBuilder: params: CardsearchParams From 020331ec113ad5cfe45904ed7de589fd217c7d78 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 2 May 2025 10:42:12 -0400 Subject: [PATCH 39/43] update cardSearchText/valueSearchText descriptions --- trove/vocab/trove.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index bb9fe879e..4e40a3bb7 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -519,17 +519,23 @@ def trove_browse_link(iri: str): RDFS.comment: {literal('free-text search query', language='en')}, TROVE.jsonSchema: {literal_json({'type': 'string'})}, DCTERMS.description: {_literal_markdown('''**cardSearchText** is -a query parameter for free-text search, e.g. `cardSearchText=foo` +a query parameter for free-text search within an index-card. + +accepts comma-separated property-paths in an optional bracketed parameter, +e.g. `cardSearchText[title,description]=foo` +(without brackets equivalent to `cardSearchText[*]`, matching any property-path of length one from the index-card focus). + +different index-strategies may parse and process search text differently +-- the current default index-strategy supports these special characters: +* `+` signifies AND operation (default) +* `|` signifies OR operation +* `-` negates a single token +* `"` wraps a number of tokens to signify a phrase for searching +* `*` at the end of a term signifies a prefix query +* `(` and `)` signify precedence +* `~N` (where N is an integer) after a word signifies edit distance (fuzziness) +* `~N` (where N is an integer) after a phrase signifies slop amount -special characters in search text: - -* `"` (double quotes): use on both sides of a word or phrase to require exact text match - -- without quotes, text search is fuzzier and more approximate -* `-` (hyphen): use before a word or quoted phrase (before the leading `"`) to require - that the exact word or phrase be absent - -accepts comma-separated property-paths in an optional bracketed parameter (default -`*]`, matches any one property), e.g. `cardSearchText[title,description]=foo` ''', language='en')}, }, TROVE.cardSearchFilter: { @@ -589,10 +595,10 @@ def trove_browse_link(iri: str): RDFS.comment: {literal('free-text search (within a title, name, or label associated with an IRI)', language='en')}, TROVE.jsonSchema: {literal_json({'type': 'string'})}, DCTERMS.description: {_literal_markdown('''**valueSearchText** is -a query parameter that matches text closely associated with each value -(specifically `dcterms:title`, `foaf:name`, and `rdfs:label`) +a query parameter to narrow an index-value-search by free-text search. -note: does not accept any bracketed parameters +behaves like `cardSearchText` except that paths are interpreted relative to +(non-focus) IRI values within each index-card. ''', language='en')}, }, TROVE.indexCardId: { From 4a284cab2ef54ac179b3975125436b6778ab91fd Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 2 May 2025 10:58:02 -0400 Subject: [PATCH 40/43] consistent IRI/iri casing at /trove/docs --- trove/vocab/trove.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index 4e40a3bb7..ddd7997e8 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -81,12 +81,12 @@ def trove_browse_link(iri: str): DCTERMS.description: {_literal_markdown(f'''an **index-card** is a metadata record about a specific thing. -that thing is called the "focus" of the index-card and is identified by a "focus iri" +that thing is called the "focus" of the index-card and is identified by a "focus IRI" -- any thing may be identified by multiple iris, but choose one within an index-card (and perhaps include the others with `owl:sameAs`) the metadata about the thing is a quoted [rdf graph](https://www.w3.org/TR/rdf11-concepts/#data-model) -in which every triple is reachable from the card's focus iri +in which every triple is reachable from the card's focus IRI following predicates as directed edges from subject to object. there is not (yet) any size limit for an index-card's metadata, @@ -100,12 +100,12 @@ def trove_browse_link(iri: str): when represented as `application/vnd.api+json` (jsonapi), the `resourceMetadata` attribute contains a json object that has: -* `@id` with the focus iri +* `@id` with the focus IRI * `@type` with the focus resource's `rdf:type` -* property keys from [OSFMAP]({osfmap.OSFMAP_LINK}) shorthand (each corresponding to an iri) +* property keys from [OSFMAP]({osfmap.OSFMAP_LINK}) shorthand (each corresponding to an IRI) * property values as lists of objects: * literal text as `{{"@value": "..."}}` - * iri references as `{{"@id": "..."}}` + * IRI references as `{{"@id": "..."}}` ''', language='en')}, }, @@ -194,7 +194,7 @@ def trove_browse_link(iri: str): RDFS.label: {literal('card-search-with-star-path', language='en')}, RDFS.comment: {literal('card-search with star path', language='en')}, DCTERMS.description: {_literal_markdown(''' -searches index-cards with a specific iri value at any property +searches index-cards with a specific IRI value at any property uses query parameter: @@ -233,16 +233,16 @@ def trove_browse_link(iri: str): # TROVE.include, }, RDFS.label: {literal('index-value-search', language='en')}, - RDFS.comment: {literal('search for iri values based on how they are used', language='en')}, + RDFS.comment: {literal('search for IRI values based on how they are used', language='en')}, DCTERMS.description: {_literal_markdown('''**index-value-search** is -a way to find iri values that could be used in a cardSearchFilter +a way to find IRI values that could be used in a cardSearchFilter ''', language='en')}, TROVE.example: { blanknode({ RDFS.label: {literal('value-search without card-search', language='en')}, RDFS.comment: {literal('value-search without card-search', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values for the property `creator` (aka `dcterms:creator`, +search for IRI values for the property `creator` (aka `dcterms:creator`, ``) uses query parameters: @@ -255,7 +255,7 @@ def trove_browse_link(iri: str): RDFS.label: {literal('value-search with card-search', language='en')}, RDFS.comment: {literal('value-search with card-search', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values for the property `creator` within the context of a card-search +search for IRI values for the property `creator` within the context of a card-search uses query parameter: @@ -266,10 +266,10 @@ def trove_browse_link(iri: str): RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&cardSearchText=sciency&cardSearchFilter[subject][is-present]&acceptMediatype=application/vnd.api%2Bjson')}, }), blanknode({ - RDFS.label: {literal('value-search specific iri', language='en')}, - RDFS.comment: {literal('value-search specific iri', language='en')}, + RDFS.label: {literal('value-search specific IRI', language='en')}, + RDFS.comment: {literal('value-search specific IRI', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for a specific iri value in the property `creator` +search for a specific IRI value in the property `creator` uses query parameters: @@ -282,7 +282,7 @@ def trove_browse_link(iri: str): RDFS.label: {literal('value-search by value type', language='en')}, RDFS.comment: {literal('value-search by value type', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values that are used as `creator` and have `rdf:type` `Person` (aka `foaf:Person`) +search for IRI values that are used as `creator` and have `rdf:type` `Person` (aka `foaf:Person`) uses query parameters: @@ -295,7 +295,7 @@ def trove_browse_link(iri: str): RDFS.label: {literal('value-search with text', language='en')}, RDFS.comment: {literal('value-search with text', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values used as `license` that have "cc" in their label +search for IRI values used as `license` that have "cc" in their label (`rdfs:label`, `dcterms:title`, or `foaf:name`) uses query parameters: @@ -553,11 +553,11 @@ def trove_browse_link(iri: str): * `propertypath_set`: comma-separated **property-path** set * `filter_operator`: any one of the operators defined below -* `value_iris`: comma-separated iri set +* `value_iris`: comma-separated IRI set ### filter operators -operators on iri values: +operators on IRI values: * `any-of` (default): at least one of the value iris * `none-of`: none of the value iris @@ -622,7 +622,7 @@ def trove_browse_link(iri: str): it may be used only two ways: -* `valueSearchFilter[sameAs]=` to request a specific value by IRI +* `valueSearchFilter[sameAs]=` to request a specific value by IRI * `valueSearchFilter[resourceType]=` to request values used with `rdf:type ` ''', language='en')}, }, From c327604af57da28489849046b2495dd765b28a4e Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 2 May 2025 10:59:32 -0400 Subject: [PATCH 41/43] less misleading property-path docs/examples --- trove/vocab/trove.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index ddd7997e8..a4485ab7e 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -723,15 +723,15 @@ def trove_browse_link(iri: str): most places that allow one property-path also accept a comma-separated set of paths, like `title,description` (which is parsed as two paths: `title` and `description`) -or `creator.name,affiliation.name,funder.name` (which is parsed as three paths: `creator.name`, -`affiliation.name`, and `funder.name`) +or `affiliation,creator.affiliation,funder` (which is parsed as three paths: `affiliation`, +`creator.affiliation`, and `funder`) the special path segment `*` matches any property -* `*`: match text values one step away from the focus -* `*.*`: match text values exactly two steps away -* `*,*.*`: match text values one OR two steps away -* `*,creator.name`: match text values one step away OR at the specific path `creator.name` +* `*`: match values one step away from the focus +* `*.*`: match values exactly two steps away +* `*,*.*`: match values one OR two steps away +* `*,creator`: match values one step away OR at the specific path `creator` (currently, if a path contains `*`, then every step must be `*` -- mixed paths like `*.affiliation` are not supported) From f5220ff33557c89c8784421f66e74dc005fc09c9 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 2 May 2025 10:41:41 -0400 Subject: [PATCH 42/43] add isContainedBy.creator.name to indexed osfmap text fields --- trove/vocab/osfmap.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py index 7af7b283a..5d4eadce6 100644 --- a/trove/vocab/osfmap.py +++ b/trove/vocab/osfmap.py @@ -930,7 +930,9 @@ NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES) SKIPPABLE_PROPERTIES = (OSFMAP.contains, OWL.sameAs) EXTRA_INDEXED_LITERAL_PATHS = frozenset(( + # indirect text-search paths used by osf-search (DCTERMS.creator, FOAF.name), + (OSFMAP.isContainedBy, DCTERMS.creator, FOAF.name), )) DEFAULT_TABULAR_SEARCH_COLUMN_PATHS: tuple[tuple[str, ...], ...] = ( From 067d39ff66c4d8c8efecada36296818d8498aee5 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 5 May 2025 16:45:23 -0400 Subject: [PATCH 43/43] cardSearchText docs: note backslash --- trove/vocab/trove.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index a4485ab7e..166a5a24e 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -526,7 +526,7 @@ def trove_browse_link(iri: str): (without brackets equivalent to `cardSearchText[*]`, matching any property-path of length one from the index-card focus). different index-strategies may parse and process search text differently --- the current default index-strategy supports these special characters: +-- the current default index-strategy supports these special characters (to use them literally, precede with backslash (`\\`)) * `+` signifies AND operation (default) * `|` signifies OR operation * `-` negates a single token @@ -536,6 +536,7 @@ def trove_browse_link(iri: str): * `~N` (where N is an integer) after a word signifies edit distance (fuzziness) * `~N` (where N is an integer) after a phrase signifies slop amount + ''', language='en')}, }, TROVE.cardSearchFilter: {