diff --git a/project/settings.py b/project/settings.py index adb6ec1a6..e6db19bd5 100644 --- a/project/settings.py +++ b/project/settings.py @@ -16,15 +16,7 @@ import jwe from share import __version__ - - -def strtobool(s: str) -> bool: - s = s.lower() - if s in ('t', 'true', '1'): - return True - if s in ('f', 'false', '0'): - return False - raise ValueError(f'unboolable string: "{s}"') +from trove.util.queryparams import parse_booly_str def split(string, delim): @@ -463,7 +455,7 @@ def route_urgent_task(name, args, kwargs, options, task=None, **kw): SUBJECTS_CENTRAL_TAXONOMY = os.environ.get('SUBJECTS_CENTRAL_TAXONOMY', 'bepress') -HIDE_DEPRECATED_VIEWS = strtobool(os.environ.get('HIDE_DEPRECATED_VIEWS', 'False')) +HIDE_DEPRECATED_VIEWS = parse_booly_str(os.environ.get('HIDE_DEPRECATED_VIEWS', 'False')) # Regulator pipeline, names of setuptools entry points SHARE_REGULATOR_CONFIG = { diff --git a/project/urls.py b/project/urls.py index 3a4a48aa0..e69fc52b1 100644 --- a/project/urls.py +++ b/project/urls.py @@ -12,13 +12,14 @@ from share.admin import admin_site from share.oaipmh.views import OAIPMHView from trove.views.vocab import TroveVocabView +from trove.views.shtrove_root import ShtroveRootView urlpatterns = [ url(r'^admin/', admin_site.urls), # url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')), - path('api/v3/', include('trove.urls', namespace='trove')), # same as 'trove/' but more subtle - path('trove/', include('trove.urls', namespace='trovetrove')), + path('api/v3/', include('trove.urls', namespace='apiv3')), # same as 'trove/' but more subtle + path('trove/', include('trove.urls', namespace='trove')), path('vocab/2023/trove/', view=TroveVocabView.as_view(), name='trove-vocab'), url(r'^api/v2/', include('api.urls', namespace='api')), url(r'^api/(?P(?!v\d+).*)', APIVersionRedirectView.as_view()), @@ -32,6 +33,7 @@ permanent=False ), name='favicon'), url(r'^icons/(?P[^/]+).ico$', source_icon_view, name='source_icon'), + path('', 
ShtroveRootView.as_view()), ] if settings.DEBUG: diff --git a/share/admin/__init__.py b/share/admin/__init__.py index 1f21210e4..7d1328756 100644 --- a/share/admin/__init__.py +++ b/share/admin/__init__.py @@ -213,6 +213,7 @@ class FeatureFlagAdmin(admin.ModelAdmin): readonly_fields = ('name',) search_fields = ('name',) list_display = ('name', 'is_up', 'is_defined') + list_editable = ('is_up',) admin_site.register(AccessToken, AccessTokenAdmin) diff --git a/share/models/feature_flag.py b/share/models/feature_flag.py index a1ea95022..b1abbe090 100644 --- a/share/models/feature_flag.py +++ b/share/models/feature_flag.py @@ -30,7 +30,6 @@ class FeatureFlag(models.Model): ELASTIC_EIGHT_DEFAULT = 'elastic_eight_default' SUGGEST_CREATOR_FACET = 'suggest_creator_facet' FORBID_UNTRUSTED_FEED = 'forbid_untrusted_feed' - TROVESEARCH_DENORMILY = 'trovesearch_denormily' PREPRINT_AFFILIATIONS = 'preprint_affiliations' # name _should_ be one of the constants above, but that is not enforced by `choices` diff --git a/share/search/index_strategy/__init__.py b/share/search/index_strategy/__init__.py index c00d2fbf1..943e67f30 100644 --- a/share/search/index_strategy/__init__.py +++ b/share/search/index_strategy/__init__.py @@ -9,7 +9,6 @@ from trove.trovesearch import search_params from .sharev2_elastic5 import Sharev2Elastic5IndexStrategy from .sharev2_elastic8 import Sharev2Elastic8IndexStrategy -from .trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy from .trovesearch_denorm import TrovesearchDenormIndexStrategy from ._base import IndexStrategy from ._indexnames import parse_indexname_parts @@ -38,7 +37,6 @@ class _AvailableStrategies(enum.Enum): if settings.ELASTICSEARCH8_URL: sharev2_elastic8 = Sharev2Elastic8IndexStrategy('sharev2_elastic8') - trove_indexcard_flats = TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats') trovesearch_denorm = TrovesearchDenormIndexStrategy('trovesearch_denorm') @@ -96,13 +94,8 @@ def 
get_strategy_for_sharev2_search(requested_name: str | None = None) -> IndexS def get_strategy_for_trovesearch(params: search_params.CardsearchParams) -> IndexStrategy: if params.index_strategy_name: # specific strategy requested _strategy = parse_strategy_name(params.index_strategy_name, for_search=True) - else: - _strategy_name = ( - _AvailableStrategies.trovesearch_denorm.name - if FeatureFlag.objects.flag_is_up(FeatureFlag.TROVESEARCH_DENORMILY) - else _AvailableStrategies.trove_indexcard_flats.name - ) - _strategy = get_strategy(_strategy_name, for_search=True) + else: # hard-coded default (...for now) + _strategy = get_strategy(_AvailableStrategies.trovesearch_denorm.name, for_search=True) return _strategy diff --git a/share/search/index_strategy/trove_indexcard_flats.py b/share/search/index_strategy/trove_indexcard_flats.py deleted file mode 100644 index 49874d189..000000000 --- a/share/search/index_strategy/trove_indexcard_flats.py +++ /dev/null @@ -1,953 +0,0 @@ -import base64 -from collections import defaultdict -import dataclasses -import datetime -import json -import logging -import re -import uuid -from typing import Iterable, Iterator, Any - -from django.conf import settings -import elasticsearch8 -from primitive_metadata import primitive_rdf - -from share.search import exceptions -from share.search import messages -from share.search.index_strategy._base import IndexStrategy -from share.search.index_strategy.elastic8 import Elastic8IndexStrategy -from share.util.checksum_iri import ChecksumIri -from trove import models as trove_db -from trove.trovesearch.page_cursor import ( - MANY_MORE, - OffsetCursor, - PageCursor, - ReproduciblyRandomSampleCursor, -) -from trove.trovesearch.search_params import ( - CardsearchParams, - ValuesearchParams, - SearchFilter, - Textsegment, - SortParam, - GLOB_PATHSTEP, -) -from trove.trovesearch.search_handle import ( - CardsearchHandle, - ValuesearchHandle, - TextMatchEvidence, - CardsearchResult, - ValuesearchResult, - 
PropertypathUsage, -) -from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword -from trove.vocab import osfmap -from trove.vocab.namespaces import RDF, OWL -from ._trovesearch_util import ( - latest_rdf_for_indexcard_pks, - GraphWalk, - KEYWORD_LENGTH_MAX, -) - - -logger = logging.getLogger(__name__) - - -class TroveIndexcardFlatsIndexStrategy(Elastic8IndexStrategy): - CURRENT_STRATEGY_CHECKSUM = ChecksumIri( - checksumalgorithm_name='sha-256', - salt='TroveIndexcardFlatsIndexStrategy', - hexdigest='bdec536873e1ed0c58facaa5d1145bef73bba09d671deef48e45c019def5c5a5', - ) - - # abstract method from IndexStrategy - @property - def supported_message_types(self): - return { - messages.MessageType.UPDATE_INDEXCARD, - messages.MessageType.BACKFILL_INDEXCARD, - } - - # abstract method from IndexStrategy - @property - def backfill_message_type(self): - return messages.MessageType.BACKFILL_INDEXCARD - - @classmethod - def define_current_indexes(cls): - return { # empty index subname, for backcompat - '': cls.IndexDefinition( - mappings=cls.index_mappings(), - settings=cls.index_settings(), - ), - } - - @classmethod - def index_settings(cls): - return {} - - @classmethod - def index_mappings(cls): - _capped_keyword = { - 'type': 'keyword', - 'ignore_above': KEYWORD_LENGTH_MAX, - } - _common_nested_keywords = { - 'path_from_focus': _capped_keyword, - 'suffuniq_path_from_focus': _capped_keyword, - 'property_iri': _capped_keyword, - 'distance_from_focus': {'type': 'keyword'}, # numeric value as keyword (used for 'term' filter) - } - return { - 'dynamic': 'false', - 'properties': { - 'indexcard_uuid': _capped_keyword, - 'focus_iri': _capped_keyword, - 'suffuniq_focus_iri': _capped_keyword, - 'source_record_identifier': _capped_keyword, - 'source_config_label': _capped_keyword, - 'flat_iri_values': { - 'type': 'flattened', - 'ignore_above': KEYWORD_LENGTH_MAX, - }, - 'flat_iri_values_suffuniq': { - 'type': 'flattened', - 'ignore_above': 
KEYWORD_LENGTH_MAX, - }, - 'iri_paths_present': _capped_keyword, - 'iri_paths_present_suffuniq': _capped_keyword, - 'nested_iri': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'iri_value': _capped_keyword, - 'suffuniq_iri_value': _capped_keyword, - 'value_type_iri': _capped_keyword, - 'value_name_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_title_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_label_text': { - 'type': 'text', - 'fields': {'raw': _capped_keyword}, - 'copy_to': 'nested_iri.value_namelike_text', - }, - 'value_namelike_text': {'type': 'text'}, - }, - }, - 'nested_date': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'date_value': { - 'type': 'date', - 'format': 'strict_date_optional_time', - }, - }, - }, - 'nested_text': { - 'type': 'nested', - 'properties': { - **_common_nested_keywords, - 'language_iri': _capped_keyword, - 'text_value': { - 'type': 'text', - 'index_options': 'offsets', # for faster highlighting - 'store': True, # avoid loading _source to render highlights - 'fields': {'raw': _capped_keyword}, - }, - }, - }, - }, - } - - @property - def __index(self) -> IndexStrategy.SpecificIndex: - # this is a single-index strategy -- for back-compat, that index has empty subname - return self.get_index('') - - def _build_sourcedoc(self, indexcard_rdf): - _rdfdoc = indexcard_rdf.as_rdfdoc_with_supplements() - if _should_skip_card(indexcard_rdf, _rdfdoc): - return None # will be deleted from the index - _nested_iris = defaultdict(set) - _nested_dates = defaultdict(set) - _nested_texts = defaultdict(set) - _walk = GraphWalk(_rdfdoc, indexcard_rdf.focus_iri) - for _walk_path, _walk_iris in _walk.iri_values.items(): - for _iri_obj in _walk_iris: - _nested_iris[_NestedIriKey.for_iri_at_path(_walk_path, _iri_obj, _rdfdoc)].add(_iri_obj) - for 
_walk_path, _walk_dates in _walk.date_values.items(): - for _date_obj in _walk_dates: - _nested_dates[_walk_path].add(datetime.date.isoformat(_date_obj)) - for _walk_path, _walk_texts in _walk.text_values.items(): - for _text_obj in _walk_texts: - _nested_texts[(_walk_path, tuple(_text_obj.datatype_iris))].add(_text_obj.unicode_value) - _focus_iris = {indexcard_rdf.focus_iri} - _suffuniq_focus_iris = {get_sufficiently_unique_iri(indexcard_rdf.focus_iri)} - for _identifier in indexcard_rdf.indexcard.focus_identifier_set.all(): - _focus_iris.update(_identifier.raw_iri_list) - _suffuniq_focus_iris.add(_identifier.sufficiently_unique_iri) - return { - 'indexcard_uuid': str(indexcard_rdf.indexcard.uuid), - 'focus_iri': list(_focus_iris), - 'suffuniq_focus_iri': list(_suffuniq_focus_iris), - 'source_record_identifier': indexcard_rdf.indexcard.source_record_suid.identifier, - 'source_config_label': indexcard_rdf.indexcard.source_record_suid.source_config.label, - 'flat_iri_values': self._flattened_iris(_nested_iris), - 'flat_iri_values_suffuniq': self._flattened_iris_suffuniq(_nested_iris), - 'iri_paths_present': [ - iri_path_as_keyword(_path) - for _path in _walk.paths_walked - ], - 'iri_paths_present_suffuniq': [ - iri_path_as_keyword(_path, suffuniq=True) - for _path in _walk.paths_walked - ], - 'nested_iri': list(filter(bool, ( - self._iri_nested_sourcedoc(_nested_iri_key, _iris, _rdfdoc) - for _nested_iri_key, _iris in _nested_iris.items() - ))), - 'nested_date': [ - { - **_iri_path_as_indexable_fields(_path), - 'date_value': list(_value_set), - } - for _path, _value_set in _nested_dates.items() - ], - 'nested_text': [ - { - **_iri_path_as_indexable_fields(_path), - 'language_iri': _language_iris, - 'text_value': list(_value_set), - } - for (_path, _language_iris), _value_set in _nested_texts.items() - ], - } - - def _iri_nested_sourcedoc(self, iri_key: '_NestedIriKey', iris, rdfdoc): - _iris_with_synonyms = set(filter(is_worthwhile_iri, iris)) - for _iri in iris: - 
_iris_with_synonyms.update( - filter(is_worthwhile_iri, rdfdoc.q(_iri, OWL.sameAs)), - ) - if not _iris_with_synonyms: - return None - _sourcedoc = { - **iri_key.as_indexable_fields(), - 'iri_value': list(_iris_with_synonyms), - 'suffuniq_iri_value': [ - get_sufficiently_unique_iri(_iri) - for _iri in _iris_with_synonyms - ], - } - return _sourcedoc - - def _flattened_iris_by_path(self, nested_iris: dict['_NestedIriKey', set[str]]): - _by_path = defaultdict(set) - for _iri_key, _iris in nested_iris.items(): - _by_path[_iri_key.path].update(_iris) - return _by_path - - def _flattened_iris(self, nested_iris: dict['_NestedIriKey', set[str]]): - return { - _iri_path_as_flattened_key(_path): list(_iris) - for _path, _iris in self._flattened_iris_by_path(nested_iris).items() - } - - def _flattened_iris_suffuniq(self, nested_iris: dict['_NestedIriKey', set[str]]): - return { - _iri_path_as_flattened_key(_path): [ - get_sufficiently_unique_iri(_iri) - for _iri in _iris - ] - for _path, _iris in self._flattened_iris_by_path(nested_iris).items() - } - - def build_elastic_actions(self, messages_chunk: messages.MessagesChunk): - def _make_actionset(indexcard_id, *actions): - return self.MessageActionSet(indexcard_id, {'': actions}) - _indexcard_rdf_qs = latest_rdf_for_indexcard_pks(messages_chunk.target_ids_chunk) - _remaining_indexcard_ids = set(messages_chunk.target_ids_chunk) - for _indexcard_rdf in _indexcard_rdf_qs: - _suid = _indexcard_rdf.indexcard.source_record_suid - if _suid.has_forecompat_replacement(): - continue # skip this one, let it get deleted - _sourcedoc = self._build_sourcedoc(_indexcard_rdf) - if _sourcedoc: - _index_action = self.build_index_action( - doc_id=_indexcard_rdf.indexcard.get_iri(), - doc_source=_sourcedoc, - ) - _remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id) - yield _make_actionset(_indexcard_rdf.indexcard_id, _index_action) - # delete any that don't have "latest" rdf and derived osfmap_json - _leftovers = 
trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids) - for _indexcard in _leftovers: - yield _make_actionset(_indexcard.id, self.build_delete_action(_indexcard.get_iri())) - - def pls_handle_search__passthru(self, request_body=None, request_queryparams=None) -> dict: - return self.es8_client.search( - index=self.__index.full_index_name, - body={ - **(request_body or {}), - 'track_total_hits': True, - }, - params=(request_queryparams or {}), - ) - - def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> CardsearchHandle: - _cursor = self._cardsearch_cursor(cardsearch_params) - _sort = self._cardsearch_sort(cardsearch_params.sort_list) - _query = self._cardsearch_query( - cardsearch_params.cardsearch_filter_set, - cardsearch_params.cardsearch_textsegment_set, - cardsearch_cursor=_cursor, - ) - _from_offset = ( - _cursor.start_offset - if _cursor.is_first_page() or not isinstance(_cursor, ReproduciblyRandomSampleCursor) - else _cursor.start_offset - len(_cursor.first_page_ids) - ) - _search_kwargs = dict( - query=_query, - aggs=self._cardsearch_aggs(cardsearch_params), - sort=_sort, - from_=_from_offset, - size=_cursor.bounded_page_size, - source=False, # no need to get _source; _id is enough - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.es8_client.search( - index=self.__index.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._cardsearch_handle(cardsearch_params, _es8_response, _cursor) - - def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchHandle: - _cursor = OffsetCursor.from_cursor(valuesearch_params.page_cursor) - _is_date_search = osfmap.is_date_property(valuesearch_params.valuesearch_propertypath[-1]) - _search_kwargs = dict( - query=self._cardsearch_query( - valuesearch_params.cardsearch_filter_set, - 
valuesearch_params.cardsearch_textsegment_set, - additional_filters=[{'term': {'iri_paths_present': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - )}}], - ), - size=0, # ignore cardsearch hits; just want the aggs - aggs=( - self._valuesearch_date_aggs(valuesearch_params) - if _is_date_search - else self._valuesearch_iri_aggs(valuesearch_params, _cursor) - ), - ) - if settings.DEBUG: - logger.info(json.dumps(_search_kwargs, indent=2)) - try: - _es8_response = self.es8_client.search( - index=self.__index.full_index_name, - **_search_kwargs, - ) - except elasticsearch8.TransportError as error: - raise exceptions.IndexStrategyError() from error # TODO: error messaging - return self._valuesearch_handle(valuesearch_params, _es8_response, _cursor) - - ### - # query implementation - - def _cardsearch_cursor(self, cardsearch_params: CardsearchParams) -> OffsetCursor: - _request_cursor = cardsearch_params.page_cursor - if ( - _request_cursor.is_basic() - and not cardsearch_params.sort_list - and not cardsearch_params.cardsearch_textsegment_set - ): - return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) - return OffsetCursor.from_cursor(_request_cursor) - - def _cardsearch_query( - self, - filter_set, textsegment_set, *, - additional_filters=None, - cardsearch_cursor: PageCursor | None = None, - ) -> dict: - _bool_query = { - 'filter': additional_filters or [], - 'must': [], - 'must_not': [], - 'should': [], - } - for _searchfilter in filter_set: - if _searchfilter.operator == SearchFilter.FilterOperator.NONE_OF: - _bool_query['must_not'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.ANY_OF: - _bool_query['filter'].append(self._cardsearch_iri_filter(_searchfilter)) - elif _searchfilter.operator == SearchFilter.FilterOperator.IS_PRESENT: - _bool_query['filter'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator == 
SearchFilter.FilterOperator.IS_ABSENT: - _bool_query['must_not'].append(self._cardsearch_presence_query(_searchfilter)) - elif _searchfilter.operator.is_date_operator(): - _bool_query['filter'].append(self._cardsearch_date_filter(_searchfilter)) - else: - raise ValueError(f'unknown filter operator {_searchfilter.operator}') - _textq_builder = self._NestedTextQueryBuilder( - relevance_matters=not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor), - ) - for _textsegment in textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _bool_query[_boolkey].extend(_textqueries) - if not isinstance(cardsearch_cursor, ReproduciblyRandomSampleCursor): - # no need for randomness - return {'bool': _bool_query} - if not cardsearch_cursor.first_page_ids: - # independent random sample - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': {}, # default random_score is fast and unpredictable - }, - } - _firstpage_uuid_query = {'terms': {'indexcard_uuid': cardsearch_cursor.first_page_ids}} - if cardsearch_cursor.is_first_page(): - # returning to a first page previously visited - _bool_query['filter'].append(_firstpage_uuid_query) - return {'bool': _bool_query} - # get a subsequent page using reproducible randomness - _bool_query['must_not'].append(_firstpage_uuid_query) - return { - 'function_score': { - 'query': {'bool': _bool_query}, - 'boost_mode': 'replace', - 'random_score': { - 'seed': ''.join(cardsearch_cursor.first_page_ids), - 'field': 'indexcard_uuid', - }, - }, - } - - def _cardsearch_aggs(self, cardsearch_params): - _aggs = {} - if cardsearch_params.related_property_paths: - _aggs['related_propertypath_usage'] = {'terms': { - 'field': 'iri_paths_present', - 'include': [ - iri_path_as_keyword(_path) - for _path in cardsearch_params.related_property_paths - ], - 'size': len(cardsearch_params.related_property_paths), - }} - return _aggs - - def 
_valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: OffsetCursor): - _nested_iri_bool: dict[str, Any] = { - 'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - )}}], - 'must': [], - 'must_not': [], - 'should': [], - } - _nested_terms_agg = { - 'field': 'nested_iri.iri_value', - # WARNING: terribly inefficient pagination (part one) - 'size': cursor.start_offset + cursor.bounded_page_size + 1, - } - _iris = list(valuesearch_params.valuesearch_iris()) - if _iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.iri_value': _iris, - }}) - _nested_terms_agg['size'] = len(_iris) - _nested_terms_agg['include'] = _iris - _type_iris = list(valuesearch_params.valuesearch_type_iris()) - if _type_iris: - _nested_iri_bool['filter'].append({'terms': { - 'nested_iri.value_type_iri': _type_iris, - }}) - _textq_builder = self._SimpleTextQueryBuilder('nested_iri.value_namelike_text') - for _textsegment in valuesearch_params.valuesearch_textsegment_set: - for _boolkey, _textqueries in _textq_builder.textsegment_boolparts(_textsegment).items(): - _nested_iri_bool[_boolkey].extend(_textqueries) - return { - 'in_nested_iri': { - 'nested': {'path': 'nested_iri'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'bool': _nested_iri_bool}, - 'aggs': { - 'iri_values': { - 'terms': _nested_terms_agg, - 'aggs': { - 'type_iri': {'terms': { - 'field': 'nested_iri.value_type_iri', - }}, - 'name_text': {'terms': { - 'field': 'nested_iri.value_name_text.raw', - }}, - 'title_text': {'terms': { - 'field': 'nested_iri.value_title_text.raw', - }}, - 'label_text': {'terms': { - 'field': 'nested_iri.value_label_text.raw', - }}, - }, - }, - }, - }, - }, - }, - } - - def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams): - _aggs = { - 'in_nested_date': { - 'nested': {'path': 'nested_date'}, - 'aggs': { - 'value_at_propertypath': { - 'filter': {'term': { 
- 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - valuesearch_params.valuesearch_propertypath, - suffuniq=True, - ), - }}, - 'aggs': { - 'count_by_year': { - 'date_histogram': { - 'field': 'nested_date.date_value', - 'calendar_interval': 'year', - 'format': 'yyyy', - 'order': {'_key': 'desc'}, - 'min_doc_count': 1, - }, - }, - }, - }, - }, - }, - } - return _aggs - - def _valuesearch_handle( - self, - valuesearch_params: ValuesearchParams, - es8_response: dict, - cursor: OffsetCursor, - ): - _iri_aggs = es8_response['aggregations'].get('in_nested_iri') - if _iri_aggs: - _buckets = _iri_aggs['value_at_propertypath']['iri_values']['buckets'] - _bucket_count = len(_buckets) - # WARNING: terribly inefficient pagination (part two) - _page_end_index = cursor.start_offset + cursor.bounded_page_size - _bucket_page = _buckets[cursor.start_offset:_page_end_index] # discard prior pages - cursor.total_count = ( - MANY_MORE - if (_bucket_count > _page_end_index) # agg includes one more, if there - else _bucket_count - ) - return ValuesearchHandle( - cursor=cursor, - search_result_page=[ - self._valuesearch_iri_result(_iri_bucket) - for _iri_bucket in _bucket_page - ], - search_params=valuesearch_params, - ) - else: # assume date - _year_buckets = ( - es8_response['aggregations']['in_nested_date'] - ['value_at_propertypath']['count_by_year']['buckets'] - ) - return ValuesearchHandle( - cursor=PageCursor(len(_year_buckets)), - search_result_page=[ - self._valuesearch_date_result(_year_bucket) - for _year_bucket in _year_buckets - ], - search_params=valuesearch_params, - ) - - def _valuesearch_iri_result(self, iri_bucket): - return ValuesearchResult( - value_iri=iri_bucket['key'], - value_type=_bucketlist(iri_bucket['type_iri']), - name_text=_bucketlist(iri_bucket['name_text']), - title_text=_bucketlist(iri_bucket['title_text']), - label_text=_bucketlist(iri_bucket['label_text']), - match_count=iri_bucket['doc_count'], - ) - - def _valuesearch_date_result(self, 
date_bucket): - return ValuesearchResult( - value_iri=None, - value_value=date_bucket['key_as_string'], - label_text=(date_bucket['key_as_string'],), - match_count=date_bucket['doc_count'], - ) - - def _cardsearch_presence_query(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_presence_query(_path) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} - - def _cardsearch_path_presence_query(self, path: tuple[str, ...]): - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'term': {'nested_iri.distance_from_focus': len(path)}}, - }} - return {'term': { - 'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True), - }} - - def _cardsearch_iri_filter(self, search_filter) -> dict: - _filters = [ - self._cardsearch_path_iri_query(_path, search_filter.value_set) - for _path in search_filter.propertypath_set - ] - if len(_filters) == 1: - return _filters[0] - return {'bool': { - 'minimum_should_match': 1, - 'should': _filters, - }} - - def _cardsearch_path_iri_query(self, path, value_set): - _suffuniq_values = [ - get_sufficiently_unique_iri(_iri) - for _iri in value_set - ] - if all(_pathstep == GLOB_PATHSTEP for _pathstep in path): - return {'nested': { - 'path': 'nested_iri', - 'query': {'bool': { - 'must': [ # both - {'term': {'nested_iri.distance_from_focus': len(path)}}, - {'terms': {'nested_iri.suffuniq_iri_value': _suffuniq_values}}, - ], - }}, - }} - # without a glob-path, can use the flattened keyword field - return {'terms': {_iri_path_as_flattened_field(path): _suffuniq_values}} - - def _cardsearch_date_filter(self, search_filter): - return {'nested': { - 'path': 'nested_date', - 'query': {'bool': {'filter': list(self._iter_nested_date_filters(search_filter))}}, - }} - - def _iter_nested_date_filters(self, search_filter) -> Iterator[dict]: - # 
filter by requested paths - yield _pathset_as_nestedvalue_filter(search_filter.propertypath_set, 'nested_date') - # filter by requested value/operator - if search_filter.operator == SearchFilter.FilterOperator.BEFORE: - _value = min(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'lt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AFTER: - _value = max(search_filter.value_set) # rely on string-comparable isoformat - yield {'range': {'nested_date.date_value': { - 'gt': _daterange_value_and_format(_value) - }}} - elif search_filter.operator == SearchFilter.FilterOperator.AT_DATE: - for _value in search_filter.value_set: - _filtervalue = _daterange_value_and_format(_value) - yield {'range': {'nested_date.date_value': { - 'gte': _filtervalue, - 'lte': _filtervalue, - }}} - else: - raise ValueError(f'invalid date filter operator (got {search_filter.operator})') - - def _cardsearch_sort(self, sort_list: tuple[SortParam, ...]): - if not sort_list: - return None - return [ - {'nested_date.date_value': { - 'order': ('desc' if _sortparam.descending else 'asc'), - 'nested': { - 'path': 'nested_date', - 'filter': {'term': { - 'nested_date.suffuniq_path_from_focus': iri_path_as_keyword( - _sortparam.propertypath, - suffuniq=True, - ), - }}, - }, - }} - for _sortparam in sort_list - ] - - def _cardsearch_handle( - self, - cardsearch_params: CardsearchParams, - es8_response: dict, - cursor: OffsetCursor, - ) -> CardsearchHandle: - _es8_total = es8_response['hits']['total'] - if _es8_total['relation'] != 'eq': - cursor.total_count = MANY_MORE - elif isinstance(cursor, ReproduciblyRandomSampleCursor) and not cursor.is_first_page(): - # account for the filtered-out first page - cursor.total_count = _es8_total['value'] + len(cursor.first_page_ids) - else: # exact (and small) count - cursor.total_count = _es8_total['value'] - _results = [] - for _es8_hit in 
es8_response['hits']['hits']: - _card_iri = _es8_hit['_id'] - _results.append(CardsearchResult( - card_iri=_card_iri, - text_match_evidence=list(self._gather_textmatch_evidence(_es8_hit)), - )) - _relatedproperty_list: list[PropertypathUsage] = [] - if cardsearch_params.related_property_paths: - _relatedproperty_list.extend( - PropertypathUsage(property_path=_path, usage_count=0) - for _path in cardsearch_params.related_property_paths - ) - _relatedproperty_by_path = { - _result.property_path: _result - for _result in _relatedproperty_list - } - for _bucket in es8_response['aggregations']['related_propertypath_usage']['buckets']: - _path = tuple(json.loads(_bucket['key'])) - _relatedproperty_by_path[_path].usage_count += _bucket['doc_count'] - return CardsearchHandle( - cursor=cursor, - search_result_page=_results, - related_propertypath_results=_relatedproperty_list, - search_params=cardsearch_params, - ) - - def _gather_textmatch_evidence(self, es8_hit) -> Iterable[TextMatchEvidence]: - for _innerhit_group in es8_hit.get('inner_hits', {}).values(): - for _innerhit in _innerhit_group['hits']['hits']: - _property_path = tuple( - json.loads(_innerhit['fields']['nested_text.path_from_focus'][0]), - ) - try: - _language_iris = _innerhit['fields']['nested_text.language_iri'] - except KeyError: - _language_iris = () - for _highlight in _innerhit['highlight']['nested_text.text_value']: - yield TextMatchEvidence( - property_path=_property_path, - matching_highlight=primitive_rdf.literal(_highlight, datatype_iris=_language_iris), - card_iri=_innerhit['_id'], - ) - - class _SimpleTextQueryBuilder: - def __init__( - self, text_field, *, - relevance_matters=False, - ): - self._text_field = text_field - self._relevance_matters = relevance_matters - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - if textsegment.is_negated: - return {'must_not': [self.exact_text_query(textsegment.text)]} - if not textsegment.is_fuzzy: - return {'must': 
[self.exact_text_query(textsegment.text)]} - if not self._relevance_matters: - return {'must': [self.fuzzy_text_must_query(textsegment.text)]} - return { - 'must': [self.fuzzy_text_must_query(textsegment.text)], - 'should': [self.fuzzy_text_should_query(textsegment.text)], - } - - def exact_text_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match_phrase': { - self._text_field: {'query': text}, - }} - - def fuzzy_text_must_query(self, text: str) -> dict: - # TODO: textsegment.is_openended (prefix query) - return {'match': { - self._text_field: { - 'query': text, - 'fuzziness': 'AUTO', - # TODO: 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} - - def fuzzy_text_should_query(self, text: str): - return {'match_phrase': { - self._text_field: { - 'query': text, - 'slop': len(text.split()), - }, - }} - - class _NestedTextQueryBuilder(_SimpleTextQueryBuilder): - def __init__(self, **kwargs): - super().__init__('nested_text.text_value', **kwargs) - - def textsegment_boolparts(self, textsegment: Textsegment) -> dict[str, list]: - return { - _boolkey: [ - self._make_nested_query(textsegment, _query) - for _query in _queries - ] - for _boolkey, _queries in super().textsegment_boolparts(textsegment).items() - } - - def _make_nested_query(self, textsegment, query): - _nested_q = {'nested': { - 'path': 'nested_text', - 'query': {'bool': { - 'filter': _pathset_as_nestedvalue_filter(textsegment.propertypath_set, 'nested_text'), - 'must': query, - }}, - }} - if self._relevance_matters: - _nested_q['nested']['inner_hits'] = self._inner_hits() - return _nested_q - - def _inner_hits(self, *, highlight_query=None) -> dict: - _highlight = { - 'type': 'unified', - 'fields': {'nested_text.text_value': {}}, - } - if highlight_query is not None: - _highlight['highlight_query'] = highlight_query - return { - 'name': str(uuid.uuid4()), # avoid inner-hit name collisions - 'highlight': _highlight, - 
'_source': False, # _source is expensive for nested docs - 'docvalue_fields': [ - 'nested_text.path_from_focus', - 'nested_text.language_iri', - ], - } - - -### -# module-local utils - -def _should_skip_card(indexcard_rdf, rdfdoc): - # skip cards without some value for name/title/label - return not any(rdfdoc.q(indexcard_rdf.focus_iri, osfmap.NAMELIKE_PROPERTIES)) - - -def _bucketlist(agg_result: dict) -> list[str]: - return [ - _bucket['key'] - for _bucket in agg_result['buckets'] - ] - - -def _daterange_value_and_format(datevalue: str): - _cleanvalue = datevalue.strip() - if re.fullmatch(r'\d{4,}', _cleanvalue): - return f'{_cleanvalue}||/y' - if re.fullmatch(r'\d{4,}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/M' - if re.fullmatch(r'\d{4,}-\d{2}-\d{2}', _cleanvalue): - return f'{_cleanvalue}||/d' - raise ValueError(f'bad date value "{datevalue}"') - - -def _iri_path_as_indexable_fields(path: tuple[str, ...]): - assert path, 'path should not be empty' - return { - 'path_from_focus': iri_path_as_keyword(path), - 'suffuniq_path_from_focus': iri_path_as_keyword(path, suffuniq=True), - 'property_iri': path[-1], - 'distance_from_focus': len(path), - } - - -def _iri_path_as_flattened_key(path: tuple[str, ...]) -> str: - return base64.b16encode(json.dumps(path).encode()).decode() - - -def _iri_path_as_flattened_field(path: tuple[str, ...]) -> str: - return f'flat_iri_values_suffuniq.{_iri_path_as_flattened_key(path)}' - - -def _pathset_as_nestedvalue_filter(propertypath_set: frozenset[tuple[str, ...]], nested_path: str): - _suffuniq_iri_paths = [] - _glob_path_lengths = [] - for _path in propertypath_set: - if all(_pathstep == GLOB_PATHSTEP for _pathstep in _path): - _glob_path_lengths.append(len(_path)) - else: - _suffuniq_iri_paths.append(iri_path_as_keyword(_path, suffuniq=True)) - if _suffuniq_iri_paths and _glob_path_lengths: - return {'bool': { - 'minimum_should_match': 1, - 'should': [ - {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}}, 
- {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}}, - ], - }} - if _glob_path_lengths: - return {'terms': {f'{nested_path}.distance_from_focus': _glob_path_lengths}} - return {'terms': {f'{nested_path}.suffuniq_path_from_focus': _suffuniq_iri_paths}} - - -@dataclasses.dataclass(frozen=True) -class _NestedIriKey: - '''if this is the same for multiple iri values, they can be combined in one `nested_iri` doc - ''' - path: tuple[str, ...] - type_iris: frozenset[str] - label_text: frozenset[str] - title_text: frozenset[str] - name_text: frozenset[str] - - @classmethod - def for_iri_at_path(cls, path: tuple[str, ...], iri: str, rdfdoc): - return cls( - path=path, - type_iris=frozenset(rdfdoc.q(iri, RDF.type)), - # TODO: don't discard language for name/title/label - name_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.NAME_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - title_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.TITLE_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - label_text=frozenset( - _text.unicode_value - for _text in rdfdoc.q(iri, osfmap.LABEL_PROPERTIES) - if isinstance(_text, primitive_rdf.Literal) - ), - ) - - def as_indexable_fields(self): - # matches fields in the mapping for `nested_iri`, above - return { - **_iri_path_as_indexable_fields(self.path), - 'value_type_iri': list(self.type_iris), - 'value_label_text': list(self.label_text), - 'value_title_text': list(self.title_text), - 'value_name_text': list(self.name_text), - } diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 2a40d1211..a65eb776f 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -33,7 +33,7 @@ CardsearchParams, Propertypath, SearchFilter, - Textsegment, + SearchText, ValueType, ValuesearchParams, is_globpath, @@ -627,7 +627,7 @@ def 
add_boolparts(self, boolparts: Iterator[tuple[str, dict]]): @dataclasses.dataclass class _QueryHelper: base_field: Literal['card', 'iri_value'] - textsegment_set: frozenset[Textsegment] + searchtext: frozenset[SearchText] filter_set: frozenset[SearchFilter] relevance_matters: bool @@ -653,15 +653,17 @@ def iri_boolparts(self) -> Iterator[tuple[str, dict]]: def text_boolparts(self) -> Iterator[tuple[str, dict]]: # text-based queries - for _textsegment in self.textsegment_set: - if _textsegment.is_negated: - yield 'must_not', self._exact_text_query(_textsegment) - elif not _textsegment.is_fuzzy: - yield 'must', self._exact_text_query(_textsegment) - else: - yield 'must', self._fuzzy_text_must_query(_textsegment) - if self.relevance_matters: - yield 'should', self._fuzzy_text_should_query(_textsegment) + for _text in self.searchtext: + yield ( + 'must', + { + "simple_query_string": { + "query": _text.text, + "fields": [self._text_field_name(_path) for _path in _text.propertypath_set], + "default_operator": "AND" + } + } + ) def _presence_query(self, search_filter) -> dict: return _any_query([ @@ -718,35 +720,6 @@ def _text_field_name(self, propertypath: Propertypath): else f'{self.base_field}.text_by_propertypath.{_path_field_name(propertypath)}' ) - def _exact_text_query(self, textsegment: Textsegment) -> dict: - # TODO: textsegment.is_openended (prefix query) - return _any_query([ - {'match_phrase': {self._text_field_name(_path): {'query': textsegment.text}}} - for _path in textsegment.propertypath_set - ]) - - def _fuzzy_text_must_query(self, textsegment: Textsegment) -> dict: - # TODO: textsegment.is_openended (prefix query) - return _any_query([ - {'match': { - self._text_field_name(_path): { - 'query': textsegment.text, - 'fuzziness': 'AUTO', - # TODO: consider 'operator': 'and' (by query param FilterOperator, `cardSearchText[*][every-word]=...`) - }, - }} - for _path in textsegment.propertypath_set - ]) - - def _fuzzy_text_should_query(self, textsegment: 
Textsegment): - _slop = len(textsegment.text.split()) - return _any_query([ - {'match_phrase': { - self._text_field_name(_path): {'query': textsegment.text, 'slop': _slop}, - }} - for _path in textsegment.propertypath_set - ]) - @dataclasses.dataclass class _CardsearchQueryBuilder: @@ -767,7 +740,7 @@ def response_cursor(self) -> OffsetCursor: if ( _request_cursor.is_basic() and not self.params.sort_list - and not self.params.cardsearch_textsegment_set + and not self.params.cardsearch_searchtext ): return ReproduciblyRandomSampleCursor.from_cursor(_request_cursor) return OffsetCursor.from_cursor(_request_cursor) @@ -785,7 +758,7 @@ def _cardsearch_query(self) -> dict: _bool.add_boolparts( _QueryHelper( base_field='card', - textsegment_set=self.params.cardsearch_textsegment_set, + searchtext=self.params.cardsearch_searchtext, filter_set=self.params.cardsearch_filter_set, relevance_matters=(not self.params.sort_list), ).boolparts(), @@ -869,7 +842,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d _bool.add_boolparts( _QueryHelper( base_field='card', - textsegment_set=params.cardsearch_textsegment_set, + searchtext=params.cardsearch_searchtext, filter_set=params.cardsearch_filter_set, relevance_matters=False, ).boolparts(), @@ -877,7 +850,7 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d _bool.add_boolparts( _QueryHelper( base_field='iri_value', - textsegment_set=params.valuesearch_textsegment_set, + searchtext=params.valuesearch_searchtext, filter_set=params.valuesearch_filter_set, relevance_matters=False, ).boolparts() @@ -906,13 +879,13 @@ def _build_iri_valuesearch(params: ValuesearchParams, cursor: OffsetCursor) -> d def _build_date_valuesearch(params: ValuesearchParams) -> dict: - assert not params.valuesearch_textsegment_set + assert not params.valuesearch_searchtext assert not params.valuesearch_filter_set _bool = _BoolBuilder() _bool.add_boolparts( _QueryHelper( base_field='card', - 
textsegment_set=params.cardsearch_textsegment_set, + searchtext=params.cardsearch_searchtext, filter_set=params.cardsearch_filter_set, relevance_matters=False, ).boolparts(), diff --git a/share/util/checksum_iri.py b/share/util/checksum_iri.py index e204b1126..012fdbab2 100644 --- a/share/util/checksum_iri.py +++ b/share/util/checksum_iri.py @@ -72,7 +72,3 @@ def from_iri(cls, checksum_iri: str): salt=salt, hexdigest=hexdigest, ) - - @classmethod - def from_dataclass_instance(cls, dataclass_instance): - return cls.digest_json(dataclasses.asdict(dataclass_instance)) diff --git a/tests/share/search/__init__.py b/tests/share/search/__init__.py index 871256d44..76b608261 100644 --- a/tests/share/search/__init__.py +++ b/tests/share/search/__init__.py @@ -3,11 +3,10 @@ from typing import Iterable from unittest import mock -from share.search import index_strategy - @contextlib.contextmanager -def patch_index_strategies(strategies: Iterable[index_strategy.IndexStrategy]): +def patch_index_strategies(strategies: Iterable): + from share.search import index_strategy with mock.patch.object(index_strategy, '_AvailableStrategies', new=enum.Enum( '_AvailableStrategies', [ (_strategy.strategy_name, _strategy) @@ -15,3 +14,14 @@ def patch_index_strategies(strategies: Iterable[index_strategy.IndexStrategy]): ], )): yield + + +@contextlib.contextmanager +def patch_index_strategy(strategy): + from share.search import index_strategy as _module_to_patch + with ( + mock.patch.object(_module_to_patch, 'all_strategy_names', return_value=frozenset([strategy.strategy_name])), + mock.patch.object(_module_to_patch, 'each_strategy', return_value=[strategy]), + mock.patch.object(_module_to_patch, 'get_strategy', return_value=strategy), + ): + yield diff --git a/tests/share/search/end_to_end/__init__.py b/tests/share/search/end_to_end/__init__.py new file mode 100644 index 000000000..ea9b78354 --- /dev/null +++ b/tests/share/search/end_to_end/__init__.py @@ -0,0 +1 @@ +__all__ = () diff --git 
a/tests/share/search/end_to_end/_common.py b/tests/share/search/end_to_end/_common.py new file mode 100644 index 000000000..5501a07ab --- /dev/null +++ b/tests/share/search/end_to_end/_common.py @@ -0,0 +1,254 @@ +import datetime +import itertools +from urllib.parse import urlencode +from typing import Iterator + +from primitive_metadata import primitive_rdf as rdf + +from trove.vocab import mediatypes +from trove.vocab.namespaces import RDF, DCTERMS, OWL, FOAF, DCAT, BLARG, OSFMAP, TROVE +from tests.share.search.index_strategy._with_real_services import RealElasticTestCase +from tests.trove.factories import ( + create_indexcard, + index_indexcards, +) + + +# abstract base class -- subclasses need to implement RealElasticTestCase.get_index_strategy +class End2EndSearchTestCase(RealElasticTestCase): + MEDIATYPES = (mediatypes.JSONAPI,) # TODO: more + + def setUp(self): + super().setUp() + _indexcards = self._create_test_cards() + index_indexcards(self.index_strategy, _indexcards) + + ### + # test methods + + def test_like_osfsearch(self): + # cardsearch + for _queryparams, _expected_focus_iris in self._cardsearch_cases(): + self._test_get_for_each_mediatype( + url_path='/trove/index-card-search', + queryparams=_queryparams, + actual_getter=self._get_cardsearch_focus_iris, + expected=_expected_focus_iris, + ) + # valuesearch + for _queryparams, _expected_values in self._valuesearch_cases(): + self._test_get_for_each_mediatype( + url_path='/trove/index-value-search', + queryparams=_queryparams, + actual_getter=self._get_valuesearch_values, + expected=_expected_values, + ) + + ### + # internals + + def _test_get_for_each_mediatype( + self, + url_path, + queryparams, + actual_getter, + expected, + ): + for _mediatype in self.MEDIATYPES: + _response = self._send_get(url_path, queryparams, _mediatype) + _actual = actual_getter(_response) + self.assertEqual(_actual, expected) + + def _create_test_cards(self): + self.all_card_focus_iris = { + BLARG.myproj, + 
BLARG.mypreprint, + } + self.card__myproj = create_indexcard(BLARG.myproj, { + RDF.type: {OSFMAP.Project}, + DCTERMS.title: {rdf.literal('my project', language='en')}, + DCTERMS.description: {rdf.literal('this project sure is.', language='en')}, + OWL.sameAs: {'https://doi.example/13.618/7', 'http://raid.example/whatever'}, + DCTERMS.creator: {BLARG.a_person, BLARG.nother_person}, + OSFMAP.keyword: {rdf.literal('keyword', language='en')}, + DCAT.accessService: {BLARG.anOsfOrSomething}, + DCTERMS.created: {rdf.literal(datetime.date(2020, 2, 2))}, + }, rdf_tripledict={ + BLARG.a_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: {rdf.literal('peerrr sssssooo oooonnn nnnnnnnn')}, + }, + BLARG.nother_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: {rdf.literal('nootthhh eeerrrppp peeeerrrrssssooooonnnnn')}, + OSFMAP.affiliation: {BLARG.an_institution}, + }, + BLARG.an_institution: { + RDF.type: {DCTERMS.Agent, FOAF.Organization}, + FOAF.name: {rdf.literal('innssttt iiitttuuuu ttttiiiioooonnnnn')}, + OSFMAP.affiliation: {BLARG.an_institution}, + }, + }, deriver_iris=(TROVE['derive/osfmap_json'],)) + self.card__mypreprint = create_indexcard(BLARG.mypreprint, { + RDF.type: {OSFMAP.Preprint}, + DCTERMS.title: {rdf.literal('my preprint', language='en')}, + DCTERMS.description: {rdf.literal('this preprint sure is that.', language='en')}, + OWL.sameAs: {'https://doi.example/13.618/11', 'http://raid.example/whateverz'}, + DCTERMS.creator: {BLARG.nother_person, BLARG.third_person}, + OSFMAP.keyword: { + rdf.literal('keyword', language='en'), + rdf.literal('lockword', language='en'), + }, + DCAT.accessService: {BLARG.anOsfOrSomething}, + DCTERMS.created: {rdf.literal(datetime.date(2022, 2, 2))}, + }, rdf_tripledict={ + BLARG.nother_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: {rdf.literal('nootthhh eeerrrppp peeeerrrrssssooooonnnnn')}, + }, + BLARG.third_person: { + RDF.type: {DCTERMS.Agent, FOAF.Person}, + FOAF.name: 
{rdf.literal('⚞33οΈβƒ£πŸ•’πŸ₯‰ ☘️🎢 Β³β‘Άβž‚ βšžπŸ‘©β€πŸ‘©β€πŸ‘§βšŸ γ›β¬±βšŸ')}, + }, + BLARG.an_institution: { + RDF.type: {DCTERMS.Agent, FOAF.Organization}, + FOAF.name: {rdf.literal('innssttt iiitttuuuu ttttiiiioooonnnnn')}, + }, + }, deriver_iris=(TROVE['derive/osfmap_json'],)) + return [ + self.card__myproj, + self.card__mypreprint, + ] + + def _send_get(self, base_url, queryparams, mediatype): + assert '?' not in base_url + queryparams['acceptMediatype'] = mediatype + _url = f'{base_url}?{urlencode(queryparams)}' + return self.client.get(_url) + + def _get_cardsearch_focus_iris(self, response): + if response.headers['Content-Type'] != mediatypes.JSONAPI: + raise NotImplementedError('TODO: more mediatypes') + _response_json = response.json() + return set(itertools.chain.from_iterable( + _json_resource['attributes']['resourceIdentifier'] + for _json_resource in _response_json['included'] + if _json_resource['type'] == 'index-card' + )) + + def _get_valuesearch_values(self, response): + if response.headers['Content-Type'] != mediatypes.JSONAPI: + raise NotImplementedError('TODO: more mediatypes') + _response_json = response.json() + return set(itertools.chain.from_iterable( + _json_resource['attributes']['resourceIdentifier'] + for _json_resource in _response_json['included'] + if _json_resource['type'] == 'index-card' + )) + + def _cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + yield ( # empty baseline + {}, # no query params + self.all_card_focus_iris, + ) + yield ( # osf-search "all types" tab + { + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + self.all_card_focus_iris, + ) + yield ( # osf-search "all types" tab (with cardSearchText) + { + 'cardSearchFilter[resourceType]': 
'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': 'βšžπŸ‘©β€πŸ‘©β€πŸ‘§βšŸ', + 'sort': '-relevance', + }, + {BLARG.mypreprint}, + ) + yield ( # osf-search "projects" tab + { + 'cardSearchFilter[resourceType]': 'Project,ProjectComponent', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.myproj}, + ) + yield ( # osf-search "preprints" tab + { + 'cardSearchFilter[resourceType]': 'Preprint', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.mypreprint}, + ) + yield ( # osf-search "registrations" tab + { + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + set(), # TODO + ) + yield ( # osf-search "files" tab + { + 'cardSearchFilter[resourceType]': 'File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + set(), # TODO + ) + + def _valuesearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str]]]: + yield ( # simple baseline + {'valueSearchPropertyPath': 'resourceType'}, + {OSFMAP.Project, OSFMAP.Preprint}, + ) + yield ( # osf-search "all types" tab; "creator" facet + { + 'valueSearchPropertyPath': 'creator', + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + 
{BLARG.a_person, BLARG.nother_person, BLARG.third_person}, + ) + yield ( # osf-search "all types" tab; "creator" facet with valueSearchText + { + 'valueSearchPropertyPath': 'creator', + 'valueSearchText': 'βšžπŸ‘©β€πŸ‘©β€πŸ‘§βšŸ', + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.third_person}, + ) + yield ( # osf-search "preprints" tab; "creator" facet + { + 'valueSearchPropertyPath': 'creator', + 'cardSearchFilter[resourceType]': 'Preprint', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {BLARG.nother_person, BLARG.third_person}, + ) + yield ( # osf-search "all types" tab; "dateCreated" facet + { + 'valueSearchPropertyPath': 'dateCreated', + 'cardSearchFilter[resourceType]': 'Registration,RegistrationComponent,Project,ProjectComponent,Preprint,Agent,File', + 'cardSearchFilter[accessService]': BLARG.anOsfOrSomething, + 'cardSearchText[*,creator.name,isContainedBy.creator.name]': '', + 'sort': '-relevance', + }, + {'2020', '2022'}, # year histogram + ) diff --git a/tests/share/search/end_to_end/test_osfsearch_on_trovesearch_denorm.py b/tests/share/search/end_to_end/test_osfsearch_on_trovesearch_denorm.py new file mode 100644 index 000000000..a29023158 --- /dev/null +++ b/tests/share/search/end_to_end/test_osfsearch_on_trovesearch_denorm.py @@ -0,0 +1,7 @@ +from share.search.index_strategy.trovesearch_denorm import TrovesearchDenormIndexStrategy +from . 
import _common + + +class TestOsfsearchOnTrovesearchDenorm(_common.End2EndSearchTestCase): + def get_index_strategy(self): # for RealElasticTestCase + return TrovesearchDenormIndexStrategy('test_osfsearch_on_trovesearch_denorm') diff --git a/tests/share/search/index_strategy/_common_trovesearch_tests.py b/tests/share/search/index_strategy/_common_trovesearch_tests.py index 6d6eab52b..3d5f51e58 100644 --- a/tests/share/search/index_strategy/_common_trovesearch_tests.py +++ b/tests/share/search/index_strategy/_common_trovesearch_tests.py @@ -15,6 +15,7 @@ create_indexcard, update_indexcard_content, create_supplement, + index_indexcards, ) from ._with_real_services import RealElasticTestCase @@ -30,7 +31,7 @@ def setUp(self): def test_for_smoke_without_daemon(self): _indexcard = self._create_indexcard( focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + rdf_twopledict={RDFS.label: {rdf.literal('hello')}}, ) _messages_chunk = messages.MessagesChunk( messages.MessageType.UPDATE_INDEXCARD, @@ -44,7 +45,7 @@ def test_for_smoke_without_daemon(self): def test_for_smoke_with_daemon(self): _indexcard = self._create_indexcard( focus_iri=BLARG.hello, - rdf_tripledict={BLARG.hello: {RDFS.label: {rdf.literal('hello')}}}, + rdf_twopledict={RDFS.label: {rdf.literal('hello')}}, ) _messages_chunk = messages.MessagesChunk( messages.MessageType.UPDATE_INDEXCARD, @@ -78,11 +79,9 @@ def test_cardsearch_after_deletion(self): def test_cardsearch_after_updates(self): _cards = self._fill_test_data_for_querying() self._update_indexcard_content(_cards[BLARG.c], BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c}, # subj_bc removed; subj_c added - DCTERMS.title: {rdf.literal('cccc')}, - }, + RDF.type: {BLARG.Thing}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c}, # subj_bc removed; subj_c added + DCTERMS.title: {rdf.literal('cccc')}, }) self._index_indexcards([_cards[BLARG.c]]) _cases = [ @@ -112,11 
+111,9 @@ def test_cardsearch_pagination(self): _focus_iri = BLARG[f'i{_i}'] _expected_iris.add(_focus_iri) _cards.append(self._create_indexcard(_focus_iri, { - _focus_iri: { - RDF.type: {BLARG.Thing}, - DCTERMS.title: {rdf.literal(f'card #{_i}')}, - DCTERMS.created: {rdf.literal(_start_date + timedelta(weeks=_i, days=_i))}, - }, + RDF.type: {BLARG.Thing}, + DCTERMS.title: {rdf.literal(f'card #{_i}')}, + DCTERMS.created: {rdf.literal(_start_date + timedelta(weeks=_i, days=_i))}, })) self._index_indexcards(_cards) # gather all pages results: @@ -145,7 +142,7 @@ def test_cardsearch_pagination(self): def test_cardsearch_related_properties(self): self._fill_test_data_for_querying() with mock.patch( - 'trove.trovesearch.search_params.suggested_property_paths', + 'trove.vocab.osfmap.suggested_property_paths', return_value=( (DCTERMS.creator,), (DCTERMS.references,), @@ -187,12 +184,10 @@ def test_valuesearch_after_deletion(self): def test_valuesearch_after_updates(self): _cards = self._fill_test_data_for_querying() self._update_indexcard_content(_cards[BLARG.c], BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.creator: {BLARG.someone_new}, # someone_else removed; someone_new added - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c, BLARG.subj_new}, # subj_bc removed; subj_new added - DCTERMS.title: {rdf.literal('cccc')}, - }, + RDF.type: {BLARG.Thing}, + DCTERMS.creator: {BLARG.someone_new}, # someone_else removed; someone_new added + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_c, BLARG.subj_new}, # subj_bc removed; subj_new added + DCTERMS.title: {rdf.literal('cccc')}, }) self._index_indexcards([_cards[BLARG.c]]) _cases = [ @@ -239,16 +234,15 @@ def _assert_valuesearch_values(self, queryparams, expected_values): def _fill_test_data_for_querying(self): _card_a = self._create_indexcard(BLARG.a, { - BLARG.a: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, - DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, - DCTERMS.creator: 
{BLARG.someone}, - DCTERMS.title: {rdf.literal('aaaa')}, - DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, - DCTERMS.references: {BLARG.b, BLARG.c}, - DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, - }, + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.a_same, BLARG.a_same2}, + DCTERMS.created: {rdf.literal(date(1999, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('aaaa')}, + DCTERMS.subject: {BLARG.subj_ac, BLARG.subj_a}, + DCTERMS.references: {BLARG.b, BLARG.c}, + DCTERMS.description: {rdf.literal('This place is not a place of honor... no highly esteemed deed is commemorated here... nothing valued is here.', language='en')}, + }, rdf_tripledict={ BLARG.someone: { FOAF.name: {rdf.literal('some one')}, }, @@ -265,16 +259,15 @@ def _fill_test_data_for_querying(self): }, }) _card_b = self._create_indexcard(BLARG.b, { - BLARG.b: { - RDF.type: {BLARG.Thing}, - OWL.sameAs: {BLARG.b_same}, - DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, - DCTERMS.creator: {BLARG.someone}, - DCTERMS.title: {rdf.literal('bbbb')}, - DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, - DCTERMS.references: {BLARG.c}, - DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. ', language='en')}, - }, + RDF.type: {BLARG.Thing}, + OWL.sameAs: {BLARG.b_same}, + DCTERMS.created: {rdf.literal(date(2012, 12, 31))}, + DCTERMS.creator: {BLARG.someone}, + DCTERMS.title: {rdf.literal('bbbb')}, + DCTERMS.subject: {BLARG.subj_b, BLARG.subj_bc}, + DCTERMS.references: {BLARG.c}, + DCTERMS.description: {rdf.literal('What is here was dangerous and repulsive to us. This message is a warning about danger. 
', language='en')}, + }, rdf_tripledict={ BLARG.someone: { FOAF.name: {rdf.literal('some one')}, }, @@ -285,44 +278,37 @@ def _fill_test_data_for_querying(self): }, }) _card_c = self._create_indexcard(BLARG.c, { - BLARG.c: { - RDF.type: {BLARG.Thing}, - DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, - DCTERMS.creator: {BLARG.someone_else}, - DCTERMS.title: {rdf.literal('cccc')}, - DCTERMS.subject: { - BLARG['subj_ac/'], # this one has an extra trailing slash - BLARG.subj_bc, - BLARG.subj_c, - }, - DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. This place is best shunned and left uninhabited.', language='en')}, + RDF.type: {BLARG.Thing}, + DCTERMS.created: {rdf.literal(date(2024, 12, 31))}, + DCTERMS.creator: {BLARG.someone_else}, + DCTERMS.title: {rdf.literal('cccc')}, + DCTERMS.subject: { + BLARG['subj_ac/'], # this one has an extra trailing slash + BLARG.subj_bc, + BLARG.subj_c, }, + DCTERMS.description: {rdf.literal('The danger is unleashed only if you substantially disturb this place physically. 
This place is best shunned and left uninhabited.', language='en')}, + }, rdf_tripledict={ BLARG.someone_else: { FOAF.name: {rdf.literal('some one else')}, }, }) create_supplement(_card_a, BLARG.a, { - BLARG.a: { - DCTERMS.replaces: {BLARG.a_past}, - DCAT.servesDataset: { - rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(10)}}), - }, + DCTERMS.replaces: {BLARG.a_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(10)}}), }, }) create_supplement(_card_b, BLARG.b, { - BLARG.b: { - DCTERMS.replaces: {BLARG.b_past}, - DCAT.servesDataset: { - rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(7)}}), - }, + DCTERMS.replaces: {BLARG.b_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(7)}}), }, }) create_supplement(_card_c, BLARG.c, { - BLARG.c: { - DCTERMS.replaces: {BLARG.c_past}, - DCAT.servesDataset: { - rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(333)}}), - }, + DCTERMS.replaces: {BLARG.c_past}, + DCAT.servesDataset: { + rdf.blanknode({DCAT.spatialResolutionInMeters: {rdf.literal(333)}}), }, }) _cards = { @@ -468,26 +454,10 @@ def cardsearch_cases(self) -> Iterator[tuple[dict[str, str], set[str] | list[str {'cardSearchText': 'bbbb'}, {BLARG.b}, ) - yield ( - {'cardSearchText': '-bbbb'}, - {BLARG.a, BLARG.c}, - ) yield ( {'cardSearchText': 'danger'}, {BLARG.b, BLARG.c}, ) - yield ( - {'cardSearchText': 'dangre'}, - {BLARG.b, BLARG.c}, - ) - yield ( - {'cardSearchText': '"dangre"'}, - set(), - ) - yield ( - {'cardSearchText': 'danger -repulsive'}, - {BLARG.c}, - ) yield ( {'cardSearchText': '"nothing valued is here"'}, {BLARG.a}, @@ -608,8 +578,13 @@ def valuesearch_sameas_cases(self): {BLARG.subj_ac, BLARG.subj_a, BLARG.subj_c, BLARG.subj_bc}, ) - def _create_indexcard(self, focus_iri: str, rdf_tripledict: rdf.RdfTripleDictionary) -> trove_db.Indexcard: - _indexcard = create_indexcard(focus_iri, rdf_tripledict, (TROVE['derive/osfmap_json'],)) + def 
_create_indexcard( + self, + focus_iri: str, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, + ) -> trove_db.Indexcard: + _indexcard = create_indexcard(focus_iri, rdf_twopledict, rdf_tripledict, (TROVE['derive/osfmap_json'],)) self._indexcard_focus_by_uuid[str(_indexcard.uuid)] = focus_iri return _indexcard @@ -617,21 +592,14 @@ def _update_indexcard_content( self, indexcard: trove_db.Indexcard, focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, ) -> None: - update_indexcard_content(indexcard, focus_iri, rdf_tripledict) + update_indexcard_content(indexcard, focus_iri, rdf_twopledict, rdf_tripledict) self._indexcard_focus_by_uuid[str(indexcard.uuid)] = focus_iri def _index_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): - _messages_chunk = messages.MessagesChunk( - messages.MessageType.UPDATE_INDEXCARD, - [_indexcard.id for _indexcard in indexcards], - ) - self.assertTrue(all( - _response.is_done - for _response in self.index_strategy.pls_handle_messages_chunk(_messages_chunk) - )) - self.index_strategy.pls_refresh() + index_indexcards(self.index_strategy, indexcards) def _delete_indexcards(self, indexcards: Iterable[trove_db.Indexcard]): for _indexcard in indexcards: diff --git a/tests/share/search/index_strategy/_with_real_services.py b/tests/share/search/index_strategy/_with_real_services.py index 8ad685026..a4219b312 100644 --- a/tests/share/search/index_strategy/_with_real_services.py +++ b/tests/share/search/index_strategy/_with_real_services.py @@ -1,3 +1,4 @@ +import abc import contextlib from unittest import mock @@ -8,17 +9,21 @@ from share.search.daemon import IndexerDaemonControl from share.search.index_messenger import IndexMessenger from share.search import index_strategy -from tests.share.search import patch_index_strategies +from tests.share.search 
import patch_index_strategy # base class for testing IndexStrategy subclasses with actual elasticsearch. # (using TransactionTestCase so there's NOT a transaction wrapping each test # and IndexerDaemon can use a separate db connection from a separate thread) -class RealElasticTestCase(TransactionTestCase): +class RealElasticTestCase(TransactionTestCase, abc.ABC): serialized_rollback = True # for TransactionTestCase; restore db after - # required for subclasses + @abc.abstractmethod def get_index_strategy(self) -> index_strategy.IndexStrategy: + '''return an IndexStrategy instance that will be tested + + override in subclasses to reuse these tests + ''' raise NotImplementedError(f'{self.__class__} must implement `get_index_strategy`') def setUp(self): @@ -26,7 +31,7 @@ def setUp(self): self.enterContext(mock.patch('share.models.core._setup_user_token_and_groups')) self.index_strategy = self.get_index_strategy() self.index_strategy.pls_teardown() # in case it already exists - self.enterContext(patch_index_strategies([self.index_strategy])) + self.enterContext(patch_index_strategy(self.index_strategy)) self.index_messenger = IndexMessenger( celery_app=celery_app, index_strategys=[self.index_strategy], diff --git a/tests/share/search/index_strategy/test_sharev2_elastic5.py b/tests/share/search/index_strategy/test_sharev2_elastic5.py index 016330c84..8d0d84e73 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic5.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic5.py @@ -24,10 +24,8 @@ def setUp(self): self.__indexcard = create_indexcard( BLARG.hello, { - BLARG.hello: { - RDF.type: {SHAREv2.CreativeWork}, - DCTERMS.title: {rdf.literal('hello', language='en')}, - }, + RDF.type: {SHAREv2.CreativeWork}, + DCTERMS.title: {rdf.literal('hello', language='en')}, }, deriver_iris=[SHAREv2.sharev2_elastic], ) diff --git a/tests/share/search/index_strategy/test_sharev2_elastic8.py b/tests/share/search/index_strategy/test_sharev2_elastic8.py index 
fb3a1a5c9..c41667000 100644 --- a/tests/share/search/index_strategy/test_sharev2_elastic8.py +++ b/tests/share/search/index_strategy/test_sharev2_elastic8.py @@ -17,10 +17,8 @@ def setUp(self): self.__indexcard = create_indexcard( BLARG.hello, { - BLARG.hello: { - RDF.type: {SHAREv2.CreativeWork}, - DCTERMS.title: {rdf.literal('hello', language='en')}, - }, + RDF.type: {SHAREv2.CreativeWork}, + DCTERMS.title: {rdf.literal('hello', language='en')}, }, deriver_iris=[SHAREv2.sharev2_elastic], ) diff --git a/tests/share/search/index_strategy/test_strategy_selection.py b/tests/share/search/index_strategy/test_strategy_selection.py index b4d8a1045..a017bc2ba 100644 --- a/tests/share/search/index_strategy/test_strategy_selection.py +++ b/tests/share/search/index_strategy/test_strategy_selection.py @@ -8,7 +8,6 @@ get_strategy, sharev2_elastic5, sharev2_elastic8, - trove_indexcard_flats, trovesearch_denorm, parse_strategy_name, ) @@ -21,7 +20,6 @@ def patched_strategies(mock_elastic_clients): _strategies = [ sharev2_elastic5.Sharev2Elastic5IndexStrategy('sharev2_elastic5'), sharev2_elastic8.Sharev2Elastic8IndexStrategy('sharev2_elastic8'), - trove_indexcard_flats.TroveIndexcardFlatsIndexStrategy('trove_indexcard_flats'), trovesearch_denorm.TrovesearchDenormIndexStrategy('trovesearch_denorm'), ] with patch_index_strategies(_strategies): diff --git a/tests/share/search/index_strategy/test_trove_indexcard_flats.py b/tests/share/search/index_strategy/test_trove_indexcard_flats.py deleted file mode 100644 index 0718ad346..000000000 --- a/tests/share/search/index_strategy/test_trove_indexcard_flats.py +++ /dev/null @@ -1,21 +0,0 @@ -from share.search.index_strategy.trove_indexcard_flats import TroveIndexcardFlatsIndexStrategy - -from . 
import _common_trovesearch_tests - - -class TestTroveIndexcardFlats(_common_trovesearch_tests.CommonTrovesearchTests): - # for RealElasticTestCase - def get_index_strategy(self): - return TroveIndexcardFlatsIndexStrategy('test_trove_indexcard_flats') - - def cardsearch_integer_cases(self): - yield from () # integers not indexed by this strategy - - def cardsearch_trailingslash_cases(self): - yield from () # trailing-slash handling improved in trovesearch_denorm - - def valuesearch_sameas_cases(self): - yield from () # sameas handling improved in trovesearch_denorm - - def valuesearch_trailingslash_cases(self): - yield from () # trailing-slash handling improved in trovesearch_denorm diff --git a/tests/share/search/test_daemon.py b/tests/share/search/test_daemon.py index ba0842a3d..b172a6ddb 100644 --- a/tests/share/search/test_daemon.py +++ b/tests/share/search/test_daemon.py @@ -113,6 +113,7 @@ def test_unsupported_message_type(self): daemon.on_message(unsupported_message.payload, unsupported_message) assert not unsupported_message.acked + @pytest.mark.filterwarnings('ignore::pytest.PytestUnhandledThreadExceptionWarning') def test_unexpected_error(self): class UnexpectedError(Exception): pass @@ -128,10 +129,7 @@ def pls_handle_messages_chunk(self, messages_chunk): with mock.patch('share.search.daemon.sentry_sdk') as mock_sentry: with mock.patch('share.search.daemon.logger') as mock_logger: - with _daemon_running( - FakeIndexStrategyWithUnexpectedError(), - daemonthread_context=lambda: pytest.raises(UnexpectedError) - ) as daemon: + with _daemon_running(FakeIndexStrategyWithUnexpectedError()) as daemon: message = FakeCeleryMessage(messages.MessageType.INDEX_SUID, 1) daemon.on_message(message.payload, message) assert daemon.stop_event.wait(timeout=10), ( @@ -140,6 +138,7 @@ def pls_handle_messages_chunk(self, messages_chunk): mock_sentry.capture_exception.assert_called_once() mock_logger.exception.assert_called_once() + 
@pytest.mark.filterwarnings('ignore::pytest.PytestUnhandledThreadExceptionWarning') def test_noncurrent_backfill(self): class FakeIndexStrategyWithNoncurrentBackfill: CURRENT_STRATEGY_CHECKSUM = 'not-what-you-expected' @@ -153,10 +152,7 @@ class FakeIndexBackfill: strategy_checksum = 'what-you-expected' return FakeIndexBackfill() - with _daemon_running( - FakeIndexStrategyWithNoncurrentBackfill(), - daemonthread_context=lambda: pytest.raises(exceptions.DaemonSetupError) - ) as daemon: + with _daemon_running(FakeIndexStrategyWithNoncurrentBackfill()) as daemon: message = FakeCeleryMessage(messages.MessageType.BACKFILL_SUID, 1) daemon.on_message(message.payload, message) assert daemon.stop_event.wait(timeout=10), ( diff --git a/tests/trove/factories.py b/tests/trove/factories.py index 475cdc80f..1a7d4b31b 100644 --- a/tests/trove/factories.py +++ b/tests/trove/factories.py @@ -8,14 +8,26 @@ from trove import digestive_tract +__all__ = ( + 'create_indexcard', + 'create_supplement', + 'index_indexcards', + 'update_indexcard_content', +) + + def create_indexcard( focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, deriver_iris: Collection[str] = (), ) -> trove_db.Indexcard: _suid = factories.SourceUniqueIdentifierFactory() _indexcard = trove_db.Indexcard.objects.create(source_record_suid=_suid) - update_indexcard_content(_indexcard, focus_iri, rdf_tripledict) + _indexcard.focus_identifier_set.add( + trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri), + ) + update_indexcard_content(_indexcard, focus_iri, rdf_twopledict, rdf_tripledict) if deriver_iris: digestive_tract.derive(_indexcard, deriver_iris) return _indexcard @@ -24,15 +36,21 @@ def create_indexcard( def update_indexcard_content( indexcard: trove_db.Indexcard, focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + 
rdf_tripledict: rdf.RdfTripleDictionary | None = None, ) -> None: - _raw = factories.RawDatumFactory(suid=indexcard.source_record_suid) + _card_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict) + _card_content_turtle = rdf.turtle_from_tripledict(_card_content) + _raw = factories.RawDatumFactory(suid=indexcard.source_record_suid, datum=_card_content_turtle) + indexcard.focus_identifier_set.add( + trove_db.ResourceIdentifier.objects.get_or_create_for_iri(focus_iri), + ) trove_db.LatestIndexcardRdf.objects.update_or_create( indexcard=indexcard, defaults={ 'from_raw_datum': _raw, 'focus_iri': focus_iri, - 'rdf_as_turtle': rdf.turtle_from_tripledict(rdf_tripledict), + 'rdf_as_turtle': _card_content_turtle, 'turtle_checksum_iri': 'foo', # not enforced }, ) @@ -41,15 +59,44 @@ def update_indexcard_content( def create_supplement( indexcard: trove_db.Indexcard, focus_iri: str, - rdf_tripledict: rdf.RdfTripleDictionary, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, ) -> trove_db.SupplementaryIndexcardRdf: _supp_suid = factories.SourceUniqueIdentifierFactory() - _supp_raw = factories.RawDatumFactory(suid=_supp_suid) + _supp_content = _combined_tripledict(focus_iri, rdf_twopledict, rdf_tripledict) + _supp_content_turtle = rdf.turtle_from_tripledict(_supp_content) + _supp_raw = factories.RawDatumFactory(suid=_supp_suid, datum=_supp_content_turtle) return trove_db.SupplementaryIndexcardRdf.objects.create( from_raw_datum=_supp_raw, indexcard=indexcard, supplementary_suid=_supp_suid, focus_iri=focus_iri, - rdf_as_turtle=rdf.turtle_from_tripledict(rdf_tripledict), + rdf_as_turtle=_supp_content_turtle, turtle_checksum_iri='sup', # not enforced ) + + +def index_indexcards(index_strategy, indexcards): + from share.search import messages + _messages_chunk = messages.MessagesChunk( + messages.MessageType.UPDATE_INDEXCARD, + [_indexcard.id for _indexcard in indexcards], + ) + assert all( + 
_response.is_done + for _response in index_strategy.pls_handle_messages_chunk(_messages_chunk) + ) + index_strategy.pls_refresh() + + +def _combined_tripledict( + focus_iri: str, + rdf_twopledict: rdf.RdfTwopleDictionary | None = None, + rdf_tripledict: rdf.RdfTripleDictionary | None = None, +) -> rdf.RdfTripleDictionary: + _graph = rdf.RdfGraph() + if rdf_twopledict is not None: + _graph.add_twopledict(focus_iri, rdf_twopledict) + if rdf_tripledict is not None: + _graph.add_tripledict(rdf_tripledict) + return _graph.tripledict diff --git a/tests/trove/render/test_jsonapi_renderer.py b/tests/trove/render/test_jsonapi_renderer.py index a5e8bdc6d..9357c5ff6 100644 --- a/tests/trove/render/test_jsonapi_renderer.py +++ b/tests/trove/render/test_jsonapi_renderer.py @@ -35,7 +35,7 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmQ=", + "id": "blarg:aCard", "type": "index-card", "attributes": { "resourceIdentifier": [ @@ -51,7 +51,7 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): }, "meta": { "foaf:primaryTopic": [ - "blarg:anItem" + {"@id": "blarg:anItem"}, ], "dcterms:issued": [ "2024-01-01" @@ -67,10 +67,10 @@ class TestJsonapiRenderer(_BaseJsonapiRendererTest): mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYVN1YmplY3Q=", + "id": "blarg:aSubject", "type": "blarg:aType", "meta": { - "blarg:hasIri": ["blarg:anIri"], + "blarg:hasIri": [{"@id": "blarg:anIri"}], "blarg:hasRdfStringLiteral": ["an rdf:string literal"], "blarg:hasRdfLangStringLiteral": ['a rdf:langString literal'], "blarg:hasIntegerLiteral": [17], @@ -90,7 +90,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYVNlYXJjaA==", + "id": 
"blarg:aSearch", "type": "index-card-search", "attributes": { "totalResultCount": 0, @@ -105,7 +105,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR mediatype='application/vnd.api+json', rendered_content=json.dumps({ "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYVNlYXJjaEZldw==", + "id": "blarg:aSearchFew", "type": "index-card-search", "attributes": { "totalResultCount": 3 @@ -139,7 +139,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "relationships": { "indexCard": { "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmQ=", + "id": "blarg:aCard", "type": "index-card" } } @@ -151,7 +151,7 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "relationships": { "indexCard": { "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRk", + "id": "blarg:aCardd", "type": "index-card" } } @@ -163,18 +163,18 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR "relationships": { "indexCard": { "data": { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRkZA==", + "id": "blarg:aCarddd", "type": "index-card" } } } }, { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmQ=", + "id": "blarg:aCard", "type": "index-card", "meta": { "foaf:primaryTopic": [ - "blarg:anItem" + {"@id": "blarg:anItem"}, ], "dcterms:issued": [ "2024-01-01" @@ -197,11 +197,11 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR } }, { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRkZA==", + "id": "blarg:aCarddd", "type": "index-card", "meta": { "foaf:primaryTopic": [ - "blarg:anItemmm" + {"@id": "blarg:anItemmm"}, ], "dcterms:issued": [ "2024-03-03" @@ -224,11 +224,11 @@ class TestJsonapiSearchRenderer(_BaseJsonapiRendererTest, _base.TrovesearchJsonR } }, { - "id": "aHR0cDovL2JsYXJnLmV4YW1wbGUvdm9jYWIvYUNhcmRk", + "id": "blarg:aCardd", "type": "index-card", "meta": { "foaf:primaryTopic": [ - "blarg:anItemm" + 
{"@id": "blarg:anItemm"}, ], "dcterms:issued": [ "2024-02-02" diff --git a/tests/trove/render/test_simple_json_renderer.py b/tests/trove/render/test_simple_json_renderer.py index d9481e183..7f59c8a59 100644 --- a/tests/trove/render/test_simple_json_renderer.py +++ b/tests/trove/render/test_simple_json_renderer.py @@ -28,7 +28,7 @@ class TestSimpleJsonRenderer(_base.TrovesearchJsonRendererTests): { "@id": BLARG.anItem, "title": "an item, yes", - "foaf:primaryTopicOf": [ + "foaf:isPrimaryTopicOf": [ { "@id": BLARG.aCard } @@ -37,7 +37,7 @@ class TestSimpleJsonRenderer(_base.TrovesearchJsonRendererTests): { "@id": BLARG.anItemm, "title": "an itemm, yes", - "foaf:primaryTopicOf": [ + "foaf:isPrimaryTopicOf": [ { "@id": BLARG.aCardd } @@ -46,7 +46,7 @@ class TestSimpleJsonRenderer(_base.TrovesearchJsonRendererTests): { "@id": BLARG.anItemmm, "title": "an itemmm, yes", - "foaf:primaryTopicOf": [ + "foaf:isPrimaryTopicOf": [ { "@id": BLARG.aCarddd } diff --git a/tests/trove/test_doctest.py b/tests/trove/test_doctest.py new file mode 100644 index 000000000..18c77a18b --- /dev/null +++ b/tests/trove/test_doctest.py @@ -0,0 +1,37 @@ +import doctest + +import trove.util.chainmap +import trove.util.frozen +import trove.util.iris +import trove.util.propertypath + +_DOCTEST_OPTIONFLAGS = ( + doctest.ELLIPSIS + | doctest.NORMALIZE_WHITESPACE +) + +_MODULES_WITH_DOCTESTS = ( + trove.util.chainmap, + trove.util.frozen, + trove.util.iris, + trove.util.propertypath, +) + + +def _make_test_fn(testcase): + def _test(): + _result = testcase.run() + for _error_testcase, _traceback in _result.errors: + print(f'ERROR({_error_testcase}):\n{_traceback}') + for _error_testcase, _traceback in _result.failures: + print(f'FAILURE({_error_testcase}):\n{_traceback}') + assert not _result.failures and not _result.errors + return _test + + +for _module in _MODULES_WITH_DOCTESTS: + # HACK: allow running with pytest + globals().update({ + f'test_doctest_{_module.__name__}_{_i}': 
_make_test_fn(_test_case) + for _i, _test_case in enumerate(doctest.DocTestSuite(_module, optionflags=_DOCTEST_OPTIONFLAGS)) + }) diff --git a/tests/trove/trovesearch/test_search_params.py b/tests/trove/trovesearch/test_search_params.py index 3b9f0e6f4..655d25c68 100644 --- a/tests/trove/trovesearch/test_search_params.py +++ b/tests/trove/trovesearch/test_search_params.py @@ -1,69 +1,50 @@ +import urllib + from django.test import SimpleTestCase from trove.trovesearch.search_params import ( - Textsegment, - SearchFilter, + SearchText, + SearchFilter, DEFAULT_PROPERTYPATH_SET, ) -from trove.util.queryparams import QueryparamName +from trove.util.queryparams import QueryparamName, queryparams_from_querystring from trove.vocab.namespaces import OSFMAP, RDF, DCTERMS -class TestTextsegment(SimpleTestCase): - def test_empty(self): - for _empty_input in ('', '""', '*', '-', '-""'): - _empty = set(Textsegment.iter_from_text(_empty_input)) - self.assertFalse(_empty) +class TestSearchText(SimpleTestCase): + def test_from_queryparam_family_with_empty_value(self): + _qp = queryparams_from_querystring('myBlargText[foo]=') + result = SearchText.from_queryparam_family(_qp, 'myBlargText') + self.assertEqual(result, frozenset()) - def test_fuzz(self): - _fuzzword = set(Textsegment.iter_from_text('woord')) - self.assertEqual(_fuzzword, frozenset(( - Textsegment('woord', is_fuzzy=True, is_negated=False, is_openended=True), - ))) - _fuzzphrase = set(Textsegment.iter_from_text('wibbleplop worble polp elbbiw')) - self.assertEqual(_fuzzphrase, frozenset(( - Textsegment('wibbleplop worble polp elbbiw', is_fuzzy=True, is_negated=False, is_openended=True), - ))) + def test_single_word(self): + qp = queryparams_from_querystring('myBlargText=word') + (st,) = SearchText.from_queryparam_family(qp, 'myBlargText') + self.assertEqual(st.text, "word") + self.assertEqual(st.propertypath_set, DEFAULT_PROPERTYPATH_SET) - def test_exact(self): - _exactword = set(Textsegment.iter_from_text('"woord"')) - 
self.assertEqual(_exactword, frozenset(( - Textsegment('woord', is_fuzzy=False, is_negated=False, is_openended=False), - ))) - _exactphrase = set(Textsegment.iter_from_text('"wibbleplop worble polp elbbiw"')) - self.assertEqual(_exactphrase, frozenset(( - Textsegment('wibbleplop worble polp elbbiw', is_fuzzy=False, is_negated=False, is_openended=False), - ))) - _openphrase = set(Textsegment.iter_from_text('"wibbleplop worble polp elbbiw')) - self.assertEqual(_openphrase, frozenset(( - Textsegment('wibbleplop worble polp elbbiw', is_fuzzy=False, is_negated=False, is_openended=True), - ))) + def test_multiple_words(self): + qp = queryparams_from_querystring('myBlargText=apple&myBlargText=banana&myBlargText=cherry&anotherText=no') + result = SearchText.from_queryparam_family(qp, 'myBlargText') + self.assertEqual(result, {SearchText('apple'), SearchText('banana'), SearchText('cherry')}) - def test_minus(self): - _minusword = set(Textsegment.iter_from_text('-woord')) - self.assertEqual(_minusword, frozenset(( - Textsegment('woord', is_fuzzy=False, is_negated=True, is_openended=False), - ))) - _minusexactword = set(Textsegment.iter_from_text('-"woord droow"')) - self.assertEqual(_minusexactword, frozenset(( - Textsegment('woord droow', is_fuzzy=False, is_negated=True, is_openended=False), - ))) - _minustwo = set(Textsegment.iter_from_text('abc -def -g hi there')) - self.assertEqual(_minustwo, frozenset(( - Textsegment('def', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('g', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('hi there', is_fuzzy=True, is_negated=False, is_openended=True), - Textsegment('abc', is_fuzzy=True, is_negated=False, is_openended=False), - ))) + def test_text_with_spaces(self): + phrases = [ + "multi word phrase", + 'phrase with "double quotes"', + '~phrase~ with +special.characters AND \'mismatched quotes"' + ] + for phrase in phrases: + qp = 
queryparams_from_querystring(urllib.parse.urlencode({'myBlargText': phrase})) + (st,) = SearchText.from_queryparam_family(qp, 'myBlargText') + self.assertEqual(st.text, phrase) + self.assertEqual(st.propertypath_set, DEFAULT_PROPERTYPATH_SET) - def test_combo(self): - _combo = set(Textsegment.iter_from_text('wibbleplop -"worble polp" elbbiw -but "exactly')) - self.assertEqual(_combo, frozenset(( - Textsegment('worble polp', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('elbbiw', is_fuzzy=True, is_negated=False, is_openended=False), - Textsegment('wibbleplop', is_fuzzy=True, is_negated=False, is_openended=False), - Textsegment('but', is_fuzzy=False, is_negated=True, is_openended=False), - Textsegment('exactly', is_fuzzy=False, is_negated=False, is_openended=True), - ))) + def test_custom_propertypath_set(self): + qp = queryparams_from_querystring('myBlargText[title]=foo') + result = SearchText.from_queryparam_family(qp, 'myBlargText') + self.assertEqual(result, { + SearchText('foo', frozenset({(DCTERMS.title,)})) + }) class TestSearchFilterPath(SimpleTestCase): diff --git a/trove/derive/osfmap_json.py b/trove/derive/osfmap_json.py index 1666025f5..5298715cf 100644 --- a/trove/derive/osfmap_json.py +++ b/trove/derive/osfmap_json.py @@ -7,7 +7,7 @@ from trove.vocab.namespaces import TROVE, RDF, OWL from trove.vocab.osfmap import ( OSFMAP_THESAURUS, - osfmap_shorthand, + osfmap_json_shorthand, ) from ._base import IndexcardDeriver @@ -64,7 +64,7 @@ def rdfobject_as_jsonld(self, rdfobject: rdf.RdfObject) -> dict: # datatype iri (or non-standard language iri) _datatype_iris = sorted( ( - osfmap_shorthand().compact_iri(_datatype_iri) + osfmap_json_shorthand().compact_iri(_datatype_iri) for _datatype_iri in rdfobject.datatype_iris ), key=len, @@ -74,7 +74,7 @@ def rdfobject_as_jsonld(self, rdfobject: rdf.RdfObject) -> dict: '@type': (_datatype_iris if (len(_datatype_iris) > 1) else _datatype_iris[0]), } elif isinstance(rdfobject, str): - return 
{'@id': osfmap_shorthand().compact_iri(rdfobject)} + return {'@id': osfmap_json_shorthand().compact_iri(rdfobject)} elif isinstance(rdfobject, (float, int)): return {'@value': rdfobject} elif isinstance(rdfobject, datetime.date): @@ -91,7 +91,7 @@ def twopledict_as_jsonld(self, twopledict: rdf.RdfTwopleDictionary) -> dict: _jsonld = {} for _pred, _objectset in twopledict.items(): if _objectset: - _key = osfmap_shorthand().compact_iri(_pred) + _key = osfmap_json_shorthand().compact_iri(_pred) _jsonld[_key] = self._list_or_single_value(_pred, [ self.rdfobject_as_jsonld(_obj) for _obj in _objectset @@ -114,10 +114,10 @@ def __nested_rdfobject_as_jsonld( _nested_obj = ( {} if rdfobject.startswith('_:') # HACK: non-blank blank nodes (stop that) - else {'@id': osfmap_shorthand().compact_iri(rdfobject)} + else {'@id': osfmap_json_shorthand().compact_iri(rdfobject)} ) for _pred, _objectset in tripledict[rdfobject].items(): - _label = osfmap_shorthand().compact_iri(_pred) + _label = osfmap_json_shorthand().compact_iri(_pred) if _objectset: _nested_obj[_label] = self._list_or_single_value( _pred, diff --git a/trove/management/commands/ingest_from_another_shtrove.py b/trove/management/commands/ingest_from_another_shtrove.py new file mode 100644 index 000000000..09ab22aa6 --- /dev/null +++ b/trove/management/commands/ingest_from_another_shtrove.py @@ -0,0 +1,68 @@ +import functools +from itertools import islice +import re +from urllib.parse import urlunsplit + +from django.conf import settings +from django.core.management.base import BaseCommand +import requests + +from share import models as share_db +from trove import digestive_tract +from trove.vocab import mediatypes + + +class Command(BaseCommand): + help = "ingest metadata from another SHARE/trove instance" + + def add_arguments(self, parser): + parser.add_argument("host", help="host name of the shtrove instance (e.g. 
'staging-share.osf.io')") + parser.add_argument("--count", type=int, default=333) + + def handle(self, *args, host, count, **options): + if not settings.DEBUG: + raise Exception('this command not meant for non-debug use') + _ingested_count = 0 + _skipped_count = 0 + for _datum in islice(self._iter_datums(host), count): + if self._ingest(_datum): + _ingested_count += 1 + else: + _skipped_count += 1 + self.stdout.write( + self.style.SUCCESS(f'ingested {_ingested_count} (skipped {_skipped_count}) from {host}') + ) + + def _iter_datums(self, host: str): + _url = urlunsplit(('https', host, '/api/v2/rawdata/', '', '')) + while _url: + self.stdout.write('fetching a page...') + _json = requests.get(_url, headers={'Accept': mediatypes.JSONAPI}).json() + for _item in _json['data']: + yield _item['attributes']['datum'] + _url = _json['links'].get('next') + + def _ingest(self, datum: str) -> bool: + # HACK: get only turtle files by checking it starts with a prefix (unreliable, generally, but good enough for this) + _smells_like_turtle = datum.startswith('@prefix ') or datum.startswith('PREFIX ') + if _smells_like_turtle: + _first_subject_match = re.search( + r'^<([^>\s]+)>', # HACK: depends on specific serialization + datum, + re.MULTILINE, + ) + if _first_subject_match: + _subject_iri = _first_subject_match.group(1) + digestive_tract.swallow( + from_user=self._application_user, + record=datum, + record_identifier=_subject_iri, + record_mediatype=mediatypes.TURTLE, + focus_iri=_subject_iri, + ) + return True + return False + + @functools.cached_property + def _application_user(self): + return share_db.ShareUser.objects.get(username=settings.APPLICATION_USERNAME) diff --git a/trove/render/__init__.py b/trove/render/__init__.py index 351ac791f..2e1350ac4 100644 --- a/trove/render/__init__.py +++ b/trove/render/__init__.py @@ -11,7 +11,7 @@ from .simple_tsv import TrovesearchSimpleTsvRenderer -__all__ = ('get_renderer_type',) +__all__ = ('get_renderer_type', 'BaseRenderer') 
RENDERERS: tuple[type[BaseRenderer], ...] = ( RdfHtmlBrowseRenderer, diff --git a/trove/render/_base.py b/trove/render/_base.py index 996ff6744..48cfe1cc8 100644 --- a/trove/render/_base.py +++ b/trove/render/_base.py @@ -11,8 +11,8 @@ from trove import exceptions as trove_exceptions from trove.vocab import mediatypes -from trove.vocab.namespaces import NAMESPACES_SHORTHAND from trove.vocab.trove import TROVE_API_THESAURUS +from trove.vocab.namespaces import namespaces_shorthand from ._rendering import ProtoRendering, SimpleRendering @@ -31,9 +31,14 @@ class BaseRenderer(abc.ABC): # instance fields response_focus: gather.Focus response_gathering: gather.Gathering - iri_shorthand: rdf.IriShorthand = NAMESPACES_SHORTHAND + iri_shorthand: rdf.IriShorthand = dataclasses.field(default_factory=namespaces_shorthand) thesaurus_tripledict: rdf.RdfTripleDictionary = dataclasses.field(default_factory=lambda: TROVE_API_THESAURUS) + @classmethod + def get_deriver_iri(cls, card_blending: bool): + # override if needed + return cls.INDEXCARD_DERIVER_IRI + @functools.cached_property def thesaurus(self): return rdf.RdfGraph(self.thesaurus_tripledict) diff --git a/trove/render/_html.py b/trove/render/_html.py new file mode 100644 index 000000000..45f775880 --- /dev/null +++ b/trove/render/_html.py @@ -0,0 +1,67 @@ +from __future__ import annotations +import contextlib +import dataclasses +from xml.etree.ElementTree import ( + Element, + SubElement, +) + +from primitive_metadata import primitive_rdf as rdf + + +__all__ = ('HtmlBuilder',) + + +@dataclasses.dataclass +class HtmlBuilder: + given_root: Element + _: dataclasses.KW_ONLY + _nested_elements: list[Element] = dataclasses.field(default_factory=list) + _heading_depth: int = 0 + + def __post_init__(self): + self._nested_elements.append(self.given_root) + + @property + def root_element(self) -> Element: + return self._nested_elements[0] + + @property + def _current_element(self) -> Element: + return self._nested_elements[-1] + + 
### + # html-building helper methods + + @contextlib.contextmanager + def nest_h_tag(self, **kwargs): + _outer_heading_depth = self._heading_depth + if not _outer_heading_depth: + self._heading_depth = 1 + elif _outer_heading_depth < 6: # h6 deepest + self._heading_depth += 1 + _h_tag = f'h{self._heading_depth}' + with self.nest(_h_tag, **kwargs) as _nested: + try: + yield _nested + finally: + self._heading_depth = _outer_heading_depth + + @contextlib.contextmanager + def nest(self, tag_name, attrs=None): + _attrs = {**attrs} if attrs else {} + _nested_element = SubElement(self._current_element, tag_name, _attrs) + self._nested_elements.append(_nested_element) + try: + yield self._current_element + finally: + _popped_element = self._nested_elements.pop() + assert _popped_element is _nested_element + + def leaf(self, tag_name, *, text=None, attrs=None): + _leaf_element = SubElement(self._current_element, tag_name, attrs or {}) + if isinstance(text, rdf.Literal): + # TODO: lang + _leaf_element.text = text.unicode_value + elif text is not None: + _leaf_element.text = text diff --git a/trove/render/html_browse.py b/trove/render/html_browse.py index c7dceaf0e..9fef803dd 100644 --- a/trove/render/html_browse.py +++ b/trove/render/html_browse.py @@ -1,27 +1,34 @@ +from collections.abc import Iterator import contextlib import dataclasses import datetime -import markdown2 +import math import random +import re +from typing import ClassVar from urllib.parse import quote, urlsplit, urlunsplit from xml.etree.ElementTree import ( Element, - SubElement, tostring as etree_tostring, fromstring as etree_fromstring, ) +from django.conf import settings from django.contrib.staticfiles.storage import staticfiles_storage from django.http import QueryDict from django.urls import reverse -from primitive_metadata import primitive_rdf +from django.utils.translation import gettext as _ +import markdown2 +from primitive_metadata import primitive_rdf as rdf from trove.util.iris import 
get_sufficiently_unique_iri from trove.util.randomness import shuffled from trove.vocab import mediatypes -from trove.vocab.namespaces import RDF +from trove.vocab.namespaces import RDF, RDFS, SKOS, DCTERMS, FOAF, DC +from trove.vocab.static_vocab import combined_thesaurus__suffuniq from trove.vocab.trove import trove_browse_link from ._base import BaseRenderer +from ._html import HtmlBuilder STABLE_MEDIATYPES = (mediatypes.JSONAPI,) UNSTABLE_MEDIATYPES = ( @@ -33,168 +40,237 @@ mediatypes.CSV, ) +_LINK_TEXT_PREDICATES = ( + SKOS.prefLabel, + RDFS.label, + SKOS.altLabel, + DCTERMS.title, + DC.title, + FOAF.name, +) +_IMPLICIT_DATATYPES = frozenset(( + RDF.string, + RDF.langString, +)) -@dataclasses.dataclass -class RdfHtmlBrowseRenderer(BaseRenderer): - MEDIATYPE = 'text/html; charset=utf-8' +_QUERYPARAM_SPLIT_RE = re.compile(r'(?=[?&])') - def simple_render_document(self) -> str: - _html_builder = _HtmlBuilder(self.response_tripledict, self.response_focus.single_iri(), self.iri_shorthand) - _html_str = etree_tostring(_html_builder.html_element, encoding='unicode', method='html') - return ''.join(( - '', # TODO: can etree put the doctype in? 
- _html_str, - )) +_PHI = (math.sqrt(5) + 1) / 2 + +_HTML_DOCTYPE = '' @dataclasses.dataclass -class _HtmlBuilder: - all_data: primitive_rdf.RdfTripleDictionary - focus_iri: str - iri_shorthand: primitive_rdf.IriShorthand - html_element: Element = dataclasses.field(init=False) - __current_data: primitive_rdf.RdfTripleDictionary = dataclasses.field(init=False) - __current_element: Element = dataclasses.field(init=False) +class RdfHtmlBrowseRenderer(BaseRenderer): + MEDIATYPE: ClassVar[str] = 'text/html; charset=utf-8' + __current_data: rdf.RdfTripleDictionary = dataclasses.field(init=False) __visiting_iris: set[str] = dataclasses.field(init=False) - __heading_depth: int = 0 + __hb: HtmlBuilder = dataclasses.field(init=False) + __last_hue_turn: float = dataclasses.field(default_factory=random.random) def __post_init__(self): # TODO: lang (according to request -- also translate) - self.html_element = self.__current_element = Element('html') - self.__current_data = self.all_data + self.__current_data = self.response_tripledict self.__visiting_iris = set() - with self.__nest('head'): - self.__leaf('link', attrs={ - 'rel': 'stylesheet', - 'href': staticfiles_storage.url('css/browse.css'), - }) + + @property + def is_data_blended(self) -> bool | None: + return self.response_gathering.gatherer_kwargs.get('blend_cards') + + # override BaseRenderer + def simple_render_document(self) -> str: + self.__hb = HtmlBuilder(Element('html')) + self.render_html_head() _body_attrs = { 'class': 'BrowseWrapper', - 'style': f'--random-turn: {random.random()}turn;', + 'style': self._hue_turn_css(), } - with self.__nest('body', attrs=_body_attrs): - self.__render_subj(self.focus_iri), - self.__render_mediatype_links() - # TODO:
with unvisited triples in self.data (unreachable from focus_iri) - - def __render_mediatype_links(self): - with self.__nest('nav', attrs={'class': 'VisibleNest Browse__card'}): - self.__leaf('header', text='alternate mediatypes') - with self.__nest('ul', attrs={'class': 'Browse__twopleset'}): - for _mediatype in shuffled((*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES)): - with self.__nest('li', attrs={'class': 'VisibleNest Browse__twople'}): - self.__mediatype_link(_mediatype) + with self.__hb.nest('body', attrs=_body_attrs): + self.render_nav() + self.render_main() + self.render_footer() + return '\n'.join(( + _HTML_DOCTYPE, + etree_tostring(self.__hb.root_element, encoding='unicode', method='html'), + )) + + def render_html_head(self): + with self.__hb.nest('head'): + self.__hb.leaf('link', attrs={ + 'rel': 'stylesheet', + 'href': staticfiles_storage.url('css/browse.css'), + }) + + def render_nav(self): + with self.__hb.nest('nav'): + self.__alternate_mediatypes_card() + if self.is_data_blended is not None: + self.__blender_toggle_card() + + def render_main(self): + with self.__hb.nest('main'): + for _iri in self.response_focus.iris: + self.__render_subj(_iri) + # TODO: show additional unvisited triples? + + def render_footer(self): + with self.__hb.nest('footer'): + ... 
+ + def __alternate_mediatypes_card(self): + with self.__nest_card('details'): + self.__hb.leaf('summary', text=_('alternate mediatypes')) + for _mediatype in shuffled((*STABLE_MEDIATYPES, *UNSTABLE_MEDIATYPES)): + with self.__hb.nest('span', attrs={'class': 'Browse__literal'}): + self.__mediatype_link(_mediatype) + + def __blender_toggle_card(self): + with self.__nest_card('details'): + if self.is_data_blended: + _header_text = _('card-blending ON') + _link_text = _('disable card-blending') + _link_blend: str | None = None # remove blendCards param (defaults false) + else: + _header_text = _('card-blending OFF') + _link_text = _('enable card-blending') + _link_blend = '1' # blendCards=1 + self.__hb.leaf('summary', text=_header_text) + self.__hb.leaf('a', text=_link_text, attrs={ + 'href': self._queryparam_href('blendCards', _link_blend), + }) def __mediatype_link(self, mediatype: str): - (_scheme, _netloc, _path, _query, _fragment) = urlsplit(self.focus_iri) - _qparams = QueryDict(_query, mutable=True) - _qparams['acceptMediatype'] = mediatype - _href = urlunsplit(( - _scheme, - _netloc, - _path, - _qparams.urlencode(), - _fragment, - )) - self.__leaf('a', text=mediatype, attrs={'href': _href}) + self.__hb.leaf('a', text=mediatype, attrs={ + 'href': self._queryparam_href('acceptMediatype', mediatype), + }) if mediatype in UNSTABLE_MEDIATYPES: - self.__leaf('aside', text='(unstable)') + self.__hb.leaf('aside', text=_('(unstable)')) if mediatype in STABLE_MEDIATYPES: - with self.__nest('aside') as _aside: - _aside.text = '(stable for ' - with self.__nest('a', attrs={'href': reverse('trovetrove:docs')}) as _link: - _link.text = 'documented use' - _link.tail = ')' + with self.__hb.nest('aside'): + with self.__hb.nest('a', attrs={'href': reverse('trove:docs')}) as _link: + _link.text = _('(stable for documented use)') - def __render_subj(self, subj_iri: str, start_collapsed=False): + def __render_subj(self, subj_iri: str, *, start_collapsed=None): _twopledict = 
self.__current_data.get(subj_iri, {}) with self.__visiting(subj_iri): - with self.__h_tag() as _h_tag: - with self.__nest( - 'details', - attrs={ - 'class': 'Browse__card', - **({} if start_collapsed else {'open': ''}), - }, - visible=True, - ): - with self.__nest('summary'): - _label = self.__label_for_iri(subj_iri) - with self.__nest(_h_tag, attrs={'class': 'Browse__heading'}): - with self.__nest_link(subj_iri): - self.__leaf('dfn', text=_label, attrs={'id': quote(subj_iri)}) - _compact_focus = self.iri_shorthand.compact_iri(subj_iri) - if _compact_focus != _label: - self.__leaf('code', text=_compact_focus) - if _compact_focus != subj_iri: - self.__leaf('code', text=subj_iri) - self.__twoples(_twopledict) - - def __twoples(self, twopledict: primitive_rdf.RdfTwopleDictionary): - with self.__nest('ul', {'class': 'Browse__twopleset'}): + with self.__nest_card('article'): + with self.__hb.nest('header'): + _compact = self.iri_shorthand.compact_iri(subj_iri) + _is_compactable = (_compact != subj_iri) + _should_link = (subj_iri not in self.response_focus.iris) + with self.__hb.nest_h_tag(attrs={'id': quote(subj_iri)}) as _h: + if _should_link: + with self.__nest_link(subj_iri) as _link: + if _is_compactable: + _link.text = _compact + else: + self.__split_iri_pre(subj_iri) + else: + if _is_compactable: + _h.text = _compact + else: + self.__split_iri_pre(subj_iri) + self.__iri_subheaders(subj_iri) + if _twopledict: + with self.__hb.nest('details') as _details: + _detail_depth = sum((_el.tag == 'details') for _el in self.__hb._nested_elements) + _should_open = ( + _detail_depth < 3 + if start_collapsed is None + else not start_collapsed + ) + if _should_open: + _details.set('open', '') + self.__hb.leaf('summary', text=_('more details...')) + self.__twoples(_twopledict) + + def __twoples(self, twopledict: rdf.RdfTwopleDictionary): + with self.__hb.nest('dl', {'class': 'Browse__twopleset'}): for _pred, _obj_set in shuffled(twopledict.items()): - with self.__nest('li', 
{'class': 'Browse__twople'}, visible=True): - self.__leaf_link(_pred) - with self.__nest('ul', {'class': 'Browse__objectset'}): - for _obj in shuffled(_obj_set): - with self.__nest('li', {'class': 'Browse__object'}, visible=True): - self.__obj(_obj) - - def __obj(self, obj: primitive_rdf.RdfObject): + with self.__hb.nest('dt', attrs={'class': 'Browse__predicate'}): + self.__compact_link(_pred) + for _text in self.__iri_thesaurus_labels(_pred): + self.__literal(_text) + with self.__hb.nest('dd'): + for _obj in shuffled(_obj_set): + self.__obj(_obj) + + def __obj(self, obj: rdf.RdfObject): if isinstance(obj, str): # iri # TODO: detect whether indexcard? - if obj in self.__current_data: - if obj in self.__visiting_iris: - self.__leaf_link(obj) # TODO: consider - else: - self.__render_subj(obj) + if (obj in self.__current_data) and (obj not in self.__visiting_iris): + self.__render_subj(obj) else: - self.__leaf_link(obj) + with self.__hb.nest('article', attrs={'class': 'Browse__object'}): + self.__iri_link_and_labels(obj) elif isinstance(obj, frozenset): # blanknode if (RDF.type, RDF.Seq) in obj: self.__sequence(obj) else: - self.__twoples(primitive_rdf.twopledict_from_twopleset(obj)) - elif isinstance(obj, primitive_rdf.Literal): - self.__literal(obj) + self.__blanknode(obj) + elif isinstance(obj, rdf.Literal): + self.__literal(obj, is_rdf_object=True) elif isinstance(obj, (float, int, datetime.date)): - self.__literal(primitive_rdf.literal(obj)) - elif isinstance(obj, primitive_rdf.QuotedGraph): + self.__literal(rdf.literal(obj), is_rdf_object=True) + elif isinstance(obj, rdf.QuotedGraph): self.__quoted_graph(obj) - def __literal(self, literal: primitive_rdf.Literal): - # TODO language tag, datatypes - _markdown_iri = primitive_rdf.iri_from_mediatype('text/markdown') + def __literal( + self, + literal: rdf.Literal | str, + *, + is_rdf_object: bool = False, + ): + _lit = (literal if isinstance(literal, rdf.Literal) else rdf.literal(literal)) + _markdown_iri = 
rdf.iri_from_mediatype('text/markdown') _is_markdown = any( _datatype.startswith(_markdown_iri) - for _datatype in literal.datatype_iris + for _datatype in _lit.datatype_iris ) + _element_classes = ['Browse__literal'] + if is_rdf_object: + _element_classes.append('Browse__object') # TODO: checksum_iri, literal_iri - with self.__nest('article', attrs={'class': 'Browse__literal'}): + with self.__hb.nest('article', attrs={'class': ' '.join(_element_classes)}): + for _datatype_iri in _lit.datatype_iris.difference(_IMPLICIT_DATATYPES): + self.__compact_link(_datatype_iri) if _is_markdown: # TODO: tests for safe_mode - _html = markdown2.markdown(literal.unicode_value, safe_mode='escape') - self.__current_element.append(etree_fromstring(f'{_html}')) + _html = markdown2.markdown(_lit.unicode_value, safe_mode='escape') + self.__hb._current_element.append(etree_fromstring(f'{_html}')) else: - self.__leaf('q', text=literal.unicode_value) - for _datatype_iri in literal.datatype_iris: - self.__leaf_link(_datatype_iri) + self.__hb.leaf('q', text=_lit) def __sequence(self, sequence_twoples: frozenset): - _obj_in_order = list(primitive_rdf.sequence_objects_in_order(sequence_twoples)) - with self.__nest('details', attrs={'open': ''}): - self.__leaf('summary', text=str(len(_obj_in_order))) - with self.__nest('ol'): # TODO: style? + _obj_in_order = list(rdf.sequence_objects_in_order(sequence_twoples)) + with self.__hb.nest('details', attrs={'open': '', 'class': 'Browse__blanknode Browse__object'}): + _text = _('sequence of %(count)s') % {'count': len(_obj_in_order)} + self.__hb.leaf('summary', text=_text) + with self.__hb.nest('ol'): # TODO: style? 
for _seq_obj in _obj_in_order: - with self.__nest('li', visible=True): + with self.__hb.nest('li'): # , visible=True): self.__obj(_seq_obj) - def __quoted_graph(self, quoted_graph: primitive_rdf.QuotedGraph): + def __quoted_graph(self, quoted_graph: rdf.QuotedGraph): with self.__quoted_data(quoted_graph.tripledict): - self.__render_subj(quoted_graph.focus_iri, start_collapsed=True) + self.__render_subj(quoted_graph.focus_iri) # , start_collapsed=True) - ### - # private html-building helpers + def __blanknode(self, blanknode: rdf.RdfTwopleDictionary | frozenset): + _twopledict = ( + blanknode + if isinstance(blanknode, dict) + else rdf.twopledict_from_twopleset(blanknode) + ) + with self.__hb.nest('details', attrs={ + 'open': '', + 'class': 'Browse__blanknode Browse__object', + 'style': self._hue_turn_css(), + }): + self.__hb.leaf('summary', text='(blank node)') + self.__twoples(_twopledict) + + def __split_iri_pre(self, iri: str): + self.__hb.leaf('pre', text='\n'.join(self.__iri_lines(iri))) @contextlib.contextmanager def __visiting(self, iri: str): @@ -205,18 +281,6 @@ def __visiting(self, iri: str): finally: self.__visiting_iris.remove(iri) - @contextlib.contextmanager - def __h_tag(self): - _outer_heading_depth = self.__heading_depth - if not _outer_heading_depth: - self.__heading_depth = 1 - elif _outer_heading_depth < 6: # h6 deepest - self.__heading_depth += 1 - try: - yield f'h{self.__heading_depth}' - finally: - self.__heading_depth = _outer_heading_depth - @contextlib.contextmanager def __quoted_data(self, quoted_data: dict): _outer_data = self.__current_data @@ -229,42 +293,102 @@ def __quoted_data(self, quoted_data: dict): self.__current_data = _outer_data self.__visiting_iris = _outer_visiting_iris - @contextlib.contextmanager - def __nest(self, tag_name, attrs=None, visible=False): - _attrs = {**attrs} if attrs else {} - if visible: - _attrs['class'] = ( - ' '.join((_attrs['class'], 'VisibleNest')) - if 'class' in _attrs - else 'VisibleNest' - ) - 
_parent_element = self.__current_element - self.__current_element = SubElement(_parent_element, tag_name, _attrs) - try: - yield self.__current_element - finally: - self.__current_element = _parent_element + def __iri_link_and_labels(self, iri: str): + self.__compact_link(iri) + for _text in self.__iri_thesaurus_labels(iri): + self.__literal(_text) + + def __nest_link(self, iri: str): + _href = ( + iri + if _is_local_url(iri) + else trove_browse_link(iri) + ) + return self.__hb.nest('a', attrs={'href': _href}) - def __leaf(self, tag_name, *, text=None, attrs=None): - _leaf_element = SubElement(self.__current_element, tag_name, attrs or {}) - if text is not None: - _leaf_element.text = text + def __compact_link(self, iri: str): + with self.__nest_link(iri) as _a: + _a.text = self.iri_shorthand.compact_iri(iri) + return _a - def __nest_link(self, iri: str, *, attrs=None): - return self.__nest('a', attrs={ - **(attrs or {}), - 'href': trove_browse_link(iri), - }) + def __nest_card(self, tag: str): + return self.__hb.nest( + tag, + attrs={ + 'class': 'Browse__card', + 'style': self._hue_turn_css(), + }, + ) + + def __iri_thesaurus_labels(self, iri: str): + # TODO: consider requested language + _labels: set[rdf.RdfObject] = set() + _suffuniq = get_sufficiently_unique_iri(iri) + _thesaurus_entry = combined_thesaurus__suffuniq().get(_suffuniq) + if _thesaurus_entry: + for _pred in _LINK_TEXT_PREDICATES: + _labels.update(_thesaurus_entry.get(_pred, ())) + _twoples = self.__current_data.get(iri) + if _twoples: + for _pred in _LINK_TEXT_PREDICATES: + _labels.update(_twoples.get(_pred, ())) + return shuffled(_labels) + + def _hue_turn_css(self): + _hue_turn = (self.__last_hue_turn + _PHI) % 1.0 + self.__last_hue_turn = _hue_turn + return f'--hue-turn: {_hue_turn}turn;' + + def _queryparam_href(self, param_name: str, param_value: str | None): + _base_url = self.response_focus.single_iri() + if not _is_local_url(_base_url): + _base_url = trove_browse_link(_base_url) + (_scheme, 
_netloc, _path, _query, _fragment) = urlsplit(_base_url) + _qparams = QueryDict(_query, mutable=True) + if param_value is None: + try: + del _qparams[param_name] + except KeyError: + pass + else: + _qparams[param_name] = param_value + return urlunsplit(( + _scheme, + _netloc, + _path, + _qparams.urlencode(), + _fragment, + )) + + def __iri_subheaders(self, iri: str) -> None: + _type_iris = self.__current_data.get(iri, {}).get(RDF.type, ()) + if _type_iris: + for _type_iri in _type_iris: + self.__compact_link(_type_iri) + _labels = self.__iri_thesaurus_labels(iri) + if _labels: + for _label in _labels: + self.__literal(_label) - def __leaf_link(self, iri: str, *, attrs=None): - with self.__nest_link(iri, attrs=attrs) as _link: - _link.text = self.iri_shorthand.compact_iri(iri) - - def __label_for_iri(self, iri: str): - # TODO: get actual label in requested language - _shorthand = self.iri_shorthand.compact_iri(iri) - return ( - get_sufficiently_unique_iri(iri) - if _shorthand == iri - else _shorthand + def __iri_lines(self, iri: str) -> Iterator[str]: + (_scheme, _netloc, _path, _query, _fragment) = urlsplit(iri) + yield ( + f'://{_netloc}{_path}' + if _netloc + else f'{_scheme}:{_path}' ) + if _query: + yield from filter(bool, _QUERYPARAM_SPLIT_RE.split(f'?{_query}')) + if _fragment: + yield f'#{_fragment}' + + +def _append_class(el: Element, element_class: str): + el.set( + 'class', + ' '.join(filter(None, (element_class, el.get('class')))), + ) + + +def _is_local_url(iri: str) -> bool: + return iri.startswith(settings.SHARE_WEB_URL) diff --git a/trove/render/jsonapi.py b/trove/render/jsonapi.py index 8e9fc2bcb..6337e7edc 100644 --- a/trove/render/jsonapi.py +++ b/trove/render/jsonapi.py @@ -24,11 +24,8 @@ RDF, TROVE, XSD, - NAMESPACES_SHORTHAND, -) -from trove.vocab.trove import ( - trove_indexcard_namespace, ) +from trove.vocab.trove import trove_indexcard_namespace from ._base import BaseRenderer @@ -80,6 +77,11 @@ class RdfJsonapiRenderer(BaseRenderer): 
repr=False, ) + # override BaseRenderer + @classmethod + def get_deriver_iri(cls, card_blending: bool): + return (None if card_blending else super().get_deriver_iri(card_blending)) + def simple_render_document(self) -> str: return json.dumps( self.render_dict(self.response_focus.single_iri()), @@ -128,11 +130,15 @@ def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode): return self._identifier_object_cache[iri_or_blanknode] except KeyError: if isinstance(iri_or_blanknode, str): - _type_iris = list(self.response_data.q(iri_or_blanknode, RDF.type)) _id_obj = { - 'id': self._resource_id_for_iri(iri_or_blanknode), - 'type': self._single_typename(_type_iris), + '@id': self.iri_shorthand.compact_iri(iri_or_blanknode), } + _type_iris = list(self.response_data.q(iri_or_blanknode, RDF.type)) + if _type_iris: + _id_obj = { + 'id': self._resource_id_for_iri(iri_or_blanknode), + 'type': self._single_typename(_type_iris), + } elif isinstance(iri_or_blanknode, frozenset): _type_iris = [ _obj @@ -150,7 +156,7 @@ def render_identifier_object(self, iri_or_blanknode: _IriOrBlanknode): def _single_typename(self, type_iris: list[str]): if not type_iris: - raise trove_exceptions.MissingRdfType + return '' if len(type_iris) == 1: return self._membername_for_iri(type_iris[0]) # choose one predictably, preferring osfmap and trove @@ -178,6 +184,10 @@ def _resource_id_for_iri(self, iri: str): for _iri_namespace in self._id_namespace_set: if iri in _iri_namespace: return primitive_rdf.iri_minus_namespace(iri, namespace=_iri_namespace) + # check for a shorthand + _compact = self.iri_shorthand.compact_iri(iri) + if _compact != iri: + return _compact # as fallback, encode the iri into a valid jsonapi member name return base64.urlsafe_b64encode(iri.encode()).decode() @@ -299,10 +309,7 @@ def _render_attribute_datum(self, rdfobject: primitive_rdf.RdfObject) -> dict | return int(rdfobject.unicode_value) return rdfobject.unicode_value # TODO: decide how to represent language elif 
isinstance(rdfobject, str): - try: # maybe it's a jsonapi resource - return self.render_identifier_object(rdfobject) - except Exception: - return NAMESPACES_SHORTHAND.compact_iri(rdfobject) + return self.render_identifier_object(rdfobject) elif isinstance(rdfobject, (float, int)): return rdfobject elif isinstance(rdfobject, datetime.date): diff --git a/trove/render/simple_csv.py b/trove/render/simple_csv.py index 0642ed179..c3dd3c243 100644 --- a/trove/render/simple_csv.py +++ b/trove/render/simple_csv.py @@ -1,4 +1,8 @@ from __future__ import annotations +from collections.abc import ( + Iterable, + Iterator, +) import csv import functools import itertools @@ -6,19 +10,20 @@ import typing from trove.trovesearch.search_params import ( - Propertypath, - BaseTroveParams, CardsearchParams, ValuesearchParams, ) +from trove.util.propertypath import Propertypath from trove.vocab import mediatypes from trove.vocab import osfmap from trove.vocab.namespaces import TROVE from ._simple_trovesearch import SimpleTrovesearchRenderer from ._rendering import StreamableRendering +if typing.TYPE_CHECKING: + from trove.util.trove_params import BasicTroveParams -Jsonpath = typing.Iterable[str] # path of json keys +Jsonpath = Iterable[str] # path of json keys _MULTIVALUE_DELIMITER = ' ; ' # possible improvement: smarter in-value delimiting? 
_VALUE_KEY_PREFERENCE = ('@value', '@id', 'name', 'prefLabel', 'label') @@ -33,7 +38,7 @@ class TrovesearchSimpleCsvRenderer(SimpleTrovesearchRenderer): def unicard_rendering(self, card_iri: str, osfmap_json: dict): self.multicard_rendering(card_pages=iter([{card_iri: osfmap_json}])) - def multicard_rendering(self, card_pages: typing.Iterator[dict[str, dict]]): + def multicard_rendering(self, card_pages: Iterator[dict[str, dict]]): _doc = TabularDoc( card_pages, trove_params=getattr(self.response_focus, 'search_params', None), @@ -44,7 +49,7 @@ def multicard_rendering(self, card_pages: typing.Iterator[dict[str, dict]]): ) -def csv_stream(csv_dialect, header: list, rows: typing.Iterator[list]) -> typing.Iterator[str]: +def csv_stream(csv_dialect, header: list, rows: Iterator[list]) -> Iterator[str]: _writer = csv.writer(_Echo(), dialect=csv_dialect) yield _writer.writerow(header) for _row in rows: @@ -53,8 +58,8 @@ def csv_stream(csv_dialect, header: list, rows: typing.Iterator[list]) -> typing @dataclasses.dataclass class TabularDoc: - card_pages: typing.Iterator[dict[str, dict]] - trove_params: BaseTroveParams | None = None + card_pages: Iterator[dict[str, dict]] + trove_params: BasicTroveParams | None = None _started: bool = False @functools.cached_property @@ -69,8 +74,8 @@ def column_jsonpaths(self) -> tuple[Jsonpath, ...]: def first_page(self) -> dict[str, dict]: return next(self.card_pages, {}) - def _column_paths(self) -> typing.Iterator[Propertypath]: - _pathlists: list[typing.Iterable[Propertypath]] = [] + def _column_paths(self) -> Iterator[Propertypath]: + _pathlists: list[Iterable[Propertypath]] = [] if self.trove_params is not None: # hacks if isinstance(self.trove_params, ValuesearchParams): _expected_card_types = set(self.trove_params.valuesearch_type_iris()) @@ -99,7 +104,7 @@ def _iter_card_pages(self): def header(self) -> list[str]: return ['.'.join(_path) for _path in self.column_jsonpaths] - def rows(self) -> typing.Iterator[list[str]]: + def 
rows(self) -> Iterator[list[str]]: for _page in self._iter_card_pages(): for _card_iri, _osfmap_json in _page.items(): yield self._row_values(_osfmap_json) @@ -121,8 +126,8 @@ def _row_field_value(self, osfmap_json: dict, field_path: Jsonpath) -> str: return _MULTIVALUE_DELIMITER.join(map(str, _rendered_values)) -def _osfmap_jsonpath(iri_path: typing.Iterable[str]) -> Jsonpath: - _shorthand = osfmap.osfmap_shorthand() +def _osfmap_jsonpath(iri_path: Iterable[str]) -> Jsonpath: + _shorthand = osfmap.osfmap_json_shorthand() return tuple( _shorthand.compact_iri(_pathstep) for _pathstep in iri_path @@ -138,7 +143,7 @@ def _has_value(osfmap_json: dict, path: Jsonpath) -> bool: return True -def _iter_values(osfmap_json: dict, path: Jsonpath) -> typing.Iterator: +def _iter_values(osfmap_json: dict, path: Jsonpath) -> Iterator: assert path (_step, *_rest) = path _val = osfmap_json.get(_step) diff --git a/trove/render/simple_json.py b/trove/render/simple_json.py index 10f896fff..480ef1c7f 100644 --- a/trove/render/simple_json.py +++ b/trove/render/simple_json.py @@ -55,7 +55,7 @@ def _stream_json(self, card_pages: typing.Iterator[dict[str, dict]]): ) def _render_card_content(self, card_iri: str, osfmap_json: dict): - self._add_twople(osfmap_json, 'foaf:primaryTopicOf', card_iri) + self._add_twople(osfmap_json, 'foaf:isPrimaryTopicOf', card_iri) return osfmap_json def _render_meta(self): diff --git a/trove/render/turtle.py b/trove/render/turtle.py index 2b682178c..e8239b34f 100644 --- a/trove/render/turtle.py +++ b/trove/render/turtle.py @@ -1,7 +1,6 @@ from primitive_metadata import primitive_rdf as rdf from trove.vocab.namespaces import TROVE -from trove.vocab.trove import trove_shorthand from ._base import BaseRenderer @@ -14,5 +13,5 @@ def simple_render_document(self) -> str: return rdf.turtle_from_tripledict( self.response_data.tripledict, focus=self.response_focus.single_iri(), - shorthand=trove_shorthand, + shorthand=self.iri_shorthand, ) diff --git 
a/trove/static/css/browse.css b/trove/static/css/browse.css index 163364611..643bcfcf2 100644 --- a/trove/static/css/browse.css +++ b/trove/static/css/browse.css @@ -1,7 +1,16 @@ -.VisibleNest { - padding: 0.191rem 0.382rem; - filter: hue-rotate(0.192turn); - backdrop-filter: hue-rotate(0.192turn); +:root { + --phi: 1.618; + /* rotating colorspace (using `lch` with luminance and chroma held locally constant) */ + --hue-turn: 0; /* initial */ + --bg-luminance: 83%; + --bg-chroma: 19%; + --bg-color-initial: lch(83% 19% 1.618turn); + /* gutter spaces (gaps, paddings, margins...) */ + --gutter-1: 1.618rem; + --gutter-2: 0.618rem; + --gutter-3: 0.309rem; + --gutter-4: 0.155rem; + --gutter-5: 0.077rem; } .BrowseWrapper { @@ -9,111 +18,132 @@ flex-direction: row; align-items: flex-start; flex-wrap: wrap; - gap: 0.618rem; + gap: var(--gutter-1); margin: 0; - padding: 0; + padding: 1rem; min-height: 100vh; - background-color: #fedbae; - backdrop-filter: hue-rotate(var(--random-turn)); + background-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); } .BrowseWrapper dfn { font-weight: bold; } +.BrowseWrapper pre { + margin: 0; +} + .Browse__card { display: flex; flex-direction: column; - - /*max-width: 31rem;*/ - border: solid 0.382rem rgba(0,0,0,0.191); -} - -details.Browse__card > summary::before { + padding: var(--gutter-2) var(--gutter-3); + background-color: lch(var(--bg-luminance) var(--bg-chroma) var(--hue-turn)); + border-color: lch(59% var(--bg-chroma) var(--hue-turn)); + border-style: solid; + border-inline-start-width: var(--gutter-3); + border-block-start-width: var(--gutter-4); + border-inline-end-width: 0; + border-block-end-width: 0; + /* + border-start-end-radius: 1rem; + border-end-start-radius: 1rem; + */ +} + +.BrowseWrapper details > summary::before { content: 'β€½'; display: inline-block; transition-property: rotate; - transition-duration: 1s; -} - -details.Browse__card[open] > summary::before { - rotate: var(--random-turn); + 
transition-duration: 0.618s; + margin-right: var(--gutter-2); } -.BrowseWrapper > .Browse__card { - margin: 1em; +.BrowseWrapper details[open] > summary::before { + rotate: var(--hue-turn); } .Browse__card > header { display: flex; - align-items: center; - flex-wrap: wrap; - gap: 0.618rem; - padding: 0.618rem; + flex-direction: row; + gap: var(--gutter-2); + align-items: baseline; + border-bottom: solid 1px rgba(0,0,0,0.382); + margin-bottom: var(--gutter-3); } -.Browse__heading { +.Browse__card > header > :first-child { margin: 0; } .Browse__card > footer { - padding: 0.618rem; + padding: var(--gutter-2); } -.Browse__twopleset { - display: flex; - flex-direction: column; - +dl.Browse__twopleset { + display: grid; + grid-template-columns: + [twople-pred] auto + [twople-obj] 1fr + ; + grid-auto-flow: row; + row-gap: var(--gutter-2); margin: 0; padding: 0; } -.Browse__twople { +dl.Browse__twopleset > dt { + grid-column: twople-pred; display: flex; - flex-direction: row; - align-items: flex-start; - gap: 0.382rem; - margin: 0; - border: solid 1px rgba(0,0,0,0.382); + flex-direction: column; } -.Browse__twople:not(:first-child) { - border-top: 0; +dl.Browse__twopleset > dd { + grid-column: twople-obj; + margin: 0; + display: flex; + flex-direction: column; + gap: var(--gutter-5); } -.Browse__objectset { +.Browse__twople { display: flex; flex-direction: row; - flex-wrap: wrap; align-items: flex-start; - + gap: var(--gutter-3); margin: 0; - padding: 0; - gap: 0.382rem; } -.Browse__object { - display: flex; - flex-direction: row; - margin: 0; - /*border: 1px dotted #000;*/ - border: dotted 1px; - gap: 0.382rem; +.Browse__blanknode { + padding: var(--gutter-4); + border-color: rgba(0,0,0,0.382); + border-style: solid; + border-inline-start-width: var(--gutter-3); + border-block-start-width: var(--gutter-4); + border-inline-end-width: 0; + border-block-end-width: 0; } .Browse__literal { display: flex; flex-direction: row; - flex-wrap: wrap; - gap: 0.382rem; + gap: 
var(--gutter-3); + padding: var(--gutter-4); } -/* -.Browse :focus-within { - backdrop-filter: hue-rotate(var(--hue-rotate-step)); +.Browse__literal > q { + flex-basis: 100%; + font-style: italic; +} +.Browse__literal > q > p { + margin: 0; +} + +.Browse__predicate { + background-color: lch(from var(--bg-color-initial) 89% c var(--hue-turn)); + padding: var(--gutter-4); } -.Browse :focus { - border: 5px dotted #e28; +.Browse__object { + background-color: lch(from var(--bg-color-initial) 93% c var(--hue-turn)); + padding: var(--gutter-4); } -*/ diff --git a/trove/templates/trove/openapi-redoc.html b/trove/templates/trove/openapi-redoc.html index c0a0da18b..8841d68f9 100644 --- a/trove/templates/trove/openapi-redoc.html +++ b/trove/templates/trove/openapi-redoc.html @@ -6,7 +6,7 @@ - + diff --git a/trove/trovebrowse_gathering.py b/trove/trovebrowse_gathering.py new file mode 100644 index 000000000..76903d158 --- /dev/null +++ b/trove/trovebrowse_gathering.py @@ -0,0 +1,63 @@ +from primitive_metadata import gather +from primitive_metadata import primitive_rdf as rdf + +from trove import models as trove_db +from trove.util.iris import get_sufficiently_unique_iri +from trove.vocab import namespaces as ns +from trove.vocab import static_vocab +from trove.vocab.trove import ( + TROVE_API_THESAURUS, +) + + +TROVEBROWSE_NORMS = gather.GatheringNorms.new( + namestory=( + rdf.literal('trovebrowse', language='en'), + rdf.literal('browse a trove of IRI-linked metadata', language='en'), + ), + focustype_iris={}, + param_iris={ns.TROVE.blendCards}, + thesaurus=TROVE_API_THESAURUS, + +) + + +trovebrowse = gather.GatheringOrganizer( + namestory=( + rdf.literal('trovebrowse organizer', language='en'), + ), + norms=TROVEBROWSE_NORMS, + gatherer_params={'blend_cards': ns.TROVE.blendCards}, +) + + +@trovebrowse.gatherer(ns.FOAF.isPrimaryTopicOf) +def gather_cards_focused_on(focus, *, blend_cards: bool): + _identifier_qs = 
trove_db.ResourceIdentifier.objects.queryset_for_iris(focus.iris) + _indexcard_qs = trove_db.Indexcard.objects.filter(focus_identifier_set__in=_identifier_qs) + if blend_cards: + for _latest_rdf in trove_db.LatestIndexcardRdf.objects.filter(indexcard__in=_indexcard_qs): + yield from rdf.iter_tripleset(_latest_rdf.as_rdf_tripledict()) + else: + for _indexcard in _indexcard_qs: + _card_iri = _indexcard.get_iri() + yield (ns.FOAF.isPrimaryTopicOf, _card_iri) + yield (_card_iri, ns.RDF.type, ns.TROVE.Indexcard) + + +@trovebrowse.gatherer(ns.TROVE.thesaurusEntry) +def gather_thesaurus_entry(focus, *, blend_cards: bool): + _thesaurus = static_vocab.combined_thesaurus__suffuniq() + for _iri in focus.iris: + _suffuniq_iri = get_sufficiently_unique_iri(_iri) + _thesaurus_entry = _thesaurus.get(_suffuniq_iri, None) + if _thesaurus_entry: + if blend_cards: + yield from rdf.iter_twoples(_thesaurus_entry) + else: + yield (ns.TROVE.thesaurusEntry, rdf.QuotedGraph({_iri: _thesaurus_entry}, focus_iri=_iri)) + + +@trovebrowse.gatherer(ns.TROVE.usedAtPath) +def gather_paths_used_at(focus, **kwargs): + yield from () # TODO via elasticsearch aggregation diff --git a/trove/trovesearch/search_handle.py b/trove/trovesearch/search_handle.py index 90f44265d..01dbffd84 100644 --- a/trove/trovesearch/search_handle.py +++ b/trove/trovesearch/search_handle.py @@ -9,9 +9,9 @@ ReproduciblyRandomSampleCursor, ) from trove.trovesearch.search_params import ( - BaseTroveParams, CardsearchParams, ) +from trove.util.trove_params import BasicTroveParams from trove.vocab.namespaces import TROVE from trove.vocab.trove import trove_indexcard_namespace @@ -19,8 +19,8 @@ @dataclasses.dataclass class BasicSearchHandle: cursor: PageCursor - search_params: BaseTroveParams - handler: typing.Callable[[BaseTroveParams], typing.Self] | None = None + search_params: BasicTroveParams + handler: typing.Callable[[BasicTroveParams], typing.Self] | None = None @property def total_result_count(self) -> 
primitive_rdf.Literal: @@ -134,7 +134,7 @@ def __post_init__(self): ### # types -TrovesearchHandler = typing.Callable[[BaseTroveParams], BasicSearchHandle] +TrovesearchHandler = typing.Callable[[BasicTroveParams], BasicSearchHandle] ### diff --git a/trove/trovesearch/search_params.py b/trove/trovesearch/search_params.py index d64eaa9d5..b8bbf34a9 100644 --- a/trove/trovesearch/search_params.py +++ b/trove/trovesearch/search_params.py @@ -3,14 +3,11 @@ import dataclasses import enum import functools -import itertools import logging import types import typing -import urllib from django.http import QueryDict -from primitive_metadata import primitive_rdf from trove import exceptions as trove_exceptions from trove.trovesearch.page_cursor import ( @@ -18,30 +15,27 @@ PageCursor, ) from trove.util.frozen import freeze +from trove.util.propertypath import ( + ONE_GLOB_PROPERTYPATH, + PropertypathSet, + Propertypath, + is_globpath, +) +from trove.util.trove_params import BasicTroveParams from trove.util.queryparams import ( QueryparamDict, QueryparamName, split_queryparam_value, join_queryparam_value, - queryparams_from_querystring, -) -from trove.vocab.osfmap import ( - osfmap_shorthand, - is_date_property, - suggested_property_paths, - OSFMAP_THESAURUS, + get_single_value, ) -from trove.vocab.trove import trove_shorthand -from trove.vocab.namespaces import RDF, TROVE, OWL, NAMESPACES_SHORTHAND, FOAF, DCTERMS +from trove.vocab import osfmap +from trove.vocab.trove import trove_json_shorthand +from trove.vocab.namespaces import RDF, TROVE, OWL, FOAF, DCTERMS logger = logging.getLogger(__name__) -### -# type aliases -Propertypath = tuple[str, ...] -PropertypathSet = frozenset[Propertypath] - ### # constants for use in query param parsing @@ -53,12 +47,6 @@ # optional prefix for "sort" values DESCENDING_SORT_PREFIX = '-' -# between each step in a property path "foo.bar.baz" -PROPERTYPATH_DELIMITER = '.' 
- -# special path-step that matches any property -GLOB_PATHSTEP = '*' -ONE_GLOB_PROPERTYPATH: Propertypath = (GLOB_PATHSTEP,) DEFAULT_PROPERTYPATH_SET: PropertypathSet = frozenset([ONE_GLOB_PROPERTYPATH]) DEFAULT_INCLUDES_BY_TYPE: collections.abc.Mapping[str, frozenset[Propertypath]] = freeze({ @@ -106,7 +94,7 @@ class ValueType(enum.Enum): @classmethod def from_shortname(cls, shortname): - _iri = trove_shorthand().expand_iri(shortname) + _iri = trove_json_shorthand().expand_iri(shortname) return cls(_iri) @classmethod @@ -115,7 +103,7 @@ def shortnames(cls): yield _value_type.to_shortname() def to_shortname(self) -> str: - return trove_shorthand().compact_iri(self.value) + return trove_json_shorthand().compact_iri(self.value) ### @@ -123,107 +111,27 @@ def to_shortname(self) -> str: @dataclasses.dataclass(frozen=True) -class BaseTroveParams: +class TrovesearchParams(BasicTroveParams): static_focus_type: typing.ClassVar[str] # expected on subclasses - iri_shorthand: primitive_rdf.IriShorthand = dataclasses.field(repr=False) - accept_mediatype: str | None - included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) - attrpaths_by_type: collections.abc.Mapping[str, PropertypathSet] = dataclasses.field(repr=False, compare=False) - - @classmethod - def from_querystring(cls, querystring: str) -> typing.Self: - return cls.from_queryparams(queryparams_from_querystring(querystring)) - - @classmethod - def from_queryparams(cls, queryparams: QueryparamDict) -> typing.Self: - return cls(**cls.parse_queryparams(queryparams)) - - @classmethod - def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: - # subclasses should override and add their fields to super().parse_queryparams(queryparams) - return { - 'iri_shorthand': cls._gather_shorthand(queryparams), - 'included_relations': cls._gather_include(queryparams), - 'attrpaths_by_type': cls._gather_attrpaths(queryparams), - 'accept_mediatype': _get_single_value(queryparams, 
QueryparamName('acceptMediatype')), - } - - def to_querystring(self) -> str: - return self.to_querydict().urlencode() - - def to_querydict(self) -> QueryDict: - # subclasses should override and add their fields to super().to_querydict() - _querydict = QueryDict(mutable=True) - if self.accept_mediatype: - _querydict['acceptMediatype'] = self.accept_mediatype - # TODO: iriShorthand, include, fields[...] - return _querydict - @classmethod - def _gather_shorthand(cls, queryparams: QueryparamDict): - _prefixmap = {} - for _qp_name, _iri in queryparams.get('iriShorthand', []): - try: - (_shortname,) = _qp_name.bracketed_names - except ValueError: - raise trove_exceptions.InvalidQueryParamName(_qp_name) - else: - _prefixmap[_shortname] = _iri - return NAMESPACES_SHORTHAND.with_update(_prefixmap) + def _default_shorthand(cls): # NOTE: osfmap special + return osfmap.osfmap_json_shorthand() @classmethod - def _gather_include(cls, queryparams: QueryparamDict) -> PropertypathSet: - _include_params = queryparams.get('include', []) - if _include_params: - return frozenset(itertools.chain.from_iterable( - _parse_propertypath_set(_include_value) - for _, _include_value in _include_params - )) + def _default_include(cls): return DEFAULT_INCLUDES_BY_TYPE.get(cls.static_focus_type, frozenset()) @classmethod - def _gather_attrpaths(cls, queryparams: QueryparamDict) -> collections.abc.Mapping[ - str, - tuple[Propertypath, ...], - ]: - _attrpaths: collections.ChainMap[str, tuple[Propertypath, ...]] = collections.ChainMap( - DEFAULT_FIELDS_BY_TYPE, # type: ignore[arg-type] - ) - _fields_params = queryparams.get('fields', []) - if _fields_params: - _requested: dict[str, list[Propertypath]] = collections.defaultdict(list) - for _param_name, _param_value in _fields_params: - try: - (_typenames,) = filter(bool, _param_name.bracketed_names) - except (IndexError, ValueError): - raise trove_exceptions.InvalidQueryParamName( - f'expected "fields[TYPE]" (with exactly one non-empty bracketed 
segment)' - f' (got "{_param_name}")' - ) - else: - for _type in split_queryparam_value(_typenames): - _type_iri = osfmap_shorthand().expand_iri(_type) - _requested[_type_iri].extend(_parse_propertypaths(_param_value)) - _attrpaths = _attrpaths.new_child(freeze(_requested)) - return _attrpaths + def _default_attrpaths(cls) -> collections.abc.Mapping[str, tuple[Propertypath, ...]]: + return DEFAULT_FIELDS_BY_TYPE @dataclasses.dataclass(frozen=True) -class Textsegment: +class SearchText: text: str - is_fuzzy: bool = True - is_negated: bool = False - is_openended: bool = False propertypath_set: PropertypathSet = DEFAULT_PROPERTYPATH_SET - def __post_init__(self): - if self.is_negated and self.is_fuzzy: - raise trove_exceptions.InvalidSearchText(self.text, "search cannot be both negated and fuzzy") - - def words(self): - return self.text.split() - @classmethod def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str): return frozenset(cls.iter_from_queryparam_family(queryparams, queryparam_family)) @@ -231,12 +139,13 @@ def from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: @classmethod def iter_from_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: str): for (_param_name, _param_value) in queryparams.get(queryparam_family, ()): - yield from cls.iter_from_searchtext_param(_param_name, _param_value) + if _param_value: + yield cls.from_searchtext_param_or_none(_param_name, _param_value) @classmethod - def iter_from_searchtext_param(cls, param_name: QueryparamName, param_value: str): + def from_searchtext_param_or_none(cls, param_name: QueryparamName, param_value: str) -> SearchText | None: _propertypath_set = ( - _parse_propertypath_set(param_name.bracketed_names[0]) + frozenset(osfmap.parse_osfmap_propertypath_set(param_name.bracketed_names[0], allow_globs=True)) if param_name.bracketed_names else None ) @@ -246,116 +155,22 @@ def iter_from_searchtext_param(cls, param_name: QueryparamName, 
param_value: str str(param_name), 'may not use glob-paths longer than "*" with search-text parameters', ) - - for _textsegment in cls.iter_from_text(param_value): - if _propertypath_set: - yield dataclasses.replace(_textsegment, propertypath_set=_propertypath_set) - else: - yield _textsegment - - @classmethod - def iter_from_text(cls, text: str) -> typing.Iterable['Textsegment']: - '''parse search text into words and quoted phrases - ''' - _in_quotes = False - _last_quote_prefix = None - _text_remaining = text - while _text_remaining: - ( # split on the next " - _text_chunk, - _quote_mark, - _text_remaining, - ) = _text_remaining.partition(DOUBLE_QUOTATION_MARK) - _text_chunk = _text_chunk.strip() - if _text_chunk: - _is_openended = not (_quote_mark or _text_remaining) - if _in_quotes: - yield cls( - text=_text_chunk, - is_fuzzy=False, - is_negated=(_last_quote_prefix == NEGATE_WORD_OR_PHRASE), - is_openended=_is_openended, - ) - else: - yield from cls._from_fuzzy_text( - _text_chunk, - is_openended=_is_openended, - ) - if _quote_mark: - if _in_quotes: # end quote - _in_quotes = False - _last_quote_prefix = None - else: # begin quote - _in_quotes = True - _last_quote_prefix = _text_chunk[-1:] - - @classmethod - def _from_fuzzy_text(cls, text_chunk: str, is_openended: bool): - if text_chunk == '*': - return # special case for COS employees used to the old search page - _all_wordgroups = ( - (_each_word_negated, list(_words)) - for (_each_word_negated, _words) in itertools.groupby( - text_chunk.split(), - key=lambda word: word.startswith(NEGATE_WORD_OR_PHRASE), - ) - ) - (*_wordgroups, (_lastgroup_negated, _lastgroup_words)) = _all_wordgroups - for _each_word_negated, _words in _wordgroups: - yield from cls._from_fuzzy_wordgroup( - _each_word_negated, - _words, - is_openended=False, - ) - yield from cls._from_fuzzy_wordgroup( - _lastgroup_negated, - _lastgroup_words, - is_openended=is_openended, - ) - - @classmethod - def _from_fuzzy_wordgroup(cls, each_word_negated: 
bool, words: typing.Iterable[str], *, is_openended=False): - if each_word_negated: - for _word in words: - _word_without_prefix = _word[len(NEGATE_WORD_OR_PHRASE):] - if _word_without_prefix: - yield cls( - text=_word_without_prefix, - is_fuzzy=False, - is_negated=True, - is_openended=False, - ) - else: # nothing negated; keep the phrase in one fuzzy segment - yield cls( - text=' '.join(words), - is_fuzzy=True, - is_negated=False, - is_openended=is_openended, - ) + _searchtext = cls(text=param_value) + if _propertypath_set: + _searchtext = dataclasses.replace(_searchtext, propertypath_set=_propertypath_set) + return _searchtext @classmethod - def queryparams_from_textsegments(self, queryparam_family: str, textsegments): + def queryparams_from_searchtext(self, queryparam_family: str, cardsearch_searchtext): _by_propertypath_set = collections.defaultdict(set) - for _textsegment in textsegments: - _by_propertypath_set[_textsegment.propertypath_set].add(_textsegment) + for searchtext in cardsearch_searchtext: + _by_propertypath_set[searchtext.propertypath_set].add(searchtext) for _propertypath_set, _combinable_segments in _by_propertypath_set.items(): _qp_name = QueryparamName( queryparam_family, - (propertypath_set_key(_propertypath_set),), - ) - _qp_value = ' '.join( - _textsegment.as_searchtext() - for _textsegment in _combinable_segments + (osfmap.osfmap_propertypath_set_key(_propertypath_set),), ) - yield str(_qp_name), _qp_value - - def as_searchtext(self) -> str: - _text = self.text - if not self.is_fuzzy: - _text = f'"{_text}"' - if self.is_negated: - _text = f'-{_text}' - return _text + yield str(_qp_name), _combinable_segments @dataclasses.dataclass(frozen=True) @@ -372,11 +187,11 @@ class FilterOperator(enum.Enum): @classmethod def from_shortname(cls, shortname): - _iri = trove_shorthand().expand_iri(shortname) + _iri = trove_json_shorthand().expand_iri(shortname) return cls(_iri) def to_shortname(self) -> str: - return 
trove_shorthand().compact_iri(self.value) + return trove_json_shorthand().compact_iri(self.value) def is_date_operator(self): return self in (self.BEFORE, self.AFTER, self.AT_DATE) @@ -421,9 +236,9 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str): str(param_name), f'unknown filter operator "{_operator_value}"', ) - _propertypath_set = _parse_propertypath_set(_serialized_path_set) + _propertypath_set = frozenset(osfmap.parse_osfmap_propertypath_set(_serialized_path_set)) _is_date_filter = all( - is_date_property(_path[-1]) + osfmap.is_date_property(_path[-1]) for _path in _propertypath_set ) if _operator is None: # default operator @@ -443,7 +258,7 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str): if _is_date_filter: _value_list.append(_value) # TODO: vali-date else: - _value_list.append(osfmap_shorthand().expand_iri(_value)) + _value_list.append(osfmap.osfmap_json_shorthand().expand_iri(_value)) return cls( value_set=frozenset(_value_list), operator=_operator, @@ -468,11 +283,11 @@ def is_type_filter(self) -> bool: def as_queryparam(self, queryparam_family: str): _qp_name = QueryparamName(queryparam_family, ( - propertypath_set_key(self.propertypath_set), + osfmap.osfmap_propertypath_set_key(self.propertypath_set), self.operator.to_shortname(), )) _qp_value = join_queryparam_value( - osfmap_shorthand().compact_iri(_value) + osfmap.osfmap_json_shorthand().compact_iri(_value) for _value in self.value_set ) return str(_qp_name), _qp_value @@ -516,7 +331,7 @@ def _from_sort_queryparam( )) _descending = param_value.startswith(DESCENDING_SORT_PREFIX) _rawpath = param_value.lstrip(DESCENDING_SORT_PREFIX) - _path = _parse_propertypath(_rawpath, allow_globs=False) + _path = osfmap.parse_osfmap_propertypath(_rawpath) return cls( value_type=_value_type, propertypath=_path, @@ -539,19 +354,19 @@ def as_queryparam(self) -> tuple[str, str]: if (self.value_type == ValueType.DATE) else f'sort[{self.value_type.to_shortname()}]' 
) - _pathkey = propertypath_key(self.propertypath) + _pathkey = osfmap.osfmap_propertypath_key(self.propertypath) _value = (f'-{_pathkey}' if self.descending else _pathkey) return (_name, _value) @dataclasses.dataclass(frozen=True) -class IndexcardParams(BaseTroveParams): +class IndexcardParams(TrovesearchParams): static_focus_type = TROVE.Indexcard @dataclasses.dataclass(frozen=True) -class CardsearchParams(BaseTroveParams): - cardsearch_textsegment_set: frozenset[Textsegment] +class CardsearchParams(TrovesearchParams): + cardsearch_searchtext: frozenset[SearchText] cardsearch_filter_set: frozenset[SearchFilter] index_strategy_name: str | None sort_list: tuple[SortParam, ...] @@ -564,9 +379,9 @@ def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: _filter_set = SearchFilter.from_queryparam_family(queryparams, 'cardSearchFilter') return { **super().parse_queryparams(queryparams), - 'cardsearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'cardSearchText'), + 'cardsearch_searchtext': SearchText.from_queryparam_family(queryparams, 'cardSearchText'), 'cardsearch_filter_set': _filter_set, - 'index_strategy_name': _get_single_value(queryparams, QueryparamName('indexStrategy')), + 'index_strategy_name': get_single_value(queryparams, 'indexStrategy'), 'sort_list': SortParam.from_sort_queryparams(queryparams), 'page_cursor': _get_page_cursor(queryparams), } @@ -587,8 +402,8 @@ def cardsearch_type_iris(self): @functools.cached_property def cardsearch_text_paths(self) -> PropertypathSet: return frozenset().union(*( - _textsegment.propertypath_set - for _textsegment in self.cardsearch_textsegment_set + searchtext.propertypath_set + for searchtext in self.cardsearch_searchtext )) @functools.cached_property @@ -601,7 +416,7 @@ def cardsearch_text_glob_depths(self) -> frozenset[int]: def to_querydict(self) -> QueryDict: _querydict = super().to_querydict() - for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('cardSearchText', 
self.cardsearch_textsegment_set): + for _qp_name, _qp_value in SearchText.queryparams_from_searchtext('cardSearchText', self.cardsearch_searchtext): _querydict[_qp_name] = _qp_value for _sort in self.sort_list: _qp_name, _qp_value = _sort.as_queryparam() @@ -623,7 +438,7 @@ class ValuesearchParams(CardsearchParams): # includes fields from CardsearchParams, because a # valuesearch is always in context of a cardsearch valuesearch_propertypath: Propertypath - valuesearch_textsegment_set: frozenset[Textsegment] + valuesearch_searchtext: frozenset[SearchText] valuesearch_filter_set: frozenset[SearchFilter] static_focus_type = TROVE.Valuesearch @@ -631,20 +446,20 @@ class ValuesearchParams(CardsearchParams): # override CardsearchParams @classmethod def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: - _raw_propertypath = _get_single_value(queryparams, QueryparamName('valueSearchPropertyPath')) + _raw_propertypath = get_single_value(queryparams, 'valueSearchPropertyPath') if not _raw_propertypath: raise trove_exceptions.MissingRequiredQueryParam('valueSearchPropertyPath') return { **super().parse_queryparams(queryparams), - 'valuesearch_propertypath': _parse_propertypath(_raw_propertypath, allow_globs=False), - 'valuesearch_textsegment_set': Textsegment.from_queryparam_family(queryparams, 'valueSearchText'), + 'valuesearch_propertypath': osfmap.parse_osfmap_propertypath(_raw_propertypath), + 'valuesearch_searchtext': SearchText.from_queryparam_family(queryparams, 'valueSearchText'), 'valuesearch_filter_set': SearchFilter.from_queryparam_family(queryparams, 'valueSearchFilter'), } def __post_init__(self): - if is_date_property(self.valuesearch_propertypath[-1]): + if osfmap.is_date_property(self.valuesearch_propertypath[-1]): # date-value limitations - if self.valuesearch_textsegment_set: + if self.valuesearch_searchtext: raise trove_exceptions.InvalidQueryParams( 'valueSearchText may not be used with valueSearchPropertyPath leading to a "date" property', ) @@ 
-655,8 +470,8 @@ def __post_init__(self): def to_querydict(self): _querydict = super().to_querydict() - _querydict['valueSearchPropertyPath'] = propertypath_key(self.valuesearch_propertypath) - for _qp_name, _qp_value in Textsegment.queryparams_from_textsegments('valueSearchText', self.valuesearch_textsegment_set): + _querydict['valueSearchPropertyPath'] = osfmap.osfmap_propertypath_key(self.valuesearch_propertypath) + for _qp_name, _qp_value in SearchText.queryparams_from_searchtext('valueSearchText', self.valuesearch_searchtext): _querydict[_qp_name] = _qp_value for _filter in self.valuesearch_filter_set: _qp_name, _qp_value = _filter.as_queryparam('valueSearchFilter') @@ -677,37 +492,8 @@ def valuesearch_type_iris(self): ### # helper functions -def is_globpath(path: Propertypath) -> bool: - return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) - - -def make_globpath(length: int) -> Propertypath: - return ONE_GLOB_PROPERTYPATH * length - - def is_date_path(path: Propertypath) -> bool: - return bool(path) and is_date_property(path[-1]) - - -def propertypathstep_key(pathstep: str) -> str: - if pathstep == GLOB_PATHSTEP: - return pathstep - # assume iri - return urllib.parse.quote(osfmap_shorthand().compact_iri(pathstep)) - - -def propertypath_key(property_path: Propertypath) -> str: - return PROPERTYPATH_DELIMITER.join( - propertypathstep_key(_pathstep) - for _pathstep in property_path - ) - - -def propertypath_set_key(propertypath_set: PropertypathSet) -> str: - return join_queryparam_value( - propertypath_key(_propertypath) - for _propertypath in propertypath_set - ) + return bool(path) and osfmap.is_date_property(path[-1]) def _get_text_queryparam(queryparams: QueryparamDict, queryparam_family: str) -> str: @@ -720,52 +506,6 @@ def _get_text_queryparam(queryparams: QueryparamDict, queryparam_family: str) -> ) -def _get_single_value( - queryparams: QueryparamDict, - queryparam_name: QueryparamName, -): - _family_params = 
queryparams.get(queryparam_name.family, ()) - _paramvalues = [ - _paramvalue - for _paramname, _paramvalue in _family_params - if _paramname.bracketed_names == queryparam_name.bracketed_names - ] - if not _paramvalues: - return None - try: - (_singlevalue,) = _paramvalues - except ValueError: - raise trove_exceptions.InvalidRepeatedQueryParam(str(queryparam_name)) - else: - return _singlevalue - - -def _parse_propertypath_set(serialized_path_set: str, *, allow_globs=True) -> PropertypathSet: - # comma-delimited set of dot-delimited paths - return frozenset(_parse_propertypaths(serialized_path_set, allow_globs=allow_globs)) - - -def _parse_propertypaths(serialized_path_set: str, *, allow_globs=True) -> typing.Iterator[Propertypath]: - for _path in split_queryparam_value(serialized_path_set): - yield _parse_propertypath(_path, allow_globs=allow_globs) - - -def _parse_propertypath(serialized_path: str, *, allow_globs=True) -> Propertypath: - _path = tuple( - osfmap_shorthand().expand_iri(_pathstep) - for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER) - ) - if GLOB_PATHSTEP in _path: - if not allow_globs: - raise trove_exceptions.InvalidPropertyPath(serialized_path, 'no * allowed') - if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path): - raise trove_exceptions.InvalidPropertyPath( - serialized_path, - f'path must be all * or no * (got {serialized_path})', - ) - return _path - - def _get_related_property_paths(filter_set) -> tuple[Propertypath, ...]: # hard-coded for osf.io search pages, static list per type # TODO: replace with some dynamism, maybe a 'significant_terms' aggregation @@ -773,22 +513,22 @@ def _get_related_property_paths(filter_set) -> tuple[Propertypath, ...]: for _filter in filter_set: if _filter.is_type_filter(): _type_iris.update(_filter.value_set) - return suggested_property_paths(_type_iris) + return osfmap.suggested_property_paths(_type_iris) def _get_unnamed_iri_values(filter_set) -> typing.Iterable[str]: for _filter in 
filter_set: if _filter.operator.is_iri_operator(): for _iri in _filter.value_set: - if _iri not in OSFMAP_THESAURUS: + if _iri not in osfmap.OSFMAP_THESAURUS: yield _iri def _get_page_cursor(queryparams: QueryparamDict) -> PageCursor: - _cursor_value = _get_single_value(queryparams, QueryparamName('page', ('cursor',))) + _cursor_value = get_single_value(queryparams, QueryparamName('page', ('cursor',))) if _cursor_value: return PageCursor.from_queryparam_value(_cursor_value) - _size_value = _get_single_value(queryparams, QueryparamName('page', ('size',))) + _size_value = get_single_value(queryparams, QueryparamName('page', ('size',))) if _size_value is None: return PageCursor() try: diff --git a/trove/trovesearch/trovesearch_gathering.py b/trove/trovesearch/trovesearch_gathering.py index 7e027623c..0d2fcb719 100644 --- a/trove/trovesearch/trovesearch_gathering.py +++ b/trove/trovesearch/trovesearch_gathering.py @@ -19,8 +19,6 @@ from trove.trovesearch.search_params import ( CardsearchParams, ValuesearchParams, - propertypath_key, - propertypath_set_key, ) from trove.trovesearch.search_handle import ( CardsearchHandle, @@ -33,15 +31,10 @@ JSONAPI_LINK_OBJECT, JSONAPI_MEMBERNAME, ) -from trove.vocab.osfmap import ( - osfmap_shorthand, - OSFMAP_THESAURUS, - suggested_filter_operator, -) +from trove.vocab import osfmap from trove.vocab.trove import ( TROVE_API_THESAURUS, trove_indexcard_namespace, - trove_shorthand, ) @@ -58,7 +51,7 @@ TROVE.Cardsearch, TROVE.Valuesearch, }, - param_iris={TROVE.deriverIRI}, + param_iris={TROVE.deriverIRI, TROVE.blendCards}, thesaurus=TROVE_API_THESAURUS, ) @@ -68,7 +61,10 @@ literal('trove search', language='en'), ), norms=TROVE_GATHERING_NORMS, - gatherer_params={'deriver_iri': TROVE.deriverIRI}, + gatherer_params={ + 'deriver_iri': TROVE.deriverIRI, + 'blend_cards': TROVE.blendCards, + }, ) @@ -150,7 +146,7 @@ def gather_count(focus: CardsearchFocus, **kwargs): focustype_iris={TROVE.Cardsearch}, cache_bound=1, # only the first page 
gets cached ) -def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): +def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, blend_cards, **kwargs): # each searchResultPage a sequence of search results _current_handle: CardsearchHandle | None = focus.search_handle while _current_handle is not None: @@ -163,36 +159,65 @@ def gather_cardsearch_page(focus: CardsearchFocus, *, deriver_iri, **kwargs): _card_focus = _card_foci.get(_result.card_iri) if _card_focus is None: continue # skip (deleted card still indexed?) - _text_evidence_twoples = ( - (TROVE.matchEvidence, frozenset(( - (RDF.type, TROVE.TextMatchEvidence), - (TROVE.matchingHighlight, _evidence.matching_highlight), - (TROVE.evidenceCardIdentifier, literal(_evidence.card_iri)), - *_single_propertypath_twoples(_evidence.property_path), - ))) - for _evidence in _result.text_match_evidence - ) - _result_page.append(frozenset(( - (RDF.type, TROVE.SearchResult), - (TROVE.indexCard, _result.card_iri), - *_text_evidence_twoples, - ))) - # hack around (current) limitations of primitive_metadata.gather - # (what with all these intermediate blank nodes and sequences): - # yield trove:resourceMetadata here (instead of another gatherer) - _card_twoples = _minimal_indexcard_twoples( - focus_identifiers=[ - _identifier.as_iri() - for _identifier in _card_focus.indexcard.focus_identifier_set.all() - ], - resource_metadata=_card_focus.resourceMetadata, + _result_obj, _triples = ( + _blended_card(_card_focus) + if blend_cards + else _unblended_card(_result, _card_focus) ) - for _pred, _obj in _card_twoples: - yield (_result.card_iri, _pred, _obj) + _result_page.append(_result_obj) + yield from _triples yield (TROVE.searchResultPage, sequence(_result_page)) _current_handle = _current_handle.get_next_streaming_handle() +def _blended_card(card_focus) -> tuple[rdf.RdfObject, Iterable[rdf.RdfTriple]]: + _metadata = card_focus.resourceMetadata + if isinstance(_metadata, rdf.Literal): + return 
(_metadata, ()) + if isinstance(_metadata, rdf.QuotedGraph): + return (_metadata.focus_iri, rdf.iter_tripleset(_metadata.tripledict)) + return (card_focus.single_iri(), ()) # oh well + + +def _unblended_card(_result, _card_focus) -> tuple[rdf.RdfObject, Iterable[rdf.RdfTriple]]: + return ( + _unblended_cardsearch_result(_result), + _unblended_card_triples(_result, _card_focus), + ) + + +def _unblended_cardsearch_result(_result) -> rdf.RdfBlanknode: + _text_evidence_twoples = ( + (TROVE.matchEvidence, frozenset(( + (RDF.type, TROVE.TextMatchEvidence), + (TROVE.matchingHighlight, _evidence.matching_highlight), + (TROVE.evidenceCardIdentifier, literal(_evidence.card_iri)), + *_single_propertypath_twoples(_evidence.property_path), + ))) + for _evidence in _result.text_match_evidence + ) + return frozenset(( + (RDF.type, TROVE.SearchResult), + (TROVE.indexCard, _result.card_iri), + *_text_evidence_twoples, + )) + + +def _unblended_card_triples(_result, _card_focus) -> Iterator[rdf.RdfTriple]: + # hack around (current) limitations of primitive_metadata.gather + # (what with all these intermediate blank nodes and sequences): + # yield trove:resourceMetadata here (instead of another gatherer) + _card_twoples = _unblended_indexcard_twoples( + focus_identifiers=[ + _identifier.as_iri() + for _identifier in _card_focus.indexcard.focus_identifier_set.all() + ], + resource_metadata=_card_focus.resourceMetadata, + ) + for _pred, _obj in _card_twoples: + yield (_result.card_iri, _pred, _obj) + + @trovesearch_by_indexstrategy.gatherer(TROVE.searchResultPage) def gather_page_links(focus, **kwargs): # links to more pages of results @@ -228,7 +253,7 @@ def gather_cardsearch_filter(focus, **kwargs): TROVE.searchResultPage, focustype_iris={TROVE.Valuesearch}, ) -def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs): +def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, blend_cards, **kwargs): _result_page = [] _value_iris = { _result.value_iri 
@@ -255,7 +280,7 @@ def gather_valuesearch_page(focus: ValuesearchFocus, *, deriver_iri, **kwargs): # hack around (current) limitations of primitive_metadata.gather # (what with all these intermediate blank nodes and sequences): # yield trove:resourceMetadata here (instead of another gatherer) - _card_twoples = _minimal_indexcard_twoples( + _card_twoples = _unblended_indexcard_twoples( focus_identifiers=[ _identifier.as_iri() for _identifier in _card_focus.indexcard.focus_identifier_set.all() @@ -369,7 +394,7 @@ def _load_cards_and_extracted_rdf_contents(card_iris=None, value_iris=None) -> d _card_iri = _card.get_iri() _quoted_graph = _indexcard_rdf.as_quoted_graph() _quoted_graph.add( - (_quoted_graph.focus_iri, FOAF.primaryTopicOf, _card_iri), + (_quoted_graph.focus_iri, FOAF.isPrimaryTopicOf, _card_iri), ) _card_foci[_card_iri] = IndexcardFocus.new( iris=_card_iri, @@ -439,7 +464,7 @@ def _filter_as_blanknode(search_filter) -> frozenset: def _osfmap_or_unknown_iri_as_json(iri: str): try: - _twopledict = OSFMAP_THESAURUS[iri] + _twopledict = osfmap.OSFMAP_THESAURUS[iri] except KeyError: return rdf.literal_json({'@id': iri}) else: @@ -460,7 +485,7 @@ def _valuesearch_result_as_json(result: ValuesearchResult) -> Literal: ) -def _minimal_indexcard_twoples( +def _unblended_indexcard_twoples( focus_identifiers: Iterable[str], resource_metadata: rdf.Literal, ) -> Iterator[rdf.RdfTwople]: @@ -475,7 +500,7 @@ def _minimal_indexcard_twoples( def _valuesearch_result_as_indexcard_blanknode(result: ValuesearchResult) -> frozenset: - return frozenset(_minimal_indexcard_twoples( + return frozenset(_unblended_indexcard_twoples( focus_identifiers=[literal(result.value_iri or result.value_value)], resource_metadata=_valuesearch_result_as_json(result), )) @@ -495,19 +520,19 @@ def _osfmap_twople_json(twopledict): def _osfmap_path(property_path): return rdf.literal_json([ - osfmap_shorthand().compact_iri(_iri) + osfmap.osfmap_json_shorthand().compact_iri(_iri) for _iri in 
property_path ]) def _single_propertypath_twoples(property_path: tuple[str, ...]): - yield (TROVE.propertyPathKey, literal(propertypath_key(property_path))) + yield (TROVE.propertyPathKey, literal(osfmap.osfmap_propertypath_key(property_path))) yield (TROVE.propertyPath, _propertypath_sequence(property_path)) yield (TROVE.osfmapPropertyPath, _osfmap_path(property_path)) def _multi_propertypath_twoples(propertypath_set): - yield (TROVE.propertyPathKey, literal(propertypath_set_key(propertypath_set))) + yield (TROVE.propertyPathKey, literal(osfmap.osfmap_propertypath_set_key(propertypath_set))) for _path in propertypath_set: yield (TROVE.propertyPathSet, _propertypath_sequence(_path)) @@ -516,7 +541,7 @@ def _propertypath_sequence(property_path: tuple[str, ...]): _propertypath_metadata = [] for _property_iri in property_path: try: - _property_twopledict = OSFMAP_THESAURUS[_property_iri] + _property_twopledict = osfmap.OSFMAP_THESAURUS[_property_iri] except KeyError: _property_twopledict = {RDF.type: {RDF.Property}} # giving benefit of the doubt _propertypath_metadata.append(_osfmap_json( @@ -530,8 +555,8 @@ def _related_property_result(property_path: tuple[str, ...], count: int): return frozenset(( (RDF.type, TROVE.RelatedPropertypath), (TROVE.cardsearchResultCount, count), - (TROVE.suggestedFilterOperator, literal(trove_shorthand().compact_iri( - suggested_filter_operator(property_path[-1]), + (TROVE.suggestedFilterOperator, literal(osfmap.osfmap_json_shorthand().compact_iri( + osfmap.suggested_filter_operator(property_path[-1]), ))), *_single_propertypath_twoples(property_path), )) diff --git a/trove/urls.py b/trove/urls.py index b58c7127f..64f4b4e3c 100644 --- a/trove/urls.py +++ b/trove/urls.py @@ -1,5 +1,4 @@ from django.urls import path, re_path -from django.views.generic.base import RedirectView from .views.browse import BrowseIriView from .views.ingest import RdfIngestView @@ -20,10 +19,9 @@ path('index-card/', view=IndexcardView.as_view(), 
name='index-card'), path('index-card-search', view=CardsearchView.as_view(), name='index-card-search'), path('index-value-search', view=ValuesearchView.as_view(), name='index-value-search'), - path('browse///', view=BrowseIriView.as_view(), name='browse-iri'), path('browse', view=BrowseIriView.as_view(), name='browse-iri'), path('ingest', view=RdfIngestView.as_view(), name='ingest-rdf'), path('docs/openapi.json', view=OpenapiJsonView.as_view(), name='docs.openapi-json'), path('docs/openapi.html', view=OpenapiHtmlView.as_view(), name='docs.openapi-html'), - re_path(r'docs/?', view=RedirectView.as_view(pattern_name='trovetrove:docs.openapi-html'), name='docs'), + re_path(r'docs/?', view=OpenapiHtmlView.as_view(), name='docs'), ] diff --git a/trove/util/chainmap.py b/trove/util/chainmap.py new file mode 100644 index 000000000..48a1be487 --- /dev/null +++ b/trove/util/chainmap.py @@ -0,0 +1,66 @@ +from collections.abc import Sequence, Mapping, Iterator +import dataclasses + + +@dataclasses.dataclass +class SimpleChainMap(Mapping): + """Combine multiple mappings for sequential lookup. + + (inspired by rejecting the suggested "greatly simplified read-only version of Chainmap" + linked from python docs: https://code.activestate.com/recipes/305268/ ) + + >>> _map = SimpleChainMap([{'a':1, 'b':2}, {'a':3, 'd':4}]) + >>> _map['a'] + 1 + >>> _map['d'] + 4 + >>> _map['f'] + Traceback (most recent call last): + ... 
+ KeyError: 'f' + >>> 'b' in _map + True + >>> 'c' in _map + False + >>> 'd' in _map + True + >>> _map.get('a', 10) + 1 + >>> _map.get('b', 20) + 2 + >>> _map.get('d', 30) + 4 + >>> _map.get('f', 40) + 40 + >>> sorted(_map) + ['a', 'b', 'd'] + >>> _map + SimpleChainMap(maps=[{'a': 1, 'b': 2}, {'a': 3, 'd': 4}]) + >>> _map.with_new({'a': 11, 'z': 13}) + SimpleChainMap(maps=[{'a': 11, 'z': 13}, {'a': 1, 'b': 2}, {'a': 3, 'd': 4}]) + >>> _map.with_new({'a': 17}).get('a') + 17 + """ + maps: Sequence[Mapping] + + def __getitem__(self, key): + for _mapping in self.maps: + try: + return _mapping[key] + except KeyError: + pass + raise KeyError(key) + + def __iter__(self) -> Iterator: + _seen: set = set() + for _mapping in self.maps: + for _key in _mapping.keys(): + if _key not in _seen: + yield _key + _seen.add(_key) + + def __len__(self): # for Mapping + return sum(1 for _ in self) # use __iter__ + + def with_new(self, new_map): + return dataclasses.replace(self, maps=[new_map, *self.maps]) diff --git a/trove/util/frozen.py b/trove/util/frozen.py index 0e57eb531..65709f3fb 100644 --- a/trove/util/frozen.py +++ b/trove/util/frozen.py @@ -5,7 +5,6 @@ _FROZEN_TYPES = ( tuple, frozenset, - types.MappingProxyType, str, int, float, @@ -13,12 +12,26 @@ def freeze(obj): + ''' + >>> freeze([1, 1, 2]) + (1, 1, 2) + >>> freeze({3}) + frozenset({3}) + >>> freeze('five') + 'five' + >>> freeze({8: [13, 21, {34}]}) + mappingproxy({8: (13, 21, frozenset({34}))}) + >>> freeze(object()) + Traceback (most recent call last): + ... + ValueError: how freeze ? 
+ ''' + if isinstance(obj, set): + return frozenset(obj) # use hashability to approximate immutability + if isinstance(obj, (list, tuple)): + return tuple(map(freeze, obj)) if isinstance(obj, dict): return freeze_mapping(obj) - if isinstance(obj, set): - return frozenset(obj) - if isinstance(obj, list): - return tuple(obj) if isinstance(obj, _FROZEN_TYPES): return obj raise ValueError(f'how freeze {obj!r}?') diff --git a/trove/util/iris.py b/trove/util/iris.py index 2b266e6c8..35d9123f4 100644 --- a/trove/util/iris.py +++ b/trove/util/iris.py @@ -1,6 +1,6 @@ import json import re -from urllib.parse import urlsplit, urlunsplit, quote, unquote +import urllib.parse as _urp from trove import exceptions as trove_exceptions @@ -15,8 +15,8 @@ COLON = ':' COLON_SLASH_SLASH = '://' QUOTED_IRI_REGEX = re.compile( - f'{IRI_SCHEME_REGEX.pattern}{re.escape(quote(COLON))}' - f'|{re.escape(quote(COLON_SLASH_SLASH))}' + f'{IRI_SCHEME_REGEX.pattern}{re.escape(_urp.quote(COLON))}' + f'|{re.escape(_urp.quote(COLON_SLASH_SLASH))}' ) UNQUOTED_IRI_REGEX = re.compile(f'{IRI_SCHEME_REGEX.pattern}{COLON}|{COLON_SLASH_SLASH}') @@ -29,21 +29,63 @@ def get_sufficiently_unique_iri(iri: str) -> str: + ''' + >>> get_sufficiently_unique_iri('flipl://iri.example/blarg/?#') + '://iri.example/blarg' + >>> get_sufficiently_unique_iri('namly:urn.example:blerg') + 'namly:urn.example:blerg' + ''' (_suffuniq_iri, _) = get_sufficiently_unique_iri_and_scheme(iri) return _suffuniq_iri def get_iri_scheme(iri: str) -> str: + ''' + >>> get_iri_scheme('flipl://iri.example/blarg/?#') + 'flipl' + >>> get_iri_scheme('namly:urn.example:blerg') + 'namly' + ''' (_, _iri_scheme) = get_sufficiently_unique_iri_and_scheme(iri) return _iri_scheme def iris_sufficiently_equal(*iris) -> bool: + ''' + >>> iris_sufficiently_equal( + ... 'flipl://iri.example/blarg/blerg/?#', + ... 'http://iri.example/blarg/blerg', + ... 'https://iri.example/blarg/blerg', + ... 'git://iri.example/blarg/blerg', + ... 
) + True + >>> iris_sufficiently_equal( + ... 'flipl://iri.example/blarg/blerg', + ... 'namly:iri.example/blarg/blerg', + ... ) + False + >>> iris_sufficiently_equal( + ... 'namly:urn.example:blerg', + ... 'namly:urn.example:blerg', + ... ) + True + >>> iris_sufficiently_equal( + ... 'namly:urn.example:blerg', + ... 'nimly:urn.example:blerg', + ... ) + False + ''' _suffuniq_iris = set(map(get_sufficiently_unique_iri, iris)) return len(_suffuniq_iris) == 1 def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]: + ''' + >>> get_sufficiently_unique_iri_and_scheme('flipl://iri.example/blarg/?#') + ('://iri.example/blarg', 'flipl') + >>> get_sufficiently_unique_iri_and_scheme('namly:urn.example:blerg') + ('namly:urn.example:blerg', 'namly') + ''' _scheme_match = IRI_SCHEME_REGEX_IGNORECASE.match(iri) if _scheme_match: _scheme = _scheme_match.group().lower() @@ -57,8 +99,8 @@ def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]: _scheme = '' _remainder = iri # for an iri with '://', is "safe enough" to normalize a little: - _split_remainder = urlsplit(_remainder) - _cleaned_remainder = urlunsplit(( + _split_remainder = _urp.urlsplit(_remainder) + _cleaned_remainder = _urp.urlunsplit(( '', # scheme already split _split_remainder.netloc, _split_remainder.path.rstrip('/'), # remove trailing slashes @@ -69,6 +111,14 @@ def get_sufficiently_unique_iri_and_scheme(iri: str) -> tuple[str, str]: def is_worthwhile_iri(iri: str): + ''' + >>> is_worthwhile_iri('flipl://iri.example/blarg/?#') + True + >>> is_worthwhile_iri('namly:urn.example:blerg') + True + >>> is_worthwhile_iri('_:1234') + False + ''' return ( isinstance(iri, str) and not iri.startswith('_') # skip artefacts of sharev2 shenanigans @@ -76,10 +126,16 @@ def is_worthwhile_iri(iri: str): def iri_path_as_keyword(iris: list[str] | tuple[str, ...], *, suffuniq=False) -> str: - assert isinstance(iris, (list, tuple)) and all( - isinstance(_pathstep, str) - for _pathstep in iris - ), 
f'expected list or tuple of str, got {iris}' + '''return a string-serialized list of iris + + meant for storing in an elasticsearch "keyword" field (happens to use json) + >>> iri_path_as_keyword(['flipl://iri.example/blarg', 'namly:urn.example:blerg']) + '["flipl://iri.example/blarg", "namly:urn.example:blerg"]' + >>> iri_path_as_keyword( + ... ['flipl://iri.example/blarg', 'namly:urn.example:blerg'], + ... suffuniq=True) + '["://iri.example/blarg", "namly:urn.example:blerg"]' + ''' _list = iris if suffuniq: _list = [ @@ -90,9 +146,36 @@ def iri_path_as_keyword(iris: list[str] | tuple[str, ...], *, suffuniq=False) -> def unquote_iri(iri: str) -> str: + ''' + like `urllib.parse.unquote` but recognizes multiply-quoted IRIs + (unquoting until starting "foo:" or "://", leaving further quoted characters intact) + + >>> unquote_iri('flipl://iri.example/blarg/?#') + 'flipl://iri.example/blarg/?#' + >>> unquote_iri('flipl%3A//iri.example/blarg/%3F%23') + 'flipl://iri.example/blarg/?#' + >>> unquote_iri('namly:urn.example:blerg') + 'namly:urn.example:blerg' + >>> unquote_iri('namly%3Aurn.example%3Ablerg') + 'namly:urn.example:blerg' + >>> unquote_iri('werbleWord') + 'werbleWord' + + >>> import urllib.parse as _urp + >>> _unquoted = 'flipl://iri.example/blarg/?' 
+ _urp.urlencode({'param': '://bl@rg?'}) + >>> unquote_iri(_unquoted) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' + >>> unquote_iri(_urp.quote(_unquoted)) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' + >>> unquote_iri(_urp.quote(_urp.quote(_unquoted))) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' + >>> unquote_iri(_urp.quote(_urp.quote(_urp.quote(_unquoted)))) + 'flipl://iri.example/blarg/?param=%3A%2F%2Fbl%40rg%3F' + ''' _unquoted_iri = iri - while QUOTED_IRI_REGEX.match(_unquoted_iri): - _unquoted_iri = unquote(_unquoted_iri) - if not UNQUOTED_IRI_REGEX.match(_unquoted_iri): - raise trove_exceptions.InvalidQuotedIri(f'does not look like a quoted iri: {iri}') + while not UNQUOTED_IRI_REGEX.match(_unquoted_iri): + _next_unquoted_iri = _urp.unquote(_unquoted_iri) + if _unquoted_iri == _next_unquoted_iri: + break + _unquoted_iri = _next_unquoted_iri return _unquoted_iri diff --git a/trove/util/propertypath.py b/trove/util/propertypath.py new file mode 100644 index 000000000..eaf8a30cf --- /dev/null +++ b/trove/util/propertypath.py @@ -0,0 +1,85 @@ +import urllib + +from primitive_metadata import primitive_rdf as rdf + +from trove import exceptions as trove_exceptions + + +### +# type aliases +Propertypath = tuple[str, ...] +PropertypathSet = frozenset[Propertypath] + +### +# constants + +# between each step in a property path "foo.bar.baz" +PROPERTYPATH_DELIMITER = '.' 
+ +# special path-step that matches any property +GLOB_PATHSTEP = '*' +ONE_GLOB_PROPERTYPATH: Propertypath = (GLOB_PATHSTEP,) + + +def is_globpath(path: Propertypath) -> bool: + ''' + >>> is_globpath(('*',)) + True + >>> is_globpath(('*', '*')) + True + >>> is_globpath(('*', 'url:url')) + False + ''' + return all(_pathstep == GLOB_PATHSTEP for _pathstep in path) + + +def make_globpath(length: int) -> Propertypath: + ''' + >>> make_globpath(1) + ('*',) + >>> make_globpath(2) + ('*', '*') + >>> make_globpath(5) + ('*', '*', '*', '*', '*') + ''' + return ONE_GLOB_PROPERTYPATH * length + + +def parse_propertypath( + serialized_path: str, + shorthand: rdf.IriShorthand, + allow_globs: bool = False, +) -> Propertypath: + _path = tuple( + shorthand.expand_iri(_pathstep) + for _pathstep in serialized_path.split(PROPERTYPATH_DELIMITER) + ) + if GLOB_PATHSTEP in _path: + if not allow_globs: + raise trove_exceptions.InvalidPropertyPath(serialized_path, 'no * allowed') + if any(_pathstep != GLOB_PATHSTEP for _pathstep in _path): + raise trove_exceptions.InvalidPropertyPath( + serialized_path, + f'path must be all * or no * (got {serialized_path})', + ) + return _path + + +def propertypathstep_key( + pathstep: str, + shorthand: rdf.IriShorthand, +) -> str: + if pathstep == GLOB_PATHSTEP: + return pathstep + # assume iri + return urllib.parse.quote(shorthand.compact_iri(pathstep)) + + +def propertypath_key( + path: Propertypath, + shorthand: rdf.IriShorthand, +) -> str: + return PROPERTYPATH_DELIMITER.join( + propertypathstep_key(_pathstep, shorthand) + for _pathstep in path + ) diff --git a/trove/util/queryparams.py b/trove/util/queryparams.py index 3cff5b681..0a9bb5d75 100644 --- a/trove/util/queryparams.py +++ b/trove/util/queryparams.py @@ -1,6 +1,7 @@ +from __future__ import annotations import dataclasses import re -from typing import Iterable +import typing # TODO: remove django dependency (tho it is convenient) from django.http import QueryDict @@ -25,6 +26,9 @@ # value to 
be split on commas, used as a list or set QUERYPARAM_VALUES_DELIM = ',' +TRUTHY_VALUES = frozenset(('t', 'true', '1', 'y', 'yes')) +FALSY_VALUES = frozenset(('f', 'false', '0', 'n', 'no')) + @dataclasses.dataclass(frozen=True) class QueryparamName: @@ -87,5 +91,54 @@ def split_queryparam_value(value: str): return value.split(QUERYPARAM_VALUES_DELIM) -def join_queryparam_value(values: Iterable[str]): +def join_queryparam_value(values: typing.Iterable[str]): return QUERYPARAM_VALUES_DELIM.join(values) + + +def get_single_value( + queryparams: QueryparamDict, + queryparam_name: QueryparamName | str, +) -> str | None: + if isinstance(queryparam_name, QueryparamName): + _family_name = queryparam_name.family + _expected_brackets = queryparam_name.bracketed_names + else: + _family_name = queryparam_name + _expected_brackets = () + _paramvalues = [ + _paramvalue + for _paramname, _paramvalue in queryparams.get(_family_name, ()) + if _paramname.bracketed_names == _expected_brackets + ] + if not _paramvalues: + return None + try: + (_singlevalue,) = _paramvalues + except ValueError: + raise trove_exceptions.InvalidRepeatedQueryParam(str(queryparam_name)) + else: + return _singlevalue + + +def get_bool_value( + queryparams: QueryparamDict, + queryparam_name: QueryparamName | str, + *, + if_absent: bool = False, # by default, param absence is falsy + if_empty: bool = True, # by default, presence (with empty value) is truthy +) -> bool: + _value = get_single_value(queryparams, queryparam_name) + if _value is None: + return if_absent + if _value == '': + return if_empty + return parse_booly_str(_value) + + +def parse_booly_str(value: str): + _lowered = value.lower() + if _lowered in TRUTHY_VALUES: + return True + if _lowered in FALSY_VALUES: + return False + raise ValueError(f'unboolable string: "{value}"') diff --git a/trove/util/trove_params.py b/trove/util/trove_params.py new file mode 100644 index 000000000..92e5ed2da --- /dev/null +++ b/trove/util/trove_params.py @@ -0,0 
+1,134 @@ +from __future__ import annotations +from collections import defaultdict +import dataclasses +import typing +if typing.TYPE_CHECKING: + from collections.abc import Mapping + +# TODO: remove django dependency (tho it is convenient) +from django.http import QueryDict +from primitive_metadata import primitive_rdf as rdf + +from trove import exceptions as trove_exceptions +from trove.util.chainmap import SimpleChainMap +from trove.util.frozen import freeze +from trove.util.propertypath import ( + PropertypathSet, + Propertypath, + parse_propertypath, +) +from trove.util import queryparams as _qp +from trove.vocab.namespaces import namespaces_shorthand + + +@dataclasses.dataclass(frozen=True) +class BasicTroveParams: + iri_shorthand: rdf.IriShorthand = dataclasses.field(repr=False) + accept_mediatype: str | None + included_relations: PropertypathSet = dataclasses.field(repr=False, compare=False) + attrpaths_by_type: Mapping[str, PropertypathSet] = dataclasses.field(repr=False, compare=False) + blend_cards: bool + + ### + # class methods + + @classmethod + def from_querystring(cls, querystring: str) -> typing.Self: + return cls.from_queryparams(_qp.queryparams_from_querystring(querystring)) + + @classmethod + def from_queryparams(cls, queryparams: _qp.QueryparamDict) -> typing.Self: + return cls(**cls.parse_queryparams(queryparams)) + + @classmethod + def parse_queryparams(cls, queryparams: _qp.QueryparamDict) -> dict: + # subclasses should override and add their fields to super().parse_queryparams(queryparams) + _shorthand = cls._gather_shorthand(queryparams) + return { + 'iri_shorthand': _shorthand, + 'included_relations': cls._gather_included_relations(queryparams, _shorthand), + 'attrpaths_by_type': cls._gather_attrpaths(queryparams, _shorthand), + 'accept_mediatype': _qp.get_single_value(queryparams, 'acceptMediatype'), + 'blend_cards': _qp.get_bool_value(queryparams, 'blendCards'), + } + + @classmethod + def _default_shorthand(cls) -> rdf.IriShorthand: + 
return namespaces_shorthand() + + @classmethod + def _default_include(cls) -> PropertypathSet: + return frozenset() + + @classmethod + def _default_attrpaths(cls) -> Mapping[str, tuple[Propertypath, ...]]: + return {} + + @classmethod + def _gather_shorthand(cls, queryparams: _qp.QueryparamDict): + _prefixmap = {} + for _qp_name, _iri in queryparams.get('iriShorthand', []): + try: + (_shortname,) = _qp_name.bracketed_names + except ValueError: + raise trove_exceptions.InvalidQueryParamName(_qp_name) + else: + _prefixmap[_shortname] = _iri + _shorthand = cls._default_shorthand() + if _prefixmap: + _shorthand = _shorthand.with_update(_prefixmap) + return _shorthand + + @classmethod + def _gather_included_relations(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand) -> PropertypathSet: + _include_params = queryparams.get('include', []) + if _include_params: + return frozenset(( + parse_propertypath(_path_value, shorthand) + for _, _include_value in _include_params + for _path_value in _qp.split_queryparam_value(_include_value) + )) + return cls._default_include() + + @classmethod + def _gather_attrpaths(cls, queryparams: _qp.QueryparamDict, shorthand: rdf.IriShorthand) -> Mapping[ + str, + tuple[Propertypath, ...], + ]: + _attrpaths = SimpleChainMap([cls._default_attrpaths()]) + _fields_params = queryparams.get('fields', []) + if _fields_params: + _requested: dict[str, list[Propertypath]] = defaultdict(list) + for _param_name, _param_value in _fields_params: + try: + (_typenames,) = filter(bool, _param_name.bracketed_names) + except (IndexError, ValueError): + raise trove_exceptions.InvalidQueryParamName( + f'expected "fields[TYPE]" (with exactly one non-empty bracketed segment)' + f' (got "{_param_name}")' + ) + else: + for _type in _qp.split_queryparam_value(_typenames): + _type_iri = shorthand.expand_iri(_type) + _requested[_type_iri].extend( + parse_propertypath(_path_value, shorthand) + for _path_value in _qp.split_queryparam_value(_param_value) + 
) + _attrpaths = _attrpaths.with_new(freeze(_requested)) + return _attrpaths + + ### + # instance methods + + def to_querystring(self) -> str: + return self.to_querydict().urlencode() + + def to_querydict(self) -> QueryDict: + # subclasses should override and add their fields to super().to_querydict() + _querydict = QueryDict(mutable=True) + if self.accept_mediatype: + _querydict['acceptMediatype'] = self.accept_mediatype + if self.blend_cards: + _querydict['blendCards'] = '' + # TODO: iriShorthand, include, fields[...] + return _querydict diff --git a/trove/views/_base.py b/trove/views/_base.py new file mode 100644 index 000000000..e2cd48f48 --- /dev/null +++ b/trove/views/_base.py @@ -0,0 +1,148 @@ +__all__ = ( + 'GatheredTroveView', + 'StaticTroveView', +) + +import abc +from collections.abc import Container +import functools +from typing import ClassVar + +from django import http as djhttp +from django.views import View +from primitive_metadata import gather +from primitive_metadata import primitive_rdf as rdf + +from trove import exceptions as trove_exceptions +from trove.vocab.namespaces import TROVE, RDF +from trove.util.trove_params import BasicTroveParams +from trove.render import ( + BaseRenderer, + DEFAULT_RENDERER_TYPE, + get_renderer_type, +) +from trove.render._rendering import ProtoRendering +from ._gather_ask import ask_gathering_from_params +from ._responder import ( + make_http_error_response, + make_http_response, +) + + +class BaseTroveView(View, abc.ABC): + # optional ClassVars: + params_type: ClassVar[type[BasicTroveParams]] = BasicTroveParams + + @abc.abstractmethod + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs) -> ProtoRendering: + raise NotImplementedError + + def get(self, request, **kwargs): + try: + _renderer_type = get_renderer_type(request) + except trove_exceptions.CannotRenderMediatype as _error: + return make_http_error_response( + error=_error, + 
renderer_type=DEFAULT_RENDERER_TYPE, + ) + try: + _params = self._parse_params(request) + return make_http_response( + content_rendering=self._render_response_content(request, _params, _renderer_type, kwargs), + http_request=request, + ) + except trove_exceptions.TroveError as _error: + return make_http_error_response( + error=_error, + renderer_type=_renderer_type, + ) + + def _parse_params(self, request: djhttp.HttpRequest): + return self.params_type.from_querystring(request.META['QUERY_STRING']) + + +class GatheredTroveView(BaseTroveView, abc.ABC): + # ClassVars expected on inheritors: + gathering_organizer: ClassVar[gather.GatheringOrganizer] + # optional ClassVars: + focus_type_iris: ClassVar[Container[str]] = () + + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs): + _focus = self._build_focus(request, params, url_kwargs) + _renderer = self._gather_to_renderer(_focus, params, renderer_type) + return _renderer.render_document() + + def _gather_to_renderer(self, focus, params, renderer_type: type[BaseRenderer]) -> BaseRenderer: + _gathering = self._build_gathering(params, renderer_type) + if renderer_type.PASSIVE_RENDER: + ask_gathering_from_params(_gathering, params, focus) + return renderer_type(focus, _gathering) + + def _get_focus_iri(self, request, params): + return request.build_absolute_uri() + + def _build_focus(self, request, params, url_kwargs): + return gather.Focus.new(self._get_focus_iri(request, params), self.focus_type_iris) + + def _build_gathering(self, params, renderer_type: type[BaseRenderer]) -> gather.Gathering: + return self.gathering_organizer.new_gathering( + self._get_gatherer_kwargs(params, renderer_type), + ) + + def _get_gatherer_kwargs(self, params, renderer_type): + _kwargs = {} + _deriver_kw = _get_param_keyword(TROVE.deriverIRI, self.gathering_organizer) + if _deriver_kw: + _kwargs[_deriver_kw] = renderer_type.get_deriver_iri(params.blend_cards) + _blend_kw = 
_get_param_keyword(TROVE.blendCards, self.gathering_organizer) + if _blend_kw: + _kwargs[_blend_kw] = params.blend_cards + return _kwargs + + +class StaticTroveView(BaseTroveView, abc.ABC): + @classmethod + @abc.abstractmethod + def get_focus_iri(cls) -> str: + raise NotImplementedError + + @classmethod + @abc.abstractmethod + def get_static_triples(cls, focus_iri: str) -> rdf.RdfTripleDictionary: + raise NotImplementedError + + @classmethod + @functools.cache + def cached_static_triples(cls, focus_iri): + return cls.get_static_triples(focus_iri) + + def _render_response_content(self, request, params, renderer_type: type[BaseRenderer], url_kwargs): + _focus_iri = self.get_focus_iri() + _triples = self.cached_static_triples(_focus_iri) + _focus = gather.Focus.new( + _focus_iri, + type_iris=_triples.get(_focus_iri, {}).get(RDF.type, ()), + ) + + class _FakeStaticGathering: + gatherer_kwargs: dict = {} + + def leaf_a_record(self): + return _triples + + _renderer = renderer_type( + response_focus=_focus, + response_gathering=_FakeStaticGathering(), + ) + return _renderer.render_document() + + +### +# local helpers + +def _get_param_keyword(param_iri: str, organizer: gather.GatheringOrganizer) -> str | None: + if param_iri in organizer.norms.param_iris: + for (_k, _v) in organizer.gatherer_params.items(): + if _v == param_iri: + return _k + return None diff --git a/trove/views/_gather_ask.py b/trove/views/_gather_ask.py index 63bae1098..c995a9907 100644 --- a/trove/views/_gather_ask.py +++ b/trove/views/_gather_ask.py @@ -1,11 +1,11 @@ from primitive_metadata import gather -from trove.trovesearch.search_params import BaseTroveParams +from trove.util.trove_params import BasicTroveParams def ask_gathering_from_params( gathering: gather.Gathering, - params: BaseTroveParams, + params: BasicTroveParams, start_focus: gather.Focus, ): # fill the gathering's cache with included related resources... 
diff --git a/trove/views/browse.py b/trove/views/browse.py index ab1e488fe..6739b53d7 100644 --- a/trove/views/browse.py +++ b/trove/views/browse.py @@ -1,70 +1,56 @@ -from django import http -from django.shortcuts import redirect -from django.views import View -from primitive_metadata import primitive_rdf +import dataclasses -from trove import models as trove_db -from trove.render import get_renderer_type -from trove.util.iris import unquote_iri, get_sufficiently_unique_iri -from trove.vocab import namespaces as ns -from trove.vocab import static_vocab -from ._responder import make_http_response +from trove import exceptions as trove_exceptions +from trove.util.iris import unquote_iri +from trove.vocab import namespaces as _ns +from trove.vocab.osfmap import osfmap_json_shorthand +from trove.vocab.trove import trove_json_shorthand +from trove.trovebrowse_gathering import trovebrowse +from trove.util.trove_params import BasicTroveParams +from trove.util.queryparams import ( + QueryparamDict, + get_single_value, +) +from ._base import GatheredTroveView -class BrowseIriView(View): - def get(self, request, **kwargs): - _iri_param = kwargs.get('iri') or request.GET.get('iri') - if not _iri_param: - raise http.Http404 # TODO: docs? random browse? 
- _iri = ns.NAMESPACES_SHORTHAND.expand_iri(unquote_iri(_iri_param)) - _suffuniq_iri = get_sufficiently_unique_iri(_iri) - _trove_term = _recognize_trove_term(_suffuniq_iri) - if _trove_term is not None: - return redirect('trove-vocab', vocab_term=_trove_term) - _card_focus_iri, _combined_rdf = _get_latest_cardf(_iri) - _thesaurus_entry = static_vocab.combined_thesaurus__suffuniq().get(_suffuniq_iri, {}) - if _thesaurus_entry: - _combined_rdf.add_twopledict(_card_focus_iri, _thesaurus_entry) - _renderer_type = get_renderer_type(request) - _renderer = _renderer_type( - _card_focus_iri, - _combined_rdf.tripledict, - ) - return make_http_response( - content_rendering=_renderer.render_document(), - http_headers=[('Content-Disposition', 'inline')], - http_request=request, - ) +@dataclasses.dataclass(frozen=True) +class BrowseParams(BasicTroveParams): + iri: str + @classmethod + def parse_queryparams(cls, queryparams: QueryparamDict) -> dict: + _iri_value = get_single_value(queryparams, 'iri') + if not _iri_value: + raise trove_exceptions.MissingRequiredQueryParam('iri') + return { + **super().parse_queryparams(queryparams), + 'iri': cls._parse_iri(_iri_value), + } -def _get_latest_cardf(iri: str): - _combined_rdf = primitive_rdf.RdfGraph({}) - try: - _identifier = trove_db.ResourceIdentifier.objects.get_for_iri(iri) - except trove_db.ResourceIdentifier.DoesNotExist: - return iri, _combined_rdf - else: - _rdf_qs = ( - trove_db.LatestIndexcardRdf.objects - .filter(indexcard__focus_identifier_set=_identifier) - .select_related('indexcard') - ) - _focus_iri = None - for _indexcard_rdf in _rdf_qs: - if _focus_iri is None: - _focus_iri = _indexcard_rdf.focus_iri - _combined_rdf.add((_focus_iri, ns.FOAF.primaryTopicOf, _indexcard_rdf.indexcard.get_iri())) - for (_subj, _pred, _obj) in primitive_rdf.iter_tripleset(_indexcard_rdf.as_rdf_tripledict()): - _combined_rdf.add( - (_focus_iri, _pred, _obj) - if _subj == _indexcard_rdf.focus_iri - else (_subj, _pred, _obj) - ) - return 
(_focus_iri or iri), _combined_rdf + @classmethod + def _parse_iri(cls, iri_value: str): + _iri = unquote_iri(iri_value) + if ':' in _iri: + return _ns.namespaces_shorthand().expand_iri(_iri) + for _shorthand_factory in (osfmap_json_shorthand, trove_json_shorthand): + _expanded = _shorthand_factory().expand_iri(_iri) + if _expanded != _iri: + return _expanded + raise trove_exceptions.IriInvalid(_iri) + @classmethod + def _default_include(cls): + return frozenset(( + _ns.TROVE.thesaurusEntry, + _ns.FOAF.isPrimaryTopicOf, + _ns.TROVE.usedAtPath, + )) -def _recognize_trove_term(suffuniq_iri: str): - _suffuniq_trove = get_sufficiently_unique_iri(str(ns.TROVE)) - if suffuniq_iri.startswith(_suffuniq_trove): - return primitive_rdf.iri_minus_namespace(suffuniq_iri, _suffuniq_trove).strip('/') - return None + +class BrowseIriView(GatheredTroveView): + gathering_organizer = trovebrowse + params_type = BrowseParams + + def _get_focus_iri(self, request, params: BrowseParams): # override GatheredTroveView + return params.iri diff --git a/trove/views/indexcard.py b/trove/views/indexcard.py index 208a15f85..158102f80 100644 --- a/trove/views/indexcard.py +++ b/trove/views/indexcard.py @@ -1,50 +1,25 @@ -from django.views import View +from django.http import Http404 -from trove import exceptions as trove_exceptions from trove import models as trove_db -from trove.render import ( - DEFAULT_RENDERER_TYPE, - get_renderer_type, -) from trove.trovesearch.search_params import IndexcardParams from trove.trovesearch.trovesearch_gathering import ( trovesearch_by_indexstrategy, IndexcardFocus, ) from trove.vocab.trove import trove_indexcard_iri -from ._gather_ask import ask_gathering_from_params -from ._responder import ( - make_http_error_response, - make_http_response, -) +from ._base import GatheredTroveView + +class IndexcardView(GatheredTroveView): + params_type = IndexcardParams + gathering_organizer = trovesearch_by_indexstrategy -class IndexcardView(View): - def get(self, request, 
indexcard_uuid): + def _build_focus(self, request, params, url_kwargs): try: - _renderer_type = get_renderer_type(request) - _gathering = trovesearch_by_indexstrategy.new_gathering({ - 'deriver_iri': _renderer_type.INDEXCARD_DERIVER_IRI, - }) - _indexcard_iri = trove_indexcard_iri(indexcard_uuid) - _params = IndexcardParams.from_querystring(request.META['QUERY_STRING']) - _focus = IndexcardFocus.new( - iris=_indexcard_iri, - indexcard=trove_db.Indexcard.objects.get_for_iri(_indexcard_iri), - ) - ask_gathering_from_params(_gathering, _params, _focus) - _renderer = _renderer_type(_focus, _gathering) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) - except trove_exceptions.CannotRenderMediatype as _error: - return make_http_error_response( - error=_error, - renderer_type=DEFAULT_RENDERER_TYPE, - ) - except trove_exceptions.TroveError as _error: - return make_http_error_response( - error=_error, - renderer_type=_renderer_type, + _indexcard_uuid = url_kwargs['indexcard_uuid'] + return IndexcardFocus.new( + iris=trove_indexcard_iri(_indexcard_uuid), + indexcard=trove_db.Indexcard.objects.get(uuid=_indexcard_uuid), ) + except trove_db.Indexcard.DoesNotExist: + raise Http404 diff --git a/trove/views/search.py b/trove/views/search.py index d164b36e4..f84f50623 100644 --- a/trove/views/search.py +++ b/trove/views/search.py @@ -1,16 +1,12 @@ import abc +from collections.abc import Callable import logging -from typing import Callable -from django import http -from django.views import View from primitive_metadata import gather from share.search import index_strategy -from trove import exceptions as trove_exceptions from trove.trovesearch.search_handle import BasicSearchHandle from trove.trovesearch.search_params import ( - BaseTroveParams, CardsearchParams, ValuesearchParams, ) @@ -19,70 +15,28 @@ CardsearchFocus, ValuesearchFocus, ) -from trove.render import ( - DEFAULT_RENDERER_TYPE, - get_renderer_type, -) -from 
._gather_ask import ask_gathering_from_params -from ._responder import ( - make_http_error_response, - make_http_response, -) +from trove.util.trove_params import BasicTroveParams +from ._base import GatheredTroveView logger = logging.getLogger(__name__) -_TrovesearchHandler = Callable[[BaseTroveParams], BasicSearchHandle] - - -class _BaseTrovesearchView(View, abc.ABC): - # expected on inheritors - focus_type: type[gather.Focus] - params_dataclass: type[CardsearchParams] - - def get(self, request): - try: - _renderer_type = get_renderer_type(request) - except trove_exceptions.CannotRenderMediatype as _error: - return make_http_error_response( - error=_error, - renderer_type=DEFAULT_RENDERER_TYPE, - ) - try: - _url = request.build_absolute_uri() - _search_gathering = self._start_gathering(renderer_type=_renderer_type) - _search_params = self._parse_search_params(request) - _strategy = index_strategy.get_strategy_for_trovesearch(_search_params) - _focus = self.focus_type.new( - iris=_url, - search_params=_search_params, - search_handle=self.get_search_handle(_strategy, _search_params), - ) - if _renderer_type.PASSIVE_RENDER: - ask_gathering_from_params(_search_gathering, _search_params, _focus) - # take gathered data into a response - _renderer = _renderer_type(_focus, _search_gathering) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) - except trove_exceptions.TroveError as _error: - return make_http_error_response( - error=_error, - renderer_type=_renderer_type, - ) - - def _parse_search_params(self, request: http.HttpRequest) -> CardsearchParams: - return self.params_dataclass.from_querystring( - request.META['QUERY_STRING'], - ) +_TrovesearchHandler = Callable[[BasicTroveParams], BasicSearchHandle] + - def _start_gathering(self, renderer_type) -> gather.Gathering: - # TODO: 404 for unknown strategy - return trovesearch_by_indexstrategy.new_gathering({ - 'deriver_iri': renderer_type.INDEXCARD_DERIVER_IRI, - 
}) +class _BaseTrovesearchView(GatheredTroveView, abc.ABC): + focus_type: type[gather.Focus] = gather.Focus # expected on subclasses + + gathering_organizer = trovesearch_by_indexstrategy # for GatheredTroveView + + def _build_focus(self, request, params, url_kwargs): # override GatheredTroveView + _strategy = index_strategy.get_strategy_for_trovesearch(params) + return self.focus_type.new( + iris=self._get_focus_iri(request, params), + search_params=params, + search_handle=self.get_search_handle(_strategy, params), + ) def get_search_handle(self, strategy, search_params) -> BasicSearchHandle: return self._get_wrapped_handler(strategy)(search_params) @@ -105,7 +59,7 @@ def _wrapped_handler(search_params): class CardsearchView(_BaseTrovesearchView): focus_type = CardsearchFocus - params_dataclass = CardsearchParams + params_type = CardsearchParams def get_search_handler(self, strategy): return strategy.pls_handle_cardsearch @@ -113,7 +67,7 @@ def get_search_handler(self, strategy): class ValuesearchView(_BaseTrovesearchView): focus_type = ValuesearchFocus - params_dataclass = ValuesearchParams + params_type = ValuesearchParams def get_search_handler(self, strategy): return strategy.pls_handle_valuesearch diff --git a/trove/views/shtrove_root.py b/trove/views/shtrove_root.py new file mode 100644 index 000000000..596d524a1 --- /dev/null +++ b/trove/views/shtrove_root.py @@ -0,0 +1,35 @@ +from primitive_metadata import primitive_rdf as rdf +from django.conf import settings + +from trove.vocab import namespaces as ns +from ._base import StaticTroveView + + +class ShtroveRootView(StaticTroveView): + @classmethod + def get_focus_iri(cls): + return settings.SHARE_WEB_URL + + @classmethod + def get_static_triples(cls, focus_iri: str) -> rdf.RdfTripleDictionary: + _here = rdf.IriNamespace(focus_iri) + _docs = _here['trove/docs'] + _browse = _here['trove/browse'] + _cardsearch = _here['trove/index-card-search'] + return { + focus_iri: { + ns.DCTERMS.description: { + 
rdf.literal('a trove of metadata meant to be shared', language='en'), + }, + ns.RDFS.seeAlso: {_docs, _browse, _cardsearch}, + }, + _docs: { + ns.DCTERMS.title: {rdf.literal('trove search-api docs', language='en')}, + }, + _browse: { + ns.DCTERMS.title: {rdf.literal('trove browse', language='en')}, + }, + _cardsearch: { + ns.DCTERMS.title: {rdf.literal('trove index-card-search', language='en')}, + }, + } diff --git a/trove/views/vocab.py b/trove/views/vocab.py index 62982f34e..3a896fe82 100644 --- a/trove/views/vocab.py +++ b/trove/views/vocab.py @@ -1,40 +1,21 @@ +from urllib.parse import urlencode + from django import http +from django.shortcuts import redirect +from django.urls import reverse from django.views import View -from trove import exceptions as trove_exceptions -from trove.render import ( - DEFAULT_RENDERER_TYPE, - get_renderer_type, -) from trove.vocab.namespaces import TROVE from trove.vocab.trove import TROVE_API_THESAURUS -from ._responder import ( - make_http_error_response, - make_http_response, -) class TroveVocabView(View): def get(self, request, vocab_term): _iri = TROVE[vocab_term] - try: - _data = {_iri: TROVE_API_THESAURUS[_iri]} - except KeyError: + if _iri not in TROVE_API_THESAURUS: raise http.Http404 - try: - _renderer_type = get_renderer_type(request) - _renderer = _renderer_type(_iri, _data) - return make_http_response( - content_rendering=_renderer.render_document(), - http_request=request, - ) - except trove_exceptions.CannotRenderMediatype as _error: - return make_http_error_response( - error=_error, - renderer_type=DEFAULT_RENDERER_TYPE, - ) - except trove_exceptions.TroveError as _error: - return make_http_error_response( - error=_error, - renderer_type=_renderer_type, - ) + _browse_url = '?'.join(( + reverse('trove:browse-iri'), + urlencode({'iri': _iri}), + )) + return redirect(_browse_url) diff --git a/trove/vocab/namespaces.py b/trove/vocab/namespaces.py index f61f176c4..c0ebf1cb6 100644 --- a/trove/vocab/namespaces.py +++ 
b/trove/vocab/namespaces.py @@ -1,3 +1,5 @@ +import functools + from primitive_metadata import primitive_rdf as rdf from primitive_metadata.namespaces import ( RDF, @@ -32,7 +34,7 @@ 'SKOS', 'TROVE', 'XSD', - 'NAMESPACES_SHORTHAND', + 'namespaces_shorthand', ) # namespaces used in OAI-PMH @@ -63,4 +65,6 @@ _NAMESPACES_BY_PREFIX['blarg'] = BLARG -NAMESPACES_SHORTHAND = DEFAULT_SHORTHAND.with_update(_NAMESPACES_BY_PREFIX) +@functools.cache +def namespaces_shorthand() -> rdf.IriShorthand: + return DEFAULT_SHORTHAND.with_update(_NAMESPACES_BY_PREFIX) diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py index 4f9112127..5d4eadce6 100644 --- a/trove/vocab/osfmap.py +++ b/trove/vocab/osfmap.py @@ -1,4 +1,8 @@ +from __future__ import annotations +import typing import functools +if typing.TYPE_CHECKING: + from collections.abc import Iterator from primitive_metadata.primitive_rdf import ( literal, @@ -8,6 +12,16 @@ from primitive_metadata import gather from share.models.feature_flag import FeatureFlag +from trove.util.propertypath import ( + Propertypath, + PropertypathSet, + parse_propertypath, + propertypath_key, +) +from trove.util.queryparams import ( + join_queryparam_value, + split_queryparam_value, +) from trove.util.shorthand import build_shorthand_from_thesaurus from trove.vocab.jsonapi import JSONAPI_MEMBERNAME from trove.vocab.namespaces import ( @@ -22,7 +36,7 @@ RDFS, SKOS, TROVE, - NAMESPACES_SHORTHAND, + namespaces_shorthand, ) OSFMAP_LINK = 'https://osf.io/8yczr' @@ -814,17 +828,6 @@ ) -@functools.cache -def osfmap_shorthand() -> IriShorthand: - '''build iri shorthand that includes unprefixed osfmap terms - ''' - return build_shorthand_from_thesaurus( - thesaurus=OSFMAP_THESAURUS, - label_predicate=JSONAPI_MEMBERNAME, - base_shorthand=NAMESPACES_SHORTHAND, - ) - - ALL_SUGGESTED_PROPERTY_PATHS = ( (DCTERMS.created,), (OSFMAP.funder,), @@ -927,7 +930,9 @@ def osfmap_shorthand() -> IriShorthand: NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, 
*LABEL_PROPERTIES)
diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index c8c2f377f..166a5a24e 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -18,11 +18,7 @@ JSONAPI_ATTRIBUTE, JSONAPI_RELATIONSHIP, ) -from trove.vocab.osfmap import ( - DATE_PROPERTIES, - OSFMAP_LINK, - osfmap_shorthand, -) +from trove.vocab import osfmap from trove.vocab.namespaces import ( DCTERMS, OWL, @@ -30,7 +26,7 @@ RDFS, SKOS, TROVE, - NAMESPACES_SHORTHAND, + namespaces_shorthand, ) @@ -47,9 +43,10 @@ def _literal_markdown(text: str, *, language: str): def trove_browse_link(iri: str): + _compact = namespaces_shorthand().compact_iri(iri) return urllib.parse.urljoin( - reverse('trovetrove:browse-iri'), - f'?iri={urllib.parse.quote(iri)}', + reverse('trove:browse-iri'), + f'?iri={urllib.parse.quote(_compact)}', ) @@ -84,12 +81,12 @@ def trove_browse_link(iri: str): DCTERMS.description: {_literal_markdown(f'''an **index-card** is a metadata record about a specific thing. -that thing is called the "focus" of the index-card and is identified by a "focus iri" +that thing is called the "focus" of the index-card and is identified by a "focus IRI" -- any thing may be identified by multiple iris, but choose one within an index-card (and perhaps include the others with `owl:sameAs`) the metadata about the thing is a quoted [rdf graph](https://www.w3.org/TR/rdf11-concepts/#data-model) -in which every triple is reachable from the card's focus iri +in which every triple is reachable from the card's focus IRI following predicates as directed edges from subject to object. 
there is not (yet) any size limit for an index-card's metadata, @@ -103,12 +100,12 @@ def trove_browse_link(iri: str): when represented as `application/vnd.api+json` (jsonapi), the `resourceMetadata` attribute contains a json object that has: -* `@id` with the focus iri +* `@id` with the focus IRI * `@type` with the focus resource's `rdf:type` -* property keys from [OSFMAP]({OSFMAP_LINK}) shorthand (each corresponding to an iri) +* property keys from [OSFMAP]({osfmap.OSFMAP_LINK}) shorthand (each corresponding to an IRI) * property values as lists of objects: * literal text as `{{"@value": "..."}}` - * iri references as `{{"@id": "..."}}` + * IRI references as `{{"@id": "..."}}` ''', language='en')}, }, @@ -163,9 +160,8 @@ def trove_browse_link(iri: str): search index-cards that match a fuzzy text search for the word "word" in the title (aka `dcterms:title`, `<http://purl.org/dc/terms/title>`) uses query parameter: -``` -cardSearchText[title]=word -``` + +* `cardSearchText[title]=word` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchText[title]=word&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -176,9 +172,8 @@ def trove_browse_link(iri: str): search index-cards that have at least one creator affiliated with [COS](https://cos.io) uses query parameter: -``` -cardSearchFilter[creator.affiliation]=https://cos.io -``` + +* `cardSearchFilter[creator.affiliation]=https://cos.io` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[creator.affiliation]=https://cos.io&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -190,9 +185,8 @@ def trove_browse_link(iri: str): values after 2022 uses query parameter: -``` -cardSearchFilter[dateCreated][after]=2022 -``` + +* `cardSearchFilter[dateCreated][after]=2022` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[dateCreated][after]=2022&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -200,12 +194,11 @@ def trove_browse_link(iri: str): RDFS.label: 
{literal('card-search-with-star-path', language='en')}, RDFS.comment: {literal('card-search with star path', language='en')}, DCTERMS.description: {_literal_markdown(''' -searches index-cards with a specific iri value at any property +searches index-cards with a specific IRI value at any property uses query parameter: -``` -cardSearchFilter[*]=https://osf.io -``` + +* `cardSearchFilter[*]=https://osf.io` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[*]=https://osf.io&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -216,10 +209,9 @@ def trove_browse_link(iri: str): searches for index-cards that have a `funder` and do not have an `affiliation` uses query parameters: -``` -cardSearchFilter[funder][is-present] -cardSearchFilter[affiliation][is-absent] -``` + +* `cardSearchFilter[funder][is-present]` +* `cardSearchFilter[affiliation][is-absent]` ''', language='en')}, RDF.value: {literal('/trove/index-card-search?cardSearchFilter[funder][is-present]&cardSearchFilter[affiliation][is-absent]&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -241,22 +233,21 @@ def trove_browse_link(iri: str): # TROVE.include, }, RDFS.label: {literal('index-value-search', language='en')}, - RDFS.comment: {literal('search for iri values based on how they are used', language='en')}, + RDFS.comment: {literal('search for IRI values based on how they are used', language='en')}, DCTERMS.description: {_literal_markdown('''**index-value-search** is -a way to find iri values that could be used in a cardSearchFilter +a way to find IRI values that could be used in a cardSearchFilter ''', language='en')}, TROVE.example: { blanknode({ RDFS.label: {literal('value-search without card-search', language='en')}, RDFS.comment: {literal('value-search without card-search', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values for the property `creator` (aka `dcterms:creator`, +search for IRI values for the property `creator` (aka 
`dcterms:creator`, `<http://purl.org/dc/terms/creator>`) -uses query parameter: -``` -valueSearchPropertyPath=creator -``` +uses query parameters: + +* `valueSearchPropertyPath=creator` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -264,28 +255,26 @@ def trove_browse_link(iri: str): RDFS.label: {literal('value-search with card-search', language='en')}, RDFS.comment: {literal('value-search with card-search', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values for the property `creator` within the context of a card-search +search for IRI values for the property `creator` within the context of a card-search uses query parameter: -``` -valueSearchPropertyPath=creator -cardSearchText=sciency -cardSearchFilter[subject][is-present] -``` + +* `valueSearchPropertyPath=creator` +* `cardSearchText=sciency` +* `cardSearchFilter[subject][is-present]` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&cardSearchText=sciency&cardSearchFilter[subject][is-present]&acceptMediatype=application/vnd.api%2Bjson')}, }), blanknode({ - RDFS.label: {literal('value-search specific iri', language='en')}, - RDFS.comment: {literal('value-search specific iri', language='en')}, + RDFS.label: {literal('value-search specific IRI', language='en')}, + RDFS.comment: {literal('value-search specific IRI', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for a specific iri value in the property `creator` +search for a specific IRI value in the property `creator` -uses query parameter: -``` -valueSearchPropertyPath=creator -valueSearchFilter[sameAs]=https://orcid.org/0000-0002-6155-6104 -``` +uses query parameters: + +* `valueSearchPropertyPath=creator` +* `valueSearchFilter[sameAs]=https://orcid.org/0000-0002-6155-6104` ''', language='en')}, RDF.value: 
{literal('/trove/index-value-search?valueSearchPropertyPath=creator&valueSearchFilter[sameAs]=https://orcid.org/0000-0002-6155-6104&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -293,13 +282,12 @@ def trove_browse_link(iri: str): RDFS.label: {literal('value-search by value type', language='en')}, RDFS.comment: {literal('value-search by value type', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values that are used as `creator` and have `rdf:type` `Person` (aka `foaf:Person`) +search for IRI values that are used as `creator` and have `rdf:type` `Person` (aka `foaf:Person`) -uses query parameter: -``` -valueSearchPropertyPath=creator -valueSearchFilter[resourceType]=Person -``` +uses query parameters: + +* `valueSearchPropertyPath=creator` +* `valueSearchFilter[resourceType]=Person` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=creator&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -307,14 +295,13 @@ def trove_browse_link(iri: str): RDFS.label: {literal('value-search with text', language='en')}, RDFS.comment: {literal('value-search with text', language='en')}, DCTERMS.description: {_literal_markdown(''' -search for iri values used as `license` that have "cc" in their label +search for IRI values used as `license` that have "cc" in their label (`rdfs:label`, `dcterms:title`, or `foaf:name`) -uses query parameter: -``` -valueSearchPropertyPath=license -valueSearchText=cc -``` +uses query parameters: + +* `valueSearchPropertyPath=license` +* `valueSearchText=cc` ''', language='en')}, RDF.value: {literal('/trove/index-value-search?valueSearchPropertyPath=license&valueSearchText=cc&acceptMediatype=application/vnd.api%2Bjson')}, }), @@ -532,17 +519,24 @@ def trove_browse_link(iri: str): RDFS.comment: {literal('free-text search query', language='en')}, TROVE.jsonSchema: {literal_json({'type': 'string'})}, DCTERMS.description: {_literal_markdown('''**cardSearchText** is -a query 
parameter for free-text search, e.g. `cardSearchText=foo` +a query parameter for free-text search within an index-card. + +accepts comma-separated property-paths in an optional bracketed parameter, +e.g. `cardSearchText[title,description]=foo` +(without brackets equivalent to `cardSearchText[*]`, matching any property-path of length one from the index-card focus). -special characters in search text: +different index-strategies may parse and process search text differently +-- the current default index-strategy supports these special characters (to use them literally, precede with backslash (`\\`)) +* `+` signifies AND operation (default) +* `|` signifies OR operation +* `-` negates a single token +* `"` wraps a number of tokens to signify a phrase for searching +* `*` at the end of a term signifies a prefix query +* `(` and `)` signify precedence +* `~N` (where N is an integer) after a word signifies edit distance (fuzziness) +* `~N` (where N is an integer) after a phrase signifies slop amount -* `"` (double quotes): use on both sides of a word or phrase to require exact text match - -- without quotes, text search is fuzzier and more approximate -* `-` (hyphen): use before a word or quoted phrase (before the leading `"`) to require - that the exact word or phrase be absent -accepts comma-separated property-paths in an optional bracketed parameter (default -`*]`, matches any one property), e.g. 
`cardSearchText[title,description]=foo` ''', language='en')}, }, TROVE.cardSearchFilter: { @@ -560,11 +554,11 @@ def trove_browse_link(iri: str): * `propertypath_set`: comma-separated **property-path** set * `filter_operator`: any one of the operators defined below -* `value_iris`: comma-separated iri set +* `value_iris`: comma-separated IRI set ### filter operators -operators on iri values: +operators on IRI values: * `any-of` (default): at least one of the value iris * `none-of`: none of the value iris @@ -602,10 +596,10 @@ def trove_browse_link(iri: str): RDFS.comment: {literal('free-text search (within a title, name, or label associated with an IRI)', language='en')}, TROVE.jsonSchema: {literal_json({'type': 'string'})}, DCTERMS.description: {_literal_markdown('''**valueSearchText** is -a query parameter that matches text closely associated with each value -(specifically `dcterms:title`, `foaf:name`, and `rdfs:label`) +a query parameter to narrow an index-value-search by free-text search. -note: does not accept any bracketed parameters +behaves like `cardSearchText` except that paths are interpreted relative to +(non-focus) IRI values within each index-card. 
''', language='en')}, }, TROVE.indexCardId: { @@ -629,7 +623,7 @@ def trove_browse_link(iri: str): it may be used only two ways: -* `valueSearchFilter[sameAs]=<iri>` to request a specific value by IRI +* `valueSearchFilter[sameAs]=<IRI>` to request a specific value by IRI * `valueSearchFilter[resourceType]=<IRI>` to request values used with `rdf:type <IRI>` ''', language='en')}, }, @@ -676,7 +670,7 @@ def trove_browse_link(iri: str): to sort by date values, use `sort` (or `sort[date-value]`) with a **property-path** that ends with one of the following supported date properties: -{", ".join(f"`{osfmap_shorthand().compact_iri(_date_iri)}`" for _date_iri in DATE_PROPERTIES)} +{", ".join(f"`{osfmap.osfmap_json_shorthand().compact_iri(_date_iri)}`" for _date_iri in osfmap.DATE_PROPERTIES)} to sort by integer values, use `sort[integer-value]` with a **property-path** to the integers of interest. @@ -723,22 +717,22 @@ def trove_browse_link(iri: str): DCTERMS.description: {_literal_markdown(f'''a **property-path** is a dot-separated path of short-hand IRIs, used in several api parameters -currently the only supported shorthand is defined by [OSFMAP]({OSFMAP_LINK}) +currently the only supported shorthand is defined by [OSFMAP]({osfmap.OSFMAP_LINK}) for example, `creator.name` is parsed as a two-step path that follows `creator` (aka `dcterms:creator`, `<http://purl.org/dc/terms/creator>`) and then `name` (aka `foaf:name`, `<http://xmlns.com/foaf/0.1/name>`) most places that allow one property-path also accept a comma-separated set of paths, like `title,description` (which is parsed as two paths: `title` and `description`) -or `creator.name,affiliation.name,funder.name` (which is parsed as three paths: `creator.name`, -`affiliation.name`, and `funder.name`) +or `affiliation,creator.affiliation,funder` (which is parsed as three paths: `affiliation`, +`creator.affiliation`, and `funder`) the special path segment `*` matches any property -* `*`: match text values one step away from the focus -* `*.*`: match text values exactly two steps away -* `*,*.*`: match 
text values one OR two steps away -* `*,creator.name`: match text values one step away OR at the specific path `creator.name` +* `*`: match values one step away from the focus +* `*.*`: match values exactly two steps away +* `*,*.*`: match values one OR two steps away +* `*,creator`: match values one step away OR at the specific path `creator` (currently, if a path contains `*`, then every step must be `*` -- mixed paths like `*.affiliation` are not supported) @@ -840,13 +834,13 @@ def trove_browse_link(iri: str): @functools.cache -def trove_shorthand() -> IriShorthand: +def trove_json_shorthand() -> IriShorthand: '''build iri shorthand that includes unprefixed terms (as defined in TROVE_API_THESAURUS) ''' return build_shorthand_from_thesaurus( thesaurus=TROVE_API_THESAURUS, label_predicate=JSONAPI_MEMBERNAME, - base_shorthand=NAMESPACES_SHORTHAND, + base_shorthand=namespaces_shorthand(), )