diff --git a/share/admin/search.py b/share/admin/search.py index 95614a0fc..8894346dd 100644 --- a/share/admin/search.py +++ b/share/admin/search.py @@ -33,7 +33,7 @@ def search_indexes_view(request): if request.method == 'POST': _index_strategy = parse_strategy_name(request.POST['strategy_name']) _pls_doer = PLS_DOERS[request.POST['pls_do']] - _pls_doer(_index_strategy) + _pls_doer(_index_strategy, request.POST) _redirect_id = _index_strategy.strategy_name return HttpResponseRedirect('#'.join((request.path, _redirect_id))) @@ -104,35 +104,35 @@ def _serialize_backfill( } -def _pls_setup(index_strategy: IndexStrategy): +def _pls_setup(index_strategy: IndexStrategy, request_kwargs): assert index_strategy.is_current index_strategy.pls_setup() -def _pls_start_keeping_live(index_strategy: IndexStrategy): +def _pls_start_keeping_live(index_strategy: IndexStrategy, request_kwargs): index_strategy.pls_start_keeping_live() -def _pls_stop_keeping_live(index_strategy: IndexStrategy): +def _pls_stop_keeping_live(index_strategy: IndexStrategy, request_kwargs): index_strategy.pls_stop_keeping_live() -def _pls_start_backfill(index_strategy: IndexStrategy): +def _pls_start_backfill(index_strategy: IndexStrategy, request_kwargs): assert index_strategy.is_current index_strategy.pls_start_backfill() -def _pls_mark_backfill_complete(index_strategy: IndexStrategy): +def _pls_mark_backfill_complete(index_strategy: IndexStrategy, request_kwargs): index_strategy.pls_mark_backfill_complete() -def _pls_make_default_for_searching(index_strategy: IndexStrategy): +def _pls_make_default_for_searching(index_strategy: IndexStrategy, request_kwargs): index_strategy.pls_make_default_for_searching() -def _pls_delete(index_strategy: IndexStrategy): - assert not index_strategy.is_current - index_strategy.pls_teardown() +def _pls_delete(index_strategy: IndexStrategy, request_kwargs): + if request_kwargs.get('really') == 'really really': + index_strategy.pls_teardown() PLS_DOERS = { diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py index b106cf3f8..e38872712 100644 --- a/share/search/index_strategy/_trovesearch_util.py +++ b/share/search/index_strategy/_trovesearch_util.py @@ -19,16 +19,12 @@ ) from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri from trove.vocab.namespaces import ( - DCTERMS, OWL, RDF, TROVE, XSD, ) -from trove.vocab.osfmap import ( - is_date_property, - SKIPPABLE_PROPERTIES, -) +from trove.vocab import osfmap _logger = logging.getLogger(__name__) @@ -45,7 +41,6 @@ 'type': 'text', 'index_options': 'offsets', # for highlighting } -TEXT_PATH_DEPTH_MAX = 1 ### @@ -88,15 +83,6 @@ def iris_synonyms(iris: typing.Iterable[str], rdfdoc: rdf.RdfGraph) -> set[str]: } -def should_skip_path(path: Propertypath) -> bool: - _last = path[-1] - if _last in SKIPPABLE_PROPERTIES: - return True - if len(path) > 1 and _last == DCTERMS.identifier: - return True - return False - - def propertypath_as_keyword(path: Propertypath) -> str: assert not is_globpath(path) return json.dumps(path) @@ -124,6 +110,18 @@ def _dict_of_sets(): @dataclasses.dataclass class GraphWalk: + '''GraphWalk: a recorded traversal of an RDF graph from a focus resource + + (note: traversal performed greedily in `__post_init__`, filling values and paths fields + -- don't instantiate early or keep it around longer than needed) + + auto-filled fields: + - `paths_walked` contains all (unique, acyclic) predicate-paths followed from the focus + - `iri_values` contains all IRIs encountered as objects along those paths + - `text_values`, `date_values`, and `integer_values` contain literal values encountered + "close to" the focus (meaning no IRI-identified resources along the path), with special + exception to include more distant paths from osfmap.EXTRA_INDEXED_LITERAL_PATHS + ''' rdfdoc: rdf.RdfGraph focus_iri: str already_visiting: set[str] = dataclasses.field(default_factory=set) @@ -144,19 +142,10 @@ class GraphWalk: def __post_init__(self): for _walk_path, _walk_obj in self._walk_from_subject(self.focus_iri): self.paths_walked.add(_walk_path) - if isinstance(_walk_obj, str): - self.iri_values[_walk_path].add(_walk_obj) - elif isinstance(_walk_obj, datetime.date): + if isinstance(_walk_obj, datetime.date): self.date_values[_walk_path].add(_walk_obj) - elif isinstance(_walk_obj, int): - self.integer_values[_walk_path].add(_walk_obj) - elif isinstance(_walk_obj, rdf.Literal): - if XSD.integer in _walk_obj.datatype_iris: - self.integer_values[_walk_path].add(int(_walk_obj.unicode_value)) - if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris): - self.text_values[_walk_path].add(_walk_obj) - # try for date in a date property, regardless of the above - if is_date_property(_walk_path[-1]) and isinstance(_walk_obj, (str, rdf.Literal)): + elif osfmap.is_date_property(_walk_path[-1]): # note: osfmap-specific + # index date properties only as dates _date_str = ( _walk_obj.unicode_value if isinstance(_walk_obj, rdf.Literal) @@ -168,20 +157,55 @@ def __post_init__(self): _logger.debug('skipping malformatted date "%s"', _date_str) else: self.date_values[_walk_path].add(_parsed_date) + elif isinstance(_walk_obj, str): + self.iri_values[_walk_path].add(_walk_obj) + elif isinstance(_walk_obj, int): + self.integer_values[_walk_path].add(_walk_obj) + elif isinstance(_walk_obj, rdf.Literal): + if XSD.integer in _walk_obj.datatype_iris: + self.integer_values[_walk_path].add(int(_walk_obj.unicode_value)) + if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris): + self.text_values[_walk_path].add(_walk_obj) def shortwalk_from(self, from_iri: str) -> GraphWalk: return GraphWalk( self.rdfdoc, from_iri, - already_visiting={self.focus_iri}, + already_visiting={*self.already_visiting, self.focus_iri}, ) + def _should_keep_literal( + self, + path: Propertypath, + obj: rdf.Literal, + *, + close_to_focus: bool = True, + ) -> bool: + assert path + if path in osfmap.EXTRA_INDEXED_LITERAL_PATHS: # note: osfmap-specific + return True + return ( + close_to_focus + and path[-1] not in osfmap.SKIPPABLE_PROPERTIES # note: osfmap-specific + ) + + def _should_keep_related_resource( + self, + path: Propertypath, + obj: rdf.RdfObject, + ) -> bool: + assert path + return (path[-1] not in osfmap.SKIPPABLE_PROPERTIES) # note: osfmap-specific + def _walk_from_subject( self, iri: str, path_so_far: tuple[str, ...] = (), ) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]: '''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object + + if `path_so_far` is non-empty then only IRIs will be yielded, not literal values + (recommend value-search to find the IRIs you need) ''' if iri in self.already_visiting: return @@ -189,17 +213,20 @@ def _walk_from_subject( _twoples = self.rdfdoc.tripledict.get(iri, {}) for _next_steps, _obj in walk_twoples(_twoples): _path = (*path_so_far, *_next_steps) - if not should_skip_path(_path): - yield (_path, _obj) - if isinstance(_obj, str): # step further for iri + if isinstance(_obj, str): # IRI + if self._should_keep_related_resource(_path, _obj): + yield (_path, _obj) yield from self._walk_from_subject(_obj, path_so_far=_path) + elif self._should_keep_literal(_path, _obj, close_to_focus=(not path_so_far)): + yield (_path, _obj) @functools.cached_property - def paths_by_iri(self) -> defaultdict[str, set[Propertypath]]: + def paths_by_iri(self) -> dict[str, set[Propertypath]]: _paths_by_iri: defaultdict[str, set[Propertypath]] = defaultdict(set) for _path, _iris in self.iri_values.items(): for _iri in _iris: _paths_by_iri[_iri].add(_path) + _paths_by_iri.default_factory = None # now behave as a normal dictionary return _paths_by_iri @contextlib.contextmanager diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py index 9257cd3fd..2a40d1211 100644 --- a/share/search/index_strategy/trovesearch_denorm.py +++ b/share/search/index_strategy/trovesearch_denorm.py @@ -455,9 +455,7 @@ def _texts_at_properties(self, walk: ts.GraphWalk, properties: Iterable[str]): def _texts_by_depth(self, walk: ts.GraphWalk): _by_depth: dict[int, set[str]] = defaultdict(set) for _path, _text_set in walk.text_values.items(): - _depth = len(_path) - if _depth <= ts.TEXT_PATH_DEPTH_MAX: - _by_depth[_depth].update(_text.unicode_value for _text in _text_set) + _by_depth[len(_path)].update(_text.unicode_value for _text in _text_set) return { _depth_field_name(_depth): list(_value_set) for _depth, _value_set in _by_depth.items() diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html index 40e1a7347..9f2e6bb30 100644 --- a/templates/admin/search-indexes.html +++ b/templates/admin/search-indexes.html @@ -81,6 +81,15 @@

current: {{ strategy_info.status.strategy_id }}

{% trans "backfill" %}:{{ strategy_info.backfill.backfill_status }}

{% endif %} + {% if strategy_info.status.is_set_up %} +
+ {% csrf_token %} + + + + +
+ {% endif %} @@ -139,8 +148,8 @@

prior: {{ prior_strategy_status.strategy_id }}

{% csrf_token %} - + {% endif %} diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py index 419b595ce..4f9112127 100644 --- a/trove/vocab/osfmap.py +++ b/trove/vocab/osfmap.py @@ -926,6 +926,9 @@ def osfmap_shorthand() -> IriShorthand: LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel) NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES) SKIPPABLE_PROPERTIES = (OSFMAP.contains, OWL.sameAs) +EXTRA_INDEXED_LITERAL_PATHS = frozenset(( + (DCTERMS.creator, FOAF.name), +)) DEFAULT_TABULAR_SEARCH_COLUMN_PATHS: tuple[tuple[str, ...], ...] = ( (OWL.sameAs,), # includes DOI