From 7467691e0a9b9ddb310db5197d85e0ca2cc01605 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Fri, 28 Feb 2025 13:42:34 -0500
Subject: [PATCH 1/3] fix: less absurd graph-walk bounds
---
.../index_strategy/_trovesearch_util.py | 91 ++++++++++++-------
.../index_strategy/trovesearch_denorm.py | 4 +-
trove/vocab/osfmap.py | 3 +
3 files changed, 63 insertions(+), 35 deletions(-)
diff --git a/share/search/index_strategy/_trovesearch_util.py b/share/search/index_strategy/_trovesearch_util.py
index b106cf3f8..e38872712 100644
--- a/share/search/index_strategy/_trovesearch_util.py
+++ b/share/search/index_strategy/_trovesearch_util.py
@@ -19,16 +19,12 @@
)
from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri
from trove.vocab.namespaces import (
- DCTERMS,
OWL,
RDF,
TROVE,
XSD,
)
-from trove.vocab.osfmap import (
- is_date_property,
- SKIPPABLE_PROPERTIES,
-)
+from trove.vocab import osfmap
_logger = logging.getLogger(__name__)
@@ -45,7 +41,6 @@
'type': 'text',
'index_options': 'offsets', # for highlighting
}
-TEXT_PATH_DEPTH_MAX = 1
###
@@ -88,15 +83,6 @@ def iris_synonyms(iris: typing.Iterable[str], rdfdoc: rdf.RdfGraph) -> set[str]:
}
-def should_skip_path(path: Propertypath) -> bool:
- _last = path[-1]
- if _last in SKIPPABLE_PROPERTIES:
- return True
- if len(path) > 1 and _last == DCTERMS.identifier:
- return True
- return False
-
-
def propertypath_as_keyword(path: Propertypath) -> str:
assert not is_globpath(path)
return json.dumps(path)
@@ -124,6 +110,18 @@ def _dict_of_sets():
@dataclasses.dataclass
class GraphWalk:
+ '''GraphWalk: a recorded traversal of an RDF graph from a focus resource
+
+ (note: traversal is performed eagerly in `__post_init__`, filling the values and paths fields
+ -- don't instantiate early or keep it around longer than needed)
+
+ auto-filled fields:
+ - `paths_walked` contains all (unique, acyclic) predicate-paths followed from the focus
+ - `iri_values` contains all IRIs encountered as objects along those paths
+ - `text_values`, `date_values`, and `integer_values` contain literal values encountered
+ "close to" the focus (meaning no IRI-identified resources along the path), plus, as a special
+ exception, literals at the more distant paths listed in osfmap.EXTRA_INDEXED_LITERAL_PATHS
+ '''
rdfdoc: rdf.RdfGraph
focus_iri: str
already_visiting: set[str] = dataclasses.field(default_factory=set)
@@ -144,19 +142,10 @@ class GraphWalk:
def __post_init__(self):
for _walk_path, _walk_obj in self._walk_from_subject(self.focus_iri):
self.paths_walked.add(_walk_path)
- if isinstance(_walk_obj, str):
- self.iri_values[_walk_path].add(_walk_obj)
- elif isinstance(_walk_obj, datetime.date):
+ if isinstance(_walk_obj, datetime.date):
self.date_values[_walk_path].add(_walk_obj)
- elif isinstance(_walk_obj, int):
- self.integer_values[_walk_path].add(_walk_obj)
- elif isinstance(_walk_obj, rdf.Literal):
- if XSD.integer in _walk_obj.datatype_iris:
- self.integer_values[_walk_path].add(int(_walk_obj.unicode_value))
- if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris):
- self.text_values[_walk_path].add(_walk_obj)
- # try for date in a date property, regardless of the above
- if is_date_property(_walk_path[-1]) and isinstance(_walk_obj, (str, rdf.Literal)):
+ elif osfmap.is_date_property(_walk_path[-1]): # note: osfmap-specific
+ # index date properties only as dates
_date_str = (
_walk_obj.unicode_value
if isinstance(_walk_obj, rdf.Literal)
@@ -168,20 +157,55 @@ def __post_init__(self):
_logger.debug('skipping malformatted date "%s"', _date_str)
else:
self.date_values[_walk_path].add(_parsed_date)
+ elif isinstance(_walk_obj, str):
+ self.iri_values[_walk_path].add(_walk_obj)
+ elif isinstance(_walk_obj, int):
+ self.integer_values[_walk_path].add(_walk_obj)
+ elif isinstance(_walk_obj, rdf.Literal):
+ if XSD.integer in _walk_obj.datatype_iris:
+ self.integer_values[_walk_path].add(int(_walk_obj.unicode_value))
+ if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris):
+ self.text_values[_walk_path].add(_walk_obj)
def shortwalk_from(self, from_iri: str) -> GraphWalk:
return GraphWalk(
self.rdfdoc,
from_iri,
- already_visiting={self.focus_iri},
+ already_visiting={*self.already_visiting, self.focus_iri},
)
+ def _should_keep_literal(
+ self,
+ path: Propertypath,
+ obj: rdf.Literal,
+ *,
+ close_to_focus: bool = True,
+ ) -> bool:
+ assert path
+ if path in osfmap.EXTRA_INDEXED_LITERAL_PATHS: # note: osfmap-specific
+ return True
+ return (
+ close_to_focus
+ and path[-1] not in osfmap.SKIPPABLE_PROPERTIES # note: osfmap-specific
+ )
+
+ def _should_keep_related_resource(
+ self,
+ path: Propertypath,
+ obj: rdf.RdfObject,
+ ) -> bool:
+ assert path
+ return (path[-1] not in osfmap.SKIPPABLE_PROPERTIES) # note: osfmap-specific
+
def _walk_from_subject(
self,
iri: str,
path_so_far: tuple[str, ...] = (),
) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]:
'''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object
+
+ if `path_so_far` is non-empty then literal values are yielded only at paths listed in
+ osfmap.EXTRA_INDEXED_LITERAL_PATHS; IRIs are still yielded (use value-search to find other IRIs)
'''
if iri in self.already_visiting:
return
@@ -189,17 +213,20 @@ def _walk_from_subject(
_twoples = self.rdfdoc.tripledict.get(iri, {})
for _next_steps, _obj in walk_twoples(_twoples):
_path = (*path_so_far, *_next_steps)
- if not should_skip_path(_path):
- yield (_path, _obj)
- if isinstance(_obj, str): # step further for iri
+ if isinstance(_obj, str): # IRI
+ if self._should_keep_related_resource(_path, _obj):
+ yield (_path, _obj)
yield from self._walk_from_subject(_obj, path_so_far=_path)
+ elif self._should_keep_literal(_path, _obj, close_to_focus=(not path_so_far)):
+ yield (_path, _obj)
@functools.cached_property
- def paths_by_iri(self) -> defaultdict[str, set[Propertypath]]:
+ def paths_by_iri(self) -> dict[str, set[Propertypath]]:
_paths_by_iri: defaultdict[str, set[Propertypath]] = defaultdict(set)
for _path, _iris in self.iri_values.items():
for _iri in _iris:
_paths_by_iri[_iri].add(_path)
+ _paths_by_iri.default_factory = None # now behave as a normal dictionary
return _paths_by_iri
@contextlib.contextmanager
diff --git a/share/search/index_strategy/trovesearch_denorm.py b/share/search/index_strategy/trovesearch_denorm.py
index 9257cd3fd..2a40d1211 100644
--- a/share/search/index_strategy/trovesearch_denorm.py
+++ b/share/search/index_strategy/trovesearch_denorm.py
@@ -455,9 +455,7 @@ def _texts_at_properties(self, walk: ts.GraphWalk, properties: Iterable[str]):
def _texts_by_depth(self, walk: ts.GraphWalk):
_by_depth: dict[int, set[str]] = defaultdict(set)
for _path, _text_set in walk.text_values.items():
- _depth = len(_path)
- if _depth <= ts.TEXT_PATH_DEPTH_MAX:
- _by_depth[_depth].update(_text.unicode_value for _text in _text_set)
+ _by_depth[len(_path)].update(_text.unicode_value for _text in _text_set)
return {
_depth_field_name(_depth): list(_value_set)
for _depth, _value_set in _by_depth.items()
diff --git a/trove/vocab/osfmap.py b/trove/vocab/osfmap.py
index 419b595ce..4f9112127 100644
--- a/trove/vocab/osfmap.py
+++ b/trove/vocab/osfmap.py
@@ -926,6 +926,9 @@ def osfmap_shorthand() -> IriShorthand:
LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)
SKIPPABLE_PROPERTIES = (OSFMAP.contains, OWL.sameAs)
+EXTRA_INDEXED_LITERAL_PATHS = frozenset((
+ (DCTERMS.creator, FOAF.name),
+))
DEFAULT_TABULAR_SEARCH_COLUMN_PATHS: tuple[tuple[str, ...], ...] = (
(OWL.sameAs,), # includes DOI
From ce27fa1552bd2c1fbc6b12147e2f7f076244cef0 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Fri, 28 Feb 2025 14:17:56 -0500
Subject: [PATCH 2/3] add: confirm deletion on admin search page
---
share/admin/search.py | 20 ++++++++++----------
templates/admin/search-indexes.html | 11 ++++++++++-
2 files changed, 20 insertions(+), 11 deletions(-)
diff --git a/share/admin/search.py b/share/admin/search.py
index 95614a0fc..c69354fe2 100644
--- a/share/admin/search.py
+++ b/share/admin/search.py
@@ -33,7 +33,7 @@ def search_indexes_view(request):
if request.method == 'POST':
_index_strategy = parse_strategy_name(request.POST['strategy_name'])
_pls_doer = PLS_DOERS[request.POST['pls_do']]
- _pls_doer(_index_strategy)
+ _pls_doer(_index_strategy, request.POST)
_redirect_id = _index_strategy.strategy_name
return HttpResponseRedirect('#'.join((request.path, _redirect_id)))
@@ -104,35 +104,35 @@ def _serialize_backfill(
}
-def _pls_setup(index_strategy: IndexStrategy):
+def _pls_setup(index_strategy: IndexStrategy, request_kwargs):
assert index_strategy.is_current
index_strategy.pls_setup()
-def _pls_start_keeping_live(index_strategy: IndexStrategy):
+def _pls_start_keeping_live(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_start_keeping_live()
-def _pls_stop_keeping_live(index_strategy: IndexStrategy):
+def _pls_stop_keeping_live(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_stop_keeping_live()
-def _pls_start_backfill(index_strategy: IndexStrategy):
+def _pls_start_backfill(index_strategy: IndexStrategy, request_kwargs):
assert index_strategy.is_current
index_strategy.pls_start_backfill()
-def _pls_mark_backfill_complete(index_strategy: IndexStrategy):
+def _pls_mark_backfill_complete(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_mark_backfill_complete()
-def _pls_make_default_for_searching(index_strategy: IndexStrategy):
+def _pls_make_default_for_searching(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_make_default_for_searching()
-def _pls_delete(index_strategy: IndexStrategy):
- assert not index_strategy.is_current
- index_strategy.pls_teardown()
+def _pls_delete(index_strategy: IndexStrategy, request_kwargs):
+ if request_kwargs.get('really') == 'really':
+ index_strategy.pls_teardown()
PLS_DOERS = {
diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html
index 40e1a7347..05d2067cd 100644
--- a/templates/admin/search-indexes.html
+++ b/templates/admin/search-indexes.html
@@ -81,6 +81,15 @@ current: {{ strategy_info.status.strategy_id }}
{% trans "backfill" %}:{{ strategy_info.backfill.backfill_status }}
{% endif %}
+ {% if strategy_info.status.is_set_up %}
+
+ {% endif %}
@@ -139,8 +148,8 @@ prior: {{ prior_strategy_status.strategy_id }}
{% csrf_token %}
-
+
{% endif %}
From 2652b620c978f352a5f86f09bb8d6ebdc59b10d4 Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Fri, 28 Feb 2025 14:57:42 -0500
Subject: [PATCH 3/3] fix: really really
---
share/admin/search.py | 2 +-
templates/admin/search-indexes.html | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/share/admin/search.py b/share/admin/search.py
index c69354fe2..8894346dd 100644
--- a/share/admin/search.py
+++ b/share/admin/search.py
@@ -131,7 +131,7 @@ def _pls_make_default_for_searching(index_strategy: IndexStrategy, request_kwarg
def _pls_delete(index_strategy: IndexStrategy, request_kwargs):
- if request_kwargs.get('really') == 'really':
+ if request_kwargs.get('really') == 'really really':
index_strategy.pls_teardown()
diff --git a/templates/admin/search-indexes.html b/templates/admin/search-indexes.html
index 05d2067cd..9f2e6bb30 100644
--- a/templates/admin/search-indexes.html
+++ b/templates/admin/search-indexes.html
@@ -87,7 +87,7 @@ current: {{ strategy_info.status.strategy_id }}
-
+
{% endif %}
@@ -149,7 +149,7 @@ prior: {{ prior_strategy_status.strategy_id }}
-
+
{% endif %}