Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions share/admin/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def search_indexes_view(request):
if request.method == 'POST':
_index_strategy = parse_strategy_name(request.POST['strategy_name'])
_pls_doer = PLS_DOERS[request.POST['pls_do']]
_pls_doer(_index_strategy)
_pls_doer(_index_strategy, request.POST)
_redirect_id = _index_strategy.strategy_name
return HttpResponseRedirect('#'.join((request.path, _redirect_id)))

Expand Down Expand Up @@ -104,35 +104,35 @@ def _serialize_backfill(
}


def _pls_setup(index_strategy: IndexStrategy):
def _pls_setup(index_strategy: IndexStrategy, request_kwargs):
assert index_strategy.is_current
index_strategy.pls_setup()


def _pls_start_keeping_live(index_strategy: IndexStrategy):
def _pls_start_keeping_live(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_start_keeping_live()


def _pls_stop_keeping_live(index_strategy: IndexStrategy):
def _pls_stop_keeping_live(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_stop_keeping_live()


def _pls_start_backfill(index_strategy: IndexStrategy):
def _pls_start_backfill(index_strategy: IndexStrategy, request_kwargs):
assert index_strategy.is_current
index_strategy.pls_start_backfill()


def _pls_mark_backfill_complete(index_strategy: IndexStrategy):
def _pls_mark_backfill_complete(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_mark_backfill_complete()


def _pls_make_default_for_searching(index_strategy: IndexStrategy):
def _pls_make_default_for_searching(index_strategy: IndexStrategy, request_kwargs):
index_strategy.pls_make_default_for_searching()


def _pls_delete(index_strategy: IndexStrategy):
assert not index_strategy.is_current
index_strategy.pls_teardown()
def _pls_delete(index_strategy: IndexStrategy, request_kwargs):
if request_kwargs.get('really') == 'really really':
index_strategy.pls_teardown()


PLS_DOERS = {
Expand Down
91 changes: 59 additions & 32 deletions share/search/index_strategy/_trovesearch_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,12 @@
)
from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri
from trove.vocab.namespaces import (
DCTERMS,
OWL,
RDF,
TROVE,
XSD,
)
from trove.vocab.osfmap import (
is_date_property,
SKIPPABLE_PROPERTIES,
)
from trove.vocab import osfmap


_logger = logging.getLogger(__name__)
Expand All @@ -45,7 +41,6 @@
'type': 'text',
'index_options': 'offsets', # for highlighting
}
TEXT_PATH_DEPTH_MAX = 1


###
Expand Down Expand Up @@ -88,15 +83,6 @@ def iris_synonyms(iris: typing.Iterable[str], rdfdoc: rdf.RdfGraph) -> set[str]:
}


def should_skip_path(path: Propertypath) -> bool:
_last = path[-1]
if _last in SKIPPABLE_PROPERTIES:
return True
if len(path) > 1 and _last == DCTERMS.identifier:
return True
return False


def propertypath_as_keyword(path: Propertypath) -> str:
assert not is_globpath(path)
return json.dumps(path)
Expand Down Expand Up @@ -124,6 +110,18 @@ def _dict_of_sets():

@dataclasses.dataclass
class GraphWalk:
'''GraphWalk: a recorded traversal of an RDF graph from a focus resource

(note: traversal performed greedily in `__post_init__`, filling values and paths fields
-- don't instantiate early or keep it around longer than needed)

auto-filled fields:
- `paths_walked` contains all (unique, acyclic) predicate-paths followed from the focus
- `iri_values` contains all IRIs encountered as objects along those paths
- `text_values`, `date_values`, and `integer_values` contain literal values encountered
"close to" the focus (meaning no IRI-identified resources along the path), with special
exception to include more distant paths from osfmap.EXTRA_INDEXED_LITERAL_PATHS
'''
rdfdoc: rdf.RdfGraph
focus_iri: str
already_visiting: set[str] = dataclasses.field(default_factory=set)
Expand All @@ -144,19 +142,10 @@ class GraphWalk:
def __post_init__(self):
for _walk_path, _walk_obj in self._walk_from_subject(self.focus_iri):
self.paths_walked.add(_walk_path)
if isinstance(_walk_obj, str):
self.iri_values[_walk_path].add(_walk_obj)
elif isinstance(_walk_obj, datetime.date):
if isinstance(_walk_obj, datetime.date):
self.date_values[_walk_path].add(_walk_obj)
elif isinstance(_walk_obj, int):
self.integer_values[_walk_path].add(_walk_obj)
elif isinstance(_walk_obj, rdf.Literal):
if XSD.integer in _walk_obj.datatype_iris:
self.integer_values[_walk_path].add(int(_walk_obj.unicode_value))
if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris):
self.text_values[_walk_path].add(_walk_obj)
# try for date in a date property, regardless of the above
if is_date_property(_walk_path[-1]) and isinstance(_walk_obj, (str, rdf.Literal)):
elif osfmap.is_date_property(_walk_path[-1]): # note: osfmap-specific
# index date properties only as dates
_date_str = (
_walk_obj.unicode_value
if isinstance(_walk_obj, rdf.Literal)
Expand All @@ -168,38 +157,76 @@ def __post_init__(self):
_logger.debug('skipping malformatted date "%s"', _date_str)
else:
self.date_values[_walk_path].add(_parsed_date)
elif isinstance(_walk_obj, str):
self.iri_values[_walk_path].add(_walk_obj)
elif isinstance(_walk_obj, int):
self.integer_values[_walk_path].add(_walk_obj)
elif isinstance(_walk_obj, rdf.Literal):
if XSD.integer in _walk_obj.datatype_iris:
self.integer_values[_walk_path].add(int(_walk_obj.unicode_value))
if {RDF.string, RDF.langString}.intersection(_walk_obj.datatype_iris):
self.text_values[_walk_path].add(_walk_obj)

def shortwalk_from(self, from_iri: str) -> GraphWalk:
return GraphWalk(
self.rdfdoc,
from_iri,
already_visiting={self.focus_iri},
already_visiting={*self.already_visiting, self.focus_iri},
)

def _should_keep_literal(
    self,
    path: Propertypath,
    obj: rdf.Literal,
    *,
    close_to_focus: bool = True,
) -> bool:
    '''decide whether a literal found at the end of `path` should be kept

    a literal is kept when either:
    - `path` is one of `osfmap.EXTRA_INDEXED_LITERAL_PATHS` (kept regardless
      of `close_to_focus`), or
    - `close_to_focus` is true and the last step of `path` is not in
      `osfmap.SKIPPABLE_PROPERTIES`

    `path` must be non-empty; `obj` is accepted but not consulted here
    '''
    assert path
    # note: osfmap-specific
    _is_extra_indexed = path in osfmap.EXTRA_INDEXED_LITERAL_PATHS
    return _is_extra_indexed or (
        close_to_focus
        and path[-1] not in osfmap.SKIPPABLE_PROPERTIES  # note: osfmap-specific
    )

def _should_keep_related_resource(
    self,
    path: Propertypath,
    obj: rdf.RdfObject,
) -> bool:
    '''decide whether a related resource found at the end of `path` should be kept

    kept unless the last step of `path` is in `osfmap.SKIPPABLE_PROPERTIES`

    `path` must be non-empty; `obj` is accepted but not consulted here
    '''
    assert path
    _last_step = path[-1]
    return _last_step not in osfmap.SKIPPABLE_PROPERTIES  # note: osfmap-specific

def _walk_from_subject(
self,
iri: str,
path_so_far: tuple[str, ...] = (),
) -> typing.Iterator[tuple[Propertypath, rdf.RdfObject]]:
'''walk the graph from the given subject, yielding (pathkey, obj) for every reachable object

if `path_so_far` is non-empty then only IRIs will be yielded, not literal values
(recommend value-search to find the IRIs you need)
'''
if iri in self.already_visiting:
return
with self._visit(iri):
_twoples = self.rdfdoc.tripledict.get(iri, {})
for _next_steps, _obj in walk_twoples(_twoples):
_path = (*path_so_far, *_next_steps)
if not should_skip_path(_path):
yield (_path, _obj)
if isinstance(_obj, str): # step further for iri
if isinstance(_obj, str): # IRI
if self._should_keep_related_resource(_path, _obj):
yield (_path, _obj)
yield from self._walk_from_subject(_obj, path_so_far=_path)
elif self._should_keep_literal(_path, _obj, close_to_focus=(not path_so_far)):
yield (_path, _obj)

@functools.cached_property
def paths_by_iri(self) -> defaultdict[str, set[Propertypath]]:
def paths_by_iri(self) -> dict[str, set[Propertypath]]:
_paths_by_iri: defaultdict[str, set[Propertypath]] = defaultdict(set)
for _path, _iris in self.iri_values.items():
for _iri in _iris:
_paths_by_iri[_iri].add(_path)
_paths_by_iri.default_factory = None # now behave as a normal dictionary
return _paths_by_iri

@contextlib.contextmanager
Expand Down
4 changes: 1 addition & 3 deletions share/search/index_strategy/trovesearch_denorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,9 +455,7 @@ def _texts_at_properties(self, walk: ts.GraphWalk, properties: Iterable[str]):
def _texts_by_depth(self, walk: ts.GraphWalk):
_by_depth: dict[int, set[str]] = defaultdict(set)
for _path, _text_set in walk.text_values.items():
_depth = len(_path)
if _depth <= ts.TEXT_PATH_DEPTH_MAX:
_by_depth[_depth].update(_text.unicode_value for _text in _text_set)
_by_depth[len(_path)].update(_text.unicode_value for _text in _text_set)
return {
_depth_field_name(_depth): list(_value_set)
for _depth, _value_set in _by_depth.items()
Expand Down
11 changes: 10 additions & 1 deletion templates/admin/search-indexes.html
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,15 @@ <h3>current: {{ strategy_info.status.strategy_id }}</h3>
{% trans "backfill" %}:{{ strategy_info.backfill.backfill_status }}
</a></p>
{% endif %}
{% if strategy_info.status.is_set_up %}
<form method="post">
{% csrf_token %}
<input type="hidden" name="strategy_name" value="{{index_strategy_name}}" />
<input type="hidden" name="pls_do" value="delete" />
<input type="submit" value="{% trans "delete" %}" />
<label>{% trans "really? (you have to type 'really really')" %}<input type="text" name="really"/></label>
</form>
{% endif %}
</nav>
<table>
<tr>
Expand Down Expand Up @@ -139,8 +148,8 @@ <h3>prior: {{ prior_strategy_status.strategy_id }}</h3>
{% csrf_token %}
<input type="hidden" name="strategy_name" value="{{prior_strategy_status.strategy_id}}" />
<input type="hidden" name="pls_do" value="delete" />
<!-- TODO: confirm destructive action -->
<input type="submit" value="{% trans "delete" %}" />
<label>{% trans "really? (you have to type 'really really')" %}<input type="text" name="really"/></label>
</form>
{% endif %}
</nav>
Expand Down
3 changes: 3 additions & 0 deletions trove/vocab/osfmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,9 @@ def osfmap_shorthand() -> IriShorthand:
LABEL_PROPERTIES = (RDFS.label, SKOS.prefLabel, SKOS.altLabel)
NAMELIKE_PROPERTIES = (*TITLE_PROPERTIES, *NAME_PROPERTIES, *LABEL_PROPERTIES)
SKIPPABLE_PROPERTIES = (OSFMAP.contains, OWL.sameAs)
EXTRA_INDEXED_LITERAL_PATHS = frozenset((
(DCTERMS.creator, FOAF.name),
))

DEFAULT_TABULAR_SEARCH_COLUMN_PATHS: tuple[tuple[str, ...], ...] = (
(OWL.sameAs,), # includes DOI
Expand Down