Skip to content

Commit

Permalink
support filtering across multiple propertypaths
Browse files Browse the repository at this point in the history
breaking change behind a feature flag (`periodic_propertypaths`)

change how property paths are serialized in query params:
- use `.` (period) to delimit path-steps, e.g. `creator.affiliation`
  parsed as one path of two steps (previously `creator,affiliation`)
- use `,` (comma) to delimit multiple paths, e.g.
  `creator.affiliation,contributor.affiliation` parsed as two paths of
  two steps (previously impossible)
  • Loading branch information
aaxelb committed Sep 11, 2023
1 parent b97f248 commit adf94b3
Show file tree
Hide file tree
Showing 7 changed files with 241 additions and 77 deletions.
1 change: 1 addition & 0 deletions share/models/feature_flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class FeatureFlag(models.Model):
# flag name constants:
ELASTIC_EIGHT_DEFAULT = 'elastic_eight_default'
IGNORE_SHAREV2_INGEST = 'ignore_sharev2_ingest'
PERIODIC_PROPERTYPATH = 'periodic_propertypath'

# name _should_ be one of the constants above, but that is not enforced by `choices`
name = models.TextField(unique=True)
Expand Down
80 changes: 52 additions & 28 deletions share/search/index_strategy/trove_indexcard_flats.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,20 +343,25 @@ def pls_handle_cardsearch(self, cardsearch_params: CardsearchParams) -> Cardsear

def pls_handle_valuesearch(self, valuesearch_params: ValuesearchParams) -> ValuesearchResponse:
_cursor = _SimpleCursor.from_page_param(valuesearch_params.page)
_is_date_search = all(
is_date_property(_path[-1])
for _path in valuesearch_params.valuesearch_propertypath_set
)
try:
_es8_response = self.index_strategy.es8_client.search(
index=self.indexname,
query=self._cardsearch_query(
valuesearch_params.cardsearch_filter_set,
valuesearch_params.cardsearch_textsegment_set,
additional_filters=[{'term': {
'iri_paths_present': iri_path_as_keyword(valuesearch_params.valuesearch_property_path),
}}],
additional_filters=[{'terms': {'iri_paths_present': [
iri_path_as_keyword(_path)
for _path in valuesearch_params.valuesearch_propertypath_set
]}}],
),
size=0, # ignore cardsearch hits; just want the aggs
aggs=(
self._valuesearch_date_aggs(valuesearch_params)
if is_date_property(valuesearch_params.valuesearch_property_path[-1])
if _is_date_search
else self._valuesearch_iri_aggs(valuesearch_params, _cursor)
),
)
Expand Down Expand Up @@ -495,10 +500,10 @@ def _cardsearch_aggs(self, cardsearch_params):

def _valuesearch_iri_aggs(self, valuesearch_params: ValuesearchParams, cursor: '_SimpleCursor'):
_nested_iri_bool = {
'filter': [{'term': {'nested_iri.suffuniq_path_from_focus': iri_path_as_keyword(
valuesearch_params.valuesearch_property_path,
suffuniq=True,
)}}],
'filter': [{'terms': {'nested_iri.suffuniq_path_from_focus': [
iri_path_as_keyword(_path, suffuniq=True)
for _path in valuesearch_params.valuesearch_propertypath_set
]}}],
'must': [],
'must_not': [],
'should': [],
Expand Down Expand Up @@ -559,11 +564,11 @@ def _valuesearch_date_aggs(self, valuesearch_params: ValuesearchParams):
'nested': {'path': 'nested_date'},
'aggs': {
'value_at_propertypath': {
'filter': {'term': {
'nested_date.suffuniq_path_from_focus': iri_path_as_keyword(
valuesearch_params.valuesearch_property_path,
suffuniq=True,
),
'filter': {'terms': {
'nested_date.suffuniq_path_from_focus': [
iri_path_as_keyword(_path, suffuniq=True)
for _path in valuesearch_params.valuesearch_propertypath_set
],
}},
'aggs': {
'count_by_year': {
Expand Down Expand Up @@ -640,25 +645,40 @@ def _valuesearch_date_result(self, date_bucket):
)

def _cardsearch_presence_query(self, search_filter) -> dict:
_path_keyword = iri_path_as_keyword(search_filter.property_path, suffuniq=True)
return {'term': {'iri_paths_present_suffuniq': _path_keyword}}
_filters = [
self._cardsearch_path_presence_query(_path)
for _path in search_filter.propertypath_set
]
if len(_filters) == 1:
return _filters[0]
return {'bool': {'must': _filters}}

def _cardsearch_iri_filter(self, search_filter) -> dict:
_field = '.'.join((
'flat_iri_values_suffuniq',
_iri_path_as_flattened_key(search_filter.property_path),
))
return {'terms': {
_field: [
get_sufficiently_unique_iri(_iri)
for _iri in search_filter.value_set
],
def _cardsearch_path_presence_query(self, path: tuple[str, ...]):
return {'term': {
'iri_paths_present_suffuniq': iri_path_as_keyword(path, suffuniq=True),
}}

def _cardsearch_iri_filter(self, search_filter) -> dict:
_filters = [
self._cardsearch_path_iri_query(_path, search_filter.value_set)
for _path in search_filter.propertypath_set
]
if len(_filters) == 1:
return _filters[0]
return {'bool': {'should': _filters}} # at least one match

def _cardsearch_path_iri_query(self, path, value_set):
return {'terms': {_iri_path_as_flattened_field(path): [
get_sufficiently_unique_iri(_iri)
for _iri in value_set
]}}

def _cardsearch_date_filter(self, search_filter) -> dict:
_propertypath_keyword = iri_path_as_keyword(search_filter.property_path, suffuniq=True)
_filter_list = [
{'term': {'nested_date.suffuniq_path_from_focus': _propertypath_keyword}},
{'terms': {'nested_date.suffuniq_path_from_focus': [
iri_path_as_keyword(_path, suffuniq=True)
for _path in search_filter.propertypath_set
]}},
]
if search_filter.operator == SearchFilter.FilterOperator.BEFORE:
_value = min(search_filter.value_set) # rely on string-comparable isoformat
Expand Down Expand Up @@ -738,7 +758,7 @@ def _cardsearch_response(
and cursor.is_first_page()
and not cursor.first_page_uuids
and any(
_filter.property_path != (RDF.type,) # look for a non-default filter
not _filter.is_type_filter() # look for a non-default filter
for _filter in cardsearch_params.cardsearch_filter_set
)
)
Expand Down Expand Up @@ -923,6 +943,10 @@ def _iri_path_as_flattened_key(path: tuple[str, ...]) -> str:
return base64.b16encode(json.dumps(path).encode()).decode()


def _iri_path_as_flattened_field(path: tuple[str, ...]) -> str:
return f'flat_iri_values_suffuniq.{_iri_path_as_flattened_key(path)}'


@dataclasses.dataclass
class _SimpleCursor:
start_index: int
Expand Down
86 changes: 58 additions & 28 deletions share/search/search_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from django.http import QueryDict

from share.models import FeatureFlag
from share.search import exceptions
from trove.util.queryparams import (
QueryparamDict,
Expand Down Expand Up @@ -36,6 +37,7 @@

DEFAULT_PAGE_SIZE = 13
MAX_PAGE_SIZE = 101
PROPERTYPATH_DELIMITER = '.'


###
Expand Down Expand Up @@ -167,7 +169,7 @@ def is_iri_operator(self):
def is_valueless_operator(self):
return self in (self.IS_PRESENT, self.IS_ABSENT)

property_path: tuple[str]
propertypath_set: frozenset[tuple[str, ...]]
value_set: frozenset[str]
operator: FilterOperator
original_param_name: typing.Optional[str] = None
Expand All @@ -184,11 +186,11 @@ def for_queryparam_family(cls, queryparams: QueryparamDict, queryparam_family: s
@classmethod
def from_filter_param(cls, param_name: QueryparamName, param_value: str):
_operator = None
try: # "filter[<serialized_path>][<operator>]"
(_serialized_path, _operator_value) = param_name.bracketed_names
try: # "filter[<serialized_path_set>][<operator>]"
(_serialized_path_set, _operator_value) = param_name.bracketed_names
except ValueError:
try: # "filter[<serialized_path>]" (with default operator)
(_serialized_path,) = param_name.bracketed_names
try: # "filter[<serialized_path_set>]" (with default operator)
(_serialized_path_set,) = param_name.bracketed_names
except ValueError:
raise exceptions.InvalidSearchParam(
f'expected one or two bracketed queryparam-name segments'
Expand All @@ -200,23 +202,23 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str):
_operator = SearchFilter.FilterOperator.from_shortname(_operator_value)
except ValueError:
raise ValueError(f'unrecognized search-filter operator "{_operator_value}"')
_propertypath = tuple(
osfmap_labeler.iri_for_label(_pathstep, default=_pathstep)
for _pathstep in split_queryparam_value(_serialized_path)
_propertypath_set = _parse_propertypath_set(_serialized_path_set)
_is_date_filter = all(
is_date_property(_path[-1])
for _path in _propertypath_set
)
_is_date_property = is_date_property(_propertypath[-1])
if _operator is None: # default operator
_operator = (
SearchFilter.FilterOperator.AT_DATE
if _is_date_property
if _is_date_filter
else SearchFilter.FilterOperator.ANY_OF
)
if _operator.is_date_operator() and not _is_date_property:
if _operator.is_date_operator() and not _is_date_filter:
raise ValueError(f'cannot use date operator {_operator.value} on non-date property')
_value_list = []
if not _operator.is_valueless_operator():
for _value in split_queryparam_value(param_value):
if _is_date_property:
if _is_date_filter:
_value_list.append(_value) # TODO: vali-date
else:
try:
Expand All @@ -226,13 +228,29 @@ def from_filter_param(cls, param_name: QueryparamName, param_value: str):
else:
_value_list.append(_iri)
return cls(
property_path=_propertypath,
propertypath_set=_propertypath_set,
value_set=frozenset(_value_list),
operator=_operator,
original_param_name=str(param_name),
original_param_value=param_value,
)

def is_sameas_filter(self) -> bool:
'''detect the special filter for matching exact identifier iris
'''
return (
self.propertypath_set == {(OWL.sameAs,)}
and self.operator == SearchFilter.FilterOperator.ANY_OF
)

def is_type_filter(self) -> bool:
'''detect the special filter for matching resource type
'''
return (
self.propertypath_set == {(RDF.type,)}
and self.operator == SearchFilter.FilterOperator.ANY_OF
)


@dataclasses.dataclass(frozen=True)
class SortParam:
Expand Down Expand Up @@ -345,7 +363,7 @@ def to_querydict(self) -> QueryDict:
class ValuesearchParams(CardsearchParams):
# includes fields from CardsearchParams, because a
# valuesearch is always in context of a cardsearch
valuesearch_property_path: tuple[str]
valuesearch_propertypath_set: frozenset[tuple[str, ...]]
valuesearch_text: str
valuesearch_textsegment_set: frozenset[str]
valuesearch_filter_set: frozenset[SearchFilter]
Expand All @@ -355,19 +373,16 @@ class ValuesearchParams(CardsearchParams):
@staticmethod
def from_queryparams(queryparams: QueryparamDict) -> 'ValuesearchParams':
_valuesearch_text = _get_text_queryparam(queryparams, 'valueSearchText')
_raw_property_path = _get_single_value(queryparams, QueryparamName('valueSearchPropertyPath'))
if not _raw_property_path:
_raw_propertypath = _get_single_value(queryparams, QueryparamName('valueSearchPropertyPath'))
if not _raw_propertypath:
raise ValueError('TODO: 400 valueSearchPropertyPath required')
return ValuesearchParams(
**CardsearchParams.parse_cardsearch_queryparams(queryparams),
valuesearch_property_path=tuple(
osfmap_labeler.iri_for_label(_pathstep, default=_pathstep)
for _pathstep in split_queryparam_value(_raw_property_path)
),
valuesearch_propertypath_set=_parse_propertypath_set(_raw_propertypath),
valuesearch_text=_valuesearch_text,
valuesearch_textsegment_set=Textsegment.split_str(_valuesearch_text),
valuesearch_filter_set=SearchFilter.for_queryparam_family(queryparams, 'valueSearchFilter'),
original_valuesearch_property_path=_raw_property_path,
original_valuesearch_property_path=_raw_propertypath,
)

def to_querydict(self):
Expand All @@ -381,12 +396,12 @@ def to_querydict(self):

def valuesearch_iris(self):
for _filter in self.valuesearch_filter_set:
if _filter.property_path == (OWL.sameAs,) and _filter.operator == SearchFilter.FilterOperator.ANY_OF:
if _filter.is_sameas_filter():
yield from _filter.value_set

def valuesearch_type_iris(self):
for _filter in self.valuesearch_filter_set:
if _filter.property_path == (RDF.type,) and _filter.operator == SearchFilter.FilterOperator.ANY_OF:
if _filter.is_type_filter():
yield from _filter.value_set


Expand Down Expand Up @@ -423,16 +438,31 @@ def _get_single_value(
return _singlevalue


def _parse_propertypath_set(serialized_path_set: str) -> frozenset[tuple[str, ...]]:
if FeatureFlag.objects.flag_is_up(FeatureFlag.PERIODIC_PROPERTYPATH):
# comma-delimited set of dot-delimited paths
return frozenset(
tuple(
osfmap_labeler.iri_for_label(_pathstep, default=_pathstep)
for _pathstep in _path.split(PROPERTYPATH_DELIMITER)
)
for _path in split_queryparam_value(serialized_path_set)
)
# single comma-delimited path
_propertypath = tuple(
osfmap_labeler.iri_for_label(_pathstep, default=_pathstep)
for _pathstep in split_queryparam_value(serialized_path_set)
)
return frozenset([_propertypath])


def _get_related_property_paths(filter_set) -> tuple[tuple[str]]:
# hard-coded for osf.io search pages, static list per type
# TODO: replace with some dynamism, maybe a 'significant_terms' aggregation
_type_iris = set()
for _filter in filter_set:
if _filter.property_path == (RDF.type,):
if _filter.operator == SearchFilter.FilterOperator.ANY_OF:
_type_iris.update(_filter.value_set)
if _filter.operator == SearchFilter.FilterOperator.NONE_OF:
_type_iris.difference_update(_filter.value_set)
if _filter.is_type_filter():
_type_iris.update(_filter.value_set)
return suggested_property_paths(_type_iris)


Expand Down
12 changes: 12 additions & 0 deletions tests/_testutil.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from unittest import mock


def patch_feature_flag(*flag_names, up=True):
from share.models.feature_flag import FeatureFlag
_old_isup = FeatureFlag.objects.flag_is_up

def _patched_isup(flag_name):
if flag_name in flag_names:
return up
return _old_isup(flag_name)
return mock.patch.object(FeatureFlag.objects, 'flag_is_up', new=_patched_isup)

0 comments on commit adf94b3

Please sign in to comment.