Skip to content

Commit

Permalink
skip indexing cards without osfmap_json for search results
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Aug 24, 2023
1 parent 8bc74d6 commit 47db09f
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 5 deletions.
14 changes: 12 additions & 2 deletions share/search/index_strategy/trove_indexcard.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import uuid
from typing import Iterable

from django.db.models import Exists, OuterRef
import elasticsearch8
from gather import primitive_rdf

Expand Down Expand Up @@ -37,6 +38,7 @@
from trove.util.iris import get_sufficiently_unique_iri, is_worthwhile_iri, iri_path_as_keyword
from trove.vocab.osfmap import is_date_property
from trove.vocab.namespaces import TROVE, FOAF, RDF, RDFS, DCTERMS, OWL, SKOS, OSFMAP
from .trove_indexcard_flats import _should_skip_card


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -154,8 +156,8 @@ def index_mappings(self):

def _build_sourcedoc(self, indexcard_rdf):
_rdfdoc = primitive_rdf.TripledictWrapper(indexcard_rdf.as_rdf_tripledict())
if not any(_rdfdoc.q(indexcard_rdf.focus_iri, NAMELIKE_PROPERTIES)):
return None # skip cards without some value for name/title/label
if _should_skip_card(indexcard_rdf, _rdfdoc):
return None # will be deleted from the index
_nested_iris = defaultdict(set)
_nested_dates = defaultdict(set)
_nested_texts = defaultdict(set)
Expand Down Expand Up @@ -228,6 +230,14 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
_indexcard_rdf_qs = (
trove_db.LatestIndexcardRdf.objects
.filter(indexcard_id__in=messages_chunk.target_ids_chunk)
.filter(Exists(
trove_db.DerivedIndexcard.objects
.filter(upriver_indexcard_id=OuterRef('indexcard_id'))
.filter(deriver_identifier__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iri(TROVE['derive/osfmap_json'])
))
))
.exclude(indexcard__deleted__isnull=False)
.select_related('indexcard__source_record_suid__source_config')
.prefetch_related('indexcard__focus_identifier_set')
Expand Down
20 changes: 17 additions & 3 deletions share/search/index_strategy/trove_indexcard_flats.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import uuid
from typing import Iterable

from django.db.models import Exists, OuterRef
import elasticsearch8
from gather import primitive_rdf

Expand Down Expand Up @@ -166,8 +167,8 @@ def index_mappings(self):

def _build_sourcedoc(self, indexcard_rdf):
_rdfdoc = primitive_rdf.TripledictWrapper(indexcard_rdf.as_rdf_tripledict())
if not any(_rdfdoc.q(indexcard_rdf.focus_iri, NAMELIKE_PROPERTIES)):
return None # skip cards without some value for name/title/label
if _should_skip_card(indexcard_rdf, _rdfdoc):
return None # will be deleted from the index
_nested_iris = defaultdict(set)
_nested_dates = defaultdict(set)
_nested_texts = defaultdict(set)
Expand Down Expand Up @@ -266,6 +267,14 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
_indexcard_rdf_qs = (
trove_db.LatestIndexcardRdf.objects
.filter(indexcard_id__in=messages_chunk.target_ids_chunk)
.filter(Exists(
trove_db.DerivedIndexcard.objects
.filter(upriver_indexcard_id=OuterRef('indexcard_id'))
.filter(deriver_identifier__in=(
trove_db.ResourceIdentifier.objects
.queryset_for_iri(TROVE['derive/osfmap_json'])
))
))
.exclude(indexcard__deleted__isnull=False)
.select_related('indexcard__source_record_suid__source_config')
.prefetch_related('indexcard__focus_identifier_set')
Expand All @@ -283,7 +292,7 @@ def build_elastic_actions(self, messages_chunk: messages.MessagesChunk):
)
_remaining_indexcard_ids.discard(_indexcard_rdf.indexcard_id)
yield _indexcard_rdf.indexcard_id, _index_action
# delete any that don't have "latest" rdf
# delete any that don't have "latest" rdf and derived osfmap_json
_leftovers = trove_db.Indexcard.objects.filter(id__in=_remaining_indexcard_ids)
for _indexcard in _leftovers:
yield _indexcard.id, self.build_delete_action(_indexcard.get_iri())
Expand Down Expand Up @@ -816,6 +825,11 @@ def fuzzy_text_should_queries(self, textsegments: list[Textsegment]) -> Iterable
###
# module-local utils

def _should_skip_card(indexcard_rdf, rdfdoc):
    """Decide whether an index card should be left out of the index.

    A card is skipped when querying ``rdfdoc`` for the card's focus iri
    against ``NAMELIKE_PROPERTIES`` yields no truthy value -- that is, the
    focus has nothing usable as a name/title/label.

    Args:
        indexcard_rdf: object exposing the card's ``focus_iri``.
        rdfdoc: wrapper around the card's rdf, queried via ``rdfdoc.q``.

    Returns:
        ``True`` if the card should be skipped, ``False`` otherwise.
    """
    _namelike_values = rdfdoc.q(indexcard_rdf.focus_iri, NAMELIKE_PROPERTIES)
    for _namelike_value in _namelike_values:
        if _namelike_value:
            # at least one name-like value is present; keep the card
            return False
    return True


def _bucketlist(agg_result: dict) -> list[str]:
return [
_bucket['key']
Expand Down

0 comments on commit 47db09f

Please sign in to comment.