diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java index ee12b7b564..549cd27f13 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursor.java @@ -72,14 +72,12 @@ import javax.annotation.Nullable; import java.io.IOException; import java.io.StringReader; -import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Objects; import java.util.Set; import java.util.concurrent.CompletableFuture; @@ -95,10 +93,6 @@ public class LuceneAutoCompleteResultCursor implements BaseCursor { private static final Logger LOGGER = LoggerFactory.getLogger(LuceneAutoCompleteResultCursor.class); - private static final int tokenCountBeforeHighlighted = 3; - private static final int tokenCountAfterHighlighted = 3; - private static final String highlightedTextConnector = "... "; - @Nonnull private final Executor executor; @Nonnull @@ -196,184 +190,6 @@ private void performLookup() throws IOException { } } - @SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring - @Nullable - @VisibleForTesting - static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text, - @Nonnull Set matchedTokens, @Nullable String prefixToken, - boolean allMatchingRequired, - @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { - try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) { - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); - ts.reset(); - StringBuilder sb = luceneQueryHighlightParameters.isHighlight() ? new StringBuilder() : null; - int upto = 0; - Set matchedInText = new HashSet<>(); - boolean matchedPrefix = false; - ArrayDeque pres = new ArrayDeque<>(); - ArrayDeque ends = new ArrayDeque<>(); - int lastMatchPos = -tokenCountAfterHighlighted - 1; - int currentPos = 0; - while (ts.incrementToken()) { - String token = termAtt.toString(); - int startOffset = offsetAtt.startOffset(); - int endOffset = offsetAtt.endOffset(); - if (upto < startOffset) { - if (luceneQueryHighlightParameters.isHighlight()) { - if (luceneQueryHighlightParameters.isCutSnippets()) { - if (currentPos - lastMatchPos <= tokenCountAfterHighlighted + 1) { - addNonMatch(sb, text.substring(upto, startOffset)); - } else { - pres.add(text.substring(upto, startOffset)); - if (pres.size() > tokenCountBeforeHighlighted) { - pres.poll(); - } - if (ends.size() < luceneQueryHighlightParameters.getSnippedSize() - tokenCountAfterHighlighted) { - ends.add(text.substring(upto, startOffset)); - } - } - } else { - addNonMatch(sb, text.substring(upto, startOffset)); - } - } - upto = startOffset; - } else if (upto > startOffset) { - continue; - } - - if (matchedTokens.contains(token)) { - // Token matches. - if (luceneQueryHighlightParameters.isHighlight()) { - if (luceneQueryHighlightParameters.isCutSnippets() && currentPos - lastMatchPos > tokenCountBeforeHighlighted + tokenCountAfterHighlighted + 1) { - addNonMatch(sb, highlightedTextConnector); - } - while (!pres.isEmpty()) { - addNonMatch(sb, pres.poll()); - } - ends.clear(); - int start = startOffset; - while (start < endOffset) { - int index = text.toLowerCase(Locale.ROOT).indexOf(token, start); - if (index < 0 || index >= endOffset) { - addNonMatch(sb, text.substring(start, endOffset)); - break; - } - int actualStartOffset = index; - int actualEndOffset = index + token.length(); - addNonMatch(sb, text.substring(start, index)); - String substring = text.substring(actualStartOffset, actualEndOffset); - if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { - addWholeMatch(sb, substring, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); - } else { - addNonMatch(sb, substring); - } - start = actualEndOffset; - } - } - upto = endOffset; - matchedInText.add(token); - lastMatchPos = currentPos; - } else if (prefixToken != null && token.startsWith(prefixToken)) { - if (luceneQueryHighlightParameters.isHighlight()) { - if (!tokenAlreadyHighlighted(text, startOffset, endOffset, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { - addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken, - luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); - } else { - addNonMatch(sb, text.substring(startOffset, endOffset)); - } - } - upto = endOffset; - matchedPrefix = true; - } - currentPos++; - } - ts.end(); - - if (allMatchingRequired && ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size()))) { - // Query text not actually found in document text. Return null - return null; - } - - // Text was found. Return text (highlighted or not) - if (luceneQueryHighlightParameters.isHighlight()) { - int endOffset = offsetAtt.endOffset(); - if (upto < endOffset && !luceneQueryHighlightParameters.isCutSnippets()) { - addNonMatch(sb, text.substring(upto)); - } else if (luceneQueryHighlightParameters.isCutSnippets()) { - while (!ends.isEmpty()) { - addNonMatch(sb, ends.poll()); - } - addNonMatch(sb, highlightedTextConnector); - } - return sb.toString(); - } else { - return text; - } - - } catch (IOException e) { - return null; - } - } - - // Check this before highlighting tokens, so the highlighting is idempotent - private static boolean tokenAlreadyHighlighted(@Nonnull String text, int startOffset, int endOffset, - @Nonnull String leftTag, @Nonnull String rightTag) { - return startOffset - leftTag.length() >= 0 - && endOffset + rightTag.length() > text.length() - && text.startsWith(leftTag, startOffset - 3) - && text.startsWith(rightTag, endOffset); - } - - /** Called while highlighting a single result, to append a - * non-matching chunk of text from the suggestion to the - * provided fragments list. - * @param sb The {@code StringBuilder} to append to - * @param text The text chunk to add - */ - private static void addNonMatch(StringBuilder sb, String text) { - sb.append(text); - } - - /** Called while highlighting a single result, to append - * the whole matched token to the provided fragments list. - * @param sb The {@code StringBuilder} to append to - * @param surface The surface form (original) text - * @param leftTag the tag to add left to the surface - * @param rightTag the tag to add right to the surface - */ - private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag) { - sb.append(leftTag); - sb.append(surface); - sb.append(rightTag); - } - - /** Called while highlighting a single result, to append a - * matched prefix token, to the provided fragments list. - * @param sb The {@code StringBuilder} to append to - * @param surface The fragment of the surface form - * (indexed during build, corresponding to - * this match - * @param prefixToken The prefix of the token that matched - * @param leftTag the tag to add left to the surface - * @param rightTag the tag to add right to the surface - */ - private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag) { - // TODO: apps can try to invert their analysis logic - // here, e.g. downcase the two before checking prefix: - if (prefixToken.length() >= surface.length()) { - addWholeMatch(sb, surface, leftTag, rightTag); - return; - } - sb.append(leftTag); - sb.append(surface.substring(0, prefixToken.length())); - sb.append(rightTag); - sb.append(surface.substring(prefixToken.length())); - } - @SuppressWarnings("PMD.CloseResource") public RecordCursor lookup() throws IOException { // Determine the tokens from the query key @@ -612,7 +428,7 @@ private RecordCursor findIndexEntriesInRecord(ScoreDocAndRecord scor // matched terms return null; } - String match = searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, true, + String match = LuceneHighlighting.searchAllMaybeHighlight(documentField.getFieldName(), queryAnalyzer, text, queryTokens, prefixToken, true, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight)); if (match == null) { // Text not found in this field diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java index 1014412d2e..97b71da39b 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecord.java @@ -38,12 +38,9 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Objects; -import java.util.Set; /** * Helper class for converting {@link FDBRecord}s to Lucene documents. @@ -134,42 +131,6 @@ public static List getFields(@Nonnull KeyExpr return fields.getFields(); } - // Modify the Lucene fields of a record message with highlighting the terms from the given termMap - @Nonnull - public static void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map> termMap, - @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, - @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { - LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource recordRebuildSource = new LuceneIndexKeyValueToPartialRecordUtils.RecordRebuildSource<>(null, builder.getDescriptorForType(), builder, builder.build()); - - LuceneIndexExpressions.getFields(expression, recordRebuildSource, - (source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> { - Set terms = new HashSet<>(); - terms.addAll(termMap.getOrDefault(fieldName, Collections.emptySet())); - terms.addAll(termMap.getOrDefault("", Collections.emptySet())); - if (terms.isEmpty()) { - return; - } - for (Map.Entry entry : source.message.getAllFields().entrySet()) { - Object entryValue = entry.getValue(); - if (entryValue instanceof String && entryValue.equals(value) - && terms.stream().filter(t -> ((String) entryValue).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT))).findAny().isPresent()) { - String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValue).getAnalyzer(), (String) entryValue, termMap.get(fieldName), null, false, luceneQueryHighlightParameters); - source.buildMessage(highlightedText, entry.getKey(), null, null, true, 0); - } else if (entryValue instanceof List) { - int index = 0; - for (Object entryValueElement : ((List) entryValue)) { - if (entryValueElement instanceof String && entryValueElement.equals(value) - && terms.stream().filter(t -> ((String) entryValueElement).toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT))).findAny().isPresent()) { - String highlightedText = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String) entryValueElement).getAnalyzer(), (String) entryValueElement, termMap.get(fieldName), null, false, luceneQueryHighlightParameters); - source.buildMessage(highlightedText, entry.getKey(), null, null, true, index); - } - index++; - } - } - } - }, null); - } - protected static class FDBRecordSource implements LuceneIndexExpressions.RecordSource> { @Nonnull private final FDBRecord rec; diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java index 4e34090813..f68c1d207b 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlightTermsPlan.java @@ -22,15 +22,11 @@ import com.apple.foundationdb.record.EvaluationContext; import com.apple.foundationdb.record.ExecuteProperties; -import com.apple.foundationdb.record.IndexEntry; import com.apple.foundationdb.record.ObjectPlanHash; import com.apple.foundationdb.record.PlanHashable; import com.apple.foundationdb.record.RecordCursor; import com.apple.foundationdb.record.provider.common.StoreTimer; -import com.apple.foundationdb.record.provider.foundationdb.FDBIndexedRecord; -import com.apple.foundationdb.record.provider.foundationdb.FDBQueriedRecord; import com.apple.foundationdb.record.provider.foundationdb.FDBRecordStoreBase; -import com.apple.foundationdb.record.provider.foundationdb.FDBStoredRecord; import com.apple.foundationdb.record.query.plan.cascades.AliasMap; import com.apple.foundationdb.record.query.plan.cascades.CorrelationIdentifier; import com.apple.foundationdb.record.query.plan.cascades.GroupExpressionRef; @@ -83,29 +79,7 @@ public RecordCursor executePlan(@Nonnull final @Nonnull final ExecuteProperties executeProperties) { final RecordCursor results = getInnerPlan().executePlan(store, context, continuation, executeProperties); - return results .map(result -> QueryResult.fromQueriedRecord(highlightTermsInRecord(result.getQueriedRecord()))); - } - - @Nullable - @SuppressWarnings("unchecked") - private FDBQueriedRecord highlightTermsInRecord(@Nullable FDBQueriedRecord queriedRecord) { - if (queriedRecord == null) { - return queriedRecord; - } - IndexEntry indexEntry = queriedRecord.getIndexEntry(); - if (!(indexEntry instanceof LuceneRecordCursor.ScoreDocIndexEntry)) { - return queriedRecord; - } - LuceneRecordCursor.ScoreDocIndexEntry docIndexEntry = (LuceneRecordCursor.ScoreDocIndexEntry)indexEntry; - if (!docIndexEntry.getLuceneQueryHighlightParameters().isHighlight()) { - return queriedRecord; - } - M message = queriedRecord.getRecord(); - M.Builder builder = message.toBuilder(); - LuceneDocumentFromRecord.highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, - docIndexEntry.getTermMap(), docIndexEntry.getAnalyzerSelector(), docIndexEntry.getLuceneQueryHighlightParameters()); - FDBStoredRecord storedRecord = queriedRecord.getStoredRecord().asBuilder().setRecord((M) builder.build()).build(); - return FDBQueriedRecord.indexed(new FDBIndexedRecord<>(indexEntry, storedRecord)); + return results .map(result -> QueryResult.fromQueriedRecord(LuceneHighlighting.highlightTermsInRecord(result.getQueriedRecord()))); } @Override diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java new file mode 100644 index 0000000000..b3f4600da2 --- /dev/null +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneHighlighting.java @@ -0,0 +1,430 @@ +/* + * LuceneHighlighting.java + * + * This source file is part of the FoundationDB open source project + * + * Copyright 2022 Apple Inc. and the FoundationDB project authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.apple.foundationdb.record.lucene; + +import com.apple.foundationdb.record.IndexEntry; +import com.apple.foundationdb.record.metadata.Key; +import com.apple.foundationdb.record.metadata.expressions.FieldKeyExpression; +import com.apple.foundationdb.record.metadata.expressions.KeyExpression; +import com.apple.foundationdb.record.provider.foundationdb.FDBIndexedRecord; +import com.apple.foundationdb.record.provider.foundationdb.FDBQueriedRecord; +import com.apple.foundationdb.record.provider.foundationdb.FDBStoredRecord; +import com.google.protobuf.Descriptors; +import com.google.protobuf.Message; +import org.apache.commons.lang3.StringUtils; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; + +/** + * Helper class for highlighting search matches. + */ +public class LuceneHighlighting { + private static final int tokenCountBeforeHighlighted = 3; + private static final int tokenCountAfterHighlighted = 3; + private static final String highlightedTextConnector = "... "; + + private LuceneHighlighting() { + } + + @SuppressWarnings("squid:S3776") // Cognitive complexity is too high. Candidate for later refactoring + @Nullable + static String searchAllMaybeHighlight(@Nonnull String fieldName, @Nonnull Analyzer queryAnalyzer, @Nonnull String text, + @Nonnull Set matchedTokens, @Nullable String prefixToken, + boolean allMatchingRequired, + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + try (TokenStream ts = queryAnalyzer.tokenStream(fieldName, new StringReader(text))) { + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); + ts.reset(); + StringBuilder sb = luceneQueryHighlightParameters.isHighlight() ? new StringBuilder() : null; + int upto = 0; + Set matchedInText = new HashSet<>(); + boolean matchedPrefix = false; + ArrayDeque pres = new ArrayDeque<>(); + ArrayDeque ends = new ArrayDeque<>(); + int lastMatchPos = -tokenCountAfterHighlighted - 1; + int currentPos = 0; + while (ts.incrementToken()) { + String token = termAtt.toString(); + int startOffset = offsetAtt.startOffset(); + int endOffset = offsetAtt.endOffset(); + if (upto < startOffset) { + if (luceneQueryHighlightParameters.isHighlight()) { + if (luceneQueryHighlightParameters.isCutSnippets()) { + if (currentPos - lastMatchPos <= tokenCountAfterHighlighted + 1) { + addNonMatch(sb, text.substring(upto, startOffset)); + } else { + pres.add(text.substring(upto, startOffset)); + if (pres.size() > tokenCountBeforeHighlighted) { + pres.poll(); + } + if (ends.size() < luceneQueryHighlightParameters.getSnippedSize() - tokenCountAfterHighlighted) { + ends.add(text.substring(upto, startOffset)); + } + } + } else { + addNonMatch(sb, text.substring(upto, startOffset)); + } + } + upto = startOffset; + } else if (upto > startOffset) { + continue; + } + + if (matchedTokens.contains(token)) { + // Token matches. + if (luceneQueryHighlightParameters.isHighlight()) { + if (luceneQueryHighlightParameters.isCutSnippets() && currentPos - lastMatchPos > tokenCountBeforeHighlighted + tokenCountAfterHighlighted + 1) { + addNonMatch(sb, highlightedTextConnector); + } + while (!pres.isEmpty()) { + addNonMatch(sb, pres.poll()); + } + ends.clear(); + int start = startOffset; + while (start < endOffset) { + int index = text.toLowerCase(Locale.ROOT).indexOf(token, start); + if (index < 0 || index >= endOffset) { + addNonMatch(sb, text.substring(start, endOffset)); + break; + } + int actualStartOffset = index; + int actualEndOffset = index + token.length(); + addNonMatch(sb, text.substring(start, index)); + String substring = text.substring(actualStartOffset, actualEndOffset); + if (substring.equalsIgnoreCase(token) && !tokenAlreadyHighlighted(text, actualStartOffset, actualEndOffset, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { + addWholeMatch(sb, substring, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); + } else { + addNonMatch(sb, substring); + } + start = actualEndOffset; + } + } + upto = endOffset; + matchedInText.add(token); + lastMatchPos = currentPos; + } else if (prefixToken != null && token.startsWith(prefixToken)) { + if (luceneQueryHighlightParameters.isHighlight()) { + if (!tokenAlreadyHighlighted(text, startOffset, endOffset, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag())) { + addPrefixMatch(sb, text.substring(startOffset, endOffset), prefixToken, + luceneQueryHighlightParameters.getLeftTag(), luceneQueryHighlightParameters.getRightTag()); + } else { + addNonMatch(sb, text.substring(startOffset, endOffset)); + } + } + upto = endOffset; + matchedPrefix = true; + } + currentPos++; + } + ts.end(); + + if (allMatchingRequired && ((prefixToken != null && !matchedPrefix) || (matchedInText.size() < matchedTokens.size()))) { + // Query text not actually found in document text. Return null + return null; + } + + // Text was found. Return text (highlighted or not) + if (luceneQueryHighlightParameters.isHighlight()) { + int endOffset = offsetAtt.endOffset(); + if (upto < endOffset && !luceneQueryHighlightParameters.isCutSnippets()) { + addNonMatch(sb, text.substring(upto)); + } else if (luceneQueryHighlightParameters.isCutSnippets()) { + while (!ends.isEmpty()) { + addNonMatch(sb, ends.poll()); + } + addNonMatch(sb, highlightedTextConnector); + } + return sb.toString(); + } else { + return text; + } + + } catch (IOException e) { + return null; + } + } + + // Check this before highlighting tokens, so the highlighting is idempotent + private static boolean tokenAlreadyHighlighted(@Nonnull String text, int startOffset, int endOffset, + @Nonnull String leftTag, @Nonnull String rightTag) { + return startOffset - leftTag.length() >= 0 + && endOffset + rightTag.length() <= text.length() + && text.startsWith(leftTag, startOffset - leftTag.length()) + && text.startsWith(rightTag, endOffset); + } + + /** Called while highlighting a single result, to append a + * non-matching chunk of text from the suggestion to the + * provided fragments list. + * @param sb The {@code StringBuilder} to append to + * @param text The text chunk to add + */ + private static void addNonMatch(StringBuilder sb, String text) { + sb.append(text); + } + + /** Called while highlighting a single result, to append + * the whole matched token to the provided fragments list. + * @param sb The {@code StringBuilder} to append to + * @param surface The surface form (original) text + * @param leftTag the tag to add left to the surface + * @param rightTag the tag to add right to the surface + */ + private static void addWholeMatch(StringBuilder sb, String surface, String leftTag, String rightTag) { + sb.append(leftTag); + sb.append(surface); + sb.append(rightTag); + } + + /** Called while highlighting a single result, to append a + * matched prefix token, to the provided fragments list. + * @param sb The {@code StringBuilder} to append to + * @param surface The fragment of the surface form + * (indexed during build, corresponding to + * this match + * @param prefixToken The prefix of the token that matched + * @param leftTag the tag to add left to the surface + * @param rightTag the tag to add right to the surface + */ + private static void addPrefixMatch(StringBuilder sb, String surface, String prefixToken, String leftTag, String rightTag) { + // TODO: apps can try to invert their analysis logic + // here, e.g. downcase the two before checking prefix: + if (prefixToken.length() >= surface.length()) { + addWholeMatch(sb, surface, leftTag, rightTag); + return; + } + sb.append(leftTag); + sb.append(surface.substring(0, prefixToken.length())); + sb.append(rightTag); + sb.append(surface.substring(prefixToken.length())); + } + + @Nullable + @SuppressWarnings("unchecked") + public static FDBQueriedRecord highlightTermsInRecord(@Nullable FDBQueriedRecord queriedRecord) { + if (queriedRecord == null) { + return queriedRecord; + } + IndexEntry indexEntry = queriedRecord.getIndexEntry(); + if (!(indexEntry instanceof LuceneRecordCursor.ScoreDocIndexEntry)) { + return queriedRecord; + } + LuceneRecordCursor.ScoreDocIndexEntry docIndexEntry = (LuceneRecordCursor.ScoreDocIndexEntry)indexEntry; + if (!docIndexEntry.getLuceneQueryHighlightParameters().isHighlight()) { + return queriedRecord; + } + M message = queriedRecord.getRecord(); + M.Builder builder = message.toBuilder(); + highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, + docIndexEntry.getTermMap(), docIndexEntry.getAnalyzerSelector(), docIndexEntry.getLuceneQueryHighlightParameters()); + FDBStoredRecord storedRecord = queriedRecord.getStoredRecord().asBuilder().setRecord((M) builder.build()).build(); + return FDBQueriedRecord.indexed(new FDBIndexedRecord<>(indexEntry, storedRecord)); + } + + // Modify the Lucene fields of a record message with highlighting the terms from the given termMap + @Nonnull + public static void highlightTermsInMessage(@Nonnull KeyExpression expression, @Nonnull Message.Builder builder, @Nonnull Map> termMap, + @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + RecordRebuildSource recordRebuildSource = new RecordRebuildSource<>(null, builder.getDescriptorForType(), builder, builder.build()); + + LuceneIndexExpressions.getFields(expression, recordRebuildSource, + (source, fieldName, value, type, stored, sorted, overriddenKeyRanges, groupingKeyIndex, keyIndex, fieldConfigsIgnored) -> { + if (type != LuceneIndexExpressions.DocumentFieldType.TEXT) { + return; + } + Set terms = getFieldTerms(termMap, fieldName); + if (terms.isEmpty()) { + return; + } + for (Map.Entry entry : source.message.getAllFields().entrySet()) { + final Descriptors.FieldDescriptor entryDescriptor = entry.getKey(); + final Object entryValue = entry.getValue(); + if (entryValue instanceof String) { + buildIfMatch(source, fieldName, value, + entryDescriptor, entryValue, 0, + terms, analyzerSelector, luceneQueryHighlightParameters); + } else if (entryValue instanceof List) { + int index = 0; + for (Object entryValueElement : ((List) entryValue)) { + buildIfMatch(source, fieldName, value, + entryDescriptor, entryValueElement, index, + terms, analyzerSelector, luceneQueryHighlightParameters); + index++; + } + } + } + }, null); + } + + private static void buildIfMatch(RecordRebuildSource source, String fieldName, Object fieldValue, + Descriptors.FieldDescriptor entryDescriptor, Object entryValue, int index, + @Nonnull Set terms, + @Nonnull LuceneAnalyzerCombinationProvider analyzerSelector, + @Nonnull LuceneScanQueryParameters.LuceneQueryHighlightParameters luceneQueryHighlightParameters) { + if (entryValue.equals(fieldValue) && terms.stream().anyMatch(t -> StringUtils.containsIgnoreCase((String)entryValue, t))) { + String highlightedText = searchAllMaybeHighlight(fieldName, analyzerSelector.provideIndexAnalyzer((String)entryValue).getAnalyzer(), (String)entryValue, terms, null, false, luceneQueryHighlightParameters); + source.buildMessage(highlightedText, entryDescriptor, null, null, true, index); + } + } + + static class RecordRebuildSource implements LuceneIndexExpressions.RecordSource> { + @Nullable + public final RecordRebuildSource parent; + @Nonnull + public final Descriptors.Descriptor descriptor; + @Nullable + public final Descriptors.FieldDescriptor fieldDescriptor; + @Nonnull + public final Message.Builder builder; + public final Message message; + public final int indexIfRepeated; + + RecordRebuildSource(@Nullable RecordRebuildSource parent, @Nonnull Descriptors.Descriptor descriptor, @Nonnull Message.Builder builder, @Nonnull Message message) { + //this.rec = rec; + this.parent = parent; + this.descriptor = descriptor; + this.fieldDescriptor = null; + this.builder = builder; + this.message = message; + this.indexIfRepeated = 0; + } + + RecordRebuildSource(@Nullable RecordRebuildSource parent, @Nonnull Descriptors.FieldDescriptor fieldDescriptor, @Nonnull Message.Builder builder, @Nonnull Message message, int indexIfRepeated) { + //this.rec = rec; + this.parent = parent; + this.descriptor = fieldDescriptor.getMessageType(); + this.fieldDescriptor = fieldDescriptor; + this.builder = builder; + this.message = message; + this.indexIfRepeated = indexIfRepeated; + } + + @Override + public Descriptors.Descriptor getDescriptor() { + return descriptor; + } + + @Override + public Iterable> getChildren(@Nonnull FieldKeyExpression parentExpression) { + final String parentField = parentExpression.getFieldName(); + final Descriptors.FieldDescriptor parentFieldDescriptor = descriptor.findFieldByName(parentField); + + final List> children = new ArrayList<>(); + int index = 0; + for (Key.Evaluated evaluated : parentExpression.evaluateMessage(null, message)) { + final Message submessage = (Message)evaluated.toList().get(0); + if (submessage != null) { + if (parentFieldDescriptor.isRepeated()) { + children.add(new RecordRebuildSource(this, parentFieldDescriptor, + builder.newBuilderForField(parentFieldDescriptor), + submessage, index++)); + } else { + children.add(new RecordRebuildSource(this, parentFieldDescriptor, + builder.getFieldBuilder(parentFieldDescriptor), + submessage, index)); + } + } + } + return children; + } + + @Override + public Iterable getValues(@Nonnull FieldKeyExpression fieldExpression) { + final List values = new ArrayList<>(); + for (Key.Evaluated evaluated : fieldExpression.evaluateMessage(null, message)) { + Object value = evaluated.getObject(0); + if (value != null) { + values.add(value); + } + } + return values; + } + + @SuppressWarnings("java:S3776") + public void buildMessage(@Nullable Object value, Descriptors.FieldDescriptor subFieldDescriptor, @Nullable String customizedKey, @Nullable String mappedKeyField, boolean forLuceneField, int index) { + final Descriptors.FieldDescriptor mappedKeyFieldDescriptor = mappedKeyField == null ? null : descriptor.findFieldByName(mappedKeyField); + if (mappedKeyFieldDescriptor != null) { + if (customizedKey == null) { + return; + } + builder.setField(mappedKeyFieldDescriptor, customizedKey); + } + + if (value == null) { + return; + } + if (subFieldDescriptor.isRepeated()) { + if (subFieldDescriptor.getJavaType().equals(Descriptors.FieldDescriptor.JavaType.MESSAGE)) { + Message.Builder subBuilder = builder.newBuilderForField(subFieldDescriptor); + subBuilder.mergeFrom((Message) builder.getRepeatedField(subFieldDescriptor, index)).mergeFrom((Message) value); + builder.setRepeatedField(subFieldDescriptor, index, subBuilder.build()); + } else { + builder.setRepeatedField(subFieldDescriptor, index, value); + } + + } else { + int count = builder.getAllFields().size(); + if (message != null && count == 0) { + builder.mergeFrom(message); + } + builder.setField(subFieldDescriptor, value); + } + + if (parent != null) { + parent.buildMessage(builder.build(), this.fieldDescriptor, mappedKeyFieldDescriptor == null ? customizedKey : null, mappedKeyFieldDescriptor == null ? mappedKeyField : null, forLuceneField, indexIfRepeated); + } + } + } + + @Nonnull + private static Set getFieldTerms(@Nonnull Map> termMap, @Nonnull String fieldName) { + final Set terms = new HashSet<>(); + final Set forField = termMap.get(fieldName); + if (forField != null) { + terms.addAll(forField); + } + final Set forAll = termMap.get(""); + if (forAll != null) { + terms.addAll(forAll); + } + return terms; + } + +} diff --git a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java index c9d54dbc4c..f818e38b65 100644 --- a/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java +++ b/fdb-record-layer-lucene/src/main/java/com/apple/foundationdb/record/lucene/LuceneIndexKeyValueToPartialRecordUtils.java @@ -23,7 +23,6 @@ import com.apple.foundationdb.record.IndexEntry; import com.apple.foundationdb.record.RecordCoreException; import com.apple.foundationdb.record.logging.LogMessageKeys; -import com.apple.foundationdb.record.metadata.Key; import com.apple.foundationdb.record.metadata.expressions.FieldKeyExpression; import com.apple.foundationdb.record.metadata.expressions.GroupingKeyExpression; import com.apple.foundationdb.record.metadata.expressions.KeyExpression; @@ -201,115 +200,6 @@ private static Pair, List> getOriginalAndMappedFieldElement return Pair.of(fixedFieldNames, dynamicFieldNames); } - static class RecordRebuildSource implements LuceneIndexExpressions.RecordSource> { - @Nullable - public final RecordRebuildSource parent; - @Nonnull - public final Descriptors.Descriptor descriptor; - @Nullable - public final Descriptors.FieldDescriptor fieldDescriptor; - @Nonnull - public final Message.Builder builder; - public final Message message; - public final int indexIfRepeated; - - RecordRebuildSource(@Nullable RecordRebuildSource parent, @Nonnull Descriptors.Descriptor descriptor, @Nonnull Message.Builder builder, @Nonnull Message message) { - //this.rec = rec; - this.parent = parent; - this.descriptor = descriptor; - this.fieldDescriptor = null; - this.builder = builder; - this.message = message; - this.indexIfRepeated = 0; - } - - RecordRebuildSource(@Nullable RecordRebuildSource parent, @Nonnull Descriptors.FieldDescriptor fieldDescriptor, @Nonnull Message.Builder builder, @Nonnull Message message, int indexIfRepeated) { - //this.rec = rec; - this.parent = parent; - this.descriptor = fieldDescriptor.getMessageType(); - this.fieldDescriptor = fieldDescriptor; - this.builder = builder; - this.message = message; - this.indexIfRepeated = indexIfRepeated; - } - - @Override - public Descriptors.Descriptor getDescriptor() { - return descriptor; - } - - @Override - public Iterable> getChildren(@Nonnull FieldKeyExpression parentExpression) { - final String parentField = parentExpression.getFieldName(); - final Descriptors.FieldDescriptor parentFieldDescriptor = descriptor.findFieldByName(parentField); - - final List> children = new ArrayList<>(); - int index = 0; - for (Key.Evaluated evaluated : parentExpression.evaluateMessage(null, message)) { - final Message submessage = (Message)evaluated.toList().get(0); - if (submessage != null) { - if (parentFieldDescriptor.isRepeated()) { - children.add(new RecordRebuildSource(this, parentFieldDescriptor, - builder.newBuilderForField(parentFieldDescriptor), - submessage, index++)); - } else { - children.add(new RecordRebuildSource(this, parentFieldDescriptor, - builder.getFieldBuilder(parentFieldDescriptor), - submessage, index)); - } - } - } - return children; - } - - @Override - public Iterable getValues(@Nonnull FieldKeyExpression fieldExpression) { - final List values = new ArrayList<>(); - for (Key.Evaluated evaluated : fieldExpression.evaluateMessage(null, message)) { - Object value = evaluated.getObject(0); - if (value != null) { - values.add(value); - } - } - return values; - } - - @SuppressWarnings("java:S3776") - public void buildMessage(@Nullable Object value, Descriptors.FieldDescriptor subFieldDescriptor, @Nullable String customizedKey, @Nullable String mappedKeyField, boolean forLuceneField, int index) { - final Descriptors.FieldDescriptor mappedKeyFieldDescriptor = mappedKeyField == null ? null : descriptor.findFieldByName(mappedKeyField); - if (mappedKeyFieldDescriptor != null) { - if (customizedKey == null) { - return; - } - builder.setField(mappedKeyFieldDescriptor, customizedKey); - } - - if (value == null) { - return; - } - if (subFieldDescriptor.isRepeated()) { - if (subFieldDescriptor.getJavaType().equals(Descriptors.FieldDescriptor.JavaType.MESSAGE)) { - Message.Builder subBuilder = builder.newBuilderForField(subFieldDescriptor); - subBuilder.mergeFrom((Message) builder.getRepeatedField(subFieldDescriptor, index)).mergeFrom((Message) value); - builder.setRepeatedField(subFieldDescriptor, index, subBuilder.build()); - } else { - builder.setRepeatedField(subFieldDescriptor, index, value); - } - - } else { - int count = builder.getAllFields().size(); - if (message != null && count == 0) { - builder.mergeFrom(message); - } - builder.setField(subFieldDescriptor, value); - } - - if (parent != null) { - parent.buildMessage(builder.build(), this.fieldDescriptor, mappedKeyFieldDescriptor == null ? customizedKey : null, mappedKeyFieldDescriptor == null ? mappedKeyField : null, forLuceneField, indexIfRepeated); - } - } - } - /** * A {@link com.apple.foundationdb.record.lucene.LuceneIndexExpressions.RecordSource} implementation to build the partial record message. */ diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java index 0907090d7f..f498879fb8 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneAutoCompleteResultCursorTest.java @@ -126,7 +126,7 @@ private static void assertSearchMatches(String queryString, List expecte assertEquals(expectedPrefixToken, prefixToken); Set queryTokenSet = new HashSet<>(tokens); - @Nullable String match = LuceneAutoCompleteResultCursor.searchAllMaybeHighlight("text", analyzer, text, queryTokenSet, prefixToken, true, + @Nullable String match = LuceneHighlighting.searchAllMaybeHighlight("text", analyzer, text, queryTokenSet, prefixToken, true, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(highlight)); assertEquals(expectedMatch, match); } diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java index 05f98316e7..03994d0a8b 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneDocumentFromRecordTest.java @@ -66,7 +66,7 @@ void simple() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "some" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text", Set.of("some")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text", Set.of("some")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("some text", builder.build().getText()); @@ -99,7 +99,7 @@ void group() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "text" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("more text", builder.build().getText()); @@ -137,7 +137,7 @@ void multi() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "text" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text", Set.of("text")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("some text", builder.build().getText(0)); assertEquals("other text", builder.build().getText(1)); @@ -183,7 +183,7 @@ void biGroup() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "text" for text field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("text2", Set.of("text")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("text2", Set.of("text")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("first text", builder.build().getText()); assertEquals("second text", builder.build().getText2()); @@ -229,7 +229,7 @@ void uncorrelatedMap() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "v2" for entry_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_value", Set.of("v2")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("entry_value", Set.of("v2")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("v1", builder.build().getEntry(0).getValue()); assertEquals("v2", builder.build().getEntry(1).getValue()); @@ -271,7 +271,7 @@ void map() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "v2" for k2 field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v2")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v2")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("v1", builder.build().getEntry(0).getValue()); assertEquals("v2", builder.build().getEntry(1).getValue()); @@ -316,7 +316,7 @@ void groupedMap() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "v20" for k2 field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v20")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("k2", Set.of("v20")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("v10", builder.build().getEntry(0).getValue()); assertEquals("v20", builder.build().getEntry(1).getValue()); @@ -366,7 +366,7 @@ void groupingMap() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "2val" for entry_second_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("2val")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("2val")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("val", builder.build().getEntry(0).getValue()); assertEquals("2val", builder.build().getEntry(0).getSecondValue()); @@ -419,7 +419,7 @@ void groupingMapWithExtra() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "second" for entry_second_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("second")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("entry_second_value", Set.of("second")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("first", builder.build().getEntry(0).getValue()); assertEquals("second", builder.build().getEntry(0).getSecondValue()); @@ -470,7 +470,7 @@ void mapWithSubMessage() { LuceneDocumentFromRecord.getRecordFields(index, record)); // Highlight "testValue" for entry_k1_value field - LuceneDocumentFromRecord.highlightTermsInMessage(index, builder, Map.of("entry_k1_value", Set.of("testvalue")), analyzerProvider, + LuceneHighlighting.highlightTermsInMessage(index, builder, Map.of("entry_k1_value", Set.of("testvalue")), analyzerProvider, new LuceneScanQueryParameters.LuceneQueryHighlightParameters(true)); assertEquals("testValue", builder.build().getEntry(0).getSubEntry().getValue()); diff --git a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java index d00dfe7441..fbe12aa417 100644 --- a/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java +++ b/fdb-record-layer-lucene/src/test/java/com/apple/foundationdb/record/lucene/LuceneIndexTest.java @@ -2327,7 +2327,7 @@ private FDBStoredRecord possiblyHighlightedStoredRecord(F } M message = indexedRecord.getRecord(); M.Builder builder = message.toBuilder(); - LuceneDocumentFromRecord.highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, + LuceneHighlighting.highlightTermsInMessage(docIndexEntry.getIndexKey(), builder, docIndexEntry.getTermMap(), docIndexEntry.getAnalyzerSelector(), docIndexEntry.getLuceneQueryHighlightParameters()); return storedRecord.asBuilder().setRecord((M) builder.build()).build(); }