Highlighting: nuke XPostingsHighlighter

Our own fork of the lucene PostingsHighlighter is not easy to maintain and doesn't give us any added value at this point. In particular, it was introduced to support the require_field_match option and discrete per value highlighting, used in case one wants to highlight the whole content of a field, but get back one snippet per value. These two features won't make it into lucene as they slow things down and shouldn't have been supported from day one on our end probably. One other customization we had was support for a wider range of queries via custom rewrite etc. (yet another way to slow things down), which got added to lucene and works much much better than what we used to do (instead of or rewrite, term s are pulled out of the automata for multi term queries). Removing our fork means the following in terms of features: - dropped support for require_field_match: the postings highlighter will only highlight fields that were queried - the output is different compared to other highlighters in case `fragment_size` is set to 0: one single snippet is returned in case a field has multiple values, rather than one highlighted snipper per value Closes elastic#10625 Closes elastic#11077
javanna · May 9, 2015 · a2487b6 · a2487b6
1 parent 9e01ded
commit a2487b6
Show file tree

Hide file tree

Showing 10 changed files with 173 additions and 3,230 deletions.
diff --git a/docs/reference/migration/migrate_2_0.asciidoc b/docs/reference/migration/migrate_2_0.asciidoc
@@ -513,3 +513,14 @@ Client client = TransportClient.builder().settings(settings).build();
 
 Log messages are now truncated at 10,000 characters. This can be changed in the
 `logging.yml` configuration file.
+
+=== Highlighting
+
+The Postings highlighter doesn't support the `require_field_match` option
+anymore, it will only highlight fields that were queried.
+
+The output returned from the Postings highlighter has changed compared to the
+other highlighters when highlighting a field with multiple values in
+combination with `fragments_size` set to `0`. One single highlighted snippet
+will get returned rather than one snippet per value.
+
diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java b/src/main/java/org/apache/lucene/search/postingshighlight/CustomPostingsHighlighter.java
@@ -18,11 +18,9 @@
 
 package org.apache.lucene.search.postingshighlight;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexReaderContext;
-import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.search.Query;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.search.highlight.HighlightUtils;
 
@@ -32,105 +30,77 @@
 import java.util.Map;
 
 /**
- * Subclass of the {@link XPostingsHighlighter} that works for a single field in a single document.
- * It receives the field values as input and it performs discrete highlighting on each single value
- * calling the highlightDoc method multiple times.
- * It allows to pass in the query terms to avoid calling extract terms multiple times.
- *
- * The use that we make of the postings highlighter is not optimal. It would be much better to
- * highlight multiple docs in a single call, as we actually lose its sequential IO.  But that would require:
- * 1) to make our fork more complex and harder to maintain to perform discrete highlighting (needed to return
- * a different snippet per value when number_of_fragments=0 and the field has multiple values)
- * 2) refactoring of the elasticsearch highlight api which currently works per hit
+ * Subclass of the {@link PostingsHighlighter} that works for a single field in a single document.
+ * Uses a custom {@link PassageFormatter}. Accepts field content as a constructor argument, given that loading
+ * is custom and can be done reading from _source field. Supports using different {@link BreakIterator} to break
+ * the text into fragments. Considers every distinct field value as a discrete passage for highlighting (unless
+ * the whole content needs to be highlighted). Supports both returning empty snippets and non highlighted snippets
+ * when no highlighting can be performed.
  *
+ * The use that we make of the postings highlighter is not optimal. It would be much better to highlight
+ * multiple docs in a single call, as we actually lose its sequential IO.  That would require to
+ * refactor the elasticsearch highlight api which currently works per hit.
  */
-public final class CustomPostingsHighlighter extends XPostingsHighlighter {
+public final class CustomPostingsHighlighter extends PostingsHighlighter {
 
     private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
     private static final Passage[] EMPTY_PASSAGE = new Passage[0];
 
+    private final Analyzer analyzer;
     private final CustomPassageFormatter passageFormatter;
-    private final int noMatchSize;
-    private final int totalContentLength;
-    private final String[] fieldValues;
-    private final int[] fieldValuesOffsets;
-    private int currentValueIndex = 0;
-
-    private BreakIterator breakIterator;
-
-    public CustomPostingsHighlighter(CustomPassageFormatter passageFormatter, List<Object> fieldValues, boolean mergeValues, int maxLength, int noMatchSize) {
-        super(maxLength);
-        this.passageFormatter = passageFormatter;
-        this.noMatchSize = noMatchSize;
-
-        if (mergeValues) {
-            String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(getMultiValuedSeparator("")));
-            String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength));
-            this.fieldValues = new String[]{fieldValue};
-            this.fieldValuesOffsets = new int[]{0};
-            this.totalContentLength = fieldValue.length();
-        } else {
-            this.fieldValues = new String[fieldValues.size()];
-            this.fieldValuesOffsets = new int[fieldValues.size()];
-            int contentLength = 0;
-            int offset = 0;
-            int previousLength = -1;
-            for (int i = 0; i < fieldValues.size(); i++) {
-                String rawValue = fieldValues.get(i).toString();
-                String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength));
-                this.fieldValues[i] = fieldValue;
-                contentLength += fieldValue.length();
-                offset += previousLength + 1;
-                this.fieldValuesOffsets[i] = offset;
-                previousLength = fieldValue.length();
-            }
-            this.totalContentLength = contentLength;
-        }
-    }
-
-    /*
-    Our own api to highlight a single document field, passing in the query terms, and get back our own Snippet object
+    private final BreakIterator breakIterator;
+    private final boolean returnNonHighlightedSnippets;
+    private final List<Object> fieldValues;
+
+    /**
+     * Creates a new instance of {@link CustomPostingsHighlighter}
+     *
+     * @param analyzer the analyzer used for the field at index time, used for multi term queries internally
+     * @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects
+     * @param fieldValues the original field values as constructor argument, loaded from te _source field or the relevant stored field.
+     * @param maxLength the maximum length of the field content to be read for highlighting
+     * @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when
+     *                                     no highlighting can be performed
      */
-    public Snippet[] highlightDoc(String field, BytesRef[] terms, IndexReader reader, int docId, int maxPassages) throws IOException {
-        IndexReaderContext readerContext = reader.getContext();
-        List<LeafReaderContext> leaves = readerContext.leaves();
-
-        String[] contents = new String[]{loadCurrentFieldValue()};
-        Map<Integer, Object> snippetsMap = highlightField(field, contents, getBreakIterator(field), terms, new int[]{docId}, leaves, maxPassages);
-
-        //increment the current value index so that next time we'll highlight the next value if available
-        currentValueIndex++;
-
-        Object snippetObject = snippetsMap.get(docId);
-        if (snippetObject != null && snippetObject instanceof Snippet[]) {
-            return (Snippet[]) snippetObject;
-        }
-        return EMPTY_SNIPPET;
+    public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, List<Object> fieldValues, int maxLength, boolean returnNonHighlightedSnippets) {
+        this(analyzer, passageFormatter, null, fieldValues, maxLength, returnNonHighlightedSnippets);
     }
 
-    /*
-    Method provided through our own fork: allows to do proper scoring when doing per value discrete highlighting.
-    Used to provide the total length of the field (all values) for proper scoring.
+    /**
+     * Creates a new instance of {@link CustomPostingsHighlighter}
+     *
+     * @param analyzer the analyzer used for the field at index time, used for multi term queries internally
+     * @param passageFormatter our own {@link PassageFormatter} which generates snippets in forms of {@link Snippet} objects
+     * @param breakIterator an instance {@link BreakIterator} selected depending on the highlighting options
+     * @param fieldValues the original field values as constructor argument, loaded from te _source field or the relevant stored field.
+     * @param maxLength the maximum length of the field content to be read for highlighting
+     * @param returnNonHighlightedSnippets whether non highlighted snippets should be returned rather than empty snippets when
+     *                                     no highlighting can be performed
      */
-    @Override
-    protected int getContentLength(String field, int docId) {
-        return totalContentLength;
+    public CustomPostingsHighlighter(Analyzer analyzer, CustomPassageFormatter passageFormatter, BreakIterator breakIterator, List<Object> fieldValues, int maxLength, boolean returnNonHighlightedSnippets) {
+        super(maxLength);
+        this.analyzer = analyzer;
+        this.passageFormatter = passageFormatter;
+        this.breakIterator = breakIterator;
+        this.returnNonHighlightedSnippets = returnNonHighlightedSnippets;
+        this.fieldValues = fieldValues;
     }
 
-    /*
-    Method provided through our own fork: allows to perform proper per value discrete highlighting.
-    Used to provide the offset for the current value.
+    /**
+     * Highlights terms extracted from the provided query within the content of the provided field name
      */
-    @Override
-    protected int getOffsetForCurrentValue(String field, int docId) {
-        if (currentValueIndex < fieldValuesOffsets.length) {
-            return fieldValuesOffsets[currentValueIndex];
+    public Snippet[] highlightField(String field, Query query, IndexSearcher searcher, int docId, int maxPassages) throws IOException {
+        Map<String, Object[]> fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query, searcher, new int[]{docId}, new int[]{maxPassages});
+        Object[] snippetObjects = fieldsAsObjects.get(field);
+        if (snippetObjects != null) {
+            //one single document at a time
+            assert snippetObjects.length == 1;
+            Object snippetObject = snippetObjects[0];
+            if (snippetObject != null && snippetObject instanceof Snippet[]) {
+                return (Snippet[]) snippetObject;
+            }
         }
-        throw new IllegalArgumentException("No more values offsets to return");
-    }
-
-    public void setBreakIterator(BreakIterator breakIterator) {
-        this.breakIterator = breakIterator;
+        return EMPTY_SNIPPET;
     }
 
     @Override
@@ -158,29 +128,25 @@ protected char getMultiValuedSeparator(String field) {
      */
     @Override
     protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
-        if (noMatchSize > 0) {
+        if (returnNonHighlightedSnippets) {
             //we want to return the first sentence of the first snippet only
             return super.getEmptyHighlight(fieldName, bi, 1);
         }
         return EMPTY_PASSAGE;
     }
 
-    /*
-    Not needed since we call our own loadCurrentFieldValue explicitly, but we override it anyway for consistency.
-     */
     @Override
-    protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
-        return new String[][]{new String[]{loadCurrentFieldValue()}};
+    protected Analyzer getIndexAnalyzer(String field) {
+        return analyzer;
     }
 
-    /*
-     Our own method that returns the field values, which relies on the content that was provided when creating the highlighter.
-     Supports per value discrete highlighting calling the highlightDoc method multiple times, one per value.
-    */
-    protected String loadCurrentFieldValue() {
-        if (currentValueIndex < fieldValues.length) {
-            return fieldValues[currentValueIndex];
-        }
-        throw new IllegalArgumentException("No more values to return");
+    @Override
+    protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
+        //we only highlight one field, one document at a time
+        assert fields.length == 1;
+        assert docids.length == 1;
+        String rawValue = Strings.collectionToDelimitedString(fieldValues, String.valueOf(getMultiValuedSeparator(fields[0])));
+        String fieldValue = rawValue.substring(0, Math.min(rawValue.length(), maxLength));
+        return new String[][]{new String[]{fieldValue}};
     }
 }
diff --git a/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java b/src/main/java/org/apache/lucene/search/postingshighlight/Snippet.java
@@ -22,7 +22,7 @@
  * Represents a scored highlighted snippet.
  * It's our own arbitrary object that we get back from the postings highlighter when highlighting a document.
  * Every snippet contains its formatted text and its score.
- * The score is needed since we highlight every single value separately and we might want to return snippets sorted by score.
+ * The score is needed in case we want to sort snippets by score, they get sorted by position in the text by default.
  */
 public class Snippet {