Add support for combining fields to the FVH

The Fast Vector Highlighter can combine matches on multiple fields to highlight a single field using `matched_fields`. This is most intuitive for multifields that analyze the same string in different ways. Example: { "query": { "query_string": { "query": "content.plain:running scissors", "fields": ["content"] } }, "highlight": { "order": "score", "fields": { "content": { "matched_fields": ["content", "content.plain"], "type" : "fvh" } } } } Closes elastic#3750
nik9000 · Nov 30, 2013 · bd5af73 · bd5af73
1 parent 88d829a
commit bd5af73
Show file tree

Hide file tree

Showing 6 changed files with 326 additions and 7 deletions.
diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc
@@ -75,6 +75,8 @@ will be used instead of the plain highlighter.  The fast vector highlighter:
  `fragment_offset` (see <<boundary-characters,below>>)
 * Requires setting `term_vector` to `with_positions_offsets` which
   increases the size of the index
+* Can combine matches from multiple fields into one result.  See
+  `matched_fields`
 
 Here is an example of setting the `content` field to allow for
 highlighting using the fast vector highlighter on it (this will cause
@@ -102,7 +104,7 @@ The following is an example that forces the use of the plain highlighter:
     "query" : {...},
     "highlight" : {
         "fields" : {
-            "content" : { "type" : "plain"}
+            "content" : {"type" : "plain"}
         }
     }
 }
@@ -385,3 +387,124 @@ defined in it. It defaults to `.,!? \t\n`.
 
 The `boundary_max_scan` allows to control how far to look for boundary
 characters, and defaults to `20`.
+
+
+added[0.90.8]
+[[matched-fields]]
+==== Matched Fields
+The Fast Vector Highlighter can combine matches on multiple fields to
+highlight a single field using `matched_fields`.  This is most
+intuitive for multifields that analyze the same string in different
+ways.  All `matched_fields` must have `term_vector` set to
+`with_positions_offsets`, but only the field to which the matches are
+combined is loaded, so only that field benefits from having
+`store` set to `yes`.
+
+In the following examples `content` is analyzed by the `english`
+analyzer and `content.plain` is analyzed by the `standard` analyzer.
+
+[source,js]
+--------------------------------------------------
+{
+    "query": {
+        "query_string": {
+            "query": "content.plain:running scissors",
+            "fields": ["content"]
+        }
+    },
+    "highlight": {
+        "order": "score",
+        "fields": {
+            "content": {
+                "matched_fields": ["content", "content.plain"],
+                "type" : "fvh"
+            }
+        }
+    }
+}
+--------------------------------------------------
+The above matches both "run with scissors" and "running with scissors"
+and would highlight "running" and "scissors" but not "run". If both
+phrases appear in a large document then "running with scissors" is
+sorted above "run with scissors" in the fragments list because there
+are more matches in that fragment.
+
+[source,js]
+--------------------------------------------------
+{
+    "query": {
+        "query_string": {
+            "query": "running scissors",
+            "fields": ["content", "content.plain^10"]
+        }
+    },
+    "highlight": {
+        "order": "score",
+        "fields": {
+            "content": {
+                "matched_fields": ["content", "content.plain"],
+                "type" : "fvh"
+            }
+        }
+    }
+}
+--------------------------------------------------
+The above highlights "run" as well as "running" and "scissors" but
+still sorts "running with scissors" above "run with scissors" because
+the plain match ("running") is boosted.
+
+[source,js]
+--------------------------------------------------
+{
+    "query": {
+        "query_string": {
+            "query": "running scissors",
+            "fields": ["content", "content.plain^10"]
+        }
+    },
+    "highlight": {
+        "order": "score",
+        "fields": {
+            "content": {
+                "matched_fields": ["content.plain"],
+                "type" : "fvh"
+            }
+        }
+    }
+}
+--------------------------------------------------
+The above query wouldn't highlight "run" or "scissor" but shows that
+it is just fine not to list the field to which the matches are combined
+(`content`) in the matched fields.
+
+[NOTE]
+Technically it is also fine to add fields to `matched_fields` that
+don't share the same underlying string as the field to which the matches
+are combined.  The results might not make much sense and if one of the
+matches is off the end of the text then the whole query will fail.
+
+[NOTE]
+===================================================================
+There is a small amount of overhead involved with setting
+`matched_fields` to a non-empty array so always prefer
+[source,js]
+--------------------------------------------------
+    "highlight": {
+        "fields": {
+            "content": {}
+        }
+    }
+--------------------------------------------------
+to
+[source,js]
+--------------------------------------------------
+    "highlight": {
+        "fields": {
+            "content": {
+                "matched_fields": ["content"],
+                "type" : "fvh"
+            }
+        }
+    }
+--------------------------------------------------
+===================================================================
diff --git a/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java b/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java
@@ -145,8 +145,14 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             int numberOfFragments = field.numberOfFragments() == 0 ? Integer.MAX_VALUE : field.numberOfFragments();
             int fragmentCharSize = field.numberOfFragments() == 0 ? Integer.MAX_VALUE : field.fragmentCharSize();
             // we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
-            fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.names().indexName(), fragmentCharSize, numberOfFragments,
-                    entry.fragListBuilder, entry.fragmentsBuilder, field.preTags(), field.postTags(), encoder);
+            // Only send matched fields if they were requested to save time.
+            if (field.matchedFields() != null && !field.matchedFields().isEmpty()) {
+                fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.names().indexName(), field.matchedFields(), fragmentCharSize,
+                        numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.preTags(), field.postTags(), encoder);
+            } else {
+                fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.names().indexName(), fragmentCharSize,
+                        numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.preTags(), field.postTags(), encoder);
+            }
 
             if (fragments != null && fragments.length > 0) {
                 return new HighlightField(field.field(), StringText.convertFromStringArray(fragments));

diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlightBuilder.java b/src/main/java/org/elasticsearch/search/highlight/HighlightBuilder.java
@@ -315,6 +315,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
                 if (field.noMatchSize != null) {
                     builder.field("no_match_size", field.noMatchSize);
                 }
+                if (field.matchedFields != null) {
+                    builder.field("matched_fields", field.matchedFields);
+                }
                 if (field.options != null && field.options.size() > 0) {
                     builder.field("options", field.options);
                 }
@@ -344,6 +347,7 @@ public static class Field {
         String fragmenter;
         QueryBuilder highlightQuery;
         Integer noMatchSize;
+        String[] matchedFields;
         Map<String, Object> options;
 
         public Field(String name) {
@@ -465,5 +469,15 @@ public Field options(Map<String, Object> options) {
             this.options = options;
             return this;
         }
+
+        /**
+         * Sets the fields whose matches are combined to highlight this field's data.  Defaults to
+         * null, meaning just the named field.  If you provide a list of fields here then don't
+         * forget to include this field's own name, as it is not automatically included.
+         *
+         * @param matchedFields names of the fields whose matches should be combined
+         * @return this field, to allow call chaining
+         */
+        public Field matchedFields(String... matchedFields) {
+            this.matchedFields = matchedFields;
+            return this;
+        }
     }
 }
diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java b/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java
@@ -20,6 +20,7 @@
 package org.elasticsearch.search.highlight;
 
 import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner;
 import org.elasticsearch.common.xcontent.XContentParser;
@@ -29,6 +30,7 @@
 
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import static com.google.common.collect.Lists.newArrayList;
 
@@ -162,6 +164,12 @@ public void parse(XContentParser parser, SearchContext context) throws Exception
                                             postTagsList.add(parser.text());
                                         }
                                         field.postTags(postTagsList.toArray(new String[postTagsList.size()]));
+                                    } else if ("matched_fields".equals(fieldName) || "matchedFields".equals(fieldName)) {
+                                        Set<String> matchedFields = Sets.newHashSet();
+                                        while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
+                                            matchedFields.add(parser.text());
+                                        }
+                                        field.matchedFields(matchedFields);
                                     }
                                 } else if (token.isValue()) {
                                     if ("fragment_size".equals(fieldName) || "fragmentSize".equals(fieldName)) {

diff --git a/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java b/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java
@@ -23,6 +23,7 @@
 
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 /**
  *
@@ -73,6 +74,8 @@ public static class Field {
 
         private int noMatchSize = -1;
 
+        private Set<String> matchedFields;
+
         private Map<String, Object> options;
 
         public Field(String field) {
@@ -203,6 +206,14 @@ public void noMatchSize(int noMatchSize) {
             this.noMatchSize = noMatchSize;
         }
 
+        /**
+         * Returns the matched fields assigned to this field; may be null when none were assigned.
+         */
+        public Set<String> matchedFields() {
+            return matchedFields;
+        }
+
+        /**
+         * Assigns the set of fields whose matches are combined when highlighting this field.
+         */
+        public void matchedFields(Set<String> matchedFields) {
+            this.matchedFields = matchedFields;
+        }
+
         public Map<String, Object> options() {
             return options;
         }