diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc index 591a4b20ef1c9..300b66d8f9d85 100644 --- a/docs/reference/search/request/highlighting.asciidoc +++ b/docs/reference/search/request/highlighting.asciidoc @@ -75,6 +75,8 @@ will be used instead of the plain highlighter. The fast vector highlighter: `fragment_offset` (see <>) * Requires setting `term_vector` to `with_positions_offsets` which increases the size of the index +* Can combine matches from multiple fields into one result. See + `matched_fields` Here is an example of setting the `content` field to allow for highlighting using the fast vector highlighter on it (this will cause @@ -102,7 +104,7 @@ The following is an example that forces the use of the plain highlighter: "query" : {...}, "highlight" : { "fields" : { - "content" : { "type" : "plain"} + "content" : {"type" : "plain"} } } } @@ -385,3 +387,124 @@ defined in it. It defaults to `.,!? \t\n`. The `boundary_max_scan` allows to control how far to look for boundary characters, and defaults to `20`. + + +added[0.90.8] +[[matched-fields]] +==== Matched Fields +The Fast Vector Highlighter can combine matches on multiple fields to +highlight a single field using `matched_fields`. This is most +intuitive for multifields that analyze the same string in different +ways. All `matched_fields` must have `term_vector` set to +`with_positions_offsets` but only the field to which the matches are +combined is loaded so only that field would benefit from having +`store` set to `yes`. + +In the following examples `content` is analyzed by the `english` +analyzer and `content.plain` is analyzed by the `standard` analyzer. 
+ +[source,js] +-------------------------------------------------- +{ + "query": { + "query_string": { + "query": "content.plain:running scissors", + "fields": ["content"] + } + }, + "highlight": { + "order": "score", + "fields": { + "content": { + "matched_fields": ["content", "content.plain"], + "type" : "fvh" + } + } + } +} +-------------------------------------------------- +The above matches both "run with scissors" and "running with scissors" +and would highlight "running" and "scissors" but not "run". If both +phrases appear in a large document then "running with scissors" is +sorted above "run with scissors" in the fragments list because there +are more matches in that fragment. + +[source,js] +-------------------------------------------------- +{ + "query": { + "query_string": { + "query": "running scissors", + "fields": ["content", "content.plain^10"] + } + }, + "highlight": { + "order": "score", + "fields": { + "content": { + "matched_fields": ["content", "content.plain"], + "type" : "fvh" + } + } + } +} +-------------------------------------------------- +The above highlights "run" as well as "running" and "scissors" but +still sorts "running with scissors" above "run with scissors" because +the plain match ("running") is boosted. + +[source,js] +-------------------------------------------------- +{ + "query": { + "query_string": { + "query": "running scissors", + "fields": ["content", "content.plain^10"] + } + }, + "highlight": { + "order": "score", + "fields": { + "content": { + "matched_fields": ["content.plain"], + "type" : "fvh" + } + } + } +} +-------------------------------------------------- +The above query wouldn't highlight "run" or "scissor" but shows that +it is just fine not to list the field to which the matches are combined +(`content`) in the matched fields. + +[NOTE] +Technically it is also fine to add fields to `matched_fields` that +don't share the same underlying string as the field to which the matches +are combined. 
The results might not make much sense and if one of the +matches is off the end of the text then the whole query will fail. + +[NOTE] +=================================================================== +There is a small amount of overhead involved with setting +`matched_fields` to a non-empty array so always prefer +[source,js] +-------------------------------------------------- + "highlight": { + "fields": { + "content": {} + } + } +-------------------------------------------------- +to +[source,js] +-------------------------------------------------- + "highlight": { + "fields": { + "content": { + "matched_fields": ["content"], + "type" : "fvh" + } + } + } +-------------------------------------------------- +=================================================================== diff --git a/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java b/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java index 3342388ff72e5..ca2d3a56feb7e 100644 --- a/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java +++ b/src/main/java/org/elasticsearch/search/highlight/FastVectorHighlighter.java @@ -145,8 +145,14 @@ public HighlightField highlight(HighlighterContext highlighterContext) { int numberOfFragments = field.numberOfFragments() == 0 ? Integer.MAX_VALUE : field.numberOfFragments(); int fragmentCharSize = field.numberOfFragments() == 0 ? Integer.MAX_VALUE : field.fragmentCharSize(); // we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible - fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.names().indexName(), fragmentCharSize, numberOfFragments, - entry.fragListBuilder, entry.fragmentsBuilder, field.preTags(), field.postTags(), encoder); + // Only send matched fields if they were requested to save time. 
+ if (field.matchedFields() != null && !field.matchedFields().isEmpty()) { + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.names().indexName(), field.matchedFields(), fragmentCharSize, + numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.preTags(), field.postTags(), encoder); + } else { + fragments = cache.fvh.getBestFragments(fieldQuery, hitContext.reader(), hitContext.docId(), mapper.names().indexName(), fragmentCharSize, + numberOfFragments, entry.fragListBuilder, entry.fragmentsBuilder, field.preTags(), field.postTags(), encoder); + } if (fragments != null && fragments.length > 0) { return new HighlightField(field.field(), StringText.convertFromStringArray(fragments)); diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlightBuilder.java b/src/main/java/org/elasticsearch/search/highlight/HighlightBuilder.java index afef2d73716d4..42099f79e7768 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlightBuilder.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlightBuilder.java @@ -315,6 +315,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws if (field.noMatchSize != null) { builder.field("no_match_size", field.noMatchSize); } + if (field.matchedFields != null) { + builder.field("matched_fields", field.matchedFields); + } if (field.options != null && field.options.size() > 0) { builder.field("options", field.options); } @@ -344,6 +347,7 @@ public static class Field { String fragmenter; QueryBuilder highlightQuery; Integer noMatchSize; + String[] matchedFields; Map options; public Field(String name) { @@ -465,5 +469,15 @@ public Field options(Map options) { this.options = options; return this; } + + /** + * Set the matched fields to highlight against this field data. Default to null, meaning just + * the named field. 
If you provide a list of fields here then don't forget to include name as + * it is not automatically included. + */ + public Field matchedFields(String... matchedFields) { + this.matchedFields = matchedFields; + return this; + } } } diff --git a/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java b/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java index bd9ad13219302..c022fb4da77e0 100644 --- a/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java +++ b/src/main/java/org/elasticsearch/search/highlight/HighlighterParseElement.java @@ -20,6 +20,7 @@ package org.elasticsearch.search.highlight; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import org.apache.lucene.search.Query; import org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner; import org.elasticsearch.common.xcontent.XContentParser; @@ -29,6 +30,7 @@ import java.util.List; import java.util.Map; +import java.util.Set; import static com.google.common.collect.Lists.newArrayList; @@ -162,6 +164,12 @@ public void parse(XContentParser parser, SearchContext context) throws Exception postTagsList.add(parser.text()); } field.postTags(postTagsList.toArray(new String[postTagsList.size()])); + } else if ("matched_fields".equals(fieldName) || "matchedFields".equals(fieldName)) { + Set matchedFields = Sets.newHashSet(); + while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { + matchedFields.add(parser.text()); + } + field.matchedFields(matchedFields); } } else if (token.isValue()) { if ("fragment_size".equals(fieldName) || "fragmentSize".equals(fieldName)) { diff --git a/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java b/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java index 11df3aa0dfdaf..157f01251a24a 100644 --- a/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java +++ 
b/src/main/java/org/elasticsearch/search/highlight/SearchContextHighlight.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; +import java.util.Set; /** * @@ -73,6 +74,8 @@ public static class Field { private int noMatchSize = -1; + private Set matchedFields; + private Map options; public Field(String field) { @@ -203,6 +206,14 @@ public void noMatchSize(int noMatchSize) { this.noMatchSize = noMatchSize; } + public Set matchedFields() { + return matchedFields; + } + + public void matchedFields(Set matchedFields) { + this.matchedFields = matchedFields; + } + public Map options() { return options; } diff --git a/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java index c03f12f87aa6b..ae5dc9f6a0f95 100644 --- a/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java +++ b/src/test/java/org/elasticsearch/search/highlight/HighlighterSearchTests.java @@ -27,15 +27,13 @@ import org.elasticsearch.common.settings.ImmutableSettings.Builder; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.index.query.FilterBuilders; -import org.elasticsearch.index.query.IdsQueryBuilder; -import org.elasticsearch.index.query.MatchQueryBuilder; +import org.elasticsearch.index.query.*; import org.elasticsearch.index.query.MatchQueryBuilder.Operator; import org.elasticsearch.index.query.MatchQueryBuilder.Type; -import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.rest.RestStatus; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.highlight.HighlightBuilder.Field; import org.elasticsearch.test.ElasticsearchIntegrationTest; import org.hamcrest.Matcher; import org.junit.Test; @@ -630,6 +628,165 @@ public void testFastVectorHighlighter() throws Exception { 
assertHighlight(searchResponse, 0, "field2", 0, 1, equalTo("The quick brown fox jumps over the lazy dog")); } + @Test + public void testMatchedFieldsFvhRequireFieldMatch() throws Exception { + checkMatchedFieldsCase(true); + } + + @Test + public void testMatchedFieldsFvhNoRequireFieldMatch() throws Exception { + checkMatchedFieldsCase(false); + } + + private void checkMatchedFieldsCase(boolean requireFieldMatch) throws Exception { + client().admin().indices().prepareCreate("test") + .addMapping("type1", XContentFactory.jsonBuilder().startObject().startObject("type1") + .startObject("properties") + .startObject("foo") + .field("type", "multi_field") + .startObject("fields") + .startObject("foo") + .field("type", "string") + .field("termVector", "with_positions_offsets") + .field("store", "yes") + .field("analyzer", "english") + .endObject() + .startObject("plain") + .field("type", "string") + .field("termVector", "with_positions_offsets") + .field("analyzer", "standard") + .endObject() + .endObject() + .endObject() + .startObject("bar") + .field("type", "multi_field") + .startObject("fields") + .startObject("bar") + .field("type", "string") + .field("termVector", "with_positions_offsets") + .field("store", "yes") + .field("analyzer", "english") + .endObject() + .startObject("plain") + .field("type", "string") + .field("termVector", "with_positions_offsets") + .field("analyzer", "standard") + .endObject() + .endObject() + .endObject() + .endObject()).execute().actionGet(); + ensureGreen(); + + index("test", "type1", "1", + "foo", "running with scissors"); + index("test", "type1", "2", + "foo", "cat cat junk junk junk junk junk junk junk cats junk junk", + "bar", "cat cat junk junk junk junk junk junk junk cats junk junk"); + index("test", "type1", "3", + "foo", "weird", + "bar", "result"); + refresh(); + + Field fooField = new Field("foo").numOfFragments(1).order("score").fragmentSize(25) + .highlighterType("fvh").requireFieldMatch(requireFieldMatch); + Field 
barField = new Field("bar").numOfFragments(1).order("score").fragmentSize(25) + .highlighterType("fvh").requireFieldMatch(requireFieldMatch); + SearchRequestBuilder req = client().prepareSearch("test").addHighlightedField(fooField); + + // First check highlighting without any matched fields set + SearchResponse resp = req.setQuery(queryString("running scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // And that matching a subfield doesn't automatically highlight it + resp = req.setQuery(queryString("foo.plain:running scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // Add the subfield to the list of matched fields but don't match it. Everything should still work + // like before we added it. + fooField.matchedFields("foo", "foo.plain"); + resp = req.setQuery(queryString("running scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // Now make half the matches come from the stored field and half from just a matched field. + resp = req.setQuery(queryString("foo.plain:running scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // Now remove the stored field from the matched field list. That should work too. + fooField.matchedFields("foo.plain"); + resp = req.setQuery(queryString("foo.plain:running scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // Now make sure boosted fields don't blow up when matched fields is both the subfield and stored field. + fooField.matchedFields("foo", "foo.plain"); + resp = req.setQuery(queryString("foo.plain:running^5 scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // Now just all matches are against the matched field. This still returns highlighting. 
+ resp = req.setQuery(queryString("foo.plain:running foo.plain:scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // And all matched field via the queryString's field parameter, just in case + resp = req.setQuery(queryString("running scissors").field("foo.plain")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // Finding the same string two ways is ok too + resp = req.setQuery(queryString("run foo.plain:running^5 scissors").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + + // But we use the best found score when sorting fragments + resp = req.setQuery(queryString("cats foo.plain:cats^5").field("foo")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("junk junk cats junk junk")); + + // which can also be written by searching on the subfield + resp = req.setQuery(queryString("cats").field("foo").field("foo.plain^5")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("junk junk cats junk junk")); + + // Speaking of two fields, you can have two fields, only one of which has matchedFields enabled + QueryBuilder twoFieldsQuery = queryString("cats").field("foo").field("foo.plain^5") + .field("bar").field("bar.plain^5"); + resp = req.setQuery(twoFieldsQuery).addHighlightedField(barField).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("junk junk cats junk junk")); + assertHighlight(resp, 0, "bar", 0, equalTo("cat cat junk junk junk junk")); + + // And you can enable matchedField highlighting on both + barField.matchedFields("bar", "bar.plain"); + resp = req.get(); + assertHighlight(resp, 0, "foo", 0, equalTo("junk junk cats junk junk")); + assertHighlight(resp, 0, "bar", 0, equalTo("junk junk cats junk junk")); + + // Setting a matchedField that isn't searched/doesn't exist is simply ignored. 
+ barField.matchedFields("bar", "candy"); + resp = req.get(); + assertHighlight(resp, 0, "foo", 0, equalTo("junk junk cats junk junk")); + assertHighlight(resp, 0, "bar", 0, equalTo("cat cat junk junk junk junk")); + + // If the stored field doesn't have a value it doesn't matter what you match, you get nothing. + barField.matchedFields("bar", "foo.plain"); + resp = req.setQuery(queryString("running scissors").field("foo.plain").field("bar")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + assertThat(resp.getHits().getAt(0).getHighlightFields(), not(hasKey("bar"))); + + // If the stored field is found but the matched field isn't then you don't get a result either. + fooField.matchedFields("bar.plain"); + resp = req.setQuery(queryString("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain")).get(); + assertThat(resp.getHits().getAt(0).getHighlightFields(), not(hasKey("foo"))); + + // But if you add the stored field to the list of matched fields then you'll get a result again + fooField.matchedFields("foo", "bar.plain"); + resp = req.setQuery(queryString("running scissors").field("foo").field("foo.plain").field("bar").field("bar.plain")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("running with scissors")); + assertThat(resp.getHits().getAt(0).getHighlightFields(), not(hasKey("bar"))); + + // You _can_ highlight fields that aren't subfields of one another. + resp = req.setQuery(queryString("weird").field("foo").field("foo.plain").field("bar").field("bar.plain")).get(); + assertHighlight(resp, 0, "foo", 0, equalTo("weird")); + assertHighlight(resp, 0, "bar", 0, equalTo("result")); + + //But be careful. It'll blow up if there is a result past the end of the field. 
+ resp = req.setQuery(queryString("result").field("foo").field("foo.plain").field("bar").field("bar.plain")).get(); + assertThat("Expected ShardFailures", resp.getShardFailures().length, greaterThan(0)); + } + @Test @Slow public void testFastVectorHighlighterManyDocs() throws Exception {