diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
index 3647a6bd61c2f..b90c45de1d8ab 100644
--- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
+++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc
@@ -449,67 +449,10 @@ WARNING: Use of background filters will slow the query as each term's postings m
 ===== Filtering Values
 
 It is possible (although rarely required) to filter the values for which buckets will be created. This can be done using the `include` and
-`exclude` parameters which are based on regular expressions. This functionality mirrors the features
-offered by the `terms` aggregation.
+`exclude` parameters which are based on a regular expression string or arrays of exact terms. This functionality mirrors the features
+described in the <<search-aggregations-bucket-terms-aggregation,terms aggregation>> documentation.
 
-[source,js]
---------------------------------------------------
-{
-    "aggs" : {
-        "tags" : {
-            "significant_terms" : {
-                "field" : "tags",
-                "include" : ".*sport.*",
-                "exclude" : "water_.*"
-            }
-        }
-    }
-}
---------------------------------------------------
-
-In the above example, buckets will be created for all the tags that has the word `sport` in them, except those starting
-with `water_` (so the tag `water_sports` will no be aggregated). The `include` regular expression will determine what
-values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
-both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`.
-
-The regular expression are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
-and as such, they it is also possible to pass in flags that will determine how the compiled regular expression will work:
-
-[source,js]
---------------------------------------------------
-{
-    "aggs" : {
-        "tags" : {
-             "terms" : {
-                 "field" : "tags",
-                 "include" : {
-                     "pattern" : ".*sport.*",
-                     "flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
-                 },
-                 "exclude" : {
-                     "pattern" : "water_.*",
-                     "flags" : "CANON_EQ|CASE_INSENSITIVE"
-                 }
-             }
-        }
-    }
-}
---------------------------------------------------
-
-<1> the flags are concatenated using the `|` character as a separator
-
-The possible flags that can be used are:
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
-
 ===== Execution hint
 
 There are two mechanisms by which terms aggregations can be executed: either by using field values directly in order to aggregate
diff --git a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc
index 4dab0f3d3455b..025e6034c86da 100644
--- a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc
+++ b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc
@@ -418,7 +418,7 @@ Generating the terms using a script:
 ==== Filtering Values
 
 It is possible to filter the values for which buckets will be created. This can be done using the `include` and
-`exclude` parameters which are based on regular expressions.
+`exclude` parameters which are based on regular expression strings or arrays of exact values.
 
 [source,js]
 --------------------------------------------------
@@ -477,6 +477,29 @@ http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CA
 http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
 http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
 
+For matching based on exact values the `include` and `exclude` parameters can simply take an array of
+strings that represent the terms as they are found in the index:
+
+[source,js]
+--------------------------------------------------
+{
+    "aggs" : {
+        "JapaneseCars" : {
+             "terms" : {
+                 "field" : "make",
+                 "include" : ["mazda", "honda"]
+             }
+         },
+        "ActiveCarManufacturers" : {
+             "terms" : {
+                 "field" : "make",
+                 "exclude" : ["rover", "jensen"]
+             }
+         }
+    }
+}
+--------------------------------------------------
+
 ==== Multi-field terms aggregation
 
 The `terms` aggregation does not support collecting terms from multiple fields
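For Java API users, the same request as the JSON example above can be built with the array-based `include`/`exclude` methods added to `TermsBuilder` later in this diff. This is only a minimal sketch: it assumes a 1.x-style connected `Client` instance, and the `cars` index and `make` field are the hypothetical names from the documentation example.

[source,java]
--------------------------------------------------
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.search.aggregations.AggregationBuilders;

public class ExactTermFilteringExample {

    // Builds the same two terms aggregations as the JSON example above.
    public static SearchResponse japaneseAndActiveMakes(Client client) {
        return client.prepareSearch("cars")                                  // hypothetical index
                .setSize(0)                                                  // only the aggregations are of interest
                .addAggregation(AggregationBuilders.terms("JapaneseCars")
                        .field("make")
                        .include(new String[] { "mazda", "honda" }))         // exact-term include
                .addAggregation(AggregationBuilders.terms("ActiveCarManufacturers")
                        .field("make")
                        .exclude(new String[] { "rover", "jensen" }))        // exact-term exclude
                .execute().actionGet();
    }
}
--------------------------------------------------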
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java
index f1c8c23befac0..42cee57271346 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.search.aggregations.bucket.terms;
 
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.search.aggregations.Aggregator;
 import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
@@ -43,6 +44,8 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
     private String executionHint;
     private SubAggCollectionMode collectionMode;
     private Boolean showTermDocCountError;
+    private String[] includeTerms = null;
+    private String[] excludeTerms = null;
 
     /**
      * Sole constructor.
@@ -101,10 +104,24 @@ public TermsBuilder include(String regex) {
      * @see java.util.regex.Pattern#compile(String, int)
      */
     public TermsBuilder include(String regex, int flags) {
+        if (includeTerms != null) {
+            throw new ElasticsearchIllegalArgumentException("include clause must be an array of strings or a regex, not both");
+        }
         this.includePattern = regex;
         this.includeFlags = flags;
         return this;
     }
+
+    /**
+     * Define a set of terms that should be aggregated.
+     */
+    public TermsBuilder include(String[] terms) {
+        if (includePattern != null) {
+            throw new ElasticsearchIllegalArgumentException("include clause must be an array of strings or a regex, not both");
+        }
+        this.includeTerms = terms;
+        return this;
+    }
 
     /**
      * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
@@ -123,10 +140,25 @@ public TermsBuilder exclude(String regex) {
      * @see java.util.regex.Pattern#compile(String, int)
      */
     public TermsBuilder exclude(String regex, int flags) {
+        if (excludeTerms != null) {
+            throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
+        }
         this.excludePattern = regex;
         this.excludeFlags = flags;
         return this;
     }
+
+    /**
+     * Define a set of terms that should not be aggregated.
+     */
+    public TermsBuilder exclude(String[] terms) {
+        if (excludePattern != null) {
+            throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
+        }
+        this.excludeTerms = terms;
+        return this;
+    }
 
     /**
      * When using scripts, the value type indicates the types of the values the script is generating.
@@ -189,6 +221,9 @@ protected XContentBuilder doInternalXContent(XContentBuilder builder, Params par
         if (collectionMode != null) {
             builder.field(Aggregator.COLLECT_MODE.getPreferredName(), collectionMode.parseField().getPreferredName());
         }
+        if (includeTerms != null) {
+            builder.array("include", includeTerms);
+        }
         if (includePattern != null) {
             if (includeFlags == 0) {
                 builder.field("include", includePattern);
@@ -199,6 +234,9 @@ protected XContentBuilder doInternalXContent(XContentBuilder builder, Params par
                         .endObject();
             }
         }
+        if (excludeTerms != null) {
+            builder.array("exclude", excludeTerms);
+        }
         if (excludePattern != null) {
             if (excludeFlags == 0) {
                 builder.field("exclude", excludePattern);
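The guards added above mean the regex form and the array form of `include` (or `exclude`) cannot be mixed on the same builder. A small sketch of that behaviour follows; the aggregation name, field, and term values are hypothetical, and it assumes the single-argument `include(String regex)` delegates to the flags variant as in the existing builder code.

[source,java]
--------------------------------------------------
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder;

public class IncludeGuardExample {

    public static void main(String[] args) {
        // Start with an exact-term include list (hypothetical tag values).
        TermsBuilder tags = AggregationBuilders.terms("tags")
                .field("tags")
                .include(new String[] { "water_sports", "motor_sports" });
        try {
            // Now try to add a regex include on the same builder.
            tags.include(".*sport.*");
        } catch (ElasticsearchIllegalArgumentException e) {
            // The builder rejects mixing the two forms:
            // "include clause must be an array of strings or a regex, not both"
            System.out.println(e.getMessage());
        }
    }
}
--------------------------------------------------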
diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java
index 7d7432ca6c548..e48b13b9965c4 100644
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java
@@ -21,8 +21,10 @@
 import org.apache.lucene.index.RandomAccessOrds;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.LongBitSet;
+import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.common.xcontent.XContentParser;
@@ -31,6 +33,8 @@
 import org.elasticsearch.search.internal.SearchContext;
 
 import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -43,51 +47,127 @@ public class IncludeExclude {
 
     private final Matcher include;
     private final Matcher exclude;
     private final CharsRefBuilder scratch = new CharsRefBuilder();
+    private Set<BytesRef> includeValues;
+    private Set<BytesRef> excludeValues;
+    private final boolean hasRegexTest;
 
     /**
      * @param include   The regular expression pattern for the terms to be included
-     *                  (may only be {@code null} if {@code exclude} is not {@code null}
+     *                  (may only be {@code null} if one of the other arguments is non-null)
+     * @param includeValues   The terms to be included
+     *                  (may only be {@code null} if one of the other arguments is non-null)
      * @param exclude   The regular expression pattern for the terms to be excluded
-     *                  (may only be {@code null} if {@code include} is not {@code null}
+     *                  (may only be {@code null} if one of the other arguments is non-null)
+     * @param excludeValues   The terms to be excluded
+     *                  (may only be {@code null} if one of the other arguments is non-null)
      */
-    public IncludeExclude(Pattern include, Pattern exclude) {
-        assert include != null || exclude != null : "include & exclude cannot both be null"; // otherwise IncludeExclude object should be null
+    public IncludeExclude(Pattern include, Pattern exclude, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
+        assert includeValues != null || include != null ||
+                exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null
         this.include = include != null ? include.matcher("") : null;
         this.exclude = exclude != null ? exclude.matcher("") : null;
+        hasRegexTest = include != null || exclude != null;
+        this.includeValues = includeValues;
+        this.excludeValues = excludeValues;
     }
 
     /**
      * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
     */
    public boolean accept(BytesRef value) {
-        scratch.copyUTF8Bytes(value);
-        if (include == null) {
-            // exclude must not be null
-            return !exclude.reset(scratch.get()).matches();
-        }
-        if (!include.reset(scratch.get()).matches()) {
-            return false;
+
+        if (hasRegexTest) {
+            // We need to perform UTF8 to UTF16 conversion for use in the regex matching
+            scratch.copyUTF8Bytes(value);
         }
-        if (exclude == null) {
+        return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get());
+    }
+
+    private boolean isIncluded(BytesRef value, CharsRef utf16Chars) {
+
+        if ((includeValues == null) && (include == null)) {
+            // No include criteria to be tested.
             return true;
         }
-        return !exclude.reset(scratch.get()).matches();
+
+        if (include != null) {
+            if (include.reset(utf16Chars).matches()) {
+                return true;
+            }
+        }
+        if (includeValues != null) {
+            if (includeValues.contains(value)) {
+                return true;
+            }
+        }
+        // Some include criteria was tested but no match was found
+        return false;
+    }
+
+    private boolean isExcluded(BytesRef value, CharsRef utf16Chars) {
+        if (exclude != null) {
+            if (exclude.reset(utf16Chars).matches()) {
+                return true;
+            }
+        }
+        if (excludeValues != null) {
+            if (excludeValues.contains(value)) {
+                return true;
+            }
+        }
+        // No exclude criteria was tested or no match was found
+        return false;
     }
 
     /**
      * Computes which global ordinals are accepted by this IncludeExclude instance.
     */
    public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) {
-        TermsEnum globalTermsEnum = valueSource.globalOrdinalsValues().termsEnum();
         LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
-        try {
-            for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
-                if (accept(term)) {
-                    acceptedGlobalOrdinals.set(globalTermsEnum.ord());
+        // There are 3 ways of populating this bitset:
+        // 1) Looking up the global ordinals for known "include" terms
+        // 2) Looking up the global ordinals for known "exclude" terms
+        // 3) Traversing the term enum for all terms and running each term past the regexes
+        // Option 3 is known to be very slow in the case of high-cardinality fields and
+        // should be avoided if possible.
+        if (includeValues != null) {
+            // optimize for the case where the set of accepted values is a set
+            // of known terms, not a regex that would have to be tested against all terms in the index
+            for (BytesRef includeValue : includeValues) {
+                // We need to perform UTF8 to UTF16 conversion for use in the regex matching
+                scratch.copyUTF8Bytes(includeValue);
+                if (!isExcluded(includeValue, scratch.get())) {
+                    long ord = globalOrdinals.lookupTerm(includeValue);
+                    if (ord >= 0) {
+                        acceptedGlobalOrdinals.set(ord);
+                    }
+                }
+            }
+        } else {
+            if (hasRegexTest) {
+                // The include clause is a regex, or we only have regex-based excludes - we need to take the
+                // potentially slow path of testing every term in the index against the regexes.
+                TermsEnum globalTermsEnum = valueSource.globalOrdinalsValues().termsEnum();
+                try {
+                    for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
+                        if (accept(term)) {
+                            acceptedGlobalOrdinals.set(globalTermsEnum.ord());
+                        }
+                    }
+                } catch (IOException e) {
+                    throw ExceptionsHelper.convertToElastic(e);
+                }
+            } else {
+                // We only have a set of known values to exclude - mark all ordinals as accepted and then
+                // clear the ordinals of the known excluded terms.
+                acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
+                for (BytesRef excludeValue : excludeValues) {
+                    long ord = globalOrdinals.lookupTerm(excludeValue);
+                    if (ord >= 0) {
+                        acceptedGlobalOrdinals.clear(ord);
+                    }
+                }
             }
+        }
-        } catch (IOException e) {
-            throw ExceptionsHelper.convertToElastic(e);
-        }
         return acceptedGlobalOrdinals;
     }
@@ -102,6 +182,8 @@ public static class Parser {
         int includeFlags = 0; // 0 means no flags
         String exclude = null;
         int excludeFlags = 0; // 0 means no flags
+        Set<BytesRef> includeValues;
+        Set<BytesRef> excludeValues;
 
         public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) {
             this.aggName = aggName;
@@ -121,6 +203,18 @@ public boolean token(String currentFieldName, XContentParser.Token token, XConte
                 }
                 return true;
             }
+
+            if (token == XContentParser.Token.START_ARRAY) {
+                if ("include".equals(currentFieldName)) {
+                    includeValues = parseArrayToSet(parser);
+                    return true;
+                }
+                if ("exclude".equals(currentFieldName)) {
+                    excludeValues = parseArrayToSet(parser);
+                    return true;
+                }
+                return false;
+            }
 
             if (token == XContentParser.Token.START_OBJECT) {
                 if ("include".equals(currentFieldName)) {
@@ -163,14 +257,27 @@ public boolean token(String currentFieldName, XContentParser.Token token, XConte
 
             return false;
         }
-
+
+        private Set<BytesRef> parseArrayToSet(XContentParser parser) throws IOException {
+            final Set<BytesRef> set = new HashSet<>();
+            if (parser.currentToken() != XContentParser.Token.START_ARRAY) {
+                throw new ElasticsearchParseException("Missing start of array in include/exclude clause");
+            }
+            while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
+                if (!parser.currentToken().isValue()) {
+                    throw new ElasticsearchParseException("Array elements in include/exclude clauses should be string values");
+                }
+                set.add(new BytesRef(parser.text()));
+            }
+            return set;
+        }
+
         public IncludeExclude includeExclude() {
-            if (include == null && exclude == null) {
+            if (include == null && exclude == null && includeValues == null && excludeValues == null) {
                 return null;
             }
             Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null;
             Pattern excludePattern = exclude != null ?
                     Pattern.compile(exclude, excludeFlags) : null;
-            return new IncludeExclude(includePattern, excludePattern);
+            return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues);
         }
     }
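To make the accept semantics of the reworked helper concrete: exact-term includes and excludes are tested as raw `BytesRef` values, and an exclude match always wins over an include match. The following is only an illustrative sketch that drives `IncludeExclude` directly, which client code would not normally do; the term values are hypothetical.

[source,java]
--------------------------------------------------
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;

public class IncludeExcludeSemanticsExample {

    public static void main(String[] args) {
        Set<BytesRef> includeValues = new HashSet<>();
        includeValues.add(new BytesRef("mazda"));
        includeValues.add(new BytesRef("honda"));

        Set<BytesRef> excludeValues = new HashSet<>();
        excludeValues.add(new BytesRef("honda"));

        // Exact-term sets only: both regex patterns are null, so no UTF8 to UTF16 conversion is performed.
        IncludeExclude filter = new IncludeExclude(null, null, includeValues, excludeValues);

        System.out.println(filter.accept(new BytesRef("mazda"))); // true  - included and not excluded
        System.out.println(filter.accept(new BytesRef("honda"))); // false - the exclude set wins over the include set
        System.out.println(filter.accept(new BytesRef("rover"))); // false - not in the include set
    }
}
--------------------------------------------------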
diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java
index 137181434b607..4180fbc33069c 100644
--- a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java
+++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java
@@ -38,9 +38,11 @@
 import org.hamcrest.Matchers;
 import org.junit.Test;
 
+import java.text.NumberFormat;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 import java.util.regex.Pattern;
 
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
@@ -51,6 +53,7 @@
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.is;
 import static org.hamcrest.core.IsNull.notNullValue;
+import static org.hamcrest.core.IsNull.nullValue;
 
 /**
  *
@@ -336,6 +339,94 @@ public void singleValueField_WithRegexFiltering_WithFlags() throws Exception {
             assertThat(bucket.getDocCount(), equalTo(1l));
         }
     }
+
+
+    @Test
+    public void singleValueField_WithExactTermFiltering() throws Exception {
+        // include without exclude
+        String incVals[] = { "val000", "val001", "val002", "val003", "val004", "val005", "val006", "val007", "val008", "val009" };
+        SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
+                .addAggregation(terms("terms")
+                        .executionHint(randomExecutionHint())
+                        .field(SINGLE_VALUED_FIELD_NAME)
+                        .collectMode(randomFrom(SubAggCollectionMode.values()))
+                        .include(incVals))
+                .execute().actionGet();
+
+        assertSearchResponse(response);
+
+        Terms terms = response.getAggregations().get("terms");
+        assertThat(terms, notNullValue());
+        assertThat(terms.getName(), equalTo("terms"));
+        assertThat(terms.getBuckets().size(), equalTo(incVals.length));
+
+        for (String incVal : incVals) {
+            Terms.Bucket bucket = terms.getBucketByKey(incVal);
+            assertThat(bucket, notNullValue());
+            assertThat(key(bucket), equalTo(incVal));
+            assertThat(bucket.getDocCount(), equalTo(1l));
+        }
+
+        // include and exclude
+        // Slightly illogical example: the exact-term include and exclude sets below are made to overlap,
+        // but the exclude set should take priority over include matches.
+        // We should be left with: val002, val003, val004, val005, val006, val007, val008, val009
+        String excVals[] = { "val000", "val001" };
+
+        response = client().prepareSearch("idx").setTypes("high_card_type")
+                .addAggregation(terms("terms")
+                        .executionHint(randomExecutionHint())
+                        .field(SINGLE_VALUED_FIELD_NAME)
+                        .collectMode(randomFrom(SubAggCollectionMode.values()))
+                        .include(incVals)
+                        .exclude(excVals))
+                .execute().actionGet();
+
+        assertSearchResponse(response);
+
+        terms = response.getAggregations().get("terms");
+        assertThat(terms, notNullValue());
+        assertThat(terms.getName(), equalTo("terms"));
+        assertThat(terms.getBuckets().size(), equalTo(8));
+
+        for (int i = 2; i < 10; i++) {
+            Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
+            assertThat(bucket, notNullValue());
+            assertThat(key(bucket), equalTo("val00" + i));
+            assertThat(bucket.getDocCount(), equalTo(1l));
+        }
+
+        // Check case with only exact term exclude clauses
+        response = client().prepareSearch("idx").setTypes("high_card_type")
+                .addAggregation(terms("terms")
+                        .executionHint(randomExecutionHint())
+                        .field(SINGLE_VALUED_FIELD_NAME)
+                        .collectMode(randomFrom(SubAggCollectionMode.values()))
+                        .exclude(excVals))
+                .execute().actionGet();
+
+        assertSearchResponse(response);
+
+        terms = response.getAggregations().get("terms");
+        assertThat(terms, notNullValue());
+        assertThat(terms.getName(), equalTo("terms"));
+        assertThat(terms.getBuckets().size(), equalTo(10));
+        for (String key : excVals) {
+            Terms.Bucket bucket = terms.getBucketByKey(key);
+            assertThat(bucket, nullValue());
+        }
+        NumberFormat nf = NumberFormat.getIntegerInstance(Locale.ENGLISH);
+        nf.setMinimumIntegerDigits(3);
+        for (int i = 2; i < 12; i++) {
+            Terms.Bucket bucket = terms.getBucketByKey("val" + nf.format(i));
+            assertThat(bucket, notNullValue());
+            assertThat(key(bucket), equalTo("val" + nf.format(i)));
+            assertThat(bucket.getDocCount(), equalTo(1l));
+        }
+
+    }
+
     @Test