From 29b1902cfbd4a67fcaef1044e62a9ebf32b7e7db Mon Sep 17 00:00:00 2001 From: markharwood Date: Wed, 18 Feb 2015 15:06:10 +0000 Subject: [PATCH] =?UTF-8?q?New=20aggregations=20feature=20-=20=E2=80=9CPer?= =?UTF-8?q?centageScore=E2=80=9D=20heuristic=20for=20significant=5Fterms?= =?UTF-8?q?=20aggregation=20provides=20simple=20=E2=80=9Cper-capita?= =?UTF-8?q?=E2=80=9D=20type=20measures.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #9720 --- .../significantterms-aggregation.asciidoc | 18 ++++ .../heuristics/PercentageScore.java | 101 ++++++++++++++++++ .../heuristics/SignificanceHeuristic.java | 4 +- .../SignificantTermsHeuristicModule.java | 2 + ...nsportSignificantTermsHeuristicModule.java | 2 + .../bucket/SignificantTermsTests.java | 18 ++++ .../SignificanceHeuristicTests.java | 20 +++- 7 files changed, 161 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/PercentageScore.java diff --git a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc index bd817c9c9f88b..40417a065f940 100644 --- a/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/significantterms-aggregation.asciidoc @@ -320,6 +320,24 @@ Google normalized distance as described in "The Google Similarity Distance", Ci `gnd` also accepts the `background_is_superset` parameter. + +===== Percentage +A simple calculation of the number of documents in the foreground sample with a term divided by the number of documents in the background with the term. +By default this produces a score greater than zero and less than or equal to one. + +The benefit of this heuristic is that the scoring logic is simple to explain to anyone familiar with a "per capita" statistic. 
However, for fields with high cardinality there is a tendency for this heuristic to select the rarest terms such as typos that occur only once because they score 1/1 = 100%. + +It would be hard for a seasoned boxer to win a championship if the prize was awarded purely on the basis of percentage of fights won - by these rules a newcomer with only one fight under his belt would be impossible to beat. +Multiple observations are typically required to reinforce a view so it is recommended in these cases to set both `min_doc_count` and `shard_min_doc_count` to a higher value such as 10 in order to filter out the low-frequency terms that otherwise take precedence. + +[source,js] +-------------------------------------------------- + + "percentage": { + } +-------------------------------------------------- + + ===== Which one is best? diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/PercentageScore.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/PercentageScore.java new file mode 100644 index 0000000000000..487c05c180d45 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/PercentageScore.java @@ -0,0 +1,101 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.elasticsearch.search.aggregations.bucket.significant.heuristics; + + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.query.QueryParsingException; + +import java.io.IOException; + +public class PercentageScore extends SignificanceHeuristic { + + public static final PercentageScore INSTANCE = new PercentageScore(); + + protected static final String[] NAMES = {"percentage"}; + + private PercentageScore() {}; + + public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() { + @Override + public SignificanceHeuristic readResult(StreamInput in) throws IOException { + return readFrom(in); + } + + @Override + public String getName() { + return NAMES[0]; + } + }; + + public static SignificanceHeuristic readFrom(StreamInput in) throws IOException { + return INSTANCE; + } + + /** + * Indicates the significance of a term in a sample by determining what percentage + * of all occurrences of a term are found in the sample. 
+ */ + @Override + public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) { + checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "PercentageScore"); + if (supersetFreq == 0) { + // avoid a divide by zero issue + return 0; + } + return (double) subsetFreq / (double) supersetFreq; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(STREAM.getName()); + } + + public static class PercentageScoreParser implements SignificanceHeuristicParser { + + @Override + public SignificanceHeuristic parse(XContentParser parser) throws IOException, QueryParsingException { + // move to the closing bracket + if (!parser.nextToken().equals(XContentParser.Token.END_OBJECT)) { + throw new ElasticsearchParseException("expected }, got " + parser.currentName() + " instead in percentage score"); + } + return INSTANCE; // stateless: hand out the shared singleton, as readFrom() does + } + + @Override + public String[] getNames() { + return NAMES; + } + } + + public static class PercentageScoreBuilder implements SignificanceHeuristicBuilder { + + @Override + public void toXContent(XContentBuilder builder) throws IOException { + builder.startObject(STREAM.getName()).endObject(); + } + } +} + diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificanceHeuristic.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificanceHeuristic.java index 1283fc0892496..61e29e83e9077 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificanceHeuristic.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificanceHeuristic.java @@ -42,10 +42,10 @@ protected void checkFrequencyValidity(long subsetFreq, long subsetSize, long sup throw new ElasticsearchIllegalArgumentException("Frequencies of subset and superset must be positive in " + scoreFunctionName + ".getScore()"); } if 
(subsetFreq > subsetSize) { - throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in JLHScore.score(..)"); + throw new ElasticsearchIllegalArgumentException("subsetFreq > subsetSize, in " + scoreFunctionName); } if (supersetFreq > supersetSize) { - throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in JLHScore.score(..)"); + throw new ElasticsearchIllegalArgumentException("supersetFreq > supersetSize, in " + scoreFunctionName); } } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificantTermsHeuristicModule.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificantTermsHeuristicModule.java index b17f502775cf7..aa0048ed1cfdd 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificantTermsHeuristicModule.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/SignificantTermsHeuristicModule.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.aggregations.bucket.significant.heuristics; import com.google.common.collect.Lists; + import org.elasticsearch.common.inject.AbstractModule; import org.elasticsearch.common.inject.multibindings.Multibinder; @@ -33,6 +34,7 @@ public class SignificantTermsHeuristicModule extends AbstractModule { public SignificantTermsHeuristicModule() { registerParser(JLHScore.JLHScoreParser.class); + registerParser(PercentageScore.PercentageScoreParser.class); registerParser(MutualInformation.MutualInformationParser.class); registerParser(GND.GNDParser.class); registerParser(ChiSquare.ChiSquareParser.class); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/TransportSignificantTermsHeuristicModule.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/TransportSignificantTermsHeuristicModule.java index efe1d31447326..d8de1fabb70bc 100644 --- 
a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/TransportSignificantTermsHeuristicModule.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/TransportSignificantTermsHeuristicModule.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.aggregations.bucket.significant.heuristics; import com.google.common.collect.Lists; + import org.elasticsearch.common.inject.AbstractModule; import java.util.List; @@ -32,6 +33,7 @@ public class TransportSignificantTermsHeuristicModule extends AbstractModule { public TransportSignificantTermsHeuristicModule() { registerStream(JLHScore.STREAM); + registerStream(PercentageScore.STREAM); registerStream(MutualInformation.STREAM); registerStream(GND.STREAM); registerStream(ChiSquare.STREAM); diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java index 5fe1084964354..742fd2cd1faeb 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsTests.java @@ -33,6 +33,7 @@ import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore; import org.elasticsearch.search.aggregations.bucket.terms.Terms; import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder; import org.elasticsearch.test.ElasticsearchIntegrationTest; @@ -272,6 +273,23 @@ public void textAnalysisChiSquare() throws Exception { checkExpectedStringTermsFound(topTerms); } + @Test + public void textAnalysisPercentageScore() throws Exception { + SearchResponse 
response = client() + .prepareSearch("test") + .setSearchType(SearchType.QUERY_AND_FETCH) + .setQuery(new TermQueryBuilder("_all", "terje")) + .setFrom(0) + .setSize(60) + .setExplain(true) + .addAggregation( + new SignificantTermsBuilder("mySignificantTerms").field("description").executionHint(randomExecutionHint()) + .significanceHeuristic(new PercentageScore.PercentageScoreBuilder()).minDocCount(2)).execute().actionGet(); + assertSearchResponse(response); + SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); + checkExpectedStringTermsFound(topTerms); + } + @Test public void badFilteredAnalysis() throws Exception { // Deliberately using a bad choice of filter here for the background context in order diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java index ec3d17b929467..0bd4ec415620d 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java @@ -30,7 +30,16 @@ import org.elasticsearch.common.xcontent.json.JsonXContent; import org.elasticsearch.search.SearchShardTarget; import org.elasticsearch.search.aggregations.InternalAggregations; -import org.elasticsearch.search.aggregations.bucket.significant.heuristics.*; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore; +import 
org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristic; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicBuilder; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParser; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicParserMapper; +import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificanceHeuristicStreams; import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.test.ElasticsearchIntegrationTest; import org.elasticsearch.test.ElasticsearchTestCase; @@ -45,7 +54,11 @@ import java.util.List; import java.util.Set; -import static org.hamcrest.Matchers.*; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.lessThan; +import static org.hamcrest.Matchers.lessThanOrEqualTo; /** * @@ -68,6 +81,7 @@ public SearchShardTarget shardTarget() { public void streamResponse() throws Exception { SignificanceHeuristicStreams.registerStream(MutualInformation.STREAM, MutualInformation.STREAM.getName()); SignificanceHeuristicStreams.registerStream(JLHScore.STREAM, JLHScore.STREAM.getName()); + SignificanceHeuristicStreams.registerStream(PercentageScore.STREAM, PercentageScore.STREAM.getName()); SignificanceHeuristicStreams.registerStream(GND.STREAM, GND.STREAM.getName()); SignificanceHeuristicStreams.registerStream(ChiSquare.STREAM, ChiSquare.STREAM.getName()); Version version = ElasticsearchIntegrationTest.randomVersion(); @@ -304,6 +318,7 @@ public void testAssertions() throws Exception { testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false)); testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false)); testBackgroundAssertions(new GND(true), new 
GND(false)); + testAssertions(PercentageScore.INSTANCE); testAssertions(JLHScore.INSTANCE); } @@ -311,6 +326,7 @@ public void testAssertions() throws Exception { public void basicScoreProperties() { basicScoreProperties(JLHScore.INSTANCE, true); basicScoreProperties(new GND(true), true); + basicScoreProperties(PercentageScore.INSTANCE, true); basicScoreProperties(new MutualInformation(true, true), false); basicScoreProperties(new ChiSquare(true, true), false); }