New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Significant_terms agg: added option for a backgroundFilter #5944
Changes from 2 commits
0cbc8ff
2a3c378
5070c85
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
package org.elasticsearch.search.aggregations.bucket.significant; | ||
|
||
import org.elasticsearch.common.xcontent.XContentBuilder; | ||
import org.elasticsearch.index.query.FilterBuilder; | ||
import org.elasticsearch.search.aggregations.AggregationBuilder; | ||
|
||
import java.io.IOException; | ||
|
@@ -36,6 +37,8 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms | |
private int requiredSize = SignificantTermsParser.DEFAULT_REQUIRED_SIZE; | ||
private int shardSize = SignificantTermsParser.DEFAULT_SHARD_SIZE; | ||
private int minDocCount = SignificantTermsParser.DEFAULT_MIN_DOC_COUNT; | ||
private FilterBuilder filterBuilder; | ||
|
||
|
||
public SignificantTermsBuilder(String name) { | ||
super(name, SignificantStringTerms.TYPE.name()); | ||
|
@@ -60,6 +63,12 @@ public SignificantTermsBuilder minDocCount(int minDocCount) { | |
this.minDocCount = minDocCount; | ||
return this; | ||
} | ||
|
||
public SignificantTermsBuilder backgroundFilter(FilterBuilder filter) { | ||
this.filterBuilder = filter; | ||
return this; | ||
} | ||
|
||
|
||
@Override | ||
protected XContentBuilder internalXContent(XContentBuilder builder, Params params) throws IOException { | ||
|
@@ -76,6 +85,10 @@ protected XContentBuilder internalXContent(XContentBuilder builder, Params param | |
if (shardSize != SignificantTermsParser.DEFAULT_SHARD_SIZE) { | ||
builder.field("shard_size", shardSize); | ||
} | ||
if (filterBuilder != null) { | ||
builder.field("background_filter"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe use |
||
filterBuilder.toXContent(builder, params); | ||
} | ||
|
||
return builder.endObject(); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
package org.elasticsearch.search.aggregations.bucket.significant; | ||
|
||
import org.apache.lucene.search.Filter; | ||
import org.elasticsearch.common.ParseField; | ||
import org.elasticsearch.common.xcontent.XContentParser; | ||
import org.elasticsearch.search.SearchParseException; | ||
import org.elasticsearch.search.aggregations.Aggregator; | ||
|
@@ -40,6 +41,8 @@ public class SignificantTermsParser implements Aggregator.Parser { | |
|
||
//Typically need more than one occurrence of something for it to be statistically significant | ||
public static final int DEFAULT_MIN_DOC_COUNT = 3; | ||
|
||
private static final ParseField BACKGROUND_FILTER = new ParseField("background_filter"); | ||
|
||
@Override | ||
public String type() { | ||
|
@@ -90,18 +93,9 @@ public AggregatorFactory parse(String aggregationName, XContentParser parser, Se | |
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "]."); | ||
} | ||
} else if (token == XContentParser.Token.START_OBJECT) { | ||
// TODO not sure if code below is the best means to declare a filter for | ||
// defining an alternative background stats context. | ||
// In trial runs it becomes obvious that the choice of background does have to | ||
// be a strict superset of the foreground subset otherwise the significant terms algo | ||
// immediately singles out the odd terms that are in the foreground but not represented | ||
// in the background. So a better approach may be to use a designated parent agg as the | ||
// background because parent aggs are always guaranteed to be a superset whereas arbitrary | ||
// filters defined by end users and parsed below are not. | ||
// if ("background_context".equals(currentFieldName)) { | ||
// filter = context.queryParserService().parseInnerFilter(parser).filter(); | ||
// } | ||
|
||
if (BACKGROUND_FILTER.match(currentFieldName)) { | ||
filter = context.queryParserService().parseInnerFilter(parser).filter(); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a |
||
} else { | ||
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "]."); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
import org.elasticsearch.action.search.SearchType; | ||
import org.elasticsearch.common.settings.ImmutableSettings; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.query.FilterBuilders; | ||
import org.elasticsearch.index.query.TermQueryBuilder; | ||
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms; | ||
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket; | ||
|
@@ -141,8 +142,57 @@ public void textAnalysis() throws Exception { | |
assertSearchResponse(response); | ||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); | ||
checkExpectedStringTermsFound(topTerms); | ||
} | ||
} | ||
|
||
@Test | ||
public void badFilteredAnalysis() throws Exception { | ||
// Deliberately using a bad choice of filter here for the background context in order | ||
// to test robustness. | ||
// We search for the name of a snowboarder but use music-related content (fact_category:1) | ||
// as the background source of term statistics. | ||
SearchResponse response = client().prepareSearch("test") | ||
.setSearchType(SearchType.QUERY_AND_FETCH) | ||
.setQuery(new TermQueryBuilder("_all", "terje")) | ||
.setFrom(0).setSize(60).setExplain(true) | ||
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description") | ||
.minDocCount(2).backgroundFilter(FilterBuilders.termFilter("fact_category", 1))) | ||
.execute() | ||
.actionGet(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add |
||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); | ||
// We expect at least one of the significant terms to have been selected on the basis | ||
// that it is present in the foreground selection but entirely missing from the filtered | ||
// background used as context. | ||
boolean hasMissingBackgroundTerms = false; | ||
for (Bucket topTerm : topTerms) { | ||
if (topTerm.getSupersetDf() == 0) { | ||
hasMissingBackgroundTerms = true; | ||
break; | ||
} | ||
} | ||
assertTrue(hasMissingBackgroundTerms); | ||
} | ||
|
||
@Test | ||
public void filteredAnalysis() throws Exception { | ||
SearchResponse response = client().prepareSearch("test") | ||
.setSearchType(SearchType.QUERY_AND_FETCH) | ||
.setQuery(new TermQueryBuilder("_all", "weller")) | ||
.setFrom(0).setSize(60).setExplain(true) | ||
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description") | ||
.minDocCount(1).backgroundFilter(FilterBuilders.termsFilter("description", "paul"))) | ||
.execute() | ||
.actionGet(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you also call |
||
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms"); | ||
HashSet<String> topWords = new HashSet<String>(); | ||
for (Bucket topTerm : topTerms) { | ||
topWords.add(topTerm.getKey()); | ||
} | ||
//The word "paul" should be a constant of all docs in the background set and therefore not seen as significant | ||
assertFalse(topWords.contains("paul")); | ||
//"Weller" is the only Paul who was in The Jam and therefore this should be identified as a differentiator from the background of all other Pauls. | ||
assertTrue(topWords.contains("jam")); | ||
} | ||
|
||
@Test | ||
public void nestedAggs() throws Exception { | ||
String[][] expectedKeywordsByCategory={ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we use additional smoothing instead in order not to have this particular case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think "0" is the only special case that needs treatment as it is the absence of any evidence and without this adjustment the score returned is infinity. Anything > 0 is at least "some evidence" and therefore something I don't think we should tamper with.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was more thinking about something like Laplace smoothing, that we already use eg. in the phrase suggester http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html#_smoothing_models
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can probably be done in a different issue though.