Significant_terms agg: added option for a background_filter to define background context for analysis of term frequencies

Closes #5944
markharwood committed May 13, 2014
1 parent 5b3be35 commit 889fa6b
Showing 7 changed files with 123 additions and 24 deletions.
@@ -233,13 +233,19 @@ free-text field and use them in a `terms` query on the same field with a `highli
are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent.
============

==== Limitations
==== Custom background sets
added[1.2.0]


Ordinarily, the foreground set of documents is "diffed" against a background set of all the documents in your index.
However, sometimes it may prove useful to use a narrower background set as the basis for comparisons.
For example, a query on documents relating to "Madrid" in an index with content from all over the world might reveal that "Spanish"
was a significant term. This may be true, but if you want more focused terms you could use a `background_filter`
on the term 'spain' to establish a narrower set of documents as context. With this as the background, "Spanish" would now
be seen as commonplace and therefore not as significant as words like "capital" that relate more strongly to Madrid.
Note that using a background filter will slow things down: each term's background frequency must now be derived on the fly by filtering posting lists rather than by reading the index's pre-computed count for that term.

===== Single _background_ comparison base
The above examples show how to select the _foreground_ set for analysis using a query or a filtering parent aggregation, but currently there is no means of specifying
a _background_ set other than the index from which all results are ultimately drawn. Sometimes it may prove useful to use a different
background set as the basis for comparisons, e.g. to first select the tweets for the TV show "XFactor" and then look
for significant terms in the subset of that content which is from this week.
==== Limitations

===== Significant terms must be indexed values
Unlike the `terms` aggregation, it is currently not possible to use script-generated terms for counting purposes.
@@ -337,6 +343,37 @@ WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to



===== Custom background context

The default source of statistical information for background term frequencies is the entire index, and this
scope can be narrowed through the use of a `background_filter` to focus on significant terms within a narrower
context:

[source,js]
--------------------------------------------------
{
    "query" : {
        "match" : { "text" : "madrid" }
    },
    "aggs" : {
        "tags" : {
            "significant_terms" : {
                "field" : "tag",
                "background_filter": {
                    "term" : { "text" : "spain" }
                }
            }
        }
    }
}
--------------------------------------------------

The above filter would help focus on terms that are peculiar to the city of Madrid rather than revealing
terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the
word "Spain".

WARNING: Use of background filters will slow the query, as each term's postings must be filtered to determine its frequency.


===== Filtering Values

@@ -97,7 +97,6 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) {
spare.subsetSize = subsetSize;
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
spare.supersetSize = supersetSize;
assert spare.subsetDf <= spare.supersetDf;
// During shard-local down-selection we use subset/superset stats
// that are for this shard only
// Back at the central reducer these properties will be updated with
@@ -54,7 +54,6 @@ public static abstract class Bucket extends SignificantTerms.Bucket {
protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) {
super(subsetDf, subsetSize, supersetDf, supersetSize);
this.aggregations = aggregations;
assert subsetDf <= supersetDf;
updateScore();
}

@@ -96,7 +95,12 @@ public static double getSampledTermSignificance(long subsetFreq, long subsetSize
// avoid any divide by zero issues
return 0;
}

if (supersetFreq == 0) {
// If we are using a background context that is not a strict superset, a foreground
// term may be missing from the background, so for the purposes of this calculation
// we assume a value of 1, which avoids returning an "infinity" result
supersetFreq = 1;
}
double subsetProbability = (double) subsetFreq / (double) subsetSize;
double supersetProbability = (double) supersetFreq / (double) supersetSize;
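
For context on the new guard above: once a user-supplied `background_filter` is in play, the background is no longer guaranteed to be a superset of the foreground, so a term can have a non-zero foreground count but a zero background count. Below is a minimal, self-contained sketch of a JLH-style score with this guard (illustrative only, not the actual Elasticsearch implementation; the exact combination of factors is an assumption):

// Sketch: significance score of a term, guarded against a zero background frequency.
static double sketchScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
    if (subsetFreq == 0 || subsetSize == 0 || supersetSize == 0) {
        return 0; // nothing to score, and avoids dividing by a zero size
    }
    if (supersetFreq == 0) {
        // Term present in the foreground but absent from the filtered background:
        // substitute 1 so relativeChange below stays finite.
        supersetFreq = 1;
    }
    double subsetProbability = (double) subsetFreq / subsetSize;
    double supersetProbability = (double) supersetFreq / supersetSize;
    double absoluteChange = subsetProbability - supersetProbability; // favours common foreground terms
    double relativeChange = subsetProbability / supersetProbability; // favours terms rare in the background
    return absoluteChange * relativeChange;
}

Without the `supersetFreq = 1` substitution, `relativeChange` would be infinite for every foreground-only term and such terms would swamp the results; that scenario is exactly what the `badFilteredAnalysis` test below exercises.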

@@ -154,7 +158,6 @@ public Bucket reduce(List<? extends Bucket> buckets, BigArrays bigArrays) {
}
aggregationsList.add(bucket.aggregations);
}
assert reduced.subsetDf <= reduced.supersetDf;
reduced.aggregations = InternalAggregations.reduce(aggregationsList, bigArrays);
return reduced;
}
@@ -84,7 +84,6 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) {
spare.subsetSize = subsetSize;
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
spare.supersetSize = supersetSize;
assert spare.subsetDf <= spare.supersetDf;
// During shard-local down-selection we use subset/superset stats
// that are for this shard only
// Back at the central reducer these properties will be updated with
@@ -20,6 +20,7 @@
package org.elasticsearch.search.aggregations.bucket.significant;

import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;

import java.io.IOException;
@@ -42,6 +43,8 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
private int includeFlags;
private String excludePattern;
private int excludeFlags;
private FilterBuilder filterBuilder;


public SignificantTermsBuilder(String name) {
super(name, SignificantStringTerms.TYPE.name());
@@ -66,6 +69,12 @@ public SignificantTermsBuilder minDocCount(int minDocCount) {
this.minDocCount = minDocCount;
return this;
}

public SignificantTermsBuilder backgroundFilter(FilterBuilder filter) {
this.filterBuilder = filter;
return this;
}


public SignificantTermsBuilder shardMinDocCount(int shardMinDocCount) {
this.shardMinDocCount = shardMinDocCount;
@@ -162,6 +171,11 @@ protected XContentBuilder internalXContent(XContentBuilder builder, Params param
.endObject();
}
}

if (filterBuilder != null) {
builder.field(SignificantTermsParser.BACKGROUND_FILTER.getPreferredName());
filterBuilder.toXContent(builder, params);
}

return builder.endObject();
}
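
With the `backgroundFilter(...)` setter and the `internalXContent` change above, the new option can also be set from the Java API. A rough usage sketch follows (the index name, field names and filter values are illustrative, echoing the documentation example and the tests in this commit):

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsBuilder;

public class BackgroundFilterExample {
    // Significant terms for documents mentioning "madrid", measured against a
    // background restricted to documents mentioning "spain" rather than the whole index.
    static SearchResponse significantTags(Client client) {
        return client.prepareSearch("test")
                .setQuery(QueryBuilders.matchQuery("text", "madrid"))
                .addAggregation(new SignificantTermsBuilder("tags")
                        .field("tag")
                        .backgroundFilter(FilterBuilders.termFilter("text", "spain")))
                .execute()
                .actionGet();
    }
}

The builder serializes the filter under the `background_filter` key, so the request body it produces matches the JSON form shown in the documentation changes above.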
@@ -41,6 +41,8 @@ public class SignificantTermsParser implements Aggregator.Parser {

//Typically need more than one occurrence of something for it to be statistically significant
public static final int DEFAULT_MIN_DOC_COUNT = 3;

static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");

static final ParseField SHARD_MIN_DOC_COUNT_FIELD_NAME = new ParseField("shard_min_doc_count");
public static final int DEFAULT_SHARD_MIN_DOC_COUNT = 1;
@@ -99,18 +101,11 @@ public AggregatorFactory parse(String aggregationName, XContentParser parser, Se

}
} else if (token == XContentParser.Token.START_OBJECT) {
// TODO not sure if code below is the best means to declare a filter for
// defining an alternative background stats context.
// In trial runs it becomes obvious that the choice of background does have to
// be a strict superset of the foreground subset otherwise the significant terms algo
// immediately singles out the odd terms that are in the foreground but not represented
// in the background. So a better approach may be to use a designated parent agg as the
// background because parent aggs are always guaranteed to be a superset whereas arbitrary
// filters defined by end users and parsed below are not.
// if ("background_context".equals(currentFieldName)) {
// filter = context.queryParserService().parseInnerFilter(parser).filter();
// }

if (BACKGROUND_FILTER.match(currentFieldName)) {
filter = context.queryParserService().parseInnerFilter(parser).filter();
} else {
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else {
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].");
}
@@ -23,6 +23,7 @@
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
@@ -183,8 +184,59 @@ public void textAnalysis() throws Exception {
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
}

@Test
public void badFilteredAnalysis() throws Exception {
// Deliberately using a bad choice of filter here for the background context in order
// to test robustness.
// We search for the name of a snowboarder but use music-related content (fact_category:1)
// as the background source of term statistics.
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "terje"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
.minDocCount(2).backgroundFilter(FilterBuilders.termFilter("fact_category", 1)))
.execute()
.actionGet();
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
// We expect at least one of the significant terms to have been selected on the basis
// that it is present in the foreground selection but entirely missing from the filtered
// background used as context.
boolean hasMissingBackgroundTerms = false;
for (Bucket topTerm : topTerms) {
if (topTerm.getSupersetDf() == 0) {
hasMissingBackgroundTerms = true;
break;
}
}
assertTrue(hasMissingBackgroundTerms);
}

@Test
public void filteredAnalysis() throws Exception {
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "weller"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
.minDocCount(1).backgroundFilter(FilterBuilders.termsFilter("description", "paul")))
.execute()
.actionGet();
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
HashSet<String> topWords = new HashSet<String>();
for (Bucket topTerm : topTerms) {
topWords.add(topTerm.getKey());
}
// The word "paul" should be common to all docs in the background set and therefore not seen as significant
assertFalse(topWords.contains("paul"));
// "Weller" is the only Paul who was in The Jam, so "jam" should be identified as a differentiator from the background of all the other Pauls.
assertTrue(topWords.contains("jam"));
}

@Test
public void nestedAggs() throws Exception {
String[][] expectedKeywordsByCategory={
