Significant_terms agg: added option for a backgroundFilter #5944

Closed · wants to merge 3 commits

@@ -233,13 +233,20 @@ free-text field and use them in a `terms` query on the same field with a `highli
are presented unstemmed, highlighted, with the right case, in the right order and with some context, their significance/meaning is more readily apparent.
============

==== Custom background sets
added[1.2.0]


Ordinarily, the foreground set of documents is "diffed" against a background set of all the documents in your index.
However, sometimes it may prove useful to use a narrower background set as the basis for comparisons.
For example, a query on documents relating to "Madrid" in an index with content from all over the world might reveal that "Spanish"
was a significant term. This may be true, but if you want more focused terms you could use a `background_filter`
on the term 'spain' to establish a narrower set of documents as context. With this as a background, "Spanish" would now
be seen as commonplace and therefore not as significant as words like "capital" that relate more strongly to Madrid.
Note that using a background filter will slow things down: each term's background frequency must now be derived on the fly by filtering posting lists rather than by reading the index's pre-computed count for the term.
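
As a rough illustration of the Madrid example via the Java API, using the `backgroundFilter(...)` method added to `SignificantTermsBuilder` in this change; the index name `world_news` and the field names `text`/`tag` are assumptions:

[source,java]
--------------------------------------------------
// Significant tags for documents matching "madrid", with background term statistics
// drawn only from documents mentioning "spain" rather than from the whole index.
// "client" is an org.elasticsearch.client.Client; index and field names are illustrative.
SearchResponse response = client.prepareSearch("world_news")
        .setQuery(QueryBuilders.matchQuery("text", "madrid"))
        .addAggregation(new SignificantTermsBuilder("tags")
                .field("tag")
                .backgroundFilter(FilterBuilders.termFilter("text", "spain")))
        .execute()
        .actionGet();
--------------------------------------------------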

==== Limitations

===== Single _background_ comparison base
The above examples show how to select the _foreground_ set for analysis using a query or parent aggregation to filter, but currently there is no means of specifying
a _background_ set other than the index from which all results are ultimately drawn. Sometimes it may prove useful to use a different
background set as the basis for comparisons, e.g. to first select the tweets for the TV show "XFactor" and then look
for significant terms in the subset of that content which is from this week.

===== Significant terms must be indexed values
Unlike the terms aggregation it is currently not possible to use script-generated terms for counting purposes.
@@ -324,6 +331,37 @@ WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to
reinforce that, while still rare, the term was not the result of a one-off accident. The
default value of 3 is used to provide a minimum weight-of-evidence.

===== Custom background context

The default source of statistical information for background term frequencies is the entire index, and this
scope can be narrowed through the use of a `background_filter` to focus on significant terms within a narrower
context:

[source,js]
--------------------------------------------------
{
    "query" : {
        "match" : { "text" : "madrid" }
    },
    "aggs" : {
        "tags" : {
            "significant_terms" : {
                "field" : "tag",
                "background_filter" : {
                    "term" : { "text" : "spain" }
                }
            }
        }
    }
}
--------------------------------------------------

The above filter would help focus on terms that are peculiar to the city of Madrid rather than revealing
terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the
word "Spain".

WARNING: Use of background filters will slow the query, as each term's postings must be filtered to determine its frequency.


===== Filtering Values

@@ -54,7 +54,6 @@ public static abstract class Bucket extends SignificantTerms.Bucket {
protected Bucket(long subsetDf, long subsetSize, long supersetDf, long supersetSize, InternalAggregations aggregations) {
super(subsetDf, subsetSize, supersetDf, supersetSize);
this.aggregations = aggregations;
assert subsetDf <= supersetDf;
updateScore();
}

@@ -96,7 +95,12 @@ public static double getSampledTermSignificance(long subsetFreq, long subsetSize
// avoid any divide by zero issues
return 0;
}

if (supersetFreq == 0) {
// If the background context is not a strict superset of the foreground, a foreground
// term may be missing from the background entirely, so we assume a frequency of 1
// for this calculation, which avoids returning an "infinity" score.
supersetFreq = 1;
}
Contributor: Should we use additional smoothing instead in order not to have this particular case?

Contributor (author): I think "0" is the only special case that needs treatment as it is the absence of any evidence and without this adjustment the score returned is infinity. Anything > 0 is at least "some evidence" and therefore something I don't think we should tamper with.

Contributor: I was thinking more about something like Laplace smoothing, which we already use e.g. in the phrase suggester http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html#_smoothing_models

Contributor: This can probably be done in a different issue though.
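
A rough sketch of the additive (Laplace-style) smoothing idea discussed above, as an alternative to the zero-frequency fallback; the class and method names below are illustrative and this is not part of the change under review:

public final class SmoothingSketch {
    // Additive smoothing: with alpha > 0 a term that is absent from the background
    // still receives a small non-zero probability, so the score never divides by zero.
    // Full Laplace smoothing over a vocabulary of V terms would divide by size + alpha * V.
    static double smoothedProbability(long freq, long size, double alpha) {
        return (freq + alpha) / (size + alpha);
    }
}

With alpha = 1 a missing term behaves much like the supersetFreq = 1 fallback above, but every other term's probability shifts slightly as well, which is presumably why the idea was deferred to a separate issue.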

double subsetProbability = (double) subsetFreq / (double) subsetSize;
double supersetProbability = (double) supersetFreq / (double) supersetSize;

@@ -154,7 +158,6 @@ public Bucket reduce(List<? extends Bucket> buckets, BigArrays bigArrays) {
}
aggregationsList.add(bucket.aggregations);
}
assert reduced.subsetDf <= reduced.supersetDf;
reduced.aggregations = InternalAggregations.reduce(aggregationsList, bigArrays);
return reduced;
}
@@ -82,7 +82,6 @@ public SignificantStringTerms buildAggregation(long owningBucketOrdinal) {
spare.subsetSize = subsetSize;
spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.termBytes);
spare.supersetSize = supersetSize;
assert spare.subsetDf <= spare.supersetDf;
// During shard-local down-selection we use subset/superset stats
// that are for this shard only
// Back at the central reducer these properties will be updated with
@@ -20,6 +20,7 @@
package org.elasticsearch.search.aggregations.bucket.significant;

import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;

import java.io.IOException;
@@ -36,6 +37,8 @@ public class SignificantTermsBuilder extends AggregationBuilder<SignificantTerms
private int requiredSize = SignificantTermsParser.DEFAULT_REQUIRED_SIZE;
private int shardSize = SignificantTermsParser.DEFAULT_SHARD_SIZE;
private int minDocCount = SignificantTermsParser.DEFAULT_MIN_DOC_COUNT;
private FilterBuilder filterBuilder;


public SignificantTermsBuilder(String name) {
super(name, SignificantStringTerms.TYPE.name());
@@ -60,6 +63,12 @@ public SignificantTermsBuilder minDocCount(int minDocCount) {
this.minDocCount = minDocCount;
return this;
}

public SignificantTermsBuilder backgroundFilter(FilterBuilder filter) {
this.filterBuilder = filter;
return this;
}


@Override
protected XContentBuilder internalXContent(XContentBuilder builder, Params params) throws IOException {
@@ -76,6 +85,10 @@ protected XContentBuilder internalXContent(XContentBuilder builder, Params param
if (shardSize != SignificantTermsParser.DEFAULT_SHARD_SIZE) {
builder.field("shard_size", shardSize);
}
if (filterBuilder != null) {
builder.field("background_filter");
Contributor: Maybe use BACKGROUND_FILTER.getPreferredName instead?
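
A sketch of the suggested change, assuming the BACKGROUND_FILTER ParseField (currently private to SignificantTermsParser) were made accessible to the builder:

if (filterBuilder != null) {
    // Hypothetical: reuse the parser's ParseField so the builder and the parser
    // cannot drift apart on the field name.
    builder.field(SignificantTermsParser.BACKGROUND_FILTER.getPreferredName());
    filterBuilder.toXContent(builder, params);
}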

filterBuilder.toXContent(builder, params);
}

return builder.endObject();
}
@@ -19,6 +19,7 @@
package org.elasticsearch.search.aggregations.bucket.significant;

import org.apache.lucene.search.Filter;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.aggregations.Aggregator;
@@ -40,6 +41,8 @@ public class SignificantTermsParser implements Aggregator.Parser {

//Typically need more than one occurrence of something for it to be statistically significant
public static final int DEFAULT_MIN_DOC_COUNT = 3;

private static final ParseField BACKGROUND_FILTER = new ParseField("background_filter");

@Override
public String type() {
@@ -90,18 +93,9 @@ public AggregatorFactory parse(String aggregationName, XContentParser parser, Se
throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}
} else if (token == XContentParser.Token.START_OBJECT) {
// TODO not sure if code below is the best means to declare a filter for
// defining an alternative background stats context.
// In trial runs it becomes obvious that the choice of background does have to
// be a strict superset of the foreground subset otherwise the significant terms algo
// immediately singles out the odd terms that are in the foreground but not represented
// in the background. So a better approach may be to use a designated parent agg as the
// background because parent aggs are always guaranteed to be a superset whereas arbitrary
// filters defined by end users and parsed below are not.
// if ("background_context".equals(currentFieldName)) {
// filter = context.queryParserService().parseInnerFilter(parser).filter();
// }

if (BACKGROUND_FILTER.match(currentFieldName)) {
filter = context.queryParserService().parseInnerFilter(parser).filter();
}
Contributor: Can you add an else to fail the parsing if an object with a different key than BACKGROUND_FILTER is found?
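
A sketch of the requested else branch, reusing the "Unknown key" message style used earlier in this parser; the exact wording is illustrative:

if (BACKGROUND_FILTER.match(currentFieldName)) {
    filter = context.queryParserService().parseInnerFilter(parser).filter();
} else {
    throw new SearchParseException(context, "Unknown key for a " + token + " in [" + aggregationName + "]: [" + currentFieldName + "].");
}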

} else {
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].");
}
@@ -23,6 +23,7 @@
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
@@ -141,8 +142,57 @@ public void textAnalysis() throws Exception {
assertSearchResponse(response);
SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
checkExpectedStringTermsFound(topTerms);
}
}

@Test
public void badFilteredAnalysis() throws Exception {
// Deliberately using a bad choice of filter here for the background context in order
// to test robustness.
// We search for the name of a snowboarder but use music-related content (fact_category:1)
// as the background source of term statistics.
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "terje"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
.minDocCount(2).backgroundFilter(FilterBuilders.termFilter("fact_category", 1)))
.execute()
.actionGet();
Contributor: Can you add assertSearchResponse(response)? This will fail the test if any of the shards didn't return successfully.
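
A sketch of the requested check; assertSearchResponse is already used by the textAnalysis test above:

assertSearchResponse(response); // fails the test if any shard did not return successfully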

SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
// We expect at least one of the significant terms to have been selected on the basis
// that it is present in the foreground selection but entirely missing from the filtered
// background used as context.
boolean hasMissingBackgroundTerms = false;
for (Bucket topTerm : topTerms) {
if (topTerm.getSupersetDf() == 0) {
hasMissingBackgroundTerms = true;
break;
}
}
assertTrue(hasMissingBackgroundTerms);
}

@Test
public void filteredAnalysis() throws Exception {
SearchResponse response = client().prepareSearch("test")
.setSearchType(SearchType.QUERY_AND_FETCH)
.setQuery(new TermQueryBuilder("_all", "weller"))
.setFrom(0).setSize(60).setExplain(true)
.addAggregation(new SignificantTermsBuilder("mySignificantTerms").field("description")
.minDocCount(1).backgroundFilter(FilterBuilders.termsFilter("description", "paul")))
.execute()
.actionGet();
Contributor: Can you also call assertSearchResponse?

SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
HashSet<String> topWords = new HashSet<String>();
for (Bucket topTerm : topTerms) {
topWords.add(topTerm.getKey());
}
//The word "paul" should be a constant of all docs in the background set and therefore not seen as significant
assertFalse(topWords.contains("paul"));
//"Weller" is the only Paul who was in The Jam and therefore this should be identified as a differentiator from the background of all other Pauls.
assertTrue(topWords.contains("jam"));
}

@Test
public void nestedAggs() throws Exception {
String[][] expectedKeywordsByCategory={