Sampler aggregation #10221

Closed
wants to merge 3 commits
154 changes: 154 additions & 0 deletions docs/reference/search/aggregations/bucket/sampler-aggregation.asciidoc
@@ -0,0 +1,154 @@
[[search-aggregations-bucket-sampler-aggregation]]
=== Sampler Aggregation
Contributor


Not sure if it already is, but can't see it if so: we should mark this feature as experimental in the docs


experimental[]

A filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents.
Optionally, diversity settings can be used to limit the number of matches that share a common value such as an "author".

.Example use cases:
* Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches
* Removing bias from analytics by ensuring fair representation of content from different sources
* Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms`


Example:

[source,js]
--------------------------------------------------
{
    "query": {
        "match": {
            "text": "iphone"
        }
    },
    "aggs": {
        "sample": {
            "sampler": {
                "shard_size": 200,
                "field" : "user.id"
            },
            "aggs": {
                "keywords": {
                    "significant_terms": {
                        "field": "text"
                    }
                }
            }
        }
    }
}
--------------------------------------------------

Response:

[source,js]
--------------------------------------------------
{
    ...
    "aggregations": {
        "sample": {
            "doc_count": 1000,<1>
            "keywords": {<2>
                "doc_count": 1000,
                "buckets": [
                    ...
                    {
                        "key": "bend",
                        "doc_count": 58,
                        "score": 37.982536582524276,
                        "bg_count": 103
                    },
                    ...
                ]
            }
        }
    }
}
--------------------------------------------------

<1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested `significant_terms` aggregation was therefore limited rather than unbounded.
<2> The results of the `significant_terms` aggregation are not skewed by any single over-active Twitter user because we asked for a maximum of one tweet from any one user in our sample.


==== shard_size

The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.
The default value is 100.
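
For illustration, a request that only raises the per-shard sample limit (reusing the `text` field from the example above) might look like the following sketch; with 5 shards it would consider up to 2,500 top-scoring documents in total:

[source,js]
--------------------------------------------------
{
    "aggs": {
        "sample": {
            "sampler": {
                "shard_size": 500
            },
            "aggs": {
                "keywords": {
                    "significant_terms": {
                        "field": "text"
                    }
                }
            }
        }
    }
}
--------------------------------------------------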

=== Controlling diversity
Optionally, you can use the `field` or `script` and `max_docs_per_value` settings to control the maximum number of documents collected on any one shard which share a common value.
The choice of value (e.g. `author`) is loaded from a regular `field` or derived dynamically by a `script`.

The aggregation will throw an error if the choice of field or script produces multiple values for a document.
It is currently not possible to offer this form of de-duplication using many values, primarily due to concerns over efficiency.

NOTE: Any good market researcher will tell you that when working with samples of data it is important
that the sample represents a healthy variety of opinions rather than being skewed by any single voice.
The same is true with aggregations: sampling with these diversity settings can offer a way to remove bias from your content (an over-populated geography, a large spike in a timeline or an over-active forum spammer).

==== Field

Controlling diversity using a field:

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "sample" : {
            "sampler" : {
                "field" : "author",
                "max_docs_per_value" : 3
            }
        }
    }
}
--------------------------------------------------

Note that the `max_docs_per_value` setting applies on a per-shard basis only for the purposes of shard-local sampling.
It is not intended as a way of providing a global de-duplication feature on search results.



==== Script

Controlling diversity using a script:

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "sample" : {
            "sampler" : {
                "script" : "doc['author'].value + '/' + doc['genre'].value"
            }
        }
    }
}
--------------------------------------------------
Note that in the above example we use the default `max_docs_per_value` setting of 1 and combine the author and genre fields to ensure
each shard sample has, at most, one match for any author/genre pair.


==== execution_hint

When using the settings to control diversity, the optional `execution_hint` setting can influence the management of the values used for de-duplication.
Each option will hold up to `shard_size` values in memory while performing de-duplication but the type of value held can be controlled as follows:

- hold field values directly (`map`)
- hold ordinals of the field as determined by the Lucene index (`global_ordinals`)
- hold hashes of the field values - with potential for hash collisions (`bytes_hash`)

The default is to use `global_ordinals` if this information is available from the Lucene index, falling back to `map` if not.
The `bytes_hash` setting may prove faster in some cases but, because of hash collisions, introduces the possibility of false positives in the de-duplication logic.
Please note that Elasticsearch will ignore the choice of execution hint if it is not applicable and that there is no backward compatibility guarantee on these hints.
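
For example, forcing the `map` strategy on the diversified sample from the field example above might look like the following sketch (whether it is actually faster than `global_ordinals` depends on the field's cardinality, so treat the choice as illustrative):

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "sample" : {
            "sampler" : {
                "field" : "author",
                "max_docs_per_value" : 1,
                "execution_hint" : "map"
            }
        }
    }
}
--------------------------------------------------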

=== Limitations

==== Cannot be nested under `breadth_first` aggregations
Being a quality-based filter, the sampler aggregation needs access to the relevance score produced for each document.
It therefore cannot be nested under a `terms` aggregation whose `collect_mode` has been switched from the default `depth_first` mode to `breadth_first`, as this discards scores.
In this situation an error will be thrown.
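
As an illustrative sketch of this unsupported combination (the `category` field is assumed for the example), a request like the following would be rejected because the outer `terms` aggregation's `breadth_first` mode discards the scores the sampler needs:

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "categories" : {
            "terms" : {
                "field" : "category",
                "collect_mode" : "breadth_first"
            },
            "aggs" : {
                "sample" : {
                    "sampler" : {
                        "shard_size" : 100
                    }
                }
            }
        }
    }
}
--------------------------------------------------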

==== Limited de-dup logic
The de-duplication logic in the diversity settings applies only at the shard level and so will not apply across shards.

==== No specialized syntax for geo/date fields
Currently the diversifying values can only be defined by a choice of `field` or `script`; there is no added syntactic sugar for expressing geo or date units such as "1w" (1 week).
This support may be added in a later release; for now, users have to create these sorts of values using a script, as in the sketch below.
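
For example, week-level diversity on a date field could be approximated with a script that buckets the field's value into weeks. The `timestamp` field name and the assumption that the script sees the date as epoch milliseconds are illustrative only; 604800000 is the number of milliseconds in a week, so all documents from the same week share one diversity value:

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "sample" : {
            "sampler" : {
                "script" : "doc['timestamp'].value / 604800000",
                "max_docs_per_value" : 1
            }
        }
    }
}
--------------------------------------------------
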
@@ -20,6 +20,7 @@

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.common.inject.Module;
import org.elasticsearch.common.inject.SpawnModules;
@@ -38,6 +39,7 @@
import org.elasticsearch.search.aggregations.bucket.range.date.DateRangeParser;
import org.elasticsearch.search.aggregations.bucket.range.geodistance.GeoDistanceParser;
import org.elasticsearch.search.aggregations.bucket.range.ipv4.IpRangeParser;
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerParser;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsParser;
import org.elasticsearch.search.aggregations.bucket.significant.heuristics.SignificantTermsHeuristicModule;
import org.elasticsearch.search.aggregations.bucket.terms.TermsParser;
@@ -80,6 +82,7 @@ public AggregationModule() {
parsers.add(MissingParser.class);
parsers.add(FilterParser.class);
parsers.add(FiltersParser.class);
parsers.add(SamplerParser.class);
parsers.add(TermsParser.class);
parsers.add(SignificantTermsParser.class);
parsers.add(RangeParser.class);
@@ -19,6 +19,7 @@
package org.elasticsearch.search.aggregations;

import org.apache.lucene.index.LeafReaderContext;
import org.elasticsearch.search.aggregations.bucket.BestBucketsDeferringCollector;
import org.elasticsearch.search.aggregations.bucket.DeferringBucketCollector;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.internal.SearchContext.Lifetime;
@@ -136,7 +137,7 @@ public final void preCollection() throws IOException {
        for (int i = 0; i < subAggregators.length; ++i) {
            if (shouldDefer(subAggregators[i])) {
                if (recordingWrapper == null) {
-                    recordingWrapper = new DeferringBucketCollector();
+                    recordingWrapper = getDeferringCollector();
                }
                deferredCollectors.add(subAggregators[i]);
                subAggregators[i] = recordingWrapper.wrap(subAggregators[i]);
@@ -153,6 +154,12 @@ public final void preCollection() throws IOException {
        collectableSubAggregators.preCollection();
    }

    public DeferringBucketCollector getDeferringCollector() {
        // Default impl is a collector that selects the best buckets
        // but an alternative defer policy may be based on best docs.
        return new BestBucketsDeferringCollector();
    }

    /**
     * This method should be overridden by subclasses that want to defer calculation
     * of a child aggregation until a first pass is complete and a set of buckets has
@@ -36,6 +36,8 @@
import org.elasticsearch.search.aggregations.bucket.range.date.InternalDateRange;
import org.elasticsearch.search.aggregations.bucket.range.geodistance.InternalGeoDistance;
import org.elasticsearch.search.aggregations.bucket.range.ipv4.InternalIPv4Range;
import org.elasticsearch.search.aggregations.bucket.sampler.InternalSampler;
import org.elasticsearch.search.aggregations.bucket.sampler.UnmappedSampler;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantLongTerms;
import org.elasticsearch.search.aggregations.bucket.significant.SignificantStringTerms;
import org.elasticsearch.search.aggregations.bucket.significant.UnmappedSignificantTerms;
@@ -83,6 +85,8 @@ protected void configure() {
InternalGlobal.registerStreams();
InternalFilter.registerStreams();
InternalFilters.registerStream();
InternalSampler.registerStreams();
UnmappedSampler.registerStreams();
InternalMissing.registerStreams();
StringTerms.registerStreams();
LongTerms.registerStreams();