SignificantLongTermsAggregator.java (forked from elastic/elasticsearch)
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.search.aggregations.bucket.significant;

import org.apache.lucene.index.IndexReader;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.bucket.terms.LongTermsAggregator;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource;
import org.elasticsearch.search.internal.ContextIndexSearcher;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
/**
 * An aggregator of significant long terms: it counts term frequencies within the documents
 * that reach this aggregator (the subset) and compares them against frequencies across the
 * whole shard (the superset) to surface terms that are unusually common in the subset.
 */
public class SignificantLongTermsAggregator extends LongTermsAggregator {

    public SignificantLongTermsAggregator(String name, AggregatorFactories factories, NumericValuesSource valuesSource,
            long estimatedBucketCount, int requiredSize, int shardSize, long minDocCount,
            AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory) {
        super(name, factories, valuesSource, estimatedBucketCount, null, requiredSize, shardSize, minDocCount, aggregationContext, parent);
        this.termsAggFactory = termsAggFactory;
    }

    // Number of documents collected on this shard; serves as the subset ("foreground") size.
    protected long numCollectedDocs;

    // Provides background (superset) frequencies for candidate terms.
    private final SignificantTermsAggregatorFactory termsAggFactory;
    @Override
    public void collect(int doc, long owningBucketOrdinal) throws IOException {
        super.collect(doc, owningBucketOrdinal);
        numCollectedDocs++;
    }
    @Override
    public SignificantLongTerms buildAggregation(long owningBucketOrdinal) {
        assert owningBucketOrdinal == 0;
        final int size = (int) Math.min(bucketOrds.size(), shardSize);

        ContextIndexSearcher searcher = context.searchContext().searcher();
        IndexReader topReader = searcher.getIndexReader();
        long supersetSize = topReader.numDocs();
        long subsetSize = numCollectedDocs;

        // Build the terms enum for background-frequency lookups once, before the loop;
        // getBackgroundFrequency() below relies on it for every allocated bucket.
        termsAggFactory.buildTermsEnum(context);

        BucketSignificancePriorityQueue ordered = new BucketSignificancePriorityQueue(size);
        SignificantLongTerms.Bucket spare = null;
        for (long i = 0; i < bucketOrds.capacity(); i++) {
            final long ord = bucketOrds.id(i);
            if (ord < 0) {
                // slot is not allocated
                continue;
            }

            if (spare == null) {
                spare = new SignificantLongTerms.Bucket(0, 0, 0, 0, 0, null);
            }
            spare.term = bucketOrds.key(i);
            spare.subsetDf = bucketDocCount(ord);
            spare.subsetSize = subsetSize;
            spare.supersetDf = termsAggFactory.getBackgroundFrequency(spare.term);
            spare.supersetSize = supersetSize;
            assert spare.subsetDf <= spare.supersetDf;
            // During shard-local down-selection we use subset/superset stats that are for this shard only
            // Back at the central reducer these properties will be updated with global stats
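            // A sketch of what updateScore() presumably computes (assuming the JLH-style
            // heuristic used for significance scoring in this code base; not shown in this file):
            //   subsetP   = subsetDf / subsetSize          // term frequency in the foreground
            //   supersetP = supersetDf / supersetSize      // term frequency in the background
            //   score     = (subsetP - supersetP) * (subsetP / supersetP), floored at 0
            // e.g. a term in 10 of 100 subset docs vs 200 of 10,000 superset docs would score
            // (0.10 - 0.02) * (0.10 / 0.02) = 0.4.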
            spare.updateScore();

            spare.bucketOrd = ord;
            spare = (SignificantLongTerms.Bucket) ordered.insertWithOverflow(spare);
        }
        final InternalSignificantTerms.Bucket[] list = new InternalSignificantTerms.Bucket[ordered.size()];
        // The queue pops the lowest-scoring bucket first, so fill the array back to front
        // to end up with buckets in descending order of significance.
        for (int i = ordered.size() - 1; i >= 0; i--) {
            final SignificantLongTerms.Bucket bucket = (SignificantLongTerms.Bucket) ordered.pop();
            bucket.aggregations = bucketAggregations(bucket.bucketOrd);
            list[i] = bucket;
        }
        return new SignificantLongTerms(subsetSize, supersetSize, name, valuesSource.formatter(), requiredSize, minDocCount,
                Arrays.asList(list));
    }
    @Override
    public SignificantLongTerms buildEmptyAggregation() {
        // We need to account for the significance of a miss in our global stats - provide corpus size as context.
        // Even an empty shard response must report its superset size so the reducer can
        // assemble accurate global document counts.
        ContextIndexSearcher searcher = context.searchContext().searcher();
        IndexReader topReader = searcher.getIndexReader();
        int supersetSize = topReader.numDocs();
        return new SignificantLongTerms(0, supersetSize, name, valuesSource.formatter(), requiredSize, minDocCount,
                Collections.<InternalSignificantTerms.Bucket>emptyList());
    }
    @Override
    public void doRelease() {
        // Release the bucket ordinals hash and the terms aggregator factory.
        Releasables.release(bucketOrds, termsAggFactory);
    }
}
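// For context (request shape and field name hypothetical): this class backs shard-level
// execution of a significant_terms aggregation over a numeric field, e.g.
//   { "aggs": { "sig": { "significant_terms": {
//       "field": "some_long_field", "size": 10, "shard_size": 100 } } } }
// where "size" corresponds to requiredSize and "shard_size" to shardSize above.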