New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve terms aggregation to perform the segment ordinal to global ordinal lookup post segment collection #5895
Changes from all commits
f453762
3854bf4
1461cc3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,9 @@ | |
import org.apache.lucene.util.ArrayUtil; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.apache.lucene.util.RamUsageEstimator; | ||
import org.elasticsearch.ExceptionsHelper; | ||
import org.elasticsearch.common.lease.Releasables; | ||
import org.elasticsearch.common.util.LongArray; | ||
import org.elasticsearch.common.util.LongHash; | ||
import org.elasticsearch.index.fielddata.BytesValues; | ||
import org.elasticsearch.index.fielddata.ordinals.Ordinals; | ||
|
@@ -37,6 +39,8 @@ | |
import java.io.IOException; | ||
import java.util.Arrays; | ||
|
||
import static org.elasticsearch.index.fielddata.ordinals.InternalGlobalOrdinalsBuilder.GlobalOrdinalMapping; | ||
|
||
/** | ||
* An aggregator of string values that relies on global ordinals in order to build buckets. | ||
*/ | ||
|
@@ -47,8 +51,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr | |
protected Ordinals.Docs globalOrdinals; | ||
|
||
public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, long estimatedBucketCount, | ||
InternalOrder order, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { | ||
super(name, factories, estimatedBucketCount, aggregationContext, parent, order, requiredSize, shardSize, minDocCount); | ||
long maxOrd, InternalOrder order, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { | ||
super(name, factories, maxOrd, aggregationContext, parent, order, requiredSize, shardSize, minDocCount); | ||
this.valuesSource = valuesSource; | ||
} | ||
|
||
|
@@ -65,7 +69,6 @@ public boolean shouldCollect() { | |
public void setNextReader(AtomicReaderContext reader) { | ||
globalValues = valuesSource.globalBytesValues(); | ||
globalOrdinals = globalValues.ordinals(); | ||
initializeDocCounts(globalOrdinals.getMaxOrd()); | ||
} | ||
|
||
@Override | ||
|
@@ -135,9 +138,10 @@ public static class WithHash extends GlobalOrdinalsStringTermsAggregator { | |
private final LongHash bucketOrds; | ||
|
||
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, long estimatedBucketCount, | ||
InternalOrder order, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, | ||
long maxOrd, InternalOrder order, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, | ||
Aggregator parent) { | ||
super(name, factories, valuesSource, estimatedBucketCount, order, requiredSize, shardSize, minDocCount, aggregationContext, parent); | ||
// Set maxOrd to estimatedBucketCount! To be conservative with memory. | ||
super(name, factories, valuesSource, estimatedBucketCount, estimatedBucketCount, order, requiredSize, shardSize, minDocCount, aggregationContext, parent); | ||
bucketOrds = new LongHash(estimatedBucketCount, aggregationContext.bigArrays()); | ||
} | ||
|
||
|
@@ -172,4 +176,79 @@ protected void doClose() { | |
|
||
} | ||
|
||
/** | ||
* Variant of {@link GlobalOrdinalsStringTermsAggregator} that resolves global ordinals post segment collection | ||
* instead of on the fly for each match.This is beneficial for low cardinality fields, because it can reduce | ||
* the amount of look-ups significantly. | ||
*/ | ||
public static class LowCardinality extends GlobalOrdinalsStringTermsAggregator { | ||
|
||
private final LongArray segmentDocCounts; | ||
|
||
private Ordinals.Docs segmentOrdinals; | ||
private LongArray current; | ||
|
||
public LowCardinality(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, long estimatedBucketCount, | ||
long maxOrd, InternalOrder order, int requiredSize, int shardSize, long minDocCount, AggregationContext aggregationContext, Aggregator parent) { | ||
super(name, factories, valuesSource, estimatedBucketCount, maxOrd, order, requiredSize, shardSize, minDocCount, aggregationContext, parent); | ||
this.segmentDocCounts = bigArrays.newLongArray(maxOrd, true); | ||
} | ||
|
||
@Override | ||
public void collect(int doc, long owningBucketOrdinal) throws IOException { | ||
final int numOrds = segmentOrdinals.setDocument(doc); | ||
for (int i = 0; i < numOrds; i++) { | ||
final long segmentOrd = segmentOrdinals.nextOrd(); | ||
current.increment(segmentOrd, 1); | ||
} | ||
} | ||
|
||
@Override | ||
public void setNextReader(AtomicReaderContext reader) { | ||
if (segmentOrdinals != null && segmentOrdinals.getMaxOrd() != globalOrdinals.getMaxOrd()) { | ||
mapSegmentCountsToGlobalCounts(); | ||
} | ||
|
||
super.setNextReader(reader); | ||
BytesValues.WithOrdinals bytesValues = valuesSource.bytesValues(); | ||
segmentOrdinals = bytesValues.ordinals(); | ||
if (segmentOrdinals.getMaxOrd() != globalOrdinals.getMaxOrd()) { | ||
current = segmentDocCounts; | ||
} else { | ||
current = getDocCounts(); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. very nice way to avoid the condition in |
||
} | ||
|
||
@Override | ||
protected void doPostCollection() { | ||
if (segmentOrdinals.getMaxOrd() != globalOrdinals.getMaxOrd()) { | ||
mapSegmentCountsToGlobalCounts(); | ||
} | ||
} | ||
|
||
@Override | ||
protected void doClose() { | ||
Releasables.close(segmentDocCounts); | ||
} | ||
|
||
private void mapSegmentCountsToGlobalCounts() { | ||
// There is no public method in Ordinals.Docs that allows for this mapping... | ||
// This is the cleanest way I can think of so far | ||
GlobalOrdinalMapping mapping = (GlobalOrdinalMapping) globalOrdinals; | ||
for (int i = 0; i < segmentDocCounts.size(); i++) { | ||
final long inc = segmentDocCounts.get(i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ow nice, I didn't know that |
||
if (inc == 0) { | ||
continue; | ||
} | ||
final long globalOrd = mapping.getGlobalOrd(i); | ||
try { | ||
incrementBucketDocCount(inc, globalOrd); | ||
segmentDocCounts.set(i, 0); // reset for next segment | ||
} catch (IOException e) { | ||
throw ExceptionsHelper.convertToElastic(e); | ||
} | ||
} | ||
} | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
|
||
import org.apache.lucene.index.AtomicReaderContext; | ||
import org.apache.lucene.index.IndexReaderContext; | ||
import org.apache.lucene.search.IndexSearcher; | ||
import org.apache.lucene.util.ArrayUtil; | ||
import org.apache.lucene.util.BytesRef; | ||
import org.apache.lucene.util.BytesRefHash; | ||
|
@@ -29,6 +30,7 @@ | |
import org.elasticsearch.common.util.CollectionUtils; | ||
import org.elasticsearch.index.fielddata.*; | ||
import org.elasticsearch.index.fielddata.AtomicFieldData.Order; | ||
import org.elasticsearch.index.fielddata.ordinals.Ordinals; | ||
import org.elasticsearch.script.SearchScript; | ||
import org.elasticsearch.search.aggregations.support.ValuesSource.Bytes.SortedAndUnique.SortedUniqueBytesValues; | ||
import org.elasticsearch.search.aggregations.support.values.ScriptBytesValues; | ||
|
@@ -159,6 +161,8 @@ public static abstract class WithOrdinals extends Bytes implements TopReaderCont | |
|
||
public abstract BytesValues.WithOrdinals globalBytesValues(); | ||
|
||
public abstract long maxOrd(IndexSearcher indexSearcher); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should it be called There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it should, I'll change that. |
||
|
||
public static class FieldData extends WithOrdinals implements ReaderContextAware { | ||
|
||
protected boolean needsHashes; | ||
|
@@ -229,6 +233,17 @@ public BytesValues.WithOrdinals globalBytesValues() { | |
} | ||
return globalBytesValues; | ||
} | ||
|
||
@Override | ||
public long maxOrd(IndexSearcher indexSearcher) { | ||
IndexReaderContext topReaderContext = indexSearcher.getTopReaderContext(); | ||
AtomicReaderContext atomicReaderContext = indexSearcher.getIndexReader().leaves().get(0); | ||
IndexFieldData.WithOrdinals<?> globalFieldData = indexFieldData.loadGlobal(topReaderContext.reader()); | ||
AtomicFieldData.WithOrdinals afd = globalFieldData.load(atomicReaderContext); | ||
BytesValues.WithOrdinals values = afd.getBytesValues(false); | ||
Ordinals.Docs ordinals = values.ordinals(); | ||
return ordinals.getMaxOrd(); | ||
} | ||
} | ||
|
||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we don't need to have it public anymore?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I need access to this when mapping the collected segment ordinals to global ordinals. Or can that be done via another way?