Skip to content

Commit

Permalink
Fielddata: goodbye comparators.
Browse files Browse the repository at this point in the history
This commit removes custom comparators in favor of the ones that are in Lucene.

The major change is for nested documents: instead of having a comparator wrapper
that deals with nested documents, this is done at the fielddata level by having
a selector that returns the value to use for comparison.

Sorting with custom missing string values might be slower because it now uses
TermValComparator: Lucene's TermOrdValComparator only supports sorting
missing values first or last. Other than this particular case, this change
will allow us to benefit from improvements to comparators on the Lucene side.

Close #5980
  • Loading branch information
jpountz committed Jul 23, 2014
1 parent 08fa5c6 commit 12ee48c
Show file tree
Hide file tree
Showing 52 changed files with 1,509 additions and 2,011 deletions.
Expand Up @@ -40,6 +40,13 @@ public enum FieldData {
assert Lucene.VERSION == Version.LUCENE_4_9 : "Remove emptySortedNumeric in 4.10 and use the method with the same name from Lucene's DocValues class. See LUCENE-5834.";
}

/**
 * Build a {@link SortedBinaryDocValues} instance in which no document has a value.
 *
 * @param maxDoc size handed to the {@link Bits.MatchNoBits} instance that backs the result
 * @return the singleton wrapper around an empty binary doc values instance
 */
public static SortedBinaryDocValues emptySortedBinary(int maxDoc) {
    // No document is flagged as having a value.
    final Bits docsWithValue = new Bits.MatchNoBits(maxDoc);
    return singleton(DocValues.emptyBinary(), docsWithValue);
}

/**
* Return a {@link SortedNumericDocValues} that doesn't contain any value.
*/
Expand Down
Expand Up @@ -21,20 +21,23 @@

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.FieldComparatorSource;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexComponent;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.indices.fielddata.breaker.CircuitBreakerService;
import org.elasticsearch.search.MultiValueMode;

import java.io.IOException;

/**
* Thread-safe utility class that allows to get per-segment values via the
* {@link #load(AtomicReaderContext)} method.
Expand Down Expand Up @@ -93,7 +96,7 @@ public static MemoryStorageFormat getMemoryStorageHint(FieldDataType fieldDataTy
/**
* Comparator used for sorting.
* <p>
* {@code missingValue} is annotated {@link Nullable} and may therefore be {@code null};
* {@code sortMode} is the {@link MultiValueMode} to use. The three-argument variant
* additionally receives a {@link Nested} instance.
* NOTE(review): the two declarations below look like the before/after forms of the
* same method -- confirm which one is meant to remain.
*/
XFieldComparatorSource comparatorSource(@Nullable Object missingValue, MultiValueMode sortMode);
XFieldComparatorSource comparatorSource(@Nullable Object missingValue, MultiValueMode sortMode, Nested nested);

/**
* Clears any resources associated with this field data.
Expand All @@ -116,6 +119,52 @@ public abstract class XFieldComparatorSource extends FieldComparatorSource {
UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, MAX_TERM);
}

/**
 * Pairs a filter matching parent (root) documents with a filter matching
 * child (inner) documents. For every root document R, R is in the parent
 * filter and its children are the documents of the inner set located
 * between the previous parent + 1 (or 0 when there is no previous parent)
 * and R (excluded).
 */
public static class Nested {
    private final Filter rootFilter;
    private final Filter innerFilter;

    /**
     * @param rootFilter  filter matching the root documents
     * @param innerFilter filter matching the inner documents
     */
    public Nested(Filter rootFilter, Filter innerFilter) {
        this.rootFilter = rootFilter;
        this.innerFilter = innerFilter;
    }

    // TODO: nested docs should not be random filters but specialized
    // ones that guarantee that you always get a FixedBitSet
    /**
     * Convert an arbitrary {@link DocIdSet} into a {@link FixedBitSet}.
     *
     * @return {@code null} when {@code set} is {@code null}, {@code set} itself when it
     *         already is a {@link FixedBitSet}, otherwise a new bit set of size
     *         {@code maxDoc} filled from the set's iterator (left empty if the
     *         iterator is {@code null})
     */
    @Deprecated
    private static FixedBitSet toFixedBitSet(DocIdSet set, int maxDoc) throws IOException {
        if (set == null) {
            return null;
        }
        if (set instanceof FixedBitSet) {
            return (FixedBitSet) set;
        }
        final FixedBitSet bits = new FixedBitSet(maxDoc);
        final DocIdSetIterator docs = set.iterator();
        if (docs != null) {
            bits.or(docs);
        }
        return bits;
    }

    /**
     * Get a {@link FixedBitSet} that matches the root documents of the given segment.
     * May be {@code null} if the root filter yields a {@code null} set.
     */
    public FixedBitSet rootDocs(AtomicReaderContext ctx) throws IOException {
        final DocIdSet roots = rootFilter.getDocIdSet(ctx, null);
        return toFixedBitSet(roots, ctx.reader().maxDoc());
    }

    /**
     * Get a {@link FixedBitSet} that matches the inner documents of the given segment.
     * May be {@code null} if the inner filter yields a {@code null} set.
     */
    public FixedBitSet innerDocs(AtomicReaderContext ctx) throws IOException {
        final DocIdSet inners = innerFilter.getDocIdSet(ctx, null);
        return toFixedBitSet(inners, ctx.reader().maxDoc());
    }
}

/** Whether missing values should be sorted first. */
protected final boolean sortMissingFirst(Object missingValue) {
return "_first".equals(missingValue);
Expand Down
Expand Up @@ -19,53 +19,271 @@

package org.elasticsearch.index.fielddata.fieldcomparator;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.RandomAccessOrds;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.search.MultiValueMode;

import java.io.IOException;

/**
* Comparator source for string/binary values.
*/
public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparatorSource {

/** UTF-8 term containing a single code point: {@link Character#MAX_CODE_POINT} which will compare greater than all other index terms
* since {@link Character#MAX_CODE_POINT} is a noncharacter and thus shouldn't appear in an index term. */
public static final BytesRef MAX_TERM;
static {
MAX_TERM = new BytesRef();
// Character.toChars produces the UTF-16 form of the code point; it is then
// encoded as UTF-8 directly into MAX_TERM.
final char[] chars = Character.toChars(Character.MAX_CODE_POINT);
UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, MAX_TERM);
}

/** Field data from which the per-segment values to compare are loaded. */
private final IndexFieldData<?> indexFieldData;
/** Mode used to select a single value per document when sorting. */
private final MultiValueMode sortMode;
/** Raw missing-value specification as supplied by the caller; may be a marker or a concrete value. */
private final Object missingValue;
/** Root/inner document filters for nested sorting; {@code null} means no nested handling. */
private final Nested nested;

/**
 * Create a comparator source for string/binary values.
 *
 * @param indexFieldData field data providing the values to sort on
 * @param missingValue   value (or marker) describing how documents without a value are sorted
 * @param sortMode       mode used to pick the value compared for each document
 * @param nested         root/inner filters for nested documents, or {@code null} when not sorting on nested documents
 */
public BytesRefFieldComparatorSource(IndexFieldData<?> indexFieldData, Object missingValue, MultiValueMode sortMode, Nested nested) {
    this.indexFieldData = indexFieldData;
    this.sortMode = sortMode;
    this.missingValue = missingValue;
    this.nested = nested;
}

/**
* Reports {@link SortField.Type#STRING} as the reduced sort type of this comparator source.
*/
@Override
public SortField.Type reducedType() {
return SortField.Type.STRING;
}

/**
* Load the binary values of the given segment from the wrapped field data.
* Being {@code protected}, this can be overridden to supply values from another source.
*/
protected SortedBinaryDocValues getValues(AtomicReaderContext context) {
return indexFieldData.load(context).getBytesValues();
}

/**
* Hook receiving the current {@link Scorer}; does nothing by default.
* The value-based comparator built by {@link #newComparator} forwards its own
* {@code setScorer} call to this method.
*/
protected void setScorer(Scorer scorer) {}

/**
 * Create the comparator used to sort on this field.
 * <p>
 * If the field data is ordinal-based and missing values are requested first or last,
 * Lucene's ordinal-based {@link FieldComparator.TermOrdValComparator} is used. In every
 * other case (non-ordinal field data, or an arbitrary missing value) the value-based
 * {@link TermValComparator} is used instead.
 *
 * @param fieldname name of the field being sorted on; asserted to match the index name of the field data when one is set
 * @param numHits   number of slots the comparator must hold
 * @param sortPos   position of this sort criterion (not used here)
 * @param reversed  whether the sort order is reversed; flips the "missing last" decision
 * @return a comparator over {@link BytesRef} values
 */
@Override
public FieldComparator<?> newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
    assert indexFieldData == null || fieldname.equals(indexFieldData.getFieldNames().indexName());

    final boolean sortMissingLast = sortMissingLast(missingValue) ^ reversed;
    final BytesRef missingBytes = (BytesRef) missingObject(missingValue, reversed);
    if (indexFieldData instanceof IndexOrdinalsFieldData) {
        // The ordinal-based comparator only supports sorting missing values first or last so when
        // a missing value is provided we fall back to the (slow) value-based comparator
        // TODO: handle arbitrary missing values via a selector
        if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
            return new FieldComparator.TermOrdValComparator(numHits, null, sortMissingLast) {

                @Override
                protected SortedDocValues getSortedDocValues(AtomicReaderContext context, String field) throws IOException {
                    final RandomAccessOrds values = ((IndexOrdinalsFieldData) indexFieldData).load(context).getOrdinalsValues();
                    final SortedDocValues selectedValues;
                    if (nested == null) {
                        selectedValues = sortMode.select(values);
                    } else {
                        final FixedBitSet rootDocs = nested.rootDocs(context);
                        final FixedBitSet innerDocs = nested.innerDocs(context);
                        selectedValues = sortMode.select(values, rootDocs, innerDocs);
                    }
                    return selectedValues;
                }

                @Override
                public BytesRef value(int slot) {
                    // TODO: When serializing the response to the coordinating node, we lose the information about
                    // whether the comparator sorts missing docs first or last. We should fix it and let
                    // TopDocs.merge deal with it (it knows how to)
                    BytesRef value = super.value(slot);
                    if (value == null) {
                        value = missingBytes;
                    }
                    return value;
                }

            };
        }
    }

    // Sentinel instance substituted for a null missing value; it is recognised by
    // identity in isNull below so that such documents are treated as missing.
    final BytesRef nullPlaceHolder = new BytesRef();
    final BytesRef nonNullMissingBytes = missingBytes == null ? nullPlaceHolder : missingBytes;
    return new TermValComparator(numHits, null, sortMissingLast) {

        @Override
        protected BinaryDocValues getBinaryDocValues(AtomicReaderContext context, String field) throws IOException {
            final SortedBinaryDocValues values = getValues(context);
            final BinaryDocValues selectedValues;
            if (nested == null) {
                selectedValues = sortMode.select(values, nonNullMissingBytes);
            } else {
                final FixedBitSet rootDocs = nested.rootDocs(context);
                final FixedBitSet innerDocs = nested.innerDocs(context);
                selectedValues = sortMode.select(values, nonNullMissingBytes, rootDocs, innerDocs, context.reader().maxDoc());
            }
            return selectedValues;
        }

        @Override
        protected Bits getDocsWithField(AtomicReaderContext context, String field) throws IOException {
            // Missing documents are signalled through the sentinel value, not through this bit set.
            return new Bits.MatchAllBits(context.reader().maxDoc());
        }

        @Override
        protected boolean isNull(int doc, BytesRef term) {
            return term == nullPlaceHolder;
        }

        @Override
        public void setScorer(Scorer scorer) {
            BytesRefFieldComparatorSource.this.setScorer(scorer);
        }

        @Override
        public BytesRef value(int slot) {
            BytesRef value = super.value(slot);
            if (value == null) {
                value = missingBytes;
            }
            return value;
        }

    };
}

// Guard, evaluated only when assertions are enabled, that trips as soon as the
// Lucene version is no longer 4.9 as a reminder to drop the copied comparator below.
static {
assert Lucene.VERSION == Version.LUCENE_4_9 : "The comparator below is a raw copy of Lucene's, remove it when upgrading to 4.10";
}

/** Sorts by field's natural Term sort order. All
 * comparisons are done using BytesRef.compareTo, which is
 * slow for medium to large result sets but possibly
 * very fast for very small results sets. */
public static class TermValComparator extends FieldComparator<BytesRef> {

    /** Value currently held by each slot; {@code null} marks a missing value. */
    private final BytesRef[] values;
    /** Reusable per-slot buffers into which document values are copied. */
    private final BytesRef[] tempBRs;
    private BinaryDocValues docTerms;
    private Bits docsWithField;
    private final String field;
    private BytesRef bottom;
    private BytesRef topValue;
    /** Result of comparing a missing value against a present one: 1 when missing sorts last, -1 otherwise. */
    private final int missingSortCmp;

    /** Sole constructor. */
    public TermValComparator(int numHits, String field, boolean sortMissingLast) {
        values = new BytesRef[numHits];
        tempBRs = new BytesRef[numHits];
        this.field = field;
        missingSortCmp = sortMissingLast ? 1 : -1;
    }

    @Override
    public int compare(int slot1, int slot2) {
        final BytesRef val1 = values[slot1];
        final BytesRef val2 = values[slot2];
        return compareValues(val1, val2);
    }

    @Override
    public int compareBottom(int doc) {
        final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
        return compareValues(bottom, comparableBytes);
    }

    @Override
    public void copy(int slot, int doc) {
        final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
        if (comparableBytes == null) {
            values[slot] = null;
        } else {
            // Copy into a slot-owned buffer rather than keeping a reference to the doc values' term.
            if (tempBRs[slot] == null) {
                tempBRs[slot] = new BytesRef();
            }
            values[slot] = tempBRs[slot];
            values[slot].copyBytes(comparableBytes);
        }
    }

    /** Retrieves the BinaryDocValues for the field in this segment */
    protected BinaryDocValues getBinaryDocValues(AtomicReaderContext context, String field) throws IOException {
        return FieldCache.DEFAULT.getTerms(context.reader(), field, true);
    }

    /** Retrieves the set of documents that have a value in this segment */
    protected Bits getDocsWithField(AtomicReaderContext context, String field) throws IOException {
        return FieldCache.DEFAULT.getDocsWithField(context.reader(), field);
    }

    /** Check whether the given value represents <tt>null</tt>. This can be
     * useful if the {@link BinaryDocValues} returned by {@link #getBinaryDocValues}
     * use a special value as a sentinel. The default implementation checks
     * {@link #getDocsWithField}.
     * <p>NOTE: The null value can only be an EMPTY {@link BytesRef}. */
    protected boolean isNull(int doc, BytesRef term) {
        return docsWithField != null && docsWithField.get(doc) == false;
    }

    @Override
    public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
        docTerms = getBinaryDocValues(context, field);
        docsWithField = getDocsWithField(context, field);
        // When every document has a value there is nothing to check, so drop the bit set.
        if (docsWithField instanceof Bits.MatchAllBits) {
            docsWithField = null;
        }
        return this;
    }

    @Override
    public void setBottom(final int bottom) {
        this.bottom = values[bottom];
    }

    @Override
    public void setTopValue(BytesRef value) {
        // null is fine: it means the last doc of the prior
        // search was missing this value
        topValue = value;
    }

    @Override
    public BytesRef value(int slot) {
        return values[slot];
    }

    @Override
    public int compareValues(BytesRef val1, BytesRef val2) {
        // missing values sort first or last depending on missingSortCmp:
        if (val1 == null) {
            if (val2 == null) {
                return 0;
            }
            return missingSortCmp;
        } else if (val2 == null) {
            return -missingSortCmp;
        }
        return val1.compareTo(val2);
    }

    @Override
    public int compareTop(int doc) {
        final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
        return compareValues(topValue, comparableBytes);
    }

    /**
     * Given a document and a term, return the term itself if it exists or
     * <tt>null</tt> otherwise.
     */
    private BytesRef getComparableBytes(int doc, BytesRef term) {
        // Only an empty term can stand for a missing value, so isNull is consulted just for those.
        if (term.length == 0 && isNull(doc, term)) {
            return null;
        }
        return term;
    }
}

}

0 comments on commit 12ee48c

Please sign in to comment.