Fielddata: goodbye comparators.

This commit removes custom comparators in favor of the ones that are in Lucene. The major change is for nested documents: instead of having a comparator wrapper that deals with nested documents, this is now done at the fielddata level by having a selector that returns the value to use for comparison. Sorting with custom missing string values might be slower because it now uses TermValComparator, as Lucene's TermOrdValComparator only supports sorting missing values first or last. Other than this particular case, this change will allow us to benefit from improvements to comparators on the Lucene side. Close #5980
elastic · Jul 23, 2014 · 12ee48c · 12ee48c
1 parent 08fa5c6
commit 12ee48c
Show file tree

Hide file tree

Showing 52 changed files with 1,509 additions and 2,011 deletions.
diff --git a/src/main/java/org/elasticsearch/index/fielddata/FieldData.java b/src/main/java/org/elasticsearch/index/fielddata/FieldData.java
@@ -40,6 +40,13 @@ public enum FieldData {
         assert Lucene.VERSION == Version.LUCENE_4_9 : "Remove emptySortedNumeric in 4.10 and use the method with the same name from Lucene's DocValues class. See LUCENE-5834.";
     }
 
+    /**
+     * Return a {@link SortedBinaryDocValues} that doesn't contain any value.
+     */
+    public static SortedBinaryDocValues emptySortedBinary(int maxDoc) {
+        return singleton(DocValues.emptyBinary(), new Bits.MatchNoBits(maxDoc));
+    }
+
     /**
      * Return a {@link SortedNumericDocValues} that doesn't contain any value.
      */

diff --git a/src/main/java/org/elasticsearch/index/fielddata/IndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/IndexFieldData.java
@@ -21,20 +21,23 @@
 
 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.FieldComparatorSource;
-import org.apache.lucene.search.SortField;
+import org.apache.lucene.search.*;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.UnicodeUtil;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.IndexComponent;
+import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.settings.IndexSettings;
 import org.elasticsearch.indices.fielddata.breaker.CircuitBreakerService;
 import org.elasticsearch.search.MultiValueMode;
 
+import java.io.IOException;
+
 /**
  * Thread-safe utility class that allows to get per-segment values via the
  * {@link #load(AtomicReaderContext)} method.
@@ -93,7 +96,7 @@ public static MemoryStorageFormat getMemoryStorageHint(FieldDataType fieldDataTy
     /**
      * Comparator used for sorting.
      */
-    XFieldComparatorSource comparatorSource(@Nullable Object missingValue, MultiValueMode sortMode);
+    XFieldComparatorSource comparatorSource(@Nullable Object missingValue, MultiValueMode sortMode, Nested nested);
 
     /**
      * Clears any resources associated with this field data.
@@ -116,6 +119,52 @@ public abstract class XFieldComparatorSource extends FieldComparatorSource {
             UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, MAX_TERM);
         }
 
+        /**
+         * Simple wrapper class around a filter that matches parent documents
+         * and a filter that matches child documents. For every root document R,
+         * R will be in the parent filter and its children documents will be the
+         * documents that are contained in the inner set between the previous
+         * parent + 1, or 0 if there is no previous parent, and R (excluded).
+         */
+        public static class Nested {
+            private final Filter rootFilter, innerFilter;
+
+            public Nested(Filter rootFilter, Filter innerFilter) {
+                this.rootFilter = rootFilter;
+                this.innerFilter = innerFilter;
+            }
+
+            // TODO: nested docs should not be random filters but specialized
+            // ones that guarantee that you always get a FixedBitSet
+            @Deprecated
+            private static FixedBitSet toFixedBitSet(DocIdSet set, int maxDoc) throws IOException {
+                if (set == null || set instanceof FixedBitSet) {
+                    return (FixedBitSet) set;
+                } else {
+                    final FixedBitSet fixedBitSet = new FixedBitSet(maxDoc);
+                    final DocIdSetIterator it = set.iterator();
+                    if (it != null) {
+                        fixedBitSet.or(it);
+                    }
+                    return fixedBitSet;
+                }
+            }
+
+            /**
+             * Get a {@link FixedBitSet} that matches the root documents.
+             */
+            public FixedBitSet rootDocs(AtomicReaderContext ctx) throws IOException {
+                return toFixedBitSet(rootFilter.getDocIdSet(ctx, null), ctx.reader().maxDoc());
+            }
+
+            /**
+             * Get a {@link FixedBitSet} that matches the inner documents.
+             */
+            public FixedBitSet innerDocs(AtomicReaderContext ctx) throws IOException {
+                return toFixedBitSet(innerFilter.getDocIdSet(ctx, null), ctx.reader().maxDoc());
+            }
+        }
+
         /** Whether missing values should be sorted first. */
         protected final boolean sortMissingFirst(Object missingValue) {
             return "_first".equals(missingValue);

diff --git a/...java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java b/...java/org/elasticsearch/index/fielddata/fieldcomparator/BytesRefFieldComparatorSource.java
@@ -19,53 +19,271 @@
 
 package org.elasticsearch.index.fielddata.fieldcomparator;
 
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.RandomAccessOrds;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.SortField;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.index.fielddata.IndexFieldData;
 import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData;
+import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
 import org.elasticsearch.search.MultiValueMode;
 
 import java.io.IOException;
 
 /**
+ * Comparator source for string/binary values.
  */
 public class BytesRefFieldComparatorSource extends IndexFieldData.XFieldComparatorSource {
 
-    /** UTF-8 term containing a single code point: {@link Character#MAX_CODE_POINT} which will compare greater than all other index terms
-     *  since {@link Character#MAX_CODE_POINT} is a noncharacter and thus shouldn't appear in an index term. */
-    public static final BytesRef MAX_TERM;
-    static {
-        MAX_TERM = new BytesRef();
-        final char[] chars = Character.toChars(Character.MAX_CODE_POINT);
-        UnicodeUtil.UTF16toUTF8(chars, 0, chars.length, MAX_TERM);
-    }
-
     private final IndexFieldData<?> indexFieldData;
     private final MultiValueMode sortMode;
     private final Object missingValue;
+    private final Nested nested;
 
-    public BytesRefFieldComparatorSource(IndexFieldData<?> indexFieldData, Object missingValue, MultiValueMode sortMode) {
+    public BytesRefFieldComparatorSource(IndexFieldData<?> indexFieldData, Object missingValue, MultiValueMode sortMode, Nested nested) {
         this.indexFieldData = indexFieldData;
         this.sortMode = sortMode;
         this.missingValue = missingValue;
+        this.nested = nested;
     }
 
     @Override
     public SortField.Type reducedType() {
         return SortField.Type.STRING;
     }
 
+    protected SortedBinaryDocValues getValues(AtomicReaderContext context) {
+        return indexFieldData.load(context).getBytesValues();
+    }
+
+    protected void setScorer(Scorer scorer) {}
+
     @Override
     public FieldComparator<?> newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException {
-        assert fieldname.equals(indexFieldData.getFieldNames().indexName());
-        final BytesRef missingBytes = (BytesRef) missingObject(missingValue, reversed);
+        assert indexFieldData == null || fieldname.equals(indexFieldData.getFieldNames().indexName());
 
+        final boolean sortMissingLast = sortMissingLast(missingValue) ^ reversed;
+        final BytesRef missingBytes = (BytesRef) missingObject(missingValue, reversed);
         if (indexFieldData instanceof IndexOrdinalsFieldData) {
-            return new BytesRefOrdValComparator((IndexOrdinalsFieldData) indexFieldData, numHits, sortMode, missingBytes);
+            // The ordinal-based comparator only supports sorting missing values first or last so when
+            // a missing value is provided we fall back to the (slow) value-based comparator
+            // TODO: handle arbitrary missing values via a selector
+            if (sortMissingFirst(missingValue) || sortMissingLast(missingValue)) {
+                return new FieldComparator.TermOrdValComparator(numHits, null, sortMissingLast) {
+
+                    @Override
+                    protected SortedDocValues getSortedDocValues(AtomicReaderContext context, String field) throws IOException {
+                        final RandomAccessOrds values = ((IndexOrdinalsFieldData) indexFieldData).load(context).getOrdinalsValues();
+                        final SortedDocValues selectedValues;
+                        if (nested == null) {
+                            selectedValues = sortMode.select(values);
+                        } else {
+                            final FixedBitSet rootDocs = nested.rootDocs(context);
+                            final FixedBitSet innerDocs = nested.innerDocs(context);
+                            selectedValues = sortMode.select(values, rootDocs, innerDocs);
+                        }
+                        return selectedValues;
+                    }
+
+                    public BytesRef value(int slot) {
+                        // TODO: When serializing the response to the coordinating node, we lose the information about
+                        // whether the comparator sorts missing docs first or last. We should fix it and let
+                        // TopDocs.merge deal with it (it knows how to)
+                        BytesRef value = super.value(slot);
+                        if (value == null) {
+                            value = missingBytes;
+                        }
+                        return value;
+                    }
+
+                };
+            }
+        }
+
+        final BytesRef nullPlaceHolder = new BytesRef();
+        final BytesRef nonNullMissingBytes = missingBytes == null ? nullPlaceHolder : missingBytes;
+        return new TermValComparator(numHits, null, sortMissingLast) {
+
+            @Override
+            protected BinaryDocValues getBinaryDocValues(AtomicReaderContext context, String field) throws IOException {
+                final SortedBinaryDocValues values = getValues(context);
+                final BinaryDocValues selectedValues;
+                if (nested == null) {
+                    selectedValues = sortMode.select(values, nonNullMissingBytes);
+                } else {
+                    final FixedBitSet rootDocs = nested.rootDocs(context);
+                    final FixedBitSet innerDocs = nested.innerDocs(context);
+                    selectedValues = sortMode.select(values, nonNullMissingBytes, rootDocs, innerDocs, context.reader().maxDoc());
+                }
+                return selectedValues;
+            }
+
+            @Override
+            protected Bits getDocsWithField(AtomicReaderContext context, String field) throws IOException {
+                return new Bits.MatchAllBits(context.reader().maxDoc());
+            }
+
+            @Override
+            protected boolean isNull(int doc, BytesRef term) {
+                return term == nullPlaceHolder;
+            }
+
+            @Override
+            public void setScorer(Scorer scorer) {
+                BytesRefFieldComparatorSource.this.setScorer(scorer);
+            }
+
+            @Override
+            public BytesRef value(int slot) {
+                BytesRef value = super.value(slot);
+                if (value == null) {
+                    value = missingBytes;
+                }
+                return value;
+            }
+
+        };
+    }
+
+    static {
+        assert Lucene.VERSION == Version.LUCENE_4_9 : "The comparator below is a raw copy of Lucene's, remove it when upgrading to 4.10";
+    }
+
+    /** Sorts by field's natural Term sort order.  All
+     *  comparisons are done using BytesRef.compareTo, which is
+     *  slow for medium to large result sets but possibly
+     *  very fast for very small result sets. */
+    public static class TermValComparator extends FieldComparator<BytesRef> {
+
+      private final BytesRef[] values;
+      private final BytesRef[] tempBRs;
+      private BinaryDocValues docTerms;
+      private Bits docsWithField;
+      private final String field;
+      private BytesRef bottom;
+      private BytesRef topValue;
+      private final int missingSortCmp;
+
+      /** Sole constructor. */
+      public TermValComparator(int numHits, String field, boolean sortMissingLast) {
+        values = new BytesRef[numHits];
+        tempBRs = new BytesRef[numHits];
+        this.field = field;
+        missingSortCmp = sortMissingLast ? 1 : -1;
+      }
+
+      @Override
+      public int compare(int slot1, int slot2) {
+        final BytesRef val1 = values[slot1];
+        final BytesRef val2 = values[slot2];
+        return compareValues(val1, val2);
+      }
+
+      @Override
+      public int compareBottom(int doc) {
+        final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
+        return compareValues(bottom, comparableBytes);
+      }
+
+      @Override
+      public void copy(int slot, int doc) {
+        final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
+        if (comparableBytes == null) {
+          values[slot] = null;
+        } else {
+          if (tempBRs[slot] == null) {
+            tempBRs[slot] = new BytesRef();
+          }
+          values[slot] = tempBRs[slot];
+          values[slot].copyBytes(comparableBytes);
+        }
+      }
+
+      /** Retrieves the BinaryDocValues for the field in this segment */
+      protected BinaryDocValues getBinaryDocValues(AtomicReaderContext context, String field) throws IOException {
+        return FieldCache.DEFAULT.getTerms(context.reader(), field, true);
+      }
+
+      /** Retrieves the set of documents that have a value in this segment */
+      protected Bits getDocsWithField(AtomicReaderContext context, String field) throws IOException {
+        return FieldCache.DEFAULT.getDocsWithField(context.reader(), field);
+      }
+
+      /** Check whether the given value represents <tt>null</tt>. This can be
+       *  useful if the {@link BinaryDocValues} returned by {@link #getBinaryDocValues}
+       *  use a special value as a sentinel. The default implementation checks
+       *  {@link #getDocsWithField}.
+       *  <p>NOTE: The null value can only be an EMPTY {@link BytesRef}. */
+      protected boolean isNull(int doc, BytesRef term) {
+        return docsWithField != null && docsWithField.get(doc) == false;
+      }
+
+      @Override
+      public FieldComparator<BytesRef> setNextReader(AtomicReaderContext context) throws IOException {
+        docTerms = getBinaryDocValues(context, field);
+        docsWithField = getDocsWithField(context, field);
+        if (docsWithField instanceof Bits.MatchAllBits) {
+          docsWithField = null;
+        }
+        return this;
+      }
+
+      @Override
+      public void setBottom(final int bottom) {
+        this.bottom = values[bottom];
+      }
+
+      @Override
+      public void setTopValue(BytesRef value) {
+        // null is fine: it means the last doc of the prior
+        // search was missing this value
+        topValue = value;
+      }
+
+      @Override
+      public BytesRef value(int slot) {
+        return values[slot];
+      }
+
+      @Override
+      public int compareValues(BytesRef val1, BytesRef val2) {
+        // null means missing: it sorts last when missingSortCmp is 1 and first when it is -1
+        if (val1 == null) {
+          if (val2 == null) {
+            return 0;
+          }
+          return missingSortCmp;
+        } else if (val2 == null) {
+          return -missingSortCmp;
+        }
+        return val1.compareTo(val2);
+      }
+
+      @Override
+      public int compareTop(int doc) {
+        final BytesRef comparableBytes = getComparableBytes(doc, docTerms.get(doc));
+        return compareValues(topValue, comparableBytes);
+      }
+
+      /**
+       * Given a document and a term, return the term itself if it exists or
+       * <tt>null</tt> otherwise.
+       */
+      private BytesRef getComparableBytes(int doc, BytesRef term) {
+        if (term.length == 0 && isNull(doc, term)) {
+          return null;
         }
-        return new BytesRefValComparator(indexFieldData, numHits, sortMode, missingBytes);
+        return term;
+      }
     }
 
 }