Change numeric data types to use SORTED_NUMERIC docvalues type

instead of a custom encoding in BINARY. In low-level benchmarks this is 2x to 5x faster; it's also optimized for the common case where fields contain at most one value per document. Additionally, SORTED_NUMERIC doesn't lose values if they appear more than once, so mathematical computations such as averages are correct. Closes #6967
elastic · Jul 23, 2014 · 77ddf9c · 77ddf9c
1 parent aca4a25
commit 77ddf9c
Show file tree

Hide file tree

Showing 15 changed files with 381 additions and 58 deletions.
diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/DocValuesIndexFieldData.java
@@ -22,6 +22,7 @@
 import com.google.common.collect.ImmutableSet;
 import org.apache.lucene.index.IndexReader;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.Version;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.settings.Settings;
@@ -107,7 +108,12 @@ public IndexFieldData<?> build(Index index, Settings indexSettings, FieldMapper<
                 assert !numericType.isFloatingPoint();
                 return new NumericDVIndexFieldData(index, fieldNames, mapper.fieldDataType());
             } else if (numericType != null) {
-                return new BinaryDVNumericIndexFieldData(index, fieldNames, numericType, mapper.fieldDataType());
+                if (Version.indexCreated(indexSettings).onOrAfter(Version.V_1_4_0)) {
+                    return new SortedNumericDVIndexFieldData(index, fieldNames, numericType, mapper.fieldDataType());
+                } else {
+                    // prior to ES 1.4: multi-valued numerics were boxed inside a byte[] as BINARY
+                    return new BinaryDVNumericIndexFieldData(index, fieldNames, numericType, mapper.fieldDataType());
+                }
             } else {
                 return new SortedSetDVOrdinalsIndexFieldData(index, cache, indexSettings, fieldNames, breakerService, mapper.fieldDataType());
             }

diff --git a/src/main/java/org/elasticsearch/index/fielddata/plain/SortedNumericDVIndexFieldData.java b/src/main/java/org/elasticsearch/index/fielddata/plain/SortedNumericDVIndexFieldData.java
@@ -0,0 +1,292 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.fielddata.plain;
+
+import com.google.common.base.Preconditions;
+import org.apache.lucene.index.*;
+import org.apache.lucene.util.NumericUtils;
+import org.elasticsearch.ElasticsearchIllegalStateException;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.fielddata.*;
+import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
+import org.elasticsearch.index.fielddata.fieldcomparator.DoubleValuesComparatorSource;
+import org.elasticsearch.index.fielddata.fieldcomparator.FloatValuesComparatorSource;
+import org.elasticsearch.index.fielddata.fieldcomparator.LongValuesComparatorSource;
+import org.elasticsearch.index.mapper.FieldMapper.Names;
+import org.elasticsearch.search.MultiValueMode;
+
+import java.io.IOException;
+
+/**
+ * FieldData backed by {@link AtomicReader#getSortedNumericDocValues(String)}
+ * @see FieldInfo.DocValuesType#SORTED_NUMERIC
+ */
+public class SortedNumericDVIndexFieldData extends DocValuesIndexFieldData implements IndexNumericFieldData {
+    private final NumericType numericType;
+
+    /**
+     * Creates field data for the given numeric type.
+     *
+     * @param index         passed through to the parent constructor
+     * @param fieldNames    passed through to the parent constructor
+     * @param numericType   numeric flavour of the field; must not be {@code null}
+     * @param fieldDataType passed through to the parent constructor
+     * @throws IllegalArgumentException if {@code numericType} is {@code null}
+     */
+    public SortedNumericDVIndexFieldData(Index index, Names fieldNames, NumericType numericType, FieldDataType fieldDataType) {
+        super(index, fieldNames, fieldDataType);
+        if (numericType == null) {
+            throw new IllegalArgumentException("numericType must be non-null");
+        }
+        this.numericType = numericType;
+    }
+
+    /**
+     * Picks the comparator source that matches this field's numeric type:
+     * a float source for FLOAT, a double source for DOUBLE, and a long
+     * source for every other (integral) type.
+     */
+    @Override
+    public org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource comparatorSource(Object missingValue, MultiValueMode sortMode, Nested nested) {
+        if (numericType == NumericType.FLOAT) {
+            return new FloatValuesComparatorSource(this, missingValue, sortMode, nested);
+        }
+        if (numericType == NumericType.DOUBLE) {
+            return new DoubleValuesComparatorSource(this, missingValue, sortMode, nested);
+        }
+        // anything that reaches this point is expected to be an integral type
+        assert !numericType.isFloatingPoint();
+        return new LongValuesComparatorSource(this, missingValue, sortMode, nested);
+    }
+
+    /**
+     * Returns the numeric type this field data was constructed with.
+     */
+    @Override
+    public NumericType getNumericType() {
+        return this.numericType;
+    }
+
+    /**
+     * Delegates to {@link #load(AtomicReaderContext)}; this implementation
+     * makes no distinction between a direct and a regular load.
+     */
+    @Override
+    public AtomicNumericFieldData loadDirect(AtomicReaderContext context) throws Exception {
+        return this.load(context);
+    }
+
+    /**
+     * Builds the per-segment field data view for this field, choosing the
+     * float, double or long implementation according to the numeric type.
+     */
+    @Override
+    public AtomicNumericFieldData load(AtomicReaderContext context) {
+        final AtomicReader segmentReader = context.reader();
+        final String indexName = fieldNames.indexName();
+
+        if (numericType == NumericType.FLOAT) {
+            return new SortedNumericFloatFieldData(segmentReader, indexName);
+        }
+        if (numericType == NumericType.DOUBLE) {
+            return new SortedNumericDoubleFieldData(segmentReader, indexName);
+        }
+        // all remaining types are served by the long implementation
+        return new SortedNumericLongFieldData(segmentReader, indexName);
+    }
+
+    /**
+     * Atomic field data for integral types.
+     * <p>
+     * Values within a document are ordered consistently with
+     * {@link Long#compareTo(Long)}.
+     * <p>
+     * The API is multi-valued, but most Lucene codecs specialize for the
+     * case where every document has at most one value. When that applies,
+     * {@link DocValues#unwrapSingleton(SortedNumericDocValues)} returns the
+     * underlying single-valued NumericDocValues, and
+     * {@link DocValues#unwrapSingletonBits(SortedNumericDocValues)} returns
+     * a Bits matching the documents that have a real (non-missing) value.
+     */
+    static final class SortedNumericLongFieldData extends AtomicLongFieldData {
+        final AtomicReader reader;
+        final String field;
+
+        SortedNumericLongFieldData(AtomicReader reader, String field) {
+            super(-1L);
+            this.field = field;
+            this.reader = reader;
+        }
+
+        /**
+         * Fetches the sorted numeric doc values of the field from the reader.
+         *
+         * @throws ElasticsearchIllegalStateException if reading the doc values fails with an I/O error
+         */
+        @Override
+        public SortedNumericDocValues getLongValues() {
+            final SortedNumericDocValues values;
+            try {
+                values = DocValues.getSortedNumeric(reader, field);
+            } catch (IOException ioe) {
+                throw new ElasticsearchIllegalStateException("Cannot load doc values", ioe);
+            }
+            return values;
+        }
+    }
+
+    /**
+     * Atomic field data for 32-bit float values.
+     * <p>
+     * Values within a document are ordered consistently with
+     * {@link Float#compareTo(Float)}; to obtain that ordering the following
+     * reversible transformation is applied at both index and search time:
+     * <pre>{@code
+     *   bits ^ (bits >> 31) & 0x7fffffff
+     * }</pre>
+     * <p>
+     * The API is multi-valued, but most Lucene codecs specialize for the
+     * case where every document has at most one value. When that applies,
+     * {@link FieldData#unwrapSingleton(SortedNumericDoubleValues)} returns the
+     * underlying single-valued NumericDoubleValues, and
+     * {@link FieldData#unwrapSingletonBits(SortedNumericDoubleValues)} returns
+     * a Bits matching the documents that have a real (non-missing) value.
+     */
+    static final class SortedNumericFloatFieldData extends AtomicDoubleFieldData {
+        final AtomicReader reader;
+        final String field;
+
+        SortedNumericFloatFieldData(AtomicReader reader, String field) {
+            super(-1L);
+            this.field = field;
+            this.reader = reader;
+        }
+
+        /**
+         * Exposes the field's values as doubles, using the single-valued
+         * wrapper whenever Lucene can unwrap the doc values to a singleton.
+         *
+         * @throws ElasticsearchIllegalStateException if reading the doc values fails with an I/O error
+         */
+        @Override
+        public SortedNumericDoubleValues getDoubleValues() {
+            try {
+                final SortedNumericDocValues sortable = DocValues.getSortedNumeric(reader, field);
+                final NumericDocValues singleValued = DocValues.unwrapSingleton(sortable);
+                if (singleValued == null) {
+                    // no single-valued view available: decode value by value
+                    return new MultiFloatValues(sortable);
+                }
+                return FieldData.singleton(new SingleFloatValues(singleValued), DocValues.unwrapSingletonBits(sortable));
+            } catch (IOException ioe) {
+                throw new ElasticsearchIllegalStateException("Cannot load doc values", ioe);
+            }
+        }
+    }
+
+    /**
+     * Single-valued view: decodes the one sortable-int encoded 32-bit float
+     * that the wrapped NumericDocValues holds for each document.
+     */
+    static final class SingleFloatValues extends NumericDoubleValues {
+        final NumericDocValues in;
+
+        SingleFloatValues(NumericDocValues in) {
+            this.in = in;
+        }
+
+        @Override
+        public double get(int docID) {
+            // the stored long carries the sortable int representation of the float
+            final int sortableBits = (int) in.get(docID);
+            return NumericUtils.sortableIntToFloat(sortableBits);
+        }
+    }
+
+    /**
+     * Multi-valued view: decodes each sortable-int encoded 32-bit float of
+     * the current document from the wrapped SortedNumericDocValues.
+     */
+    static final class MultiFloatValues extends SortedNumericDoubleValues {
+        final SortedNumericDocValues in;
+
+        MultiFloatValues(SortedNumericDocValues in) {
+            this.in = in;
+        }
+
+        @Override
+        public void setDocument(int doc) {
+            // positioning is delegated to the wrapped doc values
+            in.setDocument(doc);
+        }
+
+        @Override
+        public double valueAt(int index) {
+            final int sortableBits = (int) in.valueAt(index);
+            return NumericUtils.sortableIntToFloat(sortableBits);
+        }
+
+        @Override
+        public int count() {
+            return in.count();
+        }
+    }
+
+    /**
+     * Atomic field data for 64-bit double values.
+     * <p>
+     * Values within a document are ordered consistently with
+     * {@link Double#compareTo(Double)}; to obtain that ordering the following
+     * reversible transformation is applied at both index and search time:
+     * <pre>{@code
+     *   bits ^ (bits >> 63) & 0x7fffffffffffffffL
+     * }</pre>
+     * <p>
+     * The API is multi-valued, but most Lucene codecs specialize for the
+     * case where every document has at most one value. When that applies,
+     * {@link FieldData#unwrapSingleton(SortedNumericDoubleValues)} returns the
+     * underlying single-valued NumericDoubleValues, and
+     * {@link FieldData#unwrapSingletonBits(SortedNumericDoubleValues)} returns
+     * a Bits matching the documents that have a real (non-missing) value.
+     */
+    static final class SortedNumericDoubleFieldData extends AtomicDoubleFieldData {
+        final AtomicReader reader;
+        final String field;
+
+        SortedNumericDoubleFieldData(AtomicReader reader, String field) {
+            super(-1L);
+            this.field = field;
+            this.reader = reader;
+        }
+
+        /**
+         * Exposes the field's values as doubles, using the single-valued
+         * wrapper whenever Lucene can unwrap the doc values to a singleton.
+         *
+         * @throws ElasticsearchIllegalStateException if reading the doc values fails with an I/O error
+         */
+        @Override
+        public SortedNumericDoubleValues getDoubleValues() {
+            try {
+                final SortedNumericDocValues sortable = DocValues.getSortedNumeric(reader, field);
+                final NumericDocValues singleValued = DocValues.unwrapSingleton(sortable);
+                if (singleValued == null) {
+                    // no single-valued view available: decode value by value
+                    return new MultiDoubleValues(sortable);
+                }
+                return FieldData.singleton(new SingleDoubleValues(singleValued), DocValues.unwrapSingletonBits(sortable));
+            } catch (IOException ioe) {
+                throw new ElasticsearchIllegalStateException("Cannot load doc values", ioe);
+            }
+        }
+    }
+
+    /**
+     * Single-valued view: decodes the one sortable-long encoded 64-bit double
+     * that the wrapped NumericDocValues holds for each document.
+     */
+    static final class SingleDoubleValues extends NumericDoubleValues {
+        final NumericDocValues in;
+
+        SingleDoubleValues(NumericDocValues in) {
+            this.in = in;
+        }
+
+        @Override
+        public double get(int docID) {
+            final long sortableBits = in.get(docID);
+            return NumericUtils.sortableLongToDouble(sortableBits);
+        }
+    }
+
+    /**
+     * Multi-valued view: decodes each sortable-long encoded 64-bit double of
+     * the current document from the wrapped SortedNumericDocValues.
+     */
+    static final class MultiDoubleValues extends SortedNumericDoubleValues {
+        final SortedNumericDocValues in;
+
+        MultiDoubleValues(SortedNumericDocValues in) {
+            this.in = in;
+        }
+
+        @Override
+        public void setDocument(int doc) {
+            // positioning is delegated to the wrapped doc values
+            in.setDocument(doc);
+        }
+
+        @Override
+        public double valueAt(int index) {
+            final long sortableBits = in.valueAt(index);
+            return NumericUtils.sortableLongToDouble(sortableBits);
+        }
+
+        @Override
+        public int count() {
+            return in.count();
+        }
+    }
+}
diff --git a/src/main/java/org/elasticsearch/index/mapper/core/ByteFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/ByteFieldMapper.java
@@ -320,7 +320,7 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
             fields.add(field);
         }
         if (hasDocValues()) {
-            addDocValue(context, value);
+            addDocValue(context, fields, value);
         }
     }
 

diff --git a/src/main/java/org/elasticsearch/index/mapper/core/DateFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/DateFieldMapper.java
@@ -41,7 +41,6 @@
 import org.elasticsearch.common.util.LocaleUtils;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentParser;
-import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.NumericDateAnalyzer;
 import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
 import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
@@ -514,7 +513,7 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
                 fields.add(field);
             }
             if (hasDocValues()) {
-                addDocValue(context, value);
+                addDocValue(context, fields, value);
             }
         }
     }

diff --git a/src/main/java/org/elasticsearch/index/mapper/core/DoubleFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/DoubleFieldMapper.java
@@ -316,12 +316,16 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
             fields.add(field);
         }
         if (hasDocValues()) {
-            CustomDoubleNumericDocValuesField field = (CustomDoubleNumericDocValuesField) context.doc().getByKey(names().indexName());
-            if (field != null) {
-                field.add(value);
+            if (useSortedNumericDocValues) {
+                addDocValue(context, fields, NumericUtils.doubleToSortableLong(value));
             } else {
-                field = new CustomDoubleNumericDocValuesField(names().indexName(), value);
-                context.doc().addWithKey(names().indexName(), field);
+                CustomDoubleNumericDocValuesField field = (CustomDoubleNumericDocValuesField) context.doc().getByKey(names().indexName());
+                if (field != null) {
+                    field.add(value);
+                } else {
+                    field = new CustomDoubleNumericDocValuesField(names().indexName(), value);
+                    context.doc().addWithKey(names().indexName(), field);
+                }
             }
         }
     }

diff --git a/src/main/java/org/elasticsearch/index/mapper/core/FloatFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/FloatFieldMapper.java
@@ -321,12 +321,16 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
             fields.add(field);
         }
         if (hasDocValues()) {
-            CustomFloatNumericDocValuesField field = (CustomFloatNumericDocValuesField) context.doc().getByKey(names().indexName());
-            if (field != null) {
-                field.add(value);
+            if (useSortedNumericDocValues) {
+                addDocValue(context, fields, NumericUtils.floatToSortableInt(value));
             } else {
-                field = new CustomFloatNumericDocValuesField(names().indexName(), value);
-                context.doc().addWithKey(names().indexName(), field);
+                CustomFloatNumericDocValuesField field = (CustomFloatNumericDocValuesField) context.doc().getByKey(names().indexName());
+                if (field != null) {
+                    field.add(value);
+                } else {
+                    field = new CustomFloatNumericDocValuesField(names().indexName(), value);
+                    context.doc().addWithKey(names().indexName(), field);
+                }
             }
         }
     }

diff --git a/src/main/java/org/elasticsearch/index/mapper/core/IntegerFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/IntegerFieldMapper.java
@@ -319,7 +319,7 @@ protected void addIntegerFields(ParseContext context, List<Field> fields, int va
             fields.add(field);
         }
         if (hasDocValues()) {
-            addDocValue(context, value);
+            addDocValue(context, fields, value);
         }
     }