Skip to content

Commit

Permalink
Change numeric data types to use SORTED_NUMERIC docvalues type
Browse files Browse the repository at this point in the history
instead of a custom encoding in BINARY.

In low-level benchmarks this is 2x to 5x faster; it's also optimized
for the common case where fields contain at most one value per
document.

Additionally SORTED_NUMERIC doesn't lose values if they appear more
than once, so mathematical computations such as averages are correct.

Closes #6967
  • Loading branch information
rmuir committed Jul 23, 2014
1 parent aca4a25 commit 77ddf9c
Show file tree
Hide file tree
Showing 15 changed files with 381 additions and 58 deletions.
Expand Up @@ -22,6 +22,7 @@
import com.google.common.collect.ImmutableSet;
import org.apache.lucene.index.IndexReader;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
Expand Down Expand Up @@ -107,7 +108,12 @@ public IndexFieldData<?> build(Index index, Settings indexSettings, FieldMapper<
assert !numericType.isFloatingPoint();
return new NumericDVIndexFieldData(index, fieldNames, mapper.fieldDataType());
} else if (numericType != null) {
return new BinaryDVNumericIndexFieldData(index, fieldNames, numericType, mapper.fieldDataType());
if (Version.indexCreated(indexSettings).onOrAfter(Version.V_1_4_0)) {
return new SortedNumericDVIndexFieldData(index, fieldNames, numericType, mapper.fieldDataType());
} else {
// prior to ES 1.4: multi-valued numerics were boxed inside a byte[] as BINARY
return new BinaryDVNumericIndexFieldData(index, fieldNames, numericType, mapper.fieldDataType());
}
} else {
return new SortedSetDVOrdinalsIndexFieldData(index, cache, indexSettings, fieldNames, breakerService, mapper.fieldDataType());
}
Expand Down
@@ -0,0 +1,292 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.fielddata.plain;

import com.google.common.base.Preconditions;
import org.apache.lucene.index.*;
import org.apache.lucene.util.NumericUtils;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.*;
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
import org.elasticsearch.index.fielddata.fieldcomparator.DoubleValuesComparatorSource;
import org.elasticsearch.index.fielddata.fieldcomparator.FloatValuesComparatorSource;
import org.elasticsearch.index.fielddata.fieldcomparator.LongValuesComparatorSource;
import org.elasticsearch.index.mapper.FieldMapper.Names;
import org.elasticsearch.search.MultiValueMode;

import java.io.IOException;

/**
* FieldData backed by {@link AtomicReader#getSortedNumericDocValues(String)}
* @see FieldInfo.DocValuesType#SORTED_NUMERIC
*/
public class SortedNumericDVIndexFieldData extends DocValuesIndexFieldData implements IndexNumericFieldData {
private final NumericType numericType;

public SortedNumericDVIndexFieldData(Index index, Names fieldNames, NumericType numericType, FieldDataType fieldDataType) {
super(index, fieldNames, fieldDataType);
Preconditions.checkArgument(numericType != null, "numericType must be non-null");
this.numericType = numericType;
}

@Override
public org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource comparatorSource(Object missingValue, MultiValueMode sortMode, Nested nested) {
switch (numericType) {
case FLOAT:
return new FloatValuesComparatorSource(this, missingValue, sortMode, nested);
case DOUBLE:
return new DoubleValuesComparatorSource(this, missingValue, sortMode, nested);
default:
assert !numericType.isFloatingPoint();
return new LongValuesComparatorSource(this, missingValue, sortMode, nested);
}
}

@Override
public NumericType getNumericType() {
return numericType;
}

@Override
public AtomicNumericFieldData loadDirect(AtomicReaderContext context) throws Exception {
return load(context);
}

@Override
public AtomicNumericFieldData load(AtomicReaderContext context) {
final AtomicReader reader = context.reader();
final String field = fieldNames.indexName();

switch (numericType) {
case FLOAT:
return new SortedNumericFloatFieldData(reader, field);
case DOUBLE:
return new SortedNumericDoubleFieldData(reader, field);
default:
return new SortedNumericLongFieldData(reader, field);
}
}

/**
* FieldData implementation for integral types.
* <p>
* Order of values within a document is consistent with
* {@link Long#compareTo(Long)}.
* <p>
* Although the API is multi-valued, most codecs in Lucene specialize
* for the case where documents have at most one value. In this case
* {@link DocValues#unwrapSingleton(SortedNumericDocValues)} will return
* the underlying single-valued NumericDocValues representation, and
* {@link DocValues#unwrapSingletonBits(SortedNumericDocValues)} will return
* a Bits matching documents that have a real value (as opposed to missing).
*/
static final class SortedNumericLongFieldData extends AtomicLongFieldData {
final AtomicReader reader;
final String field;

SortedNumericLongFieldData(AtomicReader reader, String field) {
super(-1L);
this.reader = reader;
this.field = field;
}

@Override
public SortedNumericDocValues getLongValues() {
try {
return DocValues.getSortedNumeric(reader, field);
} catch (IOException e) {
throw new ElasticsearchIllegalStateException("Cannot load doc values", e);
}
}
}

/**
* FieldData implementation for 32-bit float values.
* <p>
* Order of values within a document is consistent with
* {@link Float#compareTo(Float)}, hence the following reversible
* transformation is applied at both index and search:
* {code}
* bits ^ (bits >> 31) & 0x7fffffff
* {code}
* <p>
* Although the API is multi-valued, most codecs in Lucene specialize
* for the case where documents have at most one value. In this case
* {@link FieldData#unwrapSingleton(SortedNumericDoubleValues)} will return
* the underlying single-valued NumericDoubleValues representation, and
* {@link FieldData#unwrapSingletonBits(SortedNumericDoubleValues)} will return
* a Bits matching documents that have a real value (as opposed to missing).
*/
static final class SortedNumericFloatFieldData extends AtomicDoubleFieldData {
final AtomicReader reader;
final String field;

SortedNumericFloatFieldData(AtomicReader reader, String field) {
super(-1L);
this.reader = reader;
this.field = field;
}

@Override
public SortedNumericDoubleValues getDoubleValues() {
try {
SortedNumericDocValues raw = DocValues.getSortedNumeric(reader, field);

NumericDocValues single = DocValues.unwrapSingleton(raw);
if (single != null) {
return FieldData.singleton(new SingleFloatValues(single), DocValues.unwrapSingletonBits(raw));
} else {
return new MultiFloatValues(raw);
}
} catch (IOException e) {
throw new ElasticsearchIllegalStateException("Cannot load doc values", e);
}
}
}

/**
* Wraps a NumericDocValues and exposes a single 32-bit float per document.
*/
static final class SingleFloatValues extends NumericDoubleValues {
final NumericDocValues in;

SingleFloatValues(NumericDocValues in) {
this.in = in;
}

@Override
public double get(int docID) {
return NumericUtils.sortableIntToFloat((int) in.get(docID));
}
}

/**
* Wraps a SortedNumericDocValues and exposes multiple 32-bit floats per document.
*/
static final class MultiFloatValues extends SortedNumericDoubleValues {
final SortedNumericDocValues in;

MultiFloatValues(SortedNumericDocValues in) {
this.in = in;
}

@Override
public void setDocument(int doc) {
in.setDocument(doc);
}

@Override
public double valueAt(int index) {
return NumericUtils.sortableIntToFloat((int) in.valueAt(index));
}

@Override
public int count() {
return in.count();
}
}

/**
* FieldData implementation for 64-bit double values.
* <p>
* Order of values within a document is consistent with
* {@link Double#compareTo(Double)}, hence the following reversible
* transformation is applied at both index and search:
* {code}
* bits ^ (bits >> 63) & 0x7fffffffffffffffL
* {code}
* <p>
* Although the API is multi-valued, most codecs in Lucene specialize
* for the case where documents have at most one value. In this case
* {@link FieldData#unwrapSingleton(SortedNumericDoubleValues)} will return
* the underlying single-valued NumericDoubleValues representation, and
* {@link FieldData#unwrapSingletonBits(SortedNumericDoubleValues)} will return
* a Bits matching documents that have a real value (as opposed to missing).
*/
static final class SortedNumericDoubleFieldData extends AtomicDoubleFieldData {
final AtomicReader reader;
final String field;

SortedNumericDoubleFieldData(AtomicReader reader, String field) {
super(-1L);
this.reader = reader;
this.field = field;
}

@Override
public SortedNumericDoubleValues getDoubleValues() {
try {
SortedNumericDocValues raw = DocValues.getSortedNumeric(reader, field);

NumericDocValues single = DocValues.unwrapSingleton(raw);
if (single != null) {
return FieldData.singleton(new SingleDoubleValues(single), DocValues.unwrapSingletonBits(raw));
} else {
return new MultiDoubleValues(raw);
}
} catch (IOException e) {
throw new ElasticsearchIllegalStateException("Cannot load doc values", e);
}
}
}

/**
* Wraps a NumericDocValues and exposes a single 64-bit double per document.
*/
static final class SingleDoubleValues extends NumericDoubleValues {
final NumericDocValues in;

SingleDoubleValues(NumericDocValues in) {
this.in = in;
}

@Override
public double get(int docID) {
return NumericUtils.sortableLongToDouble(in.get(docID));
}
}

/**
* Wraps a SortedNumericDocValues and exposes multiple 64-bit doubles per document.
*/
static final class MultiDoubleValues extends SortedNumericDoubleValues {
final SortedNumericDocValues in;

MultiDoubleValues(SortedNumericDocValues in) {
this.in = in;
}

@Override
public void setDocument(int doc) {
in.setDocument(doc);
}

@Override
public double valueAt(int index) {
return NumericUtils.sortableLongToDouble(in.valueAt(index));
}

@Override
public int count() {
return in.count();
}
}
}
Expand Up @@ -320,7 +320,7 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
fields.add(field);
}
if (hasDocValues()) {
addDocValue(context, value);
addDocValue(context, fields, value);
}
}

Expand Down
Expand Up @@ -41,7 +41,6 @@
import org.elasticsearch.common.util.LocaleUtils;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.NumericDateAnalyzer;
import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
Expand Down Expand Up @@ -514,7 +513,7 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
fields.add(field);
}
if (hasDocValues()) {
addDocValue(context, value);
addDocValue(context, fields, value);
}
}
}
Expand Down
Expand Up @@ -316,12 +316,16 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
fields.add(field);
}
if (hasDocValues()) {
CustomDoubleNumericDocValuesField field = (CustomDoubleNumericDocValuesField) context.doc().getByKey(names().indexName());
if (field != null) {
field.add(value);
if (useSortedNumericDocValues) {
addDocValue(context, fields, NumericUtils.doubleToSortableLong(value));
} else {
field = new CustomDoubleNumericDocValuesField(names().indexName(), value);
context.doc().addWithKey(names().indexName(), field);
CustomDoubleNumericDocValuesField field = (CustomDoubleNumericDocValuesField) context.doc().getByKey(names().indexName());
if (field != null) {
field.add(value);
} else {
field = new CustomDoubleNumericDocValuesField(names().indexName(), value);
context.doc().addWithKey(names().indexName(), field);
}
}
}
}
Expand Down
Expand Up @@ -321,12 +321,16 @@ protected void innerParseCreateField(ParseContext context, List<Field> fields) t
fields.add(field);
}
if (hasDocValues()) {
CustomFloatNumericDocValuesField field = (CustomFloatNumericDocValuesField) context.doc().getByKey(names().indexName());
if (field != null) {
field.add(value);
if (useSortedNumericDocValues) {
addDocValue(context, fields, NumericUtils.floatToSortableInt(value));
} else {
field = new CustomFloatNumericDocValuesField(names().indexName(), value);
context.doc().addWithKey(names().indexName(), field);
CustomFloatNumericDocValuesField field = (CustomFloatNumericDocValuesField) context.doc().getByKey(names().indexName());
if (field != null) {
field.add(value);
} else {
field = new CustomFloatNumericDocValuesField(names().indexName(), value);
context.doc().addWithKey(names().indexName(), field);
}
}
}
}
Expand Down
Expand Up @@ -319,7 +319,7 @@ protected void addIntegerFields(ParseContext context, List<Field> fields, int va
fields.add(field);
}
if (hasDocValues()) {
addDocValue(context, value);
addDocValue(context, fields, value);
}
}

Expand Down

0 comments on commit 77ddf9c

Please sign in to comment.