Skip to content

Commit

Permalink
Add doc values support to boolean fields.
Browse files Browse the repository at this point in the history
This pull request makes boolean fields be handled like dates and ipv4 addresses: values
are stored as numerics under the hood and aggregations add some special
formatting logic in order to return true/false in addition to 1/0.

For example, here is an output of a terms aggregation on a boolean field:
```
   "aggregations": {
      "top_f": {
         "doc_count_error_upper_bound": 0,
         "buckets": [
            {
               "key": 0,
               "key_as_string": "false",
               "doc_count": 2
            },
            {
               "key": 1,
               "key_as_string": "true",
               "doc_count": 1
            }
         ]
      }
   }
```

Sorted numeric doc values are used under the hood.

Close #4678
Close #7851
  • Loading branch information
jpountz committed Apr 2, 2015
1 parent e390ef5 commit 08f93cf
Show file tree
Hide file tree
Showing 99 changed files with 496 additions and 19 deletions.
3 changes: 2 additions & 1 deletion dev-tools/create-bwc-index.py
Expand Up @@ -61,7 +61,8 @@ def index_documents(es, index_name, type, num_docs):
for id in range(0, num_docs):
es.index(index=index_name, doc_type=type, id=id, body={'string': str(random.randint(0, 100)),
'long_sort': random.randint(0, 100),
'double_sort' : float(random.randint(0, 100))})
'double_sort' : float(random.randint(0, 100)),
'bool' : random.choice([True, False])})
if rarely():
es.indices.refresh(index=index_name)
if rarely():
Expand Down
32 changes: 32 additions & 0 deletions docs/reference/migration/migrate_2_0.asciidoc
Expand Up @@ -273,6 +273,38 @@ to provide special features. They now have limited configuration options.
* `_field_names` configuration is limited to disabling the field.
* `_size` configuration is limited to enabling the field.

=== Boolean fields

Boolean fields used to have a string fielddata with `F` meaning `false` and `T`
meaning `true`. They have been refactored to use numeric fielddata, with `0`
for `false` and `1` for `true`. As a consequence, the format of the responses of
the following APIs changed when applied to boolean fields: `0`/`1` is returned
instead of `F`/`T`:

- <<search-request-fielddata-fields,fielddata fields>>
- <<search-request-sort,sort values>>
- <<search-aggregations-bucket-terms-aggregation,terms aggregations>>

In addition, terms aggregations use a custom formatter for boolean fields (as
they do for dates and IP addresses, which are also backed by numbers) in order
to return the user-friendly representation of boolean values: `false`/`true`:

[source,json]
---------------
"buckets": [
{
"key": 0,
"key_as_string": "false",
"doc_count": 42
},
{
"key": 1,
"key_as_string": "true",
"doc_count": 12
}
]
---------------

=== Codecs

It is no longer possible to specify per-field postings and doc values formats
Expand Down
Expand Up @@ -646,6 +646,25 @@ public XContentBuilder field(XContentBuilderString name, Iterable value) throws
return this;
}

/**
 * Emits a named array made of the given boolean values.
 * <p>
 * The array is opened with {@code startArray(name)}, each element is passed
 * to {@code value(boolean)} in order, and the array is closed with
 * {@code endArray()}.
 *
 * @param name  the field name handed to {@code startArray}
 * @param value the booleans to emit, in iteration order
 * @return this builder, to allow call chaining
 * @throws IOException declared for failures of the underlying write calls
 */
public XContentBuilder field(String name, boolean... value) throws IOException {
    startArray(name);
    final int count = value.length;
    for (int i = 0; i < count; i++) {
        value(value[i]);
    }
    endArray();
    return this;
}


/**
 * Emits a named array made of the given boolean values, with the field name
 * supplied as an {@code XContentBuilderString}.
 * <p>
 * The array is opened with {@code startArray(name)}, each element is passed
 * to {@code value(boolean)} in order, and the array is closed with
 * {@code endArray()}.
 *
 * @param name  the field name handed to {@code startArray}
 * @param value the booleans to emit, in iteration order
 * @return this builder, to allow call chaining
 * @throws IOException declared for failures of the underlying write calls
 */
public XContentBuilder field(XContentBuilderString name, boolean... value) throws IOException {
    startArray(name);
    final int count = value.length;
    for (int i = 0; i < count; i++) {
        value(value[i]);
    }
    endArray();
    return this;
}

public XContentBuilder field(String name, String... value) throws IOException {
startArray(name);
for (String o : value) {
Expand Down
Expand Up @@ -33,6 +33,7 @@
import org.elasticsearch.index.Index;
import org.elasticsearch.index.fielddata.plain.*;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.core.BooleanFieldMapper;
import org.elasticsearch.index.mapper.internal.IndexFieldMapper;
import org.elasticsearch.index.mapper.internal.ParentFieldMapper;
import org.elasticsearch.index.IndexService;
Expand Down Expand Up @@ -78,6 +79,7 @@ public class IndexFieldDataService extends AbstractIndexComponent {
.put(ParentFieldMapper.NAME, new ParentChildIndexFieldData.Builder())
.put(IndexFieldMapper.NAME, new IndexIndexFieldData.Builder())
.put("binary", new DisabledIndexFieldData.Builder())
.put(BooleanFieldMapper.CONTENT_TYPE, new PackedArrayIndexFieldData.Builder().setNumericType(IndexNumericFieldData.NumericType.BOOLEAN))
.immutableMap();

docValuesBuildersByType = MapBuilder.<String, IndexFieldData.Builder>newMapBuilder()
Expand All @@ -90,6 +92,7 @@ public class IndexFieldDataService extends AbstractIndexComponent {
.put("long", new DocValuesIndexFieldData.Builder().numericType(IndexNumericFieldData.NumericType.LONG))
.put("geo_point", new GeoPointBinaryDVIndexFieldData.Builder())
.put("binary", new BytesBinaryDVIndexFieldData.Builder())
.put(BooleanFieldMapper.CONTENT_TYPE, new DocValuesIndexFieldData.Builder().numericType(IndexNumericFieldData.NumericType.BOOLEAN))
.immutableMap();

buildersByTypeAndFormat = MapBuilder.<Tuple<String, String>, IndexFieldData.Builder>newMapBuilder()
Expand Down Expand Up @@ -130,6 +133,10 @@ public class IndexFieldDataService extends AbstractIndexComponent {
.put(Tuple.tuple("binary", DOC_VALUES_FORMAT), new BytesBinaryDVIndexFieldData.Builder())
.put(Tuple.tuple("binary", DISABLED_FORMAT), new DisabledIndexFieldData.Builder())

.put(Tuple.tuple(BooleanFieldMapper.CONTENT_TYPE, ARRAY_FORMAT), new PackedArrayIndexFieldData.Builder().setNumericType(IndexNumericFieldData.NumericType.BOOLEAN))
.put(Tuple.tuple(BooleanFieldMapper.CONTENT_TYPE, DOC_VALUES_FORMAT), new DocValuesIndexFieldData.Builder().numericType(IndexNumericFieldData.NumericType.BOOLEAN))
.put(Tuple.tuple(BooleanFieldMapper.CONTENT_TYPE, DISABLED_FORMAT), new DisabledIndexFieldData.Builder())

.immutableMap();
}

Expand Down
Expand Up @@ -24,13 +24,38 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder;
import org.elasticsearch.index.mapper.core.BooleanFieldMapper;

/**
*/
public interface IndexNumericFieldData extends IndexFieldData<AtomicNumericFieldData> {

public static enum NumericType {
BOOLEAN(1, false, SortField.Type.INT, 0, 1) {
    /**
     * Decodes an indexed boolean term into its numeric form.
     *
     * @param indexForm the indexed term to decode
     * @return {@code 0} when the term equals {@link BooleanFieldMapper.Values#FALSE},
     *         {@code 1} when it equals {@link BooleanFieldMapper.Values#TRUE}
     * @throws ElasticsearchIllegalArgumentException if the term matches neither constant
     */
    @Override
    public long toLong(BytesRef indexForm) {
        if (indexForm.equals(BooleanFieldMapper.Values.FALSE)) {
            return 0;
        }
        if (indexForm.equals(BooleanFieldMapper.Values.TRUE)) {
            return 1;
        }
        // Anything other than the two known boolean terms is rejected.
        throw new ElasticsearchIllegalArgumentException("Cannot convert " + indexForm + " to a boolean");
    }

    /**
     * Encodes a number as a boolean term: any non-zero {@code intValue()}
     * maps to the TRUE term, zero maps to the FALSE term. The term is
     * appended to {@code bytes}.
     */
    @Override
    public void toIndexForm(Number number, BytesRefBuilder bytes) {
        if (number.intValue() != 0) {
            bytes.append(BooleanFieldMapper.Values.TRUE);
        } else {
            bytes.append(BooleanFieldMapper.Values.FALSE);
        }
    }

    /**
     * Same decoding as {@link #toLong(BytesRef)}, returned as a boxed {@code Long}.
     */
    @Override
    public Number toNumber(BytesRef indexForm) {
        return Long.valueOf(toLong(indexForm));
    }

},
BYTE(8, false, SortField.Type.INT, Byte.MIN_VALUE, Byte.MAX_VALUE) {
@Override
public long toLong(BytesRef indexForm) {
Expand Down Expand Up @@ -174,7 +199,9 @@ public double toDouble(BytesRef indexForm) {
public abstract Number toNumber(BytesRef indexForm);

public final TermsEnum wrapTermsEnum(TermsEnum termsEnum) {
if (requiredBits() > 32) {
if (requiredBits() == 1) { // boolean, no prefix-terms
return termsEnum;
} else if (requiredBits() > 32) {
return OrdinalsBuilder.wrapNumeric64Bit(termsEnum);
} else {
return OrdinalsBuilder.wrapNumeric32Bit(termsEnum);
Expand Down
Expand Up @@ -99,7 +99,7 @@ public PackedArrayIndexFieldData(Index index, @IndexSettings Settings indexSetti
CircuitBreakerService breakerService) {
super(index, indexSettings, fieldNames, fieldDataType, cache);
Preconditions.checkNotNull(numericType);
Preconditions.checkArgument(EnumSet.of(NumericType.BYTE, NumericType.SHORT, NumericType.INT, NumericType.LONG).contains(numericType), getClass().getSimpleName() + " only supports integer types, not " + numericType);
Preconditions.checkArgument(EnumSet.of(NumericType.BOOLEAN, NumericType.BYTE, NumericType.SHORT, NumericType.INT, NumericType.LONG).contains(numericType), getClass().getSimpleName() + " only supports integer types, not " + numericType);
this.numericType = numericType;
this.breakerService = breakerService;
}
Expand Down Expand Up @@ -127,16 +127,13 @@ public AtomicNumericFieldData loadDirect(LeafReaderContext context) throws Excep

final float acceptableTransientOverheadRatio = fieldDataType.getSettings().getAsFloat("acceptable_transient_overhead_ratio", OrdinalsBuilder.DEFAULT_ACCEPTABLE_OVERHEAD_RATIO);
TermsEnum termsEnum = estimator.beforeLoad(terms);
assert !getNumericType().isFloatingPoint();
boolean success = false;
try (OrdinalsBuilder builder = new OrdinalsBuilder(-1, reader.maxDoc(), acceptableTransientOverheadRatio)) {
BytesRefIterator iter = builder.buildFromTerms(termsEnum);
BytesRef term;
assert !getNumericType().isFloatingPoint();
final boolean indexedAsLong = getNumericType().requiredBits() > 32;
while ((term = iter.next()) != null) {
final long value = indexedAsLong
? NumericUtils.prefixCodedToLong(term)
: NumericUtils.prefixCodedToInt(term);
final long value = numericType.toLong(term);
valuesBuilder.add(value);
}
final PackedLongValues values = valuesBuilder.build();
Expand Down
Expand Up @@ -21,6 +21,7 @@

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.queries.TermFilter;
import org.apache.lucene.search.Filter;
Expand Down Expand Up @@ -51,9 +52,8 @@
import static org.elasticsearch.index.mapper.core.TypeParsers.parseField;

/**
*
* A field mapper for boolean fields.
*/
// TODO this can be made better, maybe storing a byte for it?
public class BooleanFieldMapper extends AbstractFieldMapper<Boolean> {

public static final String CONTENT_TYPE = "boolean";
Expand Down Expand Up @@ -100,7 +100,7 @@ public Builder tokenized(boolean tokenized) {

@Override
public BooleanFieldMapper build(BuilderContext context) {
return new BooleanFieldMapper(buildNames(context), boost, fieldType, nullValue,
return new BooleanFieldMapper(buildNames(context), boost, fieldType, docValues, nullValue,
similarity, normsLoading, fieldDataSettings, context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
}
}
Expand Down Expand Up @@ -128,10 +128,10 @@ public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext

private Boolean nullValue;

protected BooleanFieldMapper(Names names, float boost, FieldType fieldType, Boolean nullValue,
protected BooleanFieldMapper(Names names, float boost, FieldType fieldType, Boolean docValues, Boolean nullValue,
SimilarityProvider similarity, Loading normsLoading,
@Nullable Settings fieldDataSettings, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
super(names, boost, fieldType, false, Lucene.KEYWORD_ANALYZER, Lucene.KEYWORD_ANALYZER, similarity, normsLoading, fieldDataSettings, indexSettings, multiFields, copyTo);
super(names, boost, fieldType, docValues, Lucene.KEYWORD_ANALYZER, Lucene.KEYWORD_ANALYZER, similarity, normsLoading, fieldDataSettings, indexSettings, multiFields, copyTo);
this.nullValue = nullValue;
}

Expand All @@ -143,7 +143,7 @@ public FieldType defaultFieldType() {
@Override
public FieldDataType defaultFieldDataType() {
// TODO have a special boolean type?
return new FieldDataType("string");
return new FieldDataType(CONTENT_TYPE);
}

@Override
Expand Down Expand Up @@ -210,7 +210,7 @@ public Filter nullValueFilter() {

@Override
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
if (fieldType().indexOptions() == IndexOptions.NONE && !fieldType().stored()) {
if (fieldType().indexOptions() == IndexOptions.NONE && !fieldType().stored() && !hasDocValues()) {
return;
}

Expand All @@ -230,6 +230,9 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
return;
}
fields.add(new Field(names.indexName(), value ? "T" : "F", fieldType));
if (hasDocValues()) {
fields.add(new SortedNumericDocValuesField(names.indexName(), value ? 1 : 0));
}
}

@Override
Expand Down
Expand Up @@ -25,6 +25,7 @@
import org.elasticsearch.index.fielddata.IndexGeoPointFieldData;
import org.elasticsearch.index.fielddata.IndexNumericFieldData;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.core.BooleanFieldMapper;
import org.elasticsearch.index.mapper.core.DateFieldMapper;
import org.elasticsearch.index.mapper.core.NumberFieldMapper;
import org.elasticsearch.index.mapper.ip.IpFieldMapper;
Expand Down Expand Up @@ -209,6 +210,9 @@ private static ValueFormat resolveFormat(@Nullable String format, FieldMapper ma
if (mapper instanceof IpFieldMapper) {
return ValueFormat.IPv4;
}
if (mapper instanceof BooleanFieldMapper) {
return ValueFormat.BOOLEAN;
}
if (mapper instanceof NumberFieldMapper) {
return format != null ? ValueFormat.Number.format(format) : ValueFormat.RAW;
}
Expand Down
Expand Up @@ -28,6 +28,7 @@ public class ValueFormat {

public static final ValueFormat RAW = new ValueFormat(ValueFormatter.RAW, ValueParser.RAW);
public static final ValueFormat IPv4 = new ValueFormat(ValueFormatter.IPv4, ValueParser.IPv4);
public static final ValueFormat BOOLEAN = new ValueFormat(ValueFormatter.BOOLEAN, ValueParser.BOOLEAN);

private final ValueFormatter formatter;
private final ValueParser parser;
Expand Down
Expand Up @@ -19,8 +19,6 @@
package org.elasticsearch.search.aggregations.support.format;

import org.elasticsearch.common.geo.GeoHashUtils;
import org.elasticsearch.common.geo.GeoPoint;
import org.elasticsearch.common.geo.GeoUtils;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
Expand All @@ -45,6 +43,7 @@ public interface ValueFormatter extends Streamable {
public final static ValueFormatter RAW = new Raw();
public final static ValueFormatter IPv4 = new IPv4Formatter();
public final static ValueFormatter GEOHASH = new GeoHash();
public final static ValueFormatter BOOLEAN = new BooleanFormatter();

/**
* Uniquely identifies this formatter (used for efficient serialization)
Expand Down Expand Up @@ -266,4 +265,31 @@ public void writeTo(StreamOutput out) throws IOException {
}
}

/**
 * Formatter that renders numeric values as the strings {@code "true"} and
 * {@code "false"}: zero becomes {@code "false"}, any other value becomes
 * {@code "true"}. It has no instance state, so nothing is read from or
 * written to a stream.
 */
static class BooleanFormatter implements ValueFormatter {

    /** Identifier returned by {@link #id()}. */
    static final byte ID = 10;

    @Override
    public byte id() {
        return ID;
    }

    /**
     * @return {@code "false"} if {@code value} is zero, {@code "true"} otherwise
     */
    @Override
    public String format(long value) {
        final boolean nonZero = value != 0;
        return Boolean.toString(nonZero);
    }

    /**
     * @return {@code "false"} if {@code value} compares equal to zero,
     *         {@code "true"} otherwise
     */
    @Override
    public String format(double value) {
        final boolean nonZero = value != 0;
        return Boolean.toString(nonZero);
    }

    @Override
    public void readFrom(StreamInput in) throws IOException {
        // Stateless: nothing to read.
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        // Stateless: nothing to write.
    }
}
}
Expand Up @@ -38,6 +38,7 @@ public static ValueFormatter read(StreamInput in) throws IOException {
case ValueFormatter.DateTime.ID: formatter = new ValueFormatter.DateTime(); break;
case ValueFormatter.Number.Pattern.ID: formatter = new ValueFormatter.Number.Pattern(); break;
case ValueFormatter.GeoHash.ID: formatter = ValueFormatter.GEOHASH; break;
case ValueFormatter.BooleanFormatter.ID: formatter = ValueFormatter.BOOLEAN; break;
default: throw new ElasticsearchIllegalArgumentException("Unknown value formatter with id [" + id + "]");
}
formatter.readFrom(in);
Expand Down
Expand Up @@ -41,6 +41,7 @@ public interface ValueParser {

static final ValueParser IPv4 = new IPv4();
static final ValueParser RAW = new Raw();
static final ValueParser BOOLEAN = new Boolean();

long parseLong(String value, SearchContext searchContext);

Expand Down Expand Up @@ -184,4 +185,20 @@ public double parseDouble(String value, SearchContext searchContext) {
}
}

/**
 * Parser that turns a boolean string into its numeric form using
 * {@link java.lang.Boolean#parseBoolean(String)}: {@code 1} when the string
 * parses to {@code true}, {@code 0} otherwise. The search context argument
 * is not consulted.
 */
static class Boolean implements ValueParser {

    // Private constructor: no public way to instantiate from outside.
    private Boolean() {
    }

    /**
     * @return {@code 1} if {@code value} parses to {@code true}, else {@code 0}
     */
    @Override
    public long parseLong(String value, SearchContext searchContext) {
        if (java.lang.Boolean.parseBoolean(value)) {
            return 1;
        }
        return 0;
    }

    /**
     * @return the result of {@link #parseLong(String, SearchContext)} widened to a double
     */
    @Override
    public double parseDouble(String value, SearchContext searchContext) {
        final long numeric = parseLong(value, searchContext);
        return numeric;
    }
}

}

0 comments on commit 08f93cf

Please sign in to comment.