Disable bloom filters.
make the "es090" postings format read-only, just to support old segments. There is a test version that subclasses it with write-capability for testing.

Closes #8571
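
For context, the read-only pattern applied here amounts to a postings format whose write path is disabled while its read path keeps delegating to a wrapped format, so existing segments stay readable. The sketch below is illustrative only (class name, SPI name, and the plain Lucene41 delegate are placeholders); the real class is Elasticsearch090PostingsFormat further down in this diff.

import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

// Illustrative sketch, not the actual Elasticsearch090PostingsFormat.
public class ReadOnlySketchPostingsFormat extends PostingsFormat {

    // Delegate chosen for the example; the real format reads its bloom-filtered data.
    protected final PostingsFormat delegate = PostingsFormat.forName("Lucene41");

    public ReadOnlySketchPostingsFormat() {
        super("readonly_sketch"); // hypothetical SPI name
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        // Writing is rejected; a test-only subclass can override this method
        // to restore write capability for round-trip tests.
        throw new UnsupportedOperationException("this codec can only be used for reading");
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        // Old segments remain readable through the delegate.
        return delegate.fieldsProducer(state);
    }
}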

Conflicts:
	docs/reference/indices/update-settings.asciidoc
	src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormat.java
	src/main/java/org/elasticsearch/index/codec/postingsformat/Elasticsearch090PostingsFormat.java
	src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
	src/test/java/org/elasticsearch/index/codec/postingformat/DefaultPostingsFormatTests.java
	src/test/java/org/elasticsearch/index/codec/postingformat/ElasticsearchPostingsFormatTest.java
	src/test/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatTest.java
rmuir committed Nov 21, 2014
1 parent ef03c8a commit d325b19
Showing 18 changed files with 105 additions and 358 deletions.
35 changes: 0 additions & 35 deletions docs/reference/indices/update-settings.asciidoc
@@ -57,9 +57,6 @@ settings API:
`index.index_concurrency`::
Defaults to `8`.

`index.codec.bloom.load`::
Whether to load the bloom filter. Defaults to `false`.

`index.fail_on_merge_failure`::
Default to `true`.

@@ -219,35 +216,3 @@ curl -XPUT 'localhost:9200/myindex/_settings' -d '{
curl -XPOST 'localhost:9200/myindex/_open'
--------------------------------------------------

[float]
[[codec-bloom-load]]
=== Bloom filters

Up to version 1.3, Elasticsearch generated bloom filters for the `_uid` field
at indexing time and loaded them at search time in order to speed up
primary-key lookups by saving disk seeks.

As of 1.4, bloom filters are still generated at indexing time, but they are
no longer loaded at search time by default: they consume RAM in proportion to
the number of unique terms, which can quickly add up for certain use cases,
and separate performance improvements have made the performance gains with
bloom filters very small.
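
To make the memory trade-off concrete, the following deliberately tiny Bloom
filter (illustrative only, not `org.elasticsearch.common.util.BloomFilter`)
shows why RAM grows with the number of unique terms, and how a negative answer
lets a lookup return without a disk seek:

[source,java]
--------------------------------------------------
import java.util.BitSet;

// Minimal illustration of a Bloom filter: a bit set plus two hash probes.
class TinyBloomFilter {
    private final BitSet bits;
    private final int numBits;

    TinyBloomFilter(int numBits) {   // more unique terms => more bits => more RAM
        this.numBits = numBits;
        this.bits = new BitSet(numBits);
    }

    void put(String term) {
        bits.set(index(term, 31));
        bits.set(index(term, 131));
    }

    // false means the term is definitely absent, so the disk seek can be skipped
    boolean mightContain(String term) {
        return bits.get(index(term, 31)) && bits.get(index(term, 131));
    }

    private int index(String term, int seed) {
        return ((term.hashCode() * seed) & 0x7fffffff) % numBits;
    }
}
--------------------------------------------------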

[TIP]
==================================================
You can enable loading of the bloom filter at search time on a
per-index basis by updating the index settings:
[source,js]
--------------------------------------------------
PUT /old_index/_settings?index.codec.bloom.load=true
--------------------------------------------------
This setting, which defaults to `false`, can be updated on a live index. Note,
however, that changing the value will cause the index to be reopened, which
will invalidate any existing caches.
==================================================
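
For users of the Java API, the same dynamic update can be issued
programmatically; a rough sketch assuming an Elasticsearch 1.x `Client`
instance named `client`:

[source,java]
--------------------------------------------------
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.ImmutableSettings;

// Sketch of the Java-API equivalent of the REST call above.
public class BloomLoadSettings {
    public static void enableBloomLoad(Client client, String index) {
        client.admin().indices().prepareUpdateSettings(index)
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.codec.bloom.load", true)
                        .build())
                .execute().actionGet();
    }
}
--------------------------------------------------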

2 changes: 2 additions & 0 deletions src/main/java/org/elasticsearch/common/lucene/Lucene.java
@@ -56,6 +56,8 @@ public class Lucene {
public static final Version VERSION = Version.LATEST;
public static final Version ANALYZER_VERSION = VERSION;
public static final Version QUERYPARSER_VERSION = VERSION;

public static final String LATEST_POSTINGS_FORMAT = "Lucene41";

public static final NamedAnalyzer STANDARD_ANALYZER = new NamedAnalyzer("_standard", AnalyzerScope.GLOBAL, new StandardAnalyzer(ANALYZER_VERSION));
public static final NamedAnalyzer KEYWORD_ANALYZER = new NamedAnalyzer("_keyword", AnalyzerScope.GLOBAL, new KeywordAnalyzer());
14 changes: 0 additions & 14 deletions src/main/java/org/elasticsearch/index/codec/CodecService.java
@@ -44,16 +44,11 @@
*/
public class CodecService extends AbstractIndexComponent {

public static final String INDEX_CODEC_BLOOM_LOAD = "index.codec.bloom.load";
public static final boolean INDEX_CODEC_BLOOM_LOAD_DEFAULT = false;

private final PostingsFormatService postingsFormatService;
private final DocValuesFormatService docValuesFormatService;
private final MapperService mapperService;
private final ImmutableMap<String, Codec> codecs;

private volatile boolean loadBloomFilter = true;

public final static String DEFAULT_CODEC = "default";

public CodecService(Index index) {
@@ -83,7 +78,6 @@ public CodecService(Index index, @IndexSettings Settings indexSettings, Postings
codecs.put(codec, Codec.forName(codec));
}
this.codecs = codecs.immutableMap();
this.loadBloomFilter = indexSettings.getAsBoolean(INDEX_CODEC_BLOOM_LOAD, INDEX_CODEC_BLOOM_LOAD_DEFAULT);
}

public PostingsFormatService postingsFormatService() {
@@ -105,12 +99,4 @@ public Codec codec(String name) throws ElasticsearchIllegalArgumentException {
}
return codec;
}

public boolean isLoadBloomFilter() {
return this.loadBloomFilter;
}

public void setLoadBloomFilter(boolean loadBloomFilter) {
this.loadBloomFilter = loadBloomFilter;
}
}
@@ -19,6 +19,8 @@

package org.elasticsearch.index.codec.postingsformat;

import org.apache.lucene.store.IndexInput;

import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ChecksumIndexInput;
@@ -28,8 +30,6 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.util.BloomFilter;
import org.elasticsearch.index.store.DirectoryUtils;
import org.elasticsearch.index.store.Store;

import java.io.IOException;
import java.util.*;
@@ -46,7 +46,9 @@
* This is a special bloom filter version, based on {@link org.elasticsearch.common.util.BloomFilter} and inspired
* by Lucene {@link org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat}.
* </p>
* @deprecated only for reading old segments
*/
@Deprecated
public final class BloomFilterPostingsFormat extends PostingsFormat {

public static final String BLOOM_CODEC_NAME = "XBloomFilter"; // the Lucene one is named BloomFilter
@@ -107,6 +109,7 @@ public BloomFilteredFieldsProducer fieldsProducer(SegmentReadState state)
public final class BloomFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String, BloomFilter> bloomsByFieldName = new HashMap<>();
private final IndexInput data;

// for internal use only
FieldsProducer getDelegate() {
@@ -118,48 +121,22 @@ public BloomFilteredFieldsProducer(SegmentReadState state)

String bloomFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
ChecksumIndexInput bloomIn = null;
boolean success = false;
try {
bloomIn = state.directory.openChecksumInput(bloomFileName, state.context);
int version = CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
data = state.directory.openChecksumInput(bloomFileName, state.context);
int version = CodecUtil.checkHeader(data, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
BLOOM_CODEC_VERSION_CURRENT);
// // Load the hash function used in the BloomFilter
// hashFunction = HashFunction.forName(bloomIn.readString());
// Load the delegate postings format
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
.readString());

this.delegateFieldsProducer = delegatePostingsFormat
final String delegatePostings = data.readString();
this.delegateFieldsProducer = PostingsFormat.forName(delegatePostings)
.fieldsProducer(state);
int numBlooms = bloomIn.readInt();

boolean load = false;
Store.StoreDirectory storeDir = DirectoryUtils.getStoreDirectory(state.directory);
if (storeDir != null && storeDir.codecService() != null) {
load = storeDir.codecService().isLoadBloomFilter();
}

if (load && state.context.context != IOContext.Context.MERGE) {
// if we merge we don't need to load the bloom filters
for (int i = 0; i < numBlooms; i++) {
int fieldNum = bloomIn.readInt();
BloomFilter bloom = BloomFilter.deserialize(bloomIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
}
if (version >= BLOOM_CODEC_VERSION_CHECKSUM) {
CodecUtil.checkFooter(bloomIn);
} else {
CodecUtil.checkEOF(bloomIn);
}
}
IOUtils.close(bloomIn);
success = true;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(bloomIn, delegateFieldsProducer);
}
if (!success) {
IOUtils.closeWhileHandlingException(this);
}
}
}

@@ -170,7 +147,7 @@ public Iterator<String> iterator() {

@Override
public void close() throws IOException {
delegateFieldsProducer.close();
IOUtils.close(data, delegateFieldsProducer);
}

@Override
@@ -344,8 +321,9 @@ public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)

}


final class BloomFilteredFieldsConsumer extends FieldsConsumer {
// TODO: would be great to move this out to test code, but the interaction between es090 and bloom is complex
// at least it is not accessible via SPI
public final class BloomFilteredFieldsConsumer extends FieldsConsumer {
private FieldsConsumer delegateFieldsConsumer;
private Map<FieldInfo, BloomFilter> bloomFilters = new HashMap<>();
private SegmentWriteState state;
@@ -360,7 +338,7 @@ public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
}

// for internal use only
FieldsConsumer getDelegate() {
public FieldsConsumer getDelegate() {
return delegateFieldsConsumer;
}

@@ -30,7 +30,9 @@
import java.util.Map;

/**
* @deprecated only for reading old segments
*/
@Deprecated
public class BloomFilterPostingsFormatProvider extends AbstractPostingsFormatProvider {

private final PostingsFormatProvider delegate;
@@ -33,14 +33,17 @@
import java.io.IOException;

/**
* This is the default postings format for Elasticsearch that special cases
* This is the old default postings format for Elasticsearch that special cases
* the <tt>_uid</tt> field to use a bloom filter while all other fields
* will use a {@link Lucene41PostingsFormat}. This format will reuse the underlying
* {@link Lucene41PostingsFormat} and its files also for the <tt>_uid</tt> saving up to
* 5 files per segment in the default case.
* <p>
* @deprecated only for reading old segments
*/
public final class Elasticsearch090PostingsFormat extends PostingsFormat {
private final BloomFilterPostingsFormat bloomPostings;
@Deprecated
public class Elasticsearch090PostingsFormat extends PostingsFormat {
protected final BloomFilterPostingsFormat bloomPostings;

public Elasticsearch090PostingsFormat() {
super("es090");
@@ -53,23 +56,7 @@ public PostingsFormat getDefaultWrapped() {

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
final BloomFilteredFieldsConsumer fieldsConsumer = bloomPostings.fieldsConsumer(state);
return new FieldsConsumer() {

@Override
public void close() throws IOException {
fieldsConsumer.close();
}

@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
if (UidFieldMapper.NAME.equals(field.name)) {
// only go through bloom for the UID field
return fieldsConsumer.addField(field);
}
return fieldsConsumer.getDelegate().addField(field);
}
};
throw new UnsupportedOperationException("this codec can only be used for reading");
}

@Override
@@ -19,6 +19,8 @@

package org.elasticsearch.index.codec.postingsformat;

import org.elasticsearch.common.lucene.Lucene;

import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.codecs.PostingsFormat;
@@ -29,10 +31,7 @@
* This class represents the set of Elasticsearch "built-in"
* {@link PostingsFormatProvider.Factory postings format factories}
* <ul>
* <li><b>bloom_default</b>: a postings format that uses a bloom filter to
* improve term lookup performance. This is useful for primary keys or fields
* that are used as a delete key</li>
* <li><b>default</b>: the default Elasticsearch postings format offering best
* <li><b>default</b>: the default Lucene postings format offering best
* general purpose performance. This format is used if no postings format is
* specified in the field mapping.</li>
* <li><b>***</b>: other formats from Lucene core (e.g. Lucene41 as of Lucene 4.10)
@@ -50,12 +49,10 @@ public class PostingFormats {
for (String luceneName : PostingsFormat.availablePostingsFormats()) {
builtInPostingFormatsX.put(luceneName, new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName)));
}
final PostingsFormat defaultFormat = new Elasticsearch090PostingsFormat();
final PostingsFormat defaultFormat = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);
builtInPostingFormatsX.put(PostingsFormatService.DEFAULT_FORMAT,
new PreBuiltPostingsFormatProvider.Factory(PostingsFormatService.DEFAULT_FORMAT, defaultFormat));

builtInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(PostingsFormat.forName("Lucene41"))));

builtInPostingFormats = builtInPostingFormatsX.immutableMap();
}

@@ -1494,12 +1494,10 @@ public void onRefreshSettings(Settings settings) {
int indexConcurrency = settings.getAsInt(INDEX_INDEX_CONCURRENCY, InternalEngine.this.indexConcurrency);
boolean failOnMergeFailure = settings.getAsBoolean(INDEX_FAIL_ON_MERGE_FAILURE, InternalEngine.this.failOnMergeFailure);
String codecName = settings.get(INDEX_CODEC, InternalEngine.this.codecName);
final boolean codecBloomLoad = settings.getAsBoolean(CodecService.INDEX_CODEC_BLOOM_LOAD, codecService.isLoadBloomFilter());
boolean requiresFlushing = false;
if (indexConcurrency != InternalEngine.this.indexConcurrency ||
!codecName.equals(InternalEngine.this.codecName) ||
failOnMergeFailure != InternalEngine.this.failOnMergeFailure ||
codecBloomLoad != codecService.isLoadBloomFilter()) {
failOnMergeFailure != InternalEngine.this.failOnMergeFailure) {
try (InternalLock _ = readLock.acquire()) {
if (indexConcurrency != InternalEngine.this.indexConcurrency) {
logger.info("updating index.index_concurrency from [{}] to [{}]", InternalEngine.this.indexConcurrency, indexConcurrency);
@@ -1517,12 +1515,6 @@ public void onRefreshSettings(Settings settings) {
logger.info("updating {} from [{}] to [{}]", InternalEngine.INDEX_FAIL_ON_MERGE_FAILURE, InternalEngine.this.failOnMergeFailure, failOnMergeFailure);
InternalEngine.this.failOnMergeFailure = failOnMergeFailure;
}
if (codecBloomLoad != codecService.isLoadBloomFilter()) {
logger.info("updating {} from [{}] to [{}]", CodecService.INDEX_CODEC_BLOOM_LOAD, codecService.isLoadBloomFilter(), codecBloomLoad);
codecService.setLoadBloomFilter(codecBloomLoad);
// we need to flush in this case, to load/unload the bloom filters
requiresFlushing = true;
}
}
if (requiresFlushing) {
flush(new Flush().type(Flush.Type.NEW_WRITER));
@@ -84,7 +84,6 @@ public IndexDynamicSettingsModule() {
indexDynamicSettings.addDynamicSetting(LogDocMergePolicyProvider.INDEX_COMPOUND_FORMAT);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_INDEX_CONCURRENCY, Validator.NON_NEGATIVE_INTEGER);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_COMPOUND_ON_FLUSH, Validator.BOOLEAN);
indexDynamicSettings.addDynamicSetting(CodecService.INDEX_CODEC_BLOOM_LOAD, Validator.BOOLEAN);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_GC_DELETES, Validator.TIME);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_CODEC);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_FAIL_ON_MERGE_FAILURE);
6 changes: 0 additions & 6 deletions src/main/java/org/elasticsearch/index/store/Store.java
@@ -556,12 +556,6 @@ public ShardId shardId() {
return Store.this.shardId();
}

@Nullable
public CodecService codecService() {
ensureOpen();
return Store.this.codecService;
}

@Override
public IndexInput openInput(String name, IOContext context) throws IOException {
IndexInput in = super.openInput(name, context);
@@ -1,3 +1,2 @@
org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat
org.elasticsearch.index.codec.postingsformat.Elasticsearch090PostingsFormat
org.elasticsearch.search.suggest.completion.Completion090PostingsFormat
