Disable bloom filters.
make the "es090" postings format read-only, just to support old segments. There is a test version that subclasses it with write-capability for testing.

Closes #8571
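
For context, the read-only pattern applied here amounts to a postings format whose write path is disabled while its read path keeps delegating to a wrapped format, so existing segments stay readable. The sketch below is illustrative only (class name, SPI name, and the plain Lucene41 delegate are placeholders); the real class is Elasticsearch090PostingsFormat further down in this diff.

import java.io.IOException;

import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

// Illustrative sketch, not the actual Elasticsearch090PostingsFormat.
public class ReadOnlySketchPostingsFormat extends PostingsFormat {

    // Delegate chosen for the example; the real format reads its bloom-filtered data.
    protected final PostingsFormat delegate = PostingsFormat.forName("Lucene41");

    public ReadOnlySketchPostingsFormat() {
        super("readonly_sketch"); // hypothetical SPI name
    }

    @Override
    public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        // Writing is rejected; a test-only subclass can override this method
        // to restore write capability for round-trip tests.
        throw new UnsupportedOperationException("this codec can only be used for reading");
    }

    @Override
    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        // Old segments remain readable through the delegate.
        return delegate.fieldsProducer(state);
    }
}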

Conflicts:
	docs/reference/indices/update-settings.asciidoc
	src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormat.java
	src/main/java/org/elasticsearch/index/codec/postingsformat/Elasticsearch090PostingsFormat.java
	src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
	src/test/java/org/elasticsearch/index/codec/postingformat/DefaultPostingsFormatTests.java
	src/test/java/org/elasticsearch/index/codec/postingformat/ElasticsearchPostingsFormatTest.java
	src/test/java/org/elasticsearch/search/suggest/completion/CompletionPostingsFormatTest.java
rmuir committed Nov 21, 2014
1 parent ef03c8a commit d325b19
Showing 18 changed files with 105 additions and 358 deletions.
35 changes: 0 additions & 35 deletions docs/reference/indices/update-settings.asciidoc
@@ -57,9 +57,6 @@ settings API:
`index.index_concurrency`::
Defaults to `8`.

`index.codec.bloom.load`::
Whether to load the bloom filter. Defaults to `false`.

`index.fail_on_merge_failure`::
Default to `true`.

@@ -219,35 +216,3 @@ curl -XPUT 'localhost:9200/myindex/_settings' -d '{
curl -XPOST 'localhost:9200/myindex/_open'
--------------------------------------------------

[float]
[[codec-bloom-load]]
=== Bloom filters

Up to version 1.3, Elasticsearch generated bloom filters for the `_uid` field
at indexing time and loaded them at search time in order to speed up
primary-key lookups by saving disk seeks.

As of 1.4, bloom filters are still generated at indexing time, but they are
no longer loaded at search time by default: they consume RAM in proportion to
the number of unique terms, which can quickly add up for certain use cases,
and separate performance improvements have made the performance gains with
bloom filters very small.
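
To make the memory trade-off concrete, the following deliberately tiny Bloom
filter (illustrative only, not `org.elasticsearch.common.util.BloomFilter`)
shows why RAM grows with the number of unique terms, and how a negative answer
lets a lookup return without a disk seek:

[source,java]
--------------------------------------------------
import java.util.BitSet;

// Minimal illustration of a Bloom filter: a bit set plus two hash probes.
class TinyBloomFilter {
    private final BitSet bits;
    private final int numBits;

    TinyBloomFilter(int numBits) {   // more unique terms => more bits => more RAM
        this.numBits = numBits;
        this.bits = new BitSet(numBits);
    }

    void put(String term) {
        bits.set(index(term, 31));
        bits.set(index(term, 131));
    }

    // false means the term is definitely absent, so the disk seek can be skipped
    boolean mightContain(String term) {
        return bits.get(index(term, 31)) && bits.get(index(term, 131));
    }

    private int index(String term, int seed) {
        return ((term.hashCode() * seed) & 0x7fffffff) % numBits;
    }
}
--------------------------------------------------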

[TIP]
==================================================
You can enable loading of the bloom filter at search time on a
per-index basis by updating the index settings:
[source,js]
--------------------------------------------------
PUT /old_index/_settings?index.codec.bloom.load=true
--------------------------------------------------
This setting, which defaults to `false`, can be updated on a live index. Note,
however, that changing the value will cause the index to be reopened, which
will invalidate any existing caches.
==================================================
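
For users of the Java API, the same dynamic update can be issued
programmatically; a rough sketch assuming an Elasticsearch 1.x `Client`
instance named `client`:

[source,java]
--------------------------------------------------
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.ImmutableSettings;

// Sketch of the Java-API equivalent of the REST call above.
public class BloomLoadSettings {
    public static void enableBloomLoad(Client client, String index) {
        client.admin().indices().prepareUpdateSettings(index)
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.codec.bloom.load", true)
                        .build())
                .execute().actionGet();
    }
}
--------------------------------------------------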

2 changes: 2 additions & 0 deletions src/main/java/org/elasticsearch/common/lucene/Lucene.java
@@ -56,6 +56,8 @@ public class Lucene {
public static final Version VERSION = Version.LATEST;
public static final Version ANALYZER_VERSION = VERSION;
public static final Version QUERYPARSER_VERSION = VERSION;

public static final String LATEST_POSTINGS_FORMAT = "Lucene41";

public static final NamedAnalyzer STANDARD_ANALYZER = new NamedAnalyzer("_standard", AnalyzerScope.GLOBAL, new StandardAnalyzer(ANALYZER_VERSION));
public static final NamedAnalyzer KEYWORD_ANALYZER = new NamedAnalyzer("_keyword", AnalyzerScope.GLOBAL, new KeywordAnalyzer());
14 changes: 0 additions & 14 deletions src/main/java/org/elasticsearch/index/codec/CodecService.java
@@ -44,16 +44,11 @@
*/
public class CodecService extends AbstractIndexComponent {

public static final String INDEX_CODEC_BLOOM_LOAD = "index.codec.bloom.load";
public static final boolean INDEX_CODEC_BLOOM_LOAD_DEFAULT = false;

private final PostingsFormatService postingsFormatService;
private final DocValuesFormatService docValuesFormatService;
private final MapperService mapperService;
private final ImmutableMap<String, Codec> codecs;

private volatile boolean loadBloomFilter = true;

public final static String DEFAULT_CODEC = "default";

public CodecService(Index index) {
@@ -83,7 +78,6 @@ public CodecService(Index index, @IndexSettings Settings indexSettings, Postings
codecs.put(codec, Codec.forName(codec));
}
this.codecs = codecs.immutableMap();
this.loadBloomFilter = indexSettings.getAsBoolean(INDEX_CODEC_BLOOM_LOAD, INDEX_CODEC_BLOOM_LOAD_DEFAULT);
}

public PostingsFormatService postingsFormatService() {
@@ -105,12 +99,4 @@ public Codec codec(String name) throws ElasticsearchIllegalArgumentException {
}
return codec;
}

public boolean isLoadBloomFilter() {
return this.loadBloomFilter;
}

public void setLoadBloomFilter(boolean loadBloomFilter) {
this.loadBloomFilter = loadBloomFilter;
}
}
@@ -19,6 +19,8 @@

package org.elasticsearch.index.codec.postingsformat;

import org.apache.lucene.store.IndexInput;

import org.apache.lucene.codecs.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ChecksumIndexInput;
@@ -28,8 +30,6 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.util.BloomFilter;
import org.elasticsearch.index.store.DirectoryUtils;
import org.elasticsearch.index.store.Store;

import java.io.IOException;
import java.util.*;
@@ -46,7 +46,9 @@
* This is a special bloom filter version, based on {@link org.elasticsearch.common.util.BloomFilter} and inspired
* by Lucene {@link org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat}.
* </p>
* @deprecated only for reading old segments
*/
@Deprecated
public final class BloomFilterPostingsFormat extends PostingsFormat {

public static final String BLOOM_CODEC_NAME = "XBloomFilter"; // the Lucene one is named BloomFilter
@@ -107,6 +109,7 @@ public BloomFilteredFieldsProducer fieldsProducer(SegmentReadState state)
public final class BloomFilteredFieldsProducer extends FieldsProducer {
private FieldsProducer delegateFieldsProducer;
HashMap<String, BloomFilter> bloomsByFieldName = new HashMap<>();
private final IndexInput data;

// for internal use only
FieldsProducer getDelegate() {
@@ -118,48 +121,22 @@ public BloomFilteredFieldsProducer(SegmentReadState state)

String bloomFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
ChecksumIndexInput bloomIn = null;
boolean success = false;
try {
bloomIn = state.directory.openChecksumInput(bloomFileName, state.context);
int version = CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
data = state.directory.openChecksumInput(bloomFileName, state.context);
int version = CodecUtil.checkHeader(data, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
BLOOM_CODEC_VERSION_CURRENT);
// // Load the hash function used in the BloomFilter
// hashFunction = HashFunction.forName(bloomIn.readString());
// Load the delegate postings format
PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
.readString());

this.delegateFieldsProducer = delegatePostingsFormat
final String delegatePostings = data.readString();
this.delegateFieldsProducer = PostingsFormat.forName(delegatePostings)
.fieldsProducer(state);
int numBlooms = bloomIn.readInt();

boolean load = false;
Store.StoreDirectory storeDir = DirectoryUtils.getStoreDirectory(state.directory);
if (storeDir != null && storeDir.codecService() != null) {
load = storeDir.codecService().isLoadBloomFilter();
}

if (load && state.context.context != IOContext.Context.MERGE) {
// if we merge we don't need to load the bloom filters
for (int i = 0; i < numBlooms; i++) {
int fieldNum = bloomIn.readInt();
BloomFilter bloom = BloomFilter.deserialize(bloomIn);
FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
bloomsByFieldName.put(fieldInfo.name, bloom);
}
if (version >= BLOOM_CODEC_VERSION_CHECKSUM) {
CodecUtil.checkFooter(bloomIn);
} else {
CodecUtil.checkEOF(bloomIn);
}
}
IOUtils.close(bloomIn);
success = true;
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(bloomIn, delegateFieldsProducer);
}
if (!success) {
IOUtils.closeWhileHandlingException(this);
}
}
}

@@ -170,7 +147,7 @@ public Iterator<String> iterator() {

@Override
public void close() throws IOException {
delegateFieldsProducer.close();
IOUtils.close(data, delegateFieldsProducer);
}

@Override
@@ -344,8 +321,9 @@ public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)

}


final class BloomFilteredFieldsConsumer extends FieldsConsumer {
// TODO: would be great to move this out to test code, but the interaction between es090 and bloom is complex
// at least it is not accessible via SPI
public final class BloomFilteredFieldsConsumer extends FieldsConsumer {
private FieldsConsumer delegateFieldsConsumer;
private Map<FieldInfo, BloomFilter> bloomFilters = new HashMap<>();
private SegmentWriteState state;
@@ -360,7 +338,7 @@ public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
}

// for internal use only
FieldsConsumer getDelegate() {
public FieldsConsumer getDelegate() {
return delegateFieldsConsumer;
}

@@ -30,7 +30,9 @@
import java.util.Map;

/**
* @deprecated only for reading old segments
*/
@Deprecated
public class BloomFilterPostingsFormatProvider extends AbstractPostingsFormatProvider {

private final PostingsFormatProvider delegate;
@@ -33,14 +33,17 @@
import java.io.IOException;

/**
* This is the default postings format for Elasticsearch that special cases
* This is the old default postings format for Elasticsearch that special cases
* the <tt>_uid</tt> field to use a bloom filter while all other fields
* will use a {@link Lucene41PostingsFormat}. This format will reuse the underlying
* {@link Lucene41PostingsFormat} and its files also for the <tt>_uid</tt> saving up to
* 5 files per segment in the default case.
* <p>
* @deprecated only for reading old segments
*/
public final class Elasticsearch090PostingsFormat extends PostingsFormat {
private final BloomFilterPostingsFormat bloomPostings;
@Deprecated
public class Elasticsearch090PostingsFormat extends PostingsFormat {
protected final BloomFilterPostingsFormat bloomPostings;

public Elasticsearch090PostingsFormat() {
super("es090");
@@ -53,23 +56,7 @@ public PostingsFormat getDefaultWrapped() {

@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
final BloomFilteredFieldsConsumer fieldsConsumer = bloomPostings.fieldsConsumer(state);
return new FieldsConsumer() {

@Override
public void close() throws IOException {
fieldsConsumer.close();
}

@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
if (UidFieldMapper.NAME.equals(field.name)) {
// only go through bloom for the UID field
return fieldsConsumer.addField(field);
}
return fieldsConsumer.getDelegate().addField(field);
}
};
throw new UnsupportedOperationException("this codec can only be used for reading");
}

@Override
@@ -19,6 +19,8 @@

package org.elasticsearch.index.codec.postingsformat;

import org.elasticsearch.common.lucene.Lucene;

import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.codecs.PostingsFormat;
@@ -29,10 +31,7 @@
* This class represents the set of Elasticsearch "built-in"
* {@link PostingsFormatProvider.Factory postings format factories}
* <ul>
* <li><b>bloom_default</b>: a postings format that uses a bloom filter to
* improve term lookup performance. This is useful for primary keys or fields
* that are used as a delete key</li>
* <li><b>default</b>: the default Elasticsearch postings format offering best
* <li><b>default</b>: the default Lucene postings format offering best
* general purpose performance. This format is used if no postings format is
* specified in the field mapping.</li>
* <li><b>***</b>: other formats from Lucene core (e.g. Lucene41 as of Lucene 4.10)
@@ -50,12 +49,10 @@ public class PostingFormats {
for (String luceneName : PostingsFormat.availablePostingsFormats()) {
builtInPostingFormatsX.put(luceneName, new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName)));
}
final PostingsFormat defaultFormat = new Elasticsearch090PostingsFormat();
final PostingsFormat defaultFormat = PostingsFormat.forName(Lucene.LATEST_POSTINGS_FORMAT);
builtInPostingFormatsX.put(PostingsFormatService.DEFAULT_FORMAT,
new PreBuiltPostingsFormatProvider.Factory(PostingsFormatService.DEFAULT_FORMAT, defaultFormat));

builtInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(PostingsFormat.forName("Lucene41"))));

builtInPostingFormats = builtInPostingFormatsX.immutableMap();
}

@@ -1494,12 +1494,10 @@ public void onRefreshSettings(Settings settings) {
int indexConcurrency = settings.getAsInt(INDEX_INDEX_CONCURRENCY, InternalEngine.this.indexConcurrency);
boolean failOnMergeFailure = settings.getAsBoolean(INDEX_FAIL_ON_MERGE_FAILURE, InternalEngine.this.failOnMergeFailure);
String codecName = settings.get(INDEX_CODEC, InternalEngine.this.codecName);
final boolean codecBloomLoad = settings.getAsBoolean(CodecService.INDEX_CODEC_BLOOM_LOAD, codecService.isLoadBloomFilter());
boolean requiresFlushing = false;
if (indexConcurrency != InternalEngine.this.indexConcurrency ||
!codecName.equals(InternalEngine.this.codecName) ||
failOnMergeFailure != InternalEngine.this.failOnMergeFailure ||
codecBloomLoad != codecService.isLoadBloomFilter()) {
failOnMergeFailure != InternalEngine.this.failOnMergeFailure) {
try (InternalLock _ = readLock.acquire()) {
if (indexConcurrency != InternalEngine.this.indexConcurrency) {
logger.info("updating index.index_concurrency from [{}] to [{}]", InternalEngine.this.indexConcurrency, indexConcurrency);
@@ -1517,12 +1515,6 @@ public void onRefreshSettings(Settings settings) {
logger.info("updating {} from [{}] to [{}]", InternalEngine.INDEX_FAIL_ON_MERGE_FAILURE, InternalEngine.this.failOnMergeFailure, failOnMergeFailure);
InternalEngine.this.failOnMergeFailure = failOnMergeFailure;
}
if (codecBloomLoad != codecService.isLoadBloomFilter()) {
logger.info("updating {} from [{}] to [{}]", CodecService.INDEX_CODEC_BLOOM_LOAD, codecService.isLoadBloomFilter(), codecBloomLoad);
codecService.setLoadBloomFilter(codecBloomLoad);
// we need to flush in this case, to load/unload the bloom filters
requiresFlushing = true;
}
}
if (requiresFlushing) {
flush(new Flush().type(Flush.Type.NEW_WRITER));
@@ -84,7 +84,6 @@ public IndexDynamicSettingsModule() {
indexDynamicSettings.addDynamicSetting(LogDocMergePolicyProvider.INDEX_COMPOUND_FORMAT);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_INDEX_CONCURRENCY, Validator.NON_NEGATIVE_INTEGER);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_COMPOUND_ON_FLUSH, Validator.BOOLEAN);
indexDynamicSettings.addDynamicSetting(CodecService.INDEX_CODEC_BLOOM_LOAD, Validator.BOOLEAN);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_GC_DELETES, Validator.TIME);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_CODEC);
indexDynamicSettings.addDynamicSetting(InternalEngine.INDEX_FAIL_ON_MERGE_FAILURE);
6 changes: 0 additions & 6 deletions src/main/java/org/elasticsearch/index/store/Store.java
@@ -556,12 +556,6 @@ public ShardId shardId() {
return Store.this.shardId();
}

@Nullable
public CodecService codecService() {
ensureOpen();
return Store.this.codecService;
}

@Override
public IndexInput openInput(String name, IOContext context) throws IOException {
IndexInput in = super.openInput(name, context);
@@ -1,3 +1,2 @@
org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat
org.elasticsearch.index.codec.postingsformat.Elasticsearch090PostingsFormat
org.elasticsearch.search.suggest.completion.Completion090PostingsFormat
