Skip to content

Commit

Permalink
Core: Change the default cache filter impl from FixedBitSet to WAH8Do…
Browse files Browse the repository at this point in the history
…cIdSet.

A few months ago, Lucene switched from FixedBitSet to WAH8DocIdSet in order
to cache filters. WAH8DocIdSet is particularly well-suited to sparse
sets: iteration is faster, memory usage is lower, and there is an index that
helps keep advance fast.

This doesn't break existing code since #6280 already made sure that there was no
abusive cast from DocIdSet to Bits or FixedBitSet and #7037 moved consumers of
the filter cache that absolutely need to get fixed bitsets to their own cache.

Since cached filters will be more memory-efficient, the filter cache size has
been decreased from 10 to 5%. Although smaller, this might still allow
more filters to be cached.
  • Loading branch information
jpountz committed Sep 3, 2014
1 parent 228778c commit 8d39d48
Show file tree
Hide file tree
Showing 10 changed files with 57 additions and 40 deletions.
4 changes: 2 additions & 2 deletions docs/reference/index-modules/cache.asciidoc
Expand Up @@ -24,10 +24,10 @@ when a cache becomes full, the least recently used data is evicted to
make way for new data.

The setting that allows one to control the memory size for the filter
cache is `indices.cache.filter.size`, which defaults to `10%`. *Note*,
cache is `indices.cache.filter.size`, which defaults to `5%`. *Note*,
this is *not* an index level setting but a node level setting (can be
configured in the node configuration).

`indices.cache.filter.size` can accept either a percentage value, like
`30%`, or an exact value, like `512mb`.
`10%`, or an exact value, like `512mb`.

Expand Up @@ -77,7 +77,7 @@ public DocIdSetIterator iterator() throws IOException {
List<DocIdSet> iterators = new ArrayList<>(sets.length);
List<Bits> bits = new ArrayList<>(sets.length);
for (DocIdSet set : sets) {
if (DocIdSets.isFastIterator(set)) {
if (DocIdSets.hasFasterIteratorThanRandomAccess(set)) {
iterators.add(set);
} else {
Bits bit = set.bits();
Expand Down
51 changes: 31 additions & 20 deletions src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java
Expand Up @@ -25,11 +25,13 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.WAH8DocIdSet;
import org.elasticsearch.common.Nullable;

import java.io.IOException;

/**
* Utility methods to work with {@link DocIdSet}s.
*/
public class DocIdSets {

Expand All @@ -41,31 +43,42 @@ public static long sizeInBytes(DocIdSet docIdSet) {
}

/**
* Is it an empty {@link DocIdSet}?
* Is it an empty {@link DocIdSet}? This is best effort: a return value of
* <tt>false</tt> does not mean that the set contains at least one
* document.
*/
public static boolean isEmpty(@Nullable DocIdSet set) {
return set == null || set == DocIdSet.EMPTY;
}

/**
* Is {@link org.apache.lucene.search.DocIdSetIterator} implemented in a "fast" manner.
* For example, it does not end up iterating one doc at a time to check each "value".
* In case a {@link DocIdSet} provides both sequential and random access,
* return whether using the iterator is faster than checking each document
* individually in the random-access view.
*/
public static boolean isFastIterator(DocIdSet set) {
public static boolean hasFasterIteratorThanRandomAccess(DocIdSet set) {
// In the case of FixedBitSet, the iterator is faster since it can
// check up to 64 bits at a time. However the other random-access sets
// (mainly script or geo filters) don't make iteration faster.
return set instanceof FixedBitSet;
}

/**
* Converts to a cacheable {@link DocIdSet}
* <p/>
* Note, we don't use {@link org.apache.lucene.search.DocIdSet#isCacheable()} because execution
* might be expensive even if its cacheable (i.e. not going back to the reader to execute). We effectively
* always either return an empty {@link DocIdSet} or {@link FixedBitSet} but never <code>null</code>.
* We effectively always either return an empty {@link DocIdSet} or
* {@link FixedBitSet} but never <tt>null</tt>.
*/
public static DocIdSet toCacheable(AtomicReader reader, @Nullable DocIdSet set) throws IOException {
if (set == null || set == DocIdSet.EMPTY) {
if (set == null) {
return DocIdSet.EMPTY;
}
// Some filters return doc id sets that are already cacheable. This is
// for instance the case of the terms filter which creates fixed bitsets
// In that case don't spend time converting it.
if (set.isCacheable()) { // covers DocIdSet.EMPTY as well
return set;
}
DocIdSetIterator it = set.iterator();
if (it == null) {
return DocIdSet.EMPTY;
Expand All @@ -74,23 +87,21 @@ public static DocIdSet toCacheable(AtomicReader reader, @Nullable DocIdSet set)
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
return DocIdSet.EMPTY;
}
if (set instanceof FixedBitSet) {
return set;
}
// TODO: should we use WAH8DocIdSet like Lucene?
FixedBitSet fixedBitSet = new FixedBitSet(reader.maxDoc());
do {
fixedBitSet.set(doc);
doc = it.nextDoc();
} while (doc != DocIdSetIterator.NO_MORE_DOCS);
return fixedBitSet;
// We use WAH8DocIdSet like Lucene does by default
// Compared to FixedBitset, it doesn't have random access but is faster
// to iterate on, better compressed, and skips faster thanks to an index
WAH8DocIdSet.Builder builder = new WAH8DocIdSet.Builder();
builder.add(doc);
builder.add(it);
return builder.build();
}

/**
* Gets a set to bits.
* Gives random-access to a {@link DocIdSet}, potentially copying the
* {@link DocIdSet} to another data-structure that gives random access.
*/
public static Bits toSafeBits(AtomicReader reader, @Nullable DocIdSet set) throws IOException {
if (set == null) {
if (isEmpty(set)) {
return new Bits.MatchNoBits(reader.maxDoc());
}
Bits bits = set.bits();
Expand Down
Expand Up @@ -107,7 +107,7 @@ public Bits bits() throws IOException {

@Override
public DocIdSetIterator iterator() throws IOException {
if (!DocIdSets.isFastIterator(innerSet) && liveDocs instanceof FixedBitSet) {
if (!DocIdSets.hasFasterIteratorThanRandomAccess(innerSet) && liveDocs instanceof FixedBitSet) {
// might as well iterate over the live docs..., since the iterator is not fast enough
// but we can only do that if we have Bits..., in short, we reverse the order...
Bits bits = innerSet.bits();
Expand Down
Expand Up @@ -101,7 +101,7 @@ public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws
}
}
Bits bits = null;
if (!DocIdSets.isFastIterator(set)) {
if (!DocIdSets.hasFasterIteratorThanRandomAccess(set)) {
bits = set.bits();
}
results.add(new ResultClause(set, bits, clause));
Expand Down
Expand Up @@ -225,7 +225,7 @@ public Scorer filteredScorer(AtomicReaderContext context, Weight weight, DocIdSe
// CHANGE: handle "default" value
if (threshold == -1) {
// default value, don't iterate on only apply filter after query if its not a "fast" docIdSet
if (!DocIdSets.isFastIterator(ApplyAcceptedDocsFilter.unwrap(docIdSet))) {
if (!DocIdSets.hasFasterIteratorThanRandomAccess(ApplyAcceptedDocsFilter.unwrap(docIdSet))) {
return FilteredQuery.QUERY_FIRST_FILTER_STRATEGY.filteredScorer(context, weight, docIdSet);
}
}
Expand Down
Expand Up @@ -42,8 +42,6 @@ public EntriesStats(long sizeInBytes, long count) {
// we need to "inject" the index service to not create cyclic dep
void setIndexService(IndexService indexService);

String type();

Filter cache(Filter filterToCache);

void clear(Object reader);
Expand Down
Expand Up @@ -44,11 +44,6 @@ public void setIndexService(IndexService indexService) {
// nothing to do here...
}

@Override
public String type() {
return "none";
}

@Override
public void close() {
// nothing to do here
Expand Down
Expand Up @@ -22,14 +22,17 @@
import com.google.common.cache.Cache;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.Weigher;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.WAH8DocIdSet;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
Expand All @@ -53,6 +56,11 @@
import java.io.IOException;
import java.util.concurrent.ConcurrentMap;

/**
* Default implementation for a filter cache. Evictions are weighted based on
* the amount of memory that cache entries require. Filters are cached using
* {@link WAH8DocIdSet}.
*/
public class WeightedFilterCache extends AbstractIndexComponent implements FilterCache, SegmentReader.CoreClosedListener, IndexReader.ReaderClosedListener {

final IndicesFilterCache indicesFilterCache;
Expand All @@ -71,11 +79,6 @@ public void setIndexService(IndexService indexService) {
this.indexService = indexService;
}

@Override
public String type() {
return "weighted";
}

@Override
public void close() throws ElasticsearchException {
clear("close");
Expand Down Expand Up @@ -141,6 +144,13 @@ public Filter cache(Filter filterToCache) {
return new FilterCacheFilterWrapper(filterToCache, this);
}

/**
* Given a {@link DocIdSet}, return a cacheable {@link DocIdSet}.
*/
protected DocIdSet doCache(AtomicReader reader, DocIdSet set) throws IOException {
return DocIdSets.toCacheable(reader, set);
}

static class FilterCacheFilterWrapper extends CachedFilter {

private final Filter filter;
Expand Down Expand Up @@ -174,7 +184,10 @@ public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws
// we can't pass down acceptedDocs provided, because we are caching the result, and acceptedDocs
// might be specific to a query. We don't pass the live docs either because a cache built for a specific
// generation of a segment might be reused by an older generation which has fewer deleted documents
cacheValue = DocIdSets.toCacheable(context.reader(), filter.getDocIdSet(context, null));
cacheValue = cache.doCache(context.reader(), filter.getDocIdSet(context, null));
if (!cacheValue.isCacheable()) {
throw new ElasticsearchIllegalStateException("doCache must return cacheable filters, got " + cacheValue);
}
// we might put the same one concurrently, that's fine, it will be replaced and the removal
// will be called
ShardId shardId = ShardUtils.extractShardId(context.reader());
Expand Down
Expand Up @@ -91,7 +91,7 @@ public void onRefreshSettings(Settings settings) {
public IndicesFilterCache(Settings settings, ThreadPool threadPool, NodeSettingsService nodeSettingsService) {
super(settings);
this.threadPool = threadPool;
this.size = componentSettings.get("size", "10%");
this.size = componentSettings.get("size", "5%");
this.expire = componentSettings.getAsTime("expire", null);
this.cleanInterval = componentSettings.getAsTime("clean_interval", TimeValue.timeValueSeconds(60));
computeSizeInBytes();
Expand Down

0 comments on commit 8d39d48

Please sign in to comment.