Skip to content

Commit

Permalink
Core: Change the default cache filter impl from FixedBitSet to WAH8Do…
Browse files Browse the repository at this point in the history
…cIdSet.

A few months ago, Lucene switched from FixedBitSet to WAH8DocIdSet in order
to cache filters. WAH8DocIdSet is particularly well-suited to sparse
sets: iteration is faster, memory usage is lower, and there is an index that
helps keep advance fast.

This doesn't break existing code since #6280 already made sure that there was no
abusive cast from DocIdSet to Bits or FixedBitSet and #7037 moved consumers of
the filter cache that absolutely need to get fixed bitsets to their own cache.

Since cached filters will be more memory-efficient, the filter cache size has
been decreased from 10 to 5%. Although smaller, this might still allow
more filters to be cached.
  • Loading branch information
jpountz committed Sep 3, 2014
1 parent 228778c commit 8d39d48
Show file tree
Hide file tree
Showing 10 changed files with 57 additions and 40 deletions.
4 changes: 2 additions & 2 deletions docs/reference/index-modules/cache.asciidoc
Expand Up @@ -24,10 +24,10 @@ when a cache becomes full, the least recently used data is evicted to
make way for new data.

The setting that allows one to control the memory size for the filter
cache is `indices.cache.filter.size`, which defaults to `10%`. *Note*,
cache is `indices.cache.filter.size`, which defaults to `5%`. *Note*,
this is *not* an index level setting but a node level setting (can be
configured in the node configuration).

`indices.cache.filter.size` can accept either a percentage value, like
`30%`, or an exact value, like `512mb`.
`10%`, or an exact value, like `512mb`.

Expand Up @@ -77,7 +77,7 @@ public DocIdSetIterator iterator() throws IOException {
List<DocIdSet> iterators = new ArrayList<>(sets.length);
List<Bits> bits = new ArrayList<>(sets.length);
for (DocIdSet set : sets) {
if (DocIdSets.isFastIterator(set)) {
if (DocIdSets.hasFasterIteratorThanRandomAccess(set)) {
iterators.add(set);
} else {
Bits bit = set.bits();
Expand Down
51 changes: 31 additions & 20 deletions src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java
Expand Up @@ -25,11 +25,13 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.WAH8DocIdSet;
import org.elasticsearch.common.Nullable;

import java.io.IOException;

/**
* Utility methods to work with {@link DocIdSet}s.
*/
public class DocIdSets {

Expand All @@ -41,31 +43,42 @@ public static long sizeInBytes(DocIdSet docIdSet) {
}

/**
* Is it an empty {@link DocIdSet}?
* Is it an empty {@link DocIdSet}? This is best effort: a return value of
* <tt>false</tt> does not mean that the set contains at least one
* document.
*/
public static boolean isEmpty(@Nullable DocIdSet set) {
return set == null || set == DocIdSet.EMPTY;
}

/**
* Is {@link org.apache.lucene.search.DocIdSetIterator} implemented in a "fast" manner.
* For example, it does not end up iterating one doc at a time to check each "value".
* In case a {@link DocIdSet} provides both sequential and random access,
* return whether using the iterator is faster than checking each document
* individually in the random-access view.
*/
public static boolean isFastIterator(DocIdSet set) {
public static boolean hasFasterIteratorThanRandomAccess(DocIdSet set) {
// In the case of FixedBitSet, the iterator is faster since it can
// check up to 64 bits at a time. However the other random-access sets
// (mainly script or geo filters) don't make iteration faster.
return set instanceof FixedBitSet;
}

/**
* Converts to a cacheable {@link DocIdSet}
* <p/>
* Note, we don't use {@link org.apache.lucene.search.DocIdSet#isCacheable()} because execution
* might be expensive even if its cacheable (i.e. not going back to the reader to execute). We effectively
* always either return an empty {@link DocIdSet} or {@link FixedBitSet} but never <code>null</code>.
* We effectively always either return an empty {@link DocIdSet} or
* {@link FixedBitSet} but never <tt>null</tt>.
*/
public static DocIdSet toCacheable(AtomicReader reader, @Nullable DocIdSet set) throws IOException {
if (set == null || set == DocIdSet.EMPTY) {
if (set == null) {
return DocIdSet.EMPTY;
}
// Some filters return doc id sets that are already cacheable. This is
// for instance the case of the terms filter which creates fixed bitsets
// In that case don't spend time converting it.
if (set.isCacheable()) { // covers DocIdSet.EMPTY as well
return set;
}
DocIdSetIterator it = set.iterator();
if (it == null) {
return DocIdSet.EMPTY;
Expand All @@ -74,23 +87,21 @@ public static DocIdSet toCacheable(AtomicReader reader, @Nullable DocIdSet set)
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
return DocIdSet.EMPTY;
}
if (set instanceof FixedBitSet) {
return set;
}
// TODO: should we use WAH8DocIdSet like Lucene?
FixedBitSet fixedBitSet = new FixedBitSet(reader.maxDoc());
do {
fixedBitSet.set(doc);
doc = it.nextDoc();
} while (doc != DocIdSetIterator.NO_MORE_DOCS);
return fixedBitSet;
// We use WAH8DocIdSet like Lucene does by default
// Compared to FixedBitset, it doesn't have random access but is faster
// to iterate on, better compressed, and skips faster thanks to an index
WAH8DocIdSet.Builder builder = new WAH8DocIdSet.Builder();
builder.add(doc);
builder.add(it);
return builder.build();
}

/**
* Gets a set to bits.
* Gives random-access to a {@link DocIdSet}, potentially copying the
* {@link DocIdSet} to another data-structure that gives random access.
*/
public static Bits toSafeBits(AtomicReader reader, @Nullable DocIdSet set) throws IOException {
if (set == null) {
if (isEmpty(set)) {
return new Bits.MatchNoBits(reader.maxDoc());
}
Bits bits = set.bits();
Expand Down
Expand Up @@ -107,7 +107,7 @@ public Bits bits() throws IOException {

@Override
public DocIdSetIterator iterator() throws IOException {
if (!DocIdSets.isFastIterator(innerSet) && liveDocs instanceof FixedBitSet) {
if (!DocIdSets.hasFasterIteratorThanRandomAccess(innerSet) && liveDocs instanceof FixedBitSet) {
// might as well iterate over the live docs..., since the iterator is not fast enough
// but we can only do that if we have Bits..., in short, we reverse the order...
Bits bits = innerSet.bits();
Expand Down
Expand Up @@ -101,7 +101,7 @@ public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws
}
}
Bits bits = null;
if (!DocIdSets.isFastIterator(set)) {
if (!DocIdSets.hasFasterIteratorThanRandomAccess(set)) {
bits = set.bits();
}
results.add(new ResultClause(set, bits, clause));
Expand Down
Expand Up @@ -225,7 +225,7 @@ public Scorer filteredScorer(AtomicReaderContext context, Weight weight, DocIdSe
// CHANGE: handle "default" value
if (threshold == -1) {
// default value, don't iterate on only apply filter after query if its not a "fast" docIdSet
if (!DocIdSets.isFastIterator(ApplyAcceptedDocsFilter.unwrap(docIdSet))) {
if (!DocIdSets.hasFasterIteratorThanRandomAccess(ApplyAcceptedDocsFilter.unwrap(docIdSet))) {
return FilteredQuery.QUERY_FIRST_FILTER_STRATEGY.filteredScorer(context, weight, docIdSet);
}
}
Expand Down
Expand Up @@ -42,8 +42,6 @@ public EntriesStats(long sizeInBytes, long count) {
// we need to "inject" the index service to not create cyclic dep
void setIndexService(IndexService indexService);

String type();

Filter cache(Filter filterToCache);

void clear(Object reader);
Expand Down
Expand Up @@ -44,11 +44,6 @@ public void setIndexService(IndexService indexService) {
// nothing to do here...
}

@Override
public String type() {
return "none";
}

@Override
public void close() {
// nothing to do here
Expand Down
Expand Up @@ -22,14 +22,17 @@
import com.google.common.cache.Cache;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.Weigher;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.WAH8DocIdSet;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
Expand All @@ -53,6 +56,11 @@
import java.io.IOException;
import java.util.concurrent.ConcurrentMap;

/**
* Default implementation for a filter cache. Evictions are weighted based on
* the amount of memory that cache entries require. Filters are cached using
* {@link WAH8DocIdSet}.
*/
public class WeightedFilterCache extends AbstractIndexComponent implements FilterCache, SegmentReader.CoreClosedListener, IndexReader.ReaderClosedListener {

final IndicesFilterCache indicesFilterCache;
Expand All @@ -71,11 +79,6 @@ public void setIndexService(IndexService indexService) {
this.indexService = indexService;
}

@Override
public String type() {
return "weighted";
}

@Override
public void close() throws ElasticsearchException {
clear("close");
Expand Down Expand Up @@ -141,6 +144,13 @@ public Filter cache(Filter filterToCache) {
return new FilterCacheFilterWrapper(filterToCache, this);
}

/**
* Given a {@link DocIdSet}, return a cacheable {@link DocIdSet}.
*/
protected DocIdSet doCache(AtomicReader reader, DocIdSet set) throws IOException {
return DocIdSets.toCacheable(reader, set);
}

static class FilterCacheFilterWrapper extends CachedFilter {

private final Filter filter;
Expand Down Expand Up @@ -174,7 +184,10 @@ public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws
// we can't pass down acceptedDocs provided, because we are caching the result, and acceptedDocs
// might be specific to a query. We don't pass the live docs either because a cache built for a specific
// generation of a segment might be reused by an older generation which has fewer deleted documents
cacheValue = DocIdSets.toCacheable(context.reader(), filter.getDocIdSet(context, null));
cacheValue = cache.doCache(context.reader(), filter.getDocIdSet(context, null));
if (!cacheValue.isCacheable()) {
throw new ElasticsearchIllegalStateException("doCache must return cacheable filters, got " + cacheValue);
}
// we might put the same one concurrently, that's fine, it will be replaced and the removal
// will be called
ShardId shardId = ShardUtils.extractShardId(context.reader());
Expand Down
Expand Up @@ -91,7 +91,7 @@ public void onRefreshSettings(Settings settings) {
public IndicesFilterCache(Settings settings, ThreadPool threadPool, NodeSettingsService nodeSettingsService) {
super(settings);
this.threadPool = threadPool;
this.size = componentSettings.get("size", "10%");
this.size = componentSettings.get("size", "5%");
this.expire = componentSettings.getAsTime("expire", null);
this.cleanInterval = componentSettings.getAsTime("clean_interval", TimeValue.timeValueSeconds(60));
computeSizeInBytes();
Expand Down

0 comments on commit 8d39d48

Please sign in to comment.