Skip to content

Commit

Permalink
lucene 4: Exposed Lucene's codec api
Browse files Browse the repository at this point in the history
This feature adds the option to configure a `PostingsFormat` and assign it to a field in the mapping. This is a very expert feature, and in almost all cases Elasticsearch's defaults will suit your needs.

## Configuring a postingsformat per field

Several postings formats are configured by default and can be used in your mapping:
* `direct` - A codec that wraps the default postings format during write time, but loads the terms and postinglists directly into memory during read time as raw arrays. This postings format is exceptionally memory intensive, but can give a substantial increase in search performance.
* `memory` - A codec that loads and stores terms and postinglists in memory using an FST. Acts like a cached postingslist.
* `bloom_default` - Maintains a bloom filter for the indexed terms, which is stored to disk and builds on top of the `default` postings format. This postings format is useful for low document frequency terms and offers a fail fast for seeks to terms that don't exist.
* `bloom_pulsing` - Similar to the `bloom_default` postings format, but builds on top of the `pulsing` postings format.
* `default` - The default postings format. The default if none is specified.

It is possible to configure a `postings_format` attribute on every field. Example mapping:
```
{
  "person" : {
     "properties" : {
         "second_person_id" : {"type" : "string", "postings_format" : "pulsing"}
     }
  }
}
```

## Configuring a custom postingsformat
It is possible to instantiate custom postings formats. These can be specified via the index settings.
```
{
   "codec" : {
      "postings_format" : {
         "my_format" : {
            "type" : "pulsing40",
            "freq_cut_off" : "5"
         }
      }
   }
}
```
In the above example the `freq_cut_off` is set to 5 (defaults to 1). This tells the pulsing postings format to inline the postinglist of terms with a document frequency lower than or equal to 5 in the term dictionary.

Closes #2411
  • Loading branch information
martijnvg committed Nov 14, 2012
1 parent 120560b commit fd5bd10
Show file tree
Hide file tree
Showing 56 changed files with 1,221 additions and 127 deletions.
99 changes: 99 additions & 0 deletions src/main/java/org/elasticsearch/index/codec/CodecModule.java
@@ -0,0 +1,99 @@
package org.elasticsearch.index.codec;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.apache.lucene.codecs.memory.MemoryPostingsFormat;
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.common.inject.Scopes;
import org.elasticsearch.common.inject.assistedinject.FactoryProvider;
import org.elasticsearch.common.inject.multibindings.MapBinder;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatService;
import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvider;

import java.util.List;
import java.util.Map;

/**
*/
public class CodecModule extends AbstractModule {

public static final ImmutableList<PreBuiltPostingsFormatProvider.Factory> preConfiguredPostingFormats;

static {
List<PreBuiltPostingsFormatProvider.Factory> preConfiguredPostingFormatsX = Lists.newArrayList();
// add defaults ones
for (String luceneName : PostingsFormat.availablePostingsFormats()) {
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName)));
}
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("direct", new DirectPostingsFormat()));
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("memory", new MemoryPostingsFormat()));
// LUCENE UPGRADE: Need to change this to the relevant ones on a lucene upgrade
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("pulsing", new Pulsing40PostingsFormat()));
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", new BloomFilteringPostingsFormat(new Pulsing40PostingsFormat())));
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("default", new Lucene40PostingsFormat()));
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("bloom_default", new BloomFilteringPostingsFormat(new Lucene40PostingsFormat())));

preConfiguredPostingFormats = ImmutableList.copyOf(preConfiguredPostingFormatsX);
}

private final Settings indexSettings;

private Map<String, Class<? extends PostingsFormatProvider>> customProviders = Maps.newHashMap();

public CodecModule(Settings indexSettings) {
this.indexSettings = indexSettings;
}

public CodecModule addPostingFormat(String name, Class<? extends PostingsFormatProvider> provider) {
this.customProviders.put(name, provider);
return this;
}

@Override
protected void configure() {

Map<String, Class<? extends PostingsFormatProvider>> postingFormatProviders = Maps.newHashMap(customProviders);

Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format");
for (Map.Entry<String, Settings> entry : postingsFormatsSettings.entrySet()) {
String name = entry.getKey();
Settings settings = entry.getValue();

Class<? extends PostingsFormatProvider> type =
settings.getAsClass("type", null, "org.elasticsearch.index.codec.postingsformat.", "PostingsFormatProvider");

if (type == null) {
// nothing found, see if its in bindings as a binding name
throw new ElasticSearchIllegalArgumentException("PostingsFormat Factory [" + name + "] must have a type associated with it");
}
postingFormatProviders.put(name, type);
}

// now bind
MapBinder<String, PostingsFormatProvider.Factory> postingFormatFactoryBinder
= MapBinder.newMapBinder(binder(), String.class, PostingsFormatProvider.Factory.class);

for (Map.Entry<String, Class<? extends PostingsFormatProvider>> entry : postingFormatProviders.entrySet()) {
postingFormatFactoryBinder.addBinding(entry.getKey()).toProvider(FactoryProvider.newFactory(PostingsFormatProvider.Factory.class, entry.getValue())).in(Scopes.SINGLETON);
}

for (PreBuiltPostingsFormatProvider.Factory factory : preConfiguredPostingFormats) {
if (postingFormatProviders.containsKey(factory.name())) {
continue;
}
postingFormatFactoryBinder.addBinding(factory.name()).toInstance(factory);
}

bind(PostingsFormatService.class).asEagerSingleton();
bind(CodecService.class).asEagerSingleton();
}
}
66 changes: 66 additions & 0 deletions src/main/java/org/elasticsearch/index/codec/CodecService.java
@@ -0,0 +1,66 @@
package org.elasticsearch.index.codec;

import com.google.common.collect.ImmutableMap;
import org.apache.lucene.codecs.Codec;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.AbstractIndexComponent;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatService;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.settings.IndexSettings;

/**
 * Per-index registry of Lucene {@link Codec}s, looked up by name.
 * <p>
 * The registry holds a {@code default} entry plus one entry for each name reported by
 * {@link Codec#availableCodecs()}. When a {@link MapperService} is supplied, the {@code default} entry is a
 * {@link PerFieldMappingPostingFormatCodec} built on the {@code default} postings format; otherwise it is
 * {@link Codec#getDefault()}.
 */
public class CodecService extends AbstractIndexComponent {

    private final PostingsFormatService postingsFormatService;
    private final MapperService mapperService;
    private final ImmutableMap<String, Codec> codecs;

    /**
     * Creates a service with empty index settings and no mapper service.
     */
    public CodecService(Index index) {
        this(index, ImmutableSettings.Builder.EMPTY_SETTINGS);
    }

    /**
     * Creates a service with its own {@link PostingsFormatService} and no mapper service.
     */
    public CodecService(Index index, @IndexSettings Settings indexSettings) {
        this(index, indexSettings, new PostingsFormatService(index, indexSettings), null);
    }

    /**
     * @param index                 the index this service belongs to
     * @param indexSettings         the settings of that index
     * @param postingsFormatService source of the {@code default} postings format
     * @param mapperService         mapper service consulted by the per-field codec; may be {@code null}
     */
    @Inject
    public CodecService(Index index, @IndexSettings Settings indexSettings, PostingsFormatService postingsFormatService,
                        MapperService mapperService) {
        super(index, indexSettings);
        this.postingsFormatService = postingsFormatService;
        this.mapperService = mapperService;

        final Codec defaultCodec = mapperService == null
                ? Codec.getDefault()
                : new PerFieldMappingPostingFormatCodec(mapperService, postingsFormatService.get("default").get());

        final MapBuilder<String, Codec> registry = MapBuilder.<String, Codec>newMapBuilder();
        registry.put("default", defaultCodec);
        for (String codecName : Codec.availableCodecs()) {
            registry.put(codecName, Codec.forName(codecName));
        }
        this.codecs = registry.immutableMap();
    }

    /**
     * @return the postings format service this codec service was created with
     */
    public PostingsFormatService postingsFormatService() {
        return postingsFormatService;
    }

    /**
     * @return the mapper service this codec service was created with, possibly {@code null}
     */
    public MapperService mapperService() {
        return this.mapperService;
    }

    /**
     * Looks up a codec by its registered name.
     *
     * @param name the codec name
     * @return the codec registered under {@code name}
     * @throws ElasticSearchIllegalArgumentException if no codec is registered under that name
     */
    public Codec codec(String name) throws ElasticSearchIllegalArgumentException {
        final Codec found = codecs.get(name);
        if (found != null) {
            return found;
        }
        throw new ElasticSearchIllegalArgumentException("failed to find codec [" + name + "]");
    }

}
@@ -0,0 +1,27 @@
package org.elasticsearch.index.codec;

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.mapper.MapperService;

/**
 * This one is the "default" codec we use.
 * <p>
 * For every field it asks the {@link MapperService} for the field's mapper and uses the postings format
 * configured on that mapper. Fields that have no mapper, or whose mapper has no postings format
 * configured, are written with the default postings format passed to the constructor.
 */
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec {

    private final MapperService mapperService;
    private final PostingsFormat defaultPostingFormat;

    /**
     * @param mapperService        used to resolve the mapper of each field by its index name
     * @param defaultPostingFormat postings format used when a field has no specific one configured
     */
    public PerFieldMappingPostingFormatCodec(MapperService mapperService, PostingsFormat defaultPostingFormat) {
        this.mapperService = mapperService;
        this.defaultPostingFormat = defaultPostingFormat;
    }

    /**
     * Picks the postings format for the given field.
     *
     * @param field the index name of the field
     * @return the postings format configured on the field's mapper, or the default postings format when the
     *         field has no mapper or no postings format configured
     */
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
        // A field without any registered mapper has no per-field configuration. Fall back to the
        // default here instead of dereferencing a null lookup result.
        if (mapperService.indexName(field) == null) {
            return defaultPostingFormat;
        }
        PostingsFormatProvider postingsFormat = mapperService.indexName(field).mapper().postingFormatProvider();
        return postingsFormat != null ? postingsFormat.get() : defaultPostingFormat;
    }
}
@@ -0,0 +1,17 @@
package org.elasticsearch.index.codec.postingsformat;

/**
 * Base class for {@link PostingsFormatProvider} implementations that only takes care of holding the
 * name the provider was created with.
 */
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider {

    /** Name this provider was created with; never reassigned. */
    private final String name;

    /**
     * @param name the name of this postings format provider
     */
    protected AbstractPostingsFormatProvider(String name) {
        this.name = name;
    }

    /**
     * @return the name handed to the constructor
     */
    public String name() {
        return this.name;
    }

}
@@ -0,0 +1,78 @@
package org.elasticsearch.index.codec.postingsformat;

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilterFactory;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.bloom.FuzzySet;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.settings.IndexSettings;

import java.util.Map;

/**
 * Provider of a {@link BloomFilteringPostingsFormat} that wraps another, configurable postings format.
 * <p>
 * Recognised settings:
 * <ul>
 * <li>{@code desired_max_saturation} (default {@code 0.1}) - passed to the fuzzy set when it is sized,</li>
 * <li>{@code saturation_limit} (default {@code 0.9}) - a bloom filter whose saturation exceeds this value
 * is reported as saturated,</li>
 * <li>{@code delegate} - name of the postings format provider to wrap.</li>
 * </ul>
 */
public class BloomFilterPostingsFormatProvider extends AbstractPostingsFormatProvider {

    private final float desiredMaxSaturation;
    private final float saturationLimit;
    private final PostingsFormatProvider delegate;
    private final BloomFilteringPostingsFormat postingsFormat;

    /**
     * @param indexSettings          the settings of the index, handed to the delegate lookup
     * @param postingFormatFactories the known postings format factories the delegate is looked up in
     * @param name                   the name of this provider
     * @param postingsFormatSettings the settings specific to this postings format
     */
    @Inject
    public BloomFilterPostingsFormatProvider(@IndexSettings Settings indexSettings, @Nullable Map<String, Factory> postingFormatFactories, @Assisted String name, @Assisted Settings postingsFormatSettings) {
        super(name);
        final float maxSaturation = postingsFormatSettings.getAsFloat("desired_max_saturation", 0.1f);
        final float limit = postingsFormatSettings.getAsFloat("saturation_limit", 0.9f);
        final String delegateName = postingsFormatSettings.get("delegate");
        final PostingsFormatProvider wrapped = Helper.lookup(indexSettings, delegateName, postingFormatFactories);

        this.desiredMaxSaturation = maxSaturation;
        this.saturationLimit = limit;
        this.delegate = wrapped;
        this.postingsFormat = new BloomFilteringPostingsFormat(wrapped.get(), new CustomBloomFilterFactory(maxSaturation, limit));
    }

    /**
     * @return the configured {@code desired_max_saturation}
     */
    public float desiredMaxSaturation() {
        return this.desiredMaxSaturation;
    }

    /**
     * @return the configured {@code saturation_limit}
     */
    public float saturationLimit() {
        return this.saturationLimit;
    }

    /**
     * @return the provider of the wrapped postings format
     */
    public PostingsFormatProvider delegate() {
        return this.delegate;
    }

    /**
     * @return the bloom filtering postings format built in the constructor; the same instance on every call
     */
    @Override
    public PostingsFormat get() {
        return this.postingsFormat;
    }

    /**
     * Bloom filter factory driven by the two saturation settings of the enclosing provider.
     */
    static class CustomBloomFilterFactory extends BloomFilterFactory {

        private final float desiredMaxSaturation;
        private final float saturationLimit;

        CustomBloomFilterFactory(float desiredMaxSaturation, float saturationLimit) {
            this.desiredMaxSaturation = desiredMaxSaturation;
            this.saturationLimit = saturationLimit;
        }

        @Override
        public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) {
            // Size the set as if every document in the segment carried a unique term (e.g. a primary
            // key), aiming for about a desiredMaxSaturation fraction of the bits being set.
            return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), this.desiredMaxSaturation);
        }

        @Override
        public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
            // Once more than a saturationLimit fraction of the bits are set, saving the bitset is
            // not worth it - we don't want to throw any more memory at this problem.
            final float saturation = bloomFilter.getSaturation();
            return saturation > this.saturationLimit;
        }
    }
}
@@ -0,0 +1,37 @@
package org.elasticsearch.index.codec.postingsformat;

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;

/**
 * Provider of a {@link DirectPostingsFormat} configured from the postings format settings.
 * <p>
 * Recognised settings: {@code min_skip_count} (default {@code 8}) and {@code low_freq_cutoff}
 * (default {@code 32}).
 */
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider {

    private final int minSkipCount;
    private final int lowFreqCutoff;
    private final DirectPostingsFormat postingsFormat;

    /**
     * @param name                   the name of this provider
     * @param postingsFormatSettings the settings specific to this postings format
     */
    @Inject
    public DirectPostingsFormatProvider(@Assisted String name, @Assisted Settings postingsFormatSettings) {
        super(name);
        // See DirectPostingsFormat#DEFAULT_MIN_SKIP_COUNT
        final int skipCount = postingsFormatSettings.getAsInt("min_skip_count", 8);
        // See DirectPostingsFormat#DEFAULT_LOW_FREQ_CUTOFF
        final int freqCutoff = postingsFormatSettings.getAsInt("low_freq_cutoff", 32);

        this.minSkipCount = skipCount;
        this.lowFreqCutoff = freqCutoff;
        this.postingsFormat = new DirectPostingsFormat(skipCount, freqCutoff);
    }

    /**
     * @return the configured {@code min_skip_count}
     */
    public int minSkipCount() {
        return this.minSkipCount;
    }

    /**
     * @return the configured {@code low_freq_cutoff}
     */
    public int lowFreqCutoff() {
        return this.lowFreqCutoff;
    }

    /**
     * @return the postings format built in the constructor; the same instance on every call
     */
    @Override
    public PostingsFormat get() {
        return this.postingsFormat;
    }
}
@@ -0,0 +1,38 @@
package org.elasticsearch.index.codec.postingsformat;

import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;

/**
 * Provider of a {@link Lucene40PostingsFormat} configured from the postings format settings.
 * <p>
 * Recognised settings: {@code min_block_size} and {@code max_block_size}, which default to
 * {@link BlockTreeTermsWriter#DEFAULT_MIN_BLOCK_SIZE} and {@link BlockTreeTermsWriter#DEFAULT_MAX_BLOCK_SIZE}.
 */
public class Lucene40PostingsFormatProvider extends AbstractPostingsFormatProvider {

    private final int minBlockSize;
    private final int maxBlockSize;
    private final Lucene40PostingsFormat postingsFormat;

    /**
     * @param name                   the name of this provider
     * @param postingsFormatSettings the settings specific to this postings format
     */
    @Inject
    public Lucene40PostingsFormatProvider(@Assisted String name, @Assisted Settings postingsFormatSettings) {
        super(name);
        final int smallest = postingsFormatSettings.getAsInt("min_block_size", BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE);
        final int largest = postingsFormatSettings.getAsInt("max_block_size", BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);

        this.minBlockSize = smallest;
        this.maxBlockSize = largest;
        this.postingsFormat = new Lucene40PostingsFormat(smallest, largest);
    }

    /**
     * @return the configured {@code min_block_size}
     */
    public int minBlockSize() {
        return this.minBlockSize;
    }

    /**
     * @return the configured {@code max_block_size}
     */
    public int maxBlockSize() {
        return this.maxBlockSize;
    }

    /**
     * @return the postings format built in the constructor; the same instance on every call
     */
    @Override
    public PostingsFormat get() {
        return this.postingsFormat;
    }
}

0 comments on commit fd5bd10

Please sign in to comment.