Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
lucene 4: Exposed Lucene's codec api
This feature adds the option to configure a `PostingsFormat` and assign it to a field in the mapping. This feature is very expert, and in almost all cases Elasticsearch's defaults will suit your needs. ## Configuring a postings format per field There are several postings formats configured by default which can be used in your mapping: * `direct` - A codec that wraps the default postings format during write time, but loads the terms and postings lists directly into memory during read time as raw arrays. This postings format is exceptionally memory intensive, but can give a substantial increase in search performance. * `memory` - A codec that loads and stores terms and postings lists in memory using an FST. Acts like a cached postings list. * `bloom_default` - Maintains a bloom filter for the indexed terms, which is stored to disk and builds on top of the `default` postings format. This postings format is useful for low document frequency terms and offers a fail-fast for seeks to terms that don't exist. * `bloom_pulsing` - Similar to the `bloom_default` postings format, but builds on top of the `pulsing` postings format. * `default` - The default postings format, used if none is specified. On all fields it is possible to configure a `postings_format` attribute. Example mapping: ``` { "person" : { "properties" : { "second_person_id" : {"type" : "string", "postings_format" : "pulsing"} } } } ``` ## Configuring a custom postings format It is possible to instantiate custom postings formats. This can be specified via the index settings. ``` { "codec" : { "postings_format" : { "my_format" : { "type" : "pulsing40", "freq_cut_off" : "5" } } } } ``` In the above example the `freq_cut_off` is set to 5 (defaults to 1). This tells the pulsing postings format to inline the postings list of terms with a document frequency lower than or equal to 5 in the term dictionary. Closes #2411
- Loading branch information
Showing
56 changed files
with
1,221 additions
and
127 deletions.
There are no files selected for viewing
99 changes: 99 additions & 0 deletions
99
src/main/java/org/elasticsearch/index/codec/CodecModule.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
package org.elasticsearch.index.codec; | ||
|
||
import com.google.common.collect.ImmutableList; | ||
import com.google.common.collect.Lists; | ||
import com.google.common.collect.Maps; | ||
import org.apache.lucene.codecs.PostingsFormat; | ||
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat; | ||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; | ||
import org.apache.lucene.codecs.memory.DirectPostingsFormat; | ||
import org.apache.lucene.codecs.memory.MemoryPostingsFormat; | ||
import org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat; | ||
import org.elasticsearch.ElasticSearchIllegalArgumentException; | ||
import org.elasticsearch.common.inject.AbstractModule; | ||
import org.elasticsearch.common.inject.Scopes; | ||
import org.elasticsearch.common.inject.assistedinject.FactoryProvider; | ||
import org.elasticsearch.common.inject.multibindings.MapBinder; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; | ||
import org.elasticsearch.index.codec.postingsformat.PostingsFormatService; | ||
import org.elasticsearch.index.codec.postingsformat.PreBuiltPostingsFormatProvider; | ||
|
||
import java.util.List; | ||
import java.util.Map; | ||
|
||
/** | ||
*/ | ||
public class CodecModule extends AbstractModule { | ||
|
||
public static final ImmutableList<PreBuiltPostingsFormatProvider.Factory> preConfiguredPostingFormats; | ||
|
||
static { | ||
List<PreBuiltPostingsFormatProvider.Factory> preConfiguredPostingFormatsX = Lists.newArrayList(); | ||
// add defaults ones | ||
for (String luceneName : PostingsFormat.availablePostingsFormats()) { | ||
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName))); | ||
} | ||
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("direct", new DirectPostingsFormat())); | ||
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("memory", new MemoryPostingsFormat())); | ||
// LUCENE UPGRADE: Need to change this to the relevant ones on a lucene upgrade | ||
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("pulsing", new Pulsing40PostingsFormat())); | ||
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", new BloomFilteringPostingsFormat(new Pulsing40PostingsFormat()))); | ||
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("default", new Lucene40PostingsFormat())); | ||
preConfiguredPostingFormatsX.add(new PreBuiltPostingsFormatProvider.Factory("bloom_default", new BloomFilteringPostingsFormat(new Lucene40PostingsFormat()))); | ||
|
||
preConfiguredPostingFormats = ImmutableList.copyOf(preConfiguredPostingFormatsX); | ||
} | ||
|
||
private final Settings indexSettings; | ||
|
||
private Map<String, Class<? extends PostingsFormatProvider>> customProviders = Maps.newHashMap(); | ||
|
||
public CodecModule(Settings indexSettings) { | ||
this.indexSettings = indexSettings; | ||
} | ||
|
||
public CodecModule addPostingFormat(String name, Class<? extends PostingsFormatProvider> provider) { | ||
this.customProviders.put(name, provider); | ||
return this; | ||
} | ||
|
||
@Override | ||
protected void configure() { | ||
|
||
Map<String, Class<? extends PostingsFormatProvider>> postingFormatProviders = Maps.newHashMap(customProviders); | ||
|
||
Map<String, Settings> postingsFormatsSettings = indexSettings.getGroups("index.codec.postings_format"); | ||
for (Map.Entry<String, Settings> entry : postingsFormatsSettings.entrySet()) { | ||
String name = entry.getKey(); | ||
Settings settings = entry.getValue(); | ||
|
||
Class<? extends PostingsFormatProvider> type = | ||
settings.getAsClass("type", null, "org.elasticsearch.index.codec.postingsformat.", "PostingsFormatProvider"); | ||
|
||
if (type == null) { | ||
// nothing found, see if its in bindings as a binding name | ||
throw new ElasticSearchIllegalArgumentException("PostingsFormat Factory [" + name + "] must have a type associated with it"); | ||
} | ||
postingFormatProviders.put(name, type); | ||
} | ||
|
||
// now bind | ||
MapBinder<String, PostingsFormatProvider.Factory> postingFormatFactoryBinder | ||
= MapBinder.newMapBinder(binder(), String.class, PostingsFormatProvider.Factory.class); | ||
|
||
for (Map.Entry<String, Class<? extends PostingsFormatProvider>> entry : postingFormatProviders.entrySet()) { | ||
postingFormatFactoryBinder.addBinding(entry.getKey()).toProvider(FactoryProvider.newFactory(PostingsFormatProvider.Factory.class, entry.getValue())).in(Scopes.SINGLETON); | ||
} | ||
|
||
for (PreBuiltPostingsFormatProvider.Factory factory : preConfiguredPostingFormats) { | ||
if (postingFormatProviders.containsKey(factory.name())) { | ||
continue; | ||
} | ||
postingFormatFactoryBinder.addBinding(factory.name()).toInstance(factory); | ||
} | ||
|
||
bind(PostingsFormatService.class).asEagerSingleton(); | ||
bind(CodecService.class).asEagerSingleton(); | ||
} | ||
} |
66 changes: 66 additions & 0 deletions
66
src/main/java/org/elasticsearch/index/codec/CodecService.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package org.elasticsearch.index.codec; | ||
|
||
import com.google.common.collect.ImmutableMap; | ||
import org.apache.lucene.codecs.Codec; | ||
import org.elasticsearch.ElasticSearchIllegalArgumentException; | ||
import org.elasticsearch.common.collect.MapBuilder; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.settings.ImmutableSettings; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.AbstractIndexComponent; | ||
import org.elasticsearch.index.Index; | ||
import org.elasticsearch.index.codec.postingsformat.PostingsFormatService; | ||
import org.elasticsearch.index.mapper.MapperService; | ||
import org.elasticsearch.index.settings.IndexSettings; | ||
|
||
/** | ||
*/ | ||
public class CodecService extends AbstractIndexComponent { | ||
|
||
private final PostingsFormatService postingsFormatService; | ||
private final MapperService mapperService; | ||
private final ImmutableMap<String, Codec> codecs; | ||
|
||
public CodecService(Index index) { | ||
this(index, ImmutableSettings.Builder.EMPTY_SETTINGS); | ||
} | ||
|
||
public CodecService(Index index, @IndexSettings Settings indexSettings) { | ||
this(index, indexSettings, new PostingsFormatService(index, indexSettings), null); | ||
} | ||
|
||
@Inject | ||
public CodecService(Index index, @IndexSettings Settings indexSettings, PostingsFormatService postingsFormatService, | ||
MapperService mapperService) { | ||
super(index, indexSettings); | ||
this.postingsFormatService = postingsFormatService; | ||
this.mapperService = mapperService; | ||
MapBuilder<String, Codec> codecs = MapBuilder.<String, Codec>newMapBuilder(); | ||
if (mapperService == null) { | ||
codecs.put("default", Codec.getDefault()); | ||
} else { | ||
codecs.put("default", new PerFieldMappingPostingFormatCodec(mapperService, postingsFormatService.get("default").get())); | ||
} | ||
for (String codec : Codec.availableCodecs()) { | ||
codecs.put(codec, Codec.forName(codec)); | ||
} | ||
this.codecs = codecs.immutableMap(); | ||
} | ||
|
||
public PostingsFormatService postingsFormatService() { | ||
return this.postingsFormatService; | ||
} | ||
|
||
public MapperService mapperService() { | ||
return mapperService; | ||
} | ||
|
||
public Codec codec(String name) throws ElasticSearchIllegalArgumentException { | ||
Codec codec = codecs.get(name); | ||
if (codec == null) { | ||
throw new ElasticSearchIllegalArgumentException("failed to find codec [" + name + "]"); | ||
} | ||
return codec; | ||
} | ||
|
||
} |
27 changes: 27 additions & 0 deletions
27
src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
package org.elasticsearch.index.codec; | ||
|
||
import org.apache.lucene.codecs.PostingsFormat; | ||
import org.apache.lucene.codecs.lucene40.Lucene40Codec; | ||
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; | ||
import org.elasticsearch.index.mapper.MapperService; | ||
|
||
/** | ||
* This one is the "default" codec we use. | ||
*/ | ||
// LUCENE UPGRADE: make sure to move to a new codec depending on the lucene version | ||
public class PerFieldMappingPostingFormatCodec extends Lucene40Codec { | ||
|
||
private final MapperService mapperService; | ||
private final PostingsFormat defaultPostingFormat; | ||
|
||
public PerFieldMappingPostingFormatCodec(MapperService mapperService, PostingsFormat defaultPostingFormat) { | ||
this.mapperService = mapperService; | ||
this.defaultPostingFormat = defaultPostingFormat; | ||
} | ||
|
||
@Override | ||
public PostingsFormat getPostingsFormatForField(String field) { | ||
PostingsFormatProvider postingsFormat = mapperService.indexName(field).mapper().postingFormatProvider(); | ||
return postingsFormat != null ? postingsFormat.get() : defaultPostingFormat; | ||
} | ||
} |
17 changes: 17 additions & 0 deletions
17
...ain/java/org/elasticsearch/index/codec/postingsformat/AbstractPostingsFormatProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package org.elasticsearch.index.codec.postingsformat; | ||
|
||
/** | ||
*/ | ||
public abstract class AbstractPostingsFormatProvider implements PostingsFormatProvider { | ||
|
||
private final String name; | ||
|
||
protected AbstractPostingsFormatProvider(String name) { | ||
this.name = name; | ||
} | ||
|
||
public String name() { | ||
return name; | ||
} | ||
|
||
} |
78 changes: 78 additions & 0 deletions
78
.../java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormatProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
package org.elasticsearch.index.codec.postingsformat; | ||
|
||
import org.apache.lucene.codecs.PostingsFormat; | ||
import org.apache.lucene.codecs.bloom.BloomFilterFactory; | ||
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat; | ||
import org.apache.lucene.codecs.bloom.FuzzySet; | ||
import org.apache.lucene.index.FieldInfo; | ||
import org.apache.lucene.index.SegmentWriteState; | ||
import org.elasticsearch.common.Nullable; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.inject.assistedinject.Assisted; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.index.settings.IndexSettings; | ||
|
||
import java.util.Map; | ||
|
||
/** | ||
*/ | ||
public class BloomFilterPostingsFormatProvider extends AbstractPostingsFormatProvider { | ||
|
||
private final float desiredMaxSaturation; | ||
private final float saturationLimit; | ||
private final PostingsFormatProvider delegate; | ||
private final BloomFilteringPostingsFormat postingsFormat; | ||
|
||
@Inject | ||
public BloomFilterPostingsFormatProvider(@IndexSettings Settings indexSettings, @Nullable Map<String, Factory> postingFormatFactories, @Assisted String name, @Assisted Settings postingsFormatSettings) { | ||
super(name); | ||
this.desiredMaxSaturation = postingsFormatSettings.getAsFloat("desired_max_saturation", 0.1f); | ||
this.saturationLimit = postingsFormatSettings.getAsFloat("saturation_limit", 0.9f); | ||
this.delegate = Helper.lookup(indexSettings, postingsFormatSettings.get("delegate"), postingFormatFactories); | ||
this.postingsFormat = new BloomFilteringPostingsFormat( | ||
delegate.get(), | ||
new CustomBloomFilterFactory(desiredMaxSaturation, saturationLimit) | ||
); | ||
} | ||
|
||
public float desiredMaxSaturation() { | ||
return desiredMaxSaturation; | ||
} | ||
|
||
public float saturationLimit() { | ||
return saturationLimit; | ||
} | ||
|
||
public PostingsFormatProvider delegate() { | ||
return delegate; | ||
} | ||
|
||
@Override | ||
public PostingsFormat get() { | ||
return postingsFormat; | ||
} | ||
|
||
static class CustomBloomFilterFactory extends BloomFilterFactory { | ||
|
||
private final float desiredMaxSaturation; | ||
private final float saturationLimit; | ||
|
||
CustomBloomFilterFactory(float desiredMaxSaturation, float saturationLimit) { | ||
this.desiredMaxSaturation = desiredMaxSaturation; | ||
this.saturationLimit = saturationLimit; | ||
} | ||
|
||
@Override | ||
public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) { | ||
//Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with desiredMaxSaturation% of bits set | ||
return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), desiredMaxSaturation); | ||
} | ||
|
||
@Override | ||
public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) { | ||
// Don't bother saving bitsets if > saturationLimit % of bits are set - we don't want to | ||
// throw any more memory at this problem. | ||
return bloomFilter.getSaturation() > saturationLimit; | ||
} | ||
} | ||
} |
37 changes: 37 additions & 0 deletions
37
src/main/java/org/elasticsearch/index/codec/postingsformat/DirectPostingsFormatProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package org.elasticsearch.index.codec.postingsformat; | ||
|
||
import org.apache.lucene.codecs.PostingsFormat; | ||
import org.apache.lucene.codecs.memory.DirectPostingsFormat; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.inject.assistedinject.Assisted; | ||
import org.elasticsearch.common.settings.Settings; | ||
|
||
/** | ||
*/ | ||
public class DirectPostingsFormatProvider extends AbstractPostingsFormatProvider { | ||
|
||
private final int minSkipCount; | ||
private final int lowFreqCutoff; | ||
private final DirectPostingsFormat postingsFormat; | ||
|
||
@Inject | ||
public DirectPostingsFormatProvider(@Assisted String name, @Assisted Settings postingsFormatSettings) { | ||
super(name); | ||
this.minSkipCount = postingsFormatSettings.getAsInt("min_skip_count", 8); // See DirectPostingsFormat#DEFAULT_MIN_SKIP_COUNT | ||
this.lowFreqCutoff = postingsFormatSettings.getAsInt("low_freq_cutoff", 32); // See DirectPostingsFormat#DEFAULT_LOW_FREQ_CUTOFF | ||
this.postingsFormat = new DirectPostingsFormat(minSkipCount, lowFreqCutoff); | ||
} | ||
|
||
public int minSkipCount() { | ||
return minSkipCount; | ||
} | ||
|
||
public int lowFreqCutoff() { | ||
return lowFreqCutoff; | ||
} | ||
|
||
@Override | ||
public PostingsFormat get() { | ||
return postingsFormat; | ||
} | ||
} |
38 changes: 38 additions & 0 deletions
38
...ain/java/org/elasticsearch/index/codec/postingsformat/Lucene40PostingsFormatProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
package org.elasticsearch.index.codec.postingsformat; | ||
|
||
import org.apache.lucene.codecs.BlockTreeTermsWriter; | ||
import org.apache.lucene.codecs.PostingsFormat; | ||
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat; | ||
import org.elasticsearch.common.inject.Inject; | ||
import org.elasticsearch.common.inject.assistedinject.Assisted; | ||
import org.elasticsearch.common.settings.Settings; | ||
|
||
/** | ||
*/ | ||
public class Lucene40PostingsFormatProvider extends AbstractPostingsFormatProvider { | ||
|
||
private final int minBlockSize; | ||
private final int maxBlockSize; | ||
private final Lucene40PostingsFormat postingsFormat; | ||
|
||
@Inject | ||
public Lucene40PostingsFormatProvider(@Assisted String name, @Assisted Settings postingsFormatSettings) { | ||
super(name); | ||
this.minBlockSize = postingsFormatSettings.getAsInt("min_block_size", BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE); | ||
this.maxBlockSize = postingsFormatSettings.getAsInt("max_block_size", BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); | ||
this.postingsFormat = new Lucene40PostingsFormat(minBlockSize, maxBlockSize); | ||
} | ||
|
||
public int minBlockSize() { | ||
return minBlockSize; | ||
} | ||
|
||
public int maxBlockSize() { | ||
return maxBlockSize; | ||
} | ||
|
||
@Override | ||
public PostingsFormat get() { | ||
return postingsFormat; | ||
} | ||
} |
Oops, something went wrong.