Skip to content

Commit

Permalink
Add 'min_input_len' to completion suggester
Browse files Browse the repository at this point in the history
Restrict the size of the input length to a reasonable size otherwise very
long strings can cause StackOverflowExceptions deep down in lucene land.
Note that this is simply a safety limit, set to `50` UTF-16 code points by default.
This limit is only present at index time and not at query time. If prefix
completions > 50 UTF-16 codepoints are expected / desired this limit should be raised.
Critical string sizes are beyond the 1k UTF-16 code points limit.

Closes #3596
  • Loading branch information
s1monw committed Sep 3, 2013
1 parent eed7f0b commit eb2fed8
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 21 deletions.
10 changes: 9 additions & 1 deletion docs/reference/search/suggesters/completion-suggest.asciidoc
Expand Up @@ -65,7 +65,7 @@ Mapping supports the following parameters:
`payloads`::
Enables the storing of payloads, defaults to `false`

`preserve_separators`:
`preserve_separators`::
Preserves the separators, defaults to `true`.
If disabled, you could find a field starting with `Foo Fighters`, if you
suggest for `foof`.
Expand All @@ -78,6 +78,14 @@ Mapping supports the following parameters:
`The Beatles`, no need to change a simple analyzer, if you are able to
enrich your data.

`max_input_len`::
Limits the length of a single input, defaults to `50` UTF-16 code points.
This limit is only used at index time to reduce the total number of
characters per input string in order to prevent massive inputs from
bloating the underlying data structure. Most use cases won't be affected
by the default value, since prefix completions seldom grow beyond prefixes
longer than a handful of characters.

==== Indexing

[source,js]
Expand Down
Expand Up @@ -31,16 +31,14 @@
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.fielddata.FieldDataType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperException;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.*;
import org.elasticsearch.index.similarity.SimilarityProvider;
import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.CompletionPostingsFormatProvider;
import org.elasticsearch.search.suggest.completion.CompletionTokenStream;

import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.Map;

Expand All @@ -62,6 +60,7 @@ public static class Defaults extends AbstractFieldMapper.Defaults {
public static final boolean DEFAULT_PRESERVE_SEPARATORS = true;
public static final boolean DEFAULT_POSITION_INCREMENTS = true;
public static final boolean DEFAULT_HAS_PAYLOADS = false;
public static final int DEFAULT_MAX_INPUT_LENGTH = 50;
}

public static class Fields {
Expand All @@ -72,6 +71,7 @@ public static class Fields {
public static final String PRESERVE_POSITION_INCREMENTS = "preserve_position_increments";
public static final String PAYLOADS = "payloads";
public static final String TYPE = "type";
public static final String MAX_INPUT_LENGTH = "max_input_len";
}

public static class Builder extends AbstractFieldMapper.OpenBuilder<Builder, CompletionFieldMapper> {
Expand All @@ -81,6 +81,7 @@ public static class Builder extends AbstractFieldMapper.OpenBuilder<Builder, Com
private boolean preserveSeparators = Defaults.DEFAULT_PRESERVE_SEPARATORS;
private boolean payloads = Defaults.DEFAULT_HAS_PAYLOADS;
private boolean preservePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS;
private int maxInputLength = Defaults.DEFAULT_MAX_INPUT_LENGTH;

public Builder(String name) {
super(name, Defaults.FIELD_TYPE);
Expand Down Expand Up @@ -110,10 +111,19 @@ public Builder preservePositionIncrements(boolean preservePositionIncrements) {
this.preservePositionIncrements = preservePositionIncrements;
return this;
}

/**
 * Sets the maximum allowed length for a single completion input; inputs
 * longer than this are truncated at index time.
 *
 * @param maxInputLength the new limit, must be positive
 * @return this builder, for chaining
 * @throws ElasticSearchIllegalArgumentException if the limit is not positive
 */
public Builder maxInputLength(int maxInputLength) {
    if (maxInputLength > 0) {
        this.maxInputLength = maxInputLength;
        return this;
    }
    throw new ElasticSearchIllegalArgumentException(Fields.MAX_INPUT_LENGTH + " must be > 0 but was [" + maxInputLength + "]");
}

@Override
public CompletionFieldMapper build(Mapper.BuilderContext context) {
return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads, preserveSeparators, preservePositionIncrements);
return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads,
preserveSeparators, preservePositionIncrements, maxInputLength);
}
}

Expand All @@ -129,18 +139,23 @@ public static class TypeParser implements Mapper.TypeParser {
continue;
}
if (fieldName.equals("analyzer")) {
builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
NamedAnalyzer analyzer = getNamedAnalyzer(parserContext, fieldNode.toString());
builder.indexAnalyzer(analyzer);
builder.searchAnalyzer(analyzer);
} else if (fieldName.equals(Fields.INDEX_ANALYZER) || fieldName.equals("indexAnalyzer")) {
builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
builder.indexAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString()));
} else if (fieldName.equals(Fields.SEARCH_ANALYZER) || fieldName.equals("searchAnalyzer")) {
builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
builder.searchAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString()));
} else if (fieldName.equals(Fields.PAYLOADS)) {
builder.payloads(Boolean.parseBoolean(fieldNode.toString()));
} else if (fieldName.equals(Fields.PRESERVE_SEPARATORS) || fieldName.equals("preserveSeparators")) {
builder.preserveSeparators(Boolean.parseBoolean(fieldNode.toString()));
} else if (fieldName.equals(Fields.PRESERVE_POSITION_INCREMENTS) || fieldName.equals("preservePositionIncrements")) {
builder.preservePositionIncrements(Boolean.parseBoolean(fieldNode.toString()));
} else if (fieldName.equals(Fields.MAX_INPUT_LENGTH) || fieldName.equals("maxInputLen")) {
builder.maxInputLength(Integer.parseInt(fieldNode.toString()));
} else {
throw new MapperParsingException("Unknown field [" + fieldName + "]");
}
}

Expand All @@ -155,6 +170,14 @@ public static class TypeParser implements Mapper.TypeParser {
builder.postingsFormat(parserContext.postingFormatService().get("default"));
return builder;
}

/**
 * Resolves an analyzer by name via the parser context's analysis service.
 *
 * @throws ElasticSearchIllegalArgumentException if no default or mapped
 *         analyzer is registered under the given name
 */
private NamedAnalyzer getNamedAnalyzer(ParserContext parserContext, String name) {
    NamedAnalyzer analyzer = parserContext.analysisService().analyzer(name);
    if (analyzer == null) {
        // Fail fast at mapping-parse time rather than letting a null analyzer
        // propagate into the field mapper.
        throw new ElasticSearchIllegalArgumentException("Can't find default or mapped analyzer with name [" + name +"]");
    }
    return analyzer;
}
}

private static final BytesRef EMPTY = new BytesRef();
Expand All @@ -164,15 +187,17 @@ public static class TypeParser implements Mapper.TypeParser {
private final boolean payloads;
private final boolean preservePositionIncrements;
private final boolean preserveSeparators;
private int maxInputLength;

public CompletionFieldMapper(Names names, NamedAnalyzer indexAnalyzer, NamedAnalyzer searchAnalyzer, PostingsFormatProvider provider, SimilarityProvider similarity, boolean payloads,
boolean preserveSeparators, boolean preservePositionIncrements) {
boolean preserveSeparators, boolean preservePositionIncrements, int maxInputLength) {
super(names, 1.0f, Defaults.FIELD_TYPE, indexAnalyzer, searchAnalyzer, provider, similarity, null);
analyzingSuggestLookupProvider = new AnalyzingCompletionLookupProvider(preserveSeparators, false, preservePositionIncrements, payloads);
this.completionPostingsFormatProvider = new CompletionPostingsFormatProvider("completion", provider, analyzingSuggestLookupProvider);
this.preserveSeparators = preserveSeparators;
this.payloads = payloads;
this.preservePositionIncrements = preservePositionIncrements;
this.maxInputLength = maxInputLength;
}


Expand Down Expand Up @@ -251,8 +276,20 @@ public void parse(ParseContext context) throws IOException {
}

/**
 * Builds the Lucene suggest field for a single completion input. Inputs
 * longer than {@code maxInputLength} (measured in UTF-16 code units via
 * {@link String#length()}) are truncated first; per the commit intent, this
 * prevents very long strings from causing stack overflows deep inside Lucene.
 */
public Field getCompletionField(String input, BytesRef payload) {
    if (input.length() > maxInputLength) {
        // Never cut in the middle of a surrogate pair.
        // NOTE(review): Math.min is redundant here — this branch already
        // guarantees input.length() > maxInputLength.
        final int len = correctSubStringLen(input, Math.min(maxInputLength, input.length()));
        input = input.substring(0, len);
    }
    return new SuggestField(names().fullName(), input, this.fieldType, payload, analyzingSuggestLookupProvider);
}

/**
 * Adjusts a prospective truncation length so that cutting {@code input} at
 * that length never splits a UTF-16 surrogate pair: when the code unit just
 * before the cut is a high surrogate, the cut is moved one unit further so
 * the matching low surrogate is kept.
 *
 * @param input the string that is about to be truncated
 * @param len   the desired cut position, in UTF-16 code units
 * @return {@code len}, or {@code len + 1} when the cut would otherwise land
 *         in the middle of a surrogate pair
 */
public static int correctSubStringLen(String input, int len) {
    // Guard the charAt accesses: a non-positive len has nothing to correct,
    // and a cut at (or beyond) the end of the string cannot split a pair.
    // Without these guards, len == 0 threw StringIndexOutOfBoundsException
    // and len == input.length() could return an out-of-range length when the
    // string ends in an unpaired high surrogate.
    if (len > 0 && len < input.length() && Character.isHighSurrogate(input.charAt(len - 1))) {
        assert Character.isLowSurrogate(input.charAt(len)) : "high surrogate not followed by low surrogate";
        return len + 1;
    }
    return len;
}

public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException {
return analyzingSuggestLookupProvider.buildPayload(
Expand All @@ -262,6 +299,12 @@ public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload
private static final class SuggestField extends Field {
private final BytesRef payload;
private final CompletionTokenStream.ToFiniteStrings toFiniteStrings;

/**
 * Creates a suggest field from a {@link Reader} value, carrying the
 * suggestion payload and the callback used to expand the analyzed form
 * into its finite strings.
 */
public SuggestField(String name, Reader value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
    super(name, value, type);
    this.payload = payload;
    this.toFiniteStrings = toFiniteStrings;
}

public SuggestField(String name, String value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
super(name, value, type);
Expand All @@ -287,12 +330,11 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
builder.field(Fields.INDEX_ANALYZER, indexAnalyzer.name())
.field(Fields.SEARCH_ANALYZER, searchAnalyzer.name());
}
builder.field(Fields.PAYLOADS, this.payloads)
.field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators)
.field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements)
.endObject();

return builder;
builder.field(Fields.PAYLOADS, this.payloads);
builder.field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators);
builder.field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements);
builder.field(Fields.MAX_INPUT_LENGTH, this.maxInputLength);
return builder.endObject();
}

@Override
Expand Down Expand Up @@ -328,4 +370,22 @@ public String value(Object value) {
public boolean isStoringPayloads() {
return payloads;
}

/**
 * Merges an updated mapping into this field mapper. The payload,
 * position-increment and separator settings cannot be changed on an existing
 * field, so a mismatch on any of them is reported as a merge conflict;
 * {@code max_input_len} is the one setting that may be updated in place.
 */
@Override
public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException {
    super.merge(mergeWith, mergeContext);
    final CompletionFieldMapper other = (CompletionFieldMapper) mergeWith;
    if (payloads != other.payloads) {
        mergeContext.addConflict("mapper [" + names.fullName() + "] has different payload values");
    }
    if (preservePositionIncrements != other.preservePositionIncrements) {
        mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_position_increments' values");
    }
    if (preserveSeparators != other.preserveSeparators) {
        mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_separators' values");
    }
    // Only apply the new limit on a real merge, not during a simulated dry run.
    if (!mergeContext.mergeFlags().simulate()) {
        this.maxInputLength = other.maxInputLength;
    }
}
}
Expand Up @@ -28,6 +28,8 @@
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequestBuilder;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequestBuilder;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse;
import org.elasticsearch.action.count.CountResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.ShardSearchFailure;
Expand All @@ -50,6 +52,15 @@
public class ElasticsearchAssertions {


/**
 * Executes the given put-mapping request and asserts the response was
 * acknowledged by the cluster.
 */
public static void assertAcked(PutMappingRequestBuilder builder) {
    assertAcked(builder.get());
}

/** Asserts that the given put-mapping response was acknowledged. */
private static void assertAcked(PutMappingResponse response) {
    assertThat("Put Mapping failed - not acked", response.isAcknowledged(), equalTo(true));
}

/**
 * Executes the given delete-index request and asserts the response was
 * acknowledged by the cluster.
 */
public static void assertAcked(DeleteIndexRequestBuilder builder) {
    assertAcked(builder.get());
}
Expand Down
Expand Up @@ -91,7 +91,7 @@ public void testCompletionPostingsFormat() throws IOException {
LookupFactory load = provider.load(input);
PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null));
Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null));
List<LookupResult> result = lookup.lookup("ge", false, 10);
assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters"));
assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10"));
Expand Down Expand Up @@ -176,7 +176,7 @@ public BytesRef payload() {

NamedAnalyzer namedAnalzyer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
final CompletionFieldMapper mapper = new CompletionFieldMapper(new Names("foo"), namedAnalzyer, namedAnalzyer, provider, null, usePayloads,
preserveSeparators, preservePositionIncrements);
preserveSeparators, preservePositionIncrements, Integer.MAX_VALUE);
Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights);
Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
field.setAccessible(true);
Expand Down Expand Up @@ -265,7 +265,7 @@ public void testNoDocs() throws IOException {
LookupFactory load = provider.load(input);
PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null)));
assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null)));
dir.close();
}

Expand Down
Expand Up @@ -32,11 +32,13 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.mapper.MapperException;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.search.suggest.completion.CompletionStats;
import org.elasticsearch.search.suggest.completion.CompletionSuggestion;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder;
import org.elasticsearch.search.suggest.completion.CompletionSuggestionFuzzyBuilder;
import org.elasticsearch.test.hamcrest.ElasticsearchAssertions;
import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import org.junit.Test;

Expand Down Expand Up @@ -717,4 +719,60 @@ public void testPrunedSegments() throws IOException {
}
}
}

@Test
public void testMaxFieldLength() throws IOException {
    // Verifies that inputs longer than `max_input_len` are truncated at index
    // time, and that truncation never splits a surrogate pair.
    client().admin().indices().prepareCreate(INDEX).get();
    int iters = atLeast(10);
    for (int i = 0; i < iters; i++) {
        int len = between(3, 50);
        // Generate an input guaranteed to exceed the configured limit.
        String str = randomRealisticUnicodeOfCodepointLengthBetween(len+1, atLeast(len + 2));
        ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
                .startObject(TYPE).startObject("properties")
                .startObject(FIELD)
                .field("type", "completion")
                .field("max_input_len", len)
                // upgrade mapping each time
                .field("analyzer", "keyword")
                .endObject()
                .endObject().endObject()
                .endObject()));
        ensureYellow();
        client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
                .startObject().startObject(FIELD)
                .startArray("input").value(str).endArray()
                .field("output", "foobar")
                .endObject().endObject()
        ).setRefresh(true).get();
        // A prefix shorter than the limit must still hit the indexed input.
        int prefixLen = CompletionFieldMapper.correctSubStringLen(str, between(1, len - 1));
        assertSuggestions(str.substring(0, prefixLen), "foobar");
        if (len + 1 < str.length()) {
            // A prefix just past the (surrogate-corrected) limit must yield no
            // suggestions, because the input was truncated at index time.
            assertSuggestions(str.substring(0, CompletionFieldMapper.correctSubStringLen(str,
                    len + (Character.isHighSurrogate(str.charAt(len-1)) ? 2 : 1))));
        }
    }
}

@Test
// see #3596
public void testVeryLongInput() throws IOException {
    // Indexes an input far beyond the default `max_input_len` (50). Before
    // the default limit existed, this could trigger a StackOverflowException
    // deep inside Lucene (issue #3596); the test passes if indexing succeeds.
    client().admin().indices().prepareCreate(INDEX).get();
    ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
            .startObject(TYPE).startObject("properties")
            .startObject(FIELD)
            .field("type", "completion")
            .endObject()
            .endObject().endObject()
            .endObject()));
    ensureYellow();
    // can cause stack overflow without the default max_input_len
    String longString = randomRealisticUnicodeOfLength(atLeast(5000));
    client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
            .startObject().startObject(FIELD)
            .startArray("input").value(longString).endArray()
            .field("output", "foobar")
            .endObject().endObject()
    ).setRefresh(true).get();

}
}

0 comments on commit eb2fed8

Please sign in to comment.