Limit the number of extracted token instances per query token.
FVH uses recursive logic to extract terms from the documents that
need to be highlighted. For documents that contain a term with a very
large term frequency, i.e. a document that repeats the same term very
often, this can produce very deep stacks when extracting the terms.
Taken to an extreme, this causes StackOverflowErrors once the term
frequency reaches 6000 or more.
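
A minimal, self-contained sketch of the failure mode (hypothetical
code, not the actual FVH implementation; all names are made up): when
each extracted token instance adds a stack frame, the recursion depth
tracks the term frequency and eventually exhausts the thread stack.

// Hypothetical stand-in for the recursive extraction: one stack frame
// per token instance, so the recursion depth equals the term frequency.
public class FvhOverflowSketch {

    static void extractInstance(int remaining) {
        if (remaining <= 0) return;
        // ... record position/offset info for this token instance ...
        extractInstance(remaining - 1);
    }

    public static void main(String[] args) {
        extractInstance(512); // bounded depth, always safe
        try {
            // a depth tracking an extreme term frequency blows the stack
            extractInstance(1_000_000);
        } catch (StackOverflowError e) {
            System.out.println("StackOverflowError at extreme term frequency");
        }
    }
}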

The ultimate solution is an iterative implementation of the
extraction logic, but until then we should protect users from these
massive term extractions, which are likely not very useful in the
first place.
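
As a rough sketch of what such an iterative implementation could look
like (again hypothetical, not part of this commit): replacing the call
stack with a plain loop keeps the stack depth constant no matter how
often a term repeats, so no frequency cap would be needed.

// Hypothetical iterative counterpart: a loop visits every token
// instance with constant stack depth.
static void extractInstancesIteratively(int freq) {
    for (int i = 0; i < freq; i++) {
        // ... record position/offset info for token instance i ...
    }
}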

Closes #3486
s1monw authored and drewr committed Aug 14, 2013
1 parent 5c38d60 commit 0ebe5f3
Showing 2 changed files with 41 additions and 1 deletion.
@@ -114,7 +114,10 @@ public XFieldTermStack( IndexReader reader, int docId, String fieldName, final X
// For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
final float weight = ( float ) ( Math.log( numDocs / ( double ) ( reader.docFreq( new Term(fieldName, text) ) + 1 ) ) + 1.0 );

final int freq = dpEnum.freq();
// ES EDIT: added a safety check that limits extraction to 512 term instances; anything above that is likely meaningless anyway.
// This limit protects the FVH from running into StackOverflowErrors when documents with very large term frequencies are highlighted.
final int freq = Math.min(512, dpEnum.freq());


for(int i = 0;i < freq;i++) {
int pos = dpEnum.nextPosition();
@@ -62,6 +62,43 @@ public class HighlighterSearchTests extends AbstractSharedClusterTest {
protected int numberOfNodes() {
return 4; // why 4?
}

@Test
// see #3486
public void testHighTermFrequencyDoc() throws ElasticSearchException, IOException {
wipeIndex("test");
client().admin().indices().prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.field("type", "string")
.field("term_vector", "with_positions_offsets")
.field("store", randomBoolean() ? "yes" : "no")
.endObject()
.endObject()
.endObject()
.endObject())
.setSettings(ImmutableSettings.settingsBuilder()
.put("index.number_of_shards", between(1, 5)))
.execute().actionGet();
ensureYellow();
StringBuilder builder = new StringBuilder();
for (int i = 0; i < 6000; i++) {
builder.append("abc").append(" ");
}
client().prepareIndex("test", "test", "1")
.setSource(XContentFactory.jsonBuilder()
.startObject()
.field("name", builder.toString())
.endObject())
.execute().actionGet();
refresh();
SearchResponse search = client().prepareSearch().setQuery(constantScoreQuery(matchQuery("name", "abc"))).addHighlightedField("name").execute().actionGet();
assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
}


@Test
public void testNgramHighlightingWithBrokenPositions() throws ElasticSearchException, IOException {
