Skip to content

Commit

Permalink
Phrase Suggester: Add option to filter out phrase suggestions not mat…
Browse files Browse the repository at this point in the history
…ching any documents for a given query

The newly added filter option lets the user provide a template query, which is executed for every
phrase suggestion generated to ensure that the suggestion matches at least one document for the query.
The filter query is only executed on the local node for now. When the new filter option is used, the
`size` of the suggestion is restricted to at most 20.

Closes elastic#3482
  • Loading branch information
areek committed Jul 7, 2014
1 parent 34893c0 commit 535f68c
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 4 deletions.
7 changes: 7 additions & 0 deletions docs/reference/search/suggesters/phrase-suggest.asciidoc
Expand Up @@ -161,6 +161,13 @@ can contain misspellings (See parameter descriptions below).
in a row are changed the entire phrase of changed tokens
is wrapped rather than each token.

`filter`::
Sets a template query used to filter out suggestions for which
the query matches no documents. If provided, the template must
contain the variable `suggestion`, which is populated with each
phrase suggestion before the query is run. If provided, the `size`
parameter can be at most `20`.

==== Smoothing Models

The `phrase` suggester supports multiple smoothing models to balance
Expand Down
Expand Up @@ -23,10 +23,12 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.script.CompiledScript;
import org.elasticsearch.search.suggest.SuggestContextParser;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.SuggestionSearchContext;
Expand All @@ -37,6 +39,7 @@
public final class PhraseSuggestParser implements SuggestContextParser {

private PhraseSuggester suggester;
private static final int MAX_RESULT_SIZE_FILTER = 20;

public PhraseSuggestParser(PhraseSuggester suggester) {
this.suggester = suggester;
Expand Down Expand Up @@ -83,6 +86,23 @@ public SuggestionSearchContext.SuggestionContext parse(XContentParser parser, Ma
throw new ElasticsearchIllegalArgumentException("token_limit must be >= 1");
}
suggestion.setTokenLimit(tokenLimit);
} else if ("filter".equals(fieldName)) {
// The filter may be given either as an inline object (copied verbatim into a string)
// or as a plain text value; both forms end up as the mustache template source.
String templateNameOrTemplateContent;
if (token == XContentParser.Token.START_OBJECT && !parser.hasTextCharacters()) {
    XContentBuilder builder = XContentBuilder.builder(parser.contentType().xContent());
    builder.copyCurrentStructure(parser);
    templateNameOrTemplateContent = builder.string();
} else {
    templateNameOrTemplateContent = parser.text();
}
if (templateNameOrTemplateContent == null) {
    throw new ElasticsearchIllegalArgumentException("no template found in filter field");
}
CompiledScript compiledScript = suggester.scriptService().compile("mustache", templateNameOrTemplateContent);
suggestion.setFilterQueryScript(compiledScript);
// A size equal to the limit is allowed: the message and the reference documentation both
// state "size must be <= 20", so only sizes strictly above the limit are rejected.
// NOTE(review): this check only sees the size known at the moment the filter field is
// parsed; presumably a size field appearing later in the request is not re-checked - confirm.
if (suggestion.getSize() > MAX_RESULT_SIZE_FILTER) {
    throw new ElasticsearchIllegalArgumentException("size must be <= " + MAX_RESULT_SIZE_FILTER + " for using filter");
}
} else {
throw new ElasticsearchIllegalArgumentException("suggester[phrase] doesn't support field [" + fieldName + "]");
}
Expand Down
Expand Up @@ -27,8 +27,15 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.action.search.*;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.text.StringText;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.script.CompiledScript;
import org.elasticsearch.script.ExecutableScript;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.search.suggest.Suggest.Suggestion;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
Expand All @@ -37,11 +44,22 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
private final BytesRef SEPARATOR = new BytesRef(" ");

private static final String SUGGESTION_TEMPLATE_VAR_NAME = "suggestion";
private final Client client;
private final ScriptService scriptService;

/**
 * Creates the phrase suggester with the collaborators it needs for filtering suggestions.
 *
 * @param client        client used to run the per-suggestion filter searches
 * @param scriptService service used to turn the compiled filter template into an executable query
 */
@Inject
public PhraseSuggester(Client client, ScriptService scriptService) {
    this.scriptService = scriptService;
    this.client = client;
}

/*
* More Ideas:
* - add ability to find whitespace problems -> we can build a poor mans decompounder with our index based on a automaton?
Expand Down Expand Up @@ -84,7 +102,18 @@ public Suggestion<? extends Entry<? extends Option>> innerExecute(String name, P
response.addTerm(resultEntry);

BytesRef byteSpare = new BytesRef();
for (Correction correction : checkerResult.corrections) {
CompiledScript filterQueryScript = suggestion.getFilterQueryScript();
MultiSearchResponse multiSearchResponse = null;
if (filterQueryScript != null) {
multiSearchResponse = fetchMatchingDocCountResponses(checkerResult.corrections, filterQueryScript, byteSpare, spare);
}
for (int i = 0; i < checkerResult.corrections.length; i++) {
if (multiSearchResponse != null) {
if (!hasMatchingDocs(multiSearchResponse, i)) {
continue;
}
}
Correction correction = checkerResult.corrections[i];
UnicodeUtil.UTF8toUTF16(correction.join(SEPARATOR, byteSpare, null, null), spare);
Text phrase = new StringText(spare.toString());
Text highlighted = null;
Expand All @@ -104,6 +133,43 @@ private PhraseSuggestion.Entry buildResultEntry(PhraseSuggestionContext suggesti
UnicodeUtil.UTF8toUTF16(suggestion.getText(), spare);
return new PhraseSuggestion.Entry(new StringText(spare.toString()), 0, spare.length, cutoffScore);
}

/**
 * Renders the filter template once per correction, using the correction's joined phrase as
 * the "suggestion" variable, and runs all resulting count searches as one multi search.
 *
 * @param corrections       candidate corrections; one search request is added per element, in iteration order
 * @param filterQueryScript compiled filter template to render for each correction
 * @param byteSpare         scratch buffer used when joining a correction's tokens
 * @param spare             scratch buffer receiving the UTF-16 form of the joined phrase
 * @return the multi search response, or <code>null</code> when there are no corrections
 */
private MultiSearchResponse fetchMatchingDocCountResponses(Correction[] corrections, CompiledScript filterQueryScript, BytesRef byteSpare, CharsRef spare) {
    final Map<String, String> templateParams = new HashMap<>(1);
    final MultiSearchRequestBuilder multiSearch = client.prepareMultiSearch();
    for (int i = 0; i < corrections.length; i++) {
        UnicodeUtil.UTF8toUTF16(corrections[i].join(SEPARATOR, byteSpare, null, null), spare);
        // The same map is reused; the single template variable is overwritten for every correction.
        templateParams.put(SUGGESTION_TEMPLATE_VAR_NAME, spare.toString());
        final ExecutableScript renderer = scriptService.executable(filterQueryScript, templateParams);
        final BytesReference renderedQuery = (BytesReference) renderer.run();
        final SearchRequestBuilder countRequest = client.prepareSearch()
                .setPreference("_only_local")
                .setQuery(renderedQuery)
                .setSearchType(SearchType.COUNT);
        multiSearch.add(countRequest);
    }
    // Nothing was added when the array is empty, so the multi search is not executed at all.
    return corrections.length == 0 ? null : multiSearch.get();
}

/**
 * Tells whether the search at the given position of the multi search response succeeded
 * and reported at least one hit.
 *
 * @param multiSearchResponse response holding one item per executed search
 * @param index               position of the item to inspect
 * @return <code>true</code> if the item is not a failure and its total hit count is above zero;
 *         a failed item counts as "no matching documents"
 */
private boolean hasMatchingDocs(MultiSearchResponse multiSearchResponse, int index) {
    final MultiSearchResponse.Item candidate = multiSearchResponse.getResponses()[index];
    if (candidate.isFailure()) {
        return false;
    }
    return candidate.getResponse().getHits().totalHits() > 0;
}

/**
 * Returns the script service this suggester was constructed with.
 */
ScriptService scriptService() {
    return this.scriptService;
}

@Override
public String[] names() {
Expand Down
Expand Up @@ -42,6 +42,7 @@ public final class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSugge
private Integer tokenLimit;
private String preTag;
private String postTag;
private String filter;

public PhraseSuggestionBuilder(String name) {
super(name, "phrase");
Expand Down Expand Up @@ -166,6 +167,14 @@ public PhraseSuggestionBuilder highlight(String preTag, String postTag) {
return this;
}

/**
 * Sets a template query used for filtering out suggested phrases.
 * The string is written as the <code>filter</code> field of the suggestion request. Per the
 * reference documentation the template must contain the <code>suggestion</code> variable, and
 * using a filter limits <code>size</code> to at most 20.
 *
 * @param filter the filter template query; <code>null</code> leaves the field out of the request
 * @return this builder, for chaining
 */
public PhraseSuggestionBuilder filter(String filter) {
this.filter = filter;
return this;
}

@Override
public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
if (realWordErrorLikelihood != null) {
Expand Down Expand Up @@ -210,6 +219,9 @@ public XContentBuilder innerToXContent(XContentBuilder builder, Params params) t
builder.field("post_tag", postTag);
builder.endObject();
}
if (filter != null) {
builder.field("filter", filter);
}
return builder;
}

Expand Down Expand Up @@ -610,4 +622,4 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws

}

}
}
Expand Up @@ -24,7 +24,9 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.script.CompiledScript;
import org.elasticsearch.search.suggest.DirectSpellcheckerSettings;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.search.suggest.Suggester;
import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;

Expand All @@ -40,6 +42,7 @@ class PhraseSuggestionContext extends SuggestionContext {
private int tokenLimit = NoisyChannelSpellChecker.DEFAULT_TOKEN_LIMIT;
private BytesRef preTag;
private BytesRef postTag;
private CompiledScript filterQueryScript;

private WordScorer.WordScorerFactory scorer;

Expand Down Expand Up @@ -180,4 +183,13 @@ public void setPostTag(BytesRef postTag) {
/**
 * Returns the post tag currently held by this context.
 */
public BytesRef getPostTag() {
    return this.postTag;
}

/**
 * Returns the compiled filter query template, or <code>null</code> when none has been set.
 */
CompiledScript getFilterQueryScript() {
    return this.filterQueryScript;
}

/**
 * Stores the compiled filter query template on this context.
 *
 * @param filterQueryScript the compiled template to keep; replaces any previously stored one
 */
void setFilterQueryScript(CompiledScript filterQueryScript) {
this.filterQueryScript = filterQueryScript;
}

}
Expand Up @@ -1093,7 +1093,82 @@ public void suggestWithManyCandidates() throws InterruptedException, ExecutionEx
assertSuggestion(searchSuggest, 0, 0, "title", "united states house of representatives elections in washington 2006");
// assertThat(total, lessThan(1000L)); // Takes many seconds without fix - just for debugging
}


/**
 * Exercises the phrase suggester's <code>filter</code> option: without a filter the request
 * yields ten suggestions, with a match_phrase filter on the title field only two remain, and
 * an input whose suggestions match no document yields none.
 */
@Test
public void suggestPhrasesInIndex() throws InterruptedException, ExecutionException, IOException {
    // A single shard will help to keep the tests repeatable.
    CreateIndexRequestBuilder createIndex = prepareCreate("test").setSettings(settingsBuilder()
            .put(indexSettings())
            .put(SETTING_NUMBER_OF_SHARDS, 1)
            .put("index.analysis.analyzer.text.tokenizer", "standard")
            .putArray("index.analysis.analyzer.text.filter", "lowercase", "my_shingle")
            .put("index.analysis.filter.my_shingle.type", "shingle")
            .put("index.analysis.filter.my_shingle.output_unigrams", true)
            .put("index.analysis.filter.my_shingle.min_shingle_size", 2)
            .put("index.analysis.filter.my_shingle.max_shingle_size", 3));

    // The title field is analyzed with the shingle analyzer configured above.
    XContentBuilder titleMapping = XContentFactory.jsonBuilder()
            .startObject()
                .startObject("type1")
                    .startObject("properties")
                        .startObject("title")
                            .field("type", "string")
                            .field("analyzer", "text")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject();
    assertAcked(createIndex.addMapping("type1", titleMapping));
    ensureGreen();

    String[] titles = {
            "United States House of Representatives Elections in Washington 2006",
            "United States House of Representatives Elections in Washington 2005",
            "State",
            "Houses of Parliament",
            "Representative Government",
            "Election"
    };
    List<IndexRequestBuilder> indexRequests = new ArrayList<>();
    for (int i = 0; i < titles.length; i++) {
        indexRequests.add(client().prepareIndex("test", "type1").setSource("title", titles[i]));
    }
    indexRandom(true, indexRequests);

    String fullTitleQuery = "united states house of representatives elections in washington 2006";

    // suggest without filtering
    PhraseSuggestionBuilder suggest = phraseSuggestion("title")
            .field("title")
            .addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("title")
                    .suggestMode("always")
                    .maxTermFreq(.99f)
                    .size(10)
                    .maxInspections(200)
            )
            .confidence(0f)
            .maxErrors(2f)
            .shardSize(30000)
            .size(10);
    Suggest unfiltered = searchSuggest(fullTitleQuery, suggest);
    assertSuggestionSize(unfiltered, 0, 10, "title");

    // suggest with filtering
    XContentBuilder filterTemplate = XContentFactory.jsonBuilder()
            .startObject()
                .startObject("match_phrase")
                    .field("title", "{{suggestion}}")
                .endObject()
            .endObject();
    String filterString = filterTemplate.string();
    PhraseSuggestionBuilder filteredSuggest = suggest.filter(filterString);
    Suggest filtered = searchSuggest(fullTitleQuery, filteredSuggest);
    assertSuggestionSize(filtered, 0, 2, "title");

    // filtered suggest with no result (boundary case)
    Suggest filteredEmpty = searchSuggest("Elections of Representatives Parliament", filteredSuggest);
    assertSuggestionSize(filteredEmpty, 0, 0, "title");

}

/**
 * Convenience overload that delegates to the two-argument <code>searchSuggest</code>,
 * passing <code>null</code> as its first argument.
 * NOTE(review): the first argument is presumably the suggest text, as in the calls that pass a
 * query string - confirm against the two-argument overload.
 *
 * @param suggestion the suggestion builders to execute
 * @return whatever the two-argument overload returns for these builders
 */
protected Suggest searchSuggest(SuggestionBuilder<?>... suggestion) {
return searchSuggest(null, suggestion);
}
Expand Down

0 comments on commit 535f68c

Please sign in to comment.