Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix percent_terms_to_match #7754

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/reference/query-dsl/queries/mlt-query.asciidoc
Expand Up @@ -87,8 +87,8 @@ unless specified otherwise in each `doc`.
|`include` |When using `ids` or `docs`, specifies whether the documents should be
included in the search results. Defaults to `false`.

|`percent_terms_to_match` |The percentage of terms to match on (float
value). Defaults to `0.3` (30 percent).
|`percent_terms_to_match` |From the generated query, the percentage of terms
that must match (float value between 0 and 1). Defaults to `0.3` (30 percent).

|`min_term_freq` |The frequency below which terms will be ignored in the
source doc. The default frequency is `2`.
Expand Down
Expand Up @@ -152,20 +152,21 @@ public Query rewrite(IndexReader reader) throws IOException {

BooleanQuery bq = new BooleanQuery();
if (this.likeFields != null) {
bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD);
Query mltQuery = mlt.like(this.likeFields);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
}
if (this.likeText != null) {
Reader[] readers = new Reader[likeText.length];
for (int i = 0; i < readers.length; i++) {
readers[i] = new FastStringReader(likeText[i]);
}
//LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD);
Query mltQuery = mlt.like(moreLikeFields[0], readers);
setMinimumShouldMatch((BooleanQuery) mltQuery, percentTermsToMatch);
bq.add(mltQuery, BooleanClause.Occur.SHOULD);
}

BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));

bq.setBoost(getBoost());
return bq;
}
Expand Down Expand Up @@ -309,4 +310,9 @@ public float getBoostTermsFactor() {
/**
 * Sets the boost terms factor of this instance.
 *
 * @param boostTermsFactor the value stored in the {@code boostTermsFactor} field
 */
public void setBoostTermsFactor(float boostTermsFactor) {
this.boostTermsFactor = boostTermsFactor;
}

/**
 * Sets the minimum number of clauses that should match on the given boolean query,
 * computed as a fraction of the number of clauses it currently holds.
 *
 * @param bq                  the boolean query to configure
 * @param percentTermsToMatch the fraction of clauses that should match; the product
 *                            with the clause count is truncated to an int by the cast
 */
private static void setMinimumShouldMatch(BooleanQuery bq, float percentTermsToMatch) {
    final int clauseCount = bq.getClauses().length;
    final int minimumShouldMatch = (int) (clauseCount * percentTermsToMatch);
    bq.setMinimumNumberShouldMatch(minimumShouldMatch);
}
}
Expand Up @@ -639,19 +639,17 @@ public Query like(Fields... likeFields) throws IOException {
fieldNames.add(fieldName);
}
}
// to create one query per field name only
// term selection is per field, then appended to a single boolean query
BooleanQuery bq = new BooleanQuery();
for (String fieldName : fieldNames) {
Map<String, Int> termFreqMap = new HashMap<>();
this.setFieldNames(new String[]{fieldName});
for (Fields fields : likeFields) {
Terms vector = fields.terms(fieldName);
if (vector != null) {
addTermFrequencies(termFreqMap, vector);
}
}
Query query = createQuery(createQueue(termFreqMap));
bq.add(query, BooleanClause.Occur.SHOULD);
addToQuery(createQueue(termFreqMap, fieldName), bq);
}
return bq;
}
Expand All @@ -661,6 +659,14 @@ public Query like(Fields... likeFields) throws IOException {
*/
private Query createQuery(PriorityQueue<ScoreTerm> q) {
    // Start from an empty boolean query and let addToQuery populate it from the queue.
    final BooleanQuery result = new BooleanQuery();
    addToQuery(q, result);
    return result;
}

/**
* Add to an existing boolean query the More Like This query from this PriorityQueue
*/
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
ScoreTerm scoreTerm;
float bestScore = -1;

Expand All @@ -682,7 +688,6 @@ private Query createQuery(PriorityQueue<ScoreTerm> q) {
break;
}
}
return query;
}

/**
Expand All @@ -691,6 +696,16 @@ private Query createQuery(PriorityQueue<ScoreTerm> q) {
* @param words a map of words keyed on the word(String) with Int objects as the values.
*/
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
// Delegate to the createQueue(words, fieldNames) overload, passing this instance's fieldNames.
return createQueue(words, this.fieldNames);
}

/**
* Create a PriorityQueue from a word->tf map.
*
* @param words a map of words keyed on the word(String) with Int objects as the values.
* @param fieldNames an array of field names to override defaults.
*/
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException {
// have collected all words in doc and their freqs
int numDocs = ir.numDocs();
final int limit = Math.min(maxQueryTerms, words.size());
Expand Down
Expand Up @@ -1623,6 +1623,43 @@ public void testMoreLikeThisIds() throws Exception {
}
}

@Test
public void testMLTPercentTermsToMatch() throws Exception {
    // Install the mock fetch service used when fetching items.
    MoreLikeThisQueryParser mltParser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
    mltParser.setFetchService(new MockMoreLikeThisFetchService());

    // Parse the ES query from the JSON fixture.
    IndexQueryParserService parserService = queryParser();
    String json = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
    BooleanQuery parsed = (BooleanQuery) parserService.parse(json).query();

    // The first clause is the MLT query; the other clause is for include/exclude items.
    MoreLikeThisQuery mlt = (MoreLikeThisQuery) parsed.getClauses()[0].getQuery();

    // Require all terms to match, and set the minimum word length and
    // minimum document frequency to zero.
    mlt.setPercentTermsToMatch(1.0f);
    mlt.setMinWordLen(0);
    mlt.setMinDocFreq(0);

    // A single in-memory document that has all values.
    MemoryIndex memoryIndex = new MemoryIndex();
    memoryIndex.addField("name.first", "apache lucene", new WhitespaceAnalyzer());
    memoryIndex.addField("name.last", "1 2 3 4", new WhitespaceAnalyzer());

    // The rewritten query has two clauses: one for items and one for like_text if set.
    BooleanQuery rewritten = (BooleanQuery) mlt.rewrite(memoryIndex.createSearcher().getIndexReader());
    BooleanClause[] rewrittenClauses = rewritten.getClauses();

    // Check the items clause.
    BooleanQuery itemsClause = (BooleanQuery) (rewrittenClauses[0].getQuery());
    assertThat(itemsClause.getMinimumNumberShouldMatch(), is(4));

    // Check the like_text clause.
    BooleanQuery likeTextClause = (BooleanQuery) (rewrittenClauses[1].getQuery());
    assertThat(likeTextClause.getMinimumNumberShouldMatch(), is(2));
}

private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService {

public MockMoreLikeThisFetchService() {
Expand Down