Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to using the multi-termvectors API #7014

Merged
merged 1 commit into from Aug 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/reference/query-dsl/queries/mlt-query.asciidoc
Expand Up @@ -119,7 +119,7 @@ boost factor.

|`boost` |Sets the boost value of the query. Defaults to `1.0`.

|`analyzer` |The analyzer that will be used to analyze the text.
Defaults to the analyzer associated with the field.
|`analyzer` |The analyzer that will be used to analyze the `like text`.
Defaults to the analyzer associated with the first field in `fields`.
|=======================================================================

Expand Up @@ -22,6 +22,7 @@
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.action.*;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
Expand Down Expand Up @@ -52,6 +53,11 @@ public MultiTermVectorsRequest add(String index, @Nullable String type, String i
return this;
}

/**
 * Adds a term vector request derived from the given multi-get item and
 * returns this request for chaining.
 */
public MultiTermVectorsRequest add(MultiGetRequest.Item item) {
    TermVectorRequest termVectorRequest = new TermVectorRequest(item);
    requests.add(termVectorRequest);
    return this;
}

@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = null;
Expand Down
376 changes: 197 additions & 179 deletions src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java

Large diffs are not rendered by default.

Expand Up @@ -24,6 +24,7 @@
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.ValidateActions;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
Expand Down Expand Up @@ -68,7 +69,7 @@ public TermVectorRequest(String index, String type, String id) {
this.id = id;
this.type = type;
}

/**
* Constructs a new term vector request for a document that will be fetched
* from the provided index. Use {@link #type(String)} and
Expand All @@ -86,6 +87,14 @@ public TermVectorRequest(TermVectorRequest other) {
}
}

/**
 * Constructs a term vector request from a multi-get request item, copying
 * over the item's index, id, type, selected fields and routing so that a
 * multi-get item can be serviced through the (multi-)termvectors API.
 */
public TermVectorRequest(MultiGetRequest.Item item) {
    super(item.index());
    this.id = item.id();
    this.type = item.type();
    // the item's requested fields become the fields term vectors are returned for
    this.selectedFields(item.fields());
    this.routing(item.routing());
}

/** Returns the flags set on this term vector request. */
public EnumSet<Flag> getFlags() {
    return flagsEnum;
}
Expand Down
Expand Up @@ -20,6 +20,7 @@
package org.elasticsearch.common.lucene.search;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
Expand All @@ -46,6 +47,7 @@ public class MoreLikeThisQuery extends Query {
private TFIDFSimilarity similarity;

private String[] likeText;
private Fields[] likeFields;
private String[] moreLikeFields;
private Analyzer analyzer;
private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH;
Expand Down Expand Up @@ -148,12 +150,18 @@ public Query rewrite(IndexReader reader) throws IOException {
mlt.setBoost(boostTerms);
mlt.setBoostFactor(boostTermsFactor);

Reader[] readers = new Reader[likeText.length];
for (int i = 0; i < readers.length; i++) {
readers[i] = new FastStringReader(likeText[i]);
BooleanQuery bq = new BooleanQuery();
if (this.likeFields != null) {
bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD);
}
if (this.likeText != null) {
Reader[] readers = new Reader[likeText.length];
for (int i = 0; i < readers.length; i++) {
readers[i] = new FastStringReader(likeText[i]);
}
//LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD);
}
//LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
BooleanQuery bq = (BooleanQuery) mlt.like(moreLikeFields[0], readers);

BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
Expand Down Expand Up @@ -183,6 +191,14 @@ public void setLikeText(String... likeText) {
this.likeText = likeText;
}

/** Returns the term vectors ({@code Fields}) this query finds similar documents to, or null if none were set. */
public Fields[] getLikeFields() {
    return likeFields;
}

/**
 * Sets the term vectors ({@code Fields}) of the documents this query should
 * find similar documents to.
 */
public void setLikeFields(Fields... likeFields) {
    this.likeFields = likeFields;
}

/**
 * @deprecated misleadingly named — this overload sets term vectors, not text;
 *             use {@link #setLikeFields(Fields...)} instead. Kept for
 *             backward compatibility with existing callers.
 */
@Deprecated
public void setLikeText(Fields... likeFields) {
    setLikeFields(likeFields);
}

/** Sets the like-texts from a list, delegating to the varargs overload. */
public void setLikeText(List<String> likeText) {
    String[] texts = likeText.toArray(new String[likeText.size()]);
    setLikeText(texts);
}
Expand Down
Expand Up @@ -53,11 +53,7 @@

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.*;


/**
Expand Down Expand Up @@ -618,6 +614,49 @@ public Query like(String fieldName, Reader... readers) throws IOException {
return createQuery(createQueue(words));
}

/**
* Return a query that will return docs like the passed Terms.
*
* @return a query that will return docs like the passed Terms.
*/
/**
 * Return a query that will return docs like the passed Terms.
 *
 * @return a query that will return docs like the passed Terms.
 */
public Query like(Terms... likeTerms) throws IOException {
    // accumulate term frequencies across every supplied term vector
    Map<String, Int> frequencies = new HashMap<>();
    for (int i = 0; i < likeTerms.length; i++) {
        addTermFrequencies(frequencies, likeTerms[i]);
    }
    return createQuery(createQueue(frequencies));
}

/**
* Return a query that will return docs like the passed Fields.
*
* @return a query that will return docs like the passed Fields.
*/
/**
 * Return a query that will return docs like the passed Fields.
 *
 * <p>Builds one sub-query per distinct field name found across all supplied
 * {@code Fields}, OR-ing them together in a boolean query.
 *
 * <p>NOTE(review): this method calls {@code setFieldNames} for each field it
 * processes and leaves the last field name set on this instance when it
 * returns — confirm no caller relies on previously configured field names.
 *
 * @return a query that will return docs like the passed Fields.
 */
public Query like(Fields... likeFields) throws IOException {
    // get all field names
    Set<String> fieldNames = new HashSet<>();
    for (Fields fields : likeFields) {
        for (String fieldName : fields) {
            fieldNames.add(fieldName);
        }
    }
    // to create one query per field name only
    BooleanQuery bq = new BooleanQuery();
    for (String fieldName : fieldNames) {
        // per-field frequency map, aggregated over every supplied Fields instance
        Map<String, Int> termFreqMap = new HashMap<>();
        this.setFieldNames(new String[]{fieldName});
        for (Fields fields : likeFields) {
            Terms vector = fields.terms(fieldName);
            if (vector != null) {
                addTermFrequencies(termFreqMap, vector);
            }
        }
        Query query = createQuery(createQueue(termFreqMap));
        bq.add(query, BooleanClause.Occur.SHOULD);
    }
    return bq;
}

/**
* Create the More like query from a PriorityQueue
*/
Expand Down Expand Up @@ -773,7 +812,9 @@ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) thro
if (isNoiseWord(term)) {
continue;
}
final int freq = (int) termsEnum.totalTermFreq();

DocsEnum docs = termsEnum.docs(null, null);
final int freq = docs.freq();

// increment frequency
Int cnt = termFreqMap.get(term);
Expand Down
Expand Up @@ -20,7 +20,6 @@
package org.elasticsearch.index.query;

import com.google.common.collect.Lists;
import com.google.common.collect.ObjectArrays;
import com.google.common.collect.Sets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queries.TermsFilter;
Expand All @@ -40,10 +39,12 @@
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;

import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
*
Expand Down Expand Up @@ -201,54 +202,25 @@ public Query parse(QueryParseContext parseContext) throws IOException, QueryPars
}
if (item.fields() == null && item.fetchSourceContext() == null) {
item.fields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
} else {
// TODO how about fields content fetched from _source?
removeUnsupportedFields(item, analyzer, failOnUnsupportedField);
}
}
// fetching the items with multi-get
List<LikeText> likeTexts = fetchService.fetch(items);
// collapse the text onto the same field name
Collection<LikeText> likeTextsCollapsed = collapseTextOnField(likeTexts);
// right now we are just building a boolean query
// fetching the items with multi-termvectors API
BooleanQuery boolQuery = new BooleanQuery();
for (LikeText likeText : likeTextsCollapsed) {
addMoreLikeThis(boolQuery, mltQuery, likeText);
}
org.apache.lucene.index.Fields[] likeFields = fetchService.fetch(items);
mltQuery.setLikeText(likeFields);
boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
// exclude the items from the search
if (!include) {
TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items));
ConstantScoreQuery query = new ConstantScoreQuery(filter);
boolQuery.add(query, BooleanClause.Occur.MUST_NOT);
}
// add the possible mlt query with like_text
if (mltQuery.getLikeText() != null) {
boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
}
return boolQuery;
}

return mltQuery;
}

/**
 * Adds to {@code boolQuery} a per-field more-like-this clause for the given
 * like-text, copying every tuning parameter from the template query.
 */
private void addMoreLikeThis(BooleanQuery boolQuery, MoreLikeThisQuery mltQuery, LikeText likeText) {
    MoreLikeThisQuery perFieldQuery = new MoreLikeThisQuery();
    // target exactly the field this like-text belongs to
    perFieldQuery.setMoreLikeFields(new String[] {likeText.field});
    perFieldQuery.setLikeText(likeText.text);
    // carry over all tuning knobs from the template query
    perFieldQuery.setAnalyzer(mltQuery.getAnalyzer());
    perFieldQuery.setMinTermFrequency(mltQuery.getMinTermFrequency());
    perFieldQuery.setMinDocFreq(mltQuery.getMinDocFreq());
    perFieldQuery.setMaxDocFreq(mltQuery.getMaxDocFreq());
    perFieldQuery.setMinWordLen(mltQuery.getMinWordLen());
    perFieldQuery.setMaxWordLen(mltQuery.getMaxWordLen());
    perFieldQuery.setMaxQueryTerms(mltQuery.getMaxQueryTerms());
    perFieldQuery.setPercentTermsToMatch(mltQuery.getPercentTermsToMatch());
    perFieldQuery.setBoostTerms(mltQuery.isBoostTerms());
    perFieldQuery.setBoostTermsFactor(mltQuery.getBoostTermsFactor());
    perFieldQuery.setStopWords(mltQuery.getStopWords());
    boolQuery.add(perFieldQuery, BooleanClause.Occur.SHOULD);
}

private List<String> removeUnsupportedFields(List<String> moreLikeFields, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
for (Iterator<String> it = moreLikeFields.iterator(); it.hasNext(); ) {
final String fieldName = it.next();
Expand All @@ -262,22 +234,4 @@ private List<String> removeUnsupportedFields(List<String> moreLikeFields, Analyz
}
return moreLikeFields;
}

/**
 * Collapses the given like-texts onto their field name: all texts that share
 * a field are concatenated into a single {@link LikeText} per field.
 *
 * @param likeTexts the like-texts to collapse; not modified
 * @return one LikeText per distinct field, carrying all texts for that field
 */
public static Collection<LikeText> collapseTextOnField(Collection<LikeText> likeTexts) {
    Map<String, LikeText> collapsedTexts = new HashMap<>();
    for (LikeText likeText : likeTexts) {
        // single lookup instead of containsKey + get
        LikeText existing = collapsedTexts.get(likeText.field);
        String[] text = existing == null
                ? likeText.text
                : ObjectArrays.concat(existing.text, likeText.text, String.class);
        collapsedTexts.put(likeText.field, new LikeText(likeText.field, text));
    }
    return collapsedTexts.values();
}

/**
 * Removes from the item's field list any field the analyzer cannot handle,
 * delegating the filtering to {@link #removeUnsupportedFields(List, Analyzer, boolean)}.
 *
 * <p>Fixes two bugs in the original: {@code Arrays.asList} returns a
 * fixed-size list whose iterator does not support {@code remove()} (the
 * delegate removes via iterator), and the no-arg {@code List.toArray()}
 * returns {@code Object[]}, so casting it to {@code String[]} threw
 * {@link ClassCastException}. We copy into a mutable ArrayList and use the
 * typed {@code toArray} overload instead.
 */
private void removeUnsupportedFields(MultiGetRequest.Item item, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
    List<String> fields = removeUnsupportedFields(new ArrayList<>(Arrays.asList(item.fields())), analyzer, failOnUnsupportedField);
    item.fields(fields.toArray(new String[fields.size()]));
}

}
Expand Up @@ -19,15 +19,16 @@

package org.elasticsearch.index.search.morelikethis;

import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.get.MultiGetItemResponse;
import org.apache.lucene.index.Fields;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.get.MultiGetResponse;
import org.elasticsearch.action.termvector.MultiTermVectorsItemResponse;
import org.elasticsearch.action.termvector.MultiTermVectorsRequest;
import org.elasticsearch.action.termvector.MultiTermVectorsResponse;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.get.GetField;

import java.io.IOException;
import java.util.ArrayList;
Expand All @@ -38,21 +39,6 @@
*/
public class MoreLikeThisFetchService extends AbstractComponent {

/**
 * A field name together with one or more texts that documents should be
 * similar to.
 */
public static final class LikeText {
    /** The field the like-texts belong to. */
    public final String field;
    /** The like-texts for that field. */
    public final String[] text;

    public LikeText(String field, String text) {
        // delegate to the varargs constructor instead of duplicating the assignments
        this(field, new String[]{text});
    }

    public LikeText(String field, String... text) {
        this.field = field;
        this.text = text;
    }
}

private final Client client;

@Inject
Expand All @@ -61,30 +47,23 @@ public MoreLikeThisFetchService(Client client, Settings settings) {
this.client = client;
}

public List<LikeText> fetch(List<MultiGetRequest.Item> items) throws IOException {
MultiGetRequest request = new MultiGetRequest();
public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
MultiTermVectorsRequest request = new MultiTermVectorsRequest();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry for the misleading comment, here is where we lose headers and request context, the items don't need them, only the main request. This one should be created passing in the original request, but that would be MoreLikeThisRequest not multi_get. Actually I'm confused on why we still have multi_get items but we don't use multi_get internally anymore

for (MultiGetRequest.Item item : items) {
request.add(item);
}
MultiGetResponse responses = client.multiGet(request).actionGet();
List<LikeText> likeTexts = new ArrayList<>();
for (MultiGetItemResponse response : responses) {
List<Fields> likeFields = new ArrayList<>();
MultiTermVectorsResponse responses = client.multiTermVectors(request).actionGet();
for (MultiTermVectorsItemResponse response : responses) {
if (response.isFailed()) {
continue;
}
GetResponse getResponse = response.getResponse();
TermVectorResponse getResponse = response.getResponse();
if (!getResponse.isExists()) {
continue;
}

for (GetField getField : getResponse.getFields().values()) {
String[] text = new String[getField.getValues().size()];
for (int i = 0; i < text.length; i++) {
text[i] = getField.getValues().get(i).toString();
}
likeTexts.add(new LikeText(getField.getName(), text));
}
likeFields.add(getResponse.getFields());
}
return likeTexts;
return likeFields.toArray(Fields.EMPTY_ARRAY);
}
}