diff --git a/docs/reference/query-dsl/queries/mlt-query.asciidoc b/docs/reference/query-dsl/queries/mlt-query.asciidoc
index c0fc006aa31d3..3ebc7b24fcd16 100644
--- a/docs/reference/query-dsl/queries/mlt-query.asciidoc
+++ b/docs/reference/query-dsl/queries/mlt-query.asciidoc
@@ -119,7 +119,7 @@ boost factor.
 |`boost` |Sets the boost value of the query. Defaults to `1.0`.
 
-|`analyzer` |The analyzer that will be used to analyze the text.
-Defaults to the analyzer associated with the field.
+|`analyzer` |The analyzer that will be used to analyze the `like_text`.
+Defaults to the analyzer associated with the first field in `fields`.
 
 |=======================================================================
 
diff --git a/src/main/java/org/elasticsearch/action/termvector/MultiTermVectorsRequest.java b/src/main/java/org/elasticsearch/action/termvector/MultiTermVectorsRequest.java
index d2a0467d47ee6..9c2aa515e4c32 100644
--- a/src/main/java/org/elasticsearch/action/termvector/MultiTermVectorsRequest.java
+++ b/src/main/java/org/elasticsearch/action/termvector/MultiTermVectorsRequest.java
@@ -22,6 +22,7 @@
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.action.*;
+import org.elasticsearch.action.get.MultiGetRequest;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.io.stream.StreamInput;
@@ -52,6 +53,11 @@ public MultiTermVectorsRequest add(String index, @Nullable String type, String i
         return this;
     }
 
+    public MultiTermVectorsRequest add(MultiGetRequest.Item item) {
+        requests.add(new TermVectorRequest(item));
+        return this;
+    }
+
     @Override
     public ActionRequestValidationException validate() {
         ActionRequestValidationException validationException = null;
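The new `add(MultiGetRequest.Item)` overload is what lets the more-like-this parser feed the items it already collected for a multi-get straight into a multi-term-vectors request. A minimal usage sketch; the index, type, id and field names below are invented for illustration:

    MultiGetRequest.Item item = new MultiGetRequest.Item("test", "person", "1")
            .fields("name.first", "name.last");
    MultiTermVectorsRequest request = new MultiTermVectorsRequest();
    request.add(item); // internally wrapped into new TermVectorRequest(item)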
diff --git a/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java b/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java
index 01b1e2216bfd6..babf5a93d59fc 100644
--- a/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java
+++ b/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java
@@ -170,216 +170,234 @@ public Terms terms(String field) throws IOException {
         if (!fieldMap.containsKey(field)) {
             return null; // we don't have it.
         }
-        long offset = fieldMap.lget();
-        final BytesStreamInput perFieldTermVectorInput = new BytesStreamInput(this.termVectors);
-        perFieldTermVectorInput.reset();
-        perFieldTermVectorInput.skip(offset);
-
-        // read how many terms....
-        final long numTerms = perFieldTermVectorInput.readVLong();
-        // ...if positions etc. were stored....
-        final boolean hasPositions = perFieldTermVectorInput.readBoolean();
-        final boolean hasOffsets = perFieldTermVectorInput.readBoolean();
-        final boolean hasPayloads = perFieldTermVectorInput.readBoolean();
-        // read the field statistics
-        final long sumTotalTermFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
-        final long sumDocFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
-        final int docCount = hasFieldStatistic ? readPotentiallyNegativeVInt(perFieldTermVectorInput) : -1;
-
-        return new Terms() {
+        long readOffset = fieldMap.lget();
+        return new TermVector(termVectors, readOffset);
+    }
 
-            @Override
-            public TermsEnum iterator(TermsEnum reuse) throws IOException {
-                // convert bytes ref for the terms to actual data
-                return new TermsEnum() {
-                    int currentTerm = 0;
-                    int freq = 0;
-                    int docFreq = -1;
-                    long totalTermFrequency = -1;
-                    int[] positions = new int[1];
-                    int[] startOffsets = new int[1];
-                    int[] endOffsets = new int[1];
-                    BytesRef[] payloads = new BytesRef[1];
-                    final BytesRef spare = new BytesRef();
-
-                    @Override
-                    public BytesRef next() throws IOException {
-                        if (currentTerm++ < numTerms) {
-                            // term string. first the size...
-                            int termVectorSize = perFieldTermVectorInput.readVInt();
-                            spare.grow(termVectorSize);
-                            // ...then the value.
-                            perFieldTermVectorInput.readBytes(spare.bytes, 0, termVectorSize);
-                            spare.length = termVectorSize;
-                            if (hasTermStatistic) {
-                                docFreq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
-                                totalTermFrequency = readPotentiallyNegativeVLong(perFieldTermVectorInput);
+    @Override
+    public int size() {
+        return fieldMap.size();
+    }
 
-                            }
+    private final class TermVector extends Terms {
 
-                            freq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
-                            // grow the arrays to read the values. this is just
-                            // for performance reasons. Re-use memory instead of
-                            // realloc.
-                            growBuffers();
-                            // finally, read the values into the arrays
-                            // currentPosition etc. so that we can just iterate
-                            // later
-                            writeInfos(perFieldTermVectorInput);
-                            return spare;
-
-                        } else {
-                            return null;
-                        }
+        private final BytesStreamInput perFieldTermVectorInput;
+        private final long readOffset;
 
-                    }
+        private long numTerms;
+        private boolean hasPositions;
+        private boolean hasOffsets;
+        private boolean hasPayloads;
+        private long sumTotalTermFreq;
+        private long sumDocFreq;
+        private int docCount;
+
+        public TermVector(BytesReference termVectors, long readOffset) throws IOException {
+            this.perFieldTermVectorInput = new BytesStreamInput(termVectors);
+            this.readOffset = readOffset;
+            reset();
+        }
 
-                    private void writeInfos(final BytesStreamInput input) throws IOException {
-                        for (int i = 0; i < freq; i++) {
-                            if (hasPositions) {
-                                positions[i] = input.readVInt();
-                            }
-                            if (hasOffsets) {
-                                startOffsets[i] = input.readVInt();
-                                endOffsets[i] = input.readVInt();
-                            }
-                            if (hasPayloads) {
-                                int payloadLength = input.readVInt();
-                                if (payloads[i] == null) {
-                                    payloads[i] = new BytesRef(payloadLength);
-                                } else {
-                                    payloads[i].grow(payloadLength);
-                                }
-                                input.readBytes(payloads[i].bytes, 0, payloadLength);
-                                payloads[i].length = payloadLength;
-                                payloads[i].offset = 0;
-                            }
+        private void reset() throws IOException {
+            this.perFieldTermVectorInput.reset();
+            this.perFieldTermVectorInput.skip(readOffset);
+
+            // read how many terms....
+            this.numTerms = perFieldTermVectorInput.readVLong();
+            // ...if positions etc. were stored....
+            this.hasPositions = perFieldTermVectorInput.readBoolean();
+            this.hasOffsets = perFieldTermVectorInput.readBoolean();
+            this.hasPayloads = perFieldTermVectorInput.readBoolean();
+            // read the field statistics
+            this.sumTotalTermFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
+            this.sumDocFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
+            this.docCount = hasFieldStatistic ? readPotentiallyNegativeVInt(perFieldTermVectorInput) : -1;
+        }
+
+        @Override
+        public TermsEnum iterator(TermsEnum reuse) throws IOException {
+            // reset before asking for an iterator
+            reset();
+            // convert bytes ref for the terms to actual data
+            return new TermsEnum() {
+                int currentTerm = 0;
+                int freq = 0;
+                int docFreq = -1;
+                long totalTermFrequency = -1;
+                int[] positions = new int[1];
+                int[] startOffsets = new int[1];
+                int[] endOffsets = new int[1];
+                BytesRef[] payloads = new BytesRef[1];
+                final BytesRef spare = new BytesRef();
+
+                @Override
+                public BytesRef next() throws IOException {
+                    if (currentTerm++ < numTerms) {
+                        // term string. first the size...
+                        int termVectorSize = perFieldTermVectorInput.readVInt();
+                        spare.grow(termVectorSize);
+                        // ...then the value.
+                        perFieldTermVectorInput.readBytes(spare.bytes, 0, termVectorSize);
+                        spare.length = termVectorSize;
+                        if (hasTermStatistic) {
+                            docFreq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
+                            totalTermFrequency = readPotentiallyNegativeVLong(perFieldTermVectorInput);
                         }
 
+                        freq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
+                        // grow the arrays to read the values. this is just
+                        // for performance reasons. Re-use memory instead of
+                        // realloc.
+                        growBuffers();
+                        // finally, read the values into the arrays
+                        // currentPosition etc. so that we can just iterate
+                        // later
+                        writeInfos(perFieldTermVectorInput);
+                        return spare;
+
+                    } else {
+                        return null;
+                    }
+                }
+
+                private void writeInfos(final BytesStreamInput input) throws IOException {
+                    for (int i = 0; i < freq; i++) {
                         if (hasPositions) {
-                            positions = grow(positions, freq);
+                            positions[i] = input.readVInt();
                         }
                         if (hasOffsets) {
-                            startOffsets = grow(startOffsets, freq);
-                            endOffsets = grow(endOffsets, freq);
+                            startOffsets[i] = input.readVInt();
+                            endOffsets[i] = input.readVInt();
                         }
                         if (hasPayloads) {
-                            if (payloads.length < freq) {
-                                final BytesRef[] newArray = new BytesRef[ArrayUtil.oversize(freq, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
-                                System.arraycopy(payloads, 0, newArray, 0, payloads.length);
-                                payloads = newArray;
+                            int payloadLength = input.readVInt();
+                            if (payloads[i] == null) {
+                                payloads[i] = new BytesRef(payloadLength);
+                            } else {
+                                payloads[i].grow(payloadLength);
                             }
+                            input.readBytes(payloads[i].bytes, 0, payloadLength);
+                            payloads[i].length = payloadLength;
+                            payloads[i].offset = 0;
                         }
                     }
+                }
 
-                    @Override
-                    public Comparator<BytesRef> getComparator() {
-                        return BytesRef.getUTF8SortedAsUnicodeComparator();
-                    }
-
-                    @Override
-                    public SeekStatus seekCeil(BytesRef text) throws IOException {
-                        throw new UnsupportedOperationException();
+                private void growBuffers() {
+                    if (hasPositions) {
+                        positions = grow(positions, freq);
                     }
-
-                    @Override
-                    public void seekExact(long ord) throws IOException {
-                        throw new UnsupportedOperationException("Seek is not supported");
+                    if (hasOffsets) {
+                        startOffsets = grow(startOffsets, freq);
+                        endOffsets = grow(endOffsets, freq);
                     }
-
-                    @Override
-                    public BytesRef term() throws IOException {
-                        return spare;
-                    }
-
-                    @Override
-                    public long ord() throws IOException {
-                        throw new UnsupportedOperationException("ordinals are not supported");
-                    }
-
-                    @Override
-                    public int docFreq() throws IOException {
-                        return docFreq;
-                    }
-
-                    @Override
-                    public long totalTermFreq() throws IOException {
-                        return totalTermFrequency;
-                    }
-
-                    @Override
-                    public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
-                        return docsAndPositions(liveDocs, reuse instanceof DocsAndPositionsEnum ? (DocsAndPositionsEnum) reuse : null, 0);
-                    }
-
-                    @Override
-                    public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
-                        final TermVectorsDocsAndPosEnum retVal = (reuse instanceof TermVectorsDocsAndPosEnum ? (TermVectorsDocsAndPosEnum) reuse
-                                : new TermVectorsDocsAndPosEnum());
-                        return retVal.reset(hasPositions ? positions : null, hasOffsets ? startOffsets : null, hasOffsets ? endOffsets
-                                : null, hasPayloads ? payloads : null, freq);
+                    if (hasPayloads) {
+                        if (payloads.length < freq) {
+                            final BytesRef[] newArray = new BytesRef[ArrayUtil.oversize(freq, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+                            System.arraycopy(payloads, 0, newArray, 0, payloads.length);
+                            payloads = newArray;
+                        }
                     }
+                }
+
+                @Override
+                public Comparator<BytesRef> getComparator() {
+                    return BytesRef.getUTF8SortedAsUnicodeComparator();
+                }
+
+                @Override
+                public SeekStatus seekCeil(BytesRef text) throws IOException {
+                    throw new UnsupportedOperationException();
+                }
+
+                @Override
+                public void seekExact(long ord) throws IOException {
+                    throw new UnsupportedOperationException("Seek is not supported");
+                }
+
+                @Override
+                public BytesRef term() throws IOException {
+                    return spare;
+                }
+
+                @Override
+                public long ord() throws IOException {
+                    throw new UnsupportedOperationException("ordinals are not supported");
+                }
+
+                @Override
+                public int docFreq() throws IOException {
+                    return docFreq;
+                }
+
+                @Override
+                public long totalTermFreq() throws IOException {
+                    return totalTermFrequency;
+                }
+
+                @Override
+                public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
+                    return docsAndPositions(liveDocs, reuse instanceof DocsAndPositionsEnum ? (DocsAndPositionsEnum) reuse : null, 0);
+                }
+
+                @Override
+                public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
+                    final TermVectorDocsAndPosEnum retVal = (reuse instanceof TermVectorDocsAndPosEnum ? (TermVectorDocsAndPosEnum) reuse
+                            : new TermVectorDocsAndPosEnum());
+                    return retVal.reset(hasPositions ? positions : null, hasOffsets ? startOffsets : null, hasOffsets ? endOffsets : null, hasPayloads ? payloads : null, freq);
+                }
+
+            };
+        }
 
-                };
-            }
-
-            @Override
-            public Comparator<BytesRef> getComparator() {
-                return BytesRef.getUTF8SortedAsUnicodeComparator();
-            }
-
-            @Override
-            public long size() throws IOException {
-                return numTerms;
-            }
+        @Override
+        public Comparator<BytesRef> getComparator() {
+            return BytesRef.getUTF8SortedAsUnicodeComparator();
+        }
 
-            @Override
-            public long getSumTotalTermFreq() throws IOException {
-                return sumTotalTermFreq;
-            }
+        @Override
+        public long size() throws IOException {
+            return numTerms;
+        }
 
-            @Override
-            public long getSumDocFreq() throws IOException {
-                return sumDocFreq;
-            }
+        @Override
+        public long getSumTotalTermFreq() throws IOException {
+            return sumTotalTermFreq;
+        }
 
-            @Override
-            public int getDocCount() throws IOException {
-                return docCount;
-            }
-
-            @Override
-            public boolean hasFreqs() {
-                return true;
-            }
+        @Override
+        public long getSumDocFreq() throws IOException {
+            return sumDocFreq;
+        }
 
-            @Override
-            public boolean hasOffsets() {
-                return hasOffsets;
-            }
+        @Override
+        public int getDocCount() throws IOException {
+            return docCount;
+        }
 
-            @Override
-            public boolean hasPositions() {
-                return hasPositions;
-            }
+        @Override
+        public boolean hasFreqs() {
+            return true;
+        }
 
-            @Override
-            public boolean hasPayloads() {
-                return hasPayloads;
-            }
+        @Override
+        public boolean hasOffsets() {
+            return hasOffsets;
+        }
 
-        };
-    }
+        @Override
+        public boolean hasPositions() {
+            return hasPositions;
+        }
 
-    @Override
-    public int size() {
-        return fieldMap.size();
+        @Override
+        public boolean hasPayloads() {
+            return hasPayloads;
+        }
     }
 
-    private final class TermVectorsDocsAndPosEnum extends DocsAndPositionsEnum {
+    private final class TermVectorDocsAndPosEnum extends DocsAndPositionsEnum {
         private boolean hasPositions;
         private boolean hasOffsets;
         private boolean hasPayloads;
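The decoding logic that used to live in an anonymous `Terms` instance now sits in the named inner class `TermVector`, whose `reset()` re-reads the per-field header so that `iterator(...)` can be called repeatedly on the same instance. A hedged sketch of how such a `Terms` is consumed through the standard Lucene API; the `response` variable (a `TermVectorResponse`) is hypothetical:

    Terms terms = response.getFields().terms("name.first"); // hypothetical response
    if (terms != null) {
        TermsEnum iterator = terms.iterator(null); // triggers reset() internally
        BytesRef term;
        while ((term = iterator.next()) != null) {
            // docFreq()/totalTermFreq() stay -1 unless term statistics were stored
            System.out.println(term.utf8ToString() + " docFreq=" + iterator.docFreq());
        }
    }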
diff --git a/src/main/java/org/elasticsearch/action/termvector/TermVectorRequest.java b/src/main/java/org/elasticsearch/action/termvector/TermVectorRequest.java
index dbd026d2144e9..9b1ccdc287a84 100644
--- a/src/main/java/org/elasticsearch/action/termvector/TermVectorRequest.java
+++ b/src/main/java/org/elasticsearch/action/termvector/TermVectorRequest.java
@@ -24,6 +24,7 @@
 import org.elasticsearch.Version;
 import org.elasticsearch.action.ActionRequestValidationException;
 import org.elasticsearch.action.ValidateActions;
+import org.elasticsearch.action.get.MultiGetRequest;
 import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -68,7 +69,7 @@ public TermVectorRequest(String index, String type, String id) {
         this.id = id;
         this.type = type;
     }
-    
+
     /**
      * Constructs a new term vector request for a document that will be fetched
      * from the provided index. Use {@link #type(String)} and
@@ -86,6 +87,14 @@ public TermVectorRequest(TermVectorRequest other) {
         }
     }
 
+    public TermVectorRequest(MultiGetRequest.Item item) {
+        super(item.index());
+        this.id = item.id();
+        this.type = item.type();
+        this.selectedFields(item.fields());
+        this.routing(item.routing());
+    }
+
     public EnumSet<Flag> getFlags() {
         return flagsEnum;
     }
diff --git a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
index bd0909538fccb..c712f60aa447b 100644
--- a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
+++ b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
@@ -20,6 +20,7 @@
 package org.elasticsearch.common.lucene.search;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@@ -46,6 +47,7 @@ public class MoreLikeThisQuery extends Query {
     private TFIDFSimilarity similarity;
 
     private String[] likeText;
+    private Fields[] likeFields;
     private String[] moreLikeFields;
     private Analyzer analyzer;
     private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH;
@@ -148,12 +150,18 @@ public Query rewrite(IndexReader reader) throws IOException {
         mlt.setBoost(boostTerms);
         mlt.setBoostFactor(boostTermsFactor);
 
-        Reader[] readers = new Reader[likeText.length];
-        for (int i = 0; i < readers.length; i++) {
-            readers[i] = new FastStringReader(likeText[i]);
+        BooleanQuery bq = new BooleanQuery();
+        if (this.likeFields != null) {
+            bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD);
+        }
+        if (this.likeText != null) {
+            Reader[] readers = new Reader[likeText.length];
+            for (int i = 0; i < readers.length; i++) {
+                readers[i] = new FastStringReader(likeText[i]);
+            }
+            //LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
+            bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD);
         }
-        //LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
-        BooleanQuery bq = (BooleanQuery) mlt.like(moreLikeFields[0], readers);
 
         BooleanClause[] clauses = bq.getClauses();
         bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
@@ -183,6 +191,14 @@ public void setLikeText(String... likeText) {
         this.likeText = likeText;
     }
 
+    public Fields[] getLikeFields() {
+        return likeFields;
+    }
+
+    public void setLikeText(Fields... likeFields) {
+        this.likeFields = likeFields;
+    }
+
     public void setLikeText(List<String> likeText) {
         setLikeText(likeText.toArray(Strings.EMPTY_ARRAY));
     }
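With both a free-text and a term-vector source set, `rewrite(...)` now nests one SHOULD clause per source inside the top-level `BooleanQuery` before applying `percentTermsToMatch`. A sketch of how a caller might drive this; the `analyzer`, `similarity`, `likeFields` and `reader` variables are all hypothetical and assumed to be set up elsewhere:

    MoreLikeThisQuery mltQuery = new MoreLikeThisQuery();
    mltQuery.setMoreLikeFields(new String[]{"name.first"});
    mltQuery.setAnalyzer(analyzer);           // hypothetical Analyzer
    mltQuery.setSimilarity(similarity);       // hypothetical TFIDFSimilarity
    mltQuery.setLikeText("some sample text"); // free-text source
    mltQuery.setLikeText(likeFields);         // Fields[] source; note the overloaded name
    Query rewritten = mltQuery.rewrite(reader); // each non-null source becomes a SHOULD clause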
diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java
index 50862e9500fa0..56faae74170d2 100644
--- a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java
+++ b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java
@@ -53,11 +53,7 @@
 
 import java.io.IOException;
 import java.io.Reader;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
 
 
 /**
@@ -618,6 +614,49 @@ public Query like(String fieldName, Reader... readers) throws IOException {
         return createQuery(createQueue(words));
     }
 
+    /**
+     * Return a query that will return docs like the passed Terms.
+     *
+     * @return a query that will return docs like the passed Terms.
+     */
+    public Query like(Terms... likeTerms) throws IOException {
+        Map<String, Int> termFreqMap = new HashMap<>();
+        for (Terms vector : likeTerms) {
+            addTermFrequencies(termFreqMap, vector);
+        }
+        return createQuery(createQueue(termFreqMap));
+    }
+
+    /**
+     * Return a query that will return docs like the passed Fields.
+     *
+     * @return a query that will return docs like the passed Fields.
+     */
+    public Query like(Fields... likeFields) throws IOException {
+        // get all field names
+        Set<String> fieldNames = new HashSet<>();
+        for (Fields fields : likeFields) {
+            for (String fieldName : fields) {
+                fieldNames.add(fieldName);
+            }
+        }
+        // to create one query per field name only
+        BooleanQuery bq = new BooleanQuery();
+        for (String fieldName : fieldNames) {
+            Map<String, Int> termFreqMap = new HashMap<>();
+            this.setFieldNames(new String[]{fieldName});
+            for (Fields fields : likeFields) {
+                Terms vector = fields.terms(fieldName);
+                if (vector != null) {
+                    addTermFrequencies(termFreqMap, vector);
+                }
+            }
+            Query query = createQuery(createQueue(termFreqMap));
+            bq.add(query, BooleanClause.Occur.SHOULD);
+        }
+        return bq;
+    }
+
     /**
      * Create the More like query from a PriorityQueue
     */
@@ -773,7 +812,9 @@ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) thro
             if (isNoiseWord(term)) {
                 continue;
             }
-            final int freq = (int) termsEnum.totalTermFreq();
+
+            DocsEnum docs = termsEnum.docs(null, null);
+            final int freq = docs.freq();
 
             // increment frequency
             Int cnt = termFreqMap.get(term);
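`like(Fields...)` first collects every distinct field name across the passed instances, then builds one sub-query per field so term statistics stay per-field. A hedged sketch of feeding it term vectors built in memory, along the lines of what the test change further below does; `reader` is a hypothetical `IndexReader` over the target index:

    MemoryIndex index = new MemoryIndex();
    index.addField("name.first", "apache lucene", new WhitespaceAnalyzer(Lucene.VERSION));
    Fields fields = MultiFields.getFields(index.createSearcher().getIndexReader());

    XMoreLikeThis mlt = new XMoreLikeThis(reader); // reader is assumed, not shown here
    mlt.setAnalyzer(new WhitespaceAnalyzer(Lucene.VERSION));
    Query query = mlt.like(fields); // one SHOULD clause per distinct field name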
diff --git a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java
index b67a1a4479c2d..0cc50a44bb189 100644
--- a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java
@@ -20,7 +20,6 @@
 package org.elasticsearch.index.query;
 
 import com.google.common.collect.Lists;
-import com.google.common.collect.ObjectArrays;
 import com.google.common.collect.Sets;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.queries.TermsFilter;
@@ -40,10 +39,12 @@
 import org.elasticsearch.index.mapper.Uid;
 import org.elasticsearch.index.mapper.internal.UidFieldMapper;
 import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
-import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;
 
 import java.io.IOException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
 
 /**
  *
@@ -201,54 +202,25 @@ public Query parse(QueryParseContext parseContext) throws IOException, QueryPars
                 }
                 if (item.fields() == null && item.fetchSourceContext() == null) {
                     item.fields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
-                } else {
-                    // TODO how about fields content fetched from _source?
-                    removeUnsupportedFields(item, analyzer, failOnUnsupportedField);
                 }
             }
-            // fetching the items with multi-get
-            List<LikeText> likeTexts = fetchService.fetch(items);
-            // collapse the text onto the same field name
-            Collection<LikeText> likeTextsCollapsed = collapseTextOnField(likeTexts);
-            // right now we are just building a boolean query
+            // fetching the items with multi-termvectors API
             BooleanQuery boolQuery = new BooleanQuery();
-            for (LikeText likeText : likeTextsCollapsed) {
-                addMoreLikeThis(boolQuery, mltQuery, likeText);
-            }
+            org.apache.lucene.index.Fields[] likeFields = fetchService.fetch(items);
+            mltQuery.setLikeText(likeFields);
+            boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
             // exclude the items from the search
             if (!include) {
                 TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items));
                 ConstantScoreQuery query = new ConstantScoreQuery(filter);
                 boolQuery.add(query, BooleanClause.Occur.MUST_NOT);
             }
-            // add the possible mlt query with like_text
-            if (mltQuery.getLikeText() != null) {
-                boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
-            }
             return boolQuery;
         }
 
         return mltQuery;
     }
 
-    private void addMoreLikeThis(BooleanQuery boolQuery, MoreLikeThisQuery mltQuery, LikeText likeText) {
-        MoreLikeThisQuery mlt = new MoreLikeThisQuery();
-        mlt.setMoreLikeFields(new String[] {likeText.field});
-        mlt.setLikeText(likeText.text);
-        mlt.setAnalyzer(mltQuery.getAnalyzer());
-        mlt.setPercentTermsToMatch(mltQuery.getPercentTermsToMatch());
-        mlt.setBoostTerms(mltQuery.isBoostTerms());
-        mlt.setBoostTermsFactor(mltQuery.getBoostTermsFactor());
-        mlt.setMinDocFreq(mltQuery.getMinDocFreq());
-        mlt.setMaxDocFreq(mltQuery.getMaxDocFreq());
-        mlt.setMinWordLen(mltQuery.getMinWordLen());
-        mlt.setMaxWordLen(mltQuery.getMaxWordLen());
-        mlt.setMinTermFrequency(mltQuery.getMinTermFrequency());
-        mlt.setMaxQueryTerms(mltQuery.getMaxQueryTerms());
-        mlt.setStopWords(mltQuery.getStopWords());
-        boolQuery.add(mlt, BooleanClause.Occur.SHOULD);
-    }
-
     private List<String> removeUnsupportedFields(List<String> moreLikeFields, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
         for (Iterator<String> it = moreLikeFields.iterator(); it.hasNext(); ) {
             final String fieldName = it.next();
@@ -262,22 +234,4 @@ private List<String> removeUnsupportedFields(List<String> moreLikeFields, Analyz
         }
         return moreLikeFields;
     }
-
-    public static Collection<LikeText> collapseTextOnField(Collection<LikeText> likeTexts) {
-        Map<String, LikeText> collapsedTexts = new HashMap<>();
-        for (LikeText likeText : likeTexts) {
-            String field = likeText.field;
-            String[] text = likeText.text;
-            if (collapsedTexts.containsKey(field)) {
-                text = ObjectArrays.concat(collapsedTexts.get(field).text, text, String.class);
-            }
-            collapsedTexts.put(field, new LikeText(field, text));
-        }
-        return collapsedTexts.values();
-    }
-
-    private void removeUnsupportedFields(MultiGetRequest.Item item, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
-        item.fields((String[]) removeUnsupportedFields(Arrays.asList(item.fields()), analyzer, failOnUnsupportedField).toArray());
-    }
-
 }
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java b/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java
index 92a42412244fb..20e88a491a8ae 100644
--- a/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java
+++ b/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java
@@ -19,15 +19,16 @@
 
 package org.elasticsearch.index.search.morelikethis;
 
-import org.elasticsearch.action.get.GetResponse;
-import org.elasticsearch.action.get.MultiGetItemResponse;
+import org.apache.lucene.index.Fields;
 import org.elasticsearch.action.get.MultiGetRequest;
-import org.elasticsearch.action.get.MultiGetResponse;
+import org.elasticsearch.action.termvector.MultiTermVectorsItemResponse;
+import org.elasticsearch.action.termvector.MultiTermVectorsRequest;
+import org.elasticsearch.action.termvector.MultiTermVectorsResponse;
+import org.elasticsearch.action.termvector.TermVectorResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.index.get.GetField;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -38,21 +39,6 @@
  */
 public class MoreLikeThisFetchService extends AbstractComponent {
 
-    public static final class LikeText {
-        public final String field;
-        public final String[] text;
-
-        public LikeText(String field, String text) {
-            this.field = field;
-            this.text = new String[]{text};
-        }
-
-        public LikeText(String field, String... text) {
-            this.field = field;
-            this.text = text;
-        }
-    }
-
     private final Client client;
 
     @Inject
@@ -61,30 +47,23 @@ public MoreLikeThisFetchService(Client client, Settings settings) {
         this.client = client;
     }
 
-    public List<LikeText> fetch(List<MultiGetRequest.Item> items) throws IOException {
-        MultiGetRequest request = new MultiGetRequest();
+    public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
+        MultiTermVectorsRequest request = new MultiTermVectorsRequest();
         for (MultiGetRequest.Item item : items) {
             request.add(item);
         }
-        MultiGetResponse responses = client.multiGet(request).actionGet();
-        List<LikeText> likeTexts = new ArrayList<>();
-        for (MultiGetItemResponse response : responses) {
+        List<Fields> likeFields = new ArrayList<>();
+        MultiTermVectorsResponse responses = client.multiTermVectors(request).actionGet();
+        for (MultiTermVectorsItemResponse response : responses) {
            if (response.isFailed()) {
                 continue;
             }
-            GetResponse getResponse = response.getResponse();
+            TermVectorResponse getResponse = response.getResponse();
             if (!getResponse.isExists()) {
                 continue;
             }
-
-            for (GetField getField : getResponse.getFields().values()) {
-                String[] text = new String[getField.getValues().size()];
-                for (int i = 0; i < text.length; i++) {
-                    text[i] = getField.getValues().get(i).toString();
-                }
-                likeTexts.add(new LikeText(getField.getName(), text));
-            }
+            likeFields.add(getResponse.getFields());
         }
-        return likeTexts;
+        return likeFields.toArray(Fields.EMPTY_ARRAY);
     }
 }
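`fetch(...)` now makes a single multi-term-vectors round trip and returns one `Fields` per item that could be found, silently skipping failed or missing documents. A usage sketch mirroring the parser change above; the index, type, id and field names are illustrative only:

    List<MultiGetRequest.Item> items = new ArrayList<>();
    items.add(new MultiGetRequest.Item("test", "person", "1").fields("name.first", "name.last"));
    Fields[] likeFields = fetchService.fetch(items); // one round trip for all items
    mltQuery.setLikeText(likeFields);                // hand the term vectors to the query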
diff --git a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java
index fa6da71a6db2c..596d29b482531 100644
--- a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java
+++ b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java
@@ -21,18 +21,23 @@
 
 import com.google.common.collect.Lists;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.*;
 import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.spans.*;
 import org.apache.lucene.spatial.prefix.IntersectsPrefixTreeFilter;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.UnicodeUtil;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.get.MultiGetRequest;
 import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.compress.CompressedString;
+import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.search.*;
 import org.elasticsearch.common.lucene.search.function.BoostScoreFunction;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
@@ -48,7 +53,6 @@
 import org.elasticsearch.index.search.geo.GeoPolygonFilter;
 import org.elasticsearch.index.search.geo.InMemoryGeoBoundingBoxFilter;
 import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
-import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;
 import org.elasticsearch.index.service.IndexService;
 import org.elasticsearch.test.ElasticsearchSingleNodeTest;
 import org.hamcrest.Matchers;
@@ -1591,37 +1595,24 @@ public void testMoreLikeThisIds() throws Exception {
         MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
         parser.setFetchService(new MockMoreLikeThisFetchService());
 
-        List<LikeText> likeTexts = new ArrayList<>();
-        likeTexts.add(new LikeText("name.first", new String[]{
-                "test person 1 name.first", "test person 2 name.first", "test person 3 name.first", "test person 4 name.first"}));
-        likeTexts.add(new LikeText("name.last", new String[]{
-                "test person 1 name.last", "test person 2 name.last", "test person 3 name.last", "test person 4 name.last"}));
-
         IndexQueryParserService queryParser = queryParser();
         String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
 
         Query parsedQuery = queryParser.parse(query).query();
         assertThat(parsedQuery, instanceOf(BooleanQuery.class));
         BooleanQuery booleanQuery = (BooleanQuery) parsedQuery;
-        assertThat(booleanQuery.getClauses().length, is(likeTexts.size() + 1));
-
-        // check each clause is for each item
-        BooleanClause[] boolClauses = booleanQuery.getClauses();
-        for (int i = 0; i < likeTexts.size(); i++) {
-            BooleanClause booleanClause = booleanQuery.getClauses()[i];
-            assertThat(booleanClause.getOccur(), is(BooleanClause.Occur.SHOULD));
-            assertThat(booleanClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
-            MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) booleanClause.getQuery();
-            assertThat(mltQuery.getLikeTexts(), is(likeTexts.get(i).text));
-            assertThat(mltQuery.getMoreLikeFields()[0], equalTo(likeTexts.get(i).field));
+        assertThat(booleanQuery.getClauses().length, is(1));
+
+        BooleanClause itemClause = booleanQuery.getClauses()[0];
+        assertThat(itemClause.getOccur(), is(BooleanClause.Occur.SHOULD));
+        assertThat(itemClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
+        MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) itemClause.getQuery();
+
+        // check each Fields is for each item
+        for (int id = 1; id <= 4; id++) {
+            Fields fields = mltQuery.getLikeFields()[id - 1];
+            assertThat(termsToString(fields.terms("name.first")), is(String.valueOf(id)));
+            assertThat(termsToString(fields.terms("name.last")), is(String.valueOf(id)));
         }
-
-        // check last clause is for 'like_text'
-        BooleanClause boolClause = boolClauses[boolClauses.length - 1];
-        assertThat(boolClause.getOccur(), is(BooleanClause.Occur.SHOULD));
-        assertThat(boolClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
-        MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) boolClause.getQuery();
-        assertArrayEquals("Not the same more like this 'fields'", new String[] {"name.first", "name.last"}, mltQuery.getMoreLikeFields());
-        assertThat(mltQuery.getLikeText(), equalTo("Apache Lucene"));
     }
 
     private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService {
@@ -1630,17 +1621,34 @@ public MockMoreLikeThisFetchService() {
             super(null, ImmutableSettings.Builder.EMPTY_SETTINGS);
         }
 
-        public List<LikeText> fetch(List<MultiGetRequest.Item> items) throws IOException {
-            List<LikeText> likeTexts = new ArrayList<>();
-            for (MultiGetRequest.Item item: items) {
-                for (String field : item.fields()) {
-                    LikeText likeText = new LikeText(
-                            field, item.index() + " " + item.type() + " " + item.id() + " " + field);
-                    likeTexts.add(likeText);
-                }
+        public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
+            List<Fields> likeTexts = new ArrayList<>();
+            for (MultiGetRequest.Item item : items) {
+                likeTexts.add(generateFields(item.fields(), item.id()));
             }
-            return likeTexts;
+            return likeTexts.toArray(Fields.EMPTY_ARRAY);
+        }
+    }
+
+    private static Fields generateFields(String[] fieldNames, String text) throws IOException {
+        MemoryIndex index = new MemoryIndex();
+        for (String fieldName : fieldNames) {
+            index.addField(fieldName, text, new WhitespaceAnalyzer(Lucene.VERSION));
+        }
+        return MultiFields.getFields(index.createSearcher().getIndexReader());
+    }
+
+    private static String termsToString(Terms terms) throws IOException {
+        String strings = "";
+        TermsEnum termsEnum = terms.iterator(null);
+        CharsRef spare = new CharsRef();
+        BytesRef text;
+        while ((text = termsEnum.next()) != null) {
+            UnicodeUtil.UTF8toUTF16(text, spare);
+            String term = spare.toString();
+            strings += term;
         }
+        return strings;
     }
 
     @Test