diff --git a/docs/reference/modules.asciidoc b/docs/reference/modules.asciidoc index 4474ef5d19e04..a9490ec9101ce 100644 --- a/docs/reference/modules.asciidoc +++ b/docs/reference/modules.asciidoc @@ -21,6 +21,8 @@ include::modules/plugins.asciidoc[] include::modules/scripting.asciidoc[] +include::modules/advanced-scripting.asciidoc[] + include::modules/threadpool.asciidoc[] include::modules/thrift.asciidoc[] diff --git a/docs/reference/modules/advanced-scripting.asciidoc b/docs/reference/modules/advanced-scripting.asciidoc new file mode 100644 index 0000000000000..93d5696c7c8c3 --- /dev/null +++ b/docs/reference/modules/advanced-scripting.asciidoc @@ -0,0 +1,184 @@ +[[modules-advanced-scripting]] +== Text scoring in scripts + + +Text features, such as term or document frequency for a specific term can be accessed in scripts (see <> ) with the `_shard` variable. This can be useful if, for example, you want to implement your own scoring model using for example a script inside a <>. +Statistics over the document collection are computed *per shard*, not per +index. + +[float] +=== Nomenclature: + + +[horizontal] +`df`:: + + document frequency. The number of documents a term appears in. Computed + per field. + + +`tf`:: + + term frequency. The number times a term appears in a field in one specific + document. + +`ttf`:: + + total term frequency. The number of times this term appears in all + documents, that is, the sum of `tf` over all documents. Computed per + field. + +`df` and `ttf` are computed per shard and therefore these numbers can vary +depending on the shard the current document resides in. + + +[float] +=== Shard statistics: + +`_shard.numDocs()`:: + + Number of documents in shard. + +`_shard.maxDoc()`:: + + Maximal document number in shard. + +`_shard.numDeletedDocs()`:: + + Number of deleted documents in shard. + + +[float] +=== Field statistics: + +Field statistics can be accessed with a subscript operator like this: +`_shard['FIELD']`. 
+ + +`_shard['FIELD'].docCount()`:: + + Number of documents containing the field `FIELD`. Does not take deleted documents into account. + +`_shard['FIELD'].sumttf()`:: + + Sum of `ttf` over all terms that appear in field `FIELD` in all documents. + +`_shard['FIELD'].sumdf()`:: + + The sum of `df` s over all terms that appear in field `FIELD` in all + documents. + + +Field statistics are computed per shard and therfore these numbers can vary +depending on the shard the current document resides in. +The number of terms in a field cannot be accessed using the `_shard` variable. See <> on how to do that. + +[float] +=== Term statistics: + +Term statistics for a field can be accessed with a subscript operator like +this: `_shard['FIELD']['TERM']`. This will never return null, even if term or field does not exist. +If you do not need the term frequency, call `_shard['FIELD'].get('TERM', 0)` +to avoid uneccesary initialization of the frequencies. The flag will have only +affect is your set the `index_options` to `docs` (see <>). + + +`_shard['FIELD']['TERM'].df()`:: + + `df` of term `TERM` in field `FIELD`. Will be returned, even if the term + is not present in the current document. + +`_shard['FIELD']['TERM'].ttf()`:: + + The sum of term frequencys of term `TERM` in field `FIELD` over all + documents. Will be returned, even if the term is not present in the + current document. + +`_shard['FIELD']['TERM'].tf()`:: + + `tf` of term `TERM` in field `FIELD`. Will be 0 if the term is not present + in the current document. 
+ + +[float] +=== Term positions, offsets and payloads: + +If you need information on the positions of terms in a field, call +`_shard['FIELD'].get('TERM', flag)` where flag can be + +[horizontal] +`_POSITIONS`:: if you need the positions of the term +`_OFFSETS`:: if you need the offests of the term +`_PAYLOADS`:: if you need the payloads of the term +`_CACHE`:: if you need to iterate over all positions several times + +The iterator uses the underlying lucene classes to iterate over positions. For efficiency reasons, you can only iterate over positions once. If you need to iterate over the positions several times, set the `_CACHE` flag. + +You can combine the operators with a `|` if you need more than one info. For +example, the following will return an object holding the positions and payloads, +as well as all statistics: + + + `_shard['FIELD'].get('TERM', _POSITIONS | _PAYLOADS)` + + +Positions can be accessed with an iterator that returns an object +(`POS_OBJECT`) holding position, offsets and payload for each term position. + +`POS_OBJECT.position`:: + + The position of the term. + +`POS_OBJECT.startOffset`:: + + The start offset of the term. + +`POS_OBJECT.endOffset`:: + + The end offset of the term. + +`POS_OBJECT.payload`:: + + The payload of the term. + +`POS_OBJECT.payloadAsInt(missingValue)`:: + + The payload of the term converted to integer. If the current position has + no payload, the `missingValue` will be returned. Call this only if you + know that your payloads are integers. + +`POS_OBJECT.payloadAsFloat(missingValue)`:: + + The payload of the term converted to float. If the current position has no + payload, the `missingValue` will be returned. Call this only if you know + that your payloads are floats. + +`POS_OBJECT.payloadAsString()`:: + + The payload of the term converted to string. If the current position has + no payload, `null` will be returned. Call this only if you know that your + payloads are strings. 
+ + +Example: sums up all payloads for the term `foo`. + +[source,mvel] +--------------------------------------------------------- +termInfo = _shard['my_field'].get('foo',_PAYLOADS); +score = 0; +for (pos : termInfo) { + score = score + pos.payloadAsInt(0); +} +return score; +--------------------------------------------------------- + + +[float] +=== Term vectors: + +The `_shard` variable can only be used to gather statistics for single terms. If you want to use information on all terms in a field, you must store the term vectors (set `term_vector` in the mapping as described in the <>). To access them, call +`_shard.getTermVectors()` to get a +https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html[Fields] +instance. This object can then be used as described in https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html[lucene doc] to iterate over fields and then for each field iterate over each term in the field. +The method will return null if the term vectors were not stored. + diff --git a/docs/reference/modules/scripting.asciidoc b/docs/reference/modules/scripting.asciidoc index f0fda7e7ad299..5c1f7dc965464 100644 --- a/docs/reference/modules/scripting.asciidoc +++ b/docs/reference/modules/scripting.asciidoc @@ -112,6 +112,11 @@ automatically loaded. In all scripts that can be used in facets, allow to access the current doc score using `doc.score`. +[float] +=== Computing scores based on terms in scripts + +see <> + [float] === Document Fields @@ -190,6 +195,7 @@ faster than using `_source` due to the extra overhead of potentially parsing lar However, `_source` may be faster if you access multiple fields or if the source has already been loaded for other purposes. 
+ [float] === mvel Built In Functions diff --git a/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java b/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java index a7d245933e88e..03a909c3c46a6 100644 --- a/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java +++ b/src/main/java/org/elasticsearch/action/termvector/TermVectorFields.java @@ -112,8 +112,8 @@ public final class TermVectorFields extends Fields { - final private ObjectLongOpenHashMap fieldMap; - final private BytesReference termVectors; + private final ObjectLongOpenHashMap fieldMap; + private final BytesReference termVectors; final boolean hasTermStatistic; final boolean hasFieldStatistic; diff --git a/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java index 8526072722a48..59a7ba849e9d3 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java @@ -89,10 +89,11 @@ public FiltersFunctionScoreQuery(Query subQuery, ScoreMode scoreMode, FilterFunc combineFunction = CombineFunction.MULT; } - public FiltersFunctionScoreQuery setCombineFunction(CombineFunction combineFunction){ + public FiltersFunctionScoreQuery setCombineFunction(CombineFunction combineFunction) { this.combineFunction = combineFunction; return this; } + public Query getSubQuery() { return subQuery; } @@ -150,7 +151,10 @@ public void normalize(float norm, float topLevelBoost) { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Bits acceptDocs) throws IOException { - Scorer subQueryScorer = subQueryWeight.scorer(context, scoreDocsInOrder, false, acceptDocs); + // we ignore scoreDocsInOrder parameter, because we need to score in + // order if documents are 
scored with a script. The + // ShardLookup depends on in order scoring. + Scorer subQueryScorer = subQueryWeight.scorer(context, true, false, acceptDocs); if (subQueryScorer == null) { return null; } diff --git a/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java index 0d1e08c42b64f..16b2d00de45ba 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/function/FunctionScoreQuery.java @@ -113,7 +113,10 @@ public void normalize(float norm, float topLevelBoost) { @Override public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder, boolean topScorer, Bits acceptDocs) throws IOException { - Scorer subQueryScorer = subQueryWeight.scorer(context, scoreDocsInOrder, false, acceptDocs); + // we ignore scoreDocsInOrder parameter, because we need to score in + // order if documents are scored with a script. The + // ShardLookup depends on in order scoring. + Scorer subQueryScorer = subQueryWeight.scorer(context, true, false, acceptDocs); if (subQueryScorer == null) { return null; } diff --git a/src/main/java/org/elasticsearch/common/util/MinimalMap.java b/src/main/java/org/elasticsearch/common/util/MinimalMap.java new file mode 100644 index 0000000000000..b758ed8f88061 --- /dev/null +++ b/src/main/java/org/elasticsearch/common/util/MinimalMap.java @@ -0,0 +1,71 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.util; + +import java.util.Collection; +import java.util.Map; +import java.util.Set; + +public abstract class MinimalMap implements Map { + + public boolean isEmpty() { + throw new UnsupportedOperationException("entrySet() not supported!"); + } + + public V put(K key, V value) { + throw new UnsupportedOperationException("put(Object, Object) not supported!"); + } + + public void putAll(Map m) { + throw new UnsupportedOperationException("putAll(Map) not supported!"); + } + + public V remove(Object key) { + throw new UnsupportedOperationException("remove(Object) not supported!"); + } + + public void clear() { + throw new UnsupportedOperationException("clear() not supported!"); + } + + public Set keySet() { + throw new UnsupportedOperationException("keySet() not supported!"); + } + + public Collection values() { + throw new UnsupportedOperationException("values() not supported!"); + } + + public Set> entrySet() { + throw new UnsupportedOperationException("entrySet() not supported!"); + } + + public boolean containsValue(Object value) { + throw new UnsupportedOperationException("containsValue(Object) not supported!"); + } + + public int size() { + throw new UnsupportedOperationException("size() not supported!"); + } + + public boolean containsKey(Object k) { + throw new UnsupportedOperationException("containsKey(Object) not supported!"); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/script/AbstractSearchScript.java b/src/main/java/org/elasticsearch/script/AbstractSearchScript.java 
index ad152ab9531d7..d7f3ddf995d36 100644 --- a/src/main/java/org/elasticsearch/script/AbstractSearchScript.java +++ b/src/main/java/org/elasticsearch/script/AbstractSearchScript.java @@ -19,6 +19,8 @@ package org.elasticsearch.script; +import org.elasticsearch.search.lookup.ShardTermsLookup; + import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.search.Scorer; import org.elasticsearch.index.fielddata.ScriptDocValues; @@ -87,6 +89,13 @@ protected ScriptDocValues.Longs docFieldLongs(String field) { protected final SourceLookup source() { return lookup.source(); } + + /** + * Allows to access statistics on terms and fields. + */ + protected final ShardTermsLookup shardTerms() { + return lookup.shardTerms(); + } /** * Allows to access the *stored* fields. diff --git a/src/main/java/org/elasticsearch/search/lookup/CachedPositionIterator.java b/src/main/java/org/elasticsearch/search/lookup/CachedPositionIterator.java new file mode 100644 index 0000000000000..7e4510ace26ad --- /dev/null +++ b/src/main/java/org/elasticsearch/search/lookup/CachedPositionIterator.java @@ -0,0 +1,138 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.lookup; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +import java.io.IOException; +import java.util.Iterator; + +/* + * Can iterate over the positions of a term an arbotrary number of times. + * */ +public class CachedPositionIterator extends PositionIterator { + + public CachedPositionIterator(ScriptTerm termInfo) { + super(termInfo); + } + + // all payloads of the term in the current document in one bytes array. + // payloadStarts and payloadLength mark the start and end of one payload. + final BytesRef payloads = new BytesRef(); + + final IntsRef payloadsLengths = new IntsRef(0); + + final IntsRef payloadsStarts = new IntsRef(0); + + final IntsRef positions = new IntsRef(0); + + final IntsRef startOffsets = new IntsRef(0); + + final IntsRef endOffsets = new IntsRef(0); + + @Override + public Iterator reset() { + return new Iterator() { + private int pos = 0; + private final TermPosition termPosition = new TermPosition(); + + @Override + public boolean hasNext() { + return pos < freq; + } + + @Override + public TermPosition next() { + termPosition.position = positions.ints[pos]; + termPosition.startOffset = startOffsets.ints[pos]; + termPosition.endOffset = endOffsets.ints[pos]; + termPosition.payload = payloads; + payloads.offset = payloadsStarts.ints[pos]; + payloads.length = payloadsLengths.ints[pos]; + pos++; + return termPosition; + } + + @Override + public void remove() { + } + }; + } + + + private void record() throws IOException { + TermPosition termPosition; + for (int i = 0; i < freq; i++) { + termPosition = super.next(); + positions.ints[i] = termPosition.position; + addPayload(i, termPosition.payload); + startOffsets.ints[i] = termPosition.startOffset; + endOffsets.ints[i] = termPosition.endOffset; + } + } + private void ensureSize(int freq) { + if (freq == 0) { + return; + } + if (startOffsets.ints.length < freq) { + startOffsets.grow(freq); + endOffsets.grow(freq); + 
positions.grow(freq); + payloadsLengths.grow(freq); + payloadsStarts.grow(freq); + } + payloads.offset = 0; + payloadsLengths.offset = 0; + payloadsStarts.offset = 0; + payloads.grow(freq * 8);// this is just a guess.... + + } + + private void addPayload(int i, BytesRef currPayload) { + if (currPayload != null) { + payloadsLengths.ints[i] = currPayload.length; + payloadsStarts.ints[i] = i == 0 ? 0 : payloadsStarts.ints[i - 1] + payloadsLengths.ints[i - 1]; + if (payloads.bytes.length < payloadsStarts.ints[i] + payloadsLengths.ints[i]) { + payloads.offset = 0; // the offset serves no purpose here. but + // we must assure that it is 0 before + // grow() is called + payloads.grow(payloads.bytes.length * 2); // just a guess + } + System.arraycopy(currPayload.bytes, currPayload.offset, payloads.bytes, payloadsStarts.ints[i], currPayload.length); + } else { + payloadsLengths.ints[i] = 0; + payloadsStarts.ints[i] = i == 0 ? 0 : payloadsStarts.ints[i - 1] + payloadsLengths.ints[i - 1]; + } + } + + + @Override + public void nextDoc() throws IOException { + super.nextDoc(); + ensureSize(freq); + record(); + } + + @Override + public TermPosition next() { + throw new UnsupportedOperationException(); + } +} diff --git a/src/main/java/org/elasticsearch/search/lookup/PositionIterator.java b/src/main/java/org/elasticsearch/search/lookup/PositionIterator.java new file mode 100644 index 0000000000000..926d8fff566e9 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/lookup/PositionIterator.java @@ -0,0 +1,143 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.lookup; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticSearchException; + +import java.io.IOException; +import java.util.Iterator; + +public class PositionIterator implements Iterator { + + private static final DocsAndPositionsEnum EMPTY = new EmptyDocsAndPosEnum(); + + private boolean resetted = false; + + protected ScriptTerm scriptTerm; + + protected int freq = -1; + + // current position of iterator + private int currentPos; + + protected final TermPosition termPosition = new TermPosition(); + + private DocsAndPositionsEnum docsAndPos; + + public PositionIterator(ScriptTerm termInfo) { + this.scriptTerm = termInfo; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove anything from TermPositions iterator."); + } + + @Override + public boolean hasNext() { + return currentPos < freq; + } + + + @Override + public TermPosition next() { + try { + termPosition.position = docsAndPos.nextPosition(); + termPosition.startOffset = docsAndPos.startOffset(); + termPosition.endOffset = docsAndPos.endOffset(); + termPosition.payload = docsAndPos.getPayload(); + } catch (IOException ex) { + throw new ElasticSearchException("can not advance iterator", ex); + } + currentPos++; + return termPosition; + } + + public void nextDoc() throws IOException { + resetted = false; + currentPos = 0; + freq = scriptTerm.tf(); + if (scriptTerm.docsEnum instanceof DocsAndPositionsEnum) { + docsAndPos = 
(DocsAndPositionsEnum) scriptTerm.docsEnum; + } else { + docsAndPos = EMPTY; + } + } + + public Iterator reset() { + if (resetted) { + throw new ElasticSearchException( + "Cannot iterate twice! If you want to iterate more that once, add _CACHE explicitely."); + } + resetted = true; + return this; + } + + // we use this to make sure we can also iterate if there are no positions + private static final class EmptyDocsAndPosEnum extends DocsAndPositionsEnum { + + @Override + public int nextPosition() throws IOException { + return -1; + } + + @Override + public int startOffset() throws IOException { + return -1; + } + + @Override + public int endOffset() throws IOException { + return -1; + } + + @Override + public BytesRef getPayload() throws IOException { + return null; + } + + @Override + public int freq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int docID() { + throw new UnsupportedOperationException(); + } + + @Override + public int nextDoc() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); + } + } +} diff --git a/src/main/java/org/elasticsearch/search/lookup/ScriptTerm.java b/src/main/java/org/elasticsearch/search/lookup/ScriptTerm.java new file mode 100644 index 0000000000000..1df1156d262db --- /dev/null +++ b/src/main/java/org/elasticsearch/search/lookup/ScriptTerm.java @@ -0,0 +1,284 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.lookup; + +import org.apache.lucene.index.*; +import org.apache.lucene.search.TermStatistics; +import org.elasticsearch.ElasticSearchException; +import org.elasticsearch.common.lucene.search.EmptyScorer; + +import java.io.IOException; +import java.util.Iterator; + +/** + * Holds all information on a particular term in a field. + * */ +public class ScriptTerm implements Iterable { + + // The posting list for this term. Is null if the term or field does not + // exist. Can be DocsEnum or DocsAndPositionsEnum. + DocsEnum docsEnum; + + // Stores if positions, offsets and payloads are requested. + private final int flags; + + private final String fieldName; + + private final String term; + + private final PositionIterator iterator; + + // for lucene calls + private final Term identifier; + + private final TermStatistics termStats; + + static private EmptyScorer EMPTY_DOCS_ENUM = new EmptyScorer(null); + + // get the document frequency of the term + public long df() throws IOException { + return termStats.docFreq(); + } + + // get the total term frequency of the term, that is, how often does the + // term appear in any document? + public long ttf() throws IOException { + return termStats.totalTermFreq(); + } + + // when the reader changes, we have to get the posting list for this term + // and reader + void setNextReader(AtomicReader reader) { + try { + // Get the posting list for a specific term. Depending on the flags, + // this + // will either get a DocsEnum or a DocsAndPositionsEnum if + // available. 
+ + // get lucene frequency flag + int luceneFrequencyFlag = getLuceneFrequencyFlag(flags); + if (shouldRetrieveFrequenciesOnly()) { + docsEnum = getOnlyDocsEnum(luceneFrequencyFlag, reader); + } else { + int lucenePositionsFlags = getLucenePositionsFlags(flags); + docsEnum = getDocsAndPosEnum(lucenePositionsFlags, reader); + if (docsEnum == null) {// no pos available + docsEnum = getOnlyDocsEnum(luceneFrequencyFlag, reader); + } + } + } catch (IOException e) { + throw new ElasticSearchException("Unable to get posting list for field " + fieldName + " and term " + term, e); + } + + } + + private boolean shouldRetrieveFrequenciesOnly() { + return (flags & ~ShardTermsLookup.FLAG_FREQUENCIES) == 0; + } + + private int getLuceneFrequencyFlag(int flags) { + return (flags & ShardTermsLookup.FLAG_FREQUENCIES) > 0 ? DocsEnum.FLAG_FREQS : DocsEnum.FLAG_NONE; + } + + private int getLucenePositionsFlags(int flags) { + int lucenePositionsFlags = (flags & ShardTermsLookup.FLAG_PAYLOADS) > 0 ? DocsAndPositionsEnum.FLAG_PAYLOADS : 0x0; + lucenePositionsFlags |= (flags & ShardTermsLookup.FLAG_OFFSETS) > 0 ? DocsAndPositionsEnum.FLAG_OFFSETS : 0x0; + return lucenePositionsFlags; + } + + // get the DocsAndPositionsEnum from the reader. + private DocsEnum getDocsAndPosEnum(int luceneFlags, AtomicReader reader) throws IOException { + assert identifier.field() != null; + assert identifier.bytes() != null; + final Fields fields = reader.fields(); + DocsEnum newDocsEnum = null; + if (fields != null) { + final Terms terms = fields.terms(identifier.field()); + if (terms != null) { + if (terms.hasPositions()) { + final TermsEnum termsEnum = terms.iterator(null); + if (termsEnum.seekExact(identifier.bytes())) { + newDocsEnum = termsEnum.docsAndPositions(reader.getLiveDocs(), + docsEnum instanceof DocsAndPositionsEnum ? (DocsAndPositionsEnum) docsEnum : null, luceneFlags); + } + } + } + } + return newDocsEnum; + } + + // get the DocsEnum from the reader. 
+ private DocsEnum getOnlyDocsEnum(int luceneFlags, AtomicReader reader) throws IOException { + assert identifier.field() != null; + assert identifier.bytes() != null; + final Fields fields = reader.fields(); + DocsEnum newDocsEnum = null; + if (fields != null) { + final Terms terms = fields.terms(identifier.field()); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(null); + if (termsEnum.seekExact(identifier.bytes())) { + newDocsEnum = termsEnum.docs(reader.getLiveDocs(), docsEnum, luceneFlags); + } + } + } + if (newDocsEnum == null) { + newDocsEnum = EMPTY_DOCS_ENUM; + } + return newDocsEnum; + } + + private int freq = 0; + + public void setNextDoc(int docId) { + assert (docsEnum != null); + try { + // we try to advance to the current document. + int currentDocPos = docsEnum.docID(); + if (currentDocPos < docId) { + currentDocPos = docsEnum.advance(docId); + } + if (currentDocPos == docId) { + freq = docsEnum.freq(); + } else { + freq = 0; + } + iterator.nextDoc(); + } catch (IOException e) { + throw new ElasticSearchException("While trying to initialize term positions in ScriptTerm.setNextDoc() ", e); + } + } + + public ScriptTerm(String term, String fieldName, ShardTermsLookup shardTermsLookup, int flags) { + assert fieldName != null; + this.fieldName = fieldName; + assert term != null; + this.term = term; + assert shardTermsLookup != null; + identifier = new Term(fieldName, (String) term); + this.flags = flags; + boolean doRecord = ((flags & ShardTermsLookup.FLAG_CACHE) > 0); + if (withPositions()) { + if (!doRecord) { + iterator = new PositionIterator(this); + } else { + iterator = new CachedPositionIterator(this); + } + } else { + iterator = new PositionIterator(this); + } + setNextReader(shardTermsLookup.getReader()); + setNextDoc(shardTermsLookup.getDocId()); + try { + termStats = shardTermsLookup.getIndexSearcher().termStatistics(identifier, + TermContext.build(shardTermsLookup.getReaderContext(), identifier)); + } catch (IOException e) { + 
throw new ElasticSearchException("Cannot get term statistics: ", e); + } + } + + private boolean withPositions() { + return shouldRetrievePositions() || shouldRetrieveOffsets() || shouldRetrievePayloads(); + } + + protected boolean shouldRetrievePositions() { + return (flags & ShardTermsLookup.FLAG_POSITIONS) > 0; + } + + protected boolean shouldRetrieveOffsets() { + return (flags & ShardTermsLookup.FLAG_OFFSETS) > 0; + } + + protected boolean shouldRetrievePayloads() { + return (flags & ShardTermsLookup.FLAG_PAYLOADS) > 0; + } + + public int tf() throws IOException { + return freq; + } + + @Override + public Iterator iterator() { + return iterator.reset(); + } + + /* + * A user might decide inside a script to call get with _POSITIONS and then + * a second time with _PAYLOADS. If the positions were recorded but the + * payloads were not, the user will not have access to them. Therfore, throw + * exception here explaining how to call get(). + */ + public void validateFlags(int flags2) { + if ((this.flags & flags2) < flags2) { + throw new ElasticSearchException("You must call get with all required flags! 
Instead of " + getCalledStatement(flags2) + + "call " + getCallStatement(flags2 | this.flags) + " once"); + } + } + + private String getCalledStatement(int flags2) { + String calledFlagsCall1 = getFlagsString(flags); + String calledFlagsCall2 = getFlagsString(flags2); + String callStatement1 = getCallStatement(calledFlagsCall1); + String callStatement2 = getCallStatement(calledFlagsCall2); + return " " + callStatement1 + " and " + callStatement2 + " "; + } + + private String getCallStatement(String calledFlags) { + return "_shard['" + this.fieldName + "'].get('" + this.term + "', " + calledFlags + ")"; + } + + private String getFlagsString(int flags2) { + String flagsString = null; + if ((flags2 & ShardTermsLookup.FLAG_FREQUENCIES) != 0) { + flagsString = anddToFlagsString(flagsString, "_FREQUENCIES"); + } + if ((flags2 & ShardTermsLookup.FLAG_POSITIONS) != 0) { + flagsString = anddToFlagsString(flagsString, "_POSITIONS"); + } + if ((flags2 & ShardTermsLookup.FLAG_OFFSETS) != 0) { + flagsString = anddToFlagsString(flagsString, "_OFFSETS"); + } + if ((flags2 & ShardTermsLookup.FLAG_PAYLOADS) != 0) { + flagsString = anddToFlagsString(flagsString, "_PAYLOADS"); + } + if ((flags2 & ShardTermsLookup.FLAG_CACHE) != 0) { + flagsString = anddToFlagsString(flagsString, "_CACHE"); + } + return flagsString; + } + + private String anddToFlagsString(String flagsString, String flag) { + if (flagsString != null) { + flagsString += " | "; + } else { + flagsString = ""; + } + flagsString += flag; + return flagsString; + } + + private String getCallStatement(int flags2) { + String calledFlags = getFlagsString(flags2); + String callStatement = getCallStatement(calledFlags); + return " " + callStatement + " "; + + } + +} diff --git a/src/main/java/org/elasticsearch/search/lookup/ScriptTerms.java b/src/main/java/org/elasticsearch/search/lookup/ScriptTerms.java new file mode 100644 index 0000000000000..3214aefff98fa --- /dev/null +++ 
b/src/main/java/org/elasticsearch/search/lookup/ScriptTerms.java @@ -0,0 +1,137 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.lookup; + +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.search.CollectionStatistics; +import org.elasticsearch.common.util.MinimalMap; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * Script interface to all information regarding a field. + * */ +public class ScriptTerms extends MinimalMap { + + /* + * TermsInfo Objects that represent the Terms are stored in this map when + * requested. Information such as frequency, doc frequency and positions + * information can be retrieved from the TermInfo objects in this map. + */ + private final Map terms = new HashMap(); + + // the name of this field + private final String fieldName; + + /* + * The holds the current reader. We need it to populate the field + * statistics. We just delegate all requests there + */ + private ShardTermsLookup shardTermsLookup; + + /* + * General field statistics such as number of documents containing the + * field. 
+ */ + private final CollectionStatistics fieldStats; + + /* + * Update posting lists in all TermInfo objects + */ + void setReader(AtomicReader reader) { + for (ScriptTerm ti : terms.values()) { + ti.setNextReader(reader); + } + } + + /* + * Represents a field in a document. Can be used to return information on + * statistics of this field. Information on specific terms in this field can + * be accessed by calling get(String term). + */ + public ScriptTerms(String fieldName, ShardTermsLookup shardTermsLookup) throws IOException { + + assert fieldName != null; + this.fieldName = fieldName; + + assert shardTermsLookup != null; + this.shardTermsLookup = shardTermsLookup; + + fieldStats = shardTermsLookup.getIndexSearcher().collectionStatistics(fieldName); + } + + /* get number of documents containing the field */ + public long docCount() throws IOException { + return fieldStats.docCount(); + } + + /* get sum of the number of words over all documents that were indexed */ + public long sumttf() throws IOException { + return fieldStats.sumTotalTermFreq(); + } + + /* + * get the sum of doc frequencies over all words that appear in any document + * that has the field. + */ + public long sumdf() throws IOException { + return fieldStats.sumDocFreq(); + } + + // TODO: might be good to get the field lengths here somewhere? + + /* + * Returns a TermInfo object that can be used to access information on + * specific terms. flags can be set as described in TermInfo. + * + * TODO: here might be potential for running time improvement? If we knew in + * advance which terms are requested, we could provide an array which the + * user could then iterate over. + */ + public ScriptTerm get(Object key, int flags) { + String termString = (String) key; + ScriptTerm termInfo = terms.get(termString); + // see if we initialized already... 
+ if (termInfo == null) { + termInfo = new ScriptTerm(termString, fieldName, shardTermsLookup, flags); + terms.put(termString, termInfo); + } + termInfo.validateFlags(flags); + return termInfo; + } + + /* + * Returns a TermInfo object that can be used to access information on + * specific terms. flags can be set as described in TermInfo. + */ + public ScriptTerm get(Object key) { + // per default, do not initialize any positions info + return get(key, ShardTermsLookup.FLAG_FREQUENCIES); + } + + public void setDocIdInTerms(int docId) { + for (ScriptTerm ti : terms.values()) { + ti.setNextDoc(docId); + } + } + +} diff --git a/src/main/java/org/elasticsearch/search/lookup/SearchLookup.java b/src/main/java/org/elasticsearch/search/lookup/SearchLookup.java index 9a8036038fba1..7a7cb38d7d2b6 100644 --- a/src/main/java/org/elasticsearch/search/lookup/SearchLookup.java +++ b/src/main/java/org/elasticsearch/search/lookup/SearchLookup.java @@ -36,14 +36,24 @@ public class SearchLookup { final SourceLookup sourceLookup; final FieldsLookup fieldsLookup; - + + final ShardTermsLookup shardTermsLookup; + final ImmutableMap asMap; public SearchLookup(MapperService mapperService, IndexFieldDataService fieldDataService, @Nullable String[] types) { + ImmutableMap.Builder builder = ImmutableMap.builder(); docMap = new DocLookup(mapperService, fieldDataService, types); sourceLookup = new SourceLookup(); fieldsLookup = new FieldsLookup(mapperService, types); - asMap = ImmutableMap.of("doc", docMap, "_doc", docMap, "_source", sourceLookup, "_fields", fieldsLookup); + shardTermsLookup = new ShardTermsLookup(builder); + + builder.put("doc", docMap); + builder.put("_doc", docMap); + builder.put("_source", sourceLookup); + builder.put("_fields", fieldsLookup); + builder.put("_shard", shardTermsLookup); + asMap = builder.build(); } public ImmutableMap asMap() { @@ -53,6 +63,10 @@ public ImmutableMap asMap() { public SourceLookup source() { return this.sourceLookup; } + + public 
ShardTermsLookup shardTerms() { + return this.shardTermsLookup; + } public FieldsLookup fields() { return this.fieldsLookup; @@ -70,11 +84,13 @@ public void setNextReader(AtomicReaderContext context) { docMap.setNextReader(context); sourceLookup.setNextReader(context); fieldsLookup.setNextReader(context); + shardTermsLookup.setNextReader(context); } public void setNextDocId(int docId) { docMap.setNextDocId(docId); sourceLookup.setNextDocId(docId); fieldsLookup.setNextDocId(docId); + shardTermsLookup.setNextDocId(docId); } } diff --git a/src/main/java/org/elasticsearch/search/lookup/ShardTermsLookup.java b/src/main/java/org/elasticsearch/search/lookup/ShardTermsLookup.java new file mode 100644 index 0000000000000..7913dad4e9174 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/lookup/ShardTermsLookup.java @@ -0,0 +1,242 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.lookup; + +import com.google.common.collect.ImmutableMap.Builder; +import org.apache.lucene.index.*; +import org.apache.lucene.search.IndexSearcher; +import org.elasticsearch.ElasticSearchException; +import org.elasticsearch.common.util.MinimalMap; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class ShardTermsLookup extends MinimalMap { + + /** + * Flag to pass to {@link ScriptTerms#get(String, flags)} if you require + * offsets in the returned {@link ScriptTerm}. + */ + public static final int FLAG_OFFSETS = 2; + + /** + * Flag to pass to {@link ScriptTerms#get(String, flags)} if you require + * payloads in the returned {@link ScriptTerm}. + */ + public static final int FLAG_PAYLOADS = 4; + + /** + * Flag to pass to {@link ScriptTerms#get(String, flags)} if you require + * frequencies in the returned {@link ScriptTerm}. Frequencies might be + * returned anyway for some Lucene codecs even if this flag is not set. + */ + public static final int FLAG_FREQUENCIES = 8; + + /** + * Flag to pass to {@link ScriptTerms#get(String, flags)} if you require + * positions in the returned {@link ScriptTerm}. + */ + public static final int FLAG_POSITIONS = 16; + + /** + * Flag to pass to {@link ScriptTerms#get(String, flags)} if you want the + * returned {@link ScriptTerm} to be cached. + */ + public static final int FLAG_CACHE = 32; + + // Current reader from which we can get the term vectors. No info on term + // and field statistics. 
+ private AtomicReader reader; + + // The parent reader from which we can get proper field and term + // statistics + private CompositeReader parentReader; + + // we need this later to get the field and term statistics of the shard + private IndexSearcher indexSearcher; + + // we need this later to get the term statistics of the shard + private IndexReaderContext indexReaderContext; + + // current docId + private int docId = -1; + + // stores the objects that are used in the script. we maintain this map + // because we do not want to re-initialize the objects each time a field is + // accessed + private final Map scriptTermsPerField = new HashMap(); + + // number of documents per shard. cached here because the computation is + // expensive + private int numDocs = -1; + + // the maximum doc number of the shard. + private int maxDoc = -1; + + // number of deleted documents per shard. cached here because the + // computation is expensive + private int numDeletedDocs = -1; + + public int numDocs() { + if (numDocs == -1) { + numDocs = parentReader.numDocs(); + } + return numDocs; + } + + public int maxDoc() { + if (maxDoc == -1) { + maxDoc = parentReader.maxDoc(); + } + return maxDoc; + } + + public int numDeletedDocs() { + if (numDeletedDocs == -1) { + numDeletedDocs = parentReader.numDeletedDocs(); + } + return numDeletedDocs; + } + + public ShardTermsLookup(Builder builder) { + builder.put("_FREQUENCIES", ShardTermsLookup.FLAG_FREQUENCIES); + builder.put("_POSITIONS", ShardTermsLookup.FLAG_POSITIONS); + builder.put("_OFFSETS", ShardTermsLookup.FLAG_OFFSETS); + builder.put("_PAYLOADS", ShardTermsLookup.FLAG_PAYLOADS); + builder.put("_CACHE", ShardTermsLookup.FLAG_CACHE); + } + + public void setNextReader(AtomicReaderContext context) { + if (reader == context.reader()) { // if we are called with the same + // reader, nothing to do + return; + } + // check if we have to invalidate all field and shard stats - only if + // parent reader changed + if (context.parent != 
null) { + if (parentReader == null) { + parentReader = context.parent.reader(); + indexSearcher = new IndexSearcher(parentReader); + indexReaderContext = context.parent; + } else { + // parent reader may only be set once. TODO we could also call + // scriptFields.clear() here instead of assertion just to be on + // the safe side + assert (parentReader == context.parent.reader()); + } + } else { + assert parentReader == null; + } + reader = context.reader(); + docId = -1; + setReaderInFields(); + } + + protected void setReaderInFields() { + for (ScriptTerms stat : scriptTermsPerField.values()) { + stat.setReader(reader); + } + } + + public void setNextDocId(int docId) { + if (this.docId == docId) { // if we are called with the same docId, + // nothing to do + return; + } + // We assume that docs are processed in ascending order of id. If this + // is not the case, we would have to re-initialize all posting lists in + // ScriptTerm. TODO: Instead of assert we could also call + // setReaderInFields(); here? + if (this.docId > docId) { + // This might happen if the same SearchLookup is used in different + // phases, such as score and fetch phase. + // In this case we do not want to re-initialize posting list etc. + // because we do not even know if term and field statistics will be + // needed in this new phase. + // Therefore we just remove all ScriptFields. + scriptTermsPerField.clear(); + } + this.docId = docId; + setNextDocIdInFields(); + } + + protected void setNextDocIdInFields() { + for (ScriptTerms stat : scriptTermsPerField.values()) { + stat.setDocIdInTerms(this.docId); + } + } + + /* + * TODO: here might be potential for running time improvement? If we knew in + * advance which terms are requested, we could provide an array which the + * user could then iterate over. 
+ */ + @Override + public ScriptTerms get(Object key) { + String stringField = (String) key; + ScriptTerms scriptField = scriptTermsPerField.get(key); + if (scriptField == null) { + try { + scriptField = new ScriptTerms(stringField, this); + scriptTermsPerField.put(stringField, scriptField); + } catch (IOException e) { + throw new ElasticSearchException(e.getMessage()); + } + } + return scriptField; + } + + /* + * Get the lucene term vectors. See + * https://lucene.apache.org/core/4_0_0/core/org/apache/lucene/index/Fields.html + * * + */ + public Fields termVectors() throws IOException { + assert reader != null; + return reader.getTermVectors(docId); + } + + AtomicReader getReader() { + return reader; + } + + public int getDocId() { + return docId; + } + + public IndexReader getParentReader() { + if (parentReader == null) { + return reader; + } + return parentReader; + } + + public IndexSearcher getIndexSearcher() { + if (indexSearcher == null) { + return new IndexSearcher(reader); + } + return indexSearcher; + } + + public IndexReaderContext getReaderContext() { + return indexReaderContext; + } +} diff --git a/src/main/java/org/elasticsearch/search/lookup/TermPosition.java b/src/main/java/org/elasticsearch/search/lookup/TermPosition.java new file mode 100644 index 0000000000000..9a84acd8d5b6f --- /dev/null +++ b/src/main/java/org/elasticsearch/search/lookup/TermPosition.java @@ -0,0 +1,59 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.lookup; + +import org.apache.lucene.analysis.payloads.PayloadHelper; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.UnicodeUtil; + +public class TermPosition { + + public int position = -1; + public int startOffset = -1; + public int endOffset = -1; + public BytesRef payload; + private CharsRef spare = new CharsRef(0); + + public String payloadAsString() { + if (payload != null && payload.length != 0) { + UnicodeUtil.UTF8toUTF16(payload.bytes, payload.offset, payload.length, spare); + return spare.toString(); + } else { + return null; + } + } + + public float payloadAsFloat(float defaultMissing) { + if (payload != null && payload.length != 0) { + return PayloadHelper.decodeFloat(payload.bytes, payload.offset); + } else { + return defaultMissing; + } + } + + public int payloadAsInt(int defaultMissing) { + if (payload != null && payload.length != 0) { + return PayloadHelper.decodeInt(payload.bytes, payload.offset); + } else { + return defaultMissing; + } + } +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/BasicScriptBenchmark.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/BasicScriptBenchmark.java new file mode 100644 index 0000000000000..09c8886c5cf81 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/BasicScriptBenchmark.java @@ -0,0 +1,338 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.benchmark.scripts.score; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; +import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.SearchType; +import org.elasticsearch.client.Client; +import org.elasticsearch.common.StopWatch; +import org.elasticsearch.common.lucene.search.function.CombineFunction; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.index.query.FilterBuilders; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.index.query.functionscore.script.ScriptScoreFunctionBuilder; +import org.joda.time.DateTime; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.IOException; +import java.math.BigInteger; +import java.security.SecureRandom; +import java.util.*; +import java.util.Map.Entry; + +import static org.elasticsearch.client.Requests.searchRequest; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static 
org.elasticsearch.index.query.QueryBuilders.functionScoreQuery; +import static org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders.scriptFunction; +import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource; + +public class BasicScriptBenchmark { + + public static class RequestInfo { + public RequestInfo(SearchRequest source, int i) { + request = source; + numTerms = i; + } + + SearchRequest request; + int numTerms; + } + + public static class Results { + public static final String TIME_PER_DOCIN_MILLIS = "timePerDocinMillis"; + public static final String NUM_TERMS = "numTerms"; + public static final String NUM_DOCS = "numDocs"; + public static final String TIME_PER_QUERY_IN_SEC = "timePerQueryInSec"; + public static final String TOTAL_TIME_IN_SEC = "totalTimeInSec"; + Double[] resultSeconds; + Double[] resultMSPerQuery; + Long[] numDocs; + Integer[] numTerms; + Double[] timePerDoc; + String label; + String description; + public String lineStyle; + public String color; + + void init(int numVariations, String label, String description, String color, String lineStyle) { + resultSeconds = new Double[numVariations]; + resultMSPerQuery = new Double[numVariations]; + numDocs = new Long[numVariations]; + numTerms = new Integer[numVariations]; + timePerDoc = new Double[numVariations]; + this.label = label; + this.description = description; + this.color = color; + this.lineStyle = lineStyle; + } + + void set(SearchResponse searchResponse, StopWatch stopWatch, String message, int maxIter, int which, int numTerms) { + resultSeconds[which] = (double) ((double) stopWatch.lastTaskTime().getMillis() / (double) 1000); + resultMSPerQuery[which] = (double) ((double) stopWatch.lastTaskTime().secondsFrac() / (double) maxIter); + numDocs[which] = searchResponse.getHits().totalHits(); + this.numTerms[which] = numTerms; + timePerDoc[which] = resultMSPerQuery[which] / numDocs[which]; + } + + public void printResults(BufferedWriter writer) throws 
IOException { + String comma = (writer == null) ? "" : ";"; + String results = description + "\n" + Results.TOTAL_TIME_IN_SEC + " = " + getResultArray(resultSeconds) + comma + "\n" + + Results.TIME_PER_QUERY_IN_SEC + " = " + getResultArray(resultMSPerQuery) + comma + "\n" + Results.NUM_DOCS + " = " + + getResultArray(numDocs) + comma + "\n" + Results.NUM_TERMS + " = " + getResultArray(numTerms) + comma + "\n" + + Results.TIME_PER_DOCIN_MILLIS + " = " + getResultArray(timePerDoc) + comma + "\n"; + if (writer != null) { + writer.write(results); + } else { + System.out.println(results); + } + + } + + private String getResultArray(Object[] resultArray) { + String result = "["; + for (int i = 0; i < resultArray.length; i++) { + result += resultArray[i].toString(); + if (i != resultArray.length - 1) { + result += ","; + } + } + result += "]"; + return result; + } + } + + public BasicScriptBenchmark() { + } + + static List termsList = new ArrayList(); + + static void init(int numTerms) { + SecureRandom random = new SecureRandom(); + random.setSeed(1); + termsList.clear(); + for (int i = 0; i < numTerms; i++) { + String term = new BigInteger(512, random).toString(32); + termsList.add(term); + } + + } + + static String[] getTerms(int numTerms) { + assert numTerms <= termsList.size(); + String[] terms = new String[numTerms]; + for (int i = 0; i < numTerms; i++) { + terms[i] = termsList.get(i); + } + return terms; + } + + public static void writeHelperFunction() throws IOException { + File file = new File("addToPlot.m"); + BufferedWriter out = Files.newWriter(file, Charsets.UTF_8); + + out.write("function handle = addToPlot(numTerms, perDoc, color, linestyle, linewidth)\n" + "handle = line(numTerms, perDoc);\n" + + "set(handle, 'color', color);\n" + "set(handle, 'linestyle',linestyle);\n" + "set(handle, 'LineWidth',linewidth);\n" + + "end\n"); + out.close(); + } + + public static void printOctaveScript(List allResults, String[] args) throws IOException { + if (args.length == 
0) { + return; + } + BufferedWriter out = null; + try { + File file = new File(args[0]); + out = Files.newWriter(file, Charsets.UTF_8); + + out.write("#! /usr/local/bin/octave -qf"); + out.write("\n\n\n\n"); + out.write("######################################\n"); + out.write("# Octave script for plotting results\n"); + String filename = "scriptScoreBenchmark" + new DateTime().toString(); + out.write("#Call '" + args[0] + "' from the command line. The plot is then in " + filename + "\n\n"); + + out.write("handleArray = [];\n tagArray = [];\n plot([]);\n hold on;\n"); + for (Results result : allResults) { + out.write("\n"); + out.write("# " + result.description); + result.printResults(out); + out.write("handleArray = [handleArray, addToPlot(" + Results.NUM_TERMS + ", " + Results.TIME_PER_DOCIN_MILLIS + ", '" + + result.color + "','" + result.lineStyle + "',5)];\n"); + out.write("tagArray = [tagArray; '" + result.label + "'];\n"); + out.write("\n"); + } + + out.write("xlabel(\'number of query terms');"); + out.write("ylabel(\'query time per document');"); + + out.write("legend(handleArray,tagArray);\n"); + + out.write("saveas(gcf,'" + filename + ".png','png')\n"); + out.write("hold off;\n\n"); + } catch (IOException e) { + System.err.println("Error: " + e.getMessage()); + } finally { + if (out != null) { + out.close(); + } + } + writeHelperFunction(); + } + + static void printResult(SearchResponse searchResponse, StopWatch stopWatch, String queryInfo) { + System.out.println("--> Searching with " + queryInfo + " took " + stopWatch.lastTaskTime() + ", per query " + + (stopWatch.lastTaskTime().secondsFrac() / 100) + " for " + searchResponse.getHits().totalHits() + " docs"); + } + + static void indexData(long numDocs, Client client, boolean randomizeTerms) throws IOException { + try { + client.admin().indices().prepareDelete("test").execute().actionGet(); + } catch (Throwable t) { + // index might exist already, in this case we do nothing TODO: make + // saver in general 
+ } + + XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties") + .startObject("text").field("type", "string").field("index_options", "offsets").field("analyzer", "payload_float") + .endObject().endObject().endObject().endObject(); + client.admin() + .indices() + .prepareCreate("test") + .addMapping("type1", mapping) + .setSettings( + ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.payload_float.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.payload_float.filter", "delimited_float") + .put("index.analysis.filter.delimited_float.delimiter", "|") + .put("index.analysis.filter.delimited_float.encoding", "float") + .put("index.analysis.filter.delimited_float.type", "delimited_payload_filter") + .put("index.number_of_replicas", 0).put("index.number_of_shards", 1)).execute().actionGet(); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().setTimeout("10s").execute().actionGet(); + BulkRequestBuilder bulkRequest = client.prepareBulk(); + Random random = new Random(1); + for (int i = 0; i < numDocs; i++) { + + bulkRequest.add(client.prepareIndex().setType("type1").setIndex("test") + .setSource(jsonBuilder().startObject().field("text", randomText(random, randomizeTerms)).endObject())); + if (i % 1000 == 0) { + bulkRequest.execute().actionGet(); + bulkRequest = client.prepareBulk(); + } + } + bulkRequest.execute().actionGet(); + client.admin().indices().prepareRefresh("test").execute().actionGet(); + client.admin().indices().prepareFlush("test").setFull(true).execute().actionGet(); + System.out.println("Done indexing " + numDocs + " documents"); + + } + + private static String randomText(Random random, boolean randomizeTerms) { + String text = ""; + for (int i = 0; i < termsList.size(); i++) { + if (random.nextInt(5) == 3 || !randomizeTerms) { + text = text + " " + termsList.get(i) + "|1"; + } + } + return text; + } + + static void printTimings(SearchResponse 
searchResponse, StopWatch stopWatch, String message, int maxIter) { + System.out.println(message); + System.out.println(stopWatch.lastTaskTime() + ", " + (stopWatch.lastTaskTime().secondsFrac() / maxIter) + ", " + + searchResponse.getHits().totalHits() + ", " + + (stopWatch.lastTaskTime().secondsFrac() / (maxIter + searchResponse.getHits().totalHits()))); + } + + static List> initTermQueries(int minTerms, int maxTerms) { + List> termSearchRequests = new ArrayList>(); + for (int nTerms = minTerms; nTerms < maxTerms; nTerms++) { + Map params = new HashMap(); + String[] terms = getTerms(nTerms + 1); + params.put("text", terms); + SearchRequest request = searchRequest().searchType(SearchType.QUERY_THEN_FETCH).source( + searchSource().explain(false).size(0).query(QueryBuilders.termsQuery("text", terms))); + String infoString = "Results for term query with " + (nTerms + 1) + " terms:"; + termSearchRequests.add(new AbstractMap.SimpleEntry(infoString, new RequestInfo(request, nTerms + 1))); + } + return termSearchRequests; + } + + static List> initNativeSearchRequests(int minTerms, int maxTerms, String script, boolean langNative) { + List> nativeSearchRequests = new ArrayList>(); + for (int nTerms = minTerms; nTerms < maxTerms; nTerms++) { + Map params = new HashMap(); + String[] terms = getTerms(nTerms + 1); + params.put("text", terms); + String infoString = "Results for native script with " + (nTerms + 1) + " terms:"; + ScriptScoreFunctionBuilder scriptFunction = (langNative == true) ? 
scriptFunction(script, "native", params) : scriptFunction( + script, params); + SearchRequest request = searchRequest().searchType(SearchType.QUERY_THEN_FETCH).source( + searchSource() + .explain(false) + .size(0) + .query(functionScoreQuery(FilterBuilders.termsFilter("text", terms), scriptFunction).boostMode( + CombineFunction.REPLACE))); + nativeSearchRequests.add(new AbstractMap.SimpleEntry(infoString, new RequestInfo(request, nTerms + 1))); + } + return nativeSearchRequests; + } + + static List> initScriptMatchAllSearchRequests(String script, boolean langNative) { + List> nativeSearchRequests = new ArrayList>(); + String infoString = "Results for constant score script:"; + ScriptScoreFunctionBuilder scriptFunction = (langNative == true) ? scriptFunction(script, "native") : scriptFunction(script); + SearchRequest request = searchRequest().searchType(SearchType.QUERY_THEN_FETCH).source( + searchSource().explain(false).size(0) + .query(functionScoreQuery(FilterBuilders.matchAllFilter(), scriptFunction).boostMode(CombineFunction.REPLACE))); + nativeSearchRequests.add(new AbstractMap.SimpleEntry(infoString, new RequestInfo(request, 0))); + + return nativeSearchRequests; + } + + static void runBenchmark(Client client, int maxIter, Results results, List> nativeSearchRequests, + int minTerms, int warmerIter) throws IOException { + int counter = 0; + for (Entry entry : nativeSearchRequests) { + SearchResponse searchResponse = null; + // warm up + for (int i = 0; i < warmerIter; i++) { + searchResponse = client.search(entry.getValue().request).actionGet(); + } + System.gc(); + // run benchmark + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + for (int i = 0; i < maxIter; i++) { + searchResponse = client.search(entry.getValue().request).actionGet(); + } + stopWatch.stop(); + results.set(searchResponse, stopWatch, entry.getKey(), maxIter, counter, entry.getValue().numTerms); + counter++; + } + results.printResults(null); + } +} \ No newline at end of file 
diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/ScriptsConstantScoreBenchmark.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/ScriptsConstantScoreBenchmark.java new file mode 100644 index 0000000000000..9d19efa45b8f9 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/ScriptsConstantScoreBenchmark.java @@ -0,0 +1,102 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.benchmark.scripts.score; + +import org.elasticsearch.benchmark.scripts.score.plugin.NativeScriptExamplesPlugin; +import org.elasticsearch.benchmark.scripts.score.script.NativeConstantForLoopScoreScript; +import org.elasticsearch.benchmark.scripts.score.script.NativeConstantScoreScript; +import org.elasticsearch.client.Client; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.node.Node; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map.Entry; + +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.node.NodeBuilder.nodeBuilder; + +/** + * + */ +public class ScriptsConstantScoreBenchmark extends BasicScriptBenchmark { + + public static void main(String[] args) throws Exception { + + int minTerms = 49; + int maxTerms = 50; + int maxIter = 1000; + int warmerIter = 1000; + + init(maxTerms); + List allResults = new ArrayList(); + Settings settings = settingsBuilder().put("plugin.types", NativeScriptExamplesPlugin.class.getName()).build(); + + String clusterName = ScriptsConstantScoreBenchmark.class.getSimpleName(); + Node node1 = nodeBuilder().clusterName(clusterName).settings(settingsBuilder().put(settings).put("name", "node1")).node(); + Client client = node1.client(); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().setTimeout("10s").execute().actionGet(); + + indexData(10000, client, true); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().setTimeout("10s").execute().actionGet(); + + Results results = new Results(); + + results.init(maxTerms - minTerms, "native const script score (log(2) 10X)", + "Results for native const script score with score = log(2) 10X:", "black", "-."); + // init script searches + List> searchRequests = initScriptMatchAllSearchRequests( + NativeConstantForLoopScoreScript.NATIVE_CONSTANT_FOR_LOOP_SCRIPT_SCORE, true); + // run actual benchmark + 
runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + // init native script searches + results = new Results(); + results.init(maxTerms - minTerms, "mvel const (log(2) 10X)", "Results for mvel const score = log(2) 10X:", "red", "-."); + searchRequests = initScriptMatchAllSearchRequests("score = 0; for (int i=0; i<10;i++) {score = score + log(2);} return score", + false); + // run actual benchmark + runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + results = new Results(); + results.init(maxTerms - minTerms, "native const script score (2)", "Results for native const script score with score = 2:", + "black", ":"); + // init native script searches + searchRequests = initScriptMatchAllSearchRequests(NativeConstantScoreScript.NATIVE_CONSTANT_SCRIPT_SCORE, true); + // run actual benchmark + runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + results = new Results(); + results.init(maxTerms - minTerms, "mvel const (2)", "Results for mvel const score = 2:", "red", "--"); + // init native script searches + searchRequests = initScriptMatchAllSearchRequests("2", false); + // run actual benchmark + runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + printOctaveScript(allResults, args); + + client.close(); + node1.close(); + } +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/ScriptsScoreBenchmark.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/ScriptsScoreBenchmark.java new file mode 100644 index 0000000000000..5083541dfd6cc --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/ScriptsScoreBenchmark.java @@ -0,0 +1,137 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.benchmark.scripts.score; + +import org.elasticsearch.benchmark.scripts.score.plugin.NativeScriptExamplesPlugin; +import org.elasticsearch.benchmark.scripts.score.script.NativeNaiveTFIDFScoreScript; +import org.elasticsearch.client.Client; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.node.Node; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map.Entry; + +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.node.NodeBuilder.nodeBuilder; + +/** + * + */ +public class ScriptsScoreBenchmark extends BasicScriptBenchmark { + + public static void main(String[] args) throws Exception { + + int minTerms = 1; + int maxTerms = 50; + int maxIter = 100; + int warmerIter = 10; + + boolean runMVEL = false; + init(maxTerms); + List allResults = new ArrayList(); + Settings settings = settingsBuilder().put("plugin.types", NativeScriptExamplesPlugin.class.getName()).build(); + + String clusterName = ScriptsScoreBenchmark.class.getSimpleName(); + Node node1 = nodeBuilder().clusterName(clusterName).settings(settingsBuilder().put(settings).put("name", "node1")).node(); + Client client = node1.client(); + 
client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().setTimeout("10s").execute().actionGet(); + + indexData(10000, client, false); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().setTimeout("10s").execute().actionGet(); + + Results results = new Results(); + results.init(maxTerms - minTerms, "native tfidf script score dense posting list", + "Results for native script score with dense posting list:", "black", "--"); + // init native script searches + List> searchRequests = initNativeSearchRequests(minTerms, maxTerms, + NativeNaiveTFIDFScoreScript.NATIVE_NAIVE_TFIDF_SCRIPT_SCORE, true); + // run actual benchmark + runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + results = new Results(); + + results.init(maxTerms - minTerms, "term query dense posting list", "Results for term query with dense posting lists:", "green", + "--"); + // init term queries + searchRequests = initTermQueries(minTerms, maxTerms); + // run actual benchmark + runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + if (runMVEL) { + + results = new Results(); + results.init(maxTerms - minTerms, "mvel tfidf dense posting list", "Results for mvel score with dense posting list:", "red", + "--"); + // init native script searches + searchRequests = initNativeSearchRequests( + minTerms, + maxTerms, + "score = 0.0; fi= _terminfo[\"text\"]; for(i=0; i allResults = new ArrayList(); + Settings settings = settingsBuilder().put("plugin.types", NativeScriptExamplesPlugin.class.getName()).build(); + + String clusterName = ScriptsScoreBenchmark.class.getSimpleName(); + Node node1 = nodeBuilder().clusterName(clusterName).settings(settingsBuilder().put(settings).put("name", "node1")).node(); + Client client = node1.client(); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().setTimeout("10s").execute().actionGet(); + + indexData(10000, 
client, false); + client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().setTimeout("10s").execute().actionGet(); + + Results results = new Results(); + // init script searches + results.init(maxTerms - minTerms, "native payload sum script score", "Results for native script score:", "green", ":"); + List> searchRequests = initNativeSearchRequests(minTerms, maxTerms, + NativePayloadSumScoreScript.NATIVE_PAYLOAD_SUM_SCRIPT_SCORE, true); + // run actual benchmark + runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + results = new Results(); + // init script searches + results.init(maxTerms - minTerms, "native payload sum script score no record", "Results for native script score:", "black", ":"); + searchRequests = initNativeSearchRequests(minTerms, maxTerms, + NativePayloadSumNoRecordScoreScript.NATIVE_PAYLOAD_SUM_NO_RECORD_SCRIPT_SCORE, true); + // run actual benchmark + runBenchmark(client, maxIter, results, searchRequests, minTerms, warmerIter); + allResults.add(results); + + printOctaveScript(allResults, args); + + client.close(); + node1.close(); + } + +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/plugin/NativeScriptExamplesPlugin.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/plugin/NativeScriptExamplesPlugin.java new file mode 100644 index 0000000000000..2575a318dfb57 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/plugin/NativeScriptExamplesPlugin.java @@ -0,0 +1,27 @@ +package org.elasticsearch.benchmark.scripts.score.plugin; + +import org.elasticsearch.benchmark.scripts.score.script.*; +import org.elasticsearch.plugins.AbstractPlugin; +import org.elasticsearch.script.ScriptModule; + +public class NativeScriptExamplesPlugin extends AbstractPlugin { + + + @Override + public String name() { + return "native-script-example"; + } + + @Override + public String description() { + return "Native script examples"; + } + + 
public void onModule(ScriptModule module) { + module.registerScript(NativeNaiveTFIDFScoreScript.NATIVE_NAIVE_TFIDF_SCRIPT_SCORE, NativeNaiveTFIDFScoreScript.Factory.class); + module.registerScript(NativeConstantForLoopScoreScript.NATIVE_CONSTANT_FOR_LOOP_SCRIPT_SCORE, NativeConstantForLoopScoreScript.Factory.class); + module.registerScript(NativeConstantScoreScript.NATIVE_CONSTANT_SCRIPT_SCORE, NativeConstantScoreScript.Factory.class); + module.registerScript(NativePayloadSumScoreScript.NATIVE_PAYLOAD_SUM_SCRIPT_SCORE, NativePayloadSumScoreScript.Factory.class); + module.registerScript(NativePayloadSumNoRecordScoreScript.NATIVE_PAYLOAD_SUM_NO_RECORD_SCRIPT_SCORE, NativePayloadSumNoRecordScoreScript.Factory.class); + } +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeConstantForLoopScoreScript.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeConstantForLoopScoreScript.java new file mode 100644 index 0000000000000..3dc9f0d2d6ed7 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeConstantForLoopScoreScript.java @@ -0,0 +1,35 @@ +package org.elasticsearch.benchmark.scripts.score.script; + +import org.elasticsearch.common.Nullable; +import org.elasticsearch.script.AbstractSearchScript; +import org.elasticsearch.script.ExecutableScript; +import org.elasticsearch.script.NativeScriptFactory; + +import java.util.Map; + +public class NativeConstantForLoopScoreScript extends AbstractSearchScript { + + public static final String NATIVE_CONSTANT_FOR_LOOP_SCRIPT_SCORE = "native_constant_for_loop_script_score"; + + public static class Factory implements NativeScriptFactory { + + @Override + public ExecutableScript newScript(@Nullable Map params) { + return new NativeConstantForLoopScoreScript(params); + } + } + + private NativeConstantForLoopScoreScript(Map params) { + + } + + @Override + public Object run() { + float score = 0; + for (int i = 0; i < 10; i++) { + score += 
Math.log(2); + } + return score; + } + +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeConstantScoreScript.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeConstantScoreScript.java new file mode 100644 index 0000000000000..e37e849844ed0 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeConstantScoreScript.java @@ -0,0 +1,30 @@ +package org.elasticsearch.benchmark.scripts.score.script; + +import org.elasticsearch.common.Nullable; +import org.elasticsearch.script.AbstractSearchScript; +import org.elasticsearch.script.ExecutableScript; +import org.elasticsearch.script.NativeScriptFactory; + +import java.util.Map; + +public class NativeConstantScoreScript extends AbstractSearchScript { + + public static final String NATIVE_CONSTANT_SCRIPT_SCORE = "native_constant_script_score"; + + public static class Factory implements NativeScriptFactory { + + @Override + public ExecutableScript newScript(@Nullable Map params) { + return new NativeConstantScoreScript(); + } + } + + private NativeConstantScoreScript() { + } + + @Override + public Object run() { + return 2; + } + +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeNaiveTFIDFScoreScript.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeNaiveTFIDFScoreScript.java new file mode 100644 index 0000000000000..0bc6ab50876e5 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativeNaiveTFIDFScoreScript.java @@ -0,0 +1,55 @@ +package org.elasticsearch.benchmark.scripts.score.script; + +import org.elasticsearch.common.Nullable; +import org.elasticsearch.script.AbstractSearchScript; +import org.elasticsearch.script.ExecutableScript; +import org.elasticsearch.script.NativeScriptFactory; +import org.elasticsearch.search.lookup.ScriptTerm; +import org.elasticsearch.search.lookup.ScriptTerms; + +import java.io.IOException; +import 
java.util.ArrayList; +import java.util.Map; + +public class NativeNaiveTFIDFScoreScript extends AbstractSearchScript { + + public static final String NATIVE_NAIVE_TFIDF_SCRIPT_SCORE = "native_naive_tfidf_script_score"; + String field = null; + String[] terms = null; + + public static class Factory implements NativeScriptFactory { + + @Override + public ExecutableScript newScript(@Nullable Map params) { + return new NativeNaiveTFIDFScoreScript(params); + } + } + + private NativeNaiveTFIDFScoreScript(Map params) { + params.entrySet(); + terms = new String[params.size()]; + field = params.keySet().iterator().next(); + Object o = params.get(field); + ArrayList arrayList = (ArrayList) o; + terms = arrayList.toArray(new String[arrayList.size()]); + + } + + @Override + public Object run() { + float score = 0; + ScriptTerms scriptTerms = shardTerms().get(field); + for (int i = 0; i < terms.length; i++) { + ScriptTerm scriptTerm = scriptTerms.get(terms[i]); + try { + if (scriptTerm.tf() != 0) { + score += scriptTerm.tf() * scriptTerms.docCount() / scriptTerm.df(); + } + } catch (IOException e) { + throw new RuntimeException(); + } + } + return score; + } + +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativePayloadSumNoRecordScoreScript.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativePayloadSumNoRecordScoreScript.java new file mode 100644 index 0000000000000..dde6e8b565f23 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativePayloadSumNoRecordScoreScript.java @@ -0,0 +1,53 @@ +package org.elasticsearch.benchmark.scripts.score.script; + +import org.elasticsearch.search.lookup.ScriptTerm; +import org.elasticsearch.search.lookup.ScriptTerms; +import org.elasticsearch.search.lookup.ShardTermsLookup; +import org.elasticsearch.search.lookup.TermPosition; + +import org.elasticsearch.common.Nullable; +import org.elasticsearch.script.AbstractSearchScript; +import 
org.elasticsearch.script.ExecutableScript; +import org.elasticsearch.script.NativeScriptFactory; + +import java.util.ArrayList; +import java.util.Map; + +public class NativePayloadSumNoRecordScoreScript extends AbstractSearchScript { + + public static final String NATIVE_PAYLOAD_SUM_NO_RECORD_SCRIPT_SCORE = "native_payload_sum_no_record_script_score"; + String field = null; + String[] terms = null; + + public static class Factory implements NativeScriptFactory { + + @Override + public ExecutableScript newScript(@Nullable Map params) { + return new NativePayloadSumNoRecordScoreScript(params); + } + } + + private NativePayloadSumNoRecordScoreScript(Map params) { + params.entrySet(); + terms = new String[params.size()]; + field = params.keySet().iterator().next(); + Object o = params.get(field); + ArrayList arrayList = (ArrayList) o; + terms = arrayList.toArray(new String[arrayList.size()]); + + } + + @Override + public Object run() { + float score = 0; + ScriptTerms scriptTerms = shardTerms().get(field); + for (int i = 0; i < terms.length; i++) { + ScriptTerm scriptTerm = scriptTerms.get(terms[i], ShardTermsLookup.FLAG_PAYLOADS); + for (TermPosition pos : scriptTerm) { + score += pos.payloadAsFloat(0); + } + } + return score; + } + +} diff --git a/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativePayloadSumScoreScript.java b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativePayloadSumScoreScript.java new file mode 100644 index 0000000000000..2d0443d791343 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/scripts/score/script/NativePayloadSumScoreScript.java @@ -0,0 +1,53 @@ +package org.elasticsearch.benchmark.scripts.score.script; + +import org.elasticsearch.search.lookup.ScriptTerm; +import org.elasticsearch.search.lookup.ScriptTerms; +import org.elasticsearch.search.lookup.ShardTermsLookup; +import org.elasticsearch.search.lookup.TermPosition; + +import org.elasticsearch.common.Nullable; +import 
org.elasticsearch.script.AbstractSearchScript; +import org.elasticsearch.script.ExecutableScript; +import org.elasticsearch.script.NativeScriptFactory; + +import java.util.ArrayList; +import java.util.Map; + +public class NativePayloadSumScoreScript extends AbstractSearchScript { + + public static final String NATIVE_PAYLOAD_SUM_SCRIPT_SCORE = "native_payload_sum_script_score"; + String field = null; + String[] terms = null; + + public static class Factory implements NativeScriptFactory { + + @Override + public ExecutableScript newScript(@Nullable Map params) { + return new NativePayloadSumScoreScript(params); + } + } + + private NativePayloadSumScoreScript(Map params) { + params.entrySet(); + terms = new String[params.size()]; + field = params.keySet().iterator().next(); + Object o = params.get(field); + ArrayList arrayList = (ArrayList) o; + terms = arrayList.toArray(new String[arrayList.size()]); + + } + + @Override + public Object run() { + float score = 0; + ScriptTerms scriptTerms = shardTerms().get(field); + for (int i = 0; i < terms.length; i++) { + ScriptTerm scriptTerm = scriptTerms.get(terms[i], ShardTermsLookup.FLAG_PAYLOADS | ShardTermsLookup.FLAG_CACHE); + for (TermPosition pos : scriptTerm) { + score += pos.payloadAsFloat(0); + } + } + return score; + } + +} diff --git a/src/test/java/org/elasticsearch/script/ShardLookupInScriptTests.java b/src/test/java/org/elasticsearch/script/ShardLookupInScriptTests.java new file mode 100644 index 0000000000000..e7e7d9f3c4766 --- /dev/null +++ b/src/test/java/org/elasticsearch/script/ShardLookupInScriptTests.java @@ -0,0 +1,625 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.script; + +import org.elasticsearch.action.search.SearchPhaseExecutionException; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.ShardSearchFailure; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.index.query.functionscore.ScoreFunctionBuilders; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.test.ElasticsearchIntegrationTest; +import org.elasticsearch.test.hamcrest.ElasticsearchAssertions; +import org.hamcrest.Matchers; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +import static org.hamcrest.Matchers.equalTo; + +public class ShardLookupInScriptTests extends ElasticsearchIntegrationTest { + + String includeAllFlag = "_FREQUENCIES | _OFFSETS | _PAYLOADS | _POSITIONS | _CACHE"; + String includeAllWithoutRecordFlag = "_FREQUENCIES | _OFFSETS | _PAYLOADS | _POSITIONS "; + private HashMap> expectedEndOffsetsArray; + private HashMap> expectedPayloadsArray; + private HashMap> expectedPositionsArray; + private HashMap> emptyArray; + private HashMap> expectedStartOffsetsArray; + + void initTestData() throws InterruptedException, ExecutionException, IOException { + emptyArray = new 
HashMap>(); + List empty1 = new ArrayList(); + empty1.add(-1); + empty1.add(-1); + emptyArray.put("1", empty1); + List empty2 = new ArrayList(); + empty2.add(-1); + empty2.add(-1); + emptyArray.put("2", empty2); + List empty3 = new ArrayList(); + empty3.add(-1); + empty3.add(-1); + emptyArray.put("3", empty3); + + expectedPositionsArray = new HashMap>(); + + List pos1 = new ArrayList(); + pos1.add(1); + pos1.add(2); + expectedPositionsArray.put("1", pos1); + List pos2 = new ArrayList(); + pos2.add(0); + pos2.add(1); + expectedPositionsArray.put("2", pos2); + List pos3 = new ArrayList(); + pos3.add(0); + pos3.add(4); + expectedPositionsArray.put("3", pos3); + + expectedPayloadsArray = new HashMap>(); + List pay1 = new ArrayList(); + pay1.add(2); + pay1.add(3); + expectedPayloadsArray.put("1", pay1); + List pay2 = new ArrayList(); + pay2.add(1); + pay2.add(2); + expectedPayloadsArray.put("2", pay2); + List pay3 = new ArrayList(); + pay3.add(1); + pay3.add(-1); + expectedPayloadsArray.put("3", pay3); + /* + * "a|1 b|2 b|3 c|4 d " "b|1 b|2 c|3 d|4 a " "b|1 c|2 d|3 a|4 b " + */ + expectedStartOffsetsArray = new HashMap>(); + List starts1 = new ArrayList(); + starts1.add(4); + starts1.add(8); + expectedStartOffsetsArray.put("1", starts1); + List starts2 = new ArrayList(); + starts2.add(0); + starts2.add(4); + expectedStartOffsetsArray.put("2", starts2); + List starts3 = new ArrayList(); + starts3.add(0); + starts3.add(16); + expectedStartOffsetsArray.put("3", starts3); + + expectedEndOffsetsArray = new HashMap>(); + List ends1 = new ArrayList(); + ends1.add(7); + ends1.add(11); + expectedEndOffsetsArray.put("1", ends1); + List ends2 = new ArrayList(); + ends2.add(3); + ends2.add(7); + expectedEndOffsetsArray.put("2", ends2); + List ends3 = new ArrayList(); + ends3.add(3); + ends3.add(17); + expectedEndOffsetsArray.put("3", ends3); + + XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties") + 
.startObject("int_payload_field").field("type", "string").field("index_options", "offsets") + .field("analyzer", "payload_int").endObject().endObject().endObject().endObject(); + ElasticsearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings( + ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.payload_int.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.payload_int.filter", "delimited_int") + .put("index.analysis.filter.delimited_int.delimiter", "|") + .put("index.analysis.filter.delimited_int.encoding", "int") + .put("index.analysis.filter.delimited_int.type", "delimited_payload_filter") + .put("index.number_of_replicas", 0).put("index.number_of_shards", randomIntBetween(1, 6)))); + indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("int_payload_field", "a|1 b|2 b|3 c|4 d "), client() + .prepareIndex("test", "type1", "2").setSource("int_payload_field", "b|1 b|2 c|3 d|4 a "), + client().prepareIndex("test", "type1", "3").setSource("int_payload_field", "b|1 c|2 d|3 a|4 b ")); + ensureGreen(); + + } + + @Test + public void testTwoScripts() throws Exception { + + initTestData(); + + // check term frequencies for 'a' + String scriptFieldScript = "termInfo = _shard['int_payload_field']['c']; termInfo.tf()"; + scriptFieldScript = "1"; + String scoreScript = "termInfo = _shard['int_payload_field']['b']; termInfo.tf()"; + Map expectedResultsField = new HashMap(); + expectedResultsField.put("1", 1); + expectedResultsField.put("2", 1); + expectedResultsField.put("3", 1); + Map expectedResultsScore = new HashMap(); + expectedResultsScore.put("1", 2f); + expectedResultsScore.put("2", 2f); + expectedResultsScore.put("3", 2f); + checkOnlyFunctionScore(scoreScript, expectedResultsScore, 3); + checkValueInEachDocWithFunctionScore(scriptFieldScript, expectedResultsField, scoreScript, expectedResultsScore, 3); + + } + + @Test + public void testCallWithDifferentFlagsFails() throws Exception { + + 
initTestData(); + + // should throw an exception, we cannot call with different flags twice + // if the flags of the second call were not included in the first call. + String script = "termInfo = _shard['int_payload_field']['b']; return _shard['int_payload_field'].get('b', _POSITIONS).tf();"; + try { + client().prepareSearch("test").setQuery(QueryBuilders.matchAllQuery()).addScriptField("tvtest", script).execute().actionGet(); + } catch (SearchPhaseExecutionException e) { + assertThat( + e.getDetailedMessage() + .indexOf( + "You must call get with all required flags! Instead of _shard['int_payload_field'].get('b', _FREQUENCIES) and _shard['int_payload_field'].get('b', _POSITIONS) call _shard['int_payload_field'].get('b', _FREQUENCIES | _POSITIONS) once]; "), + Matchers.greaterThan(-1)); + } + + // Should not throw an exception this way round + script = "termInfo = _shard['int_payload_field'].get('b', _POSITIONS | _FREQUENCIES);return _shard['int_payload_field']['b'].tf();"; + client().prepareSearch("test").setQuery(QueryBuilders.matchAllQuery()).addScriptField("tvtest", script).execute().actionGet(); + } + + private void checkOnlyFunctionScore(String scoreScript, Map expectedScore, int numExpectedDocs) { + SearchResponse sr = client().prepareSearch("test") + .setQuery(QueryBuilders.functionScoreQuery(ScoreFunctionBuilders.scriptFunction(scoreScript))).execute().actionGet(); + ElasticsearchAssertions.assertHitCount(sr, numExpectedDocs); + for (SearchHit hit : sr.getHits().getHits()) { + assertThat("for doc " + hit.getId(), ((Float) expectedScore.get(hit.getId())).doubleValue(), + Matchers.closeTo(hit.score(), 1.e-4)); + } + } + + @Test + public void testDocumentationExample() throws Exception { + + initTestData(); + + String script = "termInfo = _shard['float_payload_field'].get('b'," + includeAllFlag + + "); payloadSum=0; for (pos : termInfo) {payloadSum = pos.payloadAsInt(0);} return payloadSum;"; + + // non existing field: sum should be 0 + HashMap zeroArray = 
new HashMap(); + zeroArray.put("1", 0); + zeroArray.put("2", 0); + zeroArray.put("3", 0); + checkValueInEachDoc(script, zeroArray, 3); + + script = "termInfo = _shard['int_payload_field'].get('b'," + includeAllFlag + + "); payloadSum=0; for (pos : termInfo) {payloadSum = payloadSum + pos.payloadAsInt(0);} return payloadSum;"; + + // existing field: sums should be as here: + zeroArray.put("1", 5); + zeroArray.put("2", 3); + zeroArray.put("3", 1); + checkValueInEachDoc(script, zeroArray, 3); + } + + @Test + public void testIteratorAndRecording() throws Exception { + + initTestData(); + + // call twice with record: should work as expected + String script = createPositionsArrayScriptIterateTwice("b", includeAllFlag, "position"); + checkArrayValsInEachDoc(script, expectedPositionsArray, 3); + script = createPositionsArrayScriptIterateTwice("b", includeAllFlag, "startOffset"); + checkArrayValsInEachDoc(script, expectedStartOffsetsArray, 3); + script = createPositionsArrayScriptIterateTwice("b", includeAllFlag, "endOffset"); + checkArrayValsInEachDoc(script, expectedEndOffsetsArray, 3); + script = createPositionsArrayScriptIterateTwice("b", includeAllFlag, "payloadAsInt(-1)"); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 3); + + // no record and get iterator twice: should fail + script = createPositionsArrayScriptIterateTwice("b", includeAllWithoutRecordFlag, "position"); + checkExceptions(script); + script = createPositionsArrayScriptIterateTwice("b", includeAllWithoutRecordFlag, "startOffset"); + checkExceptions(script); + script = createPositionsArrayScriptIterateTwice("b", includeAllWithoutRecordFlag, "endOffset"); + checkExceptions(script); + script = createPositionsArrayScriptIterateTwice("b", includeAllWithoutRecordFlag, "payloadAsInt(-1)"); + checkExceptions(script); + + // no record and get TermInfoObject twice and iterate: should fail + script = createPositionsArrayScriptGetInfoObjectTwice("b", includeAllWithoutRecordFlag, "position"); + 
checkExceptions(script); + script = createPositionsArrayScriptGetInfoObjectTwice("b", includeAllWithoutRecordFlag, "startOffset"); + checkExceptions(script); + script = createPositionsArrayScriptGetInfoObjectTwice("b", includeAllWithoutRecordFlag, "endOffset"); + checkExceptions(script); + script = createPositionsArrayScriptGetInfoObjectTwice("b", includeAllWithoutRecordFlag, "payloadAsInt(-1)"); + checkExceptions(script); + + } + + private String createPositionsArrayScriptGetInfoObjectTwice(String term, String flags, String what) { + String script = "termInfo = _shard['int_payload_field'].get('" + term + "'," + flags + + "); array=[]; for (pos : termInfo) {array.add(pos." + what + ")} ;_shard['int_payload_field'].get('" + term + "'," + + flags + "); array=[]; for (pos : termInfo) {array.add(pos." + what + ")}"; + return script; + } + + private String createPositionsArrayScriptIterateTwice(String term, String flags, String what) { + String script = "termInfo = _shard['int_payload_field'].get('" + term + "'," + flags + + "); array=[]; for (pos : termInfo) {array.add(pos." + what + ")} array=[]; for (pos : termInfo) {array.add(pos." + what + + ")} return array;"; + return script; + } + + private String createPositionsArrayScript(String field, String term, String flags, String what) { + String script = "termInfo = _shard['" + field + "'].get('" + term + "'," + flags + + "); array=[]; for (pos : termInfo) {array.add(pos." + what + ")} return array;"; + return script; + } + + private String createPositionsArrayScriptDefaultGet(String field, String term, String what) { + String script = "termInfo = _shard['" + field + "']['" + term + "']; array=[]; for (pos : termInfo) {array.add(pos." 
+ what + + ")} return array;"; + return script; + } + + @Test + public void testFlags() throws Exception { + + initTestData(); + + // check default flag + String script = createPositionsArrayScriptDefaultGet("int_payload_field", "b", "position"); + // there should be no positions + checkArrayValsInEachDoc(script, emptyArray, 3); + script = createPositionsArrayScriptDefaultGet("int_payload_field", "b", "startOffset"); + // there should be no offsets + checkArrayValsInEachDoc(script, emptyArray, 3); + script = createPositionsArrayScriptDefaultGet("int_payload_field", "b", "endOffset"); + // there should be no offsets + checkArrayValsInEachDoc(script, emptyArray, 3); + script = createPositionsArrayScriptDefaultGet("int_payload_field", "b", "payloadAsInt(-1)"); + // there should be no payload + checkArrayValsInEachDoc(script, emptyArray, 3); + + // check FLAG_FREQUENCIES flag + script = createPositionsArrayScript("int_payload_field", "b", "_FREQUENCIES", "position"); + // there should be no positions + checkArrayValsInEachDoc(script, emptyArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_FREQUENCIES", "startOffset"); + // there should be no offsets + checkArrayValsInEachDoc(script, emptyArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_FREQUENCIES", "endOffset"); + // there should be no offsets + checkArrayValsInEachDoc(script, emptyArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_FREQUENCIES", "payloadAsInt(-1)"); + // there should be no payloads + checkArrayValsInEachDoc(script, emptyArray, 3); + + // check FLAG_POSITIONS flag + script = createPositionsArrayScript("int_payload_field", "b", "_POSITIONS", "position"); + // there should be positions + checkArrayValsInEachDoc(script, expectedPositionsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_POSITIONS", "startOffset"); + // there should be no offsets + checkArrayValsInEachDoc(script, emptyArray, 3); + 
script = createPositionsArrayScript("int_payload_field", "b", "_POSITIONS", "endOffset"); + // there should be no offsets + checkArrayValsInEachDoc(script, emptyArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_POSITIONS", "payloadAsInt(-1)"); + // there should be no payloads + checkArrayValsInEachDoc(script, emptyArray, 3); + + // check FLAG_OFFSETS flag + script = createPositionsArrayScript("int_payload_field", "b", "_OFFSETS", "position"); + // there should be positions and so forth ... + checkArrayValsInEachDoc(script, expectedPositionsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_OFFSETS", "startOffset"); + checkArrayValsInEachDoc(script, expectedStartOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_OFFSETS", "endOffset"); + checkArrayValsInEachDoc(script, expectedEndOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_OFFSETS", "payloadAsInt(-1)"); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 3); + + // check FLAG_PAYLOADS flag + script = createPositionsArrayScript("int_payload_field", "b", "_PAYLOADS", "position"); + checkArrayValsInEachDoc(script, expectedPositionsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_PAYLOADS", "startOffset"); + checkArrayValsInEachDoc(script, expectedStartOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_PAYLOADS", "endOffset"); + checkArrayValsInEachDoc(script, expectedEndOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", "_PAYLOADS", "payloadAsInt(-1)"); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 3); + + // check all flags + String allFlags = "_POSITIONS | _OFFSETS | _PAYLOADS"; + script = createPositionsArrayScript("int_payload_field", "b", allFlags, "position"); + checkArrayValsInEachDoc(script, expectedPositionsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", 
allFlags, "startOffset"); + checkArrayValsInEachDoc(script, expectedStartOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", allFlags, "endOffset"); + checkArrayValsInEachDoc(script, expectedEndOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", allFlags, "payloadAsInt(-1)"); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 3); + + // check all flags without record + script = createPositionsArrayScript("int_payload_field", "b", includeAllWithoutRecordFlag, "position"); + checkArrayValsInEachDoc(script, expectedPositionsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", includeAllWithoutRecordFlag, "startOffset"); + checkArrayValsInEachDoc(script, expectedStartOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", includeAllWithoutRecordFlag, "endOffset"); + checkArrayValsInEachDoc(script, expectedEndOffsetsArray, 3); + script = createPositionsArrayScript("int_payload_field", "b", includeAllWithoutRecordFlag, "payloadAsInt(-1)"); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 3); + + } + + private void checkArrayValsInEachDoc(String script, HashMap> expectedArray, int expectedHitSize) { + SearchResponse sr = client().prepareSearch("test").setQuery(QueryBuilders.matchAllQuery()).addScriptField("tvtest", script) + .execute().actionGet(); + ElasticsearchAssertions.assertHitCount(sr, expectedHitSize); + int nullCounter = 0; + for (SearchHit hit : sr.getHits().getHits()) { + Object result = hit.getFields().get("tvtest").getValues().get(0); + Object expectedResult = expectedArray.get(hit.getId()); + assertThat("for doc " + hit.getId(), result, equalTo(expectedResult)); + if (expectedResult != null) { + nullCounter++; + } + } + assertThat(nullCounter, equalTo(expectedArray.size())); + } + + @Test + public void testAllExceptPosAndOffset() throws Exception { + XContentBuilder mapping = 
XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties") + .startObject("float_payload_field").field("type", "string").field("index_options", "offsets").field("term_vector", "no") + .field("analyzer", "payload_float").endObject().startObject("string_payload_field").field("type", "string") + .field("index_options", "offsets").field("term_vector", "no").field("analyzer", "payload_string").endObject() + .startObject("int_payload_field").field("type", "string").field("index_options", "offsets") + .field("analyzer", "payload_int").endObject().endObject().endObject().endObject(); + ElasticsearchAssertions.assertAcked(prepareCreate("test").addMapping("type1", mapping).setSettings( + ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.payload_float.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.payload_float.filter", "delimited_float") + .put("index.analysis.filter.delimited_float.delimiter", "|") + .put("index.analysis.filter.delimited_float.encoding", "float") + .put("index.analysis.filter.delimited_float.type", "delimited_payload_filter") + .put("index.analysis.analyzer.payload_string.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.payload_string.filter", "delimited_string") + .put("index.analysis.filter.delimited_string.delimiter", "|") + .put("index.analysis.filter.delimited_string.encoding", "identity") + .put("index.analysis.filter.delimited_string.type", "delimited_payload_filter") + .put("index.analysis.analyzer.payload_int.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.payload_int.filter", "delimited_int") + .put("index.analysis.filter.delimited_int.delimiter", "|") + .put("index.analysis.filter.delimited_int.encoding", "int") + .put("index.analysis.filter.delimited_int.type", "delimited_payload_filter").put("index.number_of_replicas", 0) + .put("index.number_of_shards", 1))); + ensureYellow(); + indexRandom(true, client().prepareIndex("test", "type1", 
"1").setSource("float_payload_field", "a|1 b|2 a|3 b "), client() + .prepareIndex("test", "type1", "2").setSource("string_payload_field", "a|a b|b a|a b "), + client().prepareIndex("test", "type1", "3").setSource("float_payload_field", "a|4 b|5 a|6 b "), + client().prepareIndex("test", "type1", "4").setSource("string_payload_field", "a|b b|a a|b b "), + client().prepareIndex("test", "type1", "5").setSource("float_payload_field", "c "), + client().prepareIndex("test", "type1", "6").setSource("int_payload_field", "c|1")); + + // get the number of all docs + String script = "_shard.numDocs()"; + checkValueInEachDoc(6, script, 6); + + // get the number of docs with field float_payload_field + script = "_shard['float_payload_field'].docCount()"; + checkValueInEachDoc(3, script, 6); + + // corner case: what if the field does not exist? + script = "_shard['non_existent_field'].docCount()"; + checkValueInEachDoc(0, script, 6); + + // get the number of all tokens in all docs + script = "_shard['float_payload_field'].sumttf()"; + checkValueInEachDoc(9, script, 6); + + // corner case get the number of all tokens in all docs for non existent + // field + script = "_shard['non_existent_field'].sumttf()"; + checkValueInEachDoc(0, script, 6); + + // get the sum of doc freqs in all docs + script = "_shard['float_payload_field'].sumdf()"; + checkValueInEachDoc(5, script, 6); + + // get the sum of doc freqs in all docs for non existent field + script = "_shard['non_existent_field'].sumdf()"; + checkValueInEachDoc(0, script, 6); + + // check term frequencies for 'a' + script = "termInfo = _shard['float_payload_field']['a']; if (termInfo != null) {termInfo.tf()}"; + Map expectedResults = new HashMap(); + expectedResults.put("1", 2); + expectedResults.put("2", 0); + expectedResults.put("3", 2); + expectedResults.put("4", 0); + expectedResults.put("5", 0); + expectedResults.put("6", 0); + checkValueInEachDoc(script, expectedResults, 6); + expectedResults.clear(); + + // check doc 
frequencies for 'c' + script = "termInfo = _shard['float_payload_field']['c']; if (termInfo != null) {termInfo.df()}"; + expectedResults.put("1", 1l); + expectedResults.put("2", 1l); + expectedResults.put("3", 1l); + expectedResults.put("4", 1l); + expectedResults.put("5", 1l); + expectedResults.put("6", 1l); + checkValueInEachDoc(script, expectedResults, 6); + expectedResults.clear(); + + // check doc frequencies for term that does not exist + script = "termInfo = _shard['float_payload_field']['non_existent_term']; if (termInfo != null) {termInfo.df()}"; + expectedResults.put("1", 0l); + expectedResults.put("2", 0l); + expectedResults.put("3", 0l); + expectedResults.put("4", 0l); + expectedResults.put("5", 0l); + expectedResults.put("6", 0l); + checkValueInEachDoc(script, expectedResults, 6); + expectedResults.clear(); + + // check doc frequencies for term that does not exist + script = "termInfo = _shard['non_existent_field']['non_existent_term']; if (termInfo != null) {termInfo.tf()}"; + expectedResults.put("1", 0); + expectedResults.put("2", 0); + expectedResults.put("3", 0); + expectedResults.put("4", 0); + expectedResults.put("5", 0); + expectedResults.put("6", 0); + checkValueInEachDoc(script, expectedResults, 6); + expectedResults.clear(); + + // check total term frequencies for 'a' + script = "termInfo = _shard['float_payload_field']['a']; if (termInfo != null) {termInfo.ttf()}"; + expectedResults.put("1", 4l); + expectedResults.put("2", 4l); + expectedResults.put("3", 4l); + expectedResults.put("4", 4l); + expectedResults.put("5", 4l); + expectedResults.put("6", 4l); + checkValueInEachDoc(script, expectedResults, 6); + expectedResults.clear(); + + // check float payload for 'b' + HashMap> expectedPayloadsArray = new HashMap>(); + script = createPositionsArrayScript("float_payload_field", "b", includeAllFlag, "payloadAsFloat(-1)"); + float missingValue = -1; + List payloadsFor1 = new ArrayList(); + payloadsFor1.add(2f); + payloadsFor1.add(missingValue); + 
expectedPayloadsArray.put("1", payloadsFor1); + List payloadsFor2 = new ArrayList(); + payloadsFor2.add(5f); + payloadsFor2.add(missingValue); + expectedPayloadsArray.put("3", payloadsFor2); + expectedPayloadsArray.put("6", new ArrayList()); + expectedPayloadsArray.put("5", new ArrayList()); + expectedPayloadsArray.put("4", new ArrayList()); + expectedPayloadsArray.put("2", new ArrayList()); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 6); + + // check string payload for 'b' + expectedPayloadsArray.clear(); + payloadsFor1.clear(); + payloadsFor2.clear(); + script = createPositionsArrayScript("string_payload_field", "b", includeAllFlag, "payloadAsString()"); + payloadsFor1.add("b"); + payloadsFor1.add(null); + expectedPayloadsArray.put("2", payloadsFor1); + payloadsFor2.add("a"); + payloadsFor2.add(null); + expectedPayloadsArray.put("4", payloadsFor2); + expectedPayloadsArray.put("6", new ArrayList()); + expectedPayloadsArray.put("5", new ArrayList()); + expectedPayloadsArray.put("3", new ArrayList()); + expectedPayloadsArray.put("1", new ArrayList()); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 6); + + // check int payload for 'c' + expectedPayloadsArray.clear(); + payloadsFor1.clear(); + payloadsFor2.clear(); + script = createPositionsArrayScript("int_payload_field", "c", includeAllFlag, "payloadAsInt(-1)"); + payloadsFor1 = new ArrayList(); + payloadsFor1.add(1); + expectedPayloadsArray.put("6", payloadsFor1); + expectedPayloadsArray.put("5", new ArrayList()); + expectedPayloadsArray.put("4", new ArrayList()); + expectedPayloadsArray.put("3", new ArrayList()); + expectedPayloadsArray.put("2", new ArrayList()); + expectedPayloadsArray.put("1", new ArrayList()); + checkArrayValsInEachDoc(script, expectedPayloadsArray, 6); + + } + + private void checkExceptions(String script) { + try { + SearchResponse sr = client().prepareSearch("test").setQuery(QueryBuilders.matchAllQuery()).addScriptField("tvtest", script) + .execute().actionGet(); + 
assertThat(sr.getHits().hits().length, equalTo(0)); + ShardSearchFailure[] shardFails = sr.getShardFailures(); + for (ShardSearchFailure fail : shardFails) { + assertThat(fail.reason().indexOf("Cannot iterate twice! If you want to iterate more that once, add _CACHE explicitely."), + Matchers.greaterThan(-1)); + } + } catch (SearchPhaseExecutionException ex) { + + assertThat( + ex.getDetailedMessage().indexOf("Cannot iterate twice! If you want to iterate more that once, add _CACHE explicitely."), + Matchers.greaterThan(-1)); + } + } + + private void checkValueInEachDocWithFunctionScore(String fieldScript, Map expectedFieldVals, String scoreScript, + Map expectedScore, int numExpectedDocs) { + SearchResponse sr = client().prepareSearch("test") + .setQuery(QueryBuilders.functionScoreQuery(ScoreFunctionBuilders.scriptFunction(scoreScript))) + .addScriptField("tvtest", fieldScript).execute().actionGet(); + ElasticsearchAssertions.assertHitCount(sr, numExpectedDocs); + for (SearchHit hit : sr.getHits().getHits()) { + Object result = hit.getFields().get("tvtest").getValues().get(0); + Object expectedResult = expectedFieldVals.get(hit.getId()); + assertThat("for doc " + hit.getId(), result, equalTo(expectedResult)); + assertThat("for doc " + hit.getId(), ((Float) expectedScore.get(hit.getId())).doubleValue(), + Matchers.closeTo(hit.score(), 1.e-4)); + } + } + + private void checkValueInEachDoc(String script, Map expectedResults, int numExpectedDocs) { + SearchResponse sr = client().prepareSearch("test").setQuery(QueryBuilders.matchAllQuery()).addScriptField("tvtest", script) + .execute().actionGet(); + ElasticsearchAssertions.assertHitCount(sr, numExpectedDocs); + for (SearchHit hit : sr.getHits().getHits()) { + Object result = hit.getFields().get("tvtest").getValues().get(0); + Object expectedResult = expectedResults.get(hit.getId()); + assertThat("for doc " + hit.getId(), result, equalTo(expectedResult)); + } + } + + private void checkValueInEachDoc(int value, String 
script, int numExpectedDocs) { + SearchResponse sr = client().prepareSearch("test").setQuery(QueryBuilders.matchAllQuery()).addScriptField("tvtest", script) + .execute().actionGet(); + ElasticsearchAssertions.assertHitCount(sr, numExpectedDocs); + for (SearchHit hit : sr.getHits().getHits()) { + Object result = hit.getFields().get("tvtest").getValues().get(0); + if (result instanceof Integer) { + assertThat(((Integer) result).intValue(), equalTo(value)); + } else if (result instanceof Long) { + assertThat(((Long) result).intValue(), equalTo(value)); + } else { + assert false; + } + } + } +}