
Term Vectors: terms filtering
This adds a new feature to the Term Vectors API that allows terms to be filtered
based on their tf-idf scores. With the `dfs` option enabled, this can be useful
for finding a good characteristic vector of a document or a set of documents.
The parameters are similar to the ones used in the MLT Query.

Closes #9561
alexksikes committed Apr 14, 2015
1 parent 82df50a commit d339ee4
Showing 12 changed files with 781 additions and 36 deletions.
100 changes: 100 additions & 0 deletions docs/reference/docs/termvectors.asciidoc
@@ -85,6 +85,34 @@ Setting `dfs` to `true` (default is `false`) will return the term statistics
or the field statistics of the entire index, rather than just those of the current
shard. Use it with caution, as distributed frequencies can have a serious performance impact.

[float]
==== Terms Filtering coming[2.0]

With the parameter `filter`, the terms returned can also be filtered based
on their tf-idf scores. This can be useful for finding a good
characteristic vector of a document. This feature works in a similar manner to
the <<mlt-query-term-selection,second phase>> of the
<<query-dsl-mlt-query,More Like This Query>>. See <<docs-termvectors-terms-filtering,example 5>>
for usage.

The following sub-parameters are supported:

[horizontal]
`max_num_terms`::
The maximum number of terms to return per field. Defaults to `25`.
`min_term_freq`::
Ignore words with less than this frequency in the source doc. Defaults to `1`.
`max_term_freq`::
Ignore words with more than this frequency in the source doc. Defaults to unbounded.
`min_doc_freq`::
Ignore terms which do not occur in at least this many docs. Defaults to `1`.
`max_doc_freq`::
Ignore words which occur in more than this many docs. Defaults to unbounded.
`min_word_length`::
The minimum word length below which words will be ignored. Defaults to `0`.
`max_word_length`::
The maximum word length above which words will be ignored. Defaults to unbounded (`0`).
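
Taken together, these sub-parameters describe a simple selection procedure: drop
candidate terms whose frequencies or lengths fall outside the configured bounds,
score the remaining terms, and keep the `max_num_terms` highest-scoring ones. The
snippet below is only a rough sketch of that procedure in plain Java, not the
actual Elasticsearch implementation; the class names and the tf-idf formula used
here are illustrative assumptions (compare with <<docs-termvectors-terms-filtering,example 5>>).

[source,java]
--------------------------------------------------
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

// Rough sketch only: this is not the Elasticsearch implementation, and the
// scoring formula below is an illustrative assumption.
public class TermsFilterSketch {

    static class Candidate {
        final String term;
        final int termFreq;   // frequency of the term in the source document
        final long docFreq;   // number of documents in the index containing the term
        double score;

        Candidate(String term, int termFreq, long docFreq) {
            this.term = term;
            this.termFreq = termFreq;
            this.docFreq = docFreq;
        }
    }

    // A value of 0 for the max_* parameters is treated as "unbounded",
    // mirroring the defaults described above.
    static List<Candidate> select(List<Candidate> candidates, long docCount,
                                  int maxNumTerms,
                                  int minTermFreq, int maxTermFreq,
                                  int minDocFreq, int maxDocFreq,
                                  int minWordLength, int maxWordLength) {
        List<Candidate> kept = new ArrayList<>();
        for (Candidate c : candidates) {
            if (c.termFreq < minTermFreq || (maxTermFreq > 0 && c.termFreq > maxTermFreq)) continue;
            if (c.docFreq < minDocFreq || (maxDocFreq > 0 && c.docFreq > maxDocFreq)) continue;
            int length = c.term.length();
            if (length < minWordLength || (maxWordLength > 0 && length > maxWordLength)) continue;
            // classic tf-idf style score (an assumption; compare with example 5)
            c.score = Math.sqrt(c.termFreq) * (1 + Math.log((double) docCount / (c.docFreq + 1)));
            kept.add(c);
        }
        // keep the top max_num_terms by score
        kept.sort(Comparator.comparingDouble((Candidate c) -> c.score).reversed());
        return kept.subList(0, Math.min(maxNumTerms, kept.size()));
    }
}
--------------------------------------------------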

[float]
=== Behaviour

@@ -337,3 +365,75 @@
}
}
--------------------------------------------------

[float]
[[docs-termvectors-terms-filtering]]
=== Example 5

Finally, the terms returned can be filtered based on their tf-idf scores. In
the example below, we obtain the three most "interesting" keywords from an
artificial document with the given "plot" field value. Additionally, we ask
for distributed frequencies to obtain more accurate results. Notice that
neither the keyword "Tony" nor any stop words appear in the response, as
their tf-idf scores are too low.

[source,js]
--------------------------------------------------
GET /imdb/movies/_termvectors
{
"doc": {
"plot": "When wealthy industrialist Tony Stark is forced to build an armored suit after a life-threatening incident, he ultimately decides to use its technology to fight against evil."
},
"term_statistics" : true,
"field_statistics" : true,
"dfs": true,
"positions": false,
"offsets": false,
"filter" : {
"max_num_terms" : 3,
"min_term_freq" : 1,
"min_doc_freq" : 1
}
}
--------------------------------------------------

Response:

[source,js]
--------------------------------------------------
{
"_index": "imdb",
"_type": "movies",
"_version": 0,
"found": true,
"term_vectors": {
"plot": {
"field_statistics": {
"sum_doc_freq": 3384269,
"doc_count": 176214,
"sum_ttf": 3753460
},
"terms": {
"armored": {
"doc_freq": 27,
"ttf": 27,
"term_freq": 1,
"score": 9.74725
},
"industrialist": {
"doc_freq": 88,
"ttf": 88,
"term_freq": 1,
"score": 8.590818
},
"stark": {
"doc_freq": 44,
"ttf": 47,
"term_freq": 1,
"score": 9.272792
}
}
}
}
}
--------------------------------------------------
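
Assuming a classic Lucene-style tf-idf score of the form
`sqrt(term_freq) * (1 + ln(doc_count / (doc_freq + 1)))`, the value for `armored`
works out to `sqrt(1) * (1 + ln(176214 / 28)) ≈ 9.747`, which matches the score in
the response above; the exact formula, however, is an assumption made here for
illustration.
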
1 change: 1 addition & 0 deletions docs/reference/query-dsl/queries/mlt-query.asciidoc
@@ -178,6 +178,7 @@ The text to find documents like it.
A list of documents following the same syntax as the <<docs-multi-get,Multi GET API>>.

[float]
[[mlt-query-term-selection]]
==== Term Selection Parameters

[horizontal]
@@ -21,7 +21,11 @@

import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import com.carrotsearch.hppc.cursors.ObjectLongCursor;
import org.apache.lucene.index.*;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.util.*;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamInput;
@@ -40,7 +44,7 @@
* <tt>-1</tt>, if no positions were returned by the {@link TermVectorsRequest}.
* <p/>
* The data is stored in two byte arrays ({@code headerRef} and
* {@code termVectors}, both {@link ByteRef}) that have the following format:
* {@code termVectors}, both {@link BytesRef}) that have the following format:
* <p/>
* {@code headerRef}: Stores offsets per field in the {@code termVectors} array
* and some header information as {@link BytesRef}. Format is
@@ -113,6 +117,7 @@ public final class TermVectorsFields extends Fields {
private final BytesReference termVectors;
final boolean hasTermStatistic;
final boolean hasFieldStatistic;
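// whether per-term scores (from terms filtering) were serialized with the term vectors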
public final boolean hasScores;

/**
* @param headerRef Stores offsets per field in the {@code termVectors} and some
@@ -130,6 +135,7 @@ public TermVectorsFields(BytesReference headerRef, BytesReference termVectors) t
assert version == -1;
hasTermStatistic = header.readBoolean();
hasFieldStatistic = header.readBoolean();
hasScores = header.readBoolean();
final int numFields = header.readVInt();
for (int i = 0; i < numFields; i++) {
fieldMap.put((header.readString()), header.readVLong());
@@ -226,6 +232,7 @@ public TermsEnum iterator() throws IOException {
int[] endOffsets = new int[1];
BytesRefBuilder[] payloads = new BytesRefBuilder[1];
final BytesRefBuilder spare = new BytesRefBuilder();
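// per-term scores, when present, are exposed to consumers through this boost attribute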
BoostAttribute boostAtt = this.attributes().addAttribute(BoostAttribute.class);

@Override
public BytesRef next() throws IOException {
@@ -250,6 +257,11 @@ public BytesRef next() throws IOException {
// currentPosition etc. so that we can just iterate
// later
writeInfos(perFieldTermVectorInput);

// read the score if available
if (hasScores) {
boostAtt.setBoost(perFieldTermVectorInput.readFloat());
}
return spare.get();

} else {
@@ -482,5 +494,4 @@ int readPotentiallyNegativeVInt(BytesStreamInput stream) throws IOException {
long readPotentiallyNegativeVLong(BytesStreamInput stream) throws IOException {
return stream.readVLong() - 1;
}

}
}
