Skip to content

Commit

Permalink
Use non analyzed token stream optimization everywhere
Browse files Browse the repository at this point in the history
In the string type, we have an optimization to reuse the StringTokenStream on a thread local when a non-analyzed field is used (instead of creating it each time). We should use this across the board in all places where we create a field with a String.
Also, move to a specific XStringField, so that we can reuse the StringTokenStream instead of copying it.
closes #6001
  • Loading branch information
kimchy committed Apr 30, 2014
1 parent 12f43fb commit 23f200b
Show file tree
Hide file tree
Showing 15 changed files with 94 additions and 103 deletions.
3 changes: 3 additions & 0 deletions core-signatures.txt
Expand Up @@ -51,3 +51,6 @@ java.lang.Math#abs(long)

@defaultMessage Use Long.compare instead we are on Java7
com.google.common.primitives.Longs#compare(long,long)

@defaultMessage we have an optimized XStringField to reduce analysis creation overhead
org.apache.lucene.document.Field#<init>(java.lang.String,java.lang.String,org.apache.lucene.document.FieldType)
62 changes: 62 additions & 0 deletions src/main/java/org/apache/lucene/document/XStringField.java
@@ -0,0 +1,62 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.lucene.document;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.CloseableThreadLocal;

import java.io.IOException;

/**
 * A string/text field which, for indexed but not-analyzed values, hands out a
 * reusable per-thread token stream instead of building a fresh analysis chain
 * (and its attribute objects) each time. This cuts both analysis overhead and
 * object creation, which is significant given how heavy Attributes are.
 * <p/>
 * Not to be confused with Lucene's StringField: this class also handles
 * analyzed text, with all behavior driven by the supplied {@link FieldType}.
 */
public class XStringField extends Field {

    // One reusable stream per thread; only handed out for indexed,
    // not-tokenized values (see tokenStream below).
    private static final CloseableThreadLocal<StringTokenStream> CACHED_STREAM = new CloseableThreadLocal<StringTokenStream>() {
        @Override
        protected StringTokenStream initialValue() {
            return new StringTokenStream();
        }
    };

    /**
     * Creates a new field.
     *
     * @param name      the field name
     * @param value     the string value to index and/or store
     * @param fieldType controls indexing, tokenization and storage behavior
     */
    public XStringField(String name, String value, FieldType fieldType) {
        super(name, fieldType);
        fieldsData = value;
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
        FieldType type = fieldType();
        if (!type.indexed()) {
            return null;
        }
        if (!type.tokenized()) {
            // indexed + not-tokenized: reuse the cached per-thread stream
            StringTokenStream stream = CACHED_STREAM.get();
            stream.setValue((String) fieldsData);
            return stream;
        }
        // analyzed values go through the regular analysis chain
        return super.tokenStream(analyzer);
    }
}
Expand Up @@ -21,6 +21,7 @@

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.queries.TermFilter;
import org.apache.lucene.search.Filter;
Expand Down Expand Up @@ -221,7 +222,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
if (value == null) {
return;
}
fields.add(new Field(names.indexName(), value ? "T" : "F", fieldType));
fields.add(new XStringField(names.indexName(), value ? "T" : "F", fieldType));
}

@Override
Expand Down
Expand Up @@ -25,6 +25,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
Expand Down Expand Up @@ -387,18 +388,11 @@ public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload
surfaceForm, weight, payload);
}

private static final class SuggestField extends Field {
private static final class SuggestField extends XStringField {
private final BytesRef payload;
private final CompletionTokenStream.ToFiniteStrings toFiniteStrings;
private final ContextMapping.Context ctx;

public SuggestField(String name, ContextMapping.Context ctx, Reader value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
super(name, value, type);
this.payload = payload;
this.toFiniteStrings = toFiniteStrings;
this.ctx = ctx;
}

public SuggestField(String name, ContextMapping.Context ctx, String value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
super(name, value, type);
this.payload = payload;
Expand Down
Expand Up @@ -26,6 +26,7 @@
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -286,7 +287,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
}

if (fieldType.indexed() || fieldType.stored()) {
Field field = new StringField(names.indexName(), valueAndBoost.value(), fieldType);
Field field = new XStringField(names.indexName(), valueAndBoost.value(), fieldType);
field.setBoost(valueAndBoost.boost());
fields.add(field);
}
Expand Down Expand Up @@ -385,86 +386,6 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults,
}
}

/** Extension of {@link Field} that reuses a cached per-thread TokenStream for not-tokenized values. */
static class StringField extends Field {

    /**
     * @param name      field name
     * @param value     string value (stored into {@code fieldsData})
     * @param fieldType indexing/storage options
     */
    public StringField(String name, String value, FieldType fieldType) {
        super(name, fieldType);
        fieldsData = value;
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer) throws IOException {
        FieldType type = fieldType();
        if (!type.indexed()) {
            return null;
        }
        if (type.tokenized()) {
            // analyzed values go through the normal analysis chain
            return super.tokenStream(analyzer);
        }
        // indexed + not-tokenized: reuse the per-thread cached stream
        StringTokenStream cached = NOT_ANALYZED_TOKENSTREAM.get();
        return cached.setValue((String) fieldsData);
    }
}

// Per-thread reusable token stream for not-analyzed values; avoids rebuilding
// the attribute chain on every field instance. Used only by StringField above.
private static final ThreadLocal<StringTokenStream> NOT_ANALYZED_TOKENSTREAM = new ThreadLocal<StringTokenStream>() {
    @Override
    protected StringTokenStream initialValue() {
        return new StringTokenStream();
    }
};


// Copied from Field.java
// Single-token stream: emits the whole string value as one token.
// Expected consumer lifecycle: setValue -> reset -> incrementToken -> end -> close.
static final class StringTokenStream extends TokenStream {
    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    // guards against emitting the single token more than once per reset
    private boolean used = false;
    private String value = null;

    /**
     * Creates a new TokenStream that returns a String as single token.
     * <p>Warning: Does not initialize the value, you must call
     * {@link #setValue(String)} afterwards!
     */
    StringTokenStream() {
    }

    /** Sets the string value. */
    StringTokenStream setValue(String value) {
        this.value = value;
        return this;
    }

    @Override
    public boolean incrementToken() {
        if (used) {
            // only one token per stream; subsequent calls signal exhaustion
            return false;
        }
        clearAttributes();
        termAttribute.append(value);
        offsetAttribute.setOffset(0, value.length());
        used = true;
        return true;
    }

    @Override
    public void end() {
        // final offset is the full value length (single-token stream)
        // NOTE(review): does not call super.end(); mirrors the upstream
        // Field.java copy — verify against the Lucene version in use.
        final int finalOffset = value.length();
        offsetAttribute.setOffset(finalOffset, finalOffset);
        value = null;
    }

    @Override
    public void reset() {
        // re-arm so the (next) value can be emitted again
        used = false;
    }

    @Override
    public void close() {
        // drop the value reference; instance lives in a thread local
        value = null;
    }
}

/**
* Parsed value and boost to be returned from {@link #parseCreateFieldForString}.
*/
Expand Down
Expand Up @@ -24,6 +24,7 @@
import com.google.common.base.Objects;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -570,7 +571,7 @@ private void parse(ParseContext context, GeoPoint point, String geohash) throws
}

if (fieldType.indexed() || fieldType.stored()) {
Field field = new Field(names.indexName(), Double.toString(point.lat()) + ',' + Double.toString(point.lon()), fieldType);
Field field = new XStringField(names.indexName(), Double.toString(point.lat()) + ',' + Double.toString(point.lon()), fieldType);
context.doc().add(field);
}
if (enableGeoHash) {
Expand Down
Expand Up @@ -23,6 +23,7 @@
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsFilter;
Expand Down Expand Up @@ -313,7 +314,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
} // else we are in the pre/post parse phase

if (fieldType.indexed() || fieldType.stored()) {
fields.add(new Field(names.indexName(), context.id(), fieldType));
fields.add(new XStringField(names.indexName(), context.id(), fieldType));
}
if (hasDocValues()) {
fields.add(new BinaryDocValuesField(names.indexName(), new BytesRef(context.id())));
Expand Down
Expand Up @@ -22,6 +22,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
Expand Down Expand Up @@ -185,7 +186,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
if (!enabledState.enabled) {
return;
}
fields.add(new Field(names.indexName(), context.index(), fieldType));
fields.add(new XStringField(names.indexName(), context.index(), fieldType));
}

@Override
Expand Down
Expand Up @@ -20,6 +20,7 @@

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermFilter;
Expand Down Expand Up @@ -188,7 +189,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
// we are in the parsing of _parent phase
String parentId = context.parser().text();
context.sourceToParse().parent(parentId);
fields.add(new Field(names.indexName(), Uid.createUid(context.stringBuilder(), type, parentId), fieldType));
fields.add(new XStringField(names.indexName(), Uid.createUid(context.stringBuilder(), type, parentId), fieldType));
} else {
// otherwise, we are running it post processing of the xcontent
String parsedParentId = context.doc().get(Defaults.NAME);
Expand All @@ -199,7 +200,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
throw new MapperParsingException("No parent id provided, not within the document, and not externally");
}
// we did not add it in the parsing phase, add it now
fields.add(new Field(names.indexName(), Uid.createUid(context.stringBuilder(), type, parentId), fieldType));
fields.add(new XStringField(names.indexName(), Uid.createUid(context.stringBuilder(), type, parentId), fieldType));
} else if (parentId != null && !parsedParentId.equals(Uid.createUid(context.stringBuilder(), type, parentId))) {
throw new MapperParsingException("Parent id mismatch, document value is [" + Uid.createUid(parsedParentId).id() + "], while external value is [" + parentId + "]");
}
Expand Down
Expand Up @@ -22,6 +22,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
Expand Down Expand Up @@ -226,7 +227,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
context.ignoredValue(names.indexName(), routing);
return;
}
fields.add(new Field(names.indexName(), routing, fieldType));
fields.add(new XStringField(names.indexName(), routing, fieldType));
}
}
}
Expand Down
Expand Up @@ -22,6 +22,7 @@
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermFilter;
Expand Down Expand Up @@ -181,7 +182,7 @@ protected void parseCreateField(ParseContext context, List<Field> fields) throws
if (!fieldType.indexed() && !fieldType.stored()) {
return;
}
fields.add(new Field(names.indexName(), context.type(), fieldType));
fields.add(new XStringField(names.indexName(), context.type(), fieldType));
if (hasDocValues()) {
fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(context.type())));
}
Expand Down
Expand Up @@ -22,6 +22,7 @@
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
Expand Down Expand Up @@ -153,7 +154,7 @@ public void postParse(ParseContext context) throws IOException {
// we need to go over the docs and add it...
for (int i = 1; i < context.docs().size(); i++) {
final Document doc = context.docs().get(i);
doc.add(new Field(UidFieldMapper.NAME, uidField.stringValue(), Defaults.NESTED_FIELD_TYPE));
doc.add(new XStringField(UidFieldMapper.NAME, uidField.stringValue(), Defaults.NESTED_FIELD_TYPE));
}
}
}
Expand All @@ -175,7 +176,7 @@ public boolean includeInObject() {

@Override
protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
Field uid = new Field(NAME, Uid.createUid(context.stringBuilder(), context.type(), context.id()), Defaults.FIELD_TYPE);
Field uid = new XStringField(NAME, Uid.createUid(context.stringBuilder(), context.type(), context.id()), Defaults.FIELD_TYPE);
context.uid(uid);
fields.add(uid);
if (hasDocValues()) {
Expand Down
Expand Up @@ -21,6 +21,7 @@

import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermFilter;
Expand Down Expand Up @@ -432,12 +433,12 @@ public void parse(ParseContext context) throws IOException {
// we also rely on this for UidField#loadVersion

// this is a deeply nested field
nestedDoc.add(new Field(UidFieldMapper.NAME, uidField.stringValue(), UidFieldMapper.Defaults.NESTED_FIELD_TYPE));
nestedDoc.add(new XStringField(UidFieldMapper.NAME, uidField.stringValue(), UidFieldMapper.Defaults.NESTED_FIELD_TYPE));
}
// the type of the nested doc starts with __, so we can identify that its a nested one in filters
// note, we don't prefix it with the type of the doc since it allows us to execute a nested query
// across types (for example, with similar nested objects)
nestedDoc.add(new Field(TypeFieldMapper.NAME, nestedTypePathAsString, TypeFieldMapper.Defaults.FIELD_TYPE));
nestedDoc.add(new XStringField(TypeFieldMapper.NAME, nestedTypePathAsString, TypeFieldMapper.Defaults.FIELD_TYPE));
restoreDoc = context.switchDoc(nestedDoc);
context.addDoc(nestedDoc);
}
Expand Down
Expand Up @@ -27,6 +27,7 @@
import org.apache.lucene.analysis.ngram.NGramTokenizerFactory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.XStringField;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.highlight.Encoder;
Expand Down Expand Up @@ -66,7 +67,7 @@ protected Field[] getFields(IndexReader reader, int docId, String fieldName) thr
List<Object> values = lookup.source().extractRawValues(mapper.names().sourcePath());
Field[] fields = new Field[values.size()];
for (int i = 0; i < values.size(); i++) {
fields[i] = new Field(mapper.names().indexName(), values.get(i).toString(), TextField.TYPE_NOT_STORED);
fields[i] = new XStringField(mapper.names().indexName(), values.get(i).toString(), TextField.TYPE_NOT_STORED);
}
return fields;
}
Expand Down

0 comments on commit 23f200b

Please sign in to comment.