Skip to content

Commit

Permalink
Remove lowercase_expanded_terms and locale from query-parser options. (#20208)
Browse files Browse the repository at this point in the history

Lucene 6.2 introduces the new `Analyzer.normalize` API, which allows to apply
only character-level normalization such as lowercasing or accent folding, which
is exactly what is needed to process queries that operate on partial terms such
as `prefix`, `wildcard` or `fuzzy` queries. As a consequence, the
`lowercase_expanded_terms` option is not necessary anymore. Furthermore, the
`locale` option was only needed in order to know how to perform the lowercasing,
so this one can be removed as well.

Closes #9978
  • Loading branch information
jpountz committed Nov 2, 2016
1 parent 638353c commit 52de064
Show file tree
Hide file tree
Showing 29 changed files with 193 additions and 470 deletions.
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
Expand All @@ -34,6 +35,7 @@
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.common.lucene.search.Queries;
Expand All @@ -42,6 +44,7 @@
import org.elasticsearch.index.mapper.LegacyDateFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.support.QueryParsers;

Expand All @@ -63,7 +66,7 @@
* Also breaks fields with [type].[name] into a boolean query that must include the type
* as well as the query on the name.
*/
public class MapperQueryParser extends QueryParser {
public class MapperQueryParser extends AnalyzingQueryParser {

public static final Map<String, FieldQueryExtension> FIELD_QUERY_EXTENSIONS;

Expand Down Expand Up @@ -99,11 +102,10 @@ public void reset(QueryParserSettings settings) {
setAutoGeneratePhraseQueries(settings.autoGeneratePhraseQueries());
setMaxDeterminizedStates(settings.maxDeterminizedStates());
setAllowLeadingWildcard(settings.allowLeadingWildcard());
setLowercaseExpandedTerms(settings.lowercaseExpandedTerms());
setLowercaseExpandedTerms(false);
setPhraseSlop(settings.phraseSlop());
setDefaultOperator(settings.defaultOperator());
setFuzzyPrefixLength(settings.fuzzyPrefixLength());
setLocale(settings.locale());
setSplitOnWhitespace(settings.splitOnWhitespace());
}

Expand Down Expand Up @@ -330,21 +332,20 @@ private Query getRangeQuerySingle(String field, String part1, String part2,
boolean startInclusive, boolean endInclusive, QueryShardContext context) {
currentFieldType = context.fieldMapper(field);
if (currentFieldType != null) {
if (lowercaseExpandedTerms && currentFieldType.tokenized()) {
part1 = part1 == null ? null : part1.toLowerCase(locale);
part2 = part2 == null ? null : part2.toLowerCase(locale);
}

try {
BytesRef part1Binary = part1 == null ? null : getAnalyzer().normalize(field, part1);
BytesRef part2Binary = part2 == null ? null : getAnalyzer().normalize(field, part2);
Query rangeQuery;
if (currentFieldType instanceof LegacyDateFieldMapper.DateFieldType && settings.timeZone() != null) {
LegacyDateFieldMapper.DateFieldType dateFieldType = (LegacyDateFieldMapper.DateFieldType) this.currentFieldType;
rangeQuery = dateFieldType.rangeQuery(part1, part2, startInclusive, endInclusive, settings.timeZone(), null, context);
rangeQuery = dateFieldType.rangeQuery(part1Binary, part2Binary,
startInclusive, endInclusive, settings.timeZone(), null, context);
} else if (currentFieldType instanceof DateFieldMapper.DateFieldType && settings.timeZone() != null) {
DateFieldMapper.DateFieldType dateFieldType = (DateFieldMapper.DateFieldType) this.currentFieldType;
rangeQuery = dateFieldType.rangeQuery(part1, part2, startInclusive, endInclusive, settings.timeZone(), null, context);
rangeQuery = dateFieldType.rangeQuery(part1Binary, part2Binary,
startInclusive, endInclusive, settings.timeZone(), null, context);
} else {
rangeQuery = currentFieldType.rangeQuery(part1, part2, startInclusive, endInclusive, context);
rangeQuery = currentFieldType.rangeQuery(part1Binary, part2Binary, startInclusive, endInclusive, context);
}
return rangeQuery;
} catch (RuntimeException e) {
Expand All @@ -358,9 +359,6 @@ private Query getRangeQuerySingle(String field, String part1, String part2,
}

protected Query getFuzzyQuery(String field, String termStr, String minSimilarity) throws ParseException {
if (lowercaseExpandedTerms) {
termStr = termStr.toLowerCase(locale);
}
Collection<String> fields = extractMultiFields(field);
if (fields != null) {
if (fields.size() == 1) {
Expand Down Expand Up @@ -399,8 +397,9 @@ private Query getFuzzyQuerySingle(String field, String termStr, String minSimila
currentFieldType = context.fieldMapper(field);
if (currentFieldType != null) {
try {
return currentFieldType.fuzzyQuery(termStr, Fuzziness.build(minSimilarity),
fuzzyPrefixLength, settings.fuzzyMaxExpansions(), FuzzyQuery.defaultTranspositions);
BytesRef term = termStr == null ? null : getAnalyzer().normalize(field, termStr);
return currentFieldType.fuzzyQuery(term, Fuzziness.build(minSimilarity),
getFuzzyPrefixLength(), settings.fuzzyMaxExpansions(), FuzzyQuery.defaultTranspositions);
} catch (RuntimeException e) {
if (settings.lenient()) {
return null;
Expand All @@ -423,9 +422,6 @@ protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLeng

@Override
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
if (lowercaseExpandedTerms) {
termStr = termStr.toLowerCase(locale);
}
Collection<String> fields = extractMultiFields(field);
if (fields != null) {
if (fields.size() == 1) {
Expand Down Expand Up @@ -471,8 +467,8 @@ private Query getPrefixQuerySingle(String field, String termStr) throws ParseExc
setAnalyzer(context.getSearchAnalyzer(currentFieldType));
}
Query query = null;
if (currentFieldType.tokenized() == false) {
query = currentFieldType.prefixQuery(termStr, multiTermRewriteMethod, context);
if (currentFieldType instanceof StringFieldType == false) {
query = currentFieldType.prefixQuery(termStr, getMultiTermRewriteMethod(), context);
}
if (query == null) {
query = getPossiblyAnalyzedPrefixQuery(currentFieldType.name(), termStr);
Expand Down Expand Up @@ -590,9 +586,6 @@ protected Query getWildcardQuery(String field, String termStr) throws ParseExcep
return FIELD_QUERY_EXTENSIONS.get(ExistsFieldQueryExtension.NAME).query(context, actualField);
}
}
if (lowercaseExpandedTerms) {
termStr = termStr.toLowerCase(locale);
}
Collection<String> fields = extractMultiFields(field);
if (fields != null) {
if (fields.size() == 1) {
Expand Down Expand Up @@ -639,9 +632,8 @@ private Query getWildcardQuerySingle(String field, String termStr) throws ParseE
setAnalyzer(context.getSearchAnalyzer(currentFieldType));
}
indexedNameField = currentFieldType.name();
return getPossiblyAnalyzedWildcardQuery(indexedNameField, termStr);
}
return getPossiblyAnalyzedWildcardQuery(indexedNameField, termStr);
return super.getWildcardQuery(indexedNameField, termStr);
} catch (RuntimeException e) {
if (settings.lenient()) {
return null;
Expand All @@ -652,75 +644,8 @@ private Query getWildcardQuerySingle(String field, String termStr) throws ParseE
}
}

/**
 * Builds a wildcard query for {@code termStr}, optionally running the non-wildcard
 * portions of the pattern through the field's analyzer.
 *
 * <p>When {@code settings.analyzeWildcard()} is disabled this simply delegates to
 * {@link QueryParser#getWildcardQuery}. Otherwise the pattern is split on the
 * wildcard characters {@code ?} and {@code *}: each plain-text segment is analyzed
 * and replaced by its first emitted token (falling back to the raw text when the
 * analyzer produces nothing or fails), while the wildcard characters themselves are
 * passed through verbatim.
 *
 * @param field   the field the query targets
 * @param termStr the raw wildcard pattern as typed by the user
 * @return the wildcard query over the (possibly analyzed) pattern
 * @throws ParseException if the underlying parser rejects the resulting pattern
 */
private Query getPossiblyAnalyzedWildcardQuery(String field, String termStr) throws ParseException {
    if (!settings.analyzeWildcard()) {
        return super.getWildcardQuery(field, termStr);
    }
    // A leading wildcard means we start outside of a token.
    boolean isWithinToken = (!termStr.startsWith("?") && !termStr.startsWith("*"));
    StringBuilder aggStr = new StringBuilder();
    StringBuilder tmp = new StringBuilder();
    for (int i = 0; i < termStr.length(); i++) {
        char c = termStr.charAt(i);
        if (c == '?' || c == '*') {
            if (isWithinToken) {
                // Flush the text segment accumulated so far through the analyzer.
                appendAnalyzedToken(field, tmp, aggStr);
                tmp.setLength(0);
            }
            isWithinToken = false;
            aggStr.append(c);
        } else {
            tmp.append(c);
            isWithinToken = true;
        }
    }
    if (isWithinToken) {
        // Flush the trailing segment (pattern did not end with a wildcard).
        appendAnalyzedToken(field, tmp, aggStr);
    }

    return super.getWildcardQuery(field, aggStr.toString());
}

/**
 * Analyzes {@code tmp} with the current analyzer and appends the first token it
 * produces to {@code aggStr}. If analysis yields no token, an empty token, or
 * throws an {@link IOException}, the raw text of {@code tmp} is appended instead
 * so the original pattern is never silently dropped.
 *
 * @param field  the field whose analyzer chain is used
 * @param tmp    the plain-text segment to analyze (not modified)
 * @param aggStr the builder the analyzed (or raw) text is appended to
 */
private void appendAnalyzedToken(String field, StringBuilder tmp, StringBuilder aggStr) {
    try (TokenStream source = getAnalyzer().tokenStream(field, tmp.toString())) {
        source.reset();
        CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
        if (source.incrementToken()) {
            String term = termAtt.toString();
            if (term.length() == 0) {
                // no tokens, just use what we have now
                aggStr.append(tmp);
            } else {
                aggStr.append(term);
            }
        } else {
            // no tokens, just use what we have now
            aggStr.append(tmp);
        }
    } catch (IOException e) {
        // best-effort: on analysis failure fall back to the unanalyzed text
        aggStr.append(tmp);
    }
}

@Override
protected Query getRegexpQuery(String field, String termStr) throws ParseException {
if (lowercaseExpandedTerms) {
termStr = termStr.toLowerCase(locale);
}
Collection<String> fields = extractMultiFields(field);
if (fields != null) {
if (fields.size() == 1) {
Expand Down Expand Up @@ -768,7 +693,7 @@ private Query getRegexpQuerySingle(String field, String termStr) throws ParseExc
Query query = null;
if (currentFieldType.tokenized() == false) {
query = currentFieldType.regexpQuery(termStr, RegExp.ALL,
maxDeterminizedStates, multiTermRewriteMethod, context);
getMaxDeterminizedStates(), getMultiTermRewriteMethod(), context);
}
if (query == null) {
query = super.getRegexpQuery(field, termStr);
Expand Down
Expand Up @@ -24,7 +24,6 @@
import org.elasticsearch.common.unit.Fuzziness;
import org.joda.time.DateTimeZone;

import java.util.Locale;
import java.util.Map;

/**
Expand Down Expand Up @@ -53,12 +52,8 @@ public class QueryParserSettings {

private boolean analyzeWildcard;

private boolean lowercaseExpandedTerms;

private boolean enablePositionIncrements;

private Locale locale;

private Fuzziness fuzziness;
private int fuzzyPrefixLength;
private int fuzzyMaxExpansions;
Expand Down Expand Up @@ -137,14 +132,6 @@ public void allowLeadingWildcard(boolean allowLeadingWildcard) {
this.allowLeadingWildcard = allowLeadingWildcard;
}

public boolean lowercaseExpandedTerms() {
return lowercaseExpandedTerms;
}

public void lowercaseExpandedTerms(boolean lowercaseExpandedTerms) {
this.lowercaseExpandedTerms = lowercaseExpandedTerms;
}

public boolean enablePositionIncrements() {
return enablePositionIncrements;
}
Expand Down Expand Up @@ -269,14 +256,6 @@ public void useDisMax(boolean useDisMax) {
this.useDisMax = useDisMax;
}

public void locale(Locale locale) {
this.locale = locale;
}

public Locale locale() {
return this.locale;
}

public void timeZone(DateTimeZone timeZone) {
this.timeZone = timeZone;
}
Expand Down
Expand Up @@ -54,7 +54,6 @@
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
import java.util.stream.Collectors;

Expand Down

0 comments on commit 52de064

Please sign in to comment.