Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Lucene's getFiniteStrings to not consume Java stack #5927

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -912,7 +912,7 @@ public final Set<IntsRef> toFiniteStrings(final TokenStreamToAutomaton ts2a, Tok
// TODO: we could walk & add simultaneously, so we
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
return SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
return XSpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
}

final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
Expand Down
Expand Up @@ -219,7 +219,7 @@ public TokenStreamToAutomaton getTokenStreamToAutomaton() {
}

Automaton toLevenshteinAutomata(Automaton automaton) {
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
final Set<IntsRef> ref = XSpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
int upto = 0;
for (IntsRef path : ref) {
Expand Down
@@ -0,0 +1,200 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.lucene.search.suggest.analyzing;

import java.util.Collections;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Set;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.State;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;
import org.elasticsearch.Version;

class XSpecialOperations {

// TODO Lucene 4.9: remove this once we upgrade; see
// LUCENE-5628

static {
assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48: "Remove this code once we upgrade to Lucene 4.9 where LUCENE-5628 is fixed";
}

private static class PathNode {

/** Which state the path node ends on, whose
* transitions we are enumerating. */
public State state;

/** Which state the current transition leads to. */
public State to;

/** Which transition we are on. */
public int transition;

/** Which label we are on, in the min-max range of the
* current Transition */
public int label;

public void resetState(State state) {
assert state.numTransitions() != 0;
this.state = state;
transition = 0;
Transition t = state.transitionsArray[transition];
label = t.getMin();
to = t.getDest();
}

/** Returns next label of current transition, or
* advances to next transition and returns its first
* label, if current one is exhausted. If there are
* no more transitions, returns -1. */
public int nextLabel() {
if (label > state.transitionsArray[transition].getMax()) {
// We've exhaused the current transition's labels;
// move to next transitions:
transition++;
if (transition >= state.numTransitions()) {
// We're done iterating transitions leaving this state
return -1;
}
Transition t = state.transitionsArray[transition];
label = t.getMin();
to = t.getDest();
}
return label++;
}
}

private static PathNode getNode(PathNode[] nodes, int index) {
assert index < nodes.length;
if (nodes[index] == null) {
nodes[index] = new PathNode();
}
return nodes[index];
}

// TODO: this is a dangerous method ... Automaton could be
// huge ... and it's better in general for caller to
// enumerate & process in a single walk:

/** Returns the set of accepted strings, up to at most
* <code>limit</code> strings. If more than <code>limit</code>
* strings are accepted, the first limit strings found are returned. If <code>limit</code> == -1, then
* the limit is infinite. If the {@link Automaton} has
* cycles then this method might throw {@code
* IllegalArgumentException} but that is not guaranteed
* when the limit is set. */
public static Set<IntsRef> getFiniteStrings(Automaton a, int limit) {
Set<IntsRef> results = new HashSet<>();

if (limit == -1 || limit > 0) {
// OK
} else {
throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit);
}

if (a.getSingleton() != null) {
// Easy case: automaton accepts only 1 string
results.add(Util.toUTF32(a.getSingleton(), new IntsRef()));
} else {

if (a.getInitialState().isAccept()) {
// Special case the empty string, as usual:
results.add(new IntsRef());
}

if (a.getInitialState().numTransitions() > 0 && (limit == -1 || results.size() < limit)) {

// TODO: we could use state numbers here and just
// alloc array, but asking for states array can be
// costly (it's lazily computed):

// Tracks which states are in the current path, for
// cycle detection:
Set<State> pathStates = Collections.newSetFromMap(new IdentityHashMap<State,Boolean>());

// Stack to hold our current state in the
// recursion/iteration:
PathNode[] nodes = new PathNode[4];

pathStates.add(a.getInitialState());
PathNode root = getNode(nodes, 0);
root.resetState(a.getInitialState());

IntsRef string = new IntsRef(1);
string.length = 1;

while (string.length > 0) {

PathNode node = nodes[string.length-1];

// Get next label leaving the current node:
int label = node.nextLabel();

if (label != -1) {
string.ints[string.length-1] = label;

if (node.to.isAccept()) {
// This transition leads to an accept state,
// so we save the current string:
results.add(IntsRef.deepCopyOf(string));
if (results.size() == limit) {
break;
}
}

if (node.to.numTransitions() != 0) {
// Now recurse: the destination of this transition has
// outgoing transitions:
if (pathStates.contains(node.to)) {
throw new IllegalArgumentException("automaton has cycles");
}
pathStates.add(node.to);

// Push node onto stack:
if (nodes.length == string.length) {
PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(nodes, 0, newNodes, 0, nodes.length);
nodes = newNodes;
}
getNode(nodes, string.length).resetState(node.to);
string.length++;
string.grow(string.length);
}
} else {
// No more transitions leaving this state,
// pop/return back to previous state:
assert pathStates.contains(node.state);
pathStates.remove(node.state);
string.length--;
}
}
}
}

return results;
}
}
Expand Up @@ -126,13 +126,6 @@ public void testDuellCompletions() throws IOException, NoSuchFieldException, Sec
final boolean usePayloads = getRandom().nextBoolean();
final int options = preserveSeparators ? AnalyzingSuggester.PRESERVE_SEP : 0;

// NOTE: remove once we fix getFiniteStrings to not
// recurse; this is just a stopgap to mute the test:
// This test fails on Java8, I think because that
// version allocates less stack in the Jenkins envs
// where we run tests
assumeFalse(Constants.JRE_IS_MINIMUM_JAVA8);

XAnalyzingSuggester reference = new XAnalyzingSuggester(new StandardAnalyzer(TEST_VERSION_CURRENT), null, new StandardAnalyzer(
TEST_VERSION_CURRENT), options, 256, -1, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
LineFileDocs docs = new LineFileDocs(getRandom());
Expand Down