Tie-break suggestions from phrase suggester by term

If the score for two suggestions is the same, we now tie break by term; earlier terms (aaa) sort before later terms (zzz). Closes #5978
elastic · May 18, 2014 · 4f7792e · 4f7792e
1 parent f79b283
commit 4f7792e
Show file tree

Hide file tree

Showing 6 changed files with 60 additions and 11 deletions.
diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java b/src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java
@@ -18,6 +18,7 @@
  */
 package org.elasticsearch.search.suggest.phrase;
 import java.io.IOException;
+import java.util.Arrays;
 
 import org.apache.lucene.util.PriorityQueue;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
@@ -42,7 +43,7 @@ public Correction[] findBestCandiates(CandidateSet[] sets, float errorFraction,
         PriorityQueue<Correction> corrections = new PriorityQueue<Correction>(maxNumCorrections) {
             @Override
             protected boolean lessThan(Correction a, Correction b) {
-                return a.score < b.score;
+                return a.compareTo(b) < 0;
             }
         };
         int numMissspellings = 1;
@@ -98,7 +99,7 @@ private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueu
                 Candidate[] c = new Candidate[candidates.length];
                 System.arraycopy(path, 0, c, 0, path.length);
                 corrections.add(new Correction(score, c));
-            } else if (corrections.top().score < score) {
+            } else if (corrections.top().compareTo(score, path) < 0) {
                 Correction top = corrections.top();
                 System.arraycopy(path, 0, top.candidates, 0, path.length);
                 top.score = score;

diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java b/src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java
@@ -23,8 +23,9 @@
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.util.Arrays;
+
 //TODO public for tests
-public final class Correction {
+public final class Correction implements Comparable<Correction> {
 
     public static final Correction[] EMPTY = new Correction[0];
     public double score;
@@ -73,4 +74,28 @@ public BytesRef join(BytesRef separator, BytesRef result, BytesRef preTag, Bytes
         result.grow(len);
         return SuggestUtils.joinPreAllocated(separator, result, toJoin);
     }
-}
+
+    /** Lower scores sorts first; if scores are equal,
+     *  than later terms (zzz) sort first .*/
+    @Override
+    public int compareTo(Correction other) {
+        return compareTo(other.score, other.candidates);
+    }
+
+    int compareTo(double otherScore, Candidate[] otherCandidates) {
+        if (score == otherScore) {
+            int limit = Math.min(candidates.length, otherCandidates.length);
+            for (int i=0;i<limit;i++) {
+                int cmp = candidates[i].term.compareTo(otherCandidates[i].term);
+                if (cmp != 0) {
+                    // Later (zzz) terms sort before (are weaker than) earlier (aaa) terms:
+                    return -cmp;
+                }
+            }
+
+            return candidates.length - otherCandidates.length;
+        } else {
+            return Double.compare(score, otherScore);
+        }
+    }
+}
diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java b/src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java
@@ -30,6 +30,8 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -186,11 +188,15 @@ public CandidateSet(Candidate[] candidates, Candidate originalTerm) {
         }
 
         public void addCandidates(List<Candidate> candidates) {
+            // Merge new candidates into existing ones,
+            // deduping:
             final Set<Candidate> set = new HashSet<>(candidates);
             for (int i = 0; i < this.candidates.length; i++) {
                 set.add(this.candidates[i]);
             }
             this.candidates = set.toArray(new Candidate[set.size()]);
+            // Sort strongest to weakest:
+            Arrays.sort(this.candidates, Collections.reverseOrder());
         }
 
         public void addOneCandidate(Candidate candidate) {
@@ -202,7 +208,7 @@ public void addOneCandidate(Candidate candidate) {
 
     }
 
-    public static class Candidate {
+    public static class Candidate implements Comparable<Candidate> {
         public static final Candidate[] EMPTY = new Candidate[0];
         public final BytesRef term;
         public final double stringDistance;
@@ -220,7 +226,7 @@ public Candidate(BytesRef term, long frequency, double stringDistance, double sc
 
         @Override
         public String toString() {
-            return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", frequency=" + frequency + 
+            return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score + ", frequency=" + frequency + 
                     (userInput ? ", userInput" : "" ) + "]";
         }
 
@@ -248,6 +254,17 @@ public boolean equals(Object obj) {
                 return false;
             return true;
         }
+
+        /** Lower scores sort first; if scores are equal, then later (zzz) terms sort first */
+        @Override
+        public int compareTo(Candidate other) {
+            if (score == other.score) {
+                // Later (zzz) terms sort before earlier (aaa) terms:
+                return other.term.compareTo(term);
+            } else {
+                return Double.compare(score, other.score);
+            }
+        }
     }
 
     @Override

diff --git a/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java b/src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java
@@ -126,9 +126,9 @@ public void end() {
             double inputPhraseScore = scorer.score(candidates, candidateSets);
             cutoffScore = inputPhraseScore * confidence;
         }
-        Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
+        Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
 
-        return new Result(findBestCandiates, cutoffScore);
+        return new Result(bestCandidates, cutoffScore);
     }
 
     public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,

diff --git a/src/test/java/org/elasticsearch/search/suggest/SuggestSearchTests.java b/src/test/java/org/elasticsearch/search/suggest/SuggestSearchTests.java
@@ -672,7 +672,6 @@ public void testSizePararm() throws IOException {
 
     @Test
     @Nightly
-    @LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/pull/5962")
     public void testPhraseBoundaryCases() throws ElasticsearchException, IOException {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
                 .put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
@@ -751,10 +750,17 @@ public void testPhraseBoundaryCases() throws ElasticsearchException, IOException
         phraseSuggestion.field("ngram").analyzer("myDefAnalyzer")
                 .addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
         Suggest suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
-        assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
+
+        // "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
+        // earlier term (xorn):
+        assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
 
         phraseSuggestion.analyzer(null);
         suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
+
+        // In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
+        // probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
+        // others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
         assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
     }
 

diff --git a/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java b/src/test/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellCheckerTests.java
@@ -268,7 +268,7 @@ protected TokenStreamComponents createComponents(String fieldName, Reader reader
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
-        assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
+        assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
 
 
         corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;