Skip to content

Commit

Permalink
Tie-break suggestions from phrase suggester by term
Browse files Browse the repository at this point in the history
If the score for two suggestions is the same, we now tie break by term; earlier terms (aaa) sort before later terms (zzz).

Closes #5978
  • Loading branch information
mikemccand committed May 18, 2014
1 parent f79b283 commit 4f7792e
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*/
package org.elasticsearch.search.suggest.phrase;
import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
Expand All @@ -42,7 +43,7 @@ public Correction[] findBestCandiates(CandidateSet[] sets, float errorFraction,
PriorityQueue<Correction> corrections = new PriorityQueue<Correction>(maxNumCorrections) {
@Override
protected boolean lessThan(Correction a, Correction b) {
return a.score < b.score;
return a.compareTo(b) < 0;
}
};
int numMissspellings = 1;
Expand Down Expand Up @@ -98,7 +99,7 @@ private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueu
Candidate[] c = new Candidate[candidates.length];
System.arraycopy(path, 0, c, 0, path.length);
corrections.add(new Correction(score, c));
} else if (corrections.top().score < score) {
} else if (corrections.top().compareTo(score, path) < 0) {
Correction top = corrections.top();
System.arraycopy(path, 0, top.candidates, 0, path.length);
top.score = score;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;

import java.util.Arrays;

//TODO public for tests
public final class Correction {
public final class Correction implements Comparable<Correction> {

public static final Correction[] EMPTY = new Correction[0];
public double score;
Expand Down Expand Up @@ -73,4 +74,28 @@ public BytesRef join(BytesRef separator, BytesRef result, BytesRef preTag, Bytes
result.grow(len);
return SuggestUtils.joinPreAllocated(separator, result, toJoin);
}
}

/**
 * Orders corrections from weakest to strongest by delegating to
 * {@link #compareTo(double, Candidate[])} with the other correction's
 * score and candidates.
 * Lower scores sort first; if scores are equal, then later terms (zzz)
 * sort before earlier terms (aaa).
 *
 * @param other the correction to compare against
 * @return a negative value if this correction sorts before {@code other},
 *         a positive value if it sorts after, or zero if they tie
 */
@Override
public int compareTo(Correction other) {
    return compareTo(other.score, other.candidates);
}

/**
 * Compares this correction against a raw (score, candidates) pair, so a
 * caller can test a prospective path without first wrapping it in a
 * {@code Correction}.
 * Lower scores sort first. When the scores are equal, candidates are
 * compared position by position and later terms (zzz) sort before (are
 * weaker than) earlier terms (aaa); if every shared position ties, the
 * shorter candidate array sorts first.
 *
 * @param otherScore the score to compare this correction's score against
 * @param otherCandidates the candidate path to tie-break against
 * @return a negative value if this correction sorts first, a positive value
 *         if it sorts after, or zero if score and candidates all tie
 */
int compareTo(double otherScore, Candidate[] otherCandidates) {
    if (score != otherScore) {
        return Double.compare(score, otherScore);
    }

    final int limit = Math.min(candidates.length, otherCandidates.length);
    for (int i = 0; i < limit; i++) {
        // Operands are swapped (other vs. this) to reverse the term order:
        // later (zzz) terms sort before earlier (aaa) terms. Swapping, rather
        // than negating the result, stays correct even if a comparison ever
        // returns Integer.MIN_VALUE, and mirrors Candidate.compareTo.
        final int cmp = otherCandidates[i].term.compareTo(candidates[i].term);
        if (cmp != 0) {
            return cmp;
        }
    }

    // All shared positions tie: the shorter path sorts first.
    return Integer.compare(candidates.length, otherCandidates.length);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
Expand Down Expand Up @@ -186,11 +188,15 @@ public CandidateSet(Candidate[] candidates, Candidate originalTerm) {
}

/**
 * Merges the given candidates with the ones already held by this set,
 * dropping duplicates, and stores the result ordered by reverse natural
 * order (strongest first).
 *
 * @param candidates the candidates to merge into this set
 */
public void addCandidates(List<Candidate> candidates) {
    // Union of incoming and existing candidates; the set removes duplicates.
    final Set<Candidate> merged = new HashSet<>(candidates);
    Collections.addAll(merged, this.candidates);

    this.candidates = merged.toArray(new Candidate[merged.size()]);

    // Natural order is weakest-first, so reverse it to get strongest-first.
    Arrays.sort(this.candidates, Collections.reverseOrder());
}

public void addOneCandidate(Candidate candidate) {
Expand All @@ -202,7 +208,7 @@ public void addOneCandidate(Candidate candidate) {

}

public static class Candidate {
public static class Candidate implements Comparable<Candidate> {
public static final Candidate[] EMPTY = new Candidate[0];
public final BytesRef term;
public final double stringDistance;
Expand All @@ -220,7 +226,7 @@ public Candidate(BytesRef term, long frequency, double stringDistance, double sc

@Override
public String toString() {
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", frequency=" + frequency +
return "Candidate [term=" + term.utf8ToString() + ", stringDistance=" + stringDistance + ", score=" + score + ", frequency=" + frequency +
(userInput ? ", userInput" : "" ) + "]";
}

Expand Down Expand Up @@ -248,6 +254,17 @@ public boolean equals(Object obj) {
return false;
return true;
}

/**
 * Orders candidates from weakest to strongest: a lower score sorts first,
 * and when scores are equal the later (zzz) term sorts before the earlier
 * (aaa) term.
 */
@Override
public int compareTo(Candidate other) {
    if (score != other.score) {
        return Double.compare(score, other.score);
    }
    // Scores tie: reverse the term order so later (zzz) terms come first.
    return other.term.compareTo(term);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,9 @@ public void end() {
double inputPhraseScore = scorer.score(candidates, candidateSets);
cutoffScore = inputPhraseScore * confidence;
}
Correction[] findBestCandiates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);

return new Result(findBestCandiates, cutoffScore);
return new Result(bestCandidates, cutoffScore);
}

public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,6 @@ public void testSizePararm() throws IOException {

@Test
@Nightly
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elasticsearch/elasticsearch/pull/5962")
public void testPhraseBoundaryCases() throws ElasticsearchException, IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(settingsBuilder()
.put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1) // to get reliable statistics we should put this all into one shard
Expand Down Expand Up @@ -751,10 +750,17 @@ public void testPhraseBoundaryCases() throws ElasticsearchException, IOException
phraseSuggestion.field("ngram").analyzer("myDefAnalyzer")
.addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
Suggest suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");

// "xorr the god jewel" and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie-break by
// the earlier term (xorn):
assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");

phraseSuggestion.analyzer(null);
suggest = searchSuggest( "Xor the Got-Jewel", phraseSuggestion);

// In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
// probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
// others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ protected TokenStreamComponents createComponents(String fieldName, Reader reader
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("gorr the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));


corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
Expand Down

0 comments on commit 4f7792e

Please sign in to comment.