New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Switch to Lucene QueryRescorer #7707
Changes from 9 commits
bfc938a
61cdac4
fd53805
9beb323
119eab0
0a08194
1bc1572
d6a39c2
8846d66
9aae1d6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,11 +19,8 @@ | |
|
||
package org.elasticsearch.search.rescore; | ||
|
||
import org.apache.lucene.index.AtomicReaderContext; | ||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.search.*; | ||
import org.apache.lucene.util.Bits; | ||
import org.apache.lucene.util.IntroSorter; | ||
import org.elasticsearch.ElasticsearchIllegalArgumentException; | ||
import org.elasticsearch.common.xcontent.XContentParser; | ||
import org.elasticsearch.common.xcontent.XContentParser.Token; | ||
|
@@ -33,6 +30,7 @@ | |
|
||
import java.io.IOException; | ||
import java.util.Arrays; | ||
import java.util.Comparator; | ||
import java.util.Set; | ||
|
||
public final class QueryRescorer implements Rescorer { | ||
|
@@ -106,35 +104,57 @@ public String name() { | |
} | ||
|
||
@Override | ||
public void rescore(TopDocs topDocs, SearchContext context, RescoreSearchContext rescoreContext) throws IOException { | ||
public TopDocs rescore(TopDocs topDocs, SearchContext context, RescoreSearchContext rescoreContext) throws IOException { | ||
|
||
assert rescoreContext != null; | ||
if (topDocs == null || topDocs.totalHits == 0 || topDocs.scoreDocs.length == 0) { | ||
return; | ||
return topDocs; | ||
} | ||
|
||
QueryRescoreContext rescore = (QueryRescoreContext) rescoreContext; | ||
ContextIndexSearcher searcher = context.searcher(); | ||
TopDocsFilter filter = new TopDocsFilter(topDocs, rescoreContext.window()); | ||
TopDocs rescored = searcher.search(rescore.query(), filter, rescoreContext.window()); | ||
context.queryResult().topDocs(merge(topDocs, rescored, rescore)); | ||
final QueryRescoreContext rescore = (QueryRescoreContext) rescoreContext; | ||
|
||
org.apache.lucene.search.Rescorer rescorer = new org.apache.lucene.search.QueryRescorer(rescore.query()) { | ||
|
||
@Override | ||
protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) { | ||
if (secondPassMatches) { | ||
return rescore.scoreMode.combine(firstPassScore * rescore.queryWeight(), secondPassScore * rescore.rescoreQueryWeight()); | ||
} | ||
return firstPassScore * rescore.queryWeight(); | ||
} | ||
}; | ||
|
||
// First take top slice of incoming docs, to be rescored: | ||
TopDocs topNFirstPass = topN(topDocs, rescoreContext.window()); | ||
|
||
// Rescore them: | ||
TopDocs rescored = rescorer.rescore(context.searcher(), topNFirstPass, rescoreContext.window()); | ||
|
||
// Splice back to non-topN hits and resort all of them: | ||
return combine(topDocs, rescored, (QueryRescoreContext) rescoreContext); | ||
} | ||
|
||
@Override | ||
public Explanation explain(int topLevelDocId, SearchContext context, RescoreSearchContext rescoreContext, | ||
Explanation sourceExplanation) throws IOException { | ||
QueryRescoreContext rescore = ((QueryRescoreContext) rescoreContext); | ||
Explanation sourceExplanation) throws IOException { | ||
QueryRescoreContext rescore = (QueryRescoreContext) rescoreContext; | ||
ContextIndexSearcher searcher = context.searcher(); | ||
if (sourceExplanation == null) { | ||
// this should not happen but just in case | ||
return new ComplexExplanation(false, 0.0f, "nothing matched"); | ||
} | ||
// TODO: this isn't right? I.e., we are incorrectly pretending all first pass hits were rescored? If the requested docID was | ||
// beyond the top rescoreContext.window() in the first pass hits, we don't rescore it now? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 We should try to fix it (probably in another change, it doesn't look new?) |
||
Explanation rescoreExplain = searcher.explain(rescore.query(), topLevelDocId); | ||
float primaryWeight = rescore.queryWeight(); | ||
ComplexExplanation prim = new ComplexExplanation(sourceExplanation.isMatch(), | ||
sourceExplanation.getValue() * primaryWeight, | ||
"product of:"); | ||
prim.addDetail(sourceExplanation); | ||
prim.addDetail(new Explanation(primaryWeight, "primaryWeight")); | ||
|
||
// NOTE: we don't use Lucene's Rescorer.explain because we want to insert our own description with which ScoreMode was used. Maybe | ||
// we should add QueryRescorer.explainCombine to Lucene? | ||
if (rescoreExplain != null && rescoreExplain.isMatch()) { | ||
float secondaryWeight = rescore.rescoreQueryWeight(); | ||
ComplexExplanation sec = new ComplexExplanation(rescoreExplain.isMatch(), | ||
|
@@ -195,6 +215,46 @@ public RescoreSearchContext parse(XContentParser parser, SearchContext context) | |
return rescoreContext; | ||
} | ||
|
||
private final static Comparator<ScoreDoc> SCORE_DOC_COMPARATOR = new Comparator<ScoreDoc>() { | ||
@Override | ||
public int compare(ScoreDoc o1, ScoreDoc o2) { | ||
int cmp = Float.compare(o2.score, o1.score); | ||
return cmp == 0 ? Integer.compare(o1.doc, o2.doc) : cmp; | ||
} | ||
}; | ||
|
||
/** Returns a new {@link TopDocs} with the topN from the incoming one, or the same TopDocs if the number of hits is already <= | ||
* topN. */ | ||
private TopDocs topN(TopDocs in, int topN) { | ||
if (in.totalHits < topN) { | ||
assert in.scoreDocs.length == in.totalHits; | ||
return in; | ||
} | ||
|
||
ScoreDoc[] subset = new ScoreDoc[topN]; | ||
System.arraycopy(in.scoreDocs, 0, subset, 0, topN); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When doing simple array copies like this, I tend to find |
||
|
||
return new TopDocs(in.totalHits, subset, in.getMaxScore()); | ||
} | ||
|
||
/** Modifies incoming TopDocs (in) by replacing the top hits with resorted's hits, and then resorting all hits. */ | ||
private TopDocs combine(TopDocs in, TopDocs resorted, QueryRescoreContext ctx) { | ||
|
||
System.arraycopy(resorted.scoreDocs, 0, in.scoreDocs, 0, resorted.scoreDocs.length); | ||
if (in.scoreDocs.length > resorted.scoreDocs.length) { | ||
// These hits were not rescored (beyond the rescore window), so we treat them the same as a hit that did get rescored but did | ||
// not match the 2nd pass query: | ||
for(int i=resorted.scoreDocs.length;i<in.scoreDocs.length;i++) { | ||
in.scoreDocs[i].score *= ctx.queryWeight(); | ||
} | ||
|
||
// TODO: this is wrong, i.e. we are comparing apples and oranges at this point. It would be better if we always rescored all | ||
// incoming first pass hits, instead of allowing recoring of just the top subset: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I agree here, isn't the bug rather that we should only sort up to resorted.scoreDocs.length (and maybe force the weight of the first-pass query to be gte 1)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To give more context, the reason why I'm thinking about this is to make sure that you always get the same scores for the same documents, no matter the page size. For the same reason I think another bug is that rescoring should take into account the offset (it doesn't seem to)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It's tricky ... because if you only sort resorted.scoreDocs.length then the hits are not necessarily properly sorted by score? I mean, a hit in the non-rescored part could in theory have a higher score than the rescored part? But you're right, pagination won't work right, if you increase size across the two requests. Maybe we should document that you cannot change size? E.g., you can only show up to N pages and you query up front for page=1 for all N pages...
Hmmm... that's a great question. At what point is from/offset applied when rescoring? Hopefully it is after rescoring ran. I'll try to add a test making sure if you change "from" (keeping size constant) it's behaving properly... Also, do we check/require that you can only run rescorer if you are sorting by score in the first place... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Maybe we should ensure that the original query weight is greater than 1 and that the rescore weight is greater than 0, so that rescoring can only increase scores? The fact that non-rescored hits could have higher scores than rescored hits feels wrong to me.
I wish this is something elasticsearch could take care of by itself, so that
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I agree it's odd if rescoring can make the score worse ... but it's not just the default (ScoreMode.Total) that can do this. With ScoreMode.Min and .Multiply you could also see the score decrease. I suppose we could set the new score to be max(old score, new score) to force it to only increase, but this seems hacky/dangerous ... what if there are real use cases for allowing rescorer to penalize ... So I don't think we should enforce rescore weight > 0, nor similar checks for the .Min/.Multiply cases, at least not on this issue ;)
We have this now for "from" anyway (as long as caller does not change window_size), but if caller changes size, because of the "new score might be lower than old score" issue, results could shift ... That said, for the "common" usage here, where rescore weight > 0, and you use Total (not Min or Mutiply), such that new score is always >= old score, then you can also vary size without changing top hits. I'll soften the admonition in the docs to just warn about keeping window_size fixed ...
You can do this today too, just by pre-specifying a large size up front. |
||
Arrays.sort(in.scoreDocs, SCORE_DOC_COMPARATOR); | ||
} | ||
return in; | ||
} | ||
|
||
public static class QueryRescoreContext extends RescoreSearchContext { | ||
|
||
public QueryRescoreContext(QueryRescorer rescorer) { | ||
|
@@ -241,174 +301,6 @@ public void setScoreMode(ScoreMode scoreMode) { | |
|
||
} | ||
|
||
|
||
private TopDocs merge(TopDocs primary, TopDocs secondary, QueryRescoreContext context) { | ||
DocIdSorter sorter = new DocIdSorter(); | ||
sorter.array = primary.scoreDocs; | ||
sorter.sort(0, sorter.array.length); | ||
ScoreDoc[] primaryDocs = sorter.array; | ||
sorter.array = secondary.scoreDocs; | ||
sorter.sort(0, sorter.array.length); | ||
ScoreDoc[] secondaryDocs = sorter.array; | ||
int j = 0; | ||
float primaryWeight = context.queryWeight(); | ||
float secondaryWeight = context.rescoreQueryWeight(); | ||
ScoreMode scoreMode = context.scoreMode(); | ||
for (int i = 0; i < primaryDocs.length; i++) { | ||
if (j < secondaryDocs.length && primaryDocs[i].doc == secondaryDocs[j].doc) { | ||
primaryDocs[i].score = scoreMode.combine(primaryDocs[i].score * primaryWeight, secondaryDocs[j++].score * secondaryWeight); | ||
} else { | ||
primaryDocs[i].score *= primaryWeight; | ||
} | ||
} | ||
ScoreSorter scoreSorter = new ScoreSorter(); | ||
scoreSorter.array = primaryDocs; | ||
scoreSorter.sort(0, primaryDocs.length); | ||
primary.setMaxScore(primaryDocs[0].score); | ||
return primary; | ||
} | ||
|
||
private static final class DocIdSorter extends IntroSorter { | ||
private ScoreDoc[] array; | ||
private ScoreDoc pivot; | ||
|
||
@Override | ||
protected void swap(int i, int j) { | ||
ScoreDoc scoreDoc = array[i]; | ||
array[i] = array[j]; | ||
array[j] = scoreDoc; | ||
} | ||
|
||
@Override | ||
protected int compare(int i, int j) { | ||
return compareDocId(array[i], array[j]); | ||
} | ||
|
||
@Override | ||
protected void setPivot(int i) { | ||
pivot = array[i]; | ||
|
||
} | ||
|
||
@Override | ||
protected int comparePivot(int j) { | ||
return compareDocId(pivot, array[j]); | ||
} | ||
|
||
} | ||
|
||
private static final int compareDocId(ScoreDoc left, ScoreDoc right) { | ||
if (left.doc < right.doc) { | ||
return 1; | ||
} else if (left.doc == right.doc) { | ||
return 0; | ||
} | ||
return -1; | ||
} | ||
|
||
private static final class ScoreSorter extends IntroSorter { | ||
private ScoreDoc[] array; | ||
private ScoreDoc pivot; | ||
|
||
@Override | ||
protected void swap(int i, int j) { | ||
ScoreDoc scoreDoc = array[i]; | ||
array[i] = array[j]; | ||
array[j] = scoreDoc; | ||
} | ||
|
||
@Override | ||
protected int compare(int i, int j) { | ||
int cmp = Float.compare(array[j].score, array[i].score); | ||
return cmp == 0 ? compareDocId(array[i], array[j]) : cmp; | ||
} | ||
|
||
@Override | ||
protected void setPivot(int i) { | ||
pivot = array[i]; | ||
|
||
} | ||
|
||
@Override | ||
protected int comparePivot(int j) { | ||
int cmp = Float.compare(array[j].score, pivot.score); | ||
return cmp == 0 ? compareDocId(pivot, array[j]) : cmp; | ||
} | ||
|
||
} | ||
|
||
private static final class TopDocsFilter extends Filter { | ||
|
||
private final int[] docIds; | ||
|
||
public TopDocsFilter(TopDocs topDocs, int max) { | ||
ScoreDoc[] scoreDocs = topDocs.scoreDocs; | ||
max = Math.min(max, scoreDocs.length); | ||
this.docIds = new int[max]; | ||
for (int i = 0; i < max; i++) { | ||
docIds[i] = scoreDocs[i].doc; | ||
} | ||
Arrays.sort(docIds); | ||
} | ||
|
||
@Override | ||
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException { | ||
final int docBase = context.docBase; | ||
int limit = docBase + context.reader().maxDoc(); | ||
int offset = Arrays.binarySearch(docIds, docBase); | ||
if (offset < 0) { | ||
offset = (-offset) - 1; | ||
} | ||
int end = Arrays.binarySearch(docIds, limit); | ||
if (end < 0) { | ||
end = (-end) - 1; | ||
} | ||
final int start = offset; | ||
final int stop = end; | ||
|
||
return new DocIdSet() { | ||
|
||
@Override | ||
public DocIdSetIterator iterator() throws IOException { | ||
return new DocIdSetIterator() { | ||
private int current = start; | ||
private int docId = NO_MORE_DOCS; | ||
|
||
@Override | ||
public int nextDoc() throws IOException { | ||
if (current < stop) { | ||
return docId = docIds[current++] - docBase; | ||
} | ||
return docId = NO_MORE_DOCS; | ||
} | ||
|
||
@Override | ||
public int docID() { | ||
return docId; | ||
} | ||
|
||
@Override | ||
public int advance(int target) throws IOException { | ||
if (target == NO_MORE_DOCS) { | ||
current = stop; | ||
return docId = NO_MORE_DOCS; | ||
} | ||
while (nextDoc() < target) { | ||
} | ||
return docId; | ||
} | ||
|
||
@Override | ||
public long cost() { | ||
return docIds.length; | ||
} | ||
}; | ||
} | ||
}; | ||
} | ||
|
||
} | ||
|
||
@Override | ||
public void extractTerms(SearchContext context, RescoreSearchContext rescoreContext, Set<Term> termsSet) { | ||
((QueryRescoreContext) rescoreContext).query().extractTerms(termsSet); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FYI using
NOTE:
instead of*Note:*
would make it an admonition paragraph, which is probably what you want? https://github.com/elasticsearch/docs#admonition-paragraphs