Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to Lucene QueryRescorer #7707

Closed
wants to merge 10 commits into from
5 changes: 5 additions & 0 deletions docs/reference/search/request/rescore.asciidoc
Expand Up @@ -19,6 +19,11 @@ alternative rescorers may be made available, for example, a pair-wise rescorer.
<<search-request-search-type,`search_type`>> is set
to `scan` or `count`.

*Note:* when exposing pagination to your users, you should not change
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI using NOTE: instead of *Note:* would make it an admonition paragraph, which is probably what you want? https://github.com/elasticsearch/docs#admonition-paragraphs

`size` nor `window_size` as you step through each page (by passing
different `from` values) since such changes can alter the top hits
causing results to confusingly shift as the user steps through pages.

==== Query rescorer

The query rescorer executes a second query only on the Top-K results
Expand Down
252 changes: 72 additions & 180 deletions src/main/java/org/elasticsearch/search/rescore/QueryRescorer.java
Expand Up @@ -19,11 +19,8 @@

package org.elasticsearch.search.rescore;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.IntroSorter;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentParser.Token;
Expand All @@ -33,6 +30,7 @@

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Set;

public final class QueryRescorer implements Rescorer {
Expand Down Expand Up @@ -106,35 +104,57 @@ public String name() {
}

@Override
public void rescore(TopDocs topDocs, SearchContext context, RescoreSearchContext rescoreContext) throws IOException {
public TopDocs rescore(TopDocs topDocs, SearchContext context, RescoreSearchContext rescoreContext) throws IOException {

assert rescoreContext != null;
if (topDocs == null || topDocs.totalHits == 0 || topDocs.scoreDocs.length == 0) {
return;
return topDocs;
}

QueryRescoreContext rescore = (QueryRescoreContext) rescoreContext;
ContextIndexSearcher searcher = context.searcher();
TopDocsFilter filter = new TopDocsFilter(topDocs, rescoreContext.window());
TopDocs rescored = searcher.search(rescore.query(), filter, rescoreContext.window());
context.queryResult().topDocs(merge(topDocs, rescored, rescore));
final QueryRescoreContext rescore = (QueryRescoreContext) rescoreContext;

org.apache.lucene.search.Rescorer rescorer = new org.apache.lucene.search.QueryRescorer(rescore.query()) {

@Override
protected float combine(float firstPassScore, boolean secondPassMatches, float secondPassScore) {
if (secondPassMatches) {
return rescore.scoreMode.combine(firstPassScore * rescore.queryWeight(), secondPassScore * rescore.rescoreQueryWeight());
}
return firstPassScore * rescore.queryWeight();
}
};

// First take top slice of incoming docs, to be rescored:
TopDocs topNFirstPass = topN(topDocs, rescoreContext.window());

// Rescore them:
TopDocs rescored = rescorer.rescore(context.searcher(), topNFirstPass, rescoreContext.window());

// Splice back to non-topN hits and resort all of them:
return combine(topDocs, rescored, (QueryRescoreContext) rescoreContext);
}

@Override
public Explanation explain(int topLevelDocId, SearchContext context, RescoreSearchContext rescoreContext,
Explanation sourceExplanation) throws IOException {
QueryRescoreContext rescore = ((QueryRescoreContext) rescoreContext);
Explanation sourceExplanation) throws IOException {
QueryRescoreContext rescore = (QueryRescoreContext) rescoreContext;
ContextIndexSearcher searcher = context.searcher();
if (sourceExplanation == null) {
// this should not happen but just in case
return new ComplexExplanation(false, 0.0f, "nothing matched");
}
// TODO: this isn't right? I.e., we are incorrectly pretending all first pass hits were rescored? If the requested docID was
// beyond the top rescoreContext.window() in the first pass hits, we don't rescore it now?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 We should try to fix it (probably in another change, it doesn't look new?)

Explanation rescoreExplain = searcher.explain(rescore.query(), topLevelDocId);
float primaryWeight = rescore.queryWeight();
ComplexExplanation prim = new ComplexExplanation(sourceExplanation.isMatch(),
sourceExplanation.getValue() * primaryWeight,
"product of:");
prim.addDetail(sourceExplanation);
prim.addDetail(new Explanation(primaryWeight, "primaryWeight"));

// NOTE: we don't use Lucene's Rescorer.explain because we want to insert our own description with which ScoreMode was used. Maybe
// we should add QueryRescorer.explainCombine to Lucene?
if (rescoreExplain != null && rescoreExplain.isMatch()) {
float secondaryWeight = rescore.rescoreQueryWeight();
ComplexExplanation sec = new ComplexExplanation(rescoreExplain.isMatch(),
Expand Down Expand Up @@ -195,6 +215,46 @@ public RescoreSearchContext parse(XContentParser parser, SearchContext context)
return rescoreContext;
}

private final static Comparator<ScoreDoc> SCORE_DOC_COMPARATOR = new Comparator<ScoreDoc>() {
@Override
public int compare(ScoreDoc o1, ScoreDoc o2) {
int cmp = Float.compare(o2.score, o1.score);
return cmp == 0 ? Integer.compare(o1.doc, o2.doc) : cmp;
}
};

/** Returns a new {@link TopDocs} with the topN from the incoming one, or the same TopDocs if the number of hits is already <=
* topN. */
private TopDocs topN(TopDocs in, int topN) {
if (in.totalHits < topN) {
assert in.scoreDocs.length == in.totalHits;
return in;
}

ScoreDoc[] subset = new ScoreDoc[topN];
System.arraycopy(in.scoreDocs, 0, subset, 0, topN);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When doing simple array copies like this, I tend to find Arrays.copyOf easier to read?


return new TopDocs(in.totalHits, subset, in.getMaxScore());
}

/** Modifies incoming TopDocs (in) by replacing the top hits with resorted's hits, and then resorting all hits. */
private TopDocs combine(TopDocs in, TopDocs resorted, QueryRescoreContext ctx) {

System.arraycopy(resorted.scoreDocs, 0, in.scoreDocs, 0, resorted.scoreDocs.length);
if (in.scoreDocs.length > resorted.scoreDocs.length) {
// These hits were not rescored (beyond the rescore window), so we treat them the same as a hit that did get rescored but did
// not match the 2nd pass query:
for(int i=resorted.scoreDocs.length;i<in.scoreDocs.length;i++) {
in.scoreDocs[i].score *= ctx.queryWeight();
}

// TODO: this is wrong, i.e. we are comparing apples and oranges at this point. It would be better if we always rescored all
// incoming first pass hits, instead of allowing recoring of just the top subset:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I agree here, isn't the bug rather that we should only sort up to resorted.scoreDocs.length (and maybe force the weight of the first-pass query to be gte 1)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To give more context, the reason why I'm thinking about this is to make sure that you always get the same scores for the same documents, no matter the page size.

For the same reason I think another bug is that rescoring should take into account the offset (it doesn't seem to)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I agree here, isn't the bug rather that we should only sort up to resorted.scoreDocs.length (and maybe force the weight of the first-pass query to be gte 1)?

It's tricky ... because if you only sort resorted.scoreDocs.length then the hits are not necessarily properly sorted by score? I mean, a hit in the non-rescored part could in theory have a higher score than the rescored part?

But you're right, pagination won't work right, if you increase size across the two requests. Maybe we should document that you cannot change size? E.g., you can only show up to N pages and you query up front for page=1 for all N pages...

For the same reason I think another bug is that rescoring should take into account the offset (it doesn't seem to)?

Hmmm... that's a great question. At what point is from/offset applied when rescoring? Hopefully it is after rescoring ran. I'll try to add a test making sure if you change "from" (keeping size constant) it's behaving properly...

Also, do we check/require that you can only run rescorer if you are sorting by score in the first place...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's tricky ... because if you only sort resorted.scoreDocs.length then the hits are not necessarily properly sorted by score? I mean, a hit in the non-rescored part could in theory have a higher score than the rescored part?

Maybe we should ensure that the original query weight is greater than 1 and that the rescore weight is greater than 0, so that rescoring can only increase scores? The fact that non-rescored hits could have higher scores than rescored hits feels wrong to me.

But you're right, pagination won't work right, if you increase size across the two requests. Maybe we should document that you cannot change size? E.g., you can only show up to N pages and you query up front for page=1 for all N pages...

I wish this is something elasticsearch could take care of by itself, so that

  • given a fixed window_size you would have exactly the same hits with the same scores and at the same position, no matter what your from and size are (this way over-requesting up-front would just be an optimization, not the normal procedure)
  • you could request pages that are beyong window_size

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the original query weight is greater than 1 and that the rescore weight is greater than 0

query_weight >= 1 is not even necessary anymore now that you changed rescoring to multiply the score of all hits with the query weight

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should ensure that the original query weight is greater than 1 and that the rescore weight is greater than 0, so that rescoring can only increase scores? The fact that non-rescored hits could have higher scores than rescored hits feels wrong to me.

I agree it's odd if rescoring can make the score worse ... but it's not just the default (ScoreMode.Total) that can do this. With ScoreMode.Min and .Multiply you could also see the score decrease. I suppose we could set the new score to be max(old score, new score) to force it to only increase, but this seems hacky/dangerous ... what if there are real use cases for allowing rescorer to penalize ...

So I don't think we should enforce rescore weight > 0, nor similar checks for the .Min/.Multiply cases, at least not on this issue ;)

I wish this is something elasticsearch could take care of by itself, so that

  • given a fixed window_size you would have exactly the same hits with the same scores and at the same position, no matter what your from and size are (this way over-requesting up-front would just be an optimization, not the normal procedure)

We have this now for "from" anyway (as long as caller does not change window_size), but if caller changes size, because of the "new score might be lower than old score" issue, results could shift ...

That said, for the "common" usage here, where rescore weight > 0, and you use Total (not Min or Mutiply), such that new score is always >= old score, then you can also vary size without changing top hits.

I'll soften the admonition in the docs to just warn about keeping window_size fixed ...

  • you could request pages that are beyong window_size

You can do this today too, just by pre-specifying a large size up front.

Arrays.sort(in.scoreDocs, SCORE_DOC_COMPARATOR);
}
return in;
}

public static class QueryRescoreContext extends RescoreSearchContext {

public QueryRescoreContext(QueryRescorer rescorer) {
Expand Down Expand Up @@ -241,174 +301,6 @@ public void setScoreMode(ScoreMode scoreMode) {

}


private TopDocs merge(TopDocs primary, TopDocs secondary, QueryRescoreContext context) {
DocIdSorter sorter = new DocIdSorter();
sorter.array = primary.scoreDocs;
sorter.sort(0, sorter.array.length);
ScoreDoc[] primaryDocs = sorter.array;
sorter.array = secondary.scoreDocs;
sorter.sort(0, sorter.array.length);
ScoreDoc[] secondaryDocs = sorter.array;
int j = 0;
float primaryWeight = context.queryWeight();
float secondaryWeight = context.rescoreQueryWeight();
ScoreMode scoreMode = context.scoreMode();
for (int i = 0; i < primaryDocs.length; i++) {
if (j < secondaryDocs.length && primaryDocs[i].doc == secondaryDocs[j].doc) {
primaryDocs[i].score = scoreMode.combine(primaryDocs[i].score * primaryWeight, secondaryDocs[j++].score * secondaryWeight);
} else {
primaryDocs[i].score *= primaryWeight;
}
}
ScoreSorter scoreSorter = new ScoreSorter();
scoreSorter.array = primaryDocs;
scoreSorter.sort(0, primaryDocs.length);
primary.setMaxScore(primaryDocs[0].score);
return primary;
}

private static final class DocIdSorter extends IntroSorter {
private ScoreDoc[] array;
private ScoreDoc pivot;

@Override
protected void swap(int i, int j) {
ScoreDoc scoreDoc = array[i];
array[i] = array[j];
array[j] = scoreDoc;
}

@Override
protected int compare(int i, int j) {
return compareDocId(array[i], array[j]);
}

@Override
protected void setPivot(int i) {
pivot = array[i];

}

@Override
protected int comparePivot(int j) {
return compareDocId(pivot, array[j]);
}

}

private static final int compareDocId(ScoreDoc left, ScoreDoc right) {
if (left.doc < right.doc) {
return 1;
} else if (left.doc == right.doc) {
return 0;
}
return -1;
}

private static final class ScoreSorter extends IntroSorter {
private ScoreDoc[] array;
private ScoreDoc pivot;

@Override
protected void swap(int i, int j) {
ScoreDoc scoreDoc = array[i];
array[i] = array[j];
array[j] = scoreDoc;
}

@Override
protected int compare(int i, int j) {
int cmp = Float.compare(array[j].score, array[i].score);
return cmp == 0 ? compareDocId(array[i], array[j]) : cmp;
}

@Override
protected void setPivot(int i) {
pivot = array[i];

}

@Override
protected int comparePivot(int j) {
int cmp = Float.compare(array[j].score, pivot.score);
return cmp == 0 ? compareDocId(pivot, array[j]) : cmp;
}

}

private static final class TopDocsFilter extends Filter {

private final int[] docIds;

public TopDocsFilter(TopDocs topDocs, int max) {
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
max = Math.min(max, scoreDocs.length);
this.docIds = new int[max];
for (int i = 0; i < max; i++) {
docIds[i] = scoreDocs[i].doc;
}
Arrays.sort(docIds);
}

@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
final int docBase = context.docBase;
int limit = docBase + context.reader().maxDoc();
int offset = Arrays.binarySearch(docIds, docBase);
if (offset < 0) {
offset = (-offset) - 1;
}
int end = Arrays.binarySearch(docIds, limit);
if (end < 0) {
end = (-end) - 1;
}
final int start = offset;
final int stop = end;

return new DocIdSet() {

@Override
public DocIdSetIterator iterator() throws IOException {
return new DocIdSetIterator() {
private int current = start;
private int docId = NO_MORE_DOCS;

@Override
public int nextDoc() throws IOException {
if (current < stop) {
return docId = docIds[current++] - docBase;
}
return docId = NO_MORE_DOCS;
}

@Override
public int docID() {
return docId;
}

@Override
public int advance(int target) throws IOException {
if (target == NO_MORE_DOCS) {
current = stop;
return docId = NO_MORE_DOCS;
}
while (nextDoc() < target) {
}
return docId;
}

@Override
public long cost() {
return docIds.length;
}
};
}
};
}

}

@Override
public void extractTerms(SearchContext context, RescoreSearchContext rescoreContext, Set<Term> termsSet) {
((QueryRescoreContext) rescoreContext).query().extractTerms(termsSet);
Expand Down
16 changes: 10 additions & 6 deletions src/main/java/org/elasticsearch/search/rescore/RescorePhase.java
Expand Up @@ -20,6 +20,8 @@
package org.elasticsearch.search.rescore;

import com.google.common.collect.ImmutableMap;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
Expand Down Expand Up @@ -54,16 +56,18 @@ public void preProcess(SearchContext context) {
@Override
public void execute(SearchContext context) throws ElasticsearchException {
try {
TopDocs topDocs = context.queryResult().topDocs();
for (RescoreSearchContext ctx : context.rescore()) {
ctx.rescorer().rescore(context.queryResult().topDocs(), context, ctx);
topDocs = ctx.rescorer().rescore(topDocs, context, ctx);
}
if (context.size() < topDocs.scoreDocs.length) {
ScoreDoc[] hits = new ScoreDoc[context.size()];
System.arraycopy(topDocs.scoreDocs, 0, hits, 0, hits.length);
topDocs = new TopDocs(topDocs.totalHits, hits, topDocs.getMaxScore());
}
context.queryResult().topDocs(topDocs);
} catch (IOException e) {
throw new ElasticsearchException("Rescore Phase Failed", e);
}
}





}
Expand Up @@ -49,7 +49,7 @@ public interface Rescorer {
* @param rescoreContext the {@link RescoreSearchContext}. This will never be <code>null</code>
* @throws IOException if an {@link IOException} occurs during rescoring
*/
public void rescore(TopDocs topDocs, SearchContext context, RescoreSearchContext rescoreContext) throws IOException;
public TopDocs rescore(TopDocs topDocs, SearchContext context, RescoreSearchContext rescoreContext) throws IOException;

/**
* Executes an {@link Explanation} phase on the rescorer.
Expand Down