-
Notifications
You must be signed in to change notification settings - Fork 21
Enabler for proper de-duplication of query results in tiered index with compression #689
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -665,6 +665,23 @@ class TieredSVSIndex : public VecSimTieredIndex<DataType, float> { | |
return ret; | ||
} | ||
|
||
// Top-K query override for the tiered SVS index. | ||
// Forwards queryBlob, k and queryParams unchanged to topKQueryImp<true> and returns its reply. | ||
VecSimQueryReply *topKQuery(const void *queryBlob, size_t k, | ||
VecSimQueryParams *queryParams) const override { | ||
// The template argument is withSet=true. It is required to avoid duplicates in the final | ||
// result: backend vectors are quantized, so the backend may produce a different score than | ||
// the flat index for the same label. | ||
// TODO(review): set to true only if quantization is enabled (open question from review: | ||
// how to detect quantization without adding runtime overhead). | ||
return this->template topKQueryImp<true>(queryBlob, k, queryParams); | ||
} | ||
|
||
// Range query override for the tiered SVS index. | ||
// Forwards queryBlob, radius, queryParams and order unchanged to rangeQueryImp<true> and | ||
// returns its reply. | ||
VecSimQueryReply *rangeQuery(const void *queryBlob, double radius, | ||
VecSimQueryParams *queryParams, | ||
VecSimQueryReply_Order order) const override { | ||
// The template argument is withSet=true. It is required to avoid duplicates in the final | ||
// result: backend vectors are quantized, so the backend may produce a different score than | ||
// the flat index for the same label. | ||
// TODO(review): set to true only if quantization is enabled (open question from review: | ||
// how to detect quantization without adding runtime overhead). | ||
return this->template rangeQueryImp<true>(queryBlob, radius, queryParams, order); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same |
||
} | ||
|
||
size_t indexSize() const override { | ||
std::shared_lock<std::shared_mutex> flat_lock(this->flatIndexGuard); | ||
std::shared_lock<std::shared_mutex> main_lock(this->mainIndexGuard); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -386,6 +386,102 @@ TYPED_TEST(SVSTieredIndexTest, insertJobAsync) { | |
} | ||
} | ||
|
||
/** | ||
* This test verifies that a tiered index correctly returns the closest vectors when querying data | ||
* distributed across both the flat and SVS indices, specifically when duplicate labels exist in | ||
* both indices with different distances. It adds vectors with known scores, including such | ||
* duplicates, and ensures that only the closer instance is returned. The test covers both top-K and | ||
* range queries, validating result ordering by score and by ID. | ||
*/ | ||
TYPED_TEST(SVSTieredIndexTest, SearchDifferentScores) { | ||
size_t dim = 4; | ||
size_t constexpr k = 3; | ||
|
||
// Create a TieredSVS index instance backed by a mock thread pool / job queue. | ||
// k is passed as the third argument of CreateTieredSVSIndex (its meaning is defined by that | ||
// helper, which is outside this test). | ||
SVSParams params = { | ||
.type = TypeParam::get_index_type(), | ||
.dim = dim, | ||
.metric = VecSimMetric_L2, | ||
}; | ||
VecSimParams svs_params = CreateParams(params); | ||
auto mock_thread_pool = tieredIndexMock(); | ||
auto *tiered_index = this->CreateTieredSVSIndex(svs_params, mock_thread_pool, k); | ||
ASSERT_INDEX(tiered_index); | ||
|
||
// Grab both tiers so vectors can be inserted into each one directly. | ||
auto svs_index = tiered_index->GetBackendIndex(); | ||
auto flat_index = tiered_index->GetFlatIndex(); | ||
|
||
// Define IDs and distance values for test vectors. | ||
// ids are intentionally not in ascending order to verify that sorting by id works; | ||
// res_values are ascending, so expected_results_by_score below is already ordered by score. | ||
size_t constexpr ids[k] = {54, 4, 15}; | ||
double constexpr res_values[k] = {2, 3, 100}; | ||
// Define a type for our result pair | ||
using ResultPair = std::pair<size_t, double>; // (id, score) | ||
|
||
// Create a vector of expected results - these are the scores we expect | ||
// when querying with a zero vector (L2 distance = value²*dim) | ||
std::vector<ResultPair> expected_results_by_score(k); | ||
|
||
for (size_t i = 0; i < k; i++) { | ||
expected_results_by_score[i] = {ids[i], res_values[i] * res_values[i] * dim}; | ||
} | ||
|
||
// Insert duplicate vectors with same ID but different distances across the two indices. | ||
// The index should return only the closer of the two. | ||
|
||
// ID 54: closer in SVS, farther in flat — expect to return SVS version | ||
// NOTE(review): the review thread on these lines questions whether the flat index should be | ||
// prioritized in this case; confirm the intended semantics before relying on this expectation. | ||
GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[0], res_values[0]); | ||
GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[0], 4); | ||
Comment on lines
+432
to
+434
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I understand, flat index should be prioritized in this case:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's assume the following:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. then the implementation is wrong. |
||
|
||
// ID 4: closer in flat, farther in SVS — expect to return flat version | ||
GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[1], 5); | ||
GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[1], res_values[1]); | ||
|
||
// ID 15: identical in both indices — distance is large, should still return one instance | ||
GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[2], res_values[2]); | ||
GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[2], res_values[2]); | ||
|
||
// Create a zero vector for querying - this makes scores directly proportional to vector values | ||
// NOTE(review): dim is a non-const size_t, so this array bound is not a constant expression | ||
// and the declaration relies on the compiler accepting variable-length arrays. | ||
TEST_DATA_T query_0[dim]; | ||
GenerateVector<TEST_DATA_T>(query_0, dim, 0); | ||
|
||
// Verify results ordered by increasing score (distance). | ||
// The callback checks strict score ordering and an exact (id, score) match per position. | ||
double prev_score = 0; // all scores are positive | ||
auto verify_by_score = [&](size_t id, double score, size_t res_index) { | ||
ASSERT_LT(prev_score, score); // prev_score < score | ||
prev_score = score; | ||
ASSERT_EQ(id, expected_results_by_score[res_index].first); | ||
ASSERT_EQ(score, expected_results_by_score[res_index].second); | ||
}; | ||
|
||
runTopKSearchTest(tiered_index, query_0, k, verify_by_score, nullptr, BY_SCORE); | ||
// Reset score tracking for range query | ||
prev_score = 0; | ||
// Use the largest score as the range to include all vectors | ||
// (the last entry is the largest because res_values is ascending). | ||
double range = expected_results_by_score.back().second; | ||
runRangeQueryTest(tiered_index, query_0, range, verify_by_score, k, BY_SCORE); | ||
|
||
// Now verify result ordering by ascending ID instead of score | ||
auto expected_results_by_id = expected_results_by_score; | ||
// Sort by id (ascending) | ||
std::sort(expected_results_by_id.begin(), expected_results_by_id.end(), | ||
[](const ResultPair &a, const ResultPair &b) { return a.first < b.first; }); | ||
|
||
// The callback checks strict id ordering and an exact (id, score) match per position. | ||
size_t prev_id = 0; // all ids are positive | ||
auto verify_by_id = [&](size_t id, double score, size_t res_index) { | ||
ASSERT_LT(prev_id, id); // prev_id < id | ||
prev_id = id; | ||
ASSERT_EQ(id, expected_results_by_id[res_index].first); | ||
ASSERT_EQ(score, expected_results_by_id[res_index].second); | ||
}; | ||
// Test top-K search with results ordered by ID | ||
runTopKSearchTest(tiered_index, query_0, k, verify_by_id, nullptr, BY_ID); | ||
// Reset ID tracking for range query | ||
prev_id = 0; | ||
// Test range query with results ordered by ID | ||
runRangeQueryTest(tiered_index, query_0, range, verify_by_id, k, BY_ID); | ||
} | ||
|
||
TYPED_TEST(SVSTieredIndexTest, KNNSearch) { | ||
size_t dim = 4; | ||
size_t k = 10; | ||
|
@@ -2413,4 +2509,4 @@ TEST(SVSTieredIndexTest, svs_not_supported) { | |
ASSERT_EQ(size2, -1); | ||
} | ||
|
||
#endif | ||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TODO: set to true only if quantization is enabled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rfsaliev Could you please guide me on how to check whether the SVS index performs quantization (e.g., LVQ) in a way that minimizes runtime overhead? Ideally, I’d like to avoid adding dynamic casts or branching logic if possible.