Skip to content

Enabler for proper de-duplication of query results in tiered index with compression #689

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/VecSim/algorithms/svs/svs_tiered.h
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,23 @@ class TieredSVSIndex : public VecSimTieredIndex<DataType, float> {
return ret;
}

VecSimQueryReply *topKQuery(const void *queryBlob, size_t k,
                            VecSimQueryParams *queryParams) const override {
    // To avoid duplicates in the final result, we must use withSet=true because backend
    // vectors are quantized, and may therefore produce different scores than the flat
    // index for the same label. Deduplication by score alignment is not possible then,
    // so the merge must explicitly track seen labels.
    // TODO(review): set withSet=true only when quantization (e.g., LVQ) is actually
    // enabled, to avoid the extra bookkeeping cost for non-quantized SVS indices.
    return this->template topKQueryImp<true>(queryBlob, k, queryParams);
}

VecSimQueryReply *rangeQuery(const void *queryBlob, double radius,
                             VecSimQueryParams *queryParams,
                             VecSimQueryReply_Order order) const override {
    // To avoid duplicates in the final result, we must use withSet=true because backend
    // vectors are quantized, and may therefore produce different scores than the flat
    // index for the same label (see topKQuery above for the same rationale).
    // TODO(review): set withSet=true only when quantization is actually enabled.
    return this->template rangeQueryImp<true>(queryBlob, radius, queryParams, order);
}

size_t indexSize() const override {
std::shared_lock<std::shared_mutex> flat_lock(this->flatIndexGuard);
std::shared_lock<std::shared_mutex> main_lock(this->mainIndexGuard);
Expand Down
4 changes: 4 additions & 0 deletions src/VecSim/utils/query_result_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ std::pair<size_t, size_t> merge_results(VecSimQueryResultContainer &results,
}

// Assumes that the arrays are sorted by score firstly and by id secondarily.
// Use withSet=false if you can guarantee that shared ids between the two lists
// will also have identical scores. In this case, any duplicates will naturally align
// at the front of both lists during the merge, so they can be removed without explicitly
// tracking seen ids — enabling a more efficient merge.
template <bool withSet>
VecSimQueryReply *merge_result_lists(VecSimQueryReply *first, VecSimQueryReply *second,
size_t limit) {
Expand Down
69 changes: 50 additions & 19 deletions src/VecSim/vec_sim_tiered_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ class VecSimTieredIndex : public VecSimIndexInterface {
jobs.size());
}

// For both topK and range, use withSet=false if you can guarantee that shared ids between the
// two lists will also have identical scores. In this case, any duplicates will naturally align
// at the front of both lists during the merge, so they can be removed without explicitly
// tracking seen ids — enabling a more efficient merge.
template <bool WithSet>
VecSimQueryReply *topKQueryImp(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const;

template <bool WithSet>
VecSimQueryReply *rangeQueryImp(const void *queryBlob, double radius,
VecSimQueryParams *queryParams,
VecSimQueryReply_Order order) const;

public:
VecSimTieredIndex(VecSimIndexAbstract<DataType, DistType> *backendIndex_,
BruteForceIndex<DataType, DistType> *frontendIndex_,
Expand Down Expand Up @@ -116,10 +129,12 @@ class VecSimTieredIndex : public VecSimIndexInterface {
}
#endif
};

template <typename DataType, typename DistType>
template <bool withSet>
VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const {
VecSimTieredIndex<DataType, DistType>::topKQueryImp(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const {
this->flatIndexGuard.lock_shared();

// If the flat buffer is empty, we can simply query the main index.
Expand Down Expand Up @@ -163,12 +178,20 @@ VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k
return main_results;
}

// Merge the results and return, avoiding duplicates.
if (this->backendIndex->isMultiValue()) {
return merge_result_lists<true>(main_results, flat_results, k);
} else {
return merge_result_lists<false>(main_results, flat_results, k);
}
return merge_result_lists<withSet>(main_results, flat_results, k);
}
}
template <typename DataType, typename DistType>
VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
                                                 VecSimQueryParams *queryParams) const {
    // Multi-value indexes may hold several vectors per label, so the merge must track
    // seen labels explicitly (withSet=true). Single-value indexes can assume that a
    // label shared by both result lists carries an identical score in each, so the
    // merge drops duplicates implicitly (withSet=false) — the cheaper path.
    return this->backendIndex->isMultiValue()
               ? this->topKQueryImp<true>(queryBlob, k, queryParams)
               : this->topKQueryImp<false>(queryBlob, k, queryParams);
}

Expand All @@ -177,6 +200,23 @@ VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double radius,
VecSimQueryParams *queryParams,
VecSimQueryReply_Order order) const {
if (this->backendIndex->isMultiValue()) {
return this->rangeQueryImp<true>(queryBlob, radius, queryParams,
order); // Multi-value index
} else {
// Calling with withSet=false for optimized performance, assuming that shared IDs across
// lists also have identical scores — in which case duplicates are implicitly avoided by the
// merge logic.
return this->rangeQueryImp<false>(queryBlob, radius, queryParams, order);
}
}

template <typename DataType, typename DistType>
template <bool withSet>
VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::rangeQueryImp(const void *queryBlob, double radius,
VecSimQueryParams *queryParams,
VecSimQueryReply_Order order) const {
this->flatIndexGuard.lock_shared();

// If the flat buffer is empty, we can simply query the main index.
Expand Down Expand Up @@ -225,24 +265,15 @@ VecSimTieredIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double
auto code = main_results->code;

// Merge the sorted results with no limit (all the results are valid).
VecSimQueryReply *ret;
if (this->backendIndex->isMultiValue()) {
ret = merge_result_lists<true>(main_results, flat_results, -1);
} else {
ret = merge_result_lists<false>(main_results, flat_results, -1);
}
VecSimQueryReply *ret = merge_result_lists<withSet>(main_results, flat_results, -1);
// Restore the return code and return.
ret->code = code;
return ret;

} else { // BY_ID
// Notice that we don't modify the return code of the main index in any step.
concat_results(main_results, flat_results);
if (this->backendIndex->isMultiValue()) {
filter_results_by_id<true>(main_results);
} else {
filter_results_by_id<false>(main_results);
}
filter_results_by_id<withSet>(main_results);
return main_results;
}
}
Expand Down
98 changes: 97 additions & 1 deletion tests/unit/test_svs_tiered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,102 @@ TYPED_TEST(SVSTieredIndexTest, insertJobAsync) {
}
}

/**
 * This test verifies that a tiered index correctly returns the closest vectors when querying data
 * distributed across both the flat and SVS indices, specifically when duplicate labels exist in
 * both indices with different distances. It adds vectors with known scores, including such
 * duplicates, and ensures that only the closer instance is returned. The test covers both top-K
 * and range queries, validating result ordering by score and by ID.
 */
TYPED_TEST(SVSTieredIndexTest, SearchDifferentScores) {
    size_t dim = 4;
    size_t constexpr k = 3;

    // Create TieredSVS index instance with a mock queue.
    SVSParams params = {
        .type = TypeParam::get_index_type(),
        .dim = dim,
        .metric = VecSimMetric_L2,
    };
    VecSimParams svs_params = CreateParams(params);
    auto mock_thread_pool = tieredIndexMock();
    auto *tiered_index = this->CreateTieredSVSIndex(svs_params, mock_thread_pool, k);
    ASSERT_INDEX(tiered_index);

    auto svs_index = tiered_index->GetBackendIndex();
    auto flat_index = tiered_index->GetFlatIndex();

    // Define IDs and distance values for the test vectors.
    // IDs are intentionally unordered to verify that sorting works correctly.
    size_t constexpr ids[k] = {54, 4, 15};
    double constexpr res_values[k] = {2, 3, 100};
    // (id, score) pair describing one expected query result.
    using ResultPair = std::pair<size_t, double>;

    // Expected results when querying with a zero vector (L2 distance = value^2 * dim).
    std::vector<ResultPair> expected_results_by_score(k);
    for (size_t i = 0; i < k; i++) {
        expected_results_by_score[i] = {ids[i], res_values[i] * res_values[i] * dim};
    }

    // Insert duplicate vectors with the same ID but different distances across the two
    // indices. The tiered index should return only the closer of the two.

    // ID 54: closer in SVS, farther in flat — expect the SVS version to be returned.
    GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[0], res_values[0]);
    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[0], 4);

    // ID 4: closer in flat, farther in SVS — expect the flat version to be returned.
    GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[1], 5);
    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[1], res_values[1]);

    // ID 15: identical in both indices — distance is large, should still return one instance.
    GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[2], res_values[2]);
    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[2], res_values[2]);

    // A zero query vector makes scores directly proportional to the stored values.
    TEST_DATA_T query_0[dim];
    GenerateVector<TEST_DATA_T>(query_0, dim, 0);

    // Verify results are ordered by increasing score (distance).
    double prev_score = 0; // all scores are positive
    auto verify_by_score = [&](size_t id, double score, size_t res_index) {
        ASSERT_LT(prev_score, score); // prev_score < score
        prev_score = score;
        ASSERT_EQ(id, expected_results_by_score[res_index].first);
        ASSERT_EQ(score, expected_results_by_score[res_index].second);
    };

    runTopKSearchTest(tiered_index, query_0, k, verify_by_score, nullptr, BY_SCORE);
    // Reset score tracking for the range query.
    prev_score = 0;
    // Use the largest score as the range to include all vectors.
    double range = expected_results_by_score.back().second;
    runRangeQueryTest(tiered_index, query_0, range, verify_by_score, k, BY_SCORE);

    // Now verify result ordering by ascending ID instead of score.
    auto expected_results_by_id = expected_results_by_score;
    // Sort by id (ascending).
    std::sort(expected_results_by_id.begin(), expected_results_by_id.end(),
              [](const ResultPair &a, const ResultPair &b) { return a.first < b.first; });

    size_t prev_id = 0; // all ids are positive
    auto verify_by_id = [&](size_t id, double score, size_t res_index) {
        ASSERT_LT(prev_id, id); // prev_id < id
        prev_id = id;
        ASSERT_EQ(id, expected_results_by_id[res_index].first);
        ASSERT_EQ(score, expected_results_by_id[res_index].second);
    };
    // Test top-K search with results ordered by ID.
    runTopKSearchTest(tiered_index, query_0, k, verify_by_id, nullptr, BY_ID);
    // Reset ID tracking for the range query.
    prev_id = 0;
    // Test range query with results ordered by ID.
    runRangeQueryTest(tiered_index, query_0, range, verify_by_id, k, BY_ID);
}

TYPED_TEST(SVSTieredIndexTest, KNNSearch) {
size_t dim = 4;
size_t k = 10;
Expand Down Expand Up @@ -2413,4 +2509,4 @@ TEST(SVSTieredIndexTest, svs_not_supported) {
ASSERT_EQ(size2, -1);
}

#endif
#endif
Loading