Skip to content

Enabler for proper de-duplication of query results in tiered index with compression #689

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/VecSim/algorithms/svs/svs_tiered.h
Original file line number Diff line number Diff line change
Expand Up @@ -665,6 +665,23 @@ class TieredSVSIndex : public VecSimTieredIndex<DataType, float> {
return ret;
}

VecSimQueryReply *topKQuery(const void *queryBlob, size_t k,
                            VecSimQueryParams *queryParams) const override {
    // To avoid duplicates in the final result, we must use withSet=true because backend
    // vectors are quantized, and may therefore produce different scores than the flat
    // index for the same label. Deduplication by score alignment is not possible then,
    // so the merge must explicitly track seen labels.
    // TODO(review): set withSet=true only when quantization (e.g., LVQ) is actually
    // enabled, to avoid the extra bookkeeping cost for non-quantized SVS indices.
    return this->template topKQueryImp<true>(queryBlob, k, queryParams);
}

VecSimQueryReply *rangeQuery(const void *queryBlob, double radius,
                             VecSimQueryParams *queryParams,
                             VecSimQueryReply_Order order) const override {
    // To avoid duplicates in the final result, we must use withSet=true because backend
    // vectors are quantized, and may therefore produce different scores than the flat
    // index for the same label (see topKQuery above for the same rationale).
    // TODO(review): set withSet=true only when quantization is actually enabled.
    return this->template rangeQueryImp<true>(queryBlob, radius, queryParams, order);
}

size_t indexSize() const override {
std::shared_lock<std::shared_mutex> flat_lock(this->flatIndexGuard);
std::shared_lock<std::shared_mutex> main_lock(this->mainIndexGuard);
Expand Down
4 changes: 4 additions & 0 deletions src/VecSim/utils/query_result_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ std::pair<size_t, size_t> merge_results(VecSimQueryResultContainer &results,
}

// Assumes that the arrays are sorted by score firstly and by id secondarily.
// Use withSet=false if you can guarantee that shared ids between the two lists
// will also have identical scores. In this case, any duplicates will naturally align
// at the front of both lists during the merge, so they can be removed without explicitly
// tracking seen ids — enabling a more efficient merge.
template <bool withSet>
VecSimQueryReply *merge_result_lists(VecSimQueryReply *first, VecSimQueryReply *second,
size_t limit) {
Expand Down
69 changes: 50 additions & 19 deletions src/VecSim/vec_sim_tiered_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ class VecSimTieredIndex : public VecSimIndexInterface {
jobs.size());
}

// For both topK and range, use withSet=false if you can guarantee that shared ids between the
// two lists will also have identical scores. In this case, any duplicates will naturally align
// at the front of both lists during the merge, so they can be removed without explicitly
// tracking seen ids — enabling a more efficient merge.
template <bool WithSet>
VecSimQueryReply *topKQueryImp(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const;

template <bool WithSet>
VecSimQueryReply *rangeQueryImp(const void *queryBlob, double radius,
VecSimQueryParams *queryParams,
VecSimQueryReply_Order order) const;

public:
VecSimTieredIndex(VecSimIndexAbstract<DataType, DistType> *backendIndex_,
BruteForceIndex<DataType, DistType> *frontendIndex_,
Expand Down Expand Up @@ -116,10 +129,12 @@ class VecSimTieredIndex : public VecSimIndexInterface {
}
#endif
};

template <typename DataType, typename DistType>
template <bool withSet>
VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const {
VecSimTieredIndex<DataType, DistType>::topKQueryImp(const void *queryBlob, size_t k,
VecSimQueryParams *queryParams) const {
this->flatIndexGuard.lock_shared();

// If the flat buffer is empty, we can simply query the main index.
Expand Down Expand Up @@ -163,12 +178,20 @@ VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k
return main_results;
}

// Merge the results and return, avoiding duplicates.
if (this->backendIndex->isMultiValue()) {
return merge_result_lists<true>(main_results, flat_results, k);
} else {
return merge_result_lists<false>(main_results, flat_results, k);
}
return merge_result_lists<withSet>(main_results, flat_results, k);
}
}
template <typename DataType, typename DistType>
VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
                                                 VecSimQueryParams *queryParams) const {
    // Multi-value indexes may hold several vectors per label, so the merge must track
    // seen labels explicitly (withSet=true). Single-value indexes can assume that a
    // label shared by both result lists carries an identical score in each, so the
    // merge drops duplicates implicitly (withSet=false) — the cheaper path.
    return this->backendIndex->isMultiValue()
               ? this->topKQueryImp<true>(queryBlob, k, queryParams)
               : this->topKQueryImp<false>(queryBlob, k, queryParams);
}

Expand All @@ -177,6 +200,23 @@ VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double radius,
VecSimQueryParams *queryParams,
VecSimQueryReply_Order order) const {
if (this->backendIndex->isMultiValue()) {
return this->rangeQueryImp<true>(queryBlob, radius, queryParams,
order); // Multi-value index
} else {
// Calling with withSet=false for optimized performance, assuming that shared IDs across
// lists also have identical scores — in which case duplicates are implicitly avoided by the
// merge logic.
return this->rangeQueryImp<false>(queryBlob, radius, queryParams, order);
}
}

template <typename DataType, typename DistType>
template <bool withSet>
VecSimQueryReply *
VecSimTieredIndex<DataType, DistType>::rangeQueryImp(const void *queryBlob, double radius,
VecSimQueryParams *queryParams,
VecSimQueryReply_Order order) const {
this->flatIndexGuard.lock_shared();

// If the flat buffer is empty, we can simply query the main index.
Expand Down Expand Up @@ -225,24 +265,15 @@ VecSimTieredIndex<DataType, DistType>::rangeQuery(const void *queryBlob, double
auto code = main_results->code;

// Merge the sorted results with no limit (all the results are valid).
VecSimQueryReply *ret;
if (this->backendIndex->isMultiValue()) {
ret = merge_result_lists<true>(main_results, flat_results, -1);
} else {
ret = merge_result_lists<false>(main_results, flat_results, -1);
}
VecSimQueryReply *ret = merge_result_lists<withSet>(main_results, flat_results, -1);
// Restore the return code and return.
ret->code = code;
return ret;

} else { // BY_ID
// Notice that we don't modify the return code of the main index in any step.
concat_results(main_results, flat_results);
if (this->backendIndex->isMultiValue()) {
filter_results_by_id<true>(main_results);
} else {
filter_results_by_id<false>(main_results);
}
filter_results_by_id<withSet>(main_results);
return main_results;
}
}
Expand Down
98 changes: 97 additions & 1 deletion tests/unit/test_svs_tiered.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,102 @@ TYPED_TEST(SVSTieredIndexTest, insertJobAsync) {
}
}

/**
 * This test verifies that a tiered index correctly returns the closest vectors when querying data
 * distributed across both the flat and SVS indices, specifically when duplicate labels exist in
 * both indices with different distances. It adds vectors with known scores, including such
 * duplicates, and ensures that only the closer instance is returned. The test covers both top-K
 * and range queries, validating result ordering by score and by ID.
 */
TYPED_TEST(SVSTieredIndexTest, SearchDifferentScores) {
    size_t dim = 4;
    size_t constexpr k = 3;

    // Create TieredSVS index instance with a mock queue.
    SVSParams params = {
        .type = TypeParam::get_index_type(),
        .dim = dim,
        .metric = VecSimMetric_L2,
    };
    VecSimParams svs_params = CreateParams(params);
    auto mock_thread_pool = tieredIndexMock();
    auto *tiered_index = this->CreateTieredSVSIndex(svs_params, mock_thread_pool, k);
    ASSERT_INDEX(tiered_index);

    auto svs_index = tiered_index->GetBackendIndex();
    auto flat_index = tiered_index->GetFlatIndex();

    // Define IDs and distance values for the test vectors.
    // IDs are intentionally unordered to verify that sorting works correctly.
    size_t constexpr ids[k] = {54, 4, 15};
    double constexpr res_values[k] = {2, 3, 100};
    // (id, score) pair describing one expected query result.
    using ResultPair = std::pair<size_t, double>;

    // Expected results when querying with a zero vector (L2 distance = value^2 * dim).
    std::vector<ResultPair> expected_results_by_score(k);
    for (size_t i = 0; i < k; i++) {
        expected_results_by_score[i] = {ids[i], res_values[i] * res_values[i] * dim};
    }

    // Insert duplicate vectors with the same ID but different distances across the two
    // indices. The tiered index should return only the closer of the two.

    // ID 54: closer in SVS, farther in flat — expect the SVS version to be returned.
    GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[0], res_values[0]);
    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[0], 4);

    // ID 4: closer in flat, farther in SVS — expect the flat version to be returned.
    GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[1], 5);
    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[1], res_values[1]);

    // ID 15: identical in both indices — distance is large, should still return one instance.
    GenerateAndAddVector<TEST_DATA_T>(svs_index, dim, ids[2], res_values[2]);
    GenerateAndAddVector<TEST_DATA_T>(flat_index, dim, ids[2], res_values[2]);

    // A zero query vector makes scores directly proportional to the stored values.
    TEST_DATA_T query_0[dim];
    GenerateVector<TEST_DATA_T>(query_0, dim, 0);

    // Verify results are ordered by increasing score (distance).
    double prev_score = 0; // all scores are positive
    auto verify_by_score = [&](size_t id, double score, size_t res_index) {
        ASSERT_LT(prev_score, score); // prev_score < score
        prev_score = score;
        ASSERT_EQ(id, expected_results_by_score[res_index].first);
        ASSERT_EQ(score, expected_results_by_score[res_index].second);
    };

    runTopKSearchTest(tiered_index, query_0, k, verify_by_score, nullptr, BY_SCORE);
    // Reset score tracking for the range query.
    prev_score = 0;
    // Use the largest score as the range to include all vectors.
    double range = expected_results_by_score.back().second;
    runRangeQueryTest(tiered_index, query_0, range, verify_by_score, k, BY_SCORE);

    // Now verify result ordering by ascending ID instead of score.
    auto expected_results_by_id = expected_results_by_score;
    // Sort by id (ascending).
    std::sort(expected_results_by_id.begin(), expected_results_by_id.end(),
              [](const ResultPair &a, const ResultPair &b) { return a.first < b.first; });

    size_t prev_id = 0; // all ids are positive
    auto verify_by_id = [&](size_t id, double score, size_t res_index) {
        ASSERT_LT(prev_id, id); // prev_id < id
        prev_id = id;
        ASSERT_EQ(id, expected_results_by_id[res_index].first);
        ASSERT_EQ(score, expected_results_by_id[res_index].second);
    };
    // Test top-K search with results ordered by ID.
    runTopKSearchTest(tiered_index, query_0, k, verify_by_id, nullptr, BY_ID);
    // Reset ID tracking for the range query.
    prev_id = 0;
    // Test range query with results ordered by ID.
    runRangeQueryTest(tiered_index, query_0, range, verify_by_id, k, BY_ID);
}

TYPED_TEST(SVSTieredIndexTest, KNNSearch) {
size_t dim = 4;
size_t k = 10;
Expand Down Expand Up @@ -2413,4 +2509,4 @@ TEST(SVSTieredIndexTest, svs_not_supported) {
ASSERT_EQ(size2, -1);
}

#endif
#endif
Loading