From ae111368c3b5631db4aaea1cdbf8d8cb418da2f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Tue, 30 Sep 2025 16:46:22 +0200 Subject: [PATCH 1/7] More clean logical selection --- welearn_datastack/modules/qdrant_handler.py | 44 ++++++++++++--------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py index d16cbf3..c238a17 100644 --- a/welearn_datastack/modules/qdrant_handler.py +++ b/welearn_datastack/modules/qdrant_handler.py @@ -31,30 +31,36 @@ def classify_documents_per_collection( """ tmp_collections_names_in_qdrant = qdrant_connector.get_collections().collections collections_names_in_qdrant = [c.name for c in tmp_collections_names_in_qdrant] - model_name_collection_name = {} - for x in collections_names_in_qdrant: - parts = x.split("_") - if len(parts) >= 4: - model_name_collection_name[parts[3]] = x - else: - logger.warning( - "Collection name '%s' does not follow the expected format", x - ) - ret: Dict[str, Set[UUID]] = defaultdict(set) + ret: Dict[str, Set[UUID]] = {} for dslice in slices: - model_name = dslice.embedding_model.title - try: - collection_name = model_name_collection_name[model_name] - ret[collection_name].add(dslice.document_id) # type: ignore - except KeyError: - logger.warning( - "No collection found for model %s, document %s", - model_name, - dslice.document_id, + lang = dslice.document.lang + model = dslice.embedding_model.title + collection_name = None + # Check multilingual + for cn in collections_names_in_qdrant: + multilingual_collection = f"collection_welearn_mul_{model}" + if cn == multilingual_collection: + collection_name = multilingual_collection + + # Check monolingual + for cn in collections_names_in_qdrant: + mono_collection = f"collection_welearn_{lang}_{model}" + if cn == mono_collection: + collection_name = mono_collection + + if not collection_name: + logger.error( + "Collection %s not found in Qdrant, slice %s ignored", + collection_name, + dslice.id, ) continue + if collection_name not in ret: + ret[collection_name] = set() + ret[collection_name].add(dslice.document_id) # type: ignore + return ret From 64042e1a8dbd38ce9ca8d15223c8420afda32df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Tue, 30 Sep 2025 16:46:27 +0200 Subject: [PATCH 2/7] add log --- .../nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py index 7066a1b..1c6a709 100644 --- a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py +++ b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py @@ -114,6 +114,7 @@ def main() -> None: # Iterate on each collection for collection_name in documents_per_collection: + logger.info(f"We working on collection : {collection_name}") # We need to delete all points related to the documents in the collection for avoiding duplicates del_res = delete_points_related_to_document( collection_name=collection_name, From 71a66c28fe7af5e7124fa01f87e70ead885aa0d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Tue, 30 Sep 2025 17:09:31 +0200 Subject: [PATCH 3/7] fix and test --- .../qdrant_syncronizer/test_qdrant_handler.py | 25 +++++++++++++++++++ welearn_datastack/modules/qdrant_handler.py | 5 +--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tests/qdrant_syncronizer/test_qdrant_handler.py b/tests/qdrant_syncronizer/test_qdrant_handler.py index 2cb347f..824fa74 100644 --- a/tests/qdrant_syncronizer/test_qdrant_handler.py +++ b/tests/qdrant_syncronizer/test_qdrant_handler.py @@ -134,3 +134,28 @@ def test_should_handle_multiple_slices_for_same_collection_with_multi_lingual_co "collection_welearn_mul_mulembmodel": {doc_id1}, } self.assertDictEqual(dict(collections_names), expected) + + def test_should_handle_multiple_slices_for_same_collection_with_multi_lingual_collection_and_gibberish( + self, + ): + self.client.create_collection( + collection_name="collection_welearn_mul_mulembmodel_og", + vectors_config=models.VectorParams( + size=50, distance=models.Distance.COSINE + ), + ) + + doc_id0 = uuid.uuid4() + doc_id1 = uuid.uuid4() + qdrant_connector = self.client + fake_slice0 = FakeSlice(doc_id0, embedding_model_name="english-embmodel") + fake_slice1 = FakeSlice(doc_id0, embedding_model_name="english-embmodel") + + fake_slice1.order_sequence = 1 + + fake_slice2 = FakeSlice(doc_id1, embedding_model_name="mulembmodel") + fake_slice2.document.lang = "pt" + + slices = [fake_slice0, fake_slice1, fake_slice2] + collections_names = classify_documents_per_collection(qdrant_connector, slices) + self.assertNotIn("collection_welearn_mul_mulembmodel_og", collections_names) diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py index c238a17..a6a89ec 100644 --- a/welearn_datastack/modules/qdrant_handler.py +++ b/welearn_datastack/modules/qdrant_handler.py @@ -9,9 +9,7 @@ from qdrant_client.http.models import models from welearn_datastack.data.db_models import DocumentSlice -from welearn_datastack.exceptions import ( - ErrorWhileDeletingChunks, -) +from welearn_datastack.exceptions import ErrorWhileDeletingChunks logger = logging.getLogger(__name__) @@ -79,7 +77,6 @@ def delete_points_related_to_document( """ logger.info("Deletion started") logger.debug(f"Deleting points related to {documents_ids} in {collection_name}") - op_res = None try: op_res = qdrant_connector.delete( From de68cff5de1d4c6096b56aa2def7393d81b9fa8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com> Date: Tue, 30 Sep 2025 17:13:56 +0200 Subject: [PATCH 4/7] Update welearn_datastack/modules/qdrant_handler.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- welearn_datastack/modules/qdrant_handler.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py index a6a89ec..05944db 100644 --- a/welearn_datastack/modules/qdrant_handler.py +++ b/welearn_datastack/modules/qdrant_handler.py @@ -35,22 +35,23 @@ def classify_documents_per_collection( lang = dslice.document.lang model = dslice.embedding_model.title collection_name = None + multilingual_collection = f"collection_welearn_mul_{model}" + mono_collection = f"collection_welearn_{lang}_{model}" # Check multilingual for cn in collections_names_in_qdrant: - multilingual_collection = f"collection_welearn_mul_{model}" if cn == multilingual_collection: collection_name = multilingual_collection # Check monolingual for cn in collections_names_in_qdrant: - mono_collection = f"collection_welearn_{lang}_{model}" if cn == mono_collection: collection_name = mono_collection if not collection_name: logger.error( - "Collection %s not found in Qdrant, slice %s ignored", - collection_name, + "Collections %s or %s not found in Qdrant, slice %s ignored", + multilingual_collection, + mono_collection, dslice.id, ) continue From 0adb22224b54efa4a7dfd38d78b4a4405b03d846 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Wed, 1 Oct 2025 10:39:12 +0200 Subject: [PATCH 5/7] I regroup the two loops in one https://github.com/CyberCRI/welearn-datastack/pull/66#discussion_r2392006653 --- welearn_datastack/modules/qdrant_handler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py index a6a89ec..e9de764 100644 --- a/welearn_datastack/modules/qdrant_handler.py +++ b/welearn_datastack/modules/qdrant_handler.py @@ -1,5 +1,4 @@ import logging -from collections import defaultdict from typing import Collection, Dict, List, Set, Type from uuid import UUID @@ -35,17 +34,18 @@ def classify_documents_per_collection( lang = dslice.document.lang model = dslice.embedding_model.title collection_name = None - # Check multilingual + + # Check multilingual or mono lingual for cn in collections_names_in_qdrant: multilingual_collection = f"collection_welearn_mul_{model}" + mono_collection = f"collection_welearn_{lang}_{model}" + if cn == multilingual_collection: collection_name = multilingual_collection - - # Check monolingual - for cn in collections_names_in_qdrant: - mono_collection = f"collection_welearn_{lang}_{model}" + break if cn == mono_collection: collection_name = mono_collection + break if not collection_name: logger.error( From 99e06122b68ba1aef2b4c6c7f71e9719aa610899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com> Date: Wed, 1 Oct 2025 11:07:22 +0200 Subject: [PATCH 6/7] Update welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py Co-authored-by: Sandra Guerreiro --- .../nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py index 1c6a709..8a8e6ea 100644 --- a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py +++ b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py @@ -114,7 +114,7 @@ def main() -> None: # Iterate on each collection for collection_name in documents_per_collection: - logger.info(f"We working on collection : {collection_name}") + logger.info(f"We are working on collection : {collection_name}") # We need to delete all points related to the documents in the collection for avoiding duplicates del_res = delete_points_related_to_document( collection_name=collection_name, From 82792efe4146bb0b16a80876b56742abd33a5415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Wed, 1 Oct 2025 11:07:32 +0200 Subject: [PATCH 7/7] removed from the loop --- welearn_datastack/modules/qdrant_handler.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py index 66af763..272e349 100644 --- a/welearn_datastack/modules/qdrant_handler.py +++ b/welearn_datastack/modules/qdrant_handler.py @@ -34,20 +34,15 @@ def classify_documents_per_collection( lang = dslice.document.lang model = dslice.embedding_model.title collection_name = None + multilingual_collection = f"collection_welearn_mul_{model}" + mono_collection = f"collection_welearn_{lang}_{model}" # Check multilingual or mono lingual - for cn in collections_names_in_qdrant: - multilingual_collection = f"collection_welearn_mul_{model}" - mono_collection = f"collection_welearn_{lang}_{model}" - - if cn == multilingual_collection: - collection_name = multilingual_collection - break - if cn == mono_collection: - collection_name = mono_collection - break - - if not collection_name: + if multilingual_collection in collections_names_in_qdrant: + collection_name = multilingual_collection + elif mono_collection in collections_names_in_qdrant: + collection_name = mono_collection + else: logger.error( f"Collection {collection_name} not found in Qdrant, slice {dslice.id} ignored", )