From ae111368c3b5631db4aaea1cdbf8d8cb418da2f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Tue, 30 Sep 2025 16:46:22 +0200
Subject: [PATCH 1/7] Cleaner collection selection logic

---
 welearn_datastack/modules/qdrant_handler.py | 44 ++++++++++++---------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
index d16cbf3..c238a17 100644
--- a/welearn_datastack/modules/qdrant_handler.py
+++ b/welearn_datastack/modules/qdrant_handler.py
@@ -31,30 +31,36 @@ def classify_documents_per_collection(
     """
     tmp_collections_names_in_qdrant = qdrant_connector.get_collections().collections
     collections_names_in_qdrant = [c.name for c in tmp_collections_names_in_qdrant]
-    model_name_collection_name = {}
-    for x in collections_names_in_qdrant:
-        parts = x.split("_")
-        if len(parts) >= 4:
-            model_name_collection_name[parts[3]] = x
-        else:
-            logger.warning(
-                "Collection name '%s' does not follow the expected format", x
-            )
 
-    ret: Dict[str, Set[UUID]] = defaultdict(set)
+    ret: Dict[str, Set[UUID]] = {}
     for dslice in slices:
-        model_name = dslice.embedding_model.title
-        try:
-            collection_name = model_name_collection_name[model_name]
-            ret[collection_name].add(dslice.document_id)  # type: ignore
-        except KeyError:
-            logger.warning(
-                "No collection found for model %s, document %s",
-                model_name,
-                dslice.document_id,
+        lang = dslice.document.lang
+        model = dslice.embedding_model.title
+        collection_name = None
+        # Check multilingual
+        for cn in collections_names_in_qdrant:
+            multilingual_collection = f"collection_welearn_mul_{model}"
+            if cn == multilingual_collection:
+                collection_name = multilingual_collection
+
+        # Check monolingual
+        for cn in collections_names_in_qdrant:
+            mono_collection = f"collection_welearn_{lang}_{model}"
+            if cn == mono_collection:
+                collection_name = mono_collection
+
+        if not collection_name:
+            logger.error(
+                "Collection %s not found in Qdrant, slice %s ignored",
+                collection_name,
+                dslice.id,
             )
             continue
 
+        if collection_name not in ret:
+            ret[collection_name] = set()
+        ret[collection_name].add(dslice.document_id)  # type: ignore
+
     return ret
 
 

From 64042e1a8dbd38ce9ca8d15223c8420afda32df8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Tue, 30 Sep 2025 16:46:27 +0200
Subject: [PATCH 2/7] add log

---
 .../nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
index 7066a1b..1c6a709 100644
--- a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
+++ b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
@@ -114,6 +114,7 @@ def main() -> None:
 
         # Iterate on each collection
         for collection_name in documents_per_collection:
+            logger.info(f"We working on collection : {collection_name}")
             # We need to delete all points related to the documents in the collection for avoiding duplicates
             del_res = delete_points_related_to_document(
                 collection_name=collection_name,

From 71a66c28fe7af5e7124fa01f87e70ead885aa0d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Tue, 30 Sep 2025 17:09:31 +0200
Subject: [PATCH 3/7] fix and test

---
 .../qdrant_syncronizer/test_qdrant_handler.py | 25 +++++++++++++++++++
 welearn_datastack/modules/qdrant_handler.py   |  5 +---
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/tests/qdrant_syncronizer/test_qdrant_handler.py b/tests/qdrant_syncronizer/test_qdrant_handler.py
index 2cb347f..824fa74 100644
--- a/tests/qdrant_syncronizer/test_qdrant_handler.py
+++ b/tests/qdrant_syncronizer/test_qdrant_handler.py
@@ -134,3 +134,28 @@ def test_should_handle_multiple_slices_for_same_collection_with_multi_lingual_co
             "collection_welearn_mul_mulembmodel": {doc_id1},
         }
         self.assertDictEqual(dict(collections_names), expected)
+
+    def test_should_handle_multiple_slices_for_same_collection_with_multi_lingual_collection_and_gibberish(
+        self,
+    ):
+        self.client.create_collection(
+            collection_name="collection_welearn_mul_mulembmodel_og",
+            vectors_config=models.VectorParams(
+                size=50, distance=models.Distance.COSINE
+            ),
+        )
+
+        doc_id0 = uuid.uuid4()
+        doc_id1 = uuid.uuid4()
+        qdrant_connector = self.client
+        fake_slice0 = FakeSlice(doc_id0, embedding_model_name="english-embmodel")
+        fake_slice1 = FakeSlice(doc_id0, embedding_model_name="english-embmodel")
+
+        fake_slice1.order_sequence = 1
+
+        fake_slice2 = FakeSlice(doc_id1, embedding_model_name="mulembmodel")
+        fake_slice2.document.lang = "pt"
+
+        slices = [fake_slice0, fake_slice1, fake_slice2]
+        collections_names = classify_documents_per_collection(qdrant_connector, slices)
+        self.assertNotIn("collection_welearn_mul_mulembmodel_og", collections_names)
diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
index c238a17..a6a89ec 100644
--- a/welearn_datastack/modules/qdrant_handler.py
+++ b/welearn_datastack/modules/qdrant_handler.py
@@ -9,9 +9,7 @@
 from qdrant_client.http.models import models
 
 from welearn_datastack.data.db_models import DocumentSlice
-from welearn_datastack.exceptions import (
-    ErrorWhileDeletingChunks,
-)
+from welearn_datastack.exceptions import ErrorWhileDeletingChunks
 
 logger = logging.getLogger(__name__)
 
@@ -79,7 +77,6 @@ def delete_points_related_to_document(
     """
     logger.info("Deletion started")
     logger.debug(f"Deleting points related to {documents_ids} in {collection_name}")
-    op_res = None
 
     try:
         op_res = qdrant_connector.delete(

From de68cff5de1d4c6096b56aa2def7393d81b9fa8b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com>
Date: Tue, 30 Sep 2025 17:13:56 +0200
Subject: [PATCH 4/7] Update welearn_datastack/modules/qdrant_handler.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 welearn_datastack/modules/qdrant_handler.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
index a6a89ec..05944db 100644
--- a/welearn_datastack/modules/qdrant_handler.py
+++ b/welearn_datastack/modules/qdrant_handler.py
@@ -35,22 +35,23 @@ def classify_documents_per_collection(
         lang = dslice.document.lang
         model = dslice.embedding_model.title
         collection_name = None
+        multilingual_collection = f"collection_welearn_mul_{model}"
+        mono_collection = f"collection_welearn_{lang}_{model}"
         # Check multilingual
         for cn in collections_names_in_qdrant:
-            multilingual_collection = f"collection_welearn_mul_{model}"
             if cn == multilingual_collection:
                 collection_name = multilingual_collection
 
         # Check monolingual
         for cn in collections_names_in_qdrant:
-            mono_collection = f"collection_welearn_{lang}_{model}"
             if cn == mono_collection:
                 collection_name = mono_collection
 
         if not collection_name:
             logger.error(
-                "Collection %s not found in Qdrant, slice %s ignored",
-                collection_name,
+                "Collections %s or %s not found in Qdrant, slice %s ignored",
+                multilingual_collection,
+                mono_collection,
                 dslice.id,
             )
             continue

From 0adb22224b54efa4a7dfd38d78b4a4405b03d846 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Wed, 1 Oct 2025 10:39:12 +0200
Subject: [PATCH 5/7] Regroup the two loops into one
 https://github.com/CyberCRI/welearn-datastack/pull/66#discussion_r2392006653

---
 welearn_datastack/modules/qdrant_handler.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
index a6a89ec..e9de764 100644
--- a/welearn_datastack/modules/qdrant_handler.py
+++ b/welearn_datastack/modules/qdrant_handler.py
@@ -1,5 +1,4 @@
 import logging
-from collections import defaultdict
 from typing import Collection, Dict, List, Set, Type
 from uuid import UUID
 
@@ -35,17 +34,18 @@ def classify_documents_per_collection(
         lang = dslice.document.lang
         model = dslice.embedding_model.title
         collection_name = None
-        # Check multilingual
+
+        # Check multilingual or mono lingual
         for cn in collections_names_in_qdrant:
             multilingual_collection = f"collection_welearn_mul_{model}"
+            mono_collection = f"collection_welearn_{lang}_{model}"
+
             if cn == multilingual_collection:
                 collection_name = multilingual_collection
-
-        # Check monolingual
-        for cn in collections_names_in_qdrant:
-            mono_collection = f"collection_welearn_{lang}_{model}"
+                break
             if cn == mono_collection:
                 collection_name = mono_collection
+                break
 
         if not collection_name:
             logger.error(

From 99e06122b68ba1aef2b4c6c7f71e9719aa610899 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com>
Date: Wed, 1 Oct 2025 11:07:22 +0200
Subject: [PATCH 6/7] Update
 welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py

Co-authored-by: Sandra Guerreiro  <sandragjacinto@gmail.com>
---
 .../nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
index 1c6a709..8a8e6ea 100644
--- a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
+++ b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
@@ -114,7 +114,7 @@ def main() -> None:
 
         # Iterate on each collection
         for collection_name in documents_per_collection:
-            logger.info(f"We working on collection : {collection_name}")
+            logger.info(f"We are working on collection : {collection_name}")
             # We need to delete all points related to the documents in the collection for avoiding duplicates
             del_res = delete_points_related_to_document(
                 collection_name=collection_name,

From 82792efe4146bb0b16a80876b56742abd33a5415 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Wed, 1 Oct 2025 11:07:32 +0200
Subject: [PATCH 7/7] Replace the loop with direct membership checks

---
 welearn_datastack/modules/qdrant_handler.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
index 66af763..272e349 100644
--- a/welearn_datastack/modules/qdrant_handler.py
+++ b/welearn_datastack/modules/qdrant_handler.py
@@ -34,20 +34,15 @@ def classify_documents_per_collection(
         lang = dslice.document.lang
         model = dslice.embedding_model.title
         collection_name = None
+        multilingual_collection = f"collection_welearn_mul_{model}"
+        mono_collection = f"collection_welearn_{lang}_{model}"
 
         # Check multilingual or mono lingual
-        for cn in collections_names_in_qdrant:
-            multilingual_collection = f"collection_welearn_mul_{model}"
-            mono_collection = f"collection_welearn_{lang}_{model}"
-
-            if cn == multilingual_collection:
-                collection_name = multilingual_collection
-                break
-            if cn == mono_collection:
-                collection_name = mono_collection
-                break
-
-        if not collection_name:
+        if multilingual_collection in collections_names_in_qdrant:
+            collection_name = multilingual_collection
+        elif mono_collection in collections_names_in_qdrant:
+            collection_name = mono_collection
+        else:
             logger.error(
                 f"Collection {collection_name} not found in Qdrant, slice {dslice.id} ignored",
             )