From 5c98cac6687532bf60724ede944730857388dcc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Tue, 18 Nov 2025 15:52:23 +0100
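Subject: [PATCH 1/2] fix: handle documents without embedding model in
 classification

classify_documents_per_collection now groups documents whose slice has no
embedding model under a None key instead of raising AttributeError. The
Qdrant synchronizer records a KEPT_FOR_TRACE process state for those
documents and drops the None key before iterating over the collections.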

---
 .../qdrant_syncronizer/test_qdrant_handler.py | 20 ++++++++++++++++++-
 welearn_datastack/modules/qdrant_handler.py   | 14 ++++++++++---
 .../QdrantSyncronizer/qdrant_syncronizer.py   | 15 ++++++++++++++
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/tests/qdrant_syncronizer/test_qdrant_handler.py b/tests/qdrant_syncronizer/test_qdrant_handler.py
index 824fa74..c86822e 100644
--- a/tests/qdrant_syncronizer/test_qdrant_handler.py
+++ b/tests/qdrant_syncronizer/test_qdrant_handler.py
@@ -75,6 +75,19 @@ def setUp(self):
     def tearDown(self):
         self.client.close()
 
+    def test_slice_without_embedding_model_should_go_to_none_collection(self):
+        doc_id = uuid.uuid4()
+        qdrant_connector = self.client
+        fake_slice = FakeSlice(doc_id)
+        fake_slice.embedding_model = None  # simulate a slice with no embedding model
+        slices = [fake_slice]
+        collections_names = classify_documents_per_collection(qdrant_connector, slices)
+
+        expected = {
+            None: {fake_slice.document_id},  # such documents land under the None key
+        }
+        self.assertEqual(dict(collections_names), expected)
+
     def test_should_get_collections_names_for_given_slices(self):
         doc_id = uuid.uuid4()
         qdrant_connector = self.client
@@ -83,7 +96,10 @@ def test_should_get_collections_names_for_given_slices(self):
         slices = [fake_slice]
         collections_names = classify_documents_per_collection(qdrant_connector, slices)
 
-        expected = {"collection_welearn_en_english-embmodel": {fake_slice.document_id}}
+        expected = {
+            None: set(),
+            "collection_welearn_en_english-embmodel": {fake_slice.document_id},
+        }
         self.assertEqual(dict(collections_names), expected)
 
     def test_should_handle_multiple_slices_for_same_collection(self):
@@ -101,6 +117,7 @@ def test_should_handle_multiple_slices_for_same_collection(self):
         slices = [fake_slice0, fake_slice1, fake_slice2]
         collections_names = classify_documents_per_collection(qdrant_connector, slices)
         expected = {
+            None: set(),
             "collection_welearn_en_english-embmodel": {doc_id0},
             "collection_welearn_fr_french-embmodel": {doc_id1},
         }
@@ -130,6 +147,7 @@ def test_should_handle_multiple_slices_for_same_collection_with_multi_lingual_co
         slices = [fake_slice0, fake_slice1, fake_slice2]
         collections_names = classify_documents_per_collection(qdrant_connector, slices)
         expected = {
+            None: set(),
             "collection_welearn_en_english-embmodel": {doc_id0},
             "collection_welearn_mul_mulembmodel": {doc_id1},
         }
diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
index 51c29a1..1a895b0 100644
--- a/welearn_datastack/modules/qdrant_handler.py
+++ b/welearn_datastack/modules/qdrant_handler.py
@@ -15,7 +15,7 @@
 
 def classify_documents_per_collection(
     qdrant_connector: QdrantClient, slices: Collection[Type[DocumentSlice]]
-) -> Dict[str, Set[UUID]]:
+) -> Dict[str | None, Set[UUID]]:
     """
     Classify documents per collection in Qdrant.
 
@@ -29,10 +29,18 @@ def classify_documents_per_collection(
     tmp_collections_names_in_qdrant = qdrant_connector.get_collections().collections
     collections_names_in_qdrant = [c.name for c in tmp_collections_names_in_qdrant]
 
-    ret: Dict[str, Set[UUID]] = {}
+    ret: Dict[str | None, Set[UUID]] = {None: set()}
     for dslice in slices:
         lang = dslice.document.lang
-        model = dslice.embedding_model.title
+        try:
+            model = dslice.embedding_model.title
+        except AttributeError:
+            logger.error(
+                f"Slice {dslice.id} has no updated embedding model, document ({dslice.document_id} put in error",
+            )
+            ret[None].add(dslice.document_id)  # type: ignore
+            continue
+
         collection_name = None
         multilingual_collection = f"collection_welearn_mul_{model}"
         mono_collection = f"collection_welearn_{lang}_{model}"
diff --git a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
index 017b30f..4057b82 100644
--- a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
+++ b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py
@@ -112,6 +112,21 @@ def main() -> None:
             qdrant_connector=qdrant_client, slices=slices
         )
 
+        # Flag documents with no collection (grouped under the None key)
+        logger.info(
+            "Flag documents with no collection: %s", len(documents_per_collection[None])
+        )
+        for docid in documents_per_collection[None]:
+            db_session.add(
+                ProcessState(
+                    id=uuid.uuid4(),
+                    document_id=docid,
+                    title=Step.KEPT_FOR_TRACE.value,
+                )
+            )
+        del documents_per_collection[None]
+        db_session.commit()
+
         # Iterate on each collection
         for collection_name in documents_per_collection:
             logger.info(f"We are working on collection : {collection_name}")

From a7c71911fe58b617f8a78b8873354490c7419388 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com>
Date: Tue, 18 Nov 2025 16:09:30 +0100
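Subject: [PATCH 2/2] Update welearn_datastack/modules/qdrant_handler.py

Add the missing closing parenthesis after the document id in the
"no updated embedding model" error log message.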

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 welearn_datastack/modules/qdrant_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py
index 1a895b0..b1fbc3a 100644
--- a/welearn_datastack/modules/qdrant_handler.py
+++ b/welearn_datastack/modules/qdrant_handler.py
@@ -36,7 +36,7 @@ def classify_documents_per_collection(
             model = dslice.embedding_model.title
         except AttributeError:
             logger.error(
-                f"Slice {dslice.id} has no updated embedding model, document ({dslice.document_id} put in error",
+                f"Slice {dslice.id} has no updated embedding model, document ({dslice.document_id}) put in error",
             )
             ret[None].add(dslice.document_id)  # type: ignore
             continue