From 5c98cac6687532bf60724ede944730857388dcc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Tue, 18 Nov 2025 15:52:23 +0100 Subject: [PATCH 1/2] fix: handle documents without embedding model in classification --- .../qdrant_syncronizer/test_qdrant_handler.py | 20 ++++++++++++++++++- welearn_datastack/modules/qdrant_handler.py | 14 ++++++++++--- .../QdrantSyncronizer/qdrant_syncronizer.py | 15 ++++++++++++++ 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/tests/qdrant_syncronizer/test_qdrant_handler.py b/tests/qdrant_syncronizer/test_qdrant_handler.py index 824fa74..c86822e 100644 --- a/tests/qdrant_syncronizer/test_qdrant_handler.py +++ b/tests/qdrant_syncronizer/test_qdrant_handler.py @@ -75,6 +75,19 @@ def setUp(self): def tearDown(self): self.client.close() + def test_slice_without_embedding_model_should_go_to_none_collection(self): + doc_id = uuid.uuid4() + qdrant_connector = self.client + fake_slice = FakeSlice(doc_id) + fake_slice.embedding_model = None + slices = [fake_slice] + collections_names = classify_documents_per_collection(qdrant_connector, slices) + + expected = { + None: {fake_slice.document_id}, + } + self.assertEqual(dict(collections_names), expected) + def test_should_get_collections_names_for_given_slices(self): doc_id = uuid.uuid4() qdrant_connector = self.client @@ -83,7 +96,10 @@ def test_should_get_collections_names_for_given_slices(self): slices = [fake_slice] collections_names = classify_documents_per_collection(qdrant_connector, slices) - expected = {"collection_welearn_en_english-embmodel": {fake_slice.document_id}} + expected = { + None: set(), + "collection_welearn_en_english-embmodel": {fake_slice.document_id}, + } self.assertEqual(dict(collections_names), expected) def test_should_handle_multiple_slices_for_same_collection(self): @@ -101,6 +117,7 @@ def test_should_handle_multiple_slices_for_same_collection(self): slices = [fake_slice0, fake_slice1, fake_slice2] collections_names = classify_documents_per_collection(qdrant_connector, slices) expected = { + None: set(), "collection_welearn_en_english-embmodel": {doc_id0}, "collection_welearn_fr_french-embmodel": {doc_id1}, } @@ -130,6 +147,7 @@ def test_should_handle_multiple_slices_for_same_collection_with_multi_lingual_co slices = [fake_slice0, fake_slice1, fake_slice2] collections_names = classify_documents_per_collection(qdrant_connector, slices) expected = { + None: set(), "collection_welearn_en_english-embmodel": {doc_id0}, "collection_welearn_mul_mulembmodel": {doc_id1}, } diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py index 51c29a1..1a895b0 100644 --- a/welearn_datastack/modules/qdrant_handler.py +++ b/welearn_datastack/modules/qdrant_handler.py @@ -15,7 +15,7 @@ def classify_documents_per_collection( qdrant_connector: QdrantClient, slices: Collection[Type[DocumentSlice]] -) -> Dict[str, Set[UUID]]: +) -> Dict[str | None, Set[UUID]]: """ Classify documents per collection in Qdrant. @@ -29,10 +29,18 @@ def classify_documents_per_collection( tmp_collections_names_in_qdrant = qdrant_connector.get_collections().collections collections_names_in_qdrant = [c.name for c in tmp_collections_names_in_qdrant] - ret: Dict[str, Set[UUID]] = {} + ret: Dict[str | None, Set[UUID]] = {None: set()} for dslice in slices: lang = dslice.document.lang - model = dslice.embedding_model.title + try: + model = dslice.embedding_model.title + except AttributeError: + logger.error( + f"Slice {dslice.id} has no updated embedding model, document ({dslice.document_id} put in error", + ) + ret[None].add(dslice.document_id) # type: ignore + continue + collection_name = None multilingual_collection = f"collection_welearn_mul_{model}" mono_collection = f"collection_welearn_{lang}_{model}" diff --git a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py index 017b30f..4057b82 100644 --- a/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py +++ b/welearn_datastack/nodes_workflow/QdrantSyncronizer/qdrant_syncronizer.py @@ -112,6 +112,21 @@ def main() -> None: qdrant_connector=qdrant_client, slices=slices ) + # Flag documents with no collection + logger.info( + "Flag documents with no collection: %s", len(documents_per_collection[None]) + ) + for docid in documents_per_collection[None]: + db_session.add( + ProcessState( + id=uuid.uuid4(), + document_id=docid, + title=Step.KEPT_FOR_TRACE.value, + ) + ) + del documents_per_collection[None] + db_session.commit() + # Iterate on each collection for collection_name in documents_per_collection: logger.info(f"We are working on collection : {collection_name}") From a7c71911fe58b617f8a78b8873354490c7419388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com> Date: Tue, 18 Nov 2025 16:09:30 +0100 Subject: [PATCH 2/2] Update welearn_datastack/modules/qdrant_handler.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- welearn_datastack/modules/qdrant_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/welearn_datastack/modules/qdrant_handler.py b/welearn_datastack/modules/qdrant_handler.py index 1a895b0..b1fbc3a 100644 --- a/welearn_datastack/modules/qdrant_handler.py +++ b/welearn_datastack/modules/qdrant_handler.py @@ -36,7 +36,7 @@ def classify_documents_per_collection( model = dslice.embedding_model.title except AttributeError: logger.error( - f"Slice {dslice.id} has no updated embedding model, document ({dslice.document_id} put in error", + f"Slice {dslice.id} has no updated embedding model, document ({dslice.document_id}) put in error", ) ret[None].add(dslice.document_id) # type: ignore continue