From 4dc2f991ed90795167a4394c5e57615f2acac6c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <theo.nardin@cri-paris.org>
Date: Thu, 20 Nov 2025 14:12:13 +0100
Subject: [PATCH 1/3] feat: add validation for mandatory fields in document
 extraction

---
 .../test_nodes/test_extract_n_collect_docs.py | 25 +++++++++++++++++++
 welearn_datastack/modules/validation.py       | 11 ++++++++
 .../document_collector.py                     |  8 ++++++
 3 files changed, 44 insertions(+)
 create mode 100644 welearn_datastack/modules/validation.py

diff --git a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py
index 90bf606..96cf1b7 100644
--- a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py
+++ b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py
@@ -123,6 +123,31 @@ def test_extract_data(self, collector_selector_mock):
         self.assertEqual(error_docs[0].document_id, self.doc_invalid.id)
         self.assertEqual(len(process_states), 2)
 
+    @patch(
+        "welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector"
+    )
+    def test_extract_and_with_none_data(self, collector_selector_mock):
+        collector_selector_mock.select_collector.return_value = mock.MagicMock(
+            spec=IPluginRESTCollector
+        )
+        self.doc_valid.full_content = None  # Simulate missing content
+        collector_selector_mock.select_collector.return_value.run.return_value = [
+            WrapperRetrieveDocument(document=self.doc_valid),
+        ]
+
+        (
+            extracted_docs,
+            error_docs,
+            process_states,
+        ) = document_collector.extract_data_from_urls(
+            welearn_documents=[self.doc_valid, self.doc_invalid]
+        )
+
+        self.assertEqual(len(extracted_docs), 0)
+        self.assertEqual(len(error_docs), 1)
+        self.assertEqual(error_docs[0].document_id, self.doc_valid.id)
+        self.assertEqual(len(process_states), 1)
+
     @patch(
         "welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector"
     )
diff --git a/welearn_datastack/modules/validation.py b/welearn_datastack/modules/validation.py
new file mode 100644
index 0000000..5d76a66
--- /dev/null
+++ b/welearn_datastack/modules/validation.py
@@ -0,0 +1,11 @@
+from welearn_database.data.models import WeLearnDocument
+
+
+def validate_non_null_fields_document(doc: WeLearnDocument) -> bool:
+    """
+    Validate if a WeLearnDocument has values where it's mandatory after extraction.
+    :return: True if valid, False otherwise
+    """
+    desc_in_error = not doc.description or doc.description.strip() == ""
+    content_in_error = not doc.full_content or doc.full_content.strip() == ""
+    return not (desc_in_error or content_in_error)
diff --git a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
index 510944d..0be6723 100644
--- a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
+++ b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
@@ -17,6 +17,7 @@
     compute_readability,
     identify_document_language,
 )
+from welearn_datastack.modules.validation import validate_non_null_fields_document
 from welearn_datastack.plugins.interface import IPlugin
 from welearn_datastack.utils_.database_utils import create_db_session
 from welearn_datastack.utils_.path_utils import setup_local_path
@@ -156,6 +157,13 @@ def extract_data_from_urls(
         documents = corpus_collector.run(documents=welearn_documents)  # type: ignore
 
         for wrapper_document in documents:
+            is_none_valid = validate_non_null_fields_document(wrapper_document.document)
+            if not is_none_valid and not wrapper_document.is_error:
+                wrapper_document.http_error_code = 422
+                wrapper_document.error_info = (
+                    "Mandatory fields are missing after extraction"
+                )
+
             state_title = (
                 Step.DOCUMENT_SCRAPED.value
                 if not wrapper_document.is_error

From 3b5a2fba06c69307b5014296848a801b1c62e008 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com>
Date: Thu, 20 Nov 2025 14:14:21 +0100
Subject: [PATCH 2/3] Update welearn_datastack/modules/validation.py
 to rename the emptiness flags to is_desc_empty / is_content_empty

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 welearn_datastack/modules/validation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/welearn_datastack/modules/validation.py b/welearn_datastack/modules/validation.py
index 5d76a66..84575b7 100644
--- a/welearn_datastack/modules/validation.py
+++ b/welearn_datastack/modules/validation.py
@@ -6,6 +6,6 @@ def validate_non_null_fields_document(doc: WeLearnDocument) -> bool:
     Validate if a WeLearnDocument has values where it's mandatory after extraction.
     :return: True if valid, False otherwise
     """
-    desc_in_error = not doc.description or doc.description.strip() == ""
-    content_in_error = not doc.full_content or doc.full_content.strip() == ""
-    return not (desc_in_error or content_in_error)
+    is_desc_empty = not doc.description or doc.description.strip() == ""
+    is_content_empty = not doc.full_content or doc.full_content.strip() == ""
+    return not (is_desc_empty or is_content_empty)

From 5cf00bd75e2743f4ee0a25647f725d5b8cdf96ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com>
Date: Thu, 20 Nov 2025 14:14:44 +0100
Subject: [PATCH 3/3] Update
 welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
 to run each corpus collector on its own batch of documents

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../nodes_workflow/DocumentHubCollector/document_collector.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
index 0be6723..cfe6906 100644
--- a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
+++ b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
@@ -154,7 +154,7 @@ def extract_data_from_urls(
     for corpus_name in batch_docs:
         # Get data
         corpus_collector = corpus_plugin[corpus_name]
-        documents = corpus_collector.run(documents=welearn_documents)  # type: ignore
+        documents = corpus_collector.run(documents=batch_docs[corpus_name])  # type: ignore
 
         for wrapper_document in documents:
             is_none_valid = validate_non_null_fields_document(wrapper_document.document)