From 4dc2f991ed90795167a4394c5e57615f2acac6c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 20 Nov 2025 14:12:13 +0100 Subject: [PATCH 1/3] feat: add validation for mandatory fields in document extraction --- .../test_nodes/test_extract_n_collect_docs.py | 25 +++++++++++++++++++ welearn_datastack/modules/validation.py | 11 ++++++++ .../document_collector.py | 8 ++++++ 3 files changed, 44 insertions(+) create mode 100644 welearn_datastack/modules/validation.py diff --git a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py index 90bf606..96cf1b7 100644 --- a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py +++ b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py @@ -123,6 +123,31 @@ def test_extract_data(self, collector_selector_mock): self.assertEqual(error_docs[0].document_id, self.doc_invalid.id) self.assertEqual(len(process_states), 2) + @patch( + "welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector" + ) + def test_extract_and_with_none_data(self, collector_selector_mock): + collector_selector_mock.select_collector.return_value = mock.MagicMock( + spec=IPluginRESTCollector + ) + self.doc_valid.full_content = None # Simulate missing content + collector_selector_mock.select_collector.return_value.run.return_value = [ + WrapperRetrieveDocument(document=self.doc_valid), + ] + + ( + extracted_docs, + error_docs, + process_states, + ) = document_collector.extract_data_from_urls( + welearn_documents=[self.doc_valid, self.doc_invalid] + ) + + self.assertEqual(len(extracted_docs), 0) + self.assertEqual(len(error_docs), 1) + self.assertEqual(error_docs[0].document_id, self.doc_valid.id) + self.assertEqual(len(process_states), 1) + @patch( "welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector" ) diff --git a/welearn_datastack/modules/validation.py b/welearn_datastack/modules/validation.py new file mode 100644 index 0000000..5d76a66 --- /dev/null +++ b/welearn_datastack/modules/validation.py @@ -0,0 +1,11 @@ +from welearn_database.data.models import WeLearnDocument + + +def validate_non_null_fields_document(doc: WeLearnDocument) -> bool: + """ + Validate if a WeLearnDocument has values where it's mandatory after extraction. + :return: True if valid, False otherwise + """ + desc_in_error = not doc.description or doc.description.strip() == "" + content_in_error = not doc.full_content or doc.full_content.strip() == "" + return not (desc_in_error or content_in_error) diff --git a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py index 510944d..0be6723 100644 --- a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py +++ b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py @@ -17,6 +17,7 @@ compute_readability, identify_document_language, ) +from welearn_datastack.modules.validation import validate_non_null_fields_document from welearn_datastack.plugins.interface import IPlugin from welearn_datastack.utils_.database_utils import create_db_session from welearn_datastack.utils_.path_utils import setup_local_path @@ -156,6 +157,13 @@ def extract_data_from_urls( documents = corpus_collector.run(documents=welearn_documents) # type: ignore for wrapper_document in documents: + is_none_valid = validate_non_null_fields_document(wrapper_document.document) + if not is_none_valid and not wrapper_document.is_error: + wrapper_document.http_error_code = 422 + wrapper_document.error_info = ( + "Mandatory fields are missing after extraction" + ) + state_title = ( Step.DOCUMENT_SCRAPED.value if not wrapper_document.is_error From 3b5a2fba06c69307b5014296848a801b1c62e008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:14:21 +0100 Subject: [PATCH 2/3] Update welearn_datastack/modules/validation.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- welearn_datastack/modules/validation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/welearn_datastack/modules/validation.py b/welearn_datastack/modules/validation.py index 5d76a66..84575b7 100644 --- a/welearn_datastack/modules/validation.py +++ b/welearn_datastack/modules/validation.py @@ -6,6 +6,6 @@ def validate_non_null_fields_document(doc: WeLearnDocument) -> bool: Validate if a WeLearnDocument has values where it's mandatory after extraction. :return: True if valid, False otherwise """ - desc_in_error = not doc.description or doc.description.strip() == "" - content_in_error = not doc.full_content or doc.full_content.strip() == "" - return not (desc_in_error or content_in_error) + is_desc_empty = not doc.description or doc.description.strip() == "" + is_content_empty = not doc.full_content or doc.full_content.strip() == "" + return not (is_desc_empty or is_content_empty) From 5cf00bd75e2743f4ee0a25647f725d5b8cdf96ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= <133012334+lpi-tn@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:14:44 +0100 Subject: [PATCH 3/3] Update welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../nodes_workflow/DocumentHubCollector/document_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py index 0be6723..cfe6906 100644 --- a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py +++ b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py @@ -154,7 +154,7 @@ def extract_data_from_urls( for corpus_name in batch_docs: # Get data corpus_collector = corpus_plugin[corpus_name] - documents = corpus_collector.run(documents=welearn_documents) # type: ignore + documents = corpus_collector.run(documents=batch_docs[corpus_name]) # type: ignore for wrapper_document in documents: is_none_valid = validate_non_null_fields_document(wrapper_document.document)