diff --git a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py
index 90bf606..96cf1b7 100644
--- a/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py
+++ b/tests/document_collector_hub/test_nodes/test_extract_n_collect_docs.py
@@ -123,6 +123,31 @@ def test_extract_data(self, collector_selector_mock):
         self.assertEqual(error_docs[0].document_id, self.doc_invalid.id)
         self.assertEqual(len(process_states), 2)
 
+    @patch(
+        "welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector"
+    )
+    def test_extract_and_with_none_data(self, collector_selector_mock):
+        collector_selector_mock.select_collector.return_value = mock.MagicMock(
+            spec=IPluginRESTCollector
+        )
+        self.doc_valid.full_content = None  # Simulate missing content
+        collector_selector_mock.select_collector.return_value.run.return_value = [
+            WrapperRetrieveDocument(document=self.doc_valid),
+        ]
+
+        (
+            extracted_docs,
+            error_docs,
+            process_states,
+        ) = document_collector.extract_data_from_urls(
+            welearn_documents=[self.doc_valid, self.doc_invalid]
+        )
+
+        self.assertEqual(len(extracted_docs), 0)
+        self.assertEqual(len(error_docs), 1)
+        self.assertEqual(error_docs[0].document_id, self.doc_valid.id)
+        self.assertEqual(len(process_states), 1)
+
     @patch(
         "welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector"
     )
diff --git a/welearn_datastack/modules/validation.py b/welearn_datastack/modules/validation.py
new file mode 100644
index 0000000..84575b7
--- /dev/null
+++ b/welearn_datastack/modules/validation.py
@@ -0,0 +1,19 @@
+from welearn_database.data.models import WeLearnDocument
+
+
+def validate_non_null_fields_document(doc: WeLearnDocument) -> bool:
+    """
+    Check that a WeLearnDocument has every field that is mandatory after extraction.
+
+    The mandatory fields are ``description`` and ``full_content``. A field counts as
+    missing when it is None, empty, or made only of whitespace.
+
+    :param doc: Document to check; a missing (None) document is treated as invalid.
+    :return: True if valid, False otherwise
+    """
+    if doc is None:
+        # Nothing to inspect: report the document as invalid instead of raising.
+        return False
+    has_description = bool(doc.description and doc.description.strip())
+    has_content = bool(doc.full_content and doc.full_content.strip())
+    return has_description and has_content
diff --git a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
index 510944d..cfe6906 100644
--- a/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
+++ b/welearn_datastack/nodes_workflow/DocumentHubCollector/document_collector.py
@@ -17,6 +17,7 @@
     compute_readability,
     identify_document_language,
 )
+from welearn_datastack.modules.validation import validate_non_null_fields_document
 from welearn_datastack.plugins.interface import IPlugin
 from welearn_datastack.utils_.database_utils import create_db_session
 from welearn_datastack.utils_.path_utils import setup_local_path
@@ -153,9 +154,16 @@ def extract_data_from_urls(
     for corpus_name in batch_docs:
         # Get data
         corpus_collector = corpus_plugin[corpus_name]
-        documents = corpus_collector.run(documents=welearn_documents)  # type: ignore
+        documents = corpus_collector.run(documents=batch_docs[corpus_name])  # type: ignore
 
         for wrapper_document in documents:
+            is_none_valid = validate_non_null_fields_document(wrapper_document.document)
+            if not is_none_valid and not wrapper_document.is_error:
+                wrapper_document.http_error_code = 422
+                wrapper_document.error_info = (
+                    "Mandatory fields are missing after extraction"
+                )
+
             state_title = (
                 Step.DOCUMENT_SCRAPED.value
                 if not wrapper_document.is_error