Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,31 @@ def test_extract_data(self, collector_selector_mock):
self.assertEqual(error_docs[0].document_id, self.doc_invalid.id)
self.assertEqual(len(process_states), 2)

@patch(
    "welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector"
)
def test_extract_and_with_none_data(self, collector_selector_mock):
    # Scenario: the collector hands back a document whose full_content is
    # None. The assertions below expect it to land in the error list, with
    # nothing extracted and a single process state recorded.
    rest_collector = mock.MagicMock(spec=IPluginRESTCollector)
    collector_selector_mock.select_collector.return_value = rest_collector
    self.doc_valid.full_content = None  # Simulate missing content
    rest_collector.run.return_value = [
        WrapperRetrieveDocument(document=self.doc_valid),
    ]

    outcome = document_collector.extract_data_from_urls(
        welearn_documents=[self.doc_valid, self.doc_invalid]
    )
    extracted, errors, states = outcome

    # Nothing may be reported as successfully extracted.
    self.assertEqual(len(extracted), 0)
    # The document without content is the one and only error.
    self.assertEqual(len(errors), 1)
    self.assertEqual(errors[0].document_id, self.doc_valid.id)
    self.assertEqual(len(states), 1)

@patch(
"welearn_datastack.nodes_workflow.DocumentHubCollector.document_collector.collector_selector"
)
Expand Down
11 changes: 11 additions & 0 deletions welearn_datastack/modules/validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from welearn_database.data.models import WeLearnDocument


def validate_non_null_fields_document(doc: WeLearnDocument) -> bool:
    """
    Check that the mandatory text fields of a WeLearnDocument are filled in.

    Both ``description`` and ``full_content`` must hold text that is neither
    empty nor made only of whitespace.
    :param doc: document to check after extraction
    :return: True if valid, False otherwise
    """
    # Evaluate every mandatory field up front (no short-circuit between them),
    # then require all of them to be filled.
    filled_flags = [
        bool(text) and text.strip() != ""
        for text in (doc.description, doc.full_content)
    ]
    return all(filled_flags)
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
compute_readability,
identify_document_language,
)
from welearn_datastack.modules.validation import validate_non_null_fields_document
from welearn_datastack.plugins.interface import IPlugin
from welearn_datastack.utils_.database_utils import create_db_session
from welearn_datastack.utils_.path_utils import setup_local_path
Expand Down Expand Up @@ -153,9 +154,16 @@ def extract_data_from_urls(
for corpus_name in batch_docs:
# Get data
corpus_collector = corpus_plugin[corpus_name]
documents = corpus_collector.run(documents=welearn_documents) # type: ignore
documents = corpus_collector.run(documents=batch_docs[corpus_name]) # type: ignore

for wrapper_document in documents:
is_none_valid = validate_non_null_fields_document(wrapper_document.document)
if not is_none_valid and not wrapper_document.is_error:
wrapper_document.http_error_code = 422
wrapper_document.error_info = (
"Mandatory fields are missing after extraction"
)

state_title = (
Step.DOCUMENT_SCRAPED.value
if not wrapper_document.is_error
Expand Down
Loading