From 72a19f779390a105117ae402a2b2c5e2cc778c1f Mon Sep 17 00:00:00 2001
From: Théo
Date: Tue, 24 Jun 2025 17:15:16 +0200
Subject: [PATCH 1/4] unescape

---
 welearn_datastack/utils_/scraping_utils.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/welearn_datastack/utils_/scraping_utils.py b/welearn_datastack/utils_/scraping_utils.py
index 5009611..3739901 100644
--- a/welearn_datastack/utils_/scraping_utils.py
+++ b/welearn_datastack/utils_/scraping_utils.py
@@ -1,5 +1,6 @@
 import logging
 import re
+from html import unescape
 from html.parser import HTMLParser
 
 from bs4 import BeautifulSoup, NavigableString, Tag  # type: ignore
@@ -31,9 +32,9 @@ def remove_extra_whitespace(text: str) -> str:
     return " ".join(text.split())
 
 
-def remove_html_tags(text: str) -> str:
+def remove_html_stuff(text: str) -> str:
     """
-    removes html tags from text
+    removes html tags and unescapes html entities such as &amp; in text
 
     Args:
         text (str): text to evaluate
@@ -43,7 +44,9 @@
     """
     remover = HTMLTagRemover()
     remover.feed(text + "\n")
-    return remover.get_text()
+    txt = remover.get_text()
+    ret = unescape(txt)
+    return ret
 
 
 def format_cc_license(license: str) -> str:
@@ -140,7 +143,7 @@ def clean_text(content: str) -> str:
     Returns:
         str: the cleaned content
     """
-    return remove_extra_whitespace(remove_html_tags(content)).strip()
+    return remove_extra_whitespace(remove_html_stuff(content)).strip()
 
 
 def get_url_without_hal_like_versionning(url: str) -> str:

From 24a409adca5697d9631feab2ac6d2d4fe197dde5 Mon Sep 17 00:00:00 2001
From: Théo
Date: Tue, 24 Jun 2025 17:18:27 +0200
Subject: [PATCH 2/4] clean_text

---
 .../plugins/rest_requesters/pressbooks.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/welearn_datastack/plugins/rest_requesters/pressbooks.py b/welearn_datastack/plugins/rest_requesters/pressbooks.py
index 32199b3..ff11dc4 100644
--- a/welearn_datastack/plugins/rest_requesters/pressbooks.py
+++ b/welearn_datastack/plugins/rest_requesters/pressbooks.py
@@ -84,13 +84,6 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
         for item in container_content:
             post_id = item["id"]
             url = self._create_pressbook_id(main_url, post_id)
-            # if post_id not in main_urls[main_url]:
-            #     # Retrieve document doesnt exist in previous retrieved url
-            #     logger.warning(
-            #         f"Post ID {post_id} not found in main URLs for {main_url}"
-            #     )
-            #     error_docs.append(url)
-            #     continue
             try:
                 metadata_url = item["_links"]["metadata"][0]["href"]
             except KeyError:
@@ -118,7 +111,7 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
                 )
                 error_docs.append(url)
                 continue
-            title = metadata["name"]
+            title = clean_text(metadata["name"])
 
             # Content stuff
             not_formatted_content = item["content"]["raw"]
@@ -160,8 +153,8 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
             for author in metadata["author"]:
                 authors.append(
                     {
-                        "name": author["name"],
-                        "misc": author.get("contributor_institution"),
+                        "name": clean_text(author["name"]),
+                        "misc": clean_text(author.get("contributor_institution")),
                     }
                 )
 
@@ -170,11 +163,11 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
             for editor in metadata["editor"]:
                 editors.append(
                     {
-                        "name": editor["name"],
+                        "name": clean_text(editor["name"]),
                     }
                 )
 
-        publisher = metadata.get("publisher", {}).get("name")
+        publisher = clean_text(metadata.get("publisher", {}).get("name"))
 
         details = {
"license": license_url, From 1d8f9ea76ca44ae952ebde238a8e5d7373f47b33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Tue, 24 Jun 2025 17:28:38 +0200 Subject: [PATCH 3/4] add fallback case --- welearn_datastack/utils_/scraping_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/welearn_datastack/utils_/scraping_utils.py b/welearn_datastack/utils_/scraping_utils.py index 3739901..da20ed8 100644 --- a/welearn_datastack/utils_/scraping_utils.py +++ b/welearn_datastack/utils_/scraping_utils.py @@ -29,6 +29,8 @@ def remove_extra_whitespace(text: str) -> str: Returns: str: text without extra whitespace """ + if not isinstance(text, str): + return text return " ".join(text.split()) @@ -42,6 +44,8 @@ def remove_html_stuff(text: str) -> str: Returns: str: text without html tags """ + if not isinstance(text, str): + return text remover = HTMLTagRemover() remover.feed(text + "\n") txt = remover.get_text() @@ -55,6 +59,8 @@ def format_cc_license(license: str) -> str: :param license: License to format. :return: License well formated. """ + if not isinstance(license, str): + return license splitted_elements = license.split("-") version = splitted_elements[-1].strip() rights_code = "-".join(splitted_elements[1:-1]).strip().lower() @@ -129,6 +135,8 @@ def extract_property_from_html( def clean_return_to_line(string: str): + if not isinstance(string, str): + return string ret = re.sub(r"([\n\t\r])", "", string).strip() return ret @@ -143,6 +151,8 @@ def clean_text(content: str) -> str: Returns: str: the cleaned content """ + if not isinstance(content, str): + return content return remove_extra_whitespace(remove_html_stuff(content)).strip() @@ -154,5 +164,7 @@ def get_url_without_hal_like_versionning(url: str) -> str: :return: URL without versionning """ # Get the URL without the versionning part + if not isinstance(url, str): + return url uri = re.sub(r"v\d+$", "", url) return uri.strip() From 08a83c37392cc8745ee3d17397be1182f70fcf8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Tue, 24 Jun 2025 17:28:42 +0200 Subject: [PATCH 4/4] add fallback case --- .../plugins_test/test_pressbooks.py | 2 +- .../plugins/rest_requesters/pressbooks.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/document_collector_hub/plugins_test/test_pressbooks.py b/tests/document_collector_hub/plugins_test/test_pressbooks.py index 8d9dafd..3770007 100644 --- a/tests/document_collector_hub/plugins_test/test_pressbooks.py +++ b/tests/document_collector_hub/plugins_test/test_pressbooks.py @@ -59,7 +59,7 @@ def mock_get(url, *args, **kwargs): self.assertEqual(len(collected_docs), 1) doc = collected_docs[0] - self.assertEqual(doc.document_title, self.mock_metadata["name"]) + self.assertEqual(doc.document_title, f"{self.mock_metadata["isPartOf"]} - {self.mock_metadata["name"]}") self.assertTrue( doc.document_content.startswith( "Chapter 1: Introduction to Communication Situations" diff --git a/welearn_datastack/plugins/rest_requesters/pressbooks.py b/welearn_datastack/plugins/rest_requesters/pressbooks.py index ff11dc4..c0fdcb2 100644 --- a/welearn_datastack/plugins/rest_requesters/pressbooks.py +++ b/welearn_datastack/plugins/rest_requesters/pressbooks.py @@ -111,7 +111,13 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]] ) error_docs.append(url) continue - title = clean_text(metadata["name"]) + book_title = clean_text(metadata.get("isPartOf")) + element_title = clean_text(metadata["name"]) + + if book_title: + title = 
f"{book_title} - {element_title}" + else: + title = element_title # Content stuff not_formatted_content = item["content"]["raw"] @@ -154,7 +160,9 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]] authors.append( { "name": clean_text(author["name"]), - "misc": clean_text(author.get("contributor_institution")), + "misc": clean_text( + author.get("contributor_institution") + ), } )