@@ -59,7 +59,7 @@ def mock_get(url, *args, **kwargs):

self.assertEqual(len(collected_docs), 1)
doc = collected_docs[0]
self.assertEqual(doc.document_title, self.mock_metadata["name"])
self.assertEqual(doc.document_title, f"{self.mock_metadata["isPartOf"]} - {self.mock_metadata["name"]}")
self.assertTrue(
doc.document_content.startswith(
"Chapter 1: Introduction to Communication Situations"
welearn_datastack/plugins/rest_requesters/pressbooks.py (13 additions, 12 deletions)
@@ -84,13 +84,6 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
for item in container_content:
post_id = item["id"]
url = self._create_pressbook_id(main_url, post_id)
-# if post_id not in main_urls[main_url]:
-#     # Retrieve document doesnt exist in previous retrieved url
-#     logger.warning(
-#         f"Post ID {post_id} not found in main URLs for {main_url}"
-#     )
-#     error_docs.append(url)
-#     continue
try:
metadata_url = item["_links"]["metadata"][0]["href"]
except KeyError:
@@ -118,7 +111,13 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
)
error_docs.append(url)
continue
-title = metadata["name"]
+book_title = clean_text(metadata.get("isPartOf"))
+element_title = clean_text(metadata["name"])
+
+if book_title:
+    title = f"{book_title} - {element_title}"
+else:
+    title = element_title

# Content stuff
not_formatted_content = item["content"]["raw"]
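
For orientation, a minimal sketch of the new title logic with hypothetical metadata values (not taken from this PR):

metadata = {"isPartOf": "Effective Professional Communication", "name": "Chapter 1: Introduction"}
book_title = clean_text(metadata.get("isPartOf"))  # "Effective Professional Communication"
element_title = clean_text(metadata["name"])       # "Chapter 1: Introduction"
title = f"{book_title} - {element_title}" if book_title else element_title
# -> "Effective Professional Communication - Chapter 1: Introduction"
# When "isPartOf" is missing, clean_text passes None through, book_title is falsy,
# and the element title is used alone.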
@@ -160,8 +159,10 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
for author in metadata["author"]:
authors.append(
{
"name": author["name"],
"misc": author.get("contributor_institution"),
"name": clean_text(author["name"]),
"misc": clean_text(
author.get("contributor_institution")
),
}
)

@@ -170,11 +171,11 @@ def run(self, urls: List[str]) -> Tuple[List[ScrapedWeLearnDocument], List[str]]
for editor in metadata["editor"]:
editors.append(
{
"name": editor["name"],
"name": clean_text(editor["name"]),
}
)

-publisher = metadata.get("publisher", {}).get("name")
+publisher = clean_text(metadata.get("publisher", {}).get("name"))

details = {
"license": license_url,
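
To illustrate what the added clean_text calls change, a sketch with a hypothetical author entry (assuming HTMLTagRemover strips tags, as its name suggests):

author = {"name": " <em>Jane Doe</em> ", "contributor_institution": "MIT &amp; Harvard"}
clean_text(author["name"])                         # -> "Jane Doe": tags stripped, whitespace collapsed
clean_text(author.get("contributor_institution"))  # -> "MIT & Harvard": the entity is unescaped
clean_text(author.get("missing_key"))              # -> None: non-strings now pass through unchanged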
welearn_datastack/utils_/scraping_utils.py (19 additions, 4 deletions)
@@ -1,5 +1,6 @@
import logging
import re
+from html import unescape
from html.parser import HTMLParser

from bs4 import BeautifulSoup, NavigableString, Tag # type: ignore
@@ -28,22 +29,28 @@ def remove_extra_whitespace(text: str) -> str:
Returns:
str: text without extra whitespace
"""
+if not isinstance(text, str):
+    return text
return " ".join(text.split())


-def remove_html_tags(text: str) -> str:
+def remove_html_stuff(text: str) -> str:
"""
-removes html tags from text
+removes html tags and special stuff like & from text

Args:
text (str): text to evaluate

Returns:
str: text without html tags
"""
+if not isinstance(text, str):
+    return text
remover = HTMLTagRemover()
remover.feed(text + "\n")
-return remover.get_text()
+txt = remover.get_text()
+ret = unescape(txt)
+return ret


def format_cc_license(license: str) -> str:
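
A quick sketch of the renamed helper, assuming HTMLTagRemover removes tags as its name suggests (hypothetical inputs):

remove_html_stuff("<p>Bits &amp; pieces</p>")  # -> roughly "Bits & pieces": tags removed, then unescape() decodes &amp;
remove_html_stuff(None)                        # -> None, short-circuited by the new isinstance guard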
@@ -52,6 +59,8 @@ def format_cc_license(license: str) -> str:
:param license: License to format.
:return: License well formated.
"""
+if not isinstance(license, str):
+    return license
splitted_elements = license.split("-")
version = splitted_elements[-1].strip()
rights_code = "-".join(splitted_elements[1:-1]).strip().lower()
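
For reference, how the parsing lines above decompose a licence string (hypothetical input; the rest of the function is collapsed in this diff):

license = "CC-BY-SA-4.0"
splitted_elements = license.split("-")  # ["CC", "BY", "SA", "4.0"]
version = splitted_elements[-1].strip()  # "4.0"
rights_code = "-".join(splitted_elements[1:-1]).strip().lower()  # "by-sa"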
@@ -126,6 +135,8 @@ def extract_property_from_html(


def clean_return_to_line(string: str):
+if not isinstance(string, str):
+    return string
ret = re.sub(r"([\n\t\r])", "", string).strip()
return ret
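
The new guard leaves string inputs untouched; a sketch with hypothetical values:

clean_return_to_line("line one\n\tline two\r")  # -> "line oneline two": newlines, tabs and carriage returns are removed outright
clean_return_to_line(None)                      # -> None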

@@ -140,7 +151,9 @@ def clean_text(content: str) -> str:
Returns:
str: the cleaned content
"""
-return remove_extra_whitespace(remove_html_tags(content)).strip()
+if not isinstance(content, str):
+    return content
+return remove_extra_whitespace(remove_html_stuff(content)).strip()


def get_url_without_hal_like_versionning(url: str) -> str:
@@ -151,5 +164,7 @@ def get_url_without_hal_like_versionning(url: str) -> str:
:return: URL without versionning
"""
# Get the URL without the versionning part
+if not isinstance(url, str):
+    return url
uri = re.sub(r"v\d+$", "", url)
return uri.strip()
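
Finally, a sketch of the version stripping with hypothetical HAL-style URLs:

get_url_without_hal_like_versionning("https://hal.science/hal-01234567v2")  # -> "https://hal.science/hal-01234567"
get_url_without_hal_like_versionning("https://example.org/velvet")          # unchanged: the regex only matches a trailing v<digits>
get_url_without_hal_like_versionning(None)                                  # -> None via the new guard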