In [1]:
import pandas as pd

In [2]:
import time
import requests
from bs4 import BeautifulSoup

# Typographic characters and the plain-ASCII text substituted for them.
# The mapping is passed to str.maketrans() in replace_strange_chars, so every
# key has to be a single character.
REPLACEMENTS: dict[str, str] = {
    "“": "'",
    "”": "'",
    "’": "'",
    "‘": "'",
    "…": "...",
    "—": "-",
    "\u00a0": " ",  # non-breaking space
}

# get_paragraphs drops every paragraph whose cleaned text starts with one of
# these prefixes.
EXCLUDE_STARTSWITH: list[str] = [
    "Written By",
    "Image Credit",
    "In health",
    "Michael Greger",
    "-Michael Greger",
    "PS:",
    "A founding member",
    "Subscribe",
    "Catch up",
    "Charity ID",
    # NOTE(review): two consecutive spaces below - possibly a character lost
    # when the notebook was exported; confirm against the live page text.
    "We  our volunteers!",
    "Interested in learning more about",
    "Check out",
    "For more on",
]

def get_webpage_content(url: str, timeout: int = 10) -> requests.Response | None:
    """Download a webpage and hand back the HTTP response.

    The request is sent with a browser-like ``User-Agent`` header. Any
    ``requests`` exception, including an HTTP error status, is printed and
    turned into a ``None`` return value instead of propagating.

    Args:
        url (str): Address of the page to download.
        timeout (int, optional): Request timeout in seconds. Defaults to 10.

    Returns:
        requests.Response | None: The successful response, or None when the
            request failed.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        page = requests.get(url, headers=request_headers, timeout=timeout)
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
        return None
    else:
        return page

def filter_links(links: list[str], root: str) -> list[str]:
    """Keeps the links that live below the root URL and are not pagination links.

    A link is kept when it starts with ``root``, has a non-empty remainder
    after that prefix (so the root page itself is dropped), and the remainder
    does not start with ``"page"`` (pagination links look like
    ``<root>page/<n>/``).

    Args:
        links (list[str]): List of extracted links from a webpage.
        root (str): The root URL that kept links must start with.

    Returns:
        list[str]: The kept links, in their input order.
    """
    filtered_links: list[str] = []
    for href in links:
        if not href.startswith(root):
            continue
        # Strip only the leading root; str.replace would also remove any later
        # occurrence of root inside the URL and distort the tail checked below.
        link_tail: str = href.removeprefix(root)
        if link_tail and not link_tail.startswith("page"):
            filtered_links.append(href)

    return filtered_links

def extract_all_urls(root: str, page_stop: int | None = None, wait: float = 0.2) -> list[str]:
    """Extracts all blog post URLs from paginated web pages.

    Args:
        root (str): The base URL of the website to scrape.
        page_stop (int | None, optional): Stop after a certain number of pages. Defaults to None (no limit).
        wait (float, optional): Delay between page requests to avoid blocking. Defaults to 0.2.

    Returns:
        list[str]: List of all blog post URLs.
    """
    i_page: int = 0
    url_list: list[str] = []
    while True:
        time.sleep(wait)  # wait a bit to avoid being blocked
        i_page += 1

        if page_stop is not None and i_page > page_stop:
            break

        if i_page == 1:
            page_url = root
        else:
            page_url = f"{root}page/{i_page}/"
        print(f"{i_page}. Page URL: {page_url}")

        response = get_webpage_content(page_url)
        if response is None:
            break

        soup = BeautifulSoup(response.content, "html.parser")
        links: list[str] = sorted({link["href"] for link in soup.find_all("a", href=True)})

        blog_posts_of_page: list[str] = filter_links(links, root)
        n_posts: int = len(blog_posts_of_page)
        print(f"\t Number of blog posts: {n_posts}")

        if n_posts < 2:
            break
        url_list.extend(blog_posts_of_page)

    return url_list

def replace_strange_chars(text: str) -> str:
    """Swaps typographic characters for their plain equivalents.

    The substitutions are taken from the module-level ``REPLACEMENTS`` mapping.

    Args:
        text (str): The string to clean.

    Returns:
        str: A copy of ``text`` with every mapped character substituted.
    """
    translation_table = str.maketrans(REPLACEMENTS)
    return text.translate(translation_table)

def get_meta_data(soup: BeautifulSoup) -> dict:
    """Reads the title and the two timestamps of a blog page.

    The title is the text of ``<h1 class="entry-title">``. ``created`` is the
    ``datetime`` attribute of the first ``<time class="updated">`` element and
    ``updated`` is the ``datetime`` attribute of the second ``<time>`` element
    on the page. The function raises if any of these elements is missing.

    Args:
        soup (BeautifulSoup): Parsed HTML of the blog page.

    Returns:
        dict: A dictionary with the keys ``title``, ``created`` and ``updated``.
    """
    title_tag = soup.find("h1", class_="entry-title")
    created_tag = soup.find("time", class_="updated")
    time_tags = soup.find_all("time")
    return {
        "title": title_tag.get_text(),
        "created": created_tag["datetime"],
        "updated": time_tags[1]["datetime"],
    }

def get_paragraphs(soup: BeautifulSoup) -> list[str]:
    """Collects the cleaned body paragraphs of a blog page.

    ``<p class="p1">`` elements are used when the page has any; otherwise all
    ``<p>`` elements are used. Each paragraph is stripped and passed through
    ``replace_strange_chars``. Empty paragraphs and those starting with a
    prefix from ``EXCLUDE_STARTSWITH`` are left out.

    Args:
        soup (BeautifulSoup): Parsed HTML of the blog page.

    Returns:
        list[str]: The remaining paragraphs in document order.
    """
    p_tags = soup.find_all("p", class_="p1") or soup.find_all("p")
    excluded_prefixes = tuple(EXCLUDE_STARTSWITH)

    cleaned: list[str] = []
    for p_tag in p_tags:
        text = replace_strange_chars(p_tag.get_text().strip())
        if not text or text.startswith(excluded_prefixes):
            continue
        cleaned.append(text)
    return cleaned

def get_key_takeaways(soup: BeautifulSoup) -> list[str]:
    """Extracts key takeaways from the blog content.

    The takeaways are the ``<li>`` items of the first ``<ul>`` that follows a
    ``<p>`` whose text is exactly ``"KEY TAKEAWAYS"``. Each item is stripped
    and passed through ``replace_strange_chars``.

    Args:
        soup (BeautifulSoup): Parsed HTML of the blog page.

    Returns:
        list[str]: A list of key takeaways from the blog; empty when the page
            has no "KEY TAKEAWAYS" paragraph or no ``<ul>`` follows it.
    """
    key_takeaways_heading = soup.find("p", string="KEY TAKEAWAYS")
    if key_takeaways_heading is None:
        return []

    key_takeaways_list = key_takeaways_heading.find_next("ul")
    if key_takeaways_list is None:
        # Heading present but no list after it: treat like "no takeaways"
        # instead of failing with AttributeError on None.
        return []

    return [replace_strange_chars(li.get_text().strip()) for li in key_takeaways_list.find_all("li")]

def extract_blog_data(soup: BeautifulSoup) -> dict:
    """Extracts all relevant blog data, including metadata, paragraphs, categories, and key takeaways.

    Categories and tags are derived from the CSS classes of the page's
    ``<article>`` element. A missing ``<article>`` element or ``class``
    attribute is treated as "no classes" rather than raising.

    Args:
        soup (BeautifulSoup): Parsed HTML of the blog page.

    Returns:
        dict: The metadata from ``get_meta_data`` plus the keys ``category``,
            ``blog_tags``, ``raw_tags``, ``paragraphs`` and ``key_takeaways``.
    """
    blog_content: dict = get_meta_data(soup)

    article = soup.find("article")
    tags_raw = (article.get("class") if article is not None else None) or []

    # "category-<slug>": only the first hyphen-separated token of the slug is kept.
    blog_content["category"] = [cat.split("-")[1] for cat in tags_raw if cat.startswith("category-")]
    # "tag-<slug>": each entry is the list of hyphen-separated tokens of the slug.
    blog_content["blog_tags"] = [tag.split("-")[1:] for tag in tags_raw if tag.startswith("tag-")]
    blog_content["raw_tags"] = tags_raw

    blog_content["paragraphs"] = get_paragraphs(soup)
    blog_content["key_takeaways"] = get_key_takeaways(soup)

    return blog_content


In [3]:
import json
import time
from pathlib import Path

from bs4 import BeautifulSoup
from tqdm import tqdm

In [4]:
from pathlib import Path

# Output folders, all anchored at "<current working directory>/data"
data_path = Path(".").resolve() / "data"
blog_posts_root: Path = data_path / "blog_posts"
post_path_raw: Path = blog_posts_root / "raw_txt"
post_path_json: Path = blog_posts_root / "json"

# Make sure every folder exists (no error if it is already there)
for directory in (data_path, blog_posts_root, post_path_raw, post_path_json):
    directory.mkdir(parents=True, exist_ok=True)

# Sanity check: each line should print True
for directory in (data_path, post_path_raw, post_path_json):
    print(directory.is_dir())


True
True
True


In [5]:
# Base URL of the blog index; also the prefix every blog post URL must share.
root_url: str = "https://nutritionfacts.org/blog/"
# File holding one blog post URL per line (already located inside blog_posts_root).
file_url_list: Path = blog_posts_root / "blog_posts_urls.csv"

In [6]:
# Fetch the first blog index page; get_webpage_content returns None on a request error.
response = get_webpage_content(root_url)

In [7]:
# Parse the HTML content of the index page with the built-in "html.parser" backend
soup = BeautifulSoup(response.content, "html.parser")

In [8]:
# Find all links on the page: de-duplicate the hrefs via a set, then sort them
# (sorted() returns a list, hence the list[str] annotation)
links: list[str] = sorted({link["href"] for link in soup.find_all("a", href=True)})
print("Number of links:", len(links))

Number of links: 92


In [9]:
# Filter the links: keep those below root_url that are not pagination links
blog_posts_of_page: list[str] = filter_links(links, root_url)
n_posts: int = len(blog_posts_of_page)
print(f"Number of blog posts: {n_posts}")


Number of blog posts: 24


### Extract URLs of all blog posts

In [10]:
# Walk all index pages (page_stop=None means no page limit) and collect the post URLs
urls_list: list[str] = extract_all_urls(root=root_url, page_stop=None)

1. Page URL: https://nutritionfacts.org/blog/
	 Number of blog posts: 24
2. Page URL: https://nutritionfacts.org/blog/page/2/
	 Number of blog posts: 25
3. Page URL: https://nutritionfacts.org/blog/page/3/
	 Number of blog posts: 25
4. Page URL: https://nutritionfacts.org/blog/page/4/
	 Number of blog posts: 25
5. Page URL: https://nutritionfacts.org/blog/page/5/
	 Number of blog posts: 25
6. Page URL: https://nutritionfacts.org/blog/page/6/
	 Number of blog posts: 25
7. Page URL: https://nutritionfacts.org/blog/page/7/
	 Number of blog posts: 25
8. Page URL: https://nutritionfacts.org/blog/page/8/
	 Number of blog posts: 25
9. Page URL: https://nutritionfacts.org/blog/page/9/
	 Number of blog posts: 25
10. Page URL: https://nutritionfacts.org/blog/page/10/
	 Number of blog posts: 25
11. Page URL: https://nutritionfacts.org/blog/page/11/
	 Number of blog posts: 25
12. Page URL: https://nutritionfacts.org/blog/page/12/
	 Number of blog posts: 25
13. Page URL: https://nutritionfacts.org/

In [11]:
# De-duplicate the collected URLs
blog_post_urls_set = set(urls_list)
print("Number of unique blog posts:", len(blog_post_urls_set))
# Result of the recorded run: 1290 unique URLs

Number of unique blog posts: 1290


In [12]:
# Post-processing: drop URLs whose path below the blog root consists only of
# digits (e.g. ".../blog/29369/"); these are not blog posts.
for url in tuple(blog_post_urls_set):  # iterate over a snapshot, the set is modified below
    link_tail: str = url.replace(root_url, "").replace("/", "")
    if not link_tail.isdigit():
        continue
    print(url)
    blog_post_urls_set.remove(url)
print("Number of unique blog posts:", len(blog_post_urls_set))
# Number of unique blog posts: 1287

https://nutritionfacts.org/blog/29369/
https://nutritionfacts.org/blog/66878/
https://nutritionfacts.org/blog/66532/
Number of unique blog posts: 1287


In [13]:
# Export the unique URLs to the CSV file, one URL per line, in sorted order.
# file_url_list is already built from blog_posts_root, so it is opened
# directly; the encoding is stated explicitly instead of relying on the
# platform default.
with open(file_url_list, "w", encoding="utf-8") as f:
    f.writelines(f"{url}\n" for url in sorted(blog_post_urls_set))

### Extract content of each blog post

In [14]:
# Read the URL list back from the CSV file (one URL per line).
# file_url_list is already built from blog_posts_root, so it is opened
# directly; the encoding is stated explicitly instead of relying on the
# platform default.
with open(file_url_list, encoding="utf-8") as f:
    urls_list: list[str] = f.read().splitlines()

### Testing

In [15]:
# Pick one post (index 1111 of the URL list) to experiment with
blog_post_url = urls_list[1111]
# Slug of the post: the URL without the blog root and without slashes
url_tail = blog_post_url.replace(root_url, "").replace("/", "")
url_tail

'using-lavender-to-treat-anxiety'

In [16]:
# Display the full URL of the selected test post
blog_post_url

'https://nutritionfacts.org/blog/using-lavender-to-treat-anxiety/'

In [17]:
# Download the test post; get_webpage_content returns None on a request error
response = get_webpage_content(blog_post_url)
# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

In [18]:
# Write the parsed page to "<slug>.html" in the current working directory.
# The encoding is explicit so that non-ASCII characters in the page do not
# depend on the platform default encoding.
with open(f"{url_tail}.html", "w", encoding="utf-8") as f:
    f.write(str(soup))

### pure content

In [19]:
# Extract the content of interest: all <p class="p1"> elements, joined by blank lines
paragraphs_raw = soup.find_all("p", class_="p1")
content = "\n\n".join(para.get_text() for para in paragraphs_raw)
# An empty result means this page has no <p class="p1"> elements
paragraphs_raw

[]

In [20]:
# Write the joined paragraph text to "<slug>.txt" in the current working
# directory. The encoding is explicit so that typographic characters in the
# text do not depend on the platform default encoding.
with open(f"{url_tail}.txt", "w", encoding="utf-8") as f:
    f.write(content)

### meta data

In [21]:
# Title plus created/updated timestamps of the test post
meta_data = get_meta_data(soup)
meta_data

{'title': 'Using Lavender to Treat Anxiety',
 'created': '2014-01-14T13:00:42+00:00',
 'updated': '2024-05-15T14:38:37-04:00'}

In [22]:
# Same title lookup that get_meta_data performs, shown step by step
title_text = soup.find("h1", class_="entry-title").get_text()
title_text

'Using Lavender to Treat Anxiety'

In [23]:
# Creation timestamp: datetime attribute of the first <time class="updated"> tag
date_created = soup.find("time", class_="updated")["datetime"]

# Last-update timestamp: datetime attribute of the second <time> tag on the page
date_last_update = soup.find_all("time")[1]["datetime"]

print("Datetime 01:", date_created)
print("Datetime 02:", date_last_update)

Datetime 01: 2014-01-14T13:00:42+00:00
Datetime 02: 2024-05-15T14:38:37-04:00


### paragraphs

In [24]:
# Cleaned and filtered paragraphs of the test post
paragraphs_clean = get_paragraphs(soup)
paragraphs_clean

['Lavender oil, which is distilled from lavender flowers, is often used in aromatherapy and massage. Despite its popularity, only recently have scientific investigations been undertaken into its biological activity.',
 "While there have been small-scale studies suggesting benefit from lavender oil massage, we didn't know if the benefit was coming from the lavender, the massage, or both. In an attempt to separate these two variables, a study was conducted in which patients in intensive care were given massages with either odorless oil or lavender oil. While patients massaged with lavender oil did say they felt less anxious and more positive, there were no objective differences found in terms of blood pressure, breathing, or heart rate. Perhaps the lavender was just covering up the nasty hospital smells.",
 "Subsequent studies using more sensitive tests did find physiological changes, though. We now know the scent of lavender can actually change brain wave patterns, but we didn't know wh

In [25]:
# Prefer <p class="p1"> elements; fall back to every <p> when the page has none
paragraphs_html: list = soup.find_all("p", class_="p1")
if not paragraphs_html:
    paragraphs_html = soup.find_all("p")

In [26]:
# Raw paragraph texts, without stripping or character replacement
paragraphs_raw: list[str] = [para.get_text() for para in paragraphs_html]
paragraphs_raw

['\n\n  Written By \n    Michael Greger M.D. FACLM\n  \n  •\n  \n      January 14, 2014\n  \n\n\nLast updated: \n  May 15, 2024\n  \n    • 3 min read\n  ',
 '\n',
 'Lavender oil, which is distilled from lavender flowers, is often used in aromatherapy and massage. Despite its popularity, only recently have scientific investigations been undertaken into its biological activity.',
 'While there have been small-scale studies suggesting benefit from lavender oil massage, we didn’t know if the benefit was coming from the lavender, the massage, or both. In an attempt to separate these two variables, a study was conducted in which patients in intensive care were given massages with either odorless oil or lavender oil. While patients massaged with lavender oil did say they felt less anxious and more positive, there were no objective differences found in terms of blood pressure, breathing, or heart rate.\xa0Perhaps the lavender was just covering up the nasty hospital smells.',
 'Subsequent studi

In [27]:
# Extract and clean paragraphs while excluding those that start with certain phrases.
# NOTE(review): this local prefix list is not identical to the module-level
# EXCLUDE_STARTSWITH ("Check out:" vs "Check out"; "-Michael Greger" and
# "For more on" are absent here), and the prefix filter runs before the
# character replacement, whereas get_paragraphs replaces first.
paragraphs_raw: list[str] = [para_html.get_text().strip() for para_html in paragraphs_html]
exclude_startswith: list[str] = [
    "Written By",
    "Image Credit",
    "In health",
    "Michael Greger",
    "PS:",
    "A founding member",
    "Subscribe",
    "Catch up",
    "Charity ID",
    "We  our volunteers!",
    "Interested in learning more about",
    "Check out:",
]
# Create clean list: skip empty and excluded paragraphs, then normalize characters
paragraphs_clean: list[str] = [
    replace_strange_chars(para_raw)
    for para_raw in paragraphs_raw
    if para_raw and not any(para_raw.startswith(prefix) for prefix in exclude_startswith)
]
paragraphs_clean

['Lavender oil, which is distilled from lavender flowers, is often used in aromatherapy and massage. Despite its popularity, only recently have scientific investigations been undertaken into its biological activity.',
 "While there have been small-scale studies suggesting benefit from lavender oil massage, we didn't know if the benefit was coming from the lavender, the massage, or both. In an attempt to separate these two variables, a study was conducted in which patients in intensive care were given massages with either odorless oil or lavender oil. While patients massaged with lavender oil did say they felt less anxious and more positive, there were no objective differences found in terms of blood pressure, breathing, or heart rate. Perhaps the lavender was just covering up the nasty hospital smells.",
 "Subsequent studies using more sensitive tests did find physiological changes, though. We now know the scent of lavender can actually change brain wave patterns, but we didn't know wh

### Extract key takeaways

In [28]:
# Locate the <p> whose text is exactly "KEY TAKEAWAYS"
key_takeaways_heading = soup.find("p", string="KEY TAKEAWAYS")
if key_takeaways_heading is None:
    key_takeaways_list = None
else:
    # Find the next <ul> element after the "KEY TAKEAWAYS" heading
    key_takeaways_list = key_takeaways_heading.find_next("ul")

if key_takeaways_list is None:
    # No heading, or no list after the heading: nothing to extract
    key_takeaways = []
else:
    # Extract the stripped, normalized text from each <li> in the list
    key_takeaways = [replace_strange_chars(li.get_text().strip()) for li in key_takeaways_list.find_all("li")]

# Print or use the extracted key takeaways
for takeaway in key_takeaways:
    print(takeaway)


### article tags

In [29]:
# The CSS classes of the <article> element carry "tag-<slug>" and "category-<slug>" entries
tags_raw = soup.find("article").get("class")
if tags_raw:
    # NOTE(review): split("-")[1] keeps only the first hyphen-separated token
    # after the prefix, so a class such as "tag-side-effects" yields "side".
    # extract_blog_data keeps all tokens for tags (split("-")[1:]) instead.
    tags_blog = [tag.split("-")[1] for tag in tags_raw if tag.startswith("tag-")]
    print(tags_blog)
    cats = [cat.split("-")[1] for cat in tags_raw if cat.startswith("category-")]
    print(cats)

['anxiety', 'aromatherapy', 'ativan', 'benzodiazepines', 'brain', 'downers', 'hormone', 'hormones', 'lavender', 'lavender', 'lorazepam', 'massage', 'saffron', 'sedatives', 'side']
['news']


### export to json

In [30]:
# Full record of the test post: metadata, categories, tags, paragraphs, key takeaways
blog_data = extract_blog_data(soup)

In [31]:
# Write the record to "<slug>.json" in the current working directory.
# ensure_ascii=True escapes non-ASCII characters as \uXXXX sequences.
with open(f"{url_tail}.json", "w", encoding="utf-8") as json_file:
    json.dump(blog_data, json_file, ensure_ascii=True, indent=4)

### Real extraction loop
Pure text only (NOT used in the end)

In [32]:
# pure text: store the joined text of all <p> elements of every post as "<slug>.txt"
for url in tqdm(urls_list):
    url_tail = url.replace(root_url, "").replace("/", "")
    file_out = post_path_raw / f"{url_tail}.txt"
    if file_out.exists():
        continue  # already downloaded in an earlier run

    time.sleep(0.5)  # wait a bit to avoid being blocked

    # get the HTML content
    response = get_webpage_content(url)
    if response is None:
        # The request failed (get_webpage_content has printed the error).
        # Skip this URL so one bad page does not abort the whole run; no file
        # is written, so a later re-run retries it.
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the content
    paragraphs = soup.find_all("p")
    content = "\n\n".join(para.get_text() for para in paragraphs)

    # export to file
    with open(file_out, "w", encoding="utf-8") as f:
        f.write(content)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1287/1287 [27:28<00:00,  1.28s/it]


### Metadata & text chunks (used in the end)

In [33]:
# Store metadata, tags, paragraphs and key takeaways of every post as "<slug>.json"
for url in tqdm(urls_list):
    url_tail = url.replace(root_url, "").replace("/", "")
    file_out = post_path_json / f"{url_tail}.json"
    if file_out.exists():
        continue  # already processed in an earlier run

    time.sleep(0.1)  # wait a bit to avoid being blocked

    # get the HTML content
    response = get_webpage_content(url)
    if response is None:
        # The request failed (get_webpage_content has printed the error).
        # Skip this URL so one bad page does not abort the whole run; no file
        # is written, so a later re-run retries it.
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract the blog data, with the source URL as the first key
    blog_data: dict = {"url": url}
    blog_data.update(extract_blog_data(soup))

    # export to json file (ensure_ascii=True escapes non-ASCII characters)
    with open(file_out, "w", encoding="utf-8") as json_file:
        json.dump(blog_data, json_file, ensure_ascii=True, indent=4)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1287/1287 [21:23<00:00,  1.00it/s]
