### Example: scraping article data (animals section of Mental Floss)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import os
from urllib.parse import urljoin

def _extract_article_links(soup):
    """Return the article URLs linked from a listing page, in page order and without duplicates."""
    base_url = 'https://www.mentalfloss.com'
    article_prefixes = ('/article/', 'https://www.mentalfloss.com/article/')
    # A dict keeps first-seen order while dropping repeats, so runs are reproducible.
    links = {}
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith(article_prefixes):
            links[urljoin(base_url, href)] = None
    return list(links)


def _extract_article_text(article_soup):
    """Return the article's paragraph text flattened to one line, or '' if nothing usable is found."""
    content_div = article_soup.find('div', class_='article-content')
    if content_div is None:
        content_div = article_soup.find('main')
    if content_div is None:
        return ''
    paragraphs = [p.get_text(strip=True) for p in content_div.find_all('p')]
    return '\n'.join(paragraphs).replace('\n', ' ').strip()


def fetch_articles(url_template, topic, num_articles, source_name, existing_data=None,
                   *, output_path='articles_animals.csv', timeout=30):
    """
    Fetches articles from paginated URLs, extracts their text, and checkpoints the result to a CSV file.

    Listing pages are requested one after another (page 1, 2, ...). Every article link found on a
    page is downloaded unless it is already present in ``existing_data`` or was already seen
    earlier in this run. The CSV at ``output_path`` is rewritten after each stored article so an
    interrupted run loses at most the article in flight.

    Pagination stops when any of the following happens:
      * ``num_articles`` rows (including rows from ``existing_data``) have been collected;
      * a listing page contributes no link that was not already seen on an earlier page of this
        run (this also covers sites that keep serving the same page past the last one);
      * a listing page cannot be fetched or parsed.

    Args:
        url_template (str): The URL template for the category page with {page} placeholder.
        topic (str): The topic of the articles (e.g., "environment").
        num_articles (int): The maximum number of articles to hold, counting existing rows.
        source_name (str): The name of the news source ("Mental Floss").
        existing_data (pd.DataFrame, optional): Existing data to append to. Links found in its
            ``link`` column are not downloaded again. Defaults to None.
        output_path (str, optional): CSV file used for the incremental checkpoints.
            Defaults to 'articles_animals.csv'.
        timeout (float, optional): Per-request timeout in seconds. Defaults to 30.

    Returns:
        pd.DataFrame: A DataFrame containing the existing rows followed by the newly extracted ones.
    """
    # Accept-Encoding is deliberately not overridden: requests advertises only the encodings it
    # can actually decode, whereas forcing "br" risks undecodable bodies without a Brotli decoder.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6423.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
    }

    if existing_data is None:
        articles_data = []
    else:
        articles_data = existing_data.to_dict('records')
    articles_count = len(articles_data)

    # Links that already have a stored row; seeded from existing data so resuming never duplicates.
    processed_links = set()
    if existing_data is not None and 'link' in existing_data.columns:
        processed_links.update(existing_data['link'].dropna())

    # Links encountered on listing pages during this run, whether or not they were stored.
    seen_this_run = set()
    page_number = 1

    while articles_count < num_articles:
        page_url = url_template.format(page=page_number)
        try:
            response = requests.get(page_url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            page_links = _extract_article_links(soup)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page_url}: {e}")
            break
        except Exception as e:
            print(f"An unexpected error occurred while fetching from {page_url}: {e}")
            break

        # A page that adds nothing new means pagination is exhausted (or looping); stop here
        # instead of requesting further pages forever.
        new_links = [link for link in page_links if link not in seen_this_run]
        if not new_links:
            print(f"No more articles found on page {page_number}. Stopping pagination for {topic}.")
            break
        seen_this_run.update(new_links)

        requested_article = False
        for link in new_links:
            if articles_count >= num_articles:
                break
            if link in processed_links:
                continue

            requested_article = True
            try:
                article_response = requests.get(link, headers=headers, timeout=timeout)
                article_response.raise_for_status()
                article_soup = BeautifulSoup(article_response.content, 'html.parser')

                content = _extract_article_text(article_soup)
                if content:
                    articles_data.append({
                        'source': source_name,
                        'topic': topic,
                        'link': link,
                        'content': content
                    })
                    articles_count += 1
                    processed_links.add(link)
                    print(f"Processed article {articles_count}/{num_articles}: {link}")

                    # Checkpoint after every stored article.
                    pd.DataFrame(articles_data).to_csv(output_path, index=False)
            except Exception as e:
                # Best effort: one broken article must not abort the whole crawl.
                print(f"Error processing article: {link} - {e}")

            # Be polite to the server after every article request, successful or not.
            time.sleep(random.uniform(1, 3))

        if not requested_article:
            # Every link on this page was already stored (typical when resuming); still pause
            # so consecutive listing pages are not requested back-to-back.
            time.sleep(random.uniform(1, 3))

        page_number += 1

    return pd.DataFrame(articles_data)

def main():
    """
    Main function to orchestrate the web scraping process for the animals section of Mental Floss.

    Loads any previously saved rows from 'articles_animals.csv', passes them to
    ``fetch_articles`` so the run continues from them, and writes the combined result back to
    the same file.
    """
    source_name = "Mental Floss"
    topic = "animals"
    url_template = "https://www.mentalfloss.com/section/animals?page={page}"
    num_articles_to_scrape = 5000
    output_file = 'articles_animals.csv'

    # Check for existing data to append to. A file without any parsable columns (for example
    # one left behind by a run that collected nothing) is treated the same as a missing file.
    try:
        existing_data = pd.read_csv(output_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        existing_data = None

    print(f"Fetching up to {num_articles_to_scrape} animals articles from {source_name}...")
    all_articles = fetch_articles(url_template, topic, num_articles_to_scrape, source_name, existing_data)
    print(f"Successfully fetched {len(all_articles)} animals articles from {source_name}.")

    # Save all articles after fetching. An empty frame is not written: it would produce a
    # column-less CSV that read_csv rejects on the next run.
    if all_articles.empty:
        print(f"No articles collected; {output_file} was not written.")
    else:
        all_articles.to_csv(output_file, index=False)
    print("Done!")

# Run the scraper only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()