In [None]:
import os
import time
from collections import deque
from urllib.parse import urljoin

import boto3
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


In [None]:
# Module-level S3 client; send_to_s3() uses it to upload files.
obj = boto3.client("s3")


In [None]:
def fetch_links(base_url, start_url, max_articles=50, delay=2):
    """
    Crawls Conservapedia breadth-first to gather article links.

    Starting from ``start_url``, every successfully fetched page is scanned
    for anchors. A link is kept when it resolves under ``base_url``, its raw
    ``href`` contains no ``:`` (this also rejects absolute URLs, whose scheme
    contains a colon; presumably meant to skip namespaced wiki pages), and
    the resolved URL carries no ``#`` fragment. Each kept link is recorded
    exactly once and queued so that it is crawled in turn.

    Args:
        base_url (str): The base URL of Conservapedia.
        start_url (str): The starting URL for crawling.
        max_articles (int): Maximum number of article links to collect.
        delay (float): Seconds to sleep after each successfully fetched page
            (politeness delay). Defaults to 2.

    Returns:
        list: Unique article URLs in discovery order, at most
        ``max_articles`` long. ``start_url`` itself is never included.
    """
    # ``seen`` holds every URL that has ever been queued, so a page linked
    # from many places is collected (and later scraped) only once.
    seen = {start_url}
    to_visit = deque([start_url])
    article_links = []

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    # The context manager releases pooled connections once crawling ends.
    with requests.Session() as session:
        # Setup session with retries
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        while to_visit and len(article_links) < max_articles:
            current_url = to_visit.popleft()

            try:
                response = session.get(current_url, headers=headers, timeout=10)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error fetching {current_url}: {e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            for link in soup.find_all('a', href=True):
                href = link['href']
                full_url = urljoin(base_url, href)

                # Reject off-site links, hrefs containing ':', fragment
                # links and anything already collected *before* counting,
                # so the limit applies to usable, unique articles only.
                if not full_url.startswith(base_url):
                    continue
                if ":" in href or "#" in full_url or full_url in seen:
                    continue

                seen.add(full_url)
                to_visit.append(full_url)
                article_links.append(full_url)
                if len(article_links) >= max_articles:
                    break

            if delay > 0:
                time.sleep(delay)  # Politeness delay

    return article_links

def scrape_article(url, session, headers):
    """
    Scrapes the main text content of a Conservapedia article.

    The page is fetched through ``session``, the ``<div id="bodyContent">``
    element is located, and the text of every ``<p>`` inside it is collected.
    Whitespace inside a paragraph is collapsed to single spaces and
    paragraphs without any text are dropped.

    Args:
        url (str): The URL of the Conservapedia article.
        session: The session object with retry logic.
        headers (dict): Headers for the request.

    Returns:
        str: The article text, one paragraph per line. No exception is
        raised on failure; a message is returned instead, one of
        ``"Main content area not found."``, ``"Error fetching the URL: ..."``
        or ``"An error occurred: ..."``. Callers that need to tell content
        from failure have to check for these messages.
    """
    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content_div = soup.find('div', {'id': 'bodyContent'})

        if content_div is None:
            return "Main content area not found."

        # get_text(strip=True) strips each text node and joins the pieces
        # with an empty separator, which glues words together across inline
        # tags ("The <a>quick</a> fox" -> "Thequickfox"). Take the raw text
        # and normalise whitespace instead.
        paragraphs = (
            " ".join(p.get_text().split()) for p in content_div.find_all('p')
        )
        return "\n".join(text for text in paragraphs if text)

    except requests.exceptions.RequestException as e:
        return f"Error fetching the URL: {e}"
    except Exception as e:
        # Deliberate best-effort: any parsing problem is reported as text.
        return f"An error occurred: {e}"

def save_to_txt(content, filename):
    """
    Writes article text to a UTF-8 encoded file, replacing any existing file.

    Args:
        content (str): The text to store.
        filename (str): Path of the file that receives the text.
    """
    handle = open(filename, mode='w', encoding='utf-8')
    try:
        handle.write(content)
    finally:
        # Always release the file handle, even if the write fails.
        handle.close()

def send_to_s3(filename, bucket="arxivpapers", key=None):
    """
    Uploads a local file to S3 through the module-level client ``obj``.

    Args:
        filename (str): Path of the local file to upload.
        bucket (str): Destination bucket name. Defaults to "arxivpapers".
        key (str | None): Destination object key. When omitted, the key is
            "conservapedia_papers/" followed by ``filename`` exactly as
            given, so any directory part of ``filename`` ends up in the key.
    """
    if key is None:
        key = "conservapedia_papers/" + filename
    obj.upload_file(Filename=filename, Bucket=bucket, Key=key)

In [None]:
if __name__ == "__main__":
    # Crawl article links, then scrape each article and upload its text to S3.
    base_url = "https://www.conservapedia.com"
    start_url = f"{base_url}/Main_Page"
    max_articles = 20000  # Change as needed
    output_dir = "conservapedia_articles"
    os.makedirs(output_dir, exist_ok=True)

    # scrape_article() reports failures by returning one of these messages
    # instead of raising; such results must not be uploaded as article text.
    failure_prefixes = (
        "Main content area not found.",
        "Error fetching the URL: ",
        "An error occurred: ",
    )

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    print("Fetching article links...")
    article_links = fetch_links(base_url, start_url, max_articles)
    print(f"Found {len(article_links)} articles. Scraping content...")

    # The context manager closes pooled connections when scraping ends.
    with requests.Session() as session:
        # Setup session with retries
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        for idx, article_url in enumerate(article_links, start=1):
            print(f"\n--- Scraping Article {idx} ---")

            article_title = article_url.split("/")[-1]
            if not article_title:
                # A URL ending in "/" would otherwise produce the name ".txt".
                print(f"Skipping {article_url}: no article title in URL")
                continue

            article_text = scrape_article(article_url, session, headers)
            if article_text.startswith(failure_prefixes):
                print(f"Skipping {article_url}: {article_text}")
                continue

            # The file is staged in the working directory under its bare
            # name because send_to_s3() appends the path it is given to the
            # S3 key prefix.
            shortname = f"{article_title}.txt"
            save_to_txt(article_text, shortname)
            try:
                send_to_s3(shortname)
            finally:
                # Remove the staging file even when the upload fails.
                os.remove(shortname)

Fetching article links...
Found 13787 articles. Scraping content...

--- Scraping Article 1 ---

--- Scraping Article 2 ---

--- Scraping Article 3 ---

--- Scraping Article 4 ---

--- Scraping Article 5 ---

--- Scraping Article 6 ---

--- Scraping Article 7 ---

--- Scraping Article 8 ---

--- Scraping Article 9 ---

--- Scraping Article 10 ---

--- Scraping Article 11 ---

--- Scraping Article 12 ---

--- Scraping Article 13 ---

--- Scraping Article 14 ---

--- Scraping Article 15 ---

--- Scraping Article 16 ---

--- Scraping Article 17 ---

--- Scraping Article 18 ---

--- Scraping Article 19 ---

--- Scraping Article 20 ---

--- Scraping Article 21 ---

--- Scraping Article 22 ---

--- Scraping Article 23 ---

--- Scraping Article 24 ---

--- Scraping Article 25 ---

--- Scraping Article 26 ---

--- Scraping Article 27 ---

--- Scraping Article 28 ---

--- Scraping Article 29 ---

--- Scraping Article 30 ---

--- Scraping Article 31 ---

--- Scraping Article 32 ---

--- Scraping