In [8]:
# scraper.py
# Make sure to run: pip install requests beautifulsoup4 python-dateutil tqdm

import requests
import sqlite3
import json
from bs4 import BeautifulSoup
from dateutil.parser import parse
from tqdm import tqdm
import time

# --- Database Functions ---

# These database helpers create and populate the 'articles' table, including its 'external_link' column.

def setup_database(db_name="fightaging_articles.db"):
    """Open the SQLite database and make sure the 'articles' table is usable.

    The table is created when missing. If a table from an earlier schema
    already exists without the 'external_link' column, the column is added
    in place so that later 7-column inserts do not fail.

    Args:
        db_name: Path of the SQLite database file (or ":memory:").

    Returns:
        The open sqlite3 connection. The caller is responsible for closing it.
    """
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY,
        url TEXT UNIQUE,
        publish_date TEXT,
        title TEXT,
        body TEXT,
        quotes TEXT,
        external_link TEXT 
    )
    ''')

    # CREATE TABLE IF NOT EXISTS does nothing when the table is already
    # there, so an older database would keep its 6-column layout. Check the
    # actual columns (name is field 1 of each table_info row) and migrate.
    existing_columns = {row[1] for row in cursor.execute("PRAGMA table_info(articles)")}
    if 'external_link' not in existing_columns:
        cursor.execute("ALTER TABLE articles ADD COLUMN external_link TEXT")

    conn.commit()
    print(f"Database '{db_name}' is ready with the 'external_link' column.")
    return conn

def save_to_db(conn, article_data):
    """Insert one article row into the 'articles' table.

    Args:
        conn: Open sqlite3 connection whose database has the 'articles' table.
        article_data: Mapping with the keys 'id', 'url', 'publish_date',
            'title', 'body', 'quotes' and 'external_link'.

    Returns:
        True when a new row was written; False when the insert was ignored
        (conflicting row already present) or a database error occurred.
    """
    cursor = conn.cursor()

    # The quotes value is serialised to a JSON string so it fits a TEXT column.
    serialized_quotes = json.dumps(article_data['quotes'])

    plain_fields = ('id', 'url', 'publish_date', 'title', 'body')
    row = (
        *(article_data[field] for field in plain_fields),
        serialized_quotes,
        article_data['external_link'],
    )

    try:
        # OR IGNORE turns a conflicting insert into a no-op instead of an error.
        cursor.execute('''
        INSERT OR IGNORE INTO articles (id, url, publish_date, title, body, quotes, external_link)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', row)
        conn.commit()
    except sqlite3.Error as db_error:
        print(f"Database error: {db_error}")
        return False

    # rowcount is 0 when the insert was ignored.
    return cursor.rowcount > 0

# --- Scraping and Parsing Functions ---

def parse_article(article_html):
    """Extract the stored fields from a single article element.

    Args:
        article_html: The element for one post. It must offer ``get`` for
            attributes and ``find`` for descendants (the caller in this file
            passes the items returned by BeautifulSoup's ``find_all``).

    Returns:
        A dict with the keys 'id', 'url', 'publish_date' (YYYY-MM-DD),
        'title', 'body', 'quotes' (list of blockquote texts) and
        'external_link' (href or None). Returns None when the article has no
        usable id, no permalink URL, or cannot be parsed; parse failures are
        reported on stdout rather than raised.
    """
    try:
        # The id attribute looks like "post-64033"; the number after the dash
        # is the post id. A missing attribute falls back to 0 and is skipped;
        # a value without a dash raises IndexError, handled below.
        post_id = int(article_html.get('id', 'post-0').split('-')[1])
        if not post_id:
            return None

        # Normalize the displayed date to YYYY-MM-DD.
        date_str = article_html.find('div', class_='post-date').get_text(strip=True)
        publish_date = parse(date_str).strftime('%Y-%m-%d')

        title = article_html.find('h2', class_='title').get_text(strip=True)

        # .get() instead of ['href'] so an anchor without href is skipped
        # like a missing anchor, rather than raising KeyError.
        permalink_tag = article_html.find('a', id=f'permalink-{post_id}')
        url = permalink_tag.get('href') if permalink_tag else None
        if not url:
            return None  # Skip if we can't find a URL

        body_text = ""
        quotes = []
        external_link = None

        # Body text, quotes and the external link all live in the post body,
        # so look it up once.
        post_body_div = article_html.find('div', class_='post-body')
        if post_body_div:
            body_text = post_body_div.get_text(separator=' ', strip=True)
            quotes = [
                quote.get_text(separator=' ', strip=True)
                for quote in post_body_div.find_all('blockquote')
            ]
            # The external link is the first <a> inside <span class="newslink">.
            newslink_span = post_body_div.find('span', class_='newslink')
            link_tag = newslink_span.find('a') if newslink_span else None
            if link_tag:
                external_link = link_tag.get('href')

        return {
            'id': post_id,
            'url': url,
            'publish_date': publish_date,
            'title': title,
            'body': body_text,
            'quotes': quotes,
            'external_link': external_link
        }

    except (AttributeError, IndexError, ValueError, OverflowError) as e:
        # AttributeError: an expected element is missing (find() gave None).
        # IndexError/ValueError: malformed id attribute or unparseable date.
        # OverflowError: the date parser met a number too large to convert.
        print(f"Skipping an article due to parsing error: {e}")
        return None

def scrape_and_save_month(year, month, conn):
    """Fetch one monthly archive page and store every article parsed from it.

    Progress and failures are reported on stdout; nothing is returned. A page
    that cannot be fetched, or that contains no articles, is skipped.

    Args:
        year: Archive year used in the URL.
        month: Archive month; formatted as two digits in the URL.
        conn: Open database connection handed on to save_to_db.
    """
    archive_url = f"https://www.fightaging.org/archives/{year}/{month:02d}/"
    print(f"Scraping page: {archive_url}")

    request_headers = {'User-Agent': 'Friendly Scraper for ML Project'}

    try:
        page = requests.get(archive_url, headers=request_headers, timeout=15)
        # Turn 4xx/5xx responses into exceptions so they are reported below.
        page.raise_for_status()
    except requests.exceptions.RequestException as fetch_error:
        print(f"Could not fetch page {archive_url}. Error: {fetch_error}")
        return

    # Each post on the archive page is an <article class="post"> element.
    document = BeautifulSoup(page.content, 'html.parser')
    post_nodes = document.find_all('article', class_='post')

    if not post_nodes:
        print(f"No articles found for {year}-{month:02d}.")
        return

    print(f"Found {len(post_nodes)} articles. Parsing and saving...")

    saved_count = 0
    for node in tqdm(post_nodes, desc="Processing Articles"):
        record = parse_article(node)
        # Count only articles that parsed and were actually inserted.
        if record and save_to_db(conn, record):
            saved_count += 1

    print(f"Successfully saved {saved_count} new articles to the database.")


In [9]:
# --- Main Execution ---

# if __name__ == "__main__":
#     DB_FILE = 'fightaging_articles.db'
#     connection = setup_database(DB_FILE)

#     # --- Start with a single month: September 2025 ---
#     target_year = 2025
#     target_month = 9
#     scrape_and_save_month(target_year, target_month, connection)
    
#     connection.close()
#     print("\nScraping process finished.")

In [10]:
# Download all

# --- Main Execution ---

if __name__ == "__main__":
    DB_FILE = 'fightaging_articles.db'
    connection = setup_database(DB_FILE)

    # To scrape a single month instead of the whole catalog, call e.g.
    #     scrape_and_save_month(2025, 9, connection)

    # --- Scrape the entire catalog (Nov 2002 to Sep 2025) ---
    try:
        print("Starting full catalog scrape...")
        for year in range(2002, 2026):
            # The archive ends in September 2025
            end_month = 9 if year == 2025 else 12
            # The archive starts in November 2002
            start_month = 11 if year == 2002 else 1

            for month in range(start_month, end_month + 1):
                scrape_and_save_month(year, month, connection)
                # A friendly pause to avoid overwhelming the server
                time.sleep(1)
    finally:
        # The full run is long; close the connection even when it is
        # interrupted (e.g. KeyboardInterrupt) or fails part-way through.
        connection.close()

    print("\nScraping process finished.")

Database 'fightaging_articles.db' is ready with the 'external_link' column.
Starting full catalog scrape...
Scraping page: https://www.fightaging.org/archives/2002/11/
Found 24 articles. Parsing and saving...


Processing Articles: 100%|██████████| 24/24 [00:03<00:00,  7.62it/s]


Successfully saved 24 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2002/12/
Found 24 articles. Parsing and saving...


Processing Articles: 100%|██████████| 24/24 [00:02<00:00,  9.45it/s]


Successfully saved 24 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/01/
Found 41 articles. Parsing and saving...


Processing Articles: 100%|██████████| 41/41 [00:05<00:00,  7.95it/s]


Successfully saved 41 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/02/
Found 40 articles. Parsing and saving...


Processing Articles: 100%|██████████| 40/40 [00:04<00:00,  8.52it/s]


Successfully saved 40 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/03/
Found 47 articles. Parsing and saving...


Processing Articles: 100%|██████████| 47/47 [00:05<00:00,  8.67it/s]


Successfully saved 47 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/04/
Found 56 articles. Parsing and saving...


Processing Articles: 100%|██████████| 56/56 [00:08<00:00,  6.92it/s]


Successfully saved 56 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/05/
Found 55 articles. Parsing and saving...


Processing Articles: 100%|██████████| 55/55 [00:07<00:00,  7.33it/s]


Successfully saved 55 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/06/
Found 48 articles. Parsing and saving...


Processing Articles: 100%|██████████| 48/48 [00:06<00:00,  7.74it/s]


Successfully saved 48 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/07/
Found 51 articles. Parsing and saving...


Processing Articles: 100%|██████████| 51/51 [00:06<00:00,  7.84it/s]


Successfully saved 51 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/08/
Found 52 articles. Parsing and saving...


Processing Articles: 100%|██████████| 52/52 [00:07<00:00,  6.92it/s]


Successfully saved 52 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/09/
Found 61 articles. Parsing and saving...


Processing Articles: 100%|██████████| 61/61 [00:09<00:00,  6.69it/s]


Successfully saved 61 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/10/
Found 63 articles. Parsing and saving...


Processing Articles: 100%|██████████| 63/63 [00:09<00:00,  6.44it/s]


Successfully saved 62 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/11/
Found 63 articles. Parsing and saving...


Processing Articles: 100%|██████████| 63/63 [00:08<00:00,  7.20it/s]


Successfully saved 63 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2003/12/
Found 55 articles. Parsing and saving...


Processing Articles: 100%|██████████| 55/55 [00:07<00:00,  7.78it/s]


Successfully saved 55 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/01/
Found 59 articles. Parsing and saving...


Processing Articles: 100%|██████████| 59/59 [00:08<00:00,  6.72it/s]


Successfully saved 59 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/02/
Found 92 articles. Parsing and saving...


Processing Articles: 100%|██████████| 92/92 [00:12<00:00,  7.23it/s]


Successfully saved 92 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/03/
Found 100 articles. Parsing and saving...


Processing Articles: 100%|██████████| 100/100 [00:13<00:00,  7.30it/s]


Successfully saved 98 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/04/
Found 100 articles. Parsing and saving...


Processing Articles: 100%|██████████| 100/100 [00:13<00:00,  7.27it/s]


Successfully saved 98 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/05/
Found 96 articles. Parsing and saving...


Processing Articles: 100%|██████████| 96/96 [00:12<00:00,  7.99it/s]


Successfully saved 96 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/06/
Found 95 articles. Parsing and saving...


Processing Articles: 100%|██████████| 95/95 [00:10<00:00,  9.01it/s]


Successfully saved 95 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/07/
Found 95 articles. Parsing and saving...


Processing Articles: 100%|██████████| 95/95 [00:10<00:00,  8.86it/s]


Successfully saved 95 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/08/
Found 92 articles. Parsing and saving...


Processing Articles: 100%|██████████| 92/92 [00:10<00:00,  9.04it/s]


Successfully saved 90 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/09/
Found 88 articles. Parsing and saving...


Processing Articles: 100%|██████████| 88/88 [00:09<00:00,  9.26it/s]


Successfully saved 87 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/10/
Found 91 articles. Parsing and saving...


Processing Articles: 100%|██████████| 91/91 [00:09<00:00,  9.37it/s]


Successfully saved 91 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/11/
Found 93 articles. Parsing and saving...


Processing Articles: 100%|██████████| 93/93 [00:10<00:00,  8.59it/s]


Successfully saved 93 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2004/12/
Found 96 articles. Parsing and saving...


Processing Articles: 100%|██████████| 96/96 [00:09<00:00,  9.64it/s]


Successfully saved 96 new articles to the database.
Scraping page: https://www.fightaging.org/archives/2005/01/
Found 93 articles. Parsing and saving...


Processing Articles:  13%|█▎        | 12/93 [00:01<00:08,  9.67it/s]


KeyboardInterrupt: 