
# Fed Speeches Scraping and Analysis

This notebook is an improved and modularized version of the Fed speeches scraping script.
It includes enhanced handling for missing data, optimized DataFrame operations, and modular functions for better readability and maintainability.


In [167]:
import logging
import re
import sqlite3
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [168]:
def setup_database(db_path='fed_speeches.db'):
    """Open the speeches database and make sure its schema exists.

    Creates the ``speeches`` table and its indexes if they are missing, so the
    function is safe to call on every run.

    Args:
        db_path: Location of the SQLite database. Defaults to
            ``'fed_speeches.db'`` in the working directory; pass ``':memory:'``
            for a throwaway in-memory database.

    Returns:
        An open ``sqlite3.Connection``. The caller is responsible for closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The connection is
            closed before the error is re-raised so it does not leak.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()

        c.execute('''
            CREATE TABLE IF NOT EXISTS speeches (
                id INTEGER PRIMARY KEY,
                speech_date DATE,
                title TEXT,
                speaker TEXT,
                event TEXT,
                link TEXT UNIQUE,
                speech_text TEXT,
                scrape_date DATE,
                processed BOOLEAN DEFAULT FALSE
            )
        ''')

        # Indexes on the columns the rest of the notebook filters on.
        c.execute('CREATE INDEX IF NOT EXISTS idx_speech_date ON speeches(speech_date)')
        c.execute('CREATE INDEX IF NOT EXISTS idx_speaker ON speeches(speaker)')
        c.execute('CREATE INDEX IF NOT EXISTS idx_processed ON speeches(processed)')

        conn.commit()
    except Exception:
        # Do not hand back (or leak) a half-initialised connection.
        conn.close()
        raise
    return conn

In [169]:
def parse_date(text, year):
    """Extract a speech date from a link as an ISO ``YYYY-MM-DD`` string.

    The first run of eight digits in ``text`` is read as ``YYYYMMDD``. If there
    is no such run, or the digits do not form a valid calendar date, January 1st
    of ``year`` is used instead.

    Args:
        text: The string to search (the speech link).
        year: Integer year used to build the fallback date.

    Returns:
        The date as a ``'YYYY-MM-DD'`` string. The type is the same on every
        path, so callers never receive a mix of ``str`` and ``date`` objects.
    """
    match = re.search(r'\d{8}', text)
    if match:
        speech_date_raw = match.group(0)
        try:
            return datetime.strptime(speech_date_raw, '%Y%m%d').strftime('%Y-%m-%d')
        except ValueError:
            # Eight digits that are not a real date (e.g. an ID number):
            # fall through to the default instead of raising.
            print(f"Invalid date '{speech_date_raw}' found in the link.")
    else:
        print("No date found in the link.")

    # Built lazily so an unusable `year` only matters when the fallback is needed.
    return datetime(year, 1, 1).date().isoformat()


In [170]:
def save_speech(conn, speech_data):
    """Insert one speech's metadata row, ignoring links that already exist.

    Args:
        conn: Open SQLite connection holding the ``speeches`` table.
        speech_data: Mapping with the keys ``speech_date``, ``title``,
            ``speaker``, ``event`` and ``link``.

    The row is stamped with today's date as ``scrape_date``. Database errors
    are logged and rolled back rather than raised; a missing key in
    ``speech_data`` still raises ``KeyError``.
    """
    cursor = conn.cursor()
    metadata_fields = ('speech_date', 'title', 'speaker', 'event', 'link')
    try:
        row_values = [speech_data[field] for field in metadata_fields]
        row_values.append(datetime.now().strftime("%Y-%m-%d"))
        cursor.execute('''
            INSERT OR IGNORE INTO speeches
            (speech_date, title, speaker, event, link, scrape_date)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', tuple(row_values))
        conn.commit()
    except sqlite3.Error as e:
        logging.error(f"Database error: {str(e)}")
        conn.rollback()

In [171]:
def scrape_speeches(year, url_format, conn):
    """Scrape one year's speech listing page and save each speech's metadata.

    Args:
        year: Integer year to scrape. Years before 2006 are parsed with the
            ``.title`` / ``.speaker`` / ``.location`` selectors; later years
            with the ``.eventlist__event`` selector.
        url_format: Listing-page URL template containing a ``{year}`` field.
        conn: Open SQLite connection passed through to ``save_speech``.

    Nothing is returned. A failure on a single entry is printed and that entry
    is skipped; a failure fetching the page is printed and the year is skipped.
    The function pauses for one second before returning, whether or not the
    fetch succeeded, so consecutive calls stay rate limited.
    """
    url = url_format.format(year=year)

    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()  # Raise for 4xx/5xx instead of parsing an error page
        page.encoding = page.apparent_encoding
        print(f"Successfully fetched page for year {year}")

        soup = BeautifulSoup(page.text, 'html.parser')

        if year < 2006:
            titles = soup.select(".title")
            speakers = soup.select(".speaker")
            locations = soup.select(".location")

            print(f"Found {len(titles)} speeches for year {year}")

            # The three lists are matched up by position; zip stops at the
            # shortest one, like the min() of the three lengths.
            for i, (title_tag, speaker_tag, location_tag) in enumerate(zip(titles, speakers, locations)):
                try:
                    title_text = title_tag.text.strip()
                    # urljoin resolves root-relative hrefs against the site and
                    # leaves already-absolute hrefs untouched.
                    link = urljoin(url, title_tag.find('a', href=True)['href'])
                    speech_date = parse_date(link, year)

                    speech_data = {
                        "speech_date": speech_date,
                        "link": link,
                        "title": title_text,
                        "speaker": speaker_tag.text.strip(),
                        "event": location_tag.text.strip()
                    }
                    save_speech(conn, speech_data)
                except Exception as e:
                    print(f"Error processing item {i} for year {year}: {str(e)}")
        else:
            events = soup.select(".eventlist__event")
            print(f"Found {len(events)} speeches for year {year}")

            for event in events:
                try:
                    link = urljoin(url, event.find('a', href=True)['href'])
                    # Non-empty lines of the entry: title first, then an
                    # optional 'Watch Live' / 'Video' label, then speaker and event.
                    text_parts = [part.strip() for part in event.text.split('\n') if part.strip()]
                    speech_date = parse_date(link, year)
                    title = text_parts[0]
                    speaker_idx = 1 if text_parts[1] not in ['Watch Live', 'Video'] else 2
                    speech_data = {
                        "speech_date": speech_date,
                        "link": link,
                        "title": title,
                        "speaker": text_parts[speaker_idx],
                        "event": text_parts[speaker_idx + 1]
                    }
                    save_speech(conn, speech_data)
                    print(f"Saved speech: {title[:50]}...")
                except Exception as e:
                    print(f"Error processing event: {str(e)}")
    except Exception as e:
        print(f"Error fetching year {year}: {str(e)}")
    finally:
        # Rate limiting applies after failed requests too.
        time.sleep(1)

In [172]:
def _earliest_keyword_index(text, keywords, start=0):
    """Return the lowest index at or after ``start`` where any keyword occurs, or -1."""
    hits = [idx for idx in (text.find(keyword, start) for keyword in keywords) if idx != -1]
    return min(hits) if hits else -1


def process_speech_text(conn):
    """Download and store the body text of every speech not yet processed.

    For each row with ``processed = FALSE`` the speech page is fetched, the
    body text is cut out, and the row is updated with ``speech_text`` and
    ``processed = TRUE``.

    * Speeches from 2006 on: text of ``#article > div:nth-child(3)``, cut at
      the earliest end keyword if one is present.
    * Earlier speeches: text after the second occurrence of the year in the
      page, up to the earliest end keyword; the speech is skipped if either
      marker is missing.

    Args:
        conn: Open SQLite connection holding the ``speeches`` table.

    Nothing is returned. Failures on a single speech (including HTTP error
    responses) are printed, rolled back, and leave the row unprocessed so a
    later run can retry it. Every attempted request is followed by a
    two-second pause.
    """
    c = conn.cursor()
    # Single-quoted '%Y': in SQL, double quotes denote identifiers, and SQLite
    # only accepts them as strings through a legacy compatibility quirk.
    c.execute(
        "SELECT id, link, strftime('%Y', speech_date) AS year "
        "FROM speeches WHERE processed = FALSE"
    )
    unprocessed = c.fetchall()

    print(f"Found {len(unprocessed)} unprocessed speeches")

    for speech_id, link, year in unprocessed:
        if year is None:
            # strftime yields NULL for a missing/unparseable speech_date; the
            # year drives the extraction below, so skip before any request.
            print(f"No usable speech_date for speech ID {speech_id}; skipping")
            continue

        try:
            print(f"Processing speech ID {speech_id} for year {year}")
            page = requests.get(link, timeout=30)
            # Never store an HTTP error page as speech text.
            page.raise_for_status()
            page.encoding = page.apparent_encoding  # Set the encoding explicitly
            soup = BeautifulSoup(page.text, 'html.parser')

            start_keyword = year
            end_keywords = [year + " Speeches", "Footnotes", "References", "Endnotes"]

            if int(year) >= 2006:
                content_div = soup.select_one('#article > div:nth-child(3)')
                if not content_div:
                    print(f"Content not found for speech ID {speech_id}")
                    continue
                relevant_text = content_div.get_text()
                end_idx = _earliest_keyword_index(relevant_text, end_keywords)
                if end_idx != -1:
                    relevant_text = relevant_text[:end_idx]
                relevant_text = relevant_text.strip()
            else:
                full_text = soup.get_text()
                first_idx = full_text.find(start_keyword)
                if first_idx == -1:
                    print(f"Start keyword '{start_keyword}' not found for speech ID {speech_id}")
                    continue
                second_idx = full_text.find(start_keyword, first_idx + len(start_keyword))
                if second_idx == -1:
                    print(f"Second occurrence of '{start_keyword}' not found for speech ID {speech_id}")
                    continue
                start_idx = second_idx + len(start_keyword)
                # Cut at whichever end keyword appears first in the text, as
                # the 2006+ branch does, rather than the first one in list order.
                end_idx = _earliest_keyword_index(full_text, end_keywords, start_idx)
                if end_idx == -1:
                    print(f"End keywords not found for speech ID {speech_id}")
                    continue
                relevant_text = full_text[start_idx:end_idx].strip()

            # Collapse blank-line runs and drop navigation boilerplate.
            relevant_text = re.sub(r'\n+', '\n', relevant_text)
            relevant_text = relevant_text.replace("Return to top", "").replace("Watch live", "").replace("Return to text", "")

            c.execute('''
                UPDATE speeches
                SET speech_text = ?, processed = TRUE
                WHERE id = ?
            ''', (relevant_text, speech_id))
            conn.commit()

            print(f"Successfully processed speech ID {speech_id}")

        except Exception as e:
            print(f"Error processing speech {speech_id}: {str(e)}")
            conn.rollback()
        finally:
            # Rate limiting after every attempted request, including the
            # skipped and failed ones.
            time.sleep(2)


In [173]:
def main():
    """Run the whole pipeline: set up logging and the database, scrape the
    yearly listing pages from 1996 through the current year, then fetch the
    text of every unprocessed speech.

    Any exception raised while scraping or processing is printed rather than
    propagated, and the database connection is always closed.
    """
    # One timestamped log file per run.
    log_file = f'fed_scraper_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
    logging.basicConfig(
        filename=log_file,
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )

    print("Starting the Federal Reserve speech scraper...")

    conn = setup_database()
    print("Database setup complete")

    try:
        first_year = 1996
        last_year = datetime.now().year

        # The listing-page URL pattern differs for years up to 2010 and after.
        legacy_listing_url = "https://www.federalreserve.gov/newsevents/speech/{year}speech.htm"
        current_listing_url = "https://www.federalreserve.gov/newsevents/speech/{year}-speeches.htm"

        print(f"Starting scraping from {first_year} to {last_year}")

        for year in range(first_year, last_year + 1):
            print(f"\nScraping year {year}")
            if year > 2010:
                listing_url = current_listing_url
            else:
                listing_url = legacy_listing_url
            scrape_speeches(year, listing_url, conn)

        # Second pass: download the text of each saved speech.
        print("\nStarting to process speech texts...")
        process_speech_text(conn)

        print("Scraping and processing complete!")

    except Exception as e:
        print(f"Fatal error in main execution: {str(e)}")
    finally:
        conn.close()

if __name__ == "__main__":
    main()

Starting the Federal Reserve speech scraper...
Database setup complete
Starting scraping from 1996 to 2025

Scraping year 1996
Successfully fetched page for year 1996
Found 19 speeches for year 1996

Scraping year 1997
Successfully fetched page for year 1997
Found 45 speeches for year 1997

Scraping year 1998
Successfully fetched page for year 1998
Found 57 speeches for year 1998

Scraping year 1999
Successfully fetched page for year 1999
Found 68 speeches for year 1999

Scraping year 2000
Successfully fetched page for year 2000
Found 62 speeches for year 2000

Scraping year 2001
Successfully fetched page for year 2001
Found 58 speeches for year 2001

Scraping year 2002
Successfully fetched page for year 2002
Found 76 speeches for year 2002

Scraping year 2003
Successfully fetched page for year 2003
Found 71 speeches for year 2003

Scraping year 2004
Successfully fetched page for year 2004
Found 102 speeches for year 2004

Scraping year 2005
Successfully fetched page for year 2005
Foun

Process documentation:
There are two different URL formats for the yearly listing pages: 1996-2010 and 2011-today.
There are also two different layouts for those listing pages, 1996-2005 and 2006-today, and the
individual speech pages come in two different layouts as well.