In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
FED_BASE_URL = "https://www.federalreserve.gov"
FED_SPEECHES_URL = "https://www.federalreserve.gov/newsevents/speeches.htm"

In [3]:
print("Instantiating Chrome web driver")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

Instantiating Chrome web driver


[WDM] - Downloading: 100%|██████████| 7.72M/7.72M [00:00<00:00, 14.4MB/s]


In [4]:
def get_fed_speeches(start_date, end_date):
    START_DATE_XPATH = "//*[@id=\"content\"]/div[2]/div/div[1]/form/div[2]/div/div[1]/input"
    END_DATE_XPATH = "//*[@id=\"content\"]/div[2]/div/div[1]/form/div[2]/div/div[2]/input"
    SUBMIT_BUTTON_XPATH = "//*[@id=\"content\"]/div[2]/div/div[1]/form/div[5]/a"
    driver.get(FED_SPEECHES_URL)
    start_date_input_box = driver.find_element("xpath", START_DATE_XPATH)
    end_date_input_box = driver.find_element("xpath", END_DATE_XPATH)
    start_date_input_box.clear()
    start_date_input_box.send_keys(start_date)
    end_date_input_box.clear()
    end_date_input_box.send_keys(end_date)
    submit_button = driver.find_element("xpath", SUBMIT_BUTTON_XPATH)
    submit_button.click()
    return

In [5]:
def get_next_page():
    NEXT_BUTTON_TEXT = "Next"

    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, NEXT_BUTTON_TEXT))
        )
    except TimeoutException:
        return False

    next_button = driver.find_element("link text", NEXT_BUTTON_TEXT)

    try:
        next_button.click()
    except ElementClickInterceptedException:
        return False

    return True

In [6]:
def get_fed_speech_links(all_speeches):
    soup = BeautifulSoup(all_speeches, 'html.parser')
    fed_page_links = soup.find_all("a", href=True)
    fed_speech_transcript_links = [link['href'] for \
            link in fed_page_links if "/speech/" in link['href']]
    return fed_speech_transcript_links

In [7]:
import re

def get_fed_speech_transcript_links(fed_speech_links):
    DATE_LENGTH = 8
    fed_speech_transcript_links = []
    for link in fed_speech_links:
        date = re.findall('[0-9]+', link)
        if len(date) != 0:
            date = date[0]
        if len(date) == DATE_LENGTH:
            fed_speech_transcript_links.append(link)
    return fed_speech_transcript_links

In [8]:
def filter_duplicate_links(fed_speech_links):
    uniq_links = set(fed_speech_links)
    return list(uniq_links)

In [9]:
def get_fed_speeches_body(fed_speech_links):
    all_fed_speeches = []
    for link in fed_speech_links:
        all_fed_speeches.append(requests.get(FED_BASE_URL + link).text)
    return all_fed_speeches

In [10]:
def get_speech(fed_speech_links, fed_speeches_source):
    fed_speech_content = {}
    num_speeches = len(fed_speech_links)
    for speech_idx in range(num_speeches):
        date = re.findall('[0-9]+', fed_speech_links[speech_idx])[0]
        soup = BeautifulSoup(fed_speeches_source[speech_idx], "html.parser")
        speech_text = ""
        for paragraph in soup.find_all('p'):
            speech_text += paragraph.get_text()
        fed_speech_content[fed_speech_links[speech_idx]] = (date, speech_text)
    return fed_speech_content

In [12]:
def tokenize_corpus(filing_corpus):
    filing_tokenizer = RegexpTokenizer(r'\w+')
    return filing_tokenizer.tokenize(filing_corpus)

# a function to remove the stop words from corpus
def filter_out_stopwords(tokenized_corpus):
    stop_words = set(stopwords.words('english'))
    filing_corpus_filtered = [word for word in tokenized_corpus if not \
        word.lower() in stop_words]
    return filing_corpus_filtered


# a function that filters out numbers from corpus
def filter_out_numbers(tokenized_corpus):
    return [token for token in tokenized_corpus if not (token.isdigit()
                                         or token[0] == '-' and token[1:].isdigit())]

In [13]:
def tokenize_fed_speeches(fed_speeches_dict):
    tokenized_speeches = {}
    DATE_IDX = 0
    CONTENT_IDX = 1
    for link, speech in fed_speeches_dict.items():
        tokenized_corpus = tokenize_corpus(speech[CONTENT_IDX])
        tokenized_corpus = filter_out_stopwords(tokenized_corpus)
        tokenized_corpus = filter_out_numbers(tokenized_corpus)
        tokenized_speeches[link] = (speech[DATE_IDX], tokenized_corpus)
    return tokenized_speeches

In [37]:
SPEECHES_DIR = "fed_speeches/"

def write_to_disc(file_names, file_content):
    for file_idx in range(len(file_names)):
        file_name = file_names[file_idx].replace('/', '_')
        file_name = file_name.replace(".htm", "")
        file_name = file_name.replace("_newsevents_speech_", "")
        with open(SPEECHES_DIR + file_name + ".txt", 'w') as f:
            f.write(file_content[file_idx])
            f.close()

In [14]:
print("Retrieving all links from the fed speeches web page ")

START_DATE = "06/01/2022"
END_DATE = "10/28/2022"

fed_page_links = []
get_fed_speeches(START_DATE, END_DATE)
fed_page_links.extend(get_fed_speech_links(driver.page_source))
while get_next_page():
    fed_page_links.extend(get_fed_speech_links(driver.page_source))

Retrieving all links from the fed speeches web page 


In [15]:
print("Retrieving all speech transcripts from the fed web page ")
fed_page_transcript_links = get_fed_speech_transcript_links(fed_page_links)

Retrieving all speech transcripts from the fed web page 


In [16]:
print("Filtering out duplicate links ")
speech_links = filter_duplicate_links(fed_page_transcript_links)

Filtering out duplicate links 


In [23]:
print("Getting the body of all fed speeches")
fed_speeches_body = get_fed_speeches_body(speech_links)

Getting the body of all fed speeches


In [38]:
print("Writing crawled speeches to disk")
write_to_disc(speech_links, fed_speeches_body)

Writing crawled speeches to disk


In [18]:
print("Parsing the speech body for each speech")
speeches_text = get_speech(speech_links, fed_speeches_body)

Parsing the speech body for each speech


In [19]:
print("Tokenizing the body of each speech")
tokenized_speeches = tokenize_fed_speeches(speeches_text)

Tokenizing the body of each speech


In [21]:
print(len(tokenized_speeches))
print(tokenized_speeches)

27
{'/newsevents/speech/powell20220617a.htm': ('20220617', ['Federal', 'Reserve', 'central', 'bank', 'United', 'States', 'provides', 'nation', 'safe', 'flexible', 'stable', 'monetary', 'financial', 'system', 'Federal', 'Open', 'Market', 'Committee', 'Monetary', 'Policy', 'Principles', 'Practice', 'Policy', 'Implementation', 'Reports', 'Review', 'Monetary', 'Policy', 'Strategy', 'Tools', 'Communications', 'Institution', 'Supervision', 'Reports', 'Reporting', 'Forms', 'Supervision', 'Regulation', 'Letters', 'Banking', 'Applications', 'Legal', 'Developments', 'Regulatory', 'Resources', 'Banking', 'Data', 'Structure', 'Financial', 'Stability', 'Assessments', 'Financial', 'Stability', 'Coordination', 'Actions', 'Reports', 'Regulations', 'Statutes', 'Payment', 'Policies', 'Reserve', 'Bank', 'Payment', 'Services', 'Data', 'Financial', 'Market', 'Utilities', 'Infrastructures', 'Research', 'Committees', 'Forums', 'Working', 'Papers', 'Notes', 'Data', 'Models', 'Tools', 'Bank', 'Assets', 'Liabil