# Data Collection Process for FOMC Statements
## Danyan Zhang, Xu Yan

## Imports

In [1]:
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import re
from IPython.display import Image, display

# Scrape the text of the FOMC statements from January 2000 to Present

We scrape FOMC statements published from 2019 to the present from [url1](https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm) listed below, and historical FOMC statements published prior to 2019 from [url2](https://www.federalreserve.gov/monetarypolicy/fomc_historical_year.htm).

In [16]:
# url1: FOMC calendars page, used for statements from 2019 to the present.
url1 = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"
# url2: index of historical yearly pages, used for statements prior to 2019.
url2 = "https://www.federalreserve.gov/monetarypolicy/fomc_historical_year.htm"

- On the Federal Reserve's monetary policy website (url1 and url2), there are many URLs that link to content other than the FOMC statements, such as implementation notes and press conference videos. Therefore, it is essential to identify which URLs link to the HTML-format FOMC statements. A quick way to obtain these URLs is to work out the pattern of the statement URLs and use this pattern to find all matching URLs.

- We determined the patterns manually. During the process, we found that the URL patterns for statements vary by year. Therefore, we need to use different patterns for different years to extract the URLs of the statements.
For example, for statements from 2019 to present (using url1), the URL pattern is consistent:
 
    - `https://www.federalreserve.gov/newsevents/pressreleases/monetary\d{8}a\.htm`
    where the 8 digits represent the date in the format YYYYMMDD.

- Additionally, we aimed to capture the main text of each FOMC statement and remove extraneous parts from the start and end. To do so, we identified a different template for each time period. For example, the main text of FOMC statements from 2019 to present typically begins after "For release at" and ends before "Last Update".

## 2019 - Present

In [17]:
# Scrape every FOMC statement linked from the calendar page (url1) into
# text_df: one row per statement, with 'Date' (the 8 digits taken from the
# statement URL) and 'Text' (the extracted main text).
response = requests.get(url1, timeout=30)  # timeout so a stalled connection cannot hang the cell
statement_rows = []  # plain dicts; the DataFrame is built once after the loop

if response.status_code == 200:
    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # define a pattern for the URL format of the FOMC statement
    pattern = re.compile(r'/newsevents/pressreleases/monetary\d{8}a\.htm')

    # find all links that match the pattern
    matching_links = soup.find_all('a', href=pattern)

    for link in matching_links:
        url = link['href']

        # extract date from the url; skip the link instead of silently
        # reusing the date left over from the previous iteration
        date_match = re.search(r'/monetary(\d{8})a\.htm', url)
        if not date_match:
            continue
        date = date_match.group(1)

        full_url = f"https://www.federalreserve.gov{url}"
        linked_response = requests.get(full_url, timeout=30)
        if linked_response.status_code != 200:
            print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
            continue

        linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

        # extract the main text: take the span between "For release at" and
        # "Last Update:", then keep what follows the first "Share" in it
        match = re.search(r"For release at(.*?)Last Update:", linked_text, re.DOTALL)
        if not match:
            continue
        match2 = re.search(r"Share(.*?)$", match.group(1), re.DOTALL)
        if not match2:
            continue

        print(f"URL: {full_url}")
        statement_rows.append({'Date': date, 'Text': match2.group(1).strip()})
        print("=" * 50)
else:
    print("Failed to retrieve the web page. Status code:", response.status_code)

# Build the frame in one step; an empty result still has both columns.
text_df = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20240131a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20240320a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20240501a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20240612a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20230201a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20230322a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20230503a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20230614a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20230726a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20230920a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20231101a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20231213a.htm
URL:

## 2000 - 2018

The statements published prior to 2019 are included in url2.

We need to first extract all the URLs for each year separately. 

In [21]:
# Download the historical index page (url2) and collect the absolute URL of
# each per-year page into full_urls, in the order the links appear on the page.
response = requests.get(url2, timeout=30)  # timeout so a stalled connection cannot hang the cell
if response.status_code != 200:
    # Stop here: carrying on would either raise a NameError or silently parse
    # an html_content value left over from an earlier cell.
    raise RuntimeError(f"Failed to retrieve the web page. Status code: {response.status_code}")
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')
links = soup.find_all('a', href=True)

# define a pattern for the URL format of each year's website
pattern = r'/monetarypolicy/fomchistorical\d{4}\.htm'

# find all links that match the pattern
matching_links2 = [link['href'] for link in links if re.search(pattern, link['href'])]

# the hrefs are site-relative, so prefix the host to get absolute URLs
full_urls = [f"https://www.federalreserve.gov{link}" for link in matching_links2]

In [24]:
# Show the first 19 entries of full_urls.
full_urls[:19]

['https://www.federalreserve.gov/monetarypolicy/fomchistorical2018.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2017.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2016.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2015.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2014.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2013.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2012.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2011.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2010.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2009.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2008.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2007.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomchistorical2006.htm',
 'https://www.federalreserve.gov/monetarypolicy/fom

## 2016 - 2018

In [29]:
# Scrape the FOMC statements for 2016 - 2018 into text_df2: one row per
# statement, with 'Date' (the 8 digits taken from the statement URL) and
# 'Text' (the extracted main text).
first_year, last_year = 2016, 2018
statement_rows = []  # plain dicts; the DataFrame is built once after the loops

# Pick the yearly pages by the year in their URL rather than by position in
# full_urls, so the selection stays right if the index page gains a year.
year_pages = []
for page_url in full_urls:
    year_match = re.search(r'fomchistorical(\d{4})\.htm', page_url)
    if year_match and first_year <= int(year_match.group(1)) <= last_year:
        year_pages.append(page_url)

# define a pattern for the URL format of the FOMC statement
pattern = re.compile(r'/newsevents/pressreleases/monetary\d{8}a\.htm')

for temp in year_pages:
    response = requests.get(temp, timeout=30)
    if response.status_code != 200:
        print(f"Skipped {temp}. Status code: {response.status_code}")
        continue

    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # find all links that match the pattern
    matching_links = soup.find_all('a', href=pattern)

    for link in matching_links:
        url = link['href']

        # extract date from the url; skip the link instead of silently
        # reusing the date left over from the previous iteration
        date_match = re.search(r'/monetary(\d{8})a\.htm', url)
        if not date_match:
            continue
        date = date_match.group(1)

        full_url = f"https://www.federalreserve.gov{url}"
        linked_response = requests.get(full_url, timeout=30)
        if linked_response.status_code != 200:
            print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
            continue

        linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

        # extract the main text: take the span between "For release at" and
        # "Last Update:", then keep what follows the first "Share" in it
        match = re.search(r"For release at(.*?)Last Update:", linked_text, re.DOTALL)
        if not match:
            continue
        match2 = re.search(r"Share(.*?)$", match.group(1), re.DOTALL)
        if not match2:
            continue

        print(f"URL: {full_url}")
        statement_rows.append({'Date': date, 'Text': match2.group(1).strip()})
        print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df2 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20180131a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20180321a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20180502a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20180613a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20180801a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20180926a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20181108a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20181219a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20170201a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20170315a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20170503a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20170614a.htm
URL:

## 2015 - 2011

In [30]:
# Scrape the FOMC statements for 2011 - 2015 into text_df3: one row per
# statement, with 'Date' (the 8 digits taken from the statement URL) and
# 'Text' (the extracted main text).
first_year, last_year = 2011, 2015
statement_rows = []  # plain dicts; the DataFrame is built once after the loops

# Pick the yearly pages by the year in their URL rather than by position in
# full_urls, so the selection stays right if the index page gains a year.
year_pages = []
for page_url in full_urls:
    year_match = re.search(r'fomchistorical(\d{4})\.htm', page_url)
    if year_match and first_year <= int(year_match.group(1)) <= last_year:
        year_pages.append(page_url)

# define a pattern for the URL format of the FOMC statement
pattern = re.compile(r'/newsevents/pressreleases/monetary\d{8}a\.htm')

for temp in year_pages:
    response = requests.get(temp, timeout=30)
    if response.status_code != 200:
        print(f"Skipped {temp}. Status code: {response.status_code}")
        continue

    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # find all links that match the pattern
    matching_links = soup.find_all('a', href=pattern)

    for link in matching_links:
        url = link['href']

        # extract date from the url; skip the link instead of silently
        # reusing the date left over from the previous iteration
        date_match = re.search(r'/monetary(\d{8})a\.htm', url)
        if not date_match:
            continue
        date = date_match.group(1)

        full_url = f"https://www.federalreserve.gov{url}"
        linked_response = requests.get(full_url, timeout=30)
        if linked_response.status_code != 200:
            print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
            continue

        linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

        # extract the main text: take the span between "For immediate release"
        # and "Last Update:", then keep what follows the first "Share" in it
        match = re.search(r"For immediate release(.*?)Last Update:", linked_text, re.DOTALL)
        if not match:
            continue
        match2 = re.search(r"Share(.*?)$", match.group(1), re.DOTALL)
        if not match2:
            continue

        print(f"URL: {full_url}")
        statement_rows.append({'Date': date, 'Text': match2.group(1).strip()})
        print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df3 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20150128a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20150318a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20150429a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20150617a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20150729a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20150917a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20151028a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20151216a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20140129a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20140319a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20140430a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20140618a.htm
URL:

## 2010 - 2006

In [32]:
# Scrape the FOMC statements for 2006 - 2010 into text_df4: one row per
# statement, with 'Date' (the 8 digits taken from the statement URL) and
# 'Text' (the extracted main text).
first_year, last_year = 2006, 2010
statement_rows = []  # plain dicts; the DataFrame is built once after the loops

# Pick the yearly pages by the year in their URL rather than by position in
# full_urls, so the selection stays right if the index page gains a year.
year_pages = []
for page_url in full_urls:
    year_match = re.search(r'fomchistorical(\d{4})\.htm', page_url)
    if year_match and first_year <= int(year_match.group(1)) <= last_year:
        year_pages.append(page_url)

# define a pattern for the URL format of the FOMC statement
pattern = re.compile(r'/newsevents/press/monetary/\d{8}a\.htm')

for temp in year_pages:
    response = requests.get(temp, timeout=30)
    if response.status_code != 200:
        print(f"Skipped {temp}. Status code: {response.status_code}")
        continue

    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # find all links that match the pattern
    matching_links = soup.find_all('a', href=pattern)

    for link in matching_links:
        url = link['href']

        # extract date from the url; skip the link instead of silently
        # reusing the date left over from the previous iteration
        date_match = re.search(r'/monetary/(\d{8})a\.htm', url)
        if not date_match:
            continue
        date = date_match.group(1)

        # Make a request to the linked page
        full_url = f"https://www.federalreserve.gov{url}"
        linked_response = requests.get(full_url, timeout=30)
        if linked_response.status_code != 200:
            print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
            continue

        linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

        # extract the main text: take the span between "For immediate release"
        # and "Last Update:", then keep what follows the first "Share" in it
        match = re.search(r"For immediate release(.*?)Last Update:", linked_text, re.DOTALL)
        if not match:
            continue
        match2 = re.search(r"Share(.*?)$", match.group(1), re.DOTALL)
        if not match2:
            continue

        print(f"URL: {full_url}")
        statement_rows.append({'Date': date, 'Text': match2.group(1).strip()})
        print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df4 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/newsevents/press/monetary/20100127a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20100316a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20100428a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20100623a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20100810a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20100921a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20101103a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20101214a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20090128a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20090318a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20090429a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20090624a.htm
URL: https://www.federalreserve.gov/newsevents/press/monetary/20090812a.htm
URL: https:/

## 2005 - 2003

In [34]:
# Scrape the FOMC statements for 2003 - 2005 into text_df5: one row per
# statement, with 'Date' (the first 8-digit run in the statement URL) and
# 'Text' (the extracted main text).
first_year, last_year = 2003, 2005
statement_rows = []  # plain dicts; the DataFrame is built once after the loops

# Pick the yearly pages by the year in their URL rather than by position in
# full_urls, so the selection stays right if the index page gains a year.
year_pages = []
for page_url in full_urls:
    year_match = re.search(r'fomchistorical(\d{4})\.htm', page_url)
    if year_match and first_year <= int(year_match.group(1)) <= last_year:
        year_pages.append(page_url)

# define a pattern for the URL format of the FOMC statement
# (the dot in "default.htm" is escaped so it only matches a literal ".")
pattern = re.compile(r'/boarddocs/press/monetary/\d{4}\/\d{8}\/default\.htm')

for temp in year_pages:
    response = requests.get(temp, timeout=30)
    if response.status_code != 200:
        print(f"Skipped {temp}. Status code: {response.status_code}")
        continue

    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # find all links that match the pattern
    matching_links = soup.find_all('a', href=pattern)

    for link in matching_links:
        url = link['href']

        # extract date from the url; skip the link instead of silently
        # reusing the date left over from the previous iteration
        date_match = re.search(r'(\d{8})', url)
        if not date_match:
            continue
        date = date_match.group(1)

        full_url = f"https://www.federalreserve.gov{url}"
        linked_response = requests.get(full_url, timeout=30)
        if linked_response.status_code != 200:
            print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
            continue

        linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

        # extract the main text: the span between "For immediate release"
        # and "Last update:" (lower-case "u" in this template)
        match = re.search(r"For immediate release(.*?)Last update:", linked_text, re.DOTALL)
        if not match:
            continue

        print(f"URL: {full_url}")
        statement_rows.append({'Date': date, 'Text': match.group(1).strip()})
        print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df5 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20050202/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20050322/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20050503/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20050630/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20050809/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2004/20040128/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2004/20040316/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2004/20040504/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2004/20040630/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2004/20040810/default.htm
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2004/20040921/default.htm
URL: https://www.federalreserve.gov/boarddo

## 2002 - 2000

In [36]:
# Scrape the FOMC statements for 2000 - 2002 into text_df6: one row per
# statement, with 'Date' (the first 8-digit run in the statement URL) and
# 'Text' (the extracted main text).
first_year, last_year = 2000, 2002
statement_rows = []  # plain dicts; the DataFrame is built once after the loops

# Pick the yearly pages by the year in their URL rather than by position in
# full_urls, so the selection stays right if the index page gains a year.
year_pages = []
for page_url in full_urls:
    year_match = re.search(r'fomchistorical(\d{4})\.htm', page_url)
    if year_match and first_year <= int(year_match.group(1)) <= last_year:
        year_pages.append(page_url)

# define a pattern for the URL format of the FOMC statement
# (statement links in these years sit under either "general" or "monetary")
pattern = re.compile(r'/boarddocs/press/(general|monetary)/\d{4}\/\d{8}\/')

for temp in year_pages:
    response = requests.get(temp, timeout=30)
    if response.status_code != 200:
        print(f"Skipped {temp}. Status code: {response.status_code}")
        continue

    html_content = response.text
    soup = BeautifulSoup(html_content, 'html.parser')

    # find all links that match the pattern
    matching_links = soup.find_all('a', href=pattern)

    for link in matching_links:
        url = link['href']

        # extract date from the url; skip the link instead of silently
        # reusing the date left over from the previous iteration
        date_match = re.search(r'(\d{8})', url)
        if not date_match:
            continue
        date = date_match.group(1)

        full_url = f"https://www.federalreserve.gov{url}"
        linked_response = requests.get(full_url, timeout=30)
        if linked_response.status_code != 200:
            print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
            continue

        linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

        # extract the main text: the span between "For immediate release"
        # and "Last update:" (lower-case "u" in this template)
        match = re.search(r"For immediate release(.*?)Last update:", linked_text, re.DOTALL)
        if not match:
            continue

        print(f"URL: {full_url}")
        statement_rows.append({'Date': date, 'Text': match.group(1).strip()})
        print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df6 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/boarddocs/press/general/2002/20020130/
URL: https://www.federalreserve.gov/boarddocs/press/general/2002/20020319/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2002/20020507/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2002/20020626/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2002/20020813/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2002/20020924/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2002/20021106/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2002/20021210/
URL: https://www.federalreserve.gov/boarddocs/press/general/2001/20010103/
URL: https://www.federalreserve.gov/boarddocs/press/general/2001/20010131/
URL: https://www.federalreserve.gov/boarddocs/press/general/2001/20010320/
URL: https://www.federalreserve.gov/boarddocs/press/general/2001/20010418/
URL: https://www.federalreserve.gov/boarddocs/press/general/2001/20010515/
URL: https://www.fe

In [37]:
# Stack the six per-period frames in scrape order and renumber the rows 0..n-1.
period_frames = [text_df, text_df2, text_df3, text_df4, text_df5, text_df6]
final_df = pd.concat(period_frames, ignore_index=True)

In [38]:
# Remove the rows dated 20200331 and 20200827, then display the result.
# NOTE(review): the notebook gives no reason for excluding these two dates;
# presumably they are not regular statements - confirm.
excluded_dates = ["20200331", "20200827"]
final_df = final_df[~final_df['Date'].isin(excluded_dates)]
final_df

Unnamed: 0,Date,Text
0,20240131,Recent indicators suggest that economic activi...
1,20240320,Recent indicators suggest that economic activi...
2,20240501,Recent indicators suggest that economic activi...
3,20240612,Recent indicators suggest that economic activi...
4,20230201,Recent indicators point to modest growth in sp...
...,...,...
197,20000628,The Federal Open Market Committee at its meeti...
198,20000822,The Federal Open Market Committee at its meeti...
199,20001003,The Federal Open Market Committee at its meeti...
200,20001115,The Federal Open Market Committee at its meeti...


## Special Case

In [39]:
# Scrape two individually listed statement pages into text_df7: one row per
# page, with 'Date' (the first 8-digit run in the URL) and 'Text' (the
# extracted main text).
matching_urls = ["/newsevents/pressreleases/monetary20100509a.htm",
                 "/newsevents/pressreleases/monetary20081008a.htm"]
statement_rows = []  # plain dicts; the DataFrame is built once after the loop

for url in matching_urls:
    # extract date from the url; skip the entry instead of silently reusing
    # the date left over from the previous iteration
    date_match = re.search(r'(\d{8})', url)
    if not date_match:
        continue
    date = date_match.group(1)

    full_url = f"https://www.federalreserve.gov{url}"
    linked_response = requests.get(full_url, timeout=30)  # timeout so a stalled connection cannot hang the cell
    if linked_response.status_code != 200:
        print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
        continue

    linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

    # extract the main text using another pattern: take the span between
    # "For release" and "Last Update:", then keep what follows the first
    # "Share" in it
    match = re.search(r"For release(.*?)Last Update:", linked_text, re.DOTALL)
    if not match:
        continue
    match2 = re.search(r"Share(.*?)$", match.group(1), re.DOTALL)
    if not match2:
        continue

    print(f"URL: {full_url}")
    statement_rows.append({'Date': date, 'Text': match2.group(1).strip()})
    print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df7 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20100509a.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20081008a.htm


In [40]:
# Scrape three individually listed statement pages (URLs ending in "b.htm")
# into text_df8: one row per page, with 'Date' (the first 8-digit run in the
# URL) and 'Text' (the extracted main text).
matching_urls = ["/newsevents/pressreleases/monetary20080122b.htm",
                 "/newsevents/pressreleases/monetary20081216b.htm",
                 "/newsevents/pressreleases/monetary20070817b.htm"]
statement_rows = []  # plain dicts; the DataFrame is built once after the loop

for url in matching_urls:
    # extract date from the url; skip the entry instead of silently reusing
    # the date left over from the previous iteration
    date_match = re.search(r'(\d{8})', url)
    if not date_match:
        continue
    date = date_match.group(1)

    full_url = f"https://www.federalreserve.gov{url}"
    linked_response = requests.get(full_url, timeout=30)  # timeout so a stalled connection cannot hang the cell
    if linked_response.status_code != 200:
        print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
        continue

    linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

    # extract the main text using another pattern: take the span between
    # "For immediate release" and "Last Update:", then keep what follows the
    # first "Share" in it
    match = re.search(r"For immediate release(.*?)Last Update:", linked_text, re.DOTALL)
    if not match:
        continue
    match2 = re.search(r"Share(.*?)$", match.group(1), re.DOTALL)
    if not match2:
        continue

    print(f"URL: {full_url}")
    statement_rows.append({'Date': date, 'Text': match2.group(1).strip()})
    print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df8 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20080122b.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20081216b.htm
URL: https://www.federalreserve.gov/newsevents/pressreleases/monetary20070817b.htm


In [41]:
# Scrape three individually listed 2005 statement pages (directory-style
# URLs) into text_df9: one row per page, with 'Date' (the first 8-digit run
# in the URL) and 'Text' (the extracted main text).
matching_urls = ["/boarddocs/press/monetary/2005/20050920/",
                 "/boarddocs/press/monetary/2005/20051101/",
                 "/boarddocs/press/monetary/2005/20051213/"]
statement_rows = []  # plain dicts; the DataFrame is built once after the loop

for url in matching_urls:
    # extract date from the url; skip the entry instead of silently reusing
    # the date left over from the previous iteration
    date_match = re.search(r'(\d{8})', url)
    if not date_match:
        continue
    date = date_match.group(1)

    full_url = f"https://www.federalreserve.gov{url}"
    linked_response = requests.get(full_url, timeout=30)  # timeout so a stalled connection cannot hang the cell
    if linked_response.status_code != 200:
        print(f"Skipped {full_url}. Status code: {linked_response.status_code}")
        continue

    linked_text = BeautifulSoup(linked_response.text, 'html.parser').get_text()

    # extract the main text using another pattern: the span between
    # "For immediate release" and "Last update:" (lower-case "u")
    match = re.search(r"For immediate release(.*?)Last update:", linked_text, re.DOTALL)
    if not match:
        continue

    print(f"URL: {full_url}")
    statement_rows.append({'Date': date, 'Text': match.group(1).strip()})
    print("=" * 50)

# Build the frame in one step; an empty result still has both columns.
text_df9 = pd.DataFrame(statement_rows, columns=['Date', 'Text'])

URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20050920/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20051101/
URL: https://www.federalreserve.gov/boarddocs/press/monetary/2005/20051213/


In [42]:
# Append the three special-case frames to final_df and renumber the rows 0..n-1.
frames_with_special_cases = [final_df, text_df7, text_df8, text_df9]
final_df = pd.concat(frames_with_special_cases, ignore_index=True)

In [43]:
# Order the rows by the 'Date' column, then renumber them 0..n-1.
final_df = (
    final_df
    .sort_values(by="Date")
    .reset_index(drop=True)
)

In [44]:
# Write final_df to TextData.xlsx (path relative to the working directory),
# leaving out the index column.
final_df.to_excel("TextData.xlsx", index=False)