# **SIADS Milestone II - Forecasting Federal Funds Rate Movements through Natural Language Processing of FOMC Minutes**

**s24-milestone2-team7-cspaarg-cydia-davidrez**
- **Casey Spaargaren (cspaarg@umich.edu)**, School of Information, University of Michigan
- **Cydia Tsang (cydia@umich.edu)**, School of Information, University of Michigan
- **David Rezkalla (davidrez@umich.edu)**, School of Information, University of Michigan


#### **Structure of the Code:**

&emsp; **Data Import**<br>
&emsp;&emsp;&emsp; Federal Reserve's meeting minutes from 2000 to 2024<br>
&emsp; **Data Cleaning & Manipulation**<br>
&emsp;&emsp;&emsp; 1. Basic Descriptive Statistics Data Manipulation<br>
&emsp; **Data Analysis & Visualisation**<br>
&emsp;&emsp;&emsp; 1. Basic Descriptive Statistics Analysis<br>

In [None]:
# Remember to run the following command before running the code below. For details, please refer to README.md
#!pip install -r requirements.txt

In [2]:
from bs4 import BeautifulSoup
import urllib
from time import sleep
import re,csv,os
from os import listdir
from os.path import isfile, join
import os
import glob
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import string

# **Data Import**

The project requires the Federal Reserve's meeting minutes from 2000 to 2024, sourced from the official Fed website at https://www.federalreserve.gov. The Federal Open Market Committee (FOMC) holds eight regularly scheduled meetings each year, with additional meetings as needed. Policy statements and minutes are linked in the calendars on the website. The minutes of regularly scheduled meetings are released three weeks after the policy decision date, and committee membership changes occur at the first meeting of each year. The retrieved data is organized and stored as individual files, named meeting_minute_YYYYMMDD.txt, in the data/output directory.

In [18]:
# Meeting dates, one per line; each entry is later parsed with int() and
# embedded in the minutes URL and output file name.
# The file is opened in a `with` block so the handle is closed, and blank
# lines are skipped so they never reach int() in getFedMeetingMinutesUrl.
with open('data/meeting_minute/meeting_dates.csv', 'r') as dates_file:
    releaseDates = [line.rstrip() for line in dates_file if line.strip()]

def getFedMeetingMinutesUrl(date):
    """Return the federalreserve.gov URL of the minutes for a meeting date.

    `date` is a string that int() can parse (the callers pass YYYYMMDD).
    Three URL layouts are handled: the one-off address used for 20080625,
    the 'monetarypolicy/fomcminutes' layout for dates after 20070918, and
    the 'fomc/minutes' layout for everything up to and including 20070918.
    The chosen URL is printed before it is returned.
    """
    stamp = int(date)
    if stamp == 20080625:
        # This meeting's page does not follow the regular naming scheme.
        url = 'https://www.federalreserve.gov/monetarypolicy/fomc20080625.htm'
    elif stamp > 20070918:
        url = 'https://www.federalreserve.gov/monetarypolicy/fomcminutes' + date + '.htm'
    else:
        url = 'https://www.federalreserve.gov/fomc/minutes/' + date + '.htm'
    print(url)
    return url

def getStatement(date):
    """Download the minutes page for `date` and return its text content.

    `date` is the meeting date string accepted by getFedMeetingMinutesUrl.
    The page is requested with a custom User-Agent header, parsed with
    BeautifulSoup, and flattened with get_text(" "), i.e. the text fragments
    of the page joined by single spaces.

    Errors raised by urllib (for example urllib.error.HTTPError) propagate
    to the caller.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly so this function does not rely on another package having
    # imported it as a side effect.
    import urllib.request

    print('Pulling meeting minute of date: ' + date)
    req = urllib.request.Request(getFedMeetingMinutesUrl(date),
                                 headers={'User-Agent': "Magic Browser"})
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    minutesText = soup.get_text(" ")
    return minutesText

In [19]:
# Download every meeting's minutes that is not already cached on disk.
# Make sure the target directory exists before the first write.
os.makedirs("./data/output", exist_ok=True)

for releaseDate in releaseDates:
    file_path = "./data/output/meeting_minute_" + releaseDate + ".txt"
    if os.path.isfile(file_path):
        # Already downloaded on a previous run.
        continue

    data = getStatement(releaseDate)

    # The `with` block guarantees the file is flushed and closed; the
    # original `f.close` (no parentheses) never actually called close().
    with open(file_path, 'w') as f:
        f.write(data)

    # Pause between requests so the server is not hit in a tight loop.
    sleep(2)

# **Data Cleaning**

In [34]:
nltk.download('stopwords')
# word_tokenize (used by preprocess_text below) needs the punkt tokenizer
# models, so fetch them here as the later tokenising cell does.
nltk.download('punkt')

# Load FOMC meeting minutes
meeting_minutes_dir = "data/output"
# Keep only the downloaded minutes files (meeting_minute_*.txt). A plain
# os.listdir() would also return stray entries such as '.DS_Store', whose
# names cannot be parsed as a %Y%m%d date further down.
meeting_minutes_files = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(meeting_minutes_dir, 'meeting_minute_*.txt'))
)

# Tokenize a text and drop English stop words
def preprocess_text(text):
    """Return `text` with English stop words removed.

    The text is split with NLTK's word_tokenize; a token is dropped when its
    lower-cased form is in NLTK's English stop-word list. The surviving
    tokens keep their original casing and are joined with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# One record per minutes file: the stop-word-filtered text plus the date
# string taken from the file name (meeting_minute_<date>.txt).
meeting_minutes = []
for file in meeting_minutes_files:
    file_date = file.replace('meeting_minute_', '').replace('.txt', '')
    with open(os.path.join(meeting_minutes_dir, file), 'r') as f:
        text = f.read()
        meeting_minutes.append({
            'meeting_minute': preprocess_text(text),
            'file_date': file_date,
        })

# Index the records by meeting date, in chronological order.
meeting_minutes_df = pd.DataFrame(meeting_minutes)
meeting_minutes_df['file_date'] = pd.to_datetime(meeting_minutes_df['file_date'], format='%Y%m%d')
meeting_minutes_df = meeting_minutes_df.set_index('file_date').sort_index()
# Split the dataset into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
meeting_minutes_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cydiatsang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0_level_0,meeting_minute
file_date,Unnamed: 1_level_1
1999-05-18,"FRB : FOMC Minutes - May 18 , 1999 Minutes Fed..."
2000-02-02,"FRB : FOMC Minutes - February 1-2 , 2000 Minut..."
2000-03-21,"FRB : FOMC Minutes - March 21 , 2000 Minutes F..."
2000-05-16,"FRB : FOMC minutes - May 16 , 2000 Minutes Fed..."
2000-06-28,"FRB : FOMC Minutes - June 27-28 , 2000 Minutes..."


In [35]:
# Read the federal funds rate series, add the change from the previous
# observation as 'diff', and index the frame by date.
rate_rows = pd.read_csv("data/fed_fund_rate.csv")
rate_rows['date'] = pd.to_datetime(rate_rows['date'])
# diff(1): value minus the value of the preceding row (NaN for the first row).
rate_rows['diff'] = rate_rows['value'].diff(1)
fed_fund_rate_data = rate_rows.set_index('date')
fed_fund_rate_data.head()

Unnamed: 0_level_0,value,diff
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1954-07-07,1.0,
1954-07-14,1.22,0.22
1954-07-21,0.57,-0.65
1954-07-28,0.63,0.06
1954-08-04,0.27,-0.36


In [39]:
# Attach to each set of minutes the rate observation closest in time to the
# meeting date, looking at most 7 days away in either direction
# (direction='nearest'). Meetings with no observation inside that window get
# NaN in the rate columns.
# NOTE(review): if only rate moves *after* publication should count, this
# would need direction='forward' instead - confirm the intended window.
tol = pd.Timedelta('7 day')
merge_df = pd.merge_asof(
    left=meeting_minutes_df,
    # merge_asof requires both sides to be sorted on the join key.
    right=fed_fund_rate_data.sort_index(),
    right_index=True,
    left_index=True,
    direction='nearest',
    tolerance=tol,
)

# Prepare the dataset
# X and y are taken from the merged frame so that they are aligned row for
# row; previously X was the raw list of records and y the complete rate
# series, which have different lengths and no common index.
model_df = merge_df.dropna(subset=['value'])
X = model_df['meeting_minute']
y = model_df['value']

# Last expression of the cell, so the notebook displays the merged preview.
merge_df.head()

In [None]:
# Description: This file converts the raw FOMC Statements into cleaner
#              versions of themselves.
#
#              This file leverages an open source version for the cleaning.
#              The content has been modified from its originally published version and
#              adapted for python3. The author of that original version is
#              Miguel Acosta  www.acostamiguel.com
#
# Input:       The raw FOMC statements, downloaded from federalreserve.gov
#              by the python script pullStatements.py. These are
#              located in the directory statements/statements.raw
#
# Output:      Two sets of cleaned FOMC statements, both with header, footer,
#              and voting information removed:
#                (i) A set, located in the directory statements/statements.clean,
#                    in which only letters are kept (numbers are dropped), the
#                    n-grams listed in data/wordlist.txt are concatenated, and
#                    the stop words in data/stoplist_mcdonald_comb.txt are
#                    removed. These files are currently being used
#                    in this project.
#                (ii) A set, located in statements/statements.clean.np,
#                    in which letters and numbers are kept and the lists
#                    data/wordlist.np.txt and data/emptystop.txt are applied
#                    instead. These files are not currently being used
#                    in this project.
#              Neither set is stemmed by the code in this file.
#
#--------------------------------- IMPORTS -----------------------------------#
import os, csv, re
from os import listdir
from os.path import isfile, join
from nltk.stem.lancaster import LancasterStemmer
from textmining_withnumbers import TermDocumentMatrix as TDM

#-------------------------DEFINE GLOBAL VARIABLES-----------------------------#
# Parent folder shared by the raw and the cleaned statement folders
_statementsRoot = 'statements'

# Folder holding the stop-word lists and the n-gram replacement lists
datadir      = 'data'
# Folder holding the raw statements
statementdir = os.path.join(_statementsRoot, 'statements.raw')
# Destination folders for the two cleaned variants
cleanDir     = os.path.join(_statementsRoot, 'statements.clean')
cleanDirNP   = os.path.join(_statementsRoot, 'statements.clean.np')
# Destination folder for the cleaned documents
outputDir    = 'output'

#-----------------------------------------------------------------------------#
# getReplacementList: Returns two lists, a list of N n-grams (phrase with n
#   words) and a list with N "words" to replace the n-grams. The function reads
#   a file, list_name, in which the 1st, 3rd, 5th, ... lines are n-grams and
#   the 2nd, 4th, 6th, ... lines are their replacements.
#
#   Returns [oldWords, newWords]. If the file has an odd number of lines the
#   last n-gram has no replacement and oldWords is one entry longer than
#   newWords.
#-----------------------------------------------------------------------------#
def getReplacementList (list_name):
    # Read inside a `with` block so the file handle is closed promptly.
    with open(list_name, 'r') as listFile:
        allWords = [line.rstrip('\n') for line in listFile]
    # Even positions (0, 2, ...) are the n-grams, odd positions the replacements.
    oldWords = allWords[0::2]
    newWords = allWords[1::2]
    return [oldWords, newWords]

#-----------------------------------------------------------------------------#
# cleanStatement: This function is the meat of this code--it reads one raw
#   statement, cleans it, and writes the result under the same file name in
#   locationnew. Its inputs are:
#     (1) statement   : a string with the filename of a single FOMC statement
#     (2) locationold : Directory where raw statements are located (string)
#     (3) replacements: Output from getReplacementList
#     (4) locationnew : Directory where clean statements go (string)
#     (5) stoplist    : A list of words to remove (list of strings)
#     (6) charsToKeep : A regular expression matching the characters to
#                       DELETE, i.e. the complement of what should be kept
#                       (for example '[^A-Za-z ]+')
#   Cleaning steps, in order: lower-case; turn punctuation and newlines into
#   spaces; delete characters matched by charsToKeep; collapse runs of spaces;
#   drop everything up to and including 'for immediate release'; drop
#   everything from the first end marker onwards; apply the n-gram
#   replacements; remove stop words that are surrounded by spaces.
#   Returns nothing.
#-----------------------------------------------------------------------------#
def cleanStatement (statement, locationold, replacements, locationnew, \
                    stoplist, charsToKeep):
    # Read in the statement and convert it to lower case
    with open(os.path.join(locationold, statement), 'r') as rawFile:
        clean = rawFile.read().lower()

    # Remove punctuation and newlines first, to keep space between words
    for todelete in ['.', '\r\n', '\n', ',', '-', ';', ':']:
        clean = clean.replace(todelete, ' ')

    # Keep only the characters that you want to keep
    clean = re.sub(charsToKeep, '', clean)
    # Collapse every run of spaces to a single space. A single pass of
    # replace('  ', ' ') leaves doubles behind for runs of three or more,
    # which then defeats the one-space patterns used below.
    clean = re.sub(r' {2,}', ' ', clean)
    clean = clean.replace(' u s ', ' unitedstates ')

    # Remove anything before (and including) 'for immediate release'.
    # If the marker is absent the text is left uncut instead of crashing,
    # mirroring the fallback used for the end markers below.
    releaseMarker = re.search(r"[Ff]or\s[Ii]mmediate\s[Rr]elease", clean)
    if releaseMarker is not None:
        clean = clean[releaseMarker.end():]

    # Looking for the end of the text: cut at whichever end marker comes
    # first, or keep everything if neither is present.
    endMarkers = [
        re.search(r"in\staking\sthe\sdiscount\srate\saction", clean),
        re.search(r"voting\sfor\sthe\sfomc", clean),
    ]
    deleteAfter = min((marker.start() for marker in endMarkers
                       if marker is not None), default=len(clean))
    clean = clean[:deleteAfter]

    # Replace replacement words (concatenations)
    for oldWord, newWord in zip(replacements[0], replacements[1]):
        clean = clean.replace(oldWord, newWord)

    # Remove stop words. str.replace scans for non-overlapping matches, so in
    # ' the the ' only the first ' the ' is caught by one pass; repeat until
    # none is left. Every pass shortens the text, so the loop terminates.
    for word in stoplist:
        needle = ' ' + word.lower() + ' '
        while needle in clean:
            clean = clean.replace(needle, ' ')

    # Write cleaned file (the `with` block closes it; the original
    # `new.close` without parentheses never called close()).
    with open(os.path.join(locationnew, statement), 'w') as new:
        new.write(clean)

#-----------------------------------------------------------------------------#
# The Main function loads the stop lists and word replacement lists, then
#   loops through every file in the statements/statements.raw directory and
#   performs two types of cleaning:
#     - letters only, using data/stoplist_mcdonald_comb.txt and
#       data/wordlist.txt, saved in statements/statements.clean
#     - letters and numbers, using data/emptystop.txt and
#       data/wordlist.np.txt, saved in statements/statements.clean.np
#       ('NP' denotes 'no preprocessing').
#
#   Only the files in the statements/statements.clean folder are currently
#   being used in this project. More processing happens in the Jupyter
#   Notebook code which is the file called Data.ipynb.
#-----------------------------------------------------------------------------#

# _readLines: Returns the lines of the file at path, without trailing newlines.
def _readLines (path):
    with open(path, 'r') as listFile:
        return [line.rstrip('\n') for line in listFile]

def main():
    stoplist       = _readLines(os.path.join(datadir, "stoplist_mcdonald_comb.txt"))
    stoplistNP     = _readLines(os.path.join(datadir, "emptystop.txt"))

    replacements   = getReplacementList(os.path.join(datadir,"wordlist.txt"))
    replacementsNP = getReplacementList(os.path.join(datadir,"wordlist.np.txt"))

    statementList  = [ f for f in listdir(statementdir) \
                       if isfile(join(statementdir,f)) ]

    # cleanStatement opens its output with mode 'w', which fails if the
    # destination folder does not exist yet.
    os.makedirs(cleanDir, exist_ok=True)
    os.makedirs(cleanDirNP, exist_ok=True)

    # cleanStatement takes exactly six arguments; the trailing 1 / 0 that
    # used to be passed here raised a TypeError on the first call.
    for statement in statementList:
        # First, the case with heavier preprocessing (keep only letters)
        cleanStatement(statement, statementdir, replacements, \
                       cleanDir, stoplist, '[^A-Za-z ]+')
        # Second, the no-preprocessing case (keep letters and numbers)
        cleanStatement(statement, statementdir, replacementsNP, \
                       cleanDirNP, stoplistNP, '[^A-Za-z0-9 ]+')

if __name__ == "__main__":
    main()

In [None]:
# Fetch the NLTK resources used below: the punkt tokenizer models and the
# stop-word lists.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Function to read and tokenize text from meeting minutes file
def process_minutes(file_path):
    """Return the filtered, lower-cased tokens of one meeting minutes file.

    The file at `file_path` is read, lower-cased and split with NLTK's
    word_tokenize. A token is kept only if it is purely alphanumeric
    (str.isalnum, which drops punctuation tokens) and is not an English
    stop word. Returns a list of strings in document order.
    """
    with open(file_path, 'r') as file:
        text = file.read()

    # Load the stop-word list once per call and use a set for O(1) lookups;
    # calling stopwords.words('english') inside the comprehension reloaded
    # the list for every single token.
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text.lower())  # Tokenize text and convert to lowercase
    # Remove stopwords and punctuation
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return filtered_tokens

# Folder that holds the downloaded meeting minutes
minutes_dir = 'data/output/'

# Flat list of the filtered tokens of every meeting minutes file, one file
# after another (tokens from different meetings are not kept apart).
meeting_minutes = [
    token
    for minutes_path in glob.glob(os.path.join(minutes_dir, 'meeting_minute_*.txt'))
    for token in process_minutes(minutes_path)
]

In [None]:
# Read the federal funds rate series, keeping 'date' as a regular datetime
# column, and print the first rows.
fed_funds_data = pd.read_csv('data/fed_fund_rate.csv')
fed_funds_data = fed_funds_data.assign(date=pd.to_datetime(fed_funds_data['date']))
print(fed_funds_data.head())

In [None]:
# Build one row per meeting: the meeting date, that meeting's tokens, and the
# federal funds rate recorded on exactly that date.
#
# Each minutes file is tokenized on its own here, because the flat
# `meeting_minutes` token list no longer records which meeting a token came
# from. Rows are collected in a list and turned into a DataFrame once, since
# DataFrame.append was removed in pandas 2.0.
rows = []
skipped_dates = []
for file_path in sorted(glob.glob(os.path.join(minutes_dir, 'meeting_minute_*.txt'))):
    # File names look like meeting_minute_<date>.txt
    date_str = os.path.basename(file_path).split('_')[-1].split('.')[0]
    date = pd.to_datetime(date_str, format='%Y%m%d')

    rate_on_date = fed_funds_data.loc[fed_funds_data['date'] == date, 'value']
    if rate_on_date.empty:
        # No rate observation on the meeting date: skip the meeting instead
        # of failing with an IndexError on `.values[0]`.
        skipped_dates.append(date_str)
        continue

    rows.append({
        'date': date,
        'tokens': process_minutes(file_path),
        'value': rate_on_date.iloc[0],
    })

if skipped_dates:
    print('Skipped ' + str(len(skipped_dates)) + ' meeting(s) with no rate on the meeting date')

merged_data = pd.DataFrame(rows, columns=['date', 'tokens', 'value'])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(merged_data['tokens'], merged_data['value'], test_size=0.2, random_state=42)