This project deals with analyzing the FED (Federal Reserve) announcements. All historic Fed announcements can be found here: https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm

Here is for instance the minutes of the last Fed meeting: https://www.federalreserve.gov/monetarypolicy/files/fomcminutes20211103.pdf

My goal is to predict the S&P500 index direction in the next week based on what is written in the FED minutes announcement. Text will be vectorized into features using extracted macro-economic indexes as also using TF-IDF following by Cosine Similarity and finally by using BERT and sentiment analysis. For model training and prediction I will be using Keras and XGBoost.

Edit: as for 17.02.2022 stay tuned for the upcoming model fitting and prediction.

In [1]:
# %pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()

import time
from datetime import timedelta

import re

import pandas as pd
import pandas_datareader.data as web

import numpy as np
import datetime

In [2]:
# SELENIUM_URL = 'http://127.0.0.1:4444/wd/hub'

In [3]:
driver = webdriver.Chrome(options=webdriver.ChromeOptions())
driver.implicitly_wait(10)

##### Scrape FOMC meeting dates and meeting urls

In [4]:
FOMC_URL = 'https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm'

driver.get(FOMC_URL)
time.sleep(7)

In [5]:
# Create a selenium webobject for all minutes HTML urls

main_page_urls = driver.find_elements(By.XPATH, "//*[contains(@href,'fomcminutes') and contains(@href,'.htm')]")

In [6]:
# Crete a list af minutes urls:

minute_urls = [elem.get_attribute("href") for elem in main_page_urls]
minute_urls.sort()

# Extract meeting dates for S&P value data and df index:

# in string format
f = lambda x: x[-12:-4]
meeting_dates = [f(x) for x in minute_urls]
meeting_dates.sort()
    
# in date format
g = lambda x: datetime.date(int(x[:4]), int(x[4:6]), int(x[6:]))
meeting_dates_obj_list = [g(x) for x in meeting_dates]
meeting_dates_obj_list.sort()

In [7]:
# minute_urls

In [8]:
# meeting_dates

In [9]:
# meeting_dates_obj_list

#### Create a df for all meetings' scraped data

In [10]:
# Prepare series for df creation:

full_text_s = []
recommendations_s = []
num_of_recommendations_s = []

for minute_url in minute_urls:
    
    # Begin selenium instance
    driver.get(minute_url) 
    time.sleep(10)
    
    # Extract full text of minutes
    minutes_raw = driver.find_element(By.XPATH, "//div[@id='article']")
    full_text = minutes_raw.text.replace('\n', ' ')
    full_text_s.append(full_text)
    
    # Extract recommendations section into list:
    recs = []
    
    # If recommendations are in ul format:
    recommendations_list_of_obj = minutes_raw.find_elements(By.XPATH,
        "//p[contains(.,'Effective') and contains(.,'Federal Open Market Committee directs the Desk')]//following-sibling::ul//li")
    
    # If recommendations are in p format:
    if len(recommendations_list_of_obj) == 0:
        recommendations_list_of_obj = minutes_raw.find_elements(By.XPATH,
            "//p[contains(.,'Effective') and contains(.,'Federal Open Market Committee directs the Desk')]")
        recommendations_list_of_obj2 = minutes_raw.find_elements(By.XPATH,
            "//p[contains(.,'Effective') and contains(.,'Federal Open Market Committee directs the Desk')]//following-sibling::p")
        for obj in recommendations_list_of_obj2:
            recommendations_list_of_obj.append(obj)
            
    # If recommendations are in unknown format:
    if len(recommendations_list_of_obj) == 0:
        print('problem')    
        
    # Extract text from objects and add them to df series:
    for recommendations_obj in recommendations_list_of_obj:
        recs.append(recommendations_obj.text)
#         print('recs', type(recs), len(recs))
    recommendations_s.append(recs)
    num_of_recommendations_s.append(len(recs))
#     print(recommendations_s)
    print(f'{len(full_text_s)} pages scraped. data from {minute_url} was appended')
    
print(len(full_text_s), 'minutes were scraped')

1 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20170201.htm was appended
2 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20170315.htm was appended
3 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20170503.htm was appended
4 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20170614.htm was appended
5 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20170726.htm was appended
6 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20170920.htm was appended
7 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20171101.htm was appended
8 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20171213.htm was appended
9 pages scraped. data from https://www.federalreserve.gov/monetarypolicy/fomcminutes20180131.htm was appended
10 pages s

In [11]:
print('meeting_dates', len(meeting_dates))
print('full_text_s', len(full_text_s))
print('recommendations_s', len(recommendations_s))
print('num_of_recommendations_s', len(num_of_recommendations_s))

meeting_dates 41
full_text_s 41
recommendations_s 41
num_of_recommendations_s 41


In [12]:
# Cast percent targets in recommendations as floats

def text_to_float(text):
    
    if '/' in text:
        split_fraction = text.split('/')
        num = int(split_fraction[0])/int(split_fraction[1]) 
    else: num = float(text)
        
    return num

    
def text_to_percent(text):
    
    percent = 0
    if '-' in text:
        split_text = text.split('-')
        for split in split_text:
            percent += text_to_float(split)
    else:   
        percent = text_to_float(text)

    return percent

In [13]:
# Extract nominal federal fund rate targets from recommendations' text and flatten recommendation list:

ffr_1 = []
ffr_2 = []
flat_recommendations_s = []
                                
for i in range(len(recommendations_s)):
    counter1 = 1
    ffrs = 0
    flat_recs = ''
    while counter1 <= len(recommendations_s[i]):
        rec = recommendations_s[i][counter1-1].lower()
        
        # Add federal fund rate to ffr_1 and ffr_2:

        if 'maintain the federal funds rate' in rec:
            ffr = re.search(r'target range of (.+?) to (.+?) percent', rec)
            ffr_1.append(text_to_percent(ffr.group(1)))
            ffr_2.append(text_to_percent(ffr.group(2)))
            ffrs += 1
            
        flat_recs = flat_recs + rec + ' '
            
        counter1 += 1
        
    # When there is more then one federal funds rate recommendation in same meeting - take largest range:
    if ffrs > 1:
        many_list = ffr_1[-ffrs:]
        del ffr_1[-ffrs:]
        ffr_1.append(min(many_list)) # keep lower value
        many_list = ffr_2[-ffrs:]
        del ffr_2[-ffrs:]
        ffr_2.append(max(many_list)) # keep higher value
        
    flat_recommendations_s.append(flat_recs)

In [14]:
print('meeting_dates', len(meeting_dates))
print('full_text_s', len(full_text_s))
print('recommendations_s', len(recommendations_s))
print('flat_recommendations_s', len(flat_recommendations_s))
print('num_of_recommendations_s', len(num_of_recommendations_s))
print('num_of_ffrs_s', len(ffr_1), len(ffr_2))

meeting_dates 41
full_text_s 41
recommendations_s 41
flat_recommendations_s 41
num_of_recommendations_s 41
num_of_ffrs_s 41 41


In [15]:
# flat_recommendations_s[20]
# rec + rec

In [16]:
fomc_data = pd.DataFrame(data = {'date': meeting_dates,
                                 'full_text': full_text_s,
                                 'recommendations': flat_recommendations_s, 
                                 'num_of_recommendations': num_of_recommendations_s,
                                 'ffr_min': ffr_1, 'ffr_max': ffr_2})
fomc_data = fomc_data.astype({'date':str, 'full_text':str, 'recommendations':str,
                              'num_of_recommendations':int, 'ffr_min':float, 'ffr_max':float})

In [17]:
# Calculate the difference in federal fund rate against rate in previous meeting:
fomc_data['ffr_min_diff'] = fomc_data['ffr_min'] - fomc_data.shift(1)['ffr_min']
fomc_data['ffr_max_diff'] = fomc_data['ffr_max'] - fomc_data.shift(1)['ffr_max']

In [18]:
fomc_data.head()

Unnamed: 0,date,full_text,recommendations,num_of_recommendations,ffr_min,ffr_max,ffr_min_diff,ffr_max_diff
0,20170201,Minutes of the Federal Open Market Committee J...,"""effective february 2, 2017, the federal open ...",2,0.5,0.75,,
1,20170315,Minutes of the Federal Open Market Committee M...,"""effective march 16, 2017, the federal open ma...",2,0.75,1.0,0.25,0.25
2,20170503,Minutes of the Federal Open Market Committee M...,"""effective may 4, 2017, the federal open marke...",2,0.75,1.0,0.0,0.0
3,20170614,Minutes of the Federal Open Market Committee J...,"""effective june 15, 2017, the federal open mar...",2,1.0,1.25,0.25,0.25
4,20170726,Minutes of the Federal Open Market Committee J...,"""effective july 27, 2017, the federal open mar...",2,1.0,1.25,0.0,0.0


#### Build and apply a tf-idf cosine similarity model

In [19]:
# Import sklearn packages and functions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
from tqdm.notebook import tqdm
from collections import Counter

# Import nltk modules and download dataset
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dannystatland/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dannystatland/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dannystatland/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dannystatland/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [20]:
# import datetime as dt

# import pickle

# import os
# import sys
# import time
# import logging
# import random
# from collections import defaultdict, Counter

# Data Science modules

# import matplotlib.pyplot as plt
# import seaborn as sns; sns.set()
# plt.style.use('ggplot')

# Import Scikit-learn moduels

# from sklearn.metrics import accuracy_score, f1_score, plot_confusion_matrix
# from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn import model_selection
# from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, StratifiedKFold, learning_curve, RandomizedSearchCV

In [21]:
# Function to lemmatize a word
def lemmatize_word(word):
    wnl = nltk.stem.WordNetLemmatizer()
    return wnl.lemmatize(wnl.lemmatize(word, 'n'), 'v')

In [22]:
# Function to tokenize text in a DataFrame
def tokenize_df(df, col):
    tokenized = []
    wnl = nltk.stem.WordNetLemmatizer()
    for text in tqdm(df[col]):
        # Filter alphabet words only and non stop words, make it loser case
        words = [word.lower() for word in word_tokenize(text) if ((word.isalpha()==1) & (word not in stop))]
        # Lemmatize words 
        tokens = [lemmatize_word(word) for word in words]
        tokenized.append(tokens)
    return tokenized

In [23]:
# Function to create Tfidf Vector
# from sklearn.feature_extraction.text import TfidfVectorizer
# def get_tfidf(sentiment_words, docs):
def get_tfidf(docs):
    vectorizer = TfidfVectorizer(analyzer='word')  #, vocabulary=sentiment_words)        TODO: return sentiment !!!!
    tfidf = vectorizer.fit_transform(docs)
    features = vectorizer.get_feature_names()
    return tfidf.toarray()

In [24]:
# Function to calculate Cosine Similarity
# from sklearn.metrics.pairwise import cosine_similarity
def get_cosine_similarity(tfidf_matrix):
    return [cosine_similarity(u.reshape(1,-1), v.reshape(1,-1))[0][0].tolist() for u, v in zip(tfidf_matrix, tfidf_matrix[1:])]

In [25]:
def pipeline(df, origin_text, col_prefix):
    
    # Tokenize
    tokenized = tokenize_df(df, origin_text)
    docs = [" ".join(words) for words in tokenized]
    
    # Create tf/idf vectors and calculate cosine similarity:
    tfidf_v = get_tfidf(docs)
    cosine_similarity = get_cosine_similarity(tfidf_v)
    cosine_similarity.insert(0, 0)

    # Add cosine similarity to fomc_data
    df[f'{col_prefix}_cosine_similarity'] = cosine_similarity
    

In [26]:
# Calculate cosine similarity beween full texts and add to df:
pipeline(fomc_data, 'full_text', 'full_text')

  0%|          | 0/41 [00:00<?, ?it/s]



In [27]:
# Calculate cosine similarity beween recommendation texts and add to df:
pipeline(fomc_data, 'recommendations', 'recommendations')
fomc_data.head()

  0%|          | 0/41 [00:00<?, ?it/s]



Unnamed: 0,date,full_text,recommendations,num_of_recommendations,ffr_min,ffr_max,ffr_min_diff,ffr_max_diff,full_text_cosine_similarity,recommendations_cosine_similarity
0,20170201,Minutes of the Federal Open Market Committee J...,"""effective february 2, 2017, the federal open ...",2,0.5,0.75,,,0.0,0.0
1,20170315,Minutes of the Federal Open Market Committee M...,"""effective march 16, 2017, the federal open ma...",2,0.75,1.0,0.25,0.25,0.810823,0.957881
2,20170503,Minutes of the Federal Open Market Committee M...,"""effective may 4, 2017, the federal open marke...",2,0.75,1.0,0.0,0.0,0.905553,0.961575
3,20170614,Minutes of the Federal Open Market Committee J...,"""effective june 15, 2017, the federal open mar...",2,1.0,1.25,0.25,0.25,0.910641,0.956847
4,20170726,Minutes of the Federal Open Market Committee J...,"""effective july 27, 2017, the federal open mar...",2,1.0,1.25,0.0,0.0,0.923488,0.953171


In [28]:
# # Create vocab
# all_words = [word for text in tokenized for word in text]
# counts = Counter(all_words)
# bow = sorted(counts, key=counts.get, reverse=True)
# vocab = {word: ii for ii, word in enumerate(counts, 1)}
# id2vocab = {v: k for k, v in vocab.items()}

#### Add S&P500 change and market direction from the days following each meeting

In [29]:
# Create a df to hold the S&P500 values on the meeting dates and the consecutive days:

sp500_values = pd.DataFrame(data = {'date': meeting_dates_obj_list})
for i in range(-2,8):
    sp500_values[f'day_{i}'] = [np.nan for i in range(len(meeting_dates_obj_list))]
sp500_values.set_index('date', inplace = True)

In [30]:
sp500_values.head()

Unnamed: 0_level_0,day_-2,day_-1,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-02-01,,,,,,,,,,
2017-03-15,,,,,,,,,,
2017-05-03,,,,,,,,,,
2017-06-14,,,,,,,,,,
2017-07-26,,,,,,,,,,


In [31]:
# Add S&P500 values from FRED for each meeting's day and consecutive days:

for meeting_date in meeting_dates_obj_list:
    
    # Create a list of dates starting with the meeting day and ending 7 days later:
    all_dates = [meeting_date + timedelta(days=i) for i in range(-2,8)]
    start = all_dates[0]
    end = all_dates[-1]
    
    # Retrieve FRED data for all days in list
    sp500_temp = web.DataReader('sp500', 'fred', start, end)
    
    # Change format of fred reply's index from datetime to date:
    f = lambda x: x.date()
    sp500_temp.index = [f(x) for x in sp500_temp.index]
    
    # For each date with a response, find its index in the consecutive days list and add it to the final df:
    for date_temp in sp500_temp.index:
        if date_temp in all_dates:
            indx = all_dates.index(date_temp)
            sp500_values.loc[meeting_date, sp500_values.columns[indx]] = sp500_temp.loc[date_temp].values[0]

In [32]:
sp500_values.head()

Unnamed: 0_level_0,day_-2,day_-1,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-02-01,2280.9,2278.87,2279.55,2280.85,2297.42,,,2292.56,2293.08,2294.67
2017-03-15,2373.47,2365.45,2385.26,2381.38,2378.25,,,2373.47,2344.02,2348.45
2017-05-03,2388.33,2391.17,2388.13,2389.52,2399.29,,,2399.38,2396.92,2399.63
2017-06-14,2429.39,2440.35,2437.92,2432.46,2433.15,,,2453.46,2437.03,2435.61
2017-07-26,2469.91,2477.13,2477.83,2475.42,2472.1,,,2470.3,2476.35,2477.57


In [33]:
# Change values when firsl/last day is not a trading day:

def sp500_calc_change(date):
    last_day = 7
    first_day = 0
    
    while pd.isna(sp500_values.loc[date][f'day_{last_day}']):
        last_day -= 1
        
    while pd.isna(sp500_values.loc[date][f'day_{first_day}']):
        first_day -= 1
    
    return sp500_values.loc[date, f'day_{last_day}'] - sp500_values.loc[date, f'day_{first_day}']

In [34]:
sp500_values['sp500_change'] = [sp500_calc_change(x) for x in sp500_values.index]
sp500_values['sp500_change'][:5]

date
2017-02-01    15.12
2017-03-15   -36.81
2017-05-03    11.50
2017-06-14    -2.31
2017-07-26    -0.26
Name: sp500_change, dtype: float64

In [35]:
# Create label for market direction:

def is_possitive(sp500_change):
    if sp500_change > 0: return 1
    else: return 0
        
sp500_values['market_dir_label'] = [is_possitive(x) for x in sp500_values['sp500_change']]

In [36]:
sp500_values.head()

Unnamed: 0_level_0,day_-2,day_-1,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,sp500_change,market_dir_label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-02-01,2280.9,2278.87,2279.55,2280.85,2297.42,,,2292.56,2293.08,2294.67,15.12,1
2017-03-15,2373.47,2365.45,2385.26,2381.38,2378.25,,,2373.47,2344.02,2348.45,-36.81,0
2017-05-03,2388.33,2391.17,2388.13,2389.52,2399.29,,,2399.38,2396.92,2399.63,11.5,1
2017-06-14,2429.39,2440.35,2437.92,2432.46,2433.15,,,2453.46,2437.03,2435.61,-2.31,0
2017-07-26,2469.91,2477.13,2477.83,2475.42,2472.1,,,2470.3,2476.35,2477.57,-0.26,0


In [37]:
# Change the date index into a column of strings for joining:

sp500_values.reset_index(inplace=True)

f = lambda x: (str(x)[:4]+str(x)[5:7]+str(x)[8:])
sp500_values['date'] = [f(x) for x in sp500_values['date']]

In [38]:
# Create a smaller df with the needed columns only:

sp500_values_to_merge = sp500_values[['date', 'sp500_change', 'market_dir_label']]
sp500_values_to_merge.head()

Unnamed: 0,date,sp500_change,market_dir_label
0,20170201,15.12,1
1,20170315,-36.81,0
2,20170503,11.5,1
3,20170614,-2.31,0
4,20170726,-0.26,0


In [39]:
# Join the S&P500 calaulations to the meetings texts
fomc_data = fomc_data.merge(sp500_values_to_merge, on='date', how='left')

In [40]:
fomc_data.drop(columns=['full_text', 'recommendations'], inplace=True)

In [41]:
fomc_data

Unnamed: 0,date,num_of_recommendations,ffr_min,ffr_max,ffr_min_diff,ffr_max_diff,full_text_cosine_similarity,recommendations_cosine_similarity,sp500_change,market_dir_label
0,20170201,2,0.5,0.75,,,0.0,0.0,15.12,1
1,20170315,2,0.75,1.0,0.25,0.25,0.810823,0.957881,-36.81,0
2,20170503,2,0.75,1.0,0.0,0.0,0.905553,0.961575,11.5,1
3,20170614,2,1.0,1.25,0.25,0.25,0.910641,0.956847,-2.31,0
4,20170726,2,1.0,1.25,0.0,0.0,0.923488,0.953171,-0.26,0
5,20170920,4,1.0,1.25,0.0,0.0,0.870423,0.815107,-1.2,0
6,20171101,3,1.0,1.25,0.0,0.0,0.885533,0.897315,15.02,1
7,20171213,3,1.25,1.5,0.25,0.25,0.900365,0.881453,16.4,1
8,20180131,3,1.25,1.5,0.0,0.0,0.802817,0.880947,-142.15,0
9,20180321,3,1.5,1.75,0.25,0.25,0.778237,0.892704,-106.93,0
