In [50]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [76]:
# Raw macro series file; kept in a named constant so the location is easy to spot and change.
MACRO_CSV_PATH = r"C:\code\fed-trade-of-the-decade\data\raw\fed_macro_variables_2021_present.csv"

# Load the raw macro variables and preview the first rows.
macro_df = pd.read_csv(MACRO_CSV_PATH)
macro_df.head()

Unnamed: 0,Date,Unemployment Rate,CPI (All Urban Consumers),GDP Growth Rate,10-Year Treasury Yield,Fed Funds Rate,PCE Inflation,Industrial Production Index,Total Nonfarm Payrolls
0,2021-01-01,6.4,262.518,21058.379,,0.09,106.083,98.8135,142916.0
1,2021-01-04,,,,0.93,,,,
2,2021-01-05,,,,0.96,,,,
3,2021-01-06,,,,1.04,,,,
4,2021-01-07,,,,1.08,,,,


In [78]:
# Parse the Date column so the .dt accessor can be used below
macro_df['Date'] = pd.to_datetime(macro_df['Date'])

# Keep only the rows dated on the first day of a month
is_first_of_month = macro_df['Date'].dt.is_month_start
filtered_macro_df = macro_df.loc[is_first_of_month]

# Handle the 10-Year Treasury Yield column separately
# If data for the first of the month is missing, take the next available date
def get_first_or_next_available(df, column):
    """Return, for each calendar month in ``df``, the first row with a value in ``column``.

    Rows are grouped by the calendar month of their ``Date`` value. Within each
    month, the first row (in ``df`` order) whose ``column`` entry is not missing
    is kept. Months in which ``column`` has no valid value are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``Date`` column and the column named by ``column``.
    column : str
        Name of the column whose first available value per month is wanted.

    Returns
    -------
    pandas.DataFrame
        One row per month that has at least one valid value, in chronological
        month order, carrying the original row labels of ``df`` as its index.
    """
    # Group on the frame that was passed in (not on a notebook-level global),
    # so the function works for any DataFrame with a 'Date' column.
    month_of_row = df['Date'].dt.to_period('M')

    result = []
    # groupby visits only months that actually have rows, in sorted
    # (chronological) order, including a first month that does not start on the 1st.
    for _, monthly_data in df.groupby(month_of_row):
        first_valid = monthly_data[column].first_valid_index()
        if first_valid is None:
            # No usable value this month; skip instead of failing on .loc[None].
            continue
        result.append(monthly_data.loc[first_valid])
    return pd.DataFrame(result)

# For every month, pick the first row that has a 10-year yield value
yield_filtered_df = get_first_or_next_available(macro_df, '10-Year Treasury Yield')

# Start from the first-of-month rows and overlay the yield rows.
# DataFrame.update aligns on index labels and modifies the frame in place.
# NOTE(review): the yield column is dropped right below, so this overlay does
# not reach the final frame -- confirm whether that is intended.
final_filtered_df = filtered_macro_df.copy()
final_filtered_df.update(yield_filtered_df)

# Remove the columns that are not used downstream and renumber the rows
final_filtered_df = (
    final_filtered_df
    .drop(columns=['GDP Growth Rate', '10-Year Treasury Yield'])
    .reset_index(drop=True)
)

# Show the last rows of the result
final_filtered_df.tail()

Unnamed: 0,Date,Unemployment Rate,CPI (All Urban Consumers),Fed Funds Rate,PCE Inflation,Industrial Production Index,Total Nonfarm Payrolls
42,2024-07-01,4.3,313.534,5.33,123.564,102.5381,158692.0
43,2024-08-01,4.2,314.121,5.33,123.708,103.0449,158770.0
44,2024-09-01,4.1,314.686,5.13,123.931,102.5497,158993.0
45,2024-10-01,4.1,315.454,4.83,124.226,102.2805,159005.0
46,2024-11-01,,,,,,


In [None]:
# Drop every row that still has a missing value. This was added for the last
# row (all-NaN in the preview above), but it applies to any incomplete row.
final_filtered_df = final_filtered_df.dropna()

In [None]:
# TODO: incorporate macro data into the regression below
# TODO: change the target from t to t + 1
# TODO: enlarge the dataset -- collect all the Fed data going back to 2014 ~ 2012

In [51]:
# Load the manually collected rate-change table and tidy it in one pass:
#   1. make 'meeting_year' an integer,
#   2. keep only the columns needed for the merge and the target,
#   3. normalise 'meeting_month' (trim whitespace, capitalise) so it matches
#      the month names derived from the minutes filenames.
rate_df = (
    pd.read_excel(r'C:\code\fed-trade-of-the-decade\data\raw\fed_rates_manual.xlsx')
    .astype({'meeting_year': int})
    .loc[:, ['meeting_month', 'meeting_year', 'rate change']]
    .assign(meeting_month=lambda frame: frame['meeting_month'].str.strip().str.capitalize())
)

In [52]:
# Build one record per FOMC minutes file: the meeting month, meeting year and full text.

# Initialize lists to store data
texts = []
months = []
years = []

# Directory containing the meeting minutes
minutes_dir = r'C:\code\fed-trade-of-the-decade\data\raw\FOMC\meeting_minutes'

# Iterate over files in the directory.
# os.listdir returns entries in arbitrary, platform-dependent order; sorting the
# names makes the row order (and therefore the seeded train/test split further
# down) reproducible. With the assumed YYYY-MM-DD filename prefix the sorted
# order is also chronological.
for filename in sorted(os.listdir(minutes_dir)):
    if not filename.endswith('.txt'):
        continue

    # Extract date from filename (assuming format: YYYY-MM-DD_Minutes.txt)
    date_str = filename.split('_')[0]
    meeting_date = pd.to_datetime(date_str)

    # Extract month and year
    meeting_month = meeting_date.strftime('%B')  # Full month name
    meeting_year = meeting_date.year

    # Read the text file
    with open(os.path.join(minutes_dir, filename), 'r', encoding='utf-8') as file:
        text = file.read()

    # Append to lists
    texts.append(text)
    months.append(meeting_month)
    years.append(meeting_year)

# Create a DataFrame
minutes_df = pd.DataFrame({
    'meeting_month': months,
    'meeting_year': years,
    'text': texts
})

# Convert 'meeting_year' to integer (in case it's not)
minutes_df['meeting_year'] = minutes_df['meeting_year'].astype(int)

# Ensure 'meeting_month' is formatted the same way as in the rate table
minutes_df['meeting_month'] = minutes_df['meeting_month'].str.strip().str.capitalize()

In [53]:
# Join minutes and rate changes on meeting month + year; the inner join keeps
# only meetings present in both tables
data_df = minutes_df.merge(rate_df, how='inner', on=['meeting_month', 'meeting_year'])

# Report how many meetings survived the join
print(f"Total records after merge: {len(data_df)}")

Total records after merge: 31


In [54]:
# Preview the first merged minutes / rate-change records
data_df.head()

Unnamed: 0,meeting_month,meeting_year,text,rate change
0,January,2021,"The Federal Reserve, the central bank of the U...",0
1,March,2021,"The Federal Reserve, the central bank of the U...",0
2,April,2021,"The Federal Reserve, the central bank of the U...",0
3,June,2021,"The Federal Reserve, the central bank of the U...",0
4,July,2021,"The Federal Reserve, the central bank of the U...",0


In [55]:
import pandas as pd
import numpy as np

# Text processing libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# Model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [56]:
# Stemmer used by preprocess_text
stemmer = PorterStemmer()

# Fetch the NLTK stopword corpus (a no-op when it is already up to date),
# then keep the English stopwords in a set for fast membership tests
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Turn raw text into a space-separated string of stemmed, non-stopword tokens.

    Steps: lowercase the text, delete every character that is not a-z or
    whitespace, split on whitespace, drop tokens found in ``stop_words`` and
    stem the remaining tokens with ``stemmer``.
    """
    # Lowercase, then strip digits, punctuation and any other non-letter character.
    # Characters are deleted, not replaced, so "quarter-end" becomes "quarterend".
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    kept_stems = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    return ' '.join(kept_stems)

# Clean every document and store the result next to the raw text
data_df['clean_text'] = data_df['text'].map(preprocess_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\easod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
def rate_change_to_category(rate):
    """Map a numeric rate change to a label: 'cut', 'hike' or 'hold'.

    Negative values are 'cut', positive values are 'hike', and anything else
    (zero, or a value that compares neither above nor below zero) is 'hold'.
    """
    if rate < 0:
        return 'cut'
    if rate > 0:
        return 'hike'
    return 'hold'

# Label every meeting as hike / cut / hold from its numeric rate change
data_df['rate_category'] = data_df['rate change'].map(rate_change_to_category)

In [58]:
# Class balance of the target labels
category_counts = data_df['rate_category'].value_counts()
print(category_counts)

rate_category
hold    17
hike    12
cut      2
Name: count, dtype: int64


In [59]:
# Target variable: the hike / cut / hold label of each meeting
y = data_df['rate_category']

# Bag-of-words features: learn the vocabulary from the cleaned text and turn
# each document into a vector of token counts.
# NOTE(review): the vocabulary is learned from all documents, including those
# that later land in the test split -- confirm this is acceptable.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_df['clean_text'])

In [60]:
# Split the dataset into training and testing sets (70% train, 30% test),
# stratified on the label; random_state fixes the shuffle for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [61]:
# Logistic regression classifier; class_weight='balanced' re-weights the
# classes, which are uneven here (see the label counts printed earlier)
model = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

# Fit on the training split
model.fit(X_train, y_train)

In [62]:
# Predict on the test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report.
# zero_division=0 reports precision/recall as 0 for a class that is never
# predicted instead of emitting an UndefinedMetricWarning for it.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix (rows: true labels, columns: predicted labels)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.90

Classification Report:
              precision    recall  f1-score   support

         cut       0.00      0.00      0.00         1
        hike       1.00      1.00      1.00         4
        hold       0.83      1.00      0.91         5

    accuracy                           0.90        10
   macro avg       0.61      0.67      0.64        10
weighted avg       0.82      0.90      0.85        10


Confusion Matrix:
[[0 0 1]
 [0 4 0]
 [0 0 5]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
# Vocabulary terms, in the column order of the feature matrix
feature_names = vectorizer.get_feature_names_out()

# Fitted coefficients of the model
coefficients = model.coef_

# Table of coefficients: one row per term, one column per class
coeff_df = pd.DataFrame(data=coefficients.T, columns=model.classes_, index=feature_names)

# For each class, print the 20 terms with the largest coefficients
for class_label in model.classes_:
    print(f"\nTop words for class '{class_label}':")
    ranked_terms = coeff_df[class_label].sort_values(ascending=False)
    top_words = ranked_terms.head(20)
    print(top_words)



Top words for class 'cut':
market        0.021752
risk          0.016534
septemb       0.015850
manag         0.014420
labor         0.014294
board         0.013925
solid         0.012021
eas           0.011408
somewhat      0.011321
balanc        0.011180
quarterend    0.010598
low           0.010581
repo          0.009858
rate          0.009039
servic        0.008673
data          0.008326
coupl         0.008167
novemb        0.008107
octob         0.008000
divis         0.007994
Name: cut, dtype: float64

Top words for class 'hike':
inflat      0.030677
tighten     0.026541
bank        0.025841
ukrain      0.020792
price       0.020232
rais        0.019350
pressur     0.018455
invas       0.018165
anticip     0.017905
high        0.017560
elev        0.016510
robust      0.016048
polici      0.015421
russia      0.014853
monetari    0.014818
march       0.014747
remain      0.013714
war         0.013213
end         0.012704
develop     0.012289
Name: hike, dtype: float64

Top words

In [66]:
# Print the class labels known to the fitted model, one per line
for class_label in model.classes_:
    print(class_label)

cut
hike
hold
