In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
class TextPreprocessingPipeline:
    """Ordered collection of preprocessing steps applied one after another.

    Each step is a callable that receives the current data object. A step may
    either mutate that object in place and return ``None``, or return a
    (possibly new) object; a non-``None`` return value becomes the input of the
    next step and, for the last step, the result of ``execute``.
    """

    def __init__(self):
        # Steps are stored, and later run, in registration order.
        self.pipeline_functions = []

    def register(self, func):
        """Append ``func`` to the end of the pipeline.

        Returns ``func`` unchanged so the method can also be used as a
        decorator (``@pipeline.register``).
        """
        self.pipeline_functions.append(func)
        return func

    def execute(self, data):
        """Run every registered step on ``data`` and return the final result.

        Previously the return value of each step was discarded, so steps that
        returned a new object instead of mutating their argument had no
        effect. The return value is now threaded through; steps that return
        ``None`` are treated as in-place steps and leave ``data`` bound to the
        same object.
        """
        for func in self.pipeline_functions:
            result = func(data)
            if result is not None:
                data = result
        return data

    def reset_pipeline(self):
        """Remove all registered steps."""
        self.pipeline_functions = []

In [3]:
# Preprocessing pipeline instance that the cleaning steps below register themselves on.
# NOTE: the name `pipeline` is rebound later in this notebook by
# `from transformers import ... pipeline ...`; re-run this cell before re-running
# any cell that calls pipeline.register / pipeline.execute.
pipeline = TextPreprocessingPipeline()

In [4]:
# Load the scraped CSV into a DataFrame (the path is relative to the working directory).
df = pd.read_csv('Fed_Scrape-2015-2023.csv')

# Preview the first rows; the file carries two leftover "Unnamed" index columns
# alongside Date, Type and Text.
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Type,Text
0,0,20230312,0,"To support American businesses and households,..."
1,1,20230312,0,The Federal Reserve is prepared to address any...
2,2,20230312,0,The additional funding will be made available ...
3,3,20230312,0,"With approval of the Treasury Secretary, the D..."
4,4,20230312,0,After receiving a recommendation from the boar...


In [5]:
# Work on a copy so the frame loaded from the CSV stays untouched.
# NOTE(review): this cell was labelled "Query Statements", but no filter is applied
# here - confirm whether rows were meant to be selected (e.g. by Type).
df_statements = df.copy()

df_statements.head()

Unnamed: 0.1,Unnamed: 0,Date,Type,Text
0,0,20230312,0,"To support American businesses and households,..."
1,1,20230312,0,The Federal Reserve is prepared to address any...
2,2,20230312,0,The additional funding will be made available ...
3,3,20230312,0,"With approval of the Treasury Secretary, the D..."
4,4,20230312,0,After receiving a recommendation from the boar...


In [6]:
# Group rows by Date and join each group's Text values with a single space,
# giving one row per Date; reset_index() turns Date back into a regular column.
df_statements_group = df_statements.groupby('Date')['Text'].apply(' '.join).reset_index()

df_statements_group.head()

Unnamed: 0,Date,Text
0,20150107,\r\n The Federal Reserve Board and the F...
1,20150113,\r\n The Federal Reserve Board on Tuesda...
2,20150128,"January 27-28, 2015 A meeting of the Federal O..."
3,20150202,\r\n As part of operational testing of t...
4,20150204,"\r\n On February 5, 2015, the Federal Re..."


In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Make sure the NLTK resources used below are available
# (nothing is re-downloaded when the packages are already up to date).
nltk.download('punkt')
nltk.download('stopwords')

# English stop-word set.
# NOTE(review): not referenced by the cleaning functions defined in this cell -
# confirm whether stop-word removal was intended.
stop_words = set(stopwords.words('english'))

def clean_text_lambda(text):
    """Lower-case ``text`` and keep only its purely alphanumeric tokens.

    The lower-cased string is split with ``word_tokenize``; every token for
    which ``str.isalnum()`` is false is discarded. The surviving tokens are
    passed through an ASCII-punctuation-stripping translation table and joined
    back together with single spaces.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    kept_words = []
    for word in word_tokenize(text.lower()):
        if word.isalnum():
            kept_words.append(word.translate(punctuation_stripper))
    return ' '.join(kept_words)

def clean_text(frame):
    """Pipeline step: normalise the ``Text`` column of ``frame`` in place.

    Each entry of ``frame['Text']`` is replaced by ``clean_text_lambda`` applied
    to it. The same (mutated) frame is returned.
    """
    print("Executing Clean_Text")
    cleaned_column = frame['Text'].apply(clean_text_lambda)
    frame['Text'] = cleaned_column
    return frame
    
# Add the cleaning step to the pipeline. register() appends, so re-running
# this cell adds the step a second time.
pipeline.register(clean_text)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def drop_columns(frame):
    """Pipeline step: remove the ``Date`` column from ``frame`` in place.

    The frame is mutated and also returned. The drop uses ``errors='ignore'``
    so that applying the step to a frame whose ``Date`` column has already been
    removed (e.g. when the notebook cell is re-run) is a no-op instead of a
    ``KeyError``.
    """
    print("Executing Drop_Columns")
    frame.drop(columns=['Date'], inplace=True, errors='ignore')
    return frame

# Add the column-dropping step after the cleaning step. register() appends,
# so re-running this cell adds the step a second time.
pipeline.register(drop_columns)

In [9]:
# Run every registered step on the grouped frame. The registered steps modify
# the frame in place (cleaning Text, dropping Date), so re-running this cell
# applies them again to the already-processed frame.
df_statements_group = pipeline.execute(df_statements_group)

Executing Clean_Text
Exeucting Drop_Columns


In [10]:
# NOTE: importing `pipeline` from transformers rebinds the notebook-level name
# `pipeline`, which earlier cells use for the TextPreprocessingPipeline instance.
# Re-running those earlier cells after this one requires recreating that instance first.
from transformers import TFAutoModelForCausalLM, AutoTokenizer, AdamWeightDecay, pipeline, create_optimizer
from transformers import DefaultDataCollator
import tensorflow as tf
from datasets import Dataset, DatasetDict, load_dataset

# Load the distilgpt2 tokenizer and reuse its end-of-sequence token as the pad
# token, since the tokenization step later pads every sequence to a fixed length.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
# TensorFlow causal-LM head on the same checkpoint, with the pad id set to the EOS id.
model = TFAutoModelForCausalLM.from_pretrained("distilgpt2", pad_token_id=tokenizer.eos_token_id)

# NOTE(review): the dataset is built from `df_statements` (the raw per-row copy),
# not from `df_statements_group`, so the output of the cleaning/grouping pipeline
# above is not what gets trained on - confirm this is intended.
data = Dataset.from_pandas(df_statements[['Text']])
data

Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)"tf_model.h5";:   0%|          | 0.00/328M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Dataset({
    features: ['Text'],
    num_rows: 9827
})

In [11]:
# Shuffle and split with a fixed seed: 20% of the rows are held out.
# `data` is rebound here from the full dataset to the split result.
data = data.train_test_split(shuffle = True, seed = 200, test_size=0.2)
train = data["train"]
# The held-out "test" split is used as the validation set.
val = data["test"]

In [12]:
def tokenization(data):
    """Tokenize the ``"Text"`` field of ``data`` with the notebook-level ``tokenizer``.

    Sequences longer than 300 tokens are truncated and shorter ones are padded
    up to that length. The tokenizer's output is returned as is.
    """
    return tokenizer(
        data["Text"],
        max_length=300,
        truncation=True,
        padding="max_length",
    )

In [13]:
# Tokenize both splits in batches, spread over 10 worker processes.
train_token = train.map(tokenization, batched = True, num_proc=10)
val_token = val.map(tokenization, batched = True, num_proc=10)

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
def create_labels(text):
    """Add a ``"labels"`` entry for causal language modelling.

    Labels are a copy of ``input_ids`` in which every position whose
    ``attention_mask`` value is 0 is replaced by ``-100``, the label value the
    Transformers loss ignores. A plain copy would make the model train on the
    padding positions too; because the pad token is the EOS token here, the
    padding cannot be recognised by token id and the attention mask is used
    instead.

    Works on a batch (lists of lists) as well as on a single example (flat
    lists). When no ``attention_mask`` is present the labels are an unmasked
    copy of ``input_ids``.

    Returns the same mapping that was passed in, with ``"labels"`` added.
    """
    ignore_index = -100

    def _mask_row(ids, mask):
        # Keep real tokens, blank out padded positions.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    input_ids = text["input_ids"]
    is_batched = len(input_ids) > 0 and isinstance(input_ids[0], (list, tuple))

    if "attention_mask" not in text:
        text["labels"] = [list(row) for row in input_ids] if is_batched else list(input_ids)
        return text

    attention_mask = text["attention_mask"]
    if is_batched:
        text["labels"] = [_mask_row(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        text["labels"] = _mask_row(input_ids, attention_mask)
    return text

In [15]:
# Attach the "labels" column to both tokenized splits (batched, 10 worker processes).
lm_train = train_token.map(create_labels, batched=True, num_proc=10)
lm_val = val_token.map(create_labels, batched=True, num_proc=10)

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
# Convert the labelled splits into batched datasets for Keras training via the
# model's prepare_tf_dataset helper. Only the training batches are shuffled.
train_set = model.prepare_tf_dataset(
    lm_train,
    shuffle=True,
    batch_size=16
)

# Validation keeps its order.
validation_set = model.prepare_tf_dataset(
    lm_val,
    shuffle=False,
    batch_size=16
)

In [17]:
# Exponentially decaying learning rate: starts at 5e-4 and is multiplied by
# 0.95 for every 500 optimizer steps; staircase=False makes the decay
# continuous rather than stepwise.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.0005,
    decay_steps=500,
    decay_rate=0.95,
    staircase=False)
    
# Adam with weight decay, driven by the schedule above.
optimizer = AdamWeightDecay(learning_rate=lr_schedule, weight_decay_rate=0.01)

In [18]:
# No loss is passed to compile(), so the model's internal loss computation is
# used (this is what the message printed below refers to).
model.compile(
    optimizer=optimizer,
    metrics=['accuracy']
)
model.summary()

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 81912576  
 r)                                                              
                                                                 
Total params: 81,912,576
Trainable params: 81,912,576
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Fine-tune for a single epoch, evaluating on the validation set.
model.fit(
    train_set, 
    validation_data=validation_set, 
    epochs=1, 
)



<keras.callbacks.History at 0x7f1ed37afd10>

In [20]:
# Build a text-generation pipeline around the fine-tuned model.
# `pipeline` here is the transformers factory imported above, not the
# TextPreprocessingPipeline instance created earlier in the notebook.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    framework="tf",
    max_new_tokens=500
)

In [21]:
# Prompt the fine-tuned model and display the generated continuation.
test_sentence = "Stock market "
text_generator(test_sentence)

  "You have modified the pretrained model configuration to control generation. This is a"


[{'generated_text': 'Stock market \x7f\nOn the morning of December 31, 2015, the manager pro tem cited many challenges to forecast. The staff also had a strong forecast for 2019 after having lowered the target range for the federal funds rate since January 2016. This revision was not without a significant revision, and the manager pro tem noted that the actual rate of decline in the unemployment rate over the previous two years had declined in 2015. Nevertheless, the next reduction in the target range for the federal funds rate would make it more likely that the staff anticipated that inflation would turn out to have more recent downward revisions to the core data.'}]

In [22]:
# Persist the fine-tuned weights (weights only, not the tokenizer) to the working directory.
model.save_weights('gpt-2-FOMC.h5')