In [40]:
import numpy as np
import pandas as pd
import torch

# Loading The Dataset

In [2]:
from datasets import load_dataset
dataset = load_dataset("zeroshot/twitter-financial-news-topic")



In [3]:
# Labels
label_mapping = {
    0: "Analyst Update",
    1: "Fed | Central Banks",
    2: "Company | Product News",
    3: "Treasuries | Corporate Debt",
    4: "Dividend",
    5: "Earnings",
    6: "Energy | Oil",
    7: "Financials",
    8: "Currencies",
    9: "General News | Opinion",
    10: "Gold | Metals | Materials",
    11: "IPO",
    12: "Legal | Regulation",
    13: "M&A | Investments",
    14: "Macro",
    15: "Markets",
    16: "Politics",
    17: "Personnel Change",
    18: "Stock Commentary",
    19: "Stock Movement",
}

## Exploring The Dataset

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16990
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4117
    })
})

# Loading The Model And Tokenizer

In [5]:
from transformers import AutoTokenizer

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
# Tokenize The Data
tokenized_datasets = {}

for split in dataset.keys():
    tokenized_datasets[split] = dataset[split].map(lambda x: tokenizer(x['text'], truncation=True, padding="max_length"), batched=True)

Map:   0%|          | 0/4117 [00:00<?, ? examples/s]

In [7]:
tokenized_datasets

{'train': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 16990
 }),
 'validation': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 4117
 })}

In [8]:
# Load The Model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=20,
                                 id2label=label_mapping)

# freeze Model Parameters
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Exploring The Model

In [9]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# PEFT

In [10]:
from peft import LoraConfig, get_peft_model, TaskType

In [11]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

2024-02-18 05:14:44.149677: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-18 05:14:44.193455: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [13]:
# Training Function

def training(model, tokenizer, datasets, compute_metrics):
    
    training_args = TrainingArguments(
        output_dir='./data',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        load_best_model_at_end=True,
        learning_rate=2e-5,
        evaluation_strategy='epoch',
        save_strategy='epoch')
    
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets["train"],
        eval_dataset=datasets["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics)

In [14]:
# Create Lora Config File
config = LoraConfig(r=10, target_modules=['q_lin', 'k_lin', 'v_lin', 'lin1', 'lin2'], 
                    lora_alpha=16, lora_dropout=0.1, bias="none", 
                    task_type=TaskType.SEQ_CLS)

In [15]:
# Load Model With PEFT Config File
lora_model = get_peft_model(model, config)

In [16]:
lora_model.print_trainable_parameters()

trainable params: 1,343,252 || all params: 68,312,104 || trainable%: 1.9663455249453303


## Evaluate Model Prior To FineTuning

In [17]:
trainer = training(model, tokenizer, tokenized_datasets, compute_metrics)

In [18]:
# Base Model Evaluations

In [19]:
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])

{'eval_loss': 3.050250768661499,
 'eval_accuracy': 0.004615010930289046,
 'eval_runtime': 31.0012,
 'eval_samples_per_second': 132.801,
 'eval_steps_per_second': 8.322}

In [20]:
df = pd.DataFrame(tokenized_datasets["validation"])
df = df[["text", "label"]]
predictions = trainer.predict(tokenized_datasets["validation"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)
df.head(100)

Unnamed: 0,text,label,predicted_label
0,Analyst call of the day for @CNBCPro subscribe...,0,10
1,"Loop upgrades CSX to buy, says it's a good pla...",0,10
2,BofA believes we're already in a recession — a...,0,10
3,JPMorgan sees these derivative plays as best w...,0,10
4,Morgan Stanley's Huberty sees Apple earnings m...,0,10
...,...,...,...
95,LISTEN NOW: Netflix reported that it lost fewe...,2,10
96,Ford's Mustang Mach-E electric crossover is a ...,2,10
97,Coinbase says it has no exposure to collapsed ...,2,10
98,GM reveals electric Chevrolet Blazer priced st...,2,10


## Train PEFT Model

In [21]:
peft_trainer = training(lora_model, tokenizer, tokenized_datasets, compute_metrics)

In [22]:
# Train
peft_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.4682,1.174579,0.660189
2,0.9529,0.820407,0.761234
3,0.7645,0.69644,0.786252
4,0.6847,0.648547,0.79864
5,0.6633,0.63218,0.802526


Checkpoint destination directory ./data/checkpoint-1062 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/checkpoint-2124 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./data/checkpoint-3186 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=5310, training_loss=0.9933307683625014, metrics={'train_runtime': 1754.9362, 'train_samples_per_second': 48.406, 'train_steps_per_second': 3.026, 'total_flos': 1.16072614514688e+16, 'train_loss': 0.9933307683625014, 'epoch': 5.0})

In [23]:
lora_model.save_pretrained("lora_model")

In [24]:
# PEFT Model Evaluations

In [25]:
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])

{'eval_loss': 0.6321803331375122,
 'eval_accuracy': 0.802526111246053,
 'eval_runtime': 32.3303,
 'eval_samples_per_second': 127.342,
 'eval_steps_per_second': 7.98}

In [26]:
df = pd.DataFrame(tokenized_datasets["validation"])
df = df[["text", "label"]]
predictions = peft_trainer.predict(tokenized_datasets["validation"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)
df.head(100)

Unnamed: 0,text,label,predicted_label
0,Analyst call of the day for @CNBCPro subscribe...,0,5
1,"Loop upgrades CSX to buy, says it's a good pla...",0,2
2,BofA believes we're already in a recession — a...,0,15
3,JPMorgan sees these derivative plays as best w...,0,2
4,Morgan Stanley's Huberty sees Apple earnings m...,0,5
...,...,...,...
95,LISTEN NOW: Netflix reported that it lost fewe...,2,2
96,Ford's Mustang Mach-E electric crossover is a ...,2,2
97,Coinbase says it has no exposure to collapsed ...,2,2
98,GM reveals electric Chevrolet Blazer priced st...,2,2


# Inference

In [29]:
from peft import AutoPeftModelForSequenceClassification

inference_model = AutoPeftModelForSequenceClassification.from_pretrained("lora_model",  num_labels=20)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
tweet1 = 'The 10 bitcoin ETFs netted +$2.3b last week. For context, that is more than any other ETF (out of 3,400) took in. $IBIT alone was #2. This brings total net to +$5b, which is more than BlackRock as a whole has taken in. Again, this is all net GBTC bleed. Throw that out and the numbers get even crazier.'

In [56]:
tweet2 = 'We have released a Research Discussion Paper - \'Do Monetary Policy and Economic Conditions Impact Innovation? Evidence from Australian Administrative Data\' - https://t.ly/OVa5E'

In [61]:
tweet3 = 'At its meeting today, the Board decided to leave the cash rate target unchanged at 4.35 per cent and the interest rate paid on Exchange Settlement balances unchanged at 4.25 per cent.'

In [58]:
def run_inf(text):
    input = tokenizer(text, 
                  padding="max_length", truncation=True, return_tensors="pt")

    logits = inference_model(**input).logits
    predictions = torch.argmax(logits,dim=1).numpy()[0]

    print(f'Tweet: {text}')
    print()
    print(f'Prediction: {predictions} ({label_mapping[predictions]})')

In [59]:
run_inf(tweet1)

Tweet: The 10 bitcoin ETFs netted +$2.3b last week. For context, that is more than any other ETF (out of 3,400) took in. $IBIT alone was #2. This brings total net to +$5b, which is more than BlackRock as a whole has taken in. Again, this is all net GBTC bleed. Throw that out and the numbers get even crazier.

Prediction: 18 (Stock Commentary)


In [60]:
run_inf(tweet2)

Tweet: We have released a Research Discussion Paper - 'Do Monetary Policy and Economic Conditions Impact Innovation? Evidence from Australian Administrative Data' - https://t.ly/OVa5E

Prediction: 14 (Macro)


In [62]:
run_inf(tweet3)

Tweet: At its meeting today, the Board decided to leave the cash rate target unchanged at 4.35 per cent and the interest rate paid on Exchange Settlement balances unchanged at 4.25 per cent.

Prediction: 1 (Fed | Central Banks)
