In [1]:
import numpy as np
import pandas as pd

# Loading The Dataset

In [2]:
from datasets import load_dataset
# Identifier handed to `load_dataset`; kept in one place so it is easy to swap.
DATASET_ID = "zeroshot/twitter-financial-news-topic"
dataset = load_dataset(DATASET_ID)

In [3]:
# Labels
# Maps each integer class id (0-19) to its topic name. The names are listed in
# id order, so `enumerate` assigns every name its position as the id.
label_mapping = dict(enumerate([
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]))

## Exploring The Dataset

In [5]:
# Display the loaded dataset object to inspect its splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16990
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4117
    })
})

# Loading The Model And Tokenizer

In [6]:
from transformers import AutoTokenizer

# Load Tokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [7]:
# Tokenize The data
def tokenize_batch(batch):
    """Tokenize the ``text`` field of a batch, truncating over-long inputs.

    No padding is applied here on purpose: the ``Trainer`` is built with a
    ``DataCollatorWithPadding``, which pads each batch to the length of its
    longest sequence. Padding every example to the tokenizer's maximum length
    up front would only inflate memory use and compute for short texts.

    Args:
        batch: a batch of examples exposing a ``"text"`` entry (a list of
            strings when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch; ``Dataset.map`` adds its entries
        as new columns.
    """
    return tokenizer(batch["text"], truncation=True)


# One tokenized dataset per split, keyed by the split name.
tokenized_datasets = {
    split: dataset[split].map(tokenize_batch, batched=True)
    for split in dataset.keys()
}

In [8]:
# Display the per-split tokenized datasets to check the columns added by tokenization.
tokenized_datasets

{'train': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 16990
 }),
 'validation': Dataset({
     features: ['text', 'label', 'input_ids', 'attention_mask'],
     num_rows: 4117
 })}

In [9]:
# Load The Model
from transformers import AutoModelForSequenceClassification

# The label count is derived from `label_mapping` so the two can never drift
# apart. `label2id` is passed alongside `id2label` so the model config carries
# the name -> id mapping as well as the id -> name one.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_mapping),
    id2label=label_mapping,
    label2id={name: idx for idx, name in label_mapping.items()},
)

# freeze Model Parameters
# Only the parameters under `model.base_model` are frozen; parameters that
# live outside it are left untouched.
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Exploring The Model

In [10]:
# Print the model's module tree to inspect its layers (e.g. the attention projection names).
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# PEFT

In [18]:
from peft import LoraConfig, get_peft_model, TaskType

In [25]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

In [30]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example and ``labels`` the matching
            class ids.

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of examples
        whose highest-scoring class equals the label.
    """
    scores, labels = eval_pred
    # The predicted class is the index of the largest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = (predicted_ids == labels).mean()
    return {"accuracy": accuracy}

In [26]:
# Training Function

def trainer(model, tokenizer, datasets, compute_metrics):
    """Build a ``Trainer`` for ``model`` over the train and validation splits.

    Args:
        model: the model handed to the ``Trainer``.
        tokenizer: tokenizer passed to the ``Trainer`` and to the padding
            collator.
        datasets: mapping with ``"train"`` and ``"validation"`` entries.
        compute_metrics: metrics callback forwarded to the ``Trainer``.

    Returns:
        The configured ``Trainer``; neither training nor evaluation is started
        here.
    """
    # One epoch, evaluating and checkpointing at the end of each epoch.
    run_config = TrainingArguments(
        output_dir='./data',
        learning_rate=2e-5,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
    )
    batch_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    return Trainer(
        model=model,
        args=run_config,
        data_collator=batch_collator,
        train_dataset=datasets["train"],
        eval_dataset=datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [2]:
# Create Lora Config File
# LoRA settings for a sequence-classification task. The adapters target the
# modules named q_lin, k_lin and v_lin.
config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=10,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=['q_lin', 'k_lin', 'v_lin'],
)

NameError: name 'LoraConfig' is not defined

In [28]:
# Load Model With PEFT Config File
# Wrap the classification model according to the LoRA settings in `config`.
# NOTE(review): the PEFT docs state that get_peft_model modifies the given
# model in place — confirm whether any later use of `model` is meant to see
# the adapter-injected network rather than the untouched base model.
lora_model = get_peft_model(model, config)

In [29]:
# Report the trainable parameter count, the total parameter count and the trainable percentage.
lora_model.print_trainable_parameters()

trainable params: 882,452 || all params: 67,851,304 || trainable%: 1.3005674879881455


## Evaluate Model Prior To FineTuning

In [31]:
trainer = trainer(model, tokenizer, tokenized_datasets, compute_metrics)

In [None]:
# Base Model Evaluations

In [None]:
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])

In [None]:
df = pd.DataFrame(tokenized_datasets["validation"])
df = df[["text", "label"]]
predictions = peft_trainer.predict(tokenized_datasets["validation"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)
df.head(10)

## Train PEFT Model

In [None]:
peft_trainer = trainer(lora_model, tokenizer, tokenized_datasets, compute_metrics)

In [None]:
# Train
peft_trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# PEFT Model Evaluations

In [None]:
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])

In [None]:
df = pd.DataFrame(tokenized_datasets["validation"])
df = df[["text", "label"]]
predictions = peft_trainer.predict(tokenized_datasets["validation"])
df["predicted_label"] = np.argmax(predictions[0], axis=1)
df.head(10)