# Finetuning SwissFinBERT

This code has been adapted from a Lightning AI template:
https://lightning.ai/docs/pytorch/1.4.0/notebooks/lightning_examples/text-transformers.html

![](figures/finetuning-ii.png)

# 1 Loading the dataset into DataFrames

In [None]:
import os.path as op

from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
import torch

from sklearn.feature_extraction.text import CountVectorizer

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset


In [None]:
df = pd.read_excel("finetuning_gpt_labelled_vfinal_balanced.xlsx") # load the dataset

In [None]:
df.columns = ["Sentence", "Label"] # rename the columns
df.head()

In [None]:
# Integer class id for each textual recommendation.
label_matchings = {"BUY": 2, "HOLD": 1, "SELL": 0}

# Drop incomplete rows *before* the lookup: a missing label (NaN) is not a key
# of `label_matchings`, so the conversion below would raise KeyError on it.
# `.copy()` makes the result an independent frame so the column assignment
# below cannot trigger a chained-assignment warning.
df = df.dropna(subset=["Sentence", "Label"]).copy()

# Convert the labels to integers; an unexpected label still fails loudly (KeyError).
df["Label"] = df["Label"].apply(lambda x: label_matchings[x])

In [None]:
df = df[["Sentence", "Label"]] # keep only the columns we need
df.dropna(inplace=True)

In [None]:
df.head()# check the first few rows

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into a training set (70 %) and a held-out remainder (30 %).
df_train, df_test_val = train_test_split(df, test_size=0.3, random_state=42)

# Split the held-out remainder evenly into test and validation sets.
df_test, df_val = train_test_split(df_test_val, test_size=0.5, random_state=42)

# Print the shapes of the resulting dataframes
for split_name, split_df in (("train", df_train), ("test", df_test), ("validation", df_val)):
    print(f"{split_name}: {split_df.shape}")

# Persist each split so it can be reloaded through `load_dataset("csv", ...)`.
df_train.to_csv("train_df.csv", index=False)
df_test.to_csv("test_df.csv", index=False)
df_val.to_csv("val_df.csv", index=False)

In [None]:
# Reload the three splits from the CSV files written in the previous cell.
df_train, df_val, df_test = map(
    pd.read_csv, ("train_df.csv", "val_df.csv", "test_df.csv")
)

# 2 Tokenization and Numericalization

**Load the dataset via `load_dataset`**

In [None]:
# Build a dataset dictionary with one entry per CSV split file.
dataset = load_dataset(
    path="csv",
    data_files=dict(
        train="train_df.csv",
        validation="val_df.csv",
        test="test_df.csv",
    ),
)

**Tokenize the dataset**

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("scherrmann/GermanFinBert_SC_Sentiment")
print(f"Tokenizer input max length: {tokenizer.model_max_length}")
print(f"Tokenizer vocabulary size: {tokenizer.vocab_size}")

In [None]:
def tokenize_text(batch):
    """Tokenize the ``"Sentence"`` entries of ``batch`` with the module-level tokenizer.

    Truncation and padding are both enabled; the tokenizer's output is
    returned unchanged.
    """
    sentences = batch["Sentence"]
    return tokenizer(sentences, truncation=True, padding=True)

In [None]:
tokenized = dataset.map(tokenize_text, batched=True, batch_size=None) # tokenize the dataset

In [None]:
del dataset # delete the original dataset to save memory

In [None]:
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "Label"])

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 Set Up DataLoaders

In [None]:
from torch.utils.data import DataLoader, Dataset

# Define the dataset class
class Dataset(Dataset):
    """Map-style dataset exposing a single split of a dataset dictionary.

    The class keeps the name ``Dataset`` (shadowing the imported base class)
    because the following cells instantiate it under that name.

    Args:
        dataset_dict: Mapping from split name to a split object that supports
            integer indexing and has a ``num_rows`` attribute.
        partition_key: Name of the split to expose. Defaults to ``"train"``.
    """

    def __init__(self, dataset_dict, partition_key="train"):
        selected_split = dataset_dict[partition_key]
        self.partition = selected_split

    def __len__(self):
        # The split reports its own size.
        return self.partition.num_rows

    def __getitem__(self, index):
        # Indexing is delegated directly to the underlying split.
        return self.partition[index]

In [None]:
# One dataset wrapper per split of the tokenized data.
train_dataset, val_dataset, test_dataset = (
    Dataset(tokenized, partition_key=split_name)
    for split_name in ("train", "validation", "test")
)

# Settings shared by all three loaders.
loader_kwargs = dict(batch_size=36, num_workers=4)

# Only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(dataset=val_dataset, **loader_kwargs)
test_loader = DataLoader(dataset=test_dataset, **loader_kwargs)

# 4 Initializing GFinBERT

In [None]:
from transformers import AutoModelForSequenceClassification

# Label scheme of this task; the ids match `label_matchings` used to encode the data.
label2id = {"SELL": 0, "HOLD": 1, "BUY": 2}
id2label = {idx: name for name, idx in label2id.items()}

# Load the model to be fine-tuned. The label mapping is attached to the config
# right away so that every later `save_pretrained` / `push_to_hub` of the model
# already carries the SELL/HOLD/BUY names instead of the base checkpoint's labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "scherrmann/GermanFinBert_SC_Sentiment",
    id2label=id2label,
    label2id=label2id,
)

# 5 Finetuning

**Wrap in LightningModule for Training**

In [None]:
import lightning as L
import torch
import torchmetrics
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Define the LightningModule, used for fine-tuning the model
class LightningModel(L.LightningModule):
    """LightningModule that fine-tunes a sequence-classification model.

    The wrapped ``model`` is called with ``input_ids``, ``attention_mask`` and
    ``labels`` and its output is indexed with ``"loss"`` and ``"logits"``.
    Batches are expected to be dicts with the keys ``"input_ids"``,
    ``"attention_mask"`` and ``"Label"``.

    Args:
        model: The classifier to fine-tune.
        learning_rate: Initial learning rate passed to Adam.
        dropout_rate: Probability of the ``self.dropout`` layer. NOTE: that
            layer is created but not applied in ``forward``; the parameter and
            attribute are retained only for backward compatibility.
        num_classes: Number of target classes used by the metrics.
    """

    def __init__(self, model, learning_rate=1e-5, dropout_rate=0.5, num_classes=3):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model
        # Not used in forward(); see the class docstring.
        self.dropout = nn.Dropout(p=dropout_rate)

        self.val_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)
        self.test_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=num_classes)
        # Macro averaging: for single-label multiclass data a micro-averaged F1
        # is numerically identical to accuracy and would just duplicate it.
        self.val_f1 = torchmetrics.classification.MulticlassF1Score(num_classes, top_k=1, average='macro')
        self.test_f1 = torchmetrics.classification.MulticlassF1Score(num_classes, top_k=1, average='macro')

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model; ``labels`` may be omitted for inference."""
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _evaluate_batch(self, batch):
        """Forward one batch and return ``(loss, predicted_labels, targets)``."""
        targets = batch["Label"]
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=targets)
        predicted_labels = torch.argmax(outputs["logits"], 1)
        return outputs["loss"], predicted_labels, targets

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        outputs = self(batch["input_ids"], attention_mask=batch["attention_mask"],
                       labels=batch["Label"])
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]

    def validation_step(self, batch, batch_idx):
        """Log validation loss, accuracy and F1 for one batch."""
        loss, predicted_labels, targets = self._evaluate_batch(batch)
        self.log("val_loss", loss, prog_bar=True)

        self.val_acc(predicted_labels, targets)
        self.log("val_acc", self.val_acc, prog_bar=True)
        self.val_f1(predicted_labels, targets)
        self.log("val_f1", self.val_f1, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log test accuracy and F1 for one batch."""
        _, predicted_labels, targets = self._evaluate_batch(batch)

        self.test_acc(predicted_labels, targets)
        self.log("accuracy", self.test_acc, prog_bar=True)
        self.test_f1(predicted_labels, targets)
        self.log("f1", self.test_f1, prog_bar=True)

    def configure_optimizers(self):
        """Return Adam plus a plateau scheduler driven by the logged ``val_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate, weight_decay=1e-5)
        # `verbose` is deprecated on PyTorch LR schedulers, so it is no longer
        # passed; the current rate can be read via `scheduler.get_last_lr()`.
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss'
            }
        }

# Wrap the loaded classifier in the LightningModule, keeping its default hyperparameters.
lightning_model = LightningModel(model=model)

In [None]:
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

# Checkpointing: keep only the single checkpoint with the highest `val_acc`.
best_checkpoint = ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)
callbacks = [best_checkpoint]

# Logging: metrics are written by a CSV logger under logs/ with the run name "my-model".
logger = CSVLogger(name="my-model", save_dir="logs/")

In [None]:
# Trainer configuration: a single GPU with 16-bit mixed precision, for 5 epochs.
trainer = L.Trainer(
    accelerator="gpu",
    devices=1,
    precision="16-mixed",
    max_epochs=5,
    callbacks=callbacks,
    logger=logger,
    log_every_n_steps=3,
)

# Fine-tune on the training loader, validating on the validation loader.
trainer.fit(
    lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

# Load the logged metrics of the run that was just trained. The logger's own
# `log_dir` is used instead of a hard-coded "/logs/.../version_40" path, which
# was absolute (filesystem root) and pinned to one specific run number.
metrics = pd.read_csv(os.path.join(trainer.logger.log_dir, "metrics.csv"))

# Plot the loss curves
fig, ax1 = plt.subplots(figsize=(10, 5))

# Plot train and validation loss against steps. Rows that carry one metric
# typically hold NaN for the other, so NaNs are dropped per curve to avoid
# gaps in the lines.
train_loss = metrics[["step", "train_loss"]].dropna()
ax1.plot(train_loss["step"], train_loss["train_loss"], label="Train Loss", color='black')
if "val_loss" in metrics.columns:
    val_loss = metrics[["step", "val_loss"]].dropna()
    ax1.plot(val_loss["step"], val_loss["val_loss"], label="Validation Loss",
             color='tab:red', marker='o')
ax1.set_xlabel("Step")
ax1.set_ylabel("Loss")
ax1.set_title("Loss Curve")
ax1.legend(loc="upper right")

# Create a secondary x-axis to show epochs: one tick at the first logged step
# of every epoch.
epoch_starts = metrics.dropna(subset=["epoch"]).drop_duplicates(subset="epoch")
ax2 = ax1.twiny()
ax2.set_xlim(ax1.get_xlim())
ax2.set_xticks(epoch_starts["step"])
ax2.set_xticklabels(epoch_starts["epoch"].astype(int))
ax2.set_xlabel("Epoch")

plt.show()

In [None]:
metrics.head()

In [None]:
trainer.test(lightning_model, dataloaders=train_loader, ckpt_path="best")

In [None]:
trainer.test(lightning_model, dataloaders=val_loader, ckpt_path="best")

In [None]:
trainer.test(lightning_model, dataloaders=test_loader, ckpt_path="best")

In [None]:
# Save the wrapped Hugging Face model (not the Lightning module) to the local "SwissFinBERT" directory.
lightning_model.model.save_pretrained("SwissFinBERT")


In [None]:
from huggingface_hub import notebook_login # interactive login helper for the Hugging Face Hub
notebook_login()

In [None]:
lightning_model.model.push_to_hub("AlGatone21/SwissFinBERT", use_temp_dir=True) # push the fine-tuned model to the Hugging Face Hub repository


In [None]:
tokenizer.push_to_hub("AlGatone21/SwissFinBERT", use_temp_dir=True) # push the tokenizer to the same Hugging Face Hub repository

In [None]:
# Set the label mappings on the model config. `id2label` is derived from
# `label2id` so the two directions cannot drift apart.
label2id = {"SELL": 0, "HOLD": 1, "BUY": 2}
lightning_model.model.config.label2id = label2id
lightning_model.model.config.id2label = {idx: name for name, idx in label2id.items()}

# Push the updated configuration to the same fully-qualified repository as the
# model weights and tokenizer; a bare "SwissFinBERT" would resolve against the
# namespace of whoever is logged in, which may be a different repository.
lightning_model.model.config.push_to_hub("AlGatone21/SwissFinBERT", use_temp_dir=True)