# **Purpose: Fine-tune LLMs to Predict Price Movements (Up or Down) **

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "yiyanghkust/finbert-tone"  # FinBERT variant pre-trained for financial sentiment
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1, ignore_mismatched_sizes=True)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-tone and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import pandas as pd
from datasets import Dataset

df = pd.read_csv('gbp_usd_dataset_ph_1.csv')
df = df.dropna()
df['Titles'] = df['Titles'].apply(lambda x: x.replace('\n', '. '))  # Replace \n with a token to mark separation
# Calculate the length of each title in the dataset
df['title_length'] = df['Titles'].apply(lambda x: len(x))

# Check the statistics of title lengths
print(f"Average title length: {df['title_length'].mean()}")
print(f"Maximum title length: {df['title_length'].max()}")

dataset = Dataset.from_pandas(df)
def preprocess(example):
    example["Label"] = float(example["Label"])  # Cast to float
    return example

dataset = dataset.map(preprocess)

split_dataset = dataset.train_test_split(test_size=0.2)

train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    return tokenizer(examples["Titles"], truncation=True, padding="max_length", max_length=256)

# Tokenize training and validation datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.rename_column("Label", "labels")
val_dataset = val_dataset.rename_column("Label", "labels")

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Average title length: 158.22803738317756
Maximum title length: 1015


Map:   0%|          | 0/1070 [00:00<?, ? examples/s]

Map:   0%|          | 0/856 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

In [11]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Assuming model, train_dataset, val_dataset are already defined
# Define the compute_accuracy function
def compute_accuracy(logits, labels):
    predictions = np.round(logits.cpu().numpy())  # Apply sigmoid and round for binary classification
    return accuracy_score(labels.cpu().numpy(), predictions)

# Create DataLoaders for training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=6e-5, weight_decay=0.01)

# Define the loss function
loss_fn = torch.nn.BCELoss()  # For binary classification

# Training loop
num_epochs = 100
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
bast_val_acc = 0
for epoch in range(num_epochs):
    
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0
    # Training step
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Move batch to device
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        logits = torch.sigmoid(model(inputs, attention_mask=attention_mask).logits.squeeze(-1))
        
        loss = loss_fn(logits, labels.float())  # Compute loss (binary classification)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
        # Compute accuracy
        correct_preds += np.sum(np.round(logits.cpu().detach().numpy()) == labels.cpu().numpy())
        total_preds += len(labels)

    # Compute average training loss and accuracy
    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = correct_preds / total_preds
    print('train loss:',avg_train_loss)
    print('train accuracy:',train_accuracy)
    
    # Evaluation loop
    model.eval()
    val_accuracy = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Forward pass
            logits = model(inputs, attention_mask=attention_mask).logits
            logits = torch.sigmoid(logits).squeeze(-1)
            val_accuracy += compute_accuracy(logits, labels)

    avg_val_accuracy = val_accuracy / len(val_loader)
    if avg_val_accuracy > bast_val_acc:
        torch.save(model.state_dict(), 'model.pth')
        print('model saved')
        bast_val_acc = avg_val_accuracy
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training loss: {avg_train_loss:.4f}, Training accuracy: {train_accuracy:.4f}")
    print(f"Validation accuracy: {avg_val_accuracy:.4f}")
    


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

train loss: 0.6947512710205862
train accuracy: 0.48130841121495327
model saved
Epoch 1/100
Training loss: 0.6948, Training accuracy: 0.4813
Validation accuracy: 0.5139
train loss: 0.6940905390498794
train accuracy: 0.49182242990654207
Epoch 2/100
Training loss: 0.6941, Training accuracy: 0.4918
Validation accuracy: 0.4861


KeyboardInterrupt: 

In [12]:
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn

import torch.nn as nn
from transformers import Trainer, TrainingArguments

# Optional: Azure ML logging
# from azureml.core import Run
# run = Run.get_context()

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").float()
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1), labels.view(-1).to(logits.device))

        # Optional Azure logging
        # run.log("training_loss", loss.item())

        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",        # Evaluate at end of each epoch
    save_strategy="epoch",              # Save model at end of each epoch
    logging_strategy="epoch",           # Log training loss once per epoch
    num_train_epochs=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=6e-5,
    weight_decay=0.01,
    logging_dir="./logs",               # TensorBoard logs
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

2025-04-07 23:31:42.047364: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-07 23:31:44.263860: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744068704.970803    3167 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744068705.166920    3167 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-07 23:31:47.361881: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Epoch,Training Loss,Validation Loss
1,0.695,0.696081
2,0.6947,0.693018
3,0.694,0.693075
4,0.6958,0.693193
5,0.6951,0.69443
6,0.6943,0.695638
7,0.6943,0.693074
8,0.6925,0.697573
