# **Purpose: Fine-tune LLMs to Predict Price Movements (Up or Down)**

In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint: a FinBERT variant pre-trained for financial sentiment.
model_name = "yiyanghkust/finbert-tone"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The checkpoint ships a 3-way classifier head. Requesting a single output makes
# the shapes mismatch, so the head is re-initialised (ignore_mismatched_sizes
# permits this) and must be fine-tuned before predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    ignore_mismatched_sizes=True,
)

  return torch.load(checkpoint_file, map_location=map_location)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-tone and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
import pandas as pd
from datasets import Dataset

# Load the headline dataset from the working directory.
df = pd.read_csv('gbp_usd_dataset.csv')

# Replace embedded newlines with '. ' so the separation between titles
# survives as a sentence boundary in the text fed to the tokenizer.
df['Titles'] = df['Titles'].str.replace('\n', '. ', regex=False)

# Length of each entry in CHARACTERS (not tokens); used only for the summary
# statistics printed below.
df['title_length'] = df['Titles'].str.len()

# Check the statistics of title lengths
print(f"Average title length: {df['title_length'].mean()}")
print(f"Maximum title length: {df['title_length'].max()}")

dataset = Dataset.from_pandas(df)

# 80/20 train/validation split. The seed is fixed so that the split (and
# therefore every reported metric) is reproducible across kernel restarts.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

# Re-create the tokenizer so this cell does not depend on the model-loading
# cell having been executed first.
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "Titles" field of a batch with the module-level tokenizer.

    Sequences longer than 512 tokens are truncated and shorter ones are padded
    up to 512, so every encoded example has the same length.

    Args:
        examples: Mapping with a "Titles" entry holding the text(s) to encode.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["Titles"], **encode_options)

# Encode each split in batches, then expose the target under the name
# "labels" so both splits share the same column layout.
train_dataset = (
    train_dataset
    .map(tokenize_function, batched=True)
    .rename_column("Label", "labels")
)
val_dataset = (
    val_dataset
    .map(tokenize_function, batched=True)
    .rename_column("Label", "labels")
)

# Return only the model inputs and the target, as torch tensors.
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)
val_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


Average title length: 551.8222021660649
Maximum title length: 3023


Map:   0%|          | 0/886 [00:00<?, ? examples/s]

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [33]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Assuming model, train_dataset, val_dataset are already defined

# Define the compute_accuracy function
def compute_accuracy(logits, labels):
    """Return the accuracy of rounded scores against the reference labels.

    No sigmoid is applied here: the values in ``logits`` are rounded to the
    nearest integer exactly as given. Despite the parameter name, callers are
    therefore expected to pass scores already mapped into [0, 1].

    Args:
        logits: Tensor of per-example scores.
        labels: Tensor of reference labels.

    Returns:
        The value computed by ``accuracy_score`` for the rounded scores.
    """
    rounded_scores = np.round(logits.cpu().numpy())
    reference = labels.cpu().numpy()
    return accuracy_score(reference, rounded_scores)

# Create DataLoaders for training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Binary classification loss. BCEWithLogitsLoss applies the sigmoid itself,
# so it must be fed the RAW model outputs, never probabilities.
loss_fn = torch.nn.BCEWithLogitsLoss()

# Training loop
num_epochs = 3
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    correct_preds = 0
    total_preds = 0

    # Training step
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        # Move batch to device
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass: keep the raw logits, dropping the trailing size-1 dim
        # so they line up with the 1-D label tensor.
        logits = model(inputs, attention_mask=attention_mask).logits.squeeze(-1)
        loss = loss_fn(logits, labels.float())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Accuracy: apply the sigmoid exactly once (outside the autograd
        # graph) and round the probability to the nearest class.
        probs = torch.sigmoid(logits.detach())
        correct_preds += (torch.round(probs) == labels).sum().item()
        total_preds += len(labels)

    # Compute average training loss and accuracy
    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = correct_preds / total_preds
    print('train loss:',avg_train_loss)
    print('train accuracy:',train_accuracy)

    # Evaluation loop
    model.eval()
    val_correct = 0.0
    val_total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass; compute_accuracy rounds its input as-is, so hand
            # it probabilities rather than raw logits.
            logits = model(inputs, attention_mask=attention_mask).logits.squeeze(-1)
            probs = torch.sigmoid(logits)

            # Weight each batch by its size so a smaller final batch does not
            # skew the overall validation accuracy.
            val_correct += compute_accuracy(probs, labels) * len(labels)
            val_total += len(labels)

    avg_val_accuracy = val_correct / val_total

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training loss: {avg_train_loss:.4f}, Training accuracy: {train_accuracy:.4f}")
    print(f"Validation accuracy: {avg_val_accuracy:.4f}")


  5%|▍         | 5/111 [01:11<22:55, 12.98s/it]

In [None]:
from transformers import Trainer, TrainingArguments

# Trainer configuration, using the same batch size, learning rate, weight
# decay and epoch count as the hand-written loop.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# NOTE(review): `model` is whatever object is currently bound in the kernel; if
# the manual training loop was run first, this continues from those weights.
# NOTE(review): no loss is supplied here, so the model's built-in loss is used.
# With num_labels=1 that is presumably a regression-style loss rather than the
# BCEWithLogitsLoss of the manual loop — TODO confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()