# **Purpose: Fine-tune LLMs to Predict Price Movements (Up or Down)**

In [97]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Single source of truth for the checkpoint name, so the tokenizer and the
# model are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# BERT encoder plus a 2-way sequence-classification head. The head
# (classifier.weight / classifier.bias) is NOT part of the checkpoint and is
# randomly initialised, so predictions are meaningless until fine-tuning.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,                # two output logits, one per class
    output_attentions=False,     # do not return attention maps
    output_hidden_states=False,  # do not return per-layer hidden states
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [98]:
import torch
# Smoke test: push one hand-written sentence through the (still untrained) model.
random_input = "The market is showing a positive trend today. Try to do something"

# Encode the sentence as PyTorch tensors, padded/truncated to 256 tokens.
inputs = tokenizer(random_input, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
print("Min token ID:", inputs['input_ids'].min().item())
print("Max token ID:", inputs['input_ids'].max().item())

# Put the model and every input tensor on the same device (GPU when available).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
model.to(device)

# Inference only: evaluation mode, no gradient bookkeeping.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
    print("Raw logits:", logits)

# Normalise the logits into class probabilities.
probs = torch.softmax(logits, dim=-1)
print(model.classifier)
# The predicted class is the index of the largest probability.
predicted_class = probs.argmax(dim=-1).item()
print(f"Probabilities: {probs}")
print(f"Predicted class: {predicted_class}")


Min token ID: 0
Max token ID: 9874
Raw logits: tensor([[ 0.3725, -0.3706]])
Linear(in_features=768, out_features=2, bias=True)
Probabilities: tensor([[0.6777, 0.3223]])
Predicted class: 0


In [108]:
import pandas as pd
from datasets import Dataset

def preprocess_text(text):
    """Normalise a piece of text for modelling.

    Steps: lowercase, remove punctuation, then collapse every run of
    whitespace (spaces, tabs, newlines) into a single space and trim the ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Word characters (letters, digits, underscore)
        are kept; everything else that is not whitespace is removed.
    """
    # Local import: the notebook does not import `re` in its import cells,
    # so relying on a global would raise NameError on a fresh kernel.
    import re

    text = text.lower()

    # Strip punctuation BEFORE collapsing whitespace; doing it the other way
    # round leaves double spaces behind (e.g. "a - b" -> "a  b").
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Load the raw table; the code below reads its 'Titles' column, and the
# later cells read its 'Label' column.
df = pd.read_csv('gbp_usd_dataset_ph_1.csv')
n_rows_raw = len(df)

# Drop rows with a missing value in any column, then renumber the index so the
# gaps left by the dropped rows do not leak into the Dataset as an extra
# `__index_level_0__` column.
df = df.dropna().reset_index(drop=True)
n_rows_clean = len(df)

print(n_rows_raw - n_rows_clean, 'Null records')
#df['Titles'] = df['Titles'].apply(preprocess_text)

# Replace newlines with ". " to mark the separation between the pieces of text
# inside one 'Titles' cell (literal replacement, not a regex).
df['Titles'] = df['Titles'].str.replace('\n', '. ', regex=False)
# Length of each title in CHARACTERS (not tokens).
df['title_length'] = df['Titles'].str.len()

# Check the statistics of title lengths
print(f"Average title length: {df['title_length'].mean()}")
print(f"Maximum title length: {df['title_length'].max()}")

dataset = Dataset.from_pandas(df, preserve_index=False)
def preprocess(example):
    """Normalise the ``Label`` field of one dataset row to an integer class id.

    The manual training loop in this notebook uses ``CrossEntropyLoss``, which
    expects integer class indices as targets. ``CustomTrainer`` casts labels to
    float itself for its BCE path, so integer labels serve both consumers.

    Args:
        example: Mapping for a single row; must contain a ``"Label"`` entry
            that is numeric or a numeric string (e.g. ``1``, ``0.0``, ``"1"``).

    Returns:
        The same mapping, with ``"Label"`` replaced by a Python ``int``.
    """
    # Go through float first so values such as 1.0 or "1.0" are accepted.
    example["Label"] = int(float(example["Label"]))
    return example

# Normalise the label column row by row.
dataset = dataset.map(preprocess)

# 80/20 train/validation split. The seed is fixed so that every run (and every
# reported accuracy) is computed on the same split; without it the split is
# re-drawn each time the cell executes.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset['train']
val_dataset = split_dataset['test']  # the held-out 20% is used as validation data



def tokenize_function(examples):
    """Tokenize the ``Titles`` field of a batch with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: Batch mapping that contains a ``"Titles"`` entry.

    Returns:
        Whatever the tokenizer returns for that batch of titles.
    """
    titles = examples["Titles"]
    encoded = tokenizer(titles, max_length=512, padding="max_length", truncation=True)
    return encoded

def _as_torch_split(split):
    """Tokenize one split, rename ``Label`` to ``labels`` and expose torch tensors."""
    split = split.map(tokenize_function, batched=True)
    split = split.rename_column("Label", "labels")
    # Only these three columns are returned (as torch tensors) when indexing.
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return split


# Apply the same preparation to the training and validation splits.
train_dataset = _as_torch_split(train_dataset)
val_dataset = _as_torch_split(val_dataset)


38 Null records
Average title length: 158.22803738317756
Maximum title length: 1015


Map:   0%|          | 0/1070 [00:00<?, ? examples/s]

Map:   0%|          | 0/856 [00:00<?, ? examples/s]

Map:   0%|          | 0/214 [00:00<?, ? examples/s]

In [109]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
import numpy as np
from tqdm import tqdm

# Assuming model, train_dataset, val_dataset are already defined
# Define the compute_accuracy function
def compute_accuracy(logits, labels):
    """Return the fraction of correct predictions as a Python float.

    Handles both classification-head layouts:

    * ``(batch, num_classes)`` logits with ``num_classes > 1``: the prediction
      is the argmax over the last dimension.
    * a single logit per example (shape ``(batch,)`` or ``(batch, 1)``): the
      prediction is 1 when the logit is positive, which is equivalent to
      ``sigmoid(logit) > 0.5``.

    Args:
        logits: Tensor of raw model outputs.
        labels: Tensor of 0/1 (or class-index) targets, one per example.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` for an empty batch.
    """
    logits = logits.detach()
    if logits.dim() > 1 and logits.size(-1) > 1:
        predictions = logits.argmax(dim=-1)
    else:
        # Rounding raw logits (as before) is wrong: a logit of 2.3 would be
        # turned into "class 2". Threshold at 0 instead.
        predictions = (logits.reshape(-1) > 0).long()

    targets = labels.detach().reshape(-1).to(predictions.device).long()
    if targets.numel() == 0:
        return 0.0
    return (predictions == targets).float().mean().item()

# Create DataLoaders for training and validation datasets
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

# Initialize the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The model has a 2-logit head, so the loss is cross-entropy over class indices.
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 30
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
best_val_acc = 0  # best validation accuracy (in percent) seen so far
for epoch in range(num_epochs):
    print('epoch:', epoch+1)

    # ---- Training step ----
    # Must be set every epoch: the evaluation phase below (and the earlier
    # inference cell) switch the model to eval mode, which disables dropout.
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0
    for batch in train_loader:
        optimizer.zero_grad()

        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # CrossEntropyLoss needs integer class indices as targets.
        labels = batch['labels'].to(device).long()

        # Forward pass
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = loss_fn(logits, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Running training accuracy
        preds = torch.argmax(logits, dim=-1)
        correct_preds += (preds == labels).sum().item()
        total_preds += len(labels)

    # Average training loss per batch and training accuracy in percent
    avg_train_loss = total_loss / len(train_loader)
    train_accuracy = round(100*correct_preds / total_preds, 2)

    # ---- Evaluation step ----
    model.eval()
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).long()
            # Forward pass
            logits = model(input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=-1)
            correct_preds += (preds == labels).sum().item()
            total_preds += len(labels)

    avg_val_accuracy = round(100*correct_preds / total_preds, 2)
    print(f"Training loss: {avg_train_loss:.4f}, Training accuracy: {train_accuracy:.4f}")
    print(f"Validation accuracy: {avg_val_accuracy:.4f}")

    # Keep only the weights of the best epoch (by validation accuracy).
    if avg_val_accuracy > best_val_acc:
        torch.save(model.state_dict(), 'model.pth')
        print('model saved')
        best_val_acc = avg_val_accuracy

    print(f"Epoch {epoch+1}/{num_epochs}")
    
    
    


epoch: 1
Training loss: 0.6852, Training accuracy: 60.0500
Validation accuracy: 61.6800
model saved
Epoch 1/30
epoch: 2


In [12]:
from transformers import Trainer, TrainingArguments
import torch
import torch.nn as nn

import torch.nn as nn
from transformers import Trainer, TrainingArguments

# Optional: Azure ML logging
# from azureml.core import Run
# run = Run.get_context()

class CustomTrainer(Trainer):
    """Trainer whose loss adapts to the shape of the classification head.

    * One logit per example (``num_labels=1``): binary cross-entropy with
      logits, labels cast to float.
    * Several logits per example (e.g. the ``num_labels=2`` model built in this
      notebook): cross-entropy over class indices, labels cast to long.

    Applying ``BCEWithLogitsLoss`` to flattened multi-logit output would compare
    ``batch * num_labels`` values against ``batch`` labels and fail with a
    shape error.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training/evaluation loss for one batch.

        Args:
            model: The model being trained.
            inputs: Batch mapping; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this argument; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if logits.size(-1) == 1:
            # Single-logit head: binary cross-entropy on the raw logit.
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1), labels.view(-1).float().to(logits.device))
        else:
            # Multi-logit head: labels are class indices.
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, logits.size(-1)),
                labels.view(-1).long().to(logits.device),
            )

        # Optional Azure logging
        # run.log("training_loss", loss.item())

        return (loss, outputs) if return_outputs else loss

# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` and the Trainer's `tokenizer`
# argument to `processing_class`; the original names are kept because they are
# what the logged run used. Confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",        # Evaluate at end of each epoch
    save_strategy="epoch",              # Save model at end of each epoch
    save_total_limit=2,                 # Cap checkpoints on disk; 100 epochs would otherwise keep 100 full copies
    logging_strategy="epoch",           # Log training loss once per epoch
    num_train_epochs=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=6e-5,
    weight_decay=0.01,
    logging_dir="./logs",               # TensorBoard logs
    load_best_model_at_end=True,        # Reload the checkpoint with the best metric after training
    metric_for_best_model="eval_loss",
    greater_is_better=False             # Lower eval_loss is better
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

2025-04-07 23:31:42.047364: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-07 23:31:44.263860: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744068704.970803    3167 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744068705.166920    3167 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-07 23:31:47.361881: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Epoch,Training Loss,Validation Loss
1,0.695,0.696081
2,0.6947,0.693018
3,0.694,0.693075
4,0.6958,0.693193
5,0.6951,0.69443
6,0.6943,0.695638
7,0.6943,0.693074
8,0.6925,0.697573
9,0.6972,0.693403
