Performing sentiment analysis
Dataset used: https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment
sentiments = {
    "LABEL_0": "Bearish", 
    "LABEL_1": "Bullish", 
    "LABEL_2": "Neutral"
} 

In [1]:
import re
import os
import csv
import torch
import random
import string
import numpy as np
import unicodedata
import contractions
import pandas as pd
from io import StringIO
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer, Trainer, TrainingArguments, DistilBertModel, DistilBertTokenizer, RobertaModel, RobertaTokenizer, EarlyStoppingCallback
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

In [4]:
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
def tokenizer_function(texts, tokenizer):
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length = 128,
        return_tensors='pt'  
    )

In [6]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
train_data = train_data.dropna(subset=['text', 'label'])
train_data


Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0
...,...,...
9538,The Week's Gainers and Losers on the Stoxx Eur...,2
9539,Tupperware Brands among consumer gainers; Unil...,2
9540,vTv Therapeutics leads healthcare gainers; Myo...,2
9541,"WORK, XPO, PYX and AMKR among after hour movers",2


In [7]:
train_data_feature = train_data['text'].values
train_data_label = train_data['label'].values
print("train_data_feature.shape:", train_data_feature.shape)

train_data_feature.shape: (9543,)


In [8]:
test_data_feature = test_data['text'].values
test_data_label = test_data['label'].values
print("test_data_feature.shape:", test_data_feature.shape)


test_data_feature.shape: (2388,)


In [19]:
train_encodings = tokenizer_function(train_data_feature.tolist(), distilbert_tokenizer)
val_encoding = tokenizer_function(test_data_feature.tolist()[:1000], distilbert_tokenizer)
test_encodings = tokenizer_function(test_data_feature.tolist(), distilbert_tokenizer)

In [20]:
train_dataset = SentimentDataset(train_encodings, train_data_label.tolist())
val_dataset = SentimentDataset(val_encoding, test_data_label.tolist()[:1000])
test_dataset = SentimentDataset(test_encodings, test_data_label.tolist())


In [12]:
class CustomBertModel(nn.Module):
    def __init__(self, pretrained_model, num_classes):
        super(CustomBertModel, self).__init__()
        self.bert = pretrained_model
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()  # Multi-class loss

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # CLS token
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels.long())  # labels should be class indices (int)
        return {'loss': loss, 'logits': logits}

pre_trained_distilbert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model = CustomBertModel(pre_trained_distilbert_model, num_classes = 3).to(device)


In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)  # <--- Correct for multiclass

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [14]:
training_args = TrainingArguments(
    output_dir="./bertResults",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    # warmup_ratio=0.06,  # Prevent aggressive weight updates early
    # gradient_accumulation_steps=2,  # Simulate larger batch without increasing memory
    # fp16=True,  # Use mixed precision training if available
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [16]:
trainer.train()

trainer.save_model("/models/centralized_bert_model")

# output the training time
training_time = trainer.state.log_history[-1]['train_runtime']
print(f"Training time: {training_time * 60} minutes")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4611,0.433496,0.838,0.839609,0.838,0.838412
2,0.3218,0.387157,0.868,0.86625,0.868,0.8661
3,0.2463,0.398612,0.869,0.868427,0.869,0.868681
4,0.1556,0.473238,0.872,0.870823,0.872,0.871228
5,0.1277,0.536488,0.872,0.871022,0.872,0.871333
6,0.0852,0.616849,0.876,0.874773,0.876,0.874287
7,0.0894,0.664598,0.877,0.876049,0.877,0.876001
8,0.038,0.721075,0.871,0.870274,0.871,0.869223
9,0.0371,0.729405,0.88,0.878603,0.88,0.878713
10,0.0401,0.730904,0.882,0.881067,0.882,0.881005


Training time: 73022.922 minutes


In [21]:
distilbert_test_results = trainer.evaluate(test_dataset)
print(f'Test Results: {distilbert_test_results}')

Test Results: {'eval_loss': 0.7099828124046326, 'eval_accuracy': 0.8806532663316583, 'eval_precision': 0.8821743626009017, 'eval_recall': 0.8806532663316583, 'eval_f1': 0.8811056535334082, 'eval_runtime': 8.7868, 'eval_samples_per_second': 271.772, 'eval_steps_per_second': 17.071, 'epoch': 10.0}


In [22]:
print(f"Training time: {training_time / 60} minutes")

Training time: 20.284145000000002 minutes
