In [1]:
import re
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

from pathlib import Path
import torch





In [2]:


def clean_text(text):
    text = str(text)
    text = re.sub(r"http\S+", "", text)      
    text = re.sub(r"@\w+", "", text)        
    text = re.sub(r"^user:\s*", "", text, flags=re.IGNORECASE)  
    text = re.sub(r"^user\s*", "", text, flags=re.IGNORECASE)  
    text = re.sub(r"[\"]+", "", text)        
    text = re.sub(r"\s+", " ", text).strip() 
    return text



In [3]:
def lables_zero_one(y: int) -> int:
    # input in {-1, 1} -> output in {0, 1}
    return 1 if int(y) == 1 else 0

In [4]:
df = pd.read_csv("data/stock_data.csv")
df["Text"] = df["Text"].astype(str).apply(clean_text)
print(df.head)

<bound method NDFrame.head of                                                    Text  Sentiment
0     Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1
1     AAP MOVIE. 55% return for the FEA/GEED indicat...          1
2     I'd be afraid to short AMZN - they are looking...          1
3                                       MNTA Over 12.00          1
4                                         OI Over 21.37          1
...                                                 ...        ...
5786  Industry body CII said #discoms are likely to ...         -1
5787  #Gold prices slip below Rs 46,000 as #investor...         -1
5788  Workers at Bajaj Auto have agreed to a 10% wag...          1
5789  #Sharemarket LIVE: Sensex off day’s high, up 6...          1
5790  #Sensex, #Nifty climb off day's highs, still u...          1

[5791 rows x 2 columns]>


In [5]:
class FinancialSentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len: int = 128):
        # csvPath = Path(csvPath)
        # if not csvPath.exists():
        #     raise ValueError(f"CSV not found: {csvPath}")

        # df = pd.read_csv(csvPath)
        # expect columns: Text, Sentiment
        # self.texts = [clean_text(t) for t in df["Text"].astype(str).tolist()]
        # self.labels = [lables_zero_one(y) for y in df["Sentiment"].tolist()]
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True, # cut off long texts
            padding='max_length', # pad shorter text with zeroes so all have same len
            max_length=self.max_len,
            return_tensors='pt' #pytorch output
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            "label": torch.tensor(label, dtype=torch.long)
        }

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

train_dataset = FinancialSentimentDataset(
    texts=df["Text"].values,
    labels=df["Sentiment"].values,
    tokenizer=tokenizer
)


In [7]:
from sklearn.model_selection import train_test_split

# Convert labels from {-1, 1} to {0, 1} for binary classification
labels_binary = df["Sentiment"].apply(lables_zero_one).values

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Text"].values, labels_binary, test_size=0.2, random_state=67)

In [8]:
train_dataset = FinancialSentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = FinancialSentimentDataset(val_texts, val_labels, tokenizer)


In [9]:
import torch
import torch.nn as nn
from transformers import AutoModel

class FinbertBackbone(nn.Module):
    def __init__(self, modelName: str = "ProsusAI/finbert"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(modelName)
        self.hiddenSize = self.encoder.config.hidden_size  # 768 for BERT-base

    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        cls = out.last_hidden_state[:, 0]  # [CLS] token
        return cls  # [batch, hidden]
    

class BinaryHead(nn.Module):
    def __init__(self, inFeatures: int, pDrop: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(pDrop)
        self.fc = nn.Linear(inFeatures, 1)  # single logit

    def forward(self, x):
        x = self.dropout(x)
        logits = self.fc(x).squeeze(-1)    # [batch]
        return logits


class FinbertBinaryClf(nn.Module):
    def __init__(self, modelName: str = "ProsusAI/finbert", pDrop: float = 0.1):
        super().__init__()
        self.backbone = FinbertBackbone(modelName)
        self.head = BinaryHead(self.backbone.hiddenSize, pDrop)

    def forward(self, input_ids, attention_mask):
        feats = self.backbone(input_ids, attention_mask)
        logits = self.head(feats)
        return logits


In [10]:
from torch.utils.data import DataLoader, random_split
from torch.optim import AdamW
import torch.nn.functional as F
import torch
from transformers import AutoTokenizer
from sklearn.metrics import f1_score, accuracy_score

def getLoaders(texts,labels, tokenizer, maxLen=128, batchSize=64, valFrac=0.2, seed=42):
    ds = FinancialSentimentDataset(texts,labels, tokenizer, max_len=maxLen)
    valLen = int(len(ds) * valFrac)
    trainLen = len(ds) - valLen
    gen = torch.Generator().manual_seed(seed)
    trainDs, valDs = random_split(ds, [trainLen, valLen], generator=gen)
    return (
        DataLoader(trainDs, batch_size=batchSize, shuffle=True),
        DataLoader(valDs, batch_size=batchSize, shuffle=False),
    )

def step(model, batch, device, posWeight=None, train=True, optimizer=None, clip=1.0):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["label"].float().to(device)    # 0/1 as float for BCE

    logits = model(input_ids, attention_mask)
    if posWeight is None:
        loss = F.binary_cross_entropy_with_logits(logits, labels)
    else:
        bce = nn.BCEWithLogitsLoss(pos_weight=posWeight)
        loss = bce(logits, labels)

    if train:
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    probs = torch.sigmoid(logits).detach().cpu().numpy()
    preds = (probs >= 0.5).astype(int)
    y_true = labels.detach().cpu().numpy().astype(int)
    return loss.item(), preds, y_true

def runEpoch(model, loader, device, train, optimizer=None, posWeight=None):
    model.train(train)
    losses, allPreds, allTrue = [], [], []
    for batch in loader:
        loss, preds, y_true = step(model, batch, device, posWeight, train, optimizer)
        losses.append(loss)
        allPreds.extend(preds.tolist())
        allTrue.extend(y_true.tolist())
    acc = accuracy_score(allTrue, allPreds)
    f1 = f1_score(allTrue, allPreds)
    return float(sum(losses)/max(1,len(losses))), acc, f1


In [11]:
from transformers import AutoTokenizer

csvPath = "data/stock_data.csv"  # Text,Sentiment
modelName = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(modelName)

trainLoader, valLoader = getLoaders(train_texts,train_labels, tokenizer, maxLen=128, batchSize=64)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = FinbertBinaryClf(modelName, pDrop=0.1).to(device)

cuda


In [12]:


# optional: freeze encoder for warmup
for p in model.backbone.parameters():
    p.requires_grad = False

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-4, weight_decay=0.01)

# warmup head
for epoch in range(1):
    trLoss, trAcc, trF1 = runEpoch(model, trainLoader, device, train=True, optimizer=optimizer)
    vaLoss, vaAcc, vaF1 = runEpoch(model, valLoader, device, train=False)
    print(f"[warmup] {epoch}: train loss {trLoss:.4f} acc {trAcc:.3f} f1 {trF1:.3f} | val {vaLoss:.4f} {vaAcc:.3f} {vaF1:.3f}")
print("\nwarmup complete\n")
# unfreeze some/all encoder layers
for p in model.backbone.parameters():
    p.requires_grad = True
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
print("\nmain training started\n")
# main training
for epoch in range(3):
    trLoss, trAcc, trF1 = runEpoch(model, trainLoader, device, train=True, optimizer=optimizer)
    vaLoss, vaAcc, vaF1 = runEpoch(model, valLoader, device, train=False)
    print(f"epoch {epoch}: train loss {trLoss:.4f} acc {trAcc:.3f} f1 {trF1:.3f} | val {vaLoss:.4f} {vaAcc:.3f} {vaF1:.3f}")
print("\nmodel saved as finbert_custom_head\n")
torch.save(model.state_dict(), "finbert_custom_head.pt")


[warmup] 0: train loss 0.6190 acc 0.659 f1 0.756 | val 0.5692 0.711 0.806

warmup complete


main training started

epoch 0: train loss 0.4980 acc 0.761 f1 0.825 | val 0.4746 0.793 0.848
epoch 1: train loss 0.3309 acc 0.857 f1 0.890 | val 0.5059 0.789 0.845
epoch 2: train loss 0.2088 acc 0.923 f1 0.940 | val 0.6098 0.792 0.849

model saved as finbert_custom_head



In [13]:
@torch.no_grad()
def predictTexts(texts, tokenizer, model, maxLen=128, threshold=0.5, device=None):
    device = device or next(model.parameters()).device
    model.eval()
    if isinstance(texts, str):
        texts = [texts]
    enc = tokenizer(
        texts, truncation=True, padding="max_length", max_length=maxLen, return_tensors="pt"
    )
    logits = model(enc["input_ids"].to(device), enc["attention_mask"].to(device))
    probs = torch.sigmoid(logits).cpu().numpy()
    labels = (probs >= threshold).astype(int)  # 1=positive, 0=negative
    return labels, probs
