In [2]:
import pandas as pd
# Read the pre-sampled financial-tweet CSV into a DataFrame and preview it.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/twitter-financial-news-sentiment/samples/sample1.csv"
)
df.head(n=5)

Unnamed: 0,text,label
0,Copa Holdings stock price target raised to $13...,1
1,News Corporation (NWS): Hedge Funds Are Snappi...,1
2,$ALXN: Alexion Pharma issues statement in rega...,2
3,The Federal Reserve on Monday rolled out an ex...,2
4,Stock Market Update: Stock market lifted by me...,1


In [4]:
import re

def process_source_links(row):
    """Strip link tokens from a row's text and flag whether any were present.

    Args:
        row: Mapping-like row (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) holding a string under ``'text'``.

    Returns:
        The same ``row`` object, modified in place. ``'has_source'`` is set to
        1 when at least one ``http...`` token was removed from ``'text'`` and
        to 0 otherwise. When links are removed, the remaining text is also
        stripped of leading/trailing whitespace; otherwise ``'text'`` is left
        untouched.
    """
    # Detection and removal share a single pattern so they cannot disagree.
    # Previously the flag was keyed on the substring 'https' while the
    # substitution matched any 'http...' token, so plain 'http://' links were
    # left in the text and reported as "no source".
    cleaned, n_links = re.subn(r'http\S+', '', row['text'])
    if n_links:
        row['text'] = cleaned.strip()
        row['has_source'] = 1
    else:
        row['has_source'] = 0
    return row

# Run the link cleanup on every row; the resulting frame carries the extra
# 'has_source' column that process_source_links sets.
# NOTE: `df` is rebound here, so re-running this cell feeds the already
# cleaned text back through the function.
df = df.apply(process_source_links, axis="columns")

In [26]:
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

class FinancialTweetsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Each item is a dict with the keys ``'input_ids'``, ``'attention_mask'``,
    ``'has_source'`` (float tensor) and ``'labels'`` (long tensor).
    """

    def __init__(self, texts, has_source, labels, tokenizer, max_length=100):
        """Store the parallel sequences and the tokenizer.

        Args:
            texts: Indexable sequence of raw text strings.
            has_source: Indexable sequence of numeric flags, parallel to ``texts``.
            labels: Indexable sequence of integer class labels, parallel to ``texts``.
            tokenizer: Callable accepting ``text`` plus the keyword arguments
                ``padding``, ``truncation``, ``max_length`` and
                ``return_tensors`` and returning a mapping with
                ``'input_ids'`` and ``'attention_mask'`` tensors.
            max_length: Fixed token length every item is padded/truncated to.
        """
        self.texts = texts
        self.has_source = has_source
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and bundle it with its flag and label."""
        text = self.texts[idx]
        label = self.labels[idx]
        has_source = self.has_source[idx]

        # truncation=True is required alongside padding="max_length": without
        # it, texts longer than max_length produce longer tensors and the
        # default DataLoader collation cannot stack the batch.
        encoding = self.tokenizer(text,
                                  padding="max_length",
                                  truncation=True,
                                  max_length=self.max_length,
                                  return_tensors='pt')

        return {
            # squeeze(0) drops only the leading batch dimension added by
            # return_tensors='pt'; a bare squeeze() would also collapse the
            # sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'has_source': torch.tensor(has_source, dtype=torch.float),
            'labels': torch.tensor(label, dtype=torch.long)
        }


def split_and_get_loaders(data, ratio=0.33, batch_size=32, tokenizer='bert'):
    """Split ``data`` into train/validation parts and wrap each in a DataLoader.

    Args:
        data: DataFrame with ``'text'``, ``'has_source'`` and ``'label'`` columns.
        ratio: Fraction of rows held out for validation (passed as
            ``test_size`` to ``train_test_split``).
        batch_size: Batch size used by both loaders.
        tokenizer: Name of the tokenizer to use. Only ``'bert'`` is supported;
            it loads the ``'bert-base-uncased'`` tokenizer.

    Returns:
        ``(train_dataloader, val_dataloader)``. The training loader shuffles,
        the validation loader does not.

    Raises:
        ValueError: If ``tokenizer`` is not a supported name.
    """
    # Fail fast with a clear message. Previously an unsupported name fell
    # through every branch and the function returned None, which only
    # surfaced as an unpacking error at the call site.
    if tokenizer != "bert":
        raise ValueError(
            f"Unsupported tokenizer {tokenizer!r}; only 'bert' is supported."
        )

    X = data.drop(columns=['label'])
    y = data['label']
    # Fixed random_state keeps the split identical across runs.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=ratio,
                                                        random_state=42)

    tok = AutoTokenizer.from_pretrained('bert-base-uncased')

    train_dataset = FinancialTweetsDataset(X_train['text'].tolist(),
                                           X_train['has_source'].tolist(),
                                           y_train.tolist(), tok)
    val_dataset = FinancialTweetsDataset(X_test['text'].tolist(),
                                         X_test['has_source'].tolist(),
                                         y_test.tolist(), tok)

    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=batch_size,
                                shuffle=False)
    return train_dataloader, val_dataloader

In [27]:
# Build the train/validation loaders from the processed frame, relying on
# the function's default split ratio, batch size and tokenizer.
train_loader, val_loader = split_and_get_loaders(data=df)



In [28]:
import torch.nn as nn
from transformers import BertModel

class SentimentAnalysisModel(nn.Module):
    """BERT encoder plus a linear head that also sees the ``has_source`` flag.

    The pooled BERT output is concatenated with the scalar ``has_source``
    feature, passed through dropout, and projected to ``num_labels`` logits.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_labels=3, dropout=0.3):
        """Build the encoder and the classification head.

        Args:
            bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
            num_labels: Number of output classes.
            dropout: Dropout probability applied to the concatenated features
                before the linear layer (defaults to the previously
                hard-coded 0.3).
        """
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)

        # +1 input feature for the has_source flag appended to the pooled output.
        self.linear1 = nn.Linear(self.bert.config.hidden_size + 1, num_labels)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, has_source):
        """Return unnormalised class logits.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Attention mask tensor forwarded to the BERT encoder.
            has_source: Per-example flag, shaped ``(batch,)`` or ``(batch, 1)``.

        Returns:
            Tensor of logits with ``num_labels`` columns, one row per example.
        """
        embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        # reshape(-1, 1) yields the column vector needed for concatenation and,
        # unlike unsqueeze(1), also accepts an input that is already (batch, 1).
        has_source = has_source.reshape(-1, 1)
        combined_input = torch.cat((embeddings, has_source), dim=1)

        regularized = self.dropout(combined_input)
        logits = self.linear1(regularized)

        return logits


In [29]:
from tqdm import tqdm

def train_one_epoch(model, dataloader, optimizer, criterion, device, epoch):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``has_source`` keyword arguments, returning logits.
        dataloader: Sized iterable of batches; each batch is a dict with the
            keys ``'input_ids'``, ``'attention_mask'``, ``'has_source'`` and
            ``'labels'``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only in the progress-bar caption.

    Returns:
        The sample-weighted mean training loss over the epoch as a float
        (``nan`` if the loader yielded no samples). Earlier versions returned
        nothing, so existing callers that ignore the result are unaffected.
    """
    model.train()
    running_loss = 0.0
    seen = 0

    loop = tqdm(
        dataloader,
        total=len(dataloader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        has_source = batch['has_source'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask, has_source=has_source)

        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so a smaller final batch does
        # not skew the running mean.
        n_batch = labels.size(0)
        running_loss += loss.item() * n_batch
        seen += n_batch

        loop.set_postfix({"loss": running_loss / seen})

    return running_loss / seen if seen else float('nan')


def val_one_epoch(model, dataloader, criterion, device, epoch, best_so_far, ckpt_path='best.pt'):
    """Evaluate ``model`` on ``dataloader`` and checkpoint it on improvement.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``has_source`` keyword arguments, returning logits.
        dataloader: Sized iterable of batches; each batch is a dict with the
            keys ``'input_ids'``, ``'attention_mask'``, ``'has_source'`` and
            ``'labels'``.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only in the progress-bar caption.
        best_so_far: Best validation accuracy seen in earlier epochs.
        ckpt_path: Where ``model.state_dict()`` is saved when the accuracy
            beats ``best_so_far``.

    Returns:
        The updated best accuracy: this epoch's accuracy if it is strictly
        greater than ``best_so_far``, otherwise ``best_so_far`` unchanged
        (also when the loader yields no samples).
    """
    import os  # local import: only needed to prepare the checkpoint directory

    model.eval()
    val_loss = 0.
    correct = 0
    total = 0
    with torch.no_grad():
        loop = tqdm(
            dataloader,
            total=len(dataloader),
            desc=f"Epoch {epoch}: val",
            leave=True,
        )
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            has_source = batch['has_source'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask, has_source=has_source)

            loss = criterion(logits, labels)
            n_batch = labels.size(0)
            val_loss += loss.item() * n_batch

            # Predicted class is the index of the largest logit.
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()

            total += n_batch

            loop.set_postfix({"loss": val_loss/total, "acc": correct / total})

    # Nothing was evaluated: there is no accuracy to compare, and dividing by
    # total would raise ZeroDivisionError.
    if total == 0:
        return best_so_far

    current_acc = correct / total
    if current_acc > best_so_far:
        print(f"Validation accuracy improved from {best_so_far:.4f} to {current_acc:.4f}. Saving model...")
        # torch.save does not create missing parent directories, so make sure
        # the target folder exists. Skipped for non-path targets such as
        # file-like objects.
        if isinstance(ckpt_path, (str, os.PathLike)):
            ckpt_dir = os.path.dirname(ckpt_path)
            if ckpt_dir:
                os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(model.state_dict(), ckpt_path)
        best_so_far = current_acc
    return best_so_far



In [30]:
import os

import torch.optim as optim

epochs = 10

# Prefer Apple's MPS backend (the original hard-coded choice) but fall back
# to CUDA or CPU so the cell also runs on machines without MPS. getattr
# guards against torch builds that do not ship torch.backends.mps.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The misspelled name is kept because other cells may reference it.
chechpoint_path = 'models/best.pt'
# torch.save does not create missing directories; make sure 'models/' exists.
os.makedirs(os.path.dirname(chechpoint_path), exist_ok=True)

# Seed torch's global RNG so head initialisation, dropout and DataLoader
# shuffling are repeatable across a fresh Restart & Run All.
torch.manual_seed(42)

model = SentimentAnalysisModel(bert_model_name='bert-base-uncased', num_labels=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Best validation accuracy so far; val_one_epoch saves a checkpoint whenever
# an epoch beats it.
best_so_far = 0.

for epoch in range(epochs):
    train_one_epoch(model, train_loader, optimizer, criterion, device, epoch)
    best_so_far = val_one_epoch(model, val_loader, criterion, device, epoch, best_so_far, chechpoint_path)

Epoch 0: train: 100%|██████████| 106/106 [00:37<00:00,  2.83it/s, loss=0.882]
Epoch 0: val: 100%|██████████| 52/52 [00:06<00:00,  8.33it/s, loss=0.587, acc=0.782]


Validation accuracy improved from 0.0000 to 0.7818. Saving model...


Epoch 1: train: 100%|██████████| 106/106 [00:37<00:00,  2.86it/s, loss=0.501]
Epoch 1: val: 100%|██████████| 52/52 [00:06<00:00,  8.58it/s, loss=0.58, acc=0.787] 


Validation accuracy improved from 0.7818 to 0.7866. Saving model...


Epoch 2: train: 100%|██████████| 106/106 [00:36<00:00,  2.92it/s, loss=0.308]
Epoch 2: val: 100%|██████████| 52/52 [00:05<00:00,  8.88it/s, loss=0.52, acc=0.817] 


Validation accuracy improved from 0.7866 to 0.8168. Saving model...


Epoch 3: train: 100%|██████████| 106/106 [00:36<00:00,  2.87it/s, loss=0.18] 
Epoch 3: val: 100%|██████████| 52/52 [00:05<00:00,  8.74it/s, loss=0.538, acc=0.835]


Validation accuracy improved from 0.8168 to 0.8348. Saving model...


Epoch 4: train: 100%|██████████| 106/106 [00:37<00:00,  2.84it/s, loss=0.114]
Epoch 4: val: 100%|██████████| 52/52 [00:06<00:00,  8.62it/s, loss=0.616, acc=0.826]
Epoch 5: train: 100%|██████████| 106/106 [00:37<00:00,  2.83it/s, loss=0.0726]
Epoch 5: val: 100%|██████████| 52/52 [00:06<00:00,  8.35it/s, loss=0.653, acc=0.827]
Epoch 6: train: 100%|██████████| 106/106 [00:37<00:00,  2.79it/s, loss=0.058] 
Epoch 6: val: 100%|██████████| 52/52 [00:06<00:00,  8.32it/s, loss=0.704, acc=0.826]
Epoch 7: train: 100%|██████████| 106/106 [00:38<00:00,  2.77it/s, loss=0.0427]
Epoch 7: val: 100%|██████████| 52/52 [00:06<00:00,  8.28it/s, loss=0.748, acc=0.819]
Epoch 8: train: 100%|██████████| 106/106 [00:39<00:00,  2.68it/s, loss=0.0321]
Epoch 8: val: 100%|██████████| 52/52 [00:06<00:00,  7.87it/s, loss=0.709, acc=0.829]
Epoch 9: train: 100%|██████████| 106/106 [00:42<00:00,  2.51it/s, loss=0.0219]
Epoch 9: val: 100%|██████████| 52/52 [00:07<00:00,  7.34it/s, loss=0.836, acc=0.828]
