In [2]:
import pandas as pd
# Read the sample1.csv split of the financial-news tweets and preview it.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/twitter-financial-news-sentiment/samples/sample1.csv",
)
df.head(n=5)

Unnamed: 0,text,label
0,Copa Holdings stock price target raised to $13...,1
1,News Corporation (NWS): Hedge Funds Are Snappi...,1
2,$ALXN: Alexion Pharma issues statement in rega...,2
3,The Federal Reserve on Monday rolled out an ex...,2
4,Stock Market Update: Stock market lifted by me...,1


In [4]:
import re

def process_source_links(row):
    """Strip links from a row's text and flag whether any were present.

    Args:
        row: Mapping-like record (for example a DataFrame row handed over by
            ``df.apply(..., axis=1)``) whose ``'text'`` entry is a string.

    Returns:
        The same ``row`` object. ``'has_source'`` is set to 1 when at least
        one ``http...`` token was removed from ``'text'`` and to 0 otherwise.
        ``'text'`` is only rewritten (and whitespace-stripped) when a link
        was actually removed.
    """
    # Detect links with the very pattern that removes them, so plain
    # ``http://`` URLs are stripped and flagged too (a bare ``'https'``
    # substring check would miss them and leave the URL in the text).
    cleaned, n_links = re.subn(r'http\S+', '', row['text'])
    if n_links:
        row['text'] = cleaned.strip()
        row['has_source'] = 1
    else:
        row['has_source'] = 0
    return row

# Clean every row and attach the `has_source` flag.
# NOTE: this overwrites `df`; running the cell a second time on the already
# cleaned frame finds no links any more and therefore resets every
# `has_source` to 0. Reload the CSV before re-running.
df = df.apply(func=process_source_links, axis="columns")

In [26]:
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

class FinancialTweetsDataset(Dataset):
    """Map-style dataset yielding tokenized tweets plus a side feature and label.

    Args:
        texts: Indexable, sized sequence of tweet strings.
        has_source: Indexable sequence of numeric flags aligned with ``texts``.
        labels: Indexable sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable (e.g. a Hugging Face tokenizer) that accepts
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``
            and returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Token length every sample is padded or truncated to.
    """

    def __init__(self, texts, has_source, labels, tokenizer, max_length=100):
        self.texts = texts
        self.has_source = has_source
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (driven by ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch axis
            removed), ``'has_source'`` as a float scalar tensor and
            ``'labels'`` as a long scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        has_source = self.has_source[idx]

        # truncation=True is required alongside padding="max_length":
        # without it, texts longer than max_length keep their full length
        # and the default DataLoader collate cannot stack the batch.
        encoding = self.tokenizer(text,
                                  padding="max_length",
                                  truncation=True,
                                  max_length=self.max_length,
                                  return_tensors='pt')

        return {
            # squeeze(0) drops only the leading batch axis added by
            # return_tensors='pt'; a bare squeeze() would also collapse the
            # sequence axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'has_source': torch.tensor(has_source, dtype=torch.float),
            'labels': torch.tensor(label, dtype=torch.long)
        }


def split_and_get_loaders(data, ratio=0.33, batch_size=32, tokenizer='bert',
                          random_state=42, max_length=100):
    """Split a labelled frame and wrap both parts in DataLoaders.

    Args:
        data: DataFrame with ``'text'``, ``'has_source'`` and ``'label'`` columns.
        ratio: Fraction of rows held out for validation.
        batch_size: Batch size used by both loaders.
        tokenizer: Key selecting the tokenizer; only ``'bert'``
            (``bert-base-uncased``) is supported.
        random_state: Seed forwarded to ``train_test_split``.
        max_length: Token length forwarded to ``FinancialTweetsDataset``.

    Returns:
        ``(train_dataloader, val_dataloader)``; only the training loader
        shuffles.

    Raises:
        ValueError: If ``tokenizer`` is not a supported key.
    """
    checkpoints = {'bert': 'bert-base-uncased'}
    # Fail loudly and early: an unknown key used to fall through and return
    # None, which only surfaced later as a confusing unpacking error.
    if tokenizer not in checkpoints:
        raise ValueError(
            f"Unsupported tokenizer {tokenizer!r}; expected one of {sorted(checkpoints)}"
        )

    X = data.drop(columns=['label'])
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=ratio,
                                                        random_state=random_state)

    tok = AutoTokenizer.from_pretrained(checkpoints[tokenizer])

    train_dataset = FinancialTweetsDataset(X_train['text'].tolist(),
                                           X_train['has_source'].tolist(),
                                           y_train.tolist(), tok,
                                           max_length=max_length)
    val_dataset = FinancialTweetsDataset(X_test['text'].tolist(),
                                         X_test['has_source'].tolist(),
                                         y_test.tolist(), tok,
                                         max_length=max_length)

    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=batch_size,
                                shuffle=False)
    return train_dataloader, val_dataloader

In [7]:
from pipline_extract import extract_latest_loaders

# Fetch the loaders through the project helper, then unpack the two splits
# it exposes under the 'train' and 'validation' keys.
dataloaders = extract_latest_loaders()
train_loader, val_loader = dataloaders['train'], dataloaders['validation']

Pipeline artifact [: 1c535be5-fc01-42c9-b1bd-a82f30a727c8] loaded successfully


In [8]:
import torch.nn as nn
from transformers import BertModel

class SentimentAnalysisModel(nn.Module):
    """BERT encoder with a linear head over ``[pooled output, has_source]``.

    Args:
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_labels: Number of output classes (size of the logits vector).
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_labels=3, dropout=0.3):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)

        # +1 input feature for the scalar has_source flag appended in forward().
        self.linear1 = nn.Linear(self.bert.config.hidden_size + 1, num_labels)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask, has_source):
        """Compute class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
            has_source: Per-sample flag, shaped ``(batch,)`` or ``(batch, 1)``.

        Returns:
            Tensor of unnormalised logits, one row per sample and
            ``num_labels`` columns.
        """
        embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        # reshape(-1, 1) yields a column for both (batch,) and (batch, 1)
        # inputs, whereas unsqueeze(1) broke on an already 2-D flag.
        has_source = has_source.reshape(-1, 1)
        combined_input = torch.cat((embeddings, has_source), dim=1)

        # Dropout is applied to the concatenated vector, i.e. to the
        # has_source feature as well as to the pooled embedding.
        regularized = self.dropout(combined_input)
        logits = self.linear1(regularized)

        return logits


In [10]:
from tqdm import tqdm

def train_one_epoch(model, dataloader, optimizer, criterion, device, epoch):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``has_source`` keyword arguments; returns logits.
        dataloader: Sized iterable of dict batches with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'has_source'``, ``'labels'``.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to.
        epoch: Epoch number, used only in the progress-bar label.

    Returns:
        float: Sample-weighted mean training loss for the epoch, or NaN when
        the loader produced no samples.
    """
    model.train()
    running_loss = 0.0
    n_seen = 0

    loop = tqdm(
        dataloader,
        total=len(dataloader),
        desc=f"Epoch {epoch}: train",
        leave=True,
    )

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        has_source = batch['has_source'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask, has_source=has_source)

        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so a smaller final batch does
        # not skew the running average.
        n_batch = labels.size(0)
        running_loss += loss.item() * n_batch
        n_seen += n_batch

        loop.set_postfix({"loss": running_loss / n_seen})

    # Hand the epoch loss back to the caller instead of discarding it.
    return running_loss / n_seen if n_seen else float("nan")


def val_one_epoch(model, dataloader, criterion, device, epoch, best_so_far, ckpt_path='best.pt'):
    """Evaluate ``model`` and checkpoint it when accuracy improves.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``has_source`` keyword arguments; returns logits.
        dataloader: Sized iterable of dict batches with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'has_source'``, ``'labels'``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device every batch tensor is moved to.
        epoch: Epoch number, used only in the progress-bar label.
        best_so_far: Best validation accuracy seen in earlier epochs.
        ckpt_path: Destination handed to ``torch.save`` on improvement.

    Returns:
        The updated best accuracy: the current accuracy if it beat
        ``best_so_far``, otherwise ``best_so_far`` unchanged (also when the
        loader produced no samples).
    """
    import os  # local import: only needed to prepare the checkpoint directory

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        loop = tqdm(
            dataloader,
            total=len(dataloader),
            desc=f"Epoch {epoch}: val",
            leave=True,
        )
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            has_source = batch['has_source'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask, has_source=has_source)

            loss = criterion(logits, labels)
            n_batch = labels.size(0)
            val_loss += loss.item() * n_batch

            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()

            total += n_batch

            loop.set_postfix({"loss": val_loss / total, "acc": correct / total})

    # Nothing was evaluated: there is no accuracy to compare, and dividing
    # by zero below would crash the whole training run.
    if total == 0:
        return best_so_far

    current_acc = correct / total
    if current_acc > best_so_far:
        print(f"Validation accuracy improved from {best_so_far:.4f} to {current_acc:.4f}. Saving model...")
        # torch.save does not create missing parent directories; make sure
        # e.g. "models/" exists. File-like destinations are passed through.
        if isinstance(ckpt_path, (str, os.PathLike)):
            ckpt_dir = os.path.dirname(os.fspath(ckpt_path))
            if ckpt_dir:
                os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(model.state_dict(), ckpt_path)
        best_so_far = current_acc
    return best_so_far



In [12]:
import torch
import torch.optim as optim

epochs = 10
# Prefer Apple's MPS backend (the original target) but fall back to CUDA or
# CPU so the cell also runs on machines without MPS instead of crashing.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
checkpoint_path = 'models/best.pt'
# Misspelled legacy name kept so any cell still referencing it keeps working.
chechpoint_path = checkpoint_path

model = SentimentAnalysisModel(bert_model_name='bert-base-uncased', num_labels=3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Best validation accuracy so far; val_one_epoch saves a checkpoint whenever
# an epoch beats it and returns the updated value.
best_so_far = 0.

for epoch in range(epochs):
    train_one_epoch(model, train_loader, optimizer, criterion, device, epoch)
    best_so_far = val_one_epoch(model, val_loader, criterion, device, epoch, best_so_far, checkpoint_path)

Epoch 0: train: 100%|██████████| 106/106 [00:21<00:00,  4.93it/s, loss=0.816]
Epoch 0: val: 100%|██████████| 52/52 [00:03<00:00, 15.01it/s, loss=0.561, acc=0.778]


Validation accuracy improved from 0.0000 to 0.7782. Saving model...


Epoch 1: train: 100%|██████████| 106/106 [00:19<00:00,  5.32it/s, loss=0.447]
Epoch 1: val: 100%|██████████| 52/52 [00:03<00:00, 17.14it/s, loss=0.488, acc=0.81] 


Validation accuracy improved from 0.7782 to 0.8101. Saving model...


Epoch 2: train: 100%|██████████| 106/106 [00:20<00:00,  5.23it/s, loss=0.265]
Epoch 2: val: 100%|██████████| 52/52 [00:03<00:00, 16.51it/s, loss=0.484, acc=0.826]


Validation accuracy improved from 0.8101 to 0.8258. Saving model...


Epoch 3: train: 100%|██████████| 106/106 [00:20<00:00,  5.14it/s, loss=0.15] 
Epoch 3: val: 100%|██████████| 52/52 [00:03<00:00, 16.55it/s, loss=0.552, acc=0.827]


Validation accuracy improved from 0.8258 to 0.8270. Saving model...


Epoch 4: train: 100%|██████████| 106/106 [00:20<00:00,  5.16it/s, loss=0.0888]
Epoch 4: val: 100%|██████████| 52/52 [00:03<00:00, 16.54it/s, loss=0.647, acc=0.812]
Epoch 5: train: 100%|██████████| 106/106 [00:20<00:00,  5.20it/s, loss=0.0729]
Epoch 5: val: 100%|██████████| 52/52 [00:03<00:00, 16.60it/s, loss=0.682, acc=0.832]


Validation accuracy improved from 0.8270 to 0.8324. Saving model...


Epoch 6: train: 100%|██████████| 106/106 [00:20<00:00,  5.17it/s, loss=0.0467]
Epoch 6: val: 100%|██████████| 52/52 [00:03<00:00, 16.61it/s, loss=0.995, acc=0.773]
Epoch 7: train: 100%|██████████| 106/106 [00:20<00:00,  5.20it/s, loss=0.0334]
Epoch 7: val: 100%|██████████| 52/52 [00:03<00:00, 16.62it/s, loss=0.757, acc=0.825]
Epoch 8: train: 100%|██████████| 106/106 [00:20<00:00,  5.13it/s, loss=0.0179]
Epoch 8: val: 100%|██████████| 52/52 [00:03<00:00, 16.72it/s, loss=0.828, acc=0.824]
Epoch 9: train: 100%|██████████| 106/106 [00:20<00:00,  5.15it/s, loss=0.0112]
Epoch 9: val: 100%|██████████| 52/52 [00:03<00:00, 16.63it/s, loss=0.85, acc=0.834] 


Validation accuracy improved from 0.8324 to 0.8336. Saving model...
