In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import torch.nn as nn


class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on demand.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of class labels, aligned with ``texts``.
        tokenizer: Callable invoked with the keyword arguments ``text``,
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``;
            its result must expose ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # The label collection defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors.

        Keys are ``'input_ids'``, ``'attention_mask'`` and ``'label'``
        (the label is a ``torch.long`` scalar tensor).
        """
        raw_text, raw_label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            text=raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # squeeze(0) removes the leading axis (presumably the size-one batch
        # axis produced by return_tensors='pt') so the DataLoader can stack
        # the samples itself.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(raw_label, dtype=torch.long)
        return sample
        
        

In [3]:
# Load data and split
data_dir = '/projappl/project_2006600/fin_experiment/data'
# The file is read as tab-separated UTF-8 text.
data_combined_news = pd.read_csv(
    os.path.join(data_dir, 'data_combined_news.csv'),
    sep='\t',
    encoding='utf-8',
)
clean_news = data_combined_news['All_news_clean']

# Inputs come from the 'All_news_clean' column, targets from 'Label'.
x = data_combined_news['All_news_clean']
y = data_combined_news['Label']

# Hold out 20% of the rows for validation; random_state fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
# Alternative checkpoint, currently disabled:
#tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")
# Tokenizer shared by the training and validation datasets built below.
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")



In [5]:
# Wrap each split in a NewsDataset; `.values` hands over the underlying arrays
# so samples are addressed by position. Sequences are padded/truncated to 500.
train_dataset = NewsDataset(
    texts=X_train.values,
    labels=y_train.values,
    tokenizer=tokenizer,
    max_len=500,
)
valid_dataset = NewsDataset(
    texts=X_valid.values,
    labels=y_valid.values,
    tokenizer=tokenizer,
    max_len=500,
)
#next(iter(train_dataset))

In [6]:
# Training batches are reshuffled on every pass over the data.
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
# Validation is not shuffled: sample order has no effect on what the model
# learns, and a fixed order keeps the batch composition (and therefore the
# averaged per-batch validation loss) identical from epoch to epoch.
valid_loader = DataLoader(valid_dataset, batch_size=12, shuffle=False)


In [7]:
from transformers import BertModel


class BertForNewsClassification(nn.Module):
    """Frozen BERT encoder followed by dropout and a linear classification head.

    Args:
        pretrained_model_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``.
        args: Object providing ``dropout`` (dropout probability for the head)
            and ``num_classes`` (number of output logits).
    """

    def __init__(self, pretrained_model_name, args):
        super().__init__()
        self.args = args
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(args.dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, args.num_classes)
        # Freeze BERT parameters: only the classifier head stays trainable.
        self.bert.requires_grad_(False)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of encoded texts."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder's pooled output is regularised by dropout, then projected.
        return self.classifier(self.dropout(encoder_out.pooler_output))
        

In [8]:
from dataclasses import dataclass 

@dataclass
class TrainArgs:
    """Hyperparameters for the classifier and its training run.

    Fields are positional in declaration order:
    ``TrainArgs(learning_rate, batch_size, epochs, num_classes, dropout)``.
    """
    # Learning rate handed to the optimizer.
    learning_rate: float
    # Mini-batch size.
    # NOTE(review): the DataLoaders visible in this notebook hard-code
    # batch_size=12 instead of reading this field — confirm which is intended.
    batch_size: int
    # Number of passes over the training data.
    epochs: int
    # Number of output logits of the classification head.
    num_classes: int
    # Dropout probability applied before the classification head.
    dropout: float

In [9]:
# Setting device on GPU if available
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# Reproducibility: deterministic cuDNN kernels alone do not make a run
# repeatable — weight initialisation, dropout masks and DataLoader shuffling
# all draw from PyTorch's RNG, so it has to be seeded as well. The value
# mirrors the random_state used for the train/validation split.
SEED = 42
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Keep the cuDNN autotuner off so kernel selection cannot vary between runs.
torch.backends.cudnn.benchmark = False

Using device: cuda

NVIDIA A100-SXM4-40GB MIG 1g.5gb
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [None]:
from torch.optim import AdamW
from torch.optim import lr_scheduler

# Hyperparameters for this run, spelled out by field name.
args = TrainArgs(
    learning_rate=2e-5,
    batch_size=16,
    epochs=10,
    num_classes=2,
    dropout=0.3,
)

# Classifier on top of the uncased BERT base checkpoint.
model = BertForNewsClassification(pretrained_model_name="bert-base-uncased", args=args)

In [24]:
# Cross-entropy over the raw logits returned by the model.
criterion = nn.CrossEntropyLoss()
# AdamW over all model parameters, using the configured learning rate.
optimizer = AdamW(params=model.parameters(), lr=args.learning_rate)

In [25]:
from sklearn.metrics import accuracy_score


def train_model(model, train_loader, val_loader, optimizer, loss_fn, device, epochs,
                save_path="best_model.pth"):
    """Train ``model`` for ``epochs`` epochs and checkpoint the best weights.

    After every epoch the model is scored with ``evaluate_model``; whenever the
    validation loss improves on the best value seen so far, the model's
    ``state_dict`` is written to ``save_path``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        val_loader: Iterable of validation batches, forwarded to
            ``evaluate_model``.
        optimizer: Optimizer that updates the model's parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``train_loader``.
        save_path: Destination of the best checkpoint.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.to(device)
    best_val_loss = float("inf")

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        # Training phase
        model.train()
        train_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # Use the loss that was passed in, not a notebook-level global.
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")

        # Mean of the per-batch losses.
        train_loss /= num_batches
        print(f"Train Loss: {train_loss:.4f}")

        # Validation phase
        val_loss = evaluate_model(model, val_loader, loss_fn, device)
        print(f"Validation Loss: {val_loss:.4f}")

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), save_path)
            print("Saved best model.")


def evaluate_model(model, val_loader, loss_fn, device):
    """Score ``model`` on ``val_loader``; print accuracy and return the mean loss.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits with one row per sample.
        val_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        device: Device every batch is moved to.

    Returns:
        The mean of the per-batch losses as a float.

    Raises:
        ValueError: If ``val_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # Use the loss that was passed in, not a notebook-level global.
            total_loss += loss_fn(logits, labels).item()
            num_batches += 1

            # Softmax is monotonic, so the arg-max of the logits already is
            # the predicted class.
            predictions = torch.argmax(logits, dim=1)
            all_preds.extend(predictions.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if num_batches == 0:
        raise ValueError("val_loader yielded no batches")

    acc = accuracy_score(all_labels, all_preds)
    print(f'Validation accuracy: {acc}')

    return total_loss / num_batches

In [None]:
# Launch training with the objects built in the cells above.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=valid_loader,
    optimizer=optimizer,
    loss_fn=criterion,
    device=device,
    epochs=args.epochs,
)

Epoch 1/10
Train Loss: 0.7184
Validation accuracy: 0.4949748743718593
Validation Loss: 0.6980
Saved best model.
Epoch 2/10
Train Loss: 0.7090
Validation accuracy: 0.535175879396985
Validation Loss: 0.6955
Saved best model.
Epoch 3/10
Train Loss: 0.7072
Validation accuracy: 0.4824120603015075
Validation Loss: 0.6949
Saved best model.
Epoch 4/10
Train Loss: 0.7037
Validation accuracy: 0.542713567839196
Validation Loss: 0.6905
Saved best model.
Epoch 5/10
Train Loss: 0.7067
Validation accuracy: 0.5603015075376885
Validation Loss: 0.6931
Epoch 6/10
Train Loss: 0.6944
Validation accuracy: 0.5678391959798995
Validation Loss: 0.6885
Saved best model.
Epoch 7/10
Train Loss: 0.6984
Validation accuracy: 0.5577889447236181
Validation Loss: 0.6897
Epoch 8/10
