In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import re
from sklearn.metrics import precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean(text):
    text = str(text)
    # Remove URLs/links
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove $
    text = text.replace('$', '')
    # Remove #
    text = text.replace('#', '')
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing whitespace
    text = text.strip()
    return text

In [3]:
class FinancialAspectDataset(Dataset):
    def __init__(self, csv_file, tokenizer_name='ProsusAI/finbert', max_length=128):
        """
        Args:
            csv_file: Path to the CSV file
            tokenizer_name: Name of the pretrained tokenizer
            max_length: Maximum length of the tokenized sequences
        """
        # Read CSV
        self.df = pd.read_csv(csv_file)
        
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
        
        # Extract and process aspects
        with open("unique_aspects", 'r') as file:
            aspects_list = [line.strip() for line in file]

        # self.aspects = self.df['aspects'].apply(ast.literal_eval).tolist()
        self.aspects = aspects_list #unique aspects

        self.mlb = MultiLabelBinarizer()
        self.aspect_labels = self.mlb.fit_transform(self.aspects)
        
        # Store aspect classes for reference
        self.aspect_classes = self.mlb.classes_
        self.num_aspects = len(self.aspect_classes)
        
    def __len__(self):
        return len(self.df)
    
    def process_aspects(self, aspects):
        aspect_names = np.unique(aspects[2:-2].split('/')).tolist()
        true_mask = [self.aspects.index(a) for a in aspect_names if a != '']
        aspect_ids = [0]*len(self.aspects)
        for m in true_mask:
            aspect_ids[m] = 1
        return aspect_ids
    
    def __getitem__(self, idx):
        # print("idx: ", idx)
        row = self.df.iloc[idx]
        
        # Get text inputs
        sentence = str(row['sentence'])
        # print("sentence: ", sentence)
        snippet = ast.literal_eval(row['snippets'])[0]  # Take first snippet
        # print("snippet: ", snippet)
        target = str(row['target'])
        # print("target: ", target)
        
        # Tokenize inputs
        sentence_encoding = self.tokenizer(
            clean(sentence),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        snippet_encoding = self.tokenizer(
            clean(snippet),
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        target_encoding = self.tokenizer(
            clean(target),
            padding='max_length',
            truncation=True,
            max_length=32,  # Shorter max_length for targets
            return_tensors='pt'
        )
        
        # Get aspect labels
        aspect_label_ids = self.process_aspects(row['aspects'])
        
        # Remove batch dimension added by tokenizer
        return {
            'sentence_ids': sentence_encoding['input_ids'].squeeze(0),
            'sentence_mask': sentence_encoding['attention_mask'].squeeze(0),
            'snippet_ids': snippet_encoding['input_ids'].squeeze(0),
            'snippet_mask': snippet_encoding['attention_mask'].squeeze(0),
            'target_ids': target_encoding['input_ids'].squeeze(0),
            'target_mask': target_encoding['attention_mask'].squeeze(0),
            'aspect_label_ids': (torch.tensor(aspect_label_ids, dtype=float)).squeeze(0),
            'sentiment_score': torch.FloatTensor([float(row['sentiment_score'])])
        }

In [4]:
# Create data loaders
def create_dataloaders(csv_file, batch_size=10, train_split=0.9, seed=42):
    """
    Create train and validation dataloaders
    """
    # Set random seed for reproducibility
    torch.manual_seed(seed)
    
    # Create dataset
    dataset = FinancialAspectDataset(csv_file)
    
    # Calculate split sizes
    train_size = int(train_split * len(dataset))
    val_size = len(dataset) - train_size
    
    # Split dataset
    train_dataset, val_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size]
    )
    
    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False
    )
    
    return train_loader, val_loader, dataset.aspects


In [5]:
# Create dataloaders
train_loader, val_loader, aspect_classes = create_dataloaders('train.csv', batch_size=16)

In [6]:
class TargetAttention(torch.nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        # Reduce number of heads (8 -> 2 or 4)
        self.attention = torch.nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=2,  # Reduced from 8
            dropout=0.1   # Add dropout for regularization
        )
        
        # Reduce hidden dimension
        self.hidden_dim = 768  # Reduced from 768
        # self.dim_reducer = torch.nn.Linear(-1, self.hidden_dim) #might be 768 but let's see
        
    def forward(self, sentence_encoding, snippet_encoding, target_encoding):
        # Reduce dimensions
        # sentence_reduced = self.dim_reducer(sentence_encoding)
        # snippet_reduced = self.dim_reducer(snippet_encoding)
        # target_reduced = self.dim_reducer(target_encoding)
        
        # Apply attention
        attn_output, attn_weights = self.attention(
            query=target_encoding,
            key=sentence_encoding,
            value=snippet_encoding
        )
        
        return attn_output, attn_weights

In [7]:
class AspectDetectionModel(torch.nn.Module):
    def __init__(self, num_aspects):
        super().__init__()
        self.finbert = AutoModel.from_pretrained('ProsusAI/finbert')
        self.tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')

        self.label_embedding = self.create_label_embeddings(self.tokenizer, self.finbert).to('cuda:0')
        
        print("label embeddings: ", (self.label_embedding).shape)

        #freeze all the layers of bert
        for param in self.finbert.parameters():
            param.requires_grad = False
            
        # Only fine-tune last few layers
        for param in self.finbert.encoder.layer[-2:].parameters():
            param.requires_grad = True
        
        self.hidden_dim = 768
        # Optimized attention for small data
        self.target_attention = TargetAttention(
            hidden_dim=768  # Reduced dimension
        )
        
        # Multi-label aspect classifier
        # self.aspect_classifier = torch.nn.Sequential(
        #     torch.nn.Linear(768, 128),
        #     torch.nn.ReLU(),
        #     torch.nn.Dropout(0.2),
        #     torch.nn.Linear(128, num_aspects),
        #     # No sigmoid here - we'll use BCEWithLogitsLoss
        # )

        self.semantic_matcher = torch.nn.Sequential(
            torch.nn.Linear(self.hidden_dim, self.hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.2),
            torch.nn.Linear(self.hidden_dim, self.hidden_dim)
        )

        self.temperature = torch.nn.Parameter(torch.ones(1))
        
    def forward(self, sentence_ids, sentence_mask, 
                              snippet_ids, snippet_mask,
                              target_ids, target_mask):
        # Encode inputs
        sentence_encoding = self.finbert(sentence_ids, sentence_mask)[0]  # [batch_size, seq_len, 768]
        sentence_encoding = sentence_encoding[:, 0, :] # B, H
        # print("sent_encoding: ",sentence_encoding.shape)
        snippet_encoding = self.finbert(snippet_ids, snippet_mask)[0]
        snippet_encoding = snippet_encoding[:, 0, :]
        # print("snippet_encoding: ",snippet_encoding.shape)
        target_encoding = self.finbert(target_ids, target_mask)[0]
        target_encoding = target_encoding[:,0,:]
        # print("target_encoding: ",target_encoding.shape)
        
        # Get target-aware representation
        target_aware_output, _ = self.target_attention(
            sentence_encoding=sentence_encoding,
            snippet_encoding=snippet_encoding,
            target_encoding=target_encoding
        ) # (B, H)

        # print("attn_output: ",target_aware_output.shape)
        
        # Predict aspects
        # aspect_logits = self.aspect_classifier(target_aware_output)
        # print("output_logits: ",aspect_logits.shape)
        # print(aspect_logits[0])
        # return aspect_logits

        text_repr= self.semantic_matcher(target_aware_output)
        similarities = self.calculate_similarities(text_repr)

        # print("similarities: ", similarities.shape)
        
        return similarities
    
    def calculate_similarities(self, text_repr):
        # Normalize embeddings
        text_repr = torch.nn.functional.normalize(text_repr, dim=-1)
        label_embeddings = torch.nn.functional.normalize(self.label_embedding, dim=-1)
        
        # Calculate cosine similarity
        similarities = torch.matmul(text_repr, label_embeddings.T)
        
        # Scale with learnable temperature
        similarities = similarities * self.temperature
        
        return similarities
    
    def create_label_embeddings(self, tokenizer, finbert):
        """Create embeddings for all labels using FinBERT"""
        labels = None
        with open("unique_aspects", 'r') as file:
            labels = [line.strip() for line in file]

        label_embeddings = {}
        
        with torch.no_grad():
            for label in labels:
                # Tokenize label
                inputs = tokenizer(
                    label,
                    padding=True,
                    truncation=True,
                    return_tensors='pt'
                )
                
                # Get embedding from FinBERT
                outputs = finbert(**inputs)
                # Use [CLS] token embedding
                label_emb = outputs.last_hidden_state[:, 0, :]  # [1, hidden_dim]
                label_embeddings[label] = label_emb
                
        # Stack all label embeddings
        return torch.cat(list(label_embeddings.values()), dim=0)  # [num_labels, hidden_dim]

In [15]:
ASPECT_LIST = 119
num_epochs = 2

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [16]:
model = AspectDetectionModel(num_aspects=ASPECT_LIST).to(device)
criterion = torch.nn.BCEWithLogitsLoss()  # Better than BCE for numerical stability

# Use different learning rates for BERT and custom layers
optimizer = AdamW([
    {'params': model.finbert.parameters(), 'lr': 1e-3},
    {'params': model.target_attention.parameters(), 'lr': 1e-3},
    {'params': model.semantic_matcher.parameters(), 'lr': 1e-2},
    {'params': model.temperature, 'lr': 1e-2}
])

label embeddings:  torch.Size([119, 768])


In [36]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    cnt = 0
    for batch in train_loader:
        cnt += 1
        # Get batch data
        sentence_ids = batch['sentence_ids'].to(device)
        sentence_mask = batch['sentence_mask'].to(device)
        snippet_ids = batch['snippet_ids'].to(device)
        snippet_mask = batch['snippet_mask'].to(device)
        target_ids = batch['target_ids'].to(device)
        target_mask = batch['target_mask'].to(device)

        aspect_labels = batch['aspect_label_ids'].to(device)  # Binary matrix [batch_size, num_aspects]
        # print("true: ", aspect_labels.shape)
        
        # print("forward pass: ")
        # Forward pass
        similarity_logits = model(sentence_ids, sentence_mask, 
                              snippet_ids, snippet_mask,
                              target_ids, target_mask)
        
        # print("pred: ", aspect_logits.shape)
        
        # Calculate loss
        loss = criterion(similarity_logits, aspect_labels)

        if cnt%10==0:
            print("loss: ", loss.item())
        epoch_loss += loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        
        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()

    print("epoch_loss: ", (epoch_loss/cnt).item())
    model.eval()
    total_val_loss = 0
    val_steps = 0
        
    print("\nRunning Validation...")
    
    with torch.no_grad():
        for batch in val_loader:
            # Move batch to device
            sentence_ids = batch['sentence_ids'].to(device)
            sentence_mask = batch['sentence_mask'].to(device)
            snippet_ids = batch['snippet_ids'].to(device)
            snippet_mask = batch['snippet_mask'].to(device)
            target_ids = batch['target_ids'].to(device)
            target_mask = batch['target_mask'].to(device)

            aspect_labels = batch['aspect_label_ids'].to(device)
            
            # Forward pass
            aspect_logits = model(sentence_ids, sentence_mask, 
                              snippet_ids, snippet_mask,
                              target_ids, target_mask)
            
            # Calculate loss
            loss = criterion(aspect_logits, aspect_labels)
            total_val_loss += loss.item()
            val_steps += 1
    
    avg_val_loss = total_val_loss / val_steps
    print("val loss: ", avg_val_loss, "\n")

loss:  0.13394873701975124
loss:  0.13696777626496404
loss:  0.1417593694910282


KeyboardInterrupt: 

In [24]:
val_loader, _ , _ = create_dataloaders('validation.csv', batch_size=16, train_split=0.98)

In [35]:
model.eval()
total_val_loss = 0
val_steps = 0

# Initialize lists to store predictions and true labels
all_predictions = []
all_true_labels = []
    
print("\nRunning Validation...")

with torch.no_grad():
    for batch in train_loader:
        # Move batch to device
        sentence_ids = batch['sentence_ids'].to(device)
        sentence_mask = batch['sentence_mask'].to(device)
        snippet_ids = batch['snippet_ids'].to(device)
        snippet_mask = batch['snippet_mask'].to(device)
        target_ids = batch['target_ids'].to(device)
        target_mask = batch['target_mask'].to(device)
        aspect_labels = batch['aspect_label_ids'].to(device)
        
        # Forward pass
        aspect_logits = model(sentence_ids, sentence_mask, 
                          snippet_ids, snippet_mask,
                          target_ids, target_mask)
        
        # print(aspect_logits[0])
        # print(aspect_labels[0])
        # break
        # Calculate loss
        loss = criterion(aspect_logits, aspect_labels)
        total_val_loss += loss.item()
        val_steps += 1
        
        # Convert logits to predictions
        predictions = torch.sigmoid(aspect_logits) > 0.1  # threshold at 0.5
        
        # Convert to numpy for sklearn metrics
        predictions = predictions.cpu().numpy()
        true_labels = aspect_labels.cpu().numpy()

        # print(np.unique(predictions, return_counts=True))
        # print(np.unique(true_labels, return_counts=True))
        # print(predictions)
        # break
        
        all_predictions.append(predictions)
        all_true_labels.append(true_labels)

# Concatenate all batches
all_predictions = np.vstack(all_predictions)
all_true_labels = np.vstack(all_true_labels)

# Calculate metrics
# micro average
precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
    all_true_labels, all_predictions, average='micro'
)
avg_val_loss = total_val_loss / val_steps

print(f"Validation Loss: {avg_val_loss:.4f}")
print("\nMicro-averaged metrics:")
print(f"Precision: {precision_micro:.4f}")
print(f"Recall: {recall_micro:.4f}")
print(f"F1-score: {f1_micro:.4f}")


Running Validation...
Validation Loss: 0.1452

Micro-averaged metrics:
Precision: 0.0229
Recall: 0.1133
F1-score: 0.0381


In [32]:
a = torch.tensor([-2.5918, -1.8610, -2.5541, -2.4071])
a

tensor([-2.5918, -1.8610, -2.5541, -2.4071])

In [33]:
torch.sigmoid(a)

tensor([0.0697, 0.1346, 0.0722, 0.0826])