# DimABSA - Subtask 1: Baseline Model
## Dimensional Aspect Sentiment Regression (DimASR)

---

## Objective
Predict Valence (1-9) and Arousal (1-9) scores for aspects in laptop reviews.

**Example:**
- Input: "The battery life is excellent" + Aspect: "battery life"
- Output: Valence: 8.2, Arousal: 6.5

---

## Dataset
- Training samples: 4,076
- Dev samples: 200
- Format: JSONL (JSON Lines)

In [1]:
# Cell 2: Import Required Libraries

import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from tqdm import tqdm
import math
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

✅ All libraries imported successfully!
PyTorch version: 2.9.1
CUDA available: False


In [2]:
# Configuration
TRAIN_PATH = "datasets-subtask-1/eng_laptop_train_alltasks.jsonl"
DEV_PATH = "datasets-subtask-1/eng_laptop_dev_task1.jsonl"

MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 128
BATCH_SIZE = 16
LEARNING_RATE = 1e-5
EPOCHS = 3  # Start with 3 epochs for faster testing
DROPOUT = 0.1

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Configuration set!")
print(f"Training: {TRAIN_PATH}")
print(f"Dev: {DEV_PATH}")
print(f"Model: {MODEL_NAME}")
print(f"Device: {DEVICE}")

✅ Configuration set!
Training: datasets-subtask-1/eng_laptop_train_alltasks.jsonl
Dev: datasets-subtask-1/eng_laptop_dev_task1.jsonl
Model: bert-base-multilingual-cased
Device: cpu


In [3]:
# Data loading functions
def load_jsonl(filepath):
    """Load JSONL file"""
    with open(filepath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

def jsonl_to_df(data):
    """Convert JSONL to DataFrame for Subtask 1"""
    if 'Quadruplet' in data[0]:
        # Training data (has quadruplets with VA scores)
        df = pd.json_normalize(data, 'Quadruplet', ['ID', 'Text'])
        df[['Valence', 'Arousal']] = df['VA'].str.split('#', expand=True).astype(float)
        df = df.drop(columns=['VA', 'Category', 'Opinion'])
        df = df.drop_duplicates(subset=['ID', 'Aspect'], keep='first')
    elif 'Aspect' in data[0]:
        # Dev/Test data (only has aspects, no VA scores)
        df = pd.json_normalize(data, 'Aspect', ['ID', 'Text'])
        df = df.rename(columns={df.columns[0]: "Aspect"})
        df['Valence'] = 0  # Placeholder
        df['Arousal'] = 0  # Placeholder
    else:
        raise ValueError("Invalid format")
    
    return df

print("Data loading functions defined!")

✅ Data loading functions defined!


In [4]:
# Load the data
print("Loading training data...")
train_raw = load_jsonl(TRAIN_PATH)
train_df = jsonl_to_df(train_raw)

print("Loading dev data...")
dev_raw = load_jsonl(DEV_PATH)
dev_df = jsonl_to_df(dev_raw)

# Split training data (90% train, 10% validation)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

print(f"\n Data loaded!")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Dev samples: {len(dev_df)}")

Loading training data...
Loading dev data...

✅ Data loaded!
Training samples: 4462
Validation samples: 496
Dev samples: 275


In [5]:
# Look at training data
print("=" * 50)
print("TRAINING DATA SAMPLE:")
print("=" * 50)
print(train_df.head())
print(f"\nColumns: {train_df.columns.tolist()}")
print(f"\nValence range: {train_df['Valence'].min():.2f} - {train_df['Valence'].max():.2f}")
print(f"Arousal range: {train_df['Arousal'].min():.2f} - {train_df['Arousal'].max():.2f}")

# Show a few examples
print("\n" + "=" * 50)
print("EXAMPLE REVIEWS:")
print("=" * 50)
for i in range(3):
    row = train_df.iloc[i]
    print(f"\n{i+1}. Text: {row['Text']}")
    print(f"   Aspect: {row['Aspect']}")
    print(f"   Valence: {row['Valence']:.2f} | Arousal: {row['Arousal']:.2f}")

TRAINING DATA SAMPLE:
        Aspect                      ID  \
251   computer     laptop_quad_dev_190   
4516      unit  laptop_quad_train_2141   
335       NULL     laptop_quad_dev_253   
3286    device  laptop_quad_train_1230   
753     screen    laptop_quad_test_236   

                                                   Text  Valence  Arousal  
251   if i had it to do over , i would not purchase ...     3.10     6.30  
4516  after charging the unit for 2 hours i discover...     4.75     5.25  
335   freezes with red lines across it , froze five ...     2.00     7.67  
3286  a wonderful device with extremely clear display .     8.00     7.83  
753                         the screen does look good .     6.62     6.62  

Columns: ['Aspect', 'ID', 'Text', 'Valence', 'Arousal']

Valence range: 1.00 - 8.83
Arousal range: 3.83 - 8.83

EXAMPLE REVIEWS:

1. Text: if i had it to do over , i would not purchase this computer .
   Aspect: computer
   Valence: 3.10 | Arousal: 6.30

2. Text: afte

In [6]:
# PyTorch Dataset class
class VADataset(Dataset):
    """
    Dataset for Valence-Arousal regression
    Combines aspect and text into input: "aspect: text"
    """
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.sentences = dataframe["Text"].tolist()
        self.aspects = dataframe["Aspect"].tolist()
        self.labels = dataframe[["Valence", "Arousal"]].values.astype(float)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        # Combine aspect and text
        text = f"{self.aspects[idx]}: {self.sentences[idx]}"
        
        # Tokenize
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

print("Dataset class defined!")

✅ Dataset class defined!


In [7]:
# Initialize tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Create datasets
print("Creating datasets...")
train_dataset = VADataset(train_df, tokenizer, MAX_LENGTH)
val_dataset = VADataset(val_df, tokenizer, MAX_LENGTH)
dev_dataset = VADataset(dev_df, tokenizer, MAX_LENGTH)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f" Datasets created!")
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")
print(f"Dev batches: {len(dev_loader)}")

# Test one batch
sample_batch = next(iter(train_loader))
print(f"\nSample batch shapes:")
print(f"  Input IDs: {sample_batch['input_ids'].shape}")
print(f"  Attention mask: {sample_batch['attention_mask'].shape}")
print(f"  Labels: {sample_batch['labels'].shape}")

Loading tokenizer...
Creating datasets...
✅ Datasets created!
Train batches: 279
Val batches: 31
Dev batches: 18

Sample batch shapes:
  Input IDs: torch.Size([16, 128])
  Attention mask: torch.Size([16, 128])
  Labels: torch.Size([16, 2])


In [8]:
# BERT-based regression model
class VARegressor(nn.Module):
    """
    BERT-based model for predicting Valence and Arousal
    """
    def __init__(self, model_name=MODEL_NAME, dropout=DROPOUT):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.backbone.config.hidden_size
        self.regressor = nn.Linear(hidden_size, 2)  # 2 outputs: V and A
    
    def forward(self, input_ids, attention_mask):
        # Get BERT outputs
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use [CLS] token representation
        cls_output = outputs.last_hidden_state[:, 0, :]
        
        # Apply dropout and regressor
        x = self.dropout(cls_output)
        predictions = self.regressor(x)
        
        return predictions

print(" Model class defined!")

✅ Model class defined!


In [9]:
# Initialize model
print("Initializing model...")
model = VARegressor().to(DEVICE)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f" Model initialized!")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"Model on device: {next(model.parameters()).device}")

Initializing model...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

✅ Model initialized!
Total parameters: 177,854,978
Trainable parameters: 177,854,978
Model on device: cpu


In [10]:
# Training and evaluation functions
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    return total_loss / len(dataloader)

def eval_epoch(model, dataloader, criterion, device):
    """Evaluate for one epoch"""
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

print(" Training functions defined!")

✅ Training functions defined!


In [11]:
# Training loop
print("=" * 60)
print("STARTING TRAINING")
print("=" * 60)

best_val_loss = float('inf')
training_history = []

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 40)
    
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion, DEVICE)
    
    # Validate
    val_loss = eval_epoch(model, val_loader, criterion, DEVICE)
    
    # Save history
    training_history.append({
        'epoch': epoch + 1,
        'train_loss': train_loss,
        'val_loss': val_loss
    })
    
    # Print results
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Val Loss: {val_loss:.4f}")
    
    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        print(f" New best model! Saving...")
        torch.save(model.state_dict(), 'best_model.pt')

print("\n" + "=" * 60)
print("TRAINING COMPLETE!")
print("=" * 60)
print(f"Best validation loss: {best_val_loss:.4f}")

STARTING TRAINING

Epoch 1/3
----------------------------------------


Training: 100%|██████████| 279/279 [09:15<00:00,  1.99s/it, loss=1.2203]


Train Loss: 2.9101
Val Loss: 1.1106
✅ New best model! Saving...

Epoch 2/3
----------------------------------------


Training: 100%|██████████| 279/279 [08:58<00:00,  1.93s/it, loss=0.5706]


Train Loss: 0.8928
Val Loss: 0.9914
✅ New best model! Saving...

Epoch 3/3
----------------------------------------


Training: 100%|██████████| 279/279 [09:05<00:00,  1.96s/it, loss=1.1120]


Train Loss: 0.6594
Val Loss: 0.8738
✅ New best model! Saving...

TRAINING COMPLETE!
Best validation loss: 0.8738


In [12]:
# Evaluation functions
def get_predictions(model, dataloader, device):
    """Get predictions from model"""
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].cpu().numpy()
            
            outputs = model(input_ids, attention_mask).cpu().numpy()
            
            all_preds.append(outputs)
            all_labels.append(labels)
    
    preds = np.vstack(all_preds)
    labels = np.vstack(all_labels)
    
    return preds, labels

def calculate_rmse(preds, labels):
    """Calculate RMSE for VA predictions"""
    pred_v, pred_a = preds[:, 0], preds[:, 1]
    gold_v, gold_a = labels[:, 0], labels[:, 1]
    
    # Combine V and A for RMSE calculation
    gold_va = np.concatenate([gold_v, gold_a])
    pred_va = np.concatenate([pred_v, pred_a])
    
    # Calculate RMSE
    squared_errors = (gold_va - pred_va) ** 2
    rmse = np.sqrt(np.mean(squared_errors))
    
    # Normalized RMSE
    rmse_norm = rmse / np.sqrt(128)
    
    # Calculate PCC
    pcc_v = pearsonr(pred_v, gold_v)[0]
    pcc_a = pearsonr(pred_a, gold_a)[0]
    
    return {
        'RMSE': rmse,
        'RMSE_normalized': rmse_norm,
        'PCC_Valence': pcc_v,
        'PCC_Arousal': pcc_a
    }

print("Evaluation functions defined!")

✅ Evaluation functions defined!


In [13]:
# Load best model
print("Loading best model...")
model.load_state_dict(torch.load('best_model.pt'))
model.eval()

# Get predictions on validation set
print("\nEvaluating on validation set...")
val_preds, val_labels = get_predictions(model, val_loader, DEVICE)
val_metrics = calculate_rmse(val_preds, val_labels)

print("\n" + "=" * 60)
print("VALIDATION SET RESULTS:")
print("=" * 60)
print(f"RMSE: {val_metrics['RMSE']:.4f}")
print(f"RMSE (normalized): {val_metrics['RMSE_normalized']:.4f}")
print(f"PCC Valence: {val_metrics['PCC_Valence']:.4f}")
print(f"PCC Arousal: {val_metrics['PCC_Arousal']:.4f}")

# Show some example predictions
print("\n" + "=" * 60)
print("SAMPLE PREDICTIONS:")
print("=" * 60)
print(f"{'Predicted V':<12} {'Predicted A':<12} {'True V':<10} {'True A':<10} {'Error'}")
print("-" * 60)
for i in range(min(10, len(val_preds))):
    pred_v, pred_a = val_preds[i]
    true_v, true_a = val_labels[i]
    error = np.sqrt((pred_v - true_v)**2 + (pred_a - true_a)**2)
    print(f"{pred_v:>11.2f} {pred_a:>12.2f} {true_v:>10.2f} {true_a:>10.2f} {error:>10.2f}")

Loading best model...

Evaluating on validation set...

VALIDATION SET RESULTS:
RMSE: 0.9348
RMSE (normalized): 0.0826
PCC Valence: 0.8374
PCC Arousal: 0.6866

SAMPLE PREDICTIONS:
Predicted V  Predicted A  True V     True A     Error
------------------------------------------------------------
       4.07         6.21       3.12       6.12       0.95
       7.80         7.69       7.67       7.50       0.22
       7.74         7.74       7.50       7.50       0.35
       7.46         7.32       7.12       7.00       0.47
       7.81         7.73       8.12       8.25       0.60
       3.89         6.04       3.83       6.33       0.30
       6.49         6.34       5.67       5.67       1.06
       7.59         7.25       7.25       7.38       0.36
       5.26         5.89       5.33       5.17       0.72
       7.80         7.73       7.75       7.62       0.12


In [14]:
# Generate predictions for dev set (for submission)
print("\nGenerating predictions for dev set...")
dev_preds, _ = get_predictions(model, dev_loader, DEVICE)

# Add predictions to dev dataframe
dev_df['Valence'] = dev_preds[:, 0]
dev_df['Arousal'] = dev_preds[:, 1]

# Clip values to [1, 9] range
dev_df['Valence'] = dev_df['Valence'].clip(1.0, 9.0)
dev_df['Arousal'] = dev_df['Arousal'].clip(1.0, 9.0)

print(f"Predictions generated for {len(dev_df)} samples")
print("\nSample predictions:")
print(dev_df[['Text', 'Aspect', 'Valence', 'Arousal']].head(10))


Generating predictions for dev set...
✅ Predictions generated for 275 samples

Sample predictions:
                                                Text  \
0                    The touchscreen works very well   
1                         I am so disappointed in HP   
2  The keyboard is big enough to use for real typing   
3                             I like the screen size   
4            Lenovo is my favorite brand of computer   
5  The sound is great and it's easy to use for us...   
6             The quality for the price is excellent   
7         The on screen keyboard is very easy to use   
8            I got this laptop and am very impressed   
9  I enjoyed the functionality of the trackpad , ...   

                          Aspect   Valence   Arousal  
0                    touchscreen  7.737481  7.465717  
1                             HP  2.992138  6.917946  
2                       keyboard  7.459754  7.129376  
3                    screen size  7.539342  7.205931  
4       

In [15]:
# Save predictions in competition format
def save_predictions(df, output_path):
    """Save predictions in JSONL format for submission"""
    # Sort by ID
    df_sorted = df.sort_values(by="ID")
    
    # Group by ID
    grouped = df_sorted.groupby("ID", sort=False)
    
    with open(output_path, "w", encoding="utf-8") as f:
        for gid, gdf in grouped:
            record = {
                "ID": gid,
                "Aspect_VA": []
            }
            for _, row in gdf.iterrows():
                record["Aspect_VA"].append({
                    "Aspect": row["Aspect"],
                    "VA": f"{row['Valence']:.2f}#{row['Arousal']:.2f}"
                })
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    
    print(f"Predictions saved to {output_path}")

# Save predictions
output_file = "pred_eng_laptop.jsonl"
save_predictions(dev_df, output_file)

print("\n" + "=" * 60)
print("SUBMISSION FILE CREATED!")
print("=" * 60)
print(f"File: {output_file}")
print(f"You can now submit this file to the competition!")

✅ Predictions saved to pred_eng_laptop.jsonl

SUBMISSION FILE CREATED!
File: pred_eng_laptop.jsonl
You can now submit this file to the competition!
