<a href="https://colab.research.google.com/github/Abhishek-Y53/DimABSA/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


True
Tesla T4


In [24]:
!pip install transformers datasets torch scikit-learn




In [25]:
import json


def read_jsonl(path):
    """Read a JSONL file into a list of dicts."""
    data = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            data.append(json.loads(line))
    return data


def extract_aspect_level_samples(raw_data):

    samples = []

    for item in raw_data:
        sample_id = item["ID"]
        text = item["Text"]

        if "Quadruplet" in item:
            for q in item["Quadruplet"]:
                v, a = q["VA"].split("#")
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": q["Aspect"],
                    "valence": float(v),
                    "arousal": float(a)
                })

        elif "Triplet" in item:
            for t in item["Triplet"]:
                v, a = t["VA"].split("#")
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": t["Aspect"],
                    "valence": float(v),
                    "arousal": float(a)
                })

        elif "Aspect_VA" in item:
            for av in item["Aspect_VA"]:
                v, a = av["VA"].split("#")
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": av["Aspect"],
                    "valence": float(v),
                    "arousal": float(a)
                })

        elif "Aspect" in item:
            for aspect in item["Aspect"]:
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": aspect,
                    "valence": None,
                    "arousal": None
                })

        else:
            raise ValueError(f"Unknown annotation format in item {sample_id}")

    return samples


In [26]:
from sklearn.model_selection import train_test_split


def split_train_dev(samples, dev_ratio=0.1, seed=42):
    train_samples, dev_samples = train_test_split(
        samples,
        test_size=dev_ratio,
        random_state=seed,
        shuffle=True
    )
    return train_samples, dev_samples


In [27]:
raw = read_jsonl("eng_laptop_train_alltasks.jsonl")
samples = extract_aspect_level_samples(raw)
samples[:10]
train_samples, dev_samples = split_train_dev(samples)

test_raw = read_jsonl("eng_laptop_test_task1.jsonl")
test_samples = extract_aspect_level_samples(test_raw)

In [28]:
def build_model_input(text, aspect):
    """
    Construct model input string for aspect-conditioned sentiment.
    """
    if aspect == "NULL":
        aspect = "overall"

    return text.strip() + " [SEP] " + aspect.strip()


In [29]:
from transformers import BertTokenizer

# Load tokenizer that matches the pretrained model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_sample(text, aspect, max_length=128):
    """
    Tokenize a single (text, aspect) pair into model inputs.
    """
    model_input = build_model_input(text, aspect)

    encoding = tokenizer(
        model_input,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    return {
        "input_ids": encoding["input_ids"].squeeze(0),
        "attention_mask": encoding["attention_mask"].squeeze(0)
    }


In [30]:
sample = samples[0]
tokens = tokenize_sample(sample["text"], sample["aspect"])

print(tokens["input_ids"].shape)
print(tokens["attention_mask"].sum())


torch.Size([128])
tensor(30)


In [31]:
import torch
from torch.utils.data import Dataset


class DimASRDataset(Dataset):
    def __init__(self, samples, tokenizer, max_length=128, is_test=False):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        text = sample["text"]
        aspect = sample["aspect"]

        # Build input string
        model_input = build_model_input(text, aspect)

        encoding = self.tokenizer(
            model_input,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0)
        }

        # Only include labels if not test data
        if not self.is_test:
            item["labels"] = torch.tensor(
                [sample["valence"], sample["arousal"]],
                dtype=torch.float
            )

        return item


In [32]:
from torch.utils.data import DataLoader

train_dataset = DimASRDataset(
    train_samples,
    tokenizer,
    is_test=False
)

dev_dataset = DimASRDataset(
    dev_samples,
    tokenizer,
    is_test=False
)

test_dataset = DimASRDataset(
    test_samples,
    tokenizer,
    is_test=True
)

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True
)

dev_loader = DataLoader(
    dev_dataset,
    batch_size=16,
    shuffle=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False
)


In [33]:
batch = next(iter(train_loader))
print(batch.keys())
print(batch["input_ids"].shape)
print(batch["labels"].shape)


dict_keys(['input_ids', 'attention_mask', 'labels'])
torch.Size([16, 128])
torch.Size([16, 2])


In [34]:
import torch
import torch.nn as nn
from transformers import BertModel


class DimASRModel(nn.Module):
    def __init__(self):
        super(DimASRModel, self).__init__()

        # Load pretrained BERT backbone
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        hidden_size = self.bert.config.hidden_size  # 768 for base model

        # Regression head (2 outputs: Valence, Arousal)
        self.regressor = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask, labels=None):

        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # CLS token representation
        cls_output = outputs.last_hidden_state[:, 0, :]  # (batch_size, hidden_size)

        predictions = self.regressor(cls_output)  # (batch_size, 2)

        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(predictions, labels)
            return loss, predictions

        return predictions


In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DimASRModel().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

def train_one_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        loss, _ = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [36]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            loss, _ = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            total_loss += loss.item()

    return total_loss / len(dataloader)


In [37]:
import numpy as np
from scipy.stats import pearsonr

def evaluate_metrics(model, dataloader, device):
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            preds = model(input_ids=input_ids, attention_mask=attention_mask)

            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    # Separate valence and arousal
    valence_corr = pearsonr(all_preds[:, 0], all_labels[:, 0])[0]
    arousal_corr = pearsonr(all_preds[:, 1], all_labels[:, 1])[0]

    return valence_corr, arousal_corr


In [38]:
epochs = 7

best_dev_loss = float("inf")  # initialize before training starts

for epoch in range(epochs):

    train_loss = train_one_epoch(model, train_loader, optimizer, device)
    val_loss = evaluate(model, dev_loader, device)

    val_corr, aro_corr = evaluate_metrics(model, dev_loader, device)

    # ðŸ”¥ Save best model
    if val_loss < best_dev_loss:
        best_dev_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")
        print(">>> Best model saved.")

    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Dev Loss:   {val_loss:.4f}")
    print(f"Valence Pearson: {val_corr:.4f}")
    print(f"Arousal Pearson: {aro_corr:.4f}")
    print("-" * 40)



>>> Best model saved.
Epoch 1
Train Loss: 2.2462
Dev Loss:   0.8177
Valence Pearson: 0.8487
Arousal Pearson: 0.6679
----------------------------------------
>>> Best model saved.
Epoch 2
Train Loss: 0.6256
Dev Loss:   0.6747
Valence Pearson: 0.8616
Arousal Pearson: 0.7023
----------------------------------------
Epoch 3
Train Loss: 0.4797
Dev Loss:   0.7054
Valence Pearson: 0.8681
Arousal Pearson: 0.7049
----------------------------------------
>>> Best model saved.
Epoch 4
Train Loss: 0.3787
Dev Loss:   0.6481
Valence Pearson: 0.8739
Arousal Pearson: 0.7103
----------------------------------------
Epoch 5
Train Loss: 0.3062
Dev Loss:   0.6586
Valence Pearson: 0.8831
Arousal Pearson: 0.7175
----------------------------------------
>>> Best model saved.
Epoch 6
Train Loss: 0.2600
Dev Loss:   0.6017
Valence Pearson: 0.8744
Arousal Pearson: 0.7115
----------------------------------------
>>> Best model saved.
Epoch 7
Train Loss: 0.2293
Dev Loss:   0.5963
Valence Pearson: 0.8789
Arousal Pe

In [39]:
import torch
print("Torch version:", torch.__version__)
print("CUDA in torch:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())


Torch version: 2.9.0+cu128
CUDA in torch: 12.8
CUDA available: True


In [40]:
model.load_state_dict(torch.load("best_model.pt"))
model.eval()


DimASRModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [41]:
def predict_test(model, dataloader, device):
    model.eval()

    predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            preds = model(input_ids=input_ids, attention_mask=attention_mask)

            predictions.append(preds.cpu())

    predictions = torch.cat(predictions, dim=0)
    return predictions


In [43]:
test_predictions = predict_test(model, test_loader, device)


In [44]:
test_predictions = torch.clamp(test_predictions, min=1.0, max=9.0)


In [45]:
import json
from collections import defaultdict

def build_submission(test_samples, predictions, output_path):

    grouped = defaultdict(list)

    for sample, pred in zip(test_samples, predictions):

        valence = round(pred[0].item(), 2)
        arousal = round(pred[1].item(), 2)

        va_string = f"{valence:.2f}#{arousal:.2f}"

        grouped[sample["id"]].append({
            "Aspect": sample["aspect"],
            "VA": va_string
        })

    with open(output_path, "w", encoding="utf-8") as f:
        for sample_id in grouped:
            output_obj = {
                "ID": sample_id,
                "Aspect_VA": grouped[sample_id]
            }
            f.write(json.dumps(output_obj) + "\n")


In [46]:
build_submission(test_samples, test_predictions, "submission.jsonl")
