In [6]:
!pip install transformers datasets torch scikit-learn




In [7]:
import json


def read_jsonl(path):
    """Read a JSONL file into a list of parsed records.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines (typically a trailing empty line) are not records;
            # passing them to json.loads would raise.
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, but say where the bad line is.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records


def extract_aspect_level_samples(raw_data):
    """Flatten sentence-level records into one sample per (text, aspect) pair.

    Every item must provide "ID" and "Text". The first annotation field found,
    checked in the order below, decides how the item is expanded:

    * "Quadruplet", "Triplet" or "Aspect_VA": a list of dicts, each with an
      "Aspect" and a "VA" string of the form "<valence>#<arousal>".
    * "Aspect": a list of aspect values without scores; "valence" and
      "arousal" are set to None.

    Args:
        raw_data: Iterable of dict records as described above.

    Returns:
        list[dict]: Samples with keys "id", "text", "aspect", "valence",
        "arousal", in input order.

    Raises:
        ValueError: If an item has none of the known annotation fields, or a
            "VA" value cannot be parsed as two "#"-separated floats. The
            message names the offending item ID.
    """
    # All three labelled layouts share the same per-annotation structure, so
    # they are handled by one code path; order preserves the original priority.
    labelled_fields = ("Quadruplet", "Triplet", "Aspect_VA")

    samples = []

    for item in raw_data:
        sample_id = item["ID"]
        text = item["Text"]

        field = next((name for name in labelled_fields if name in item), None)

        if field is not None:
            for annotation in item[field]:
                va = annotation["VA"]
                try:
                    valence, arousal = (float(part) for part in va.split("#"))
                except ValueError as exc:
                    # Wrong number of parts or non-numeric part: report which item.
                    raise ValueError(
                        f"Malformed VA value {va!r} in item {sample_id}"
                    ) from exc
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": annotation["Aspect"],
                    "valence": valence,
                    "arousal": arousal
                })

        elif "Aspect" in item:
            # Unlabelled layout: aspects only, scores unknown
            for aspect in item["Aspect"]:
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": aspect,
                    "valence": None,
                    "arousal": None
                })

        else:
            raise ValueError(f"Unknown annotation format in item {sample_id}")

    return samples


In [8]:
from sklearn.model_selection import train_test_split


def split_train_dev(samples, dev_ratio=0.1, seed=42):
    """Split samples into train and dev sets without sharing a sentence.

    Samples are aspect-level, so one source sentence can yield several
    samples with the same "id". A plain per-sample shuffle can place the same
    sentence in both train and dev, which makes the dev loss optimistic. When
    every sample exposes an "id", the split is therefore done on whole ids.

    Args:
        samples: Sequence of samples; dicts with an "id" key are grouped by it.
        dev_ratio: Share held out for dev. With grouping this is the share of
            distinct ids, so the share of samples is only approximately equal.
        seed: Random state for the shuffle.

    Returns:
        tuple[list, list]: (train_samples, dev_samples).
    """
    # Local import so this block does not depend on edits to the import cell.
    from sklearn.model_selection import GroupShuffleSplit

    try:
        groups = [sample["id"] for sample in samples]
    except (KeyError, TypeError, IndexError):
        # Samples carry no usable "id": keep the original per-sample split.
        train_samples, dev_samples = train_test_split(
            samples,
            test_size=dev_ratio,
            random_state=seed,
            shuffle=True
        )
        return train_samples, dev_samples

    splitter = GroupShuffleSplit(n_splits=1, test_size=dev_ratio, random_state=seed)
    # Split positions rather than the dicts themselves; only the groups matter.
    positions = list(range(len(samples)))
    train_idx, dev_idx = next(splitter.split(positions, groups=groups))

    train_samples = [samples[i] for i in train_idx]
    dev_samples = [samples[i] for i in dev_idx]
    return train_samples, dev_samples


In [9]:
# Training data: parse, flatten to aspect level, then hold out a dev split
raw = read_jsonl("eng_laptop_train_alltasks.jsonl")
samples = extract_aspect_level_samples(raw)
train_samples, dev_samples = split_train_dev(samples)

# Test data for task 1, flattened the same way
test_raw = read_jsonl("eng_laptop_test_task1.jsonl")
test_samples = extract_aspect_level_samples(test_raw)

# Preview the first few training samples. A bare expression is only displayed
# when it is the last statement of the cell, so it has to sit at the end.
samples[:10]


In [10]:
def build_model_input(text, aspect):
    """
    Join a sentence and an aspect into one "[SEP]"-delimited input string.

    An aspect equal to "NULL" is replaced by "overall". Both parts are
    stripped of surrounding whitespace before being joined.
    """
    target = "overall" if aspect == "NULL" else aspect
    return " [SEP] ".join((text.strip(), target.strip()))


In [11]:
from transformers import BertTokenizer

# Load the tokenizer for the "bert-base-uncased" checkpoint, the same
# checkpoint name the model's BERT backbone is loaded from later in this file.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_sample(text, aspect, max_length=128):
    """
    Encode one (text, aspect) pair with the module-level tokenizer.

    The pair is first merged by build_model_input, then padded/truncated to
    max_length. Returns a dict with "input_ids" and "attention_mask", each
    with the leading batch dimension squeezed away.
    """
    encoded = tokenizer(
        build_model_input(text, aspect),
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # return_tensors="pt" yields a leading batch axis of size 1; drop it
    return {
        field: encoded[field].squeeze(0)
        for field in ("input_ids", "attention_mask")
    }




In [12]:
# Sanity check: tokenize the first extracted sample and inspect the result
sample = samples[0]
tokens = tokenize_sample(sample["text"], sample["aspect"])

# input_ids is padded to max_length by tokenize_sample
print(tokens["input_ids"].shape)
# Sum of the attention mask - presumably the number of non-padding tokens
print(tokens["attention_mask"].sum())


torch.Size([128])
tensor(30)


In [13]:
import torch
from torch.utils.data import Dataset


class DimASRDataset(Dataset):
    """Dataset over aspect-level samples that tokenizes on access.

    Each sample is a dict with "text" and "aspect", plus "valence" and
    "arousal" when labels are used. Items are dicts with "input_ids" and
    "attention_mask"; unless is_test is set, a float "labels" tensor holding
    [valence, arousal] is added.
    """

    def __init__(self, samples, tokenizer, max_length=128, is_test=False):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        record = self.samples[idx]

        # Merge sentence and aspect into one string, then pad/truncate it
        encoded = self.tokenizer(
            build_model_input(record["text"], record["aspect"]),
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop the size-1 batch axis added by return_tensors="pt"
        features = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

        # Test items carry no labels
        if self.is_test:
            return features

        features["labels"] = torch.tensor(
            [record["valence"], record["arousal"]],
            dtype=torch.float,
        )
        return features


In [14]:
from torch.utils.data import DataLoader

# Wrap each split in a dataset; only the test split is built without labels
train_dataset = DimASRDataset(train_samples, tokenizer, is_test=False)
dev_dataset = DimASRDataset(dev_samples, tokenizer, is_test=False)
test_dataset = DimASRDataset(test_samples, tokenizer, is_test=True)

# Only the training loader shuffles; dev and test keep dataset order
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [15]:
# Pull a single batch from the training loader to check the collated output
batch = next(iter(train_loader))
print(batch.keys())
# Shapes of the stacked token ids and of the stacked label tensors
print(batch["input_ids"].shape)
print(batch["labels"].shape)


dict_keys(['input_ids', 'attention_mask', 'labels'])
torch.Size([16, 128])
torch.Size([16, 2])


In [16]:
import torch
import torch.nn as nn
from transformers import BertModel


class DimASRModel(nn.Module):
    """BERT encoder followed by a linear head with two regression outputs."""

    def __init__(self):
        super().__init__()

        # Pretrained encoder; its hidden size sets the head's input width
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Two outputs per example: valence and arousal
        self.regressor = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return predictions, or (loss, predictions) when labels are given.

        The loss is the mean squared error between predictions and labels.
        """
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Hidden state at sequence position 0 is used as the sentence summary
        first_token = encoded.last_hidden_state[:, 0, :]
        predictions = self.regressor(first_token)

        if labels is None:
            return predictions

        loss = nn.functional.mse_loss(predictions, labels)
        return loss, predictions


In [17]:
# Seed PyTorch's global RNG before the model is built so that the regression
# head's initial weights and the training loader's shuffle order are repeatable
# after Restart & Run All. 42 matches split_train_dev's default seed.
torch.manual_seed(42)

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DimASRModel().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

def train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
            returning ``(loss, predictions)``; the loss is expected to be a
            mean over the batch.
        dataloader: Iterable of dict batches with "input_ids",
            "attention_mask" and "labels" tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        float: Per-sample mean training loss for the epoch, or NaN when the
        dataloader yields no batches.
    """
    model.train()
    loss_sum = 0.0
    sample_count = 0

    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        loss, _ = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a smaller final batch does not
        # count as much as a full one.
        batch_size = labels.size(0)
        loss_sum += loss.item() * batch_size
        sample_count += batch_size

    # An empty loader has no mean; report NaN instead of dividing by zero.
    if sample_count == 0:
        return float("nan")

    return loss_sum / sample_count


Loading weights: 100%|██████████| 199/199 [00:00<00:00, 259.41it/s, Materializing param=pooler.dense.weight]                               
[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [18]:
def evaluate(model, dataloader, device):
    """Compute the mean loss of ``model`` over ``dataloader`` without training.

    The model is put in eval mode and gradients are disabled.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
            returning ``(loss, predictions)``; the loss is expected to be a
            mean over the batch.
        dataloader: Iterable of dict batches with "input_ids",
            "attention_mask" and "labels" tensors.
        device: Device the batch tensors are moved to.

    Returns:
        float: Per-sample mean loss, or NaN when the dataloader yields no
        batches.
    """
    model.eval()
    loss_sum = 0.0
    sample_count = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            loss, _ = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # Weight each batch mean by its size so a smaller final batch
            # does not skew the reported loss.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            sample_count += batch_size

    # An empty loader has no mean; report NaN instead of dividing by zero.
    if sample_count == 0:
        return float("nan")

    return loss_sum / sample_count


In [None]:
epochs = 3

for epoch in range(epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer, device)
    val_loss = evaluate(model, dev_loader, device)

    # Three-line report per epoch: 1-based epoch number, then both losses
    print(
        f"Epoch {epoch+1}",
        f"Train Loss: {train_loss:.4f}",
        f"Dev Loss:   {val_loss:.4f}",
        sep="\n",
    )
    