In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

In [None]:
def read_jsonl(path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The decoded documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. extra trailing newlines) are
            # not records; json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records


def extract_aspect_level_samples(raw_data):
    """Flatten raw records into one sample per (text, aspect) annotation.

    Each record must provide "ID" and "Text" plus one annotation key. The
    keys are checked in this order and only the first one present is used:

    * "Quadruplet", "Triplet", "Aspect_VA": an iterable of dicts, each with
      an "Aspect" entry and a "VA" string of the form "<valence>#<arousal>".
    * "Aspect": an iterable of aspect values without scores; the resulting
      samples carry ``None`` for both valence and arousal.

    Args:
        raw_data: Iterable of record dicts as described above.

    Returns:
        list[dict]: Samples with keys "id", "text", "aspect", "valence" and
        "arousal", in input order.

    Raises:
        ValueError: If a record has none of the annotation keys, or if a
            "VA" string does not split into exactly two float-parsable parts.
        KeyError: If "ID", "Text", "Aspect" or "VA" is missing where required.
    """
    # Annotation keys whose entries share the {"Aspect", "VA"} layout,
    # listed in lookup priority order.
    scored_keys = ("Quadruplet", "Triplet", "Aspect_VA")

    samples = []

    for item in raw_data:
        sample_id = item["ID"]
        text = item["Text"]

        scored_key = next((key for key in scored_keys if key in item), None)

        if scored_key is not None:
            for annotation in item[scored_key]:
                v, a = annotation["VA"].split("#")
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": annotation["Aspect"],
                    "valence": float(v),
                    "arousal": float(a)
                })

        # Aspect-only records: no scores available.
        elif "Aspect" in item:
            for aspect in item["Aspect"]:
                samples.append({
                    "id": sample_id,
                    "text": text,
                    "aspect": aspect,
                    "valence": None,
                    "arousal": None
                })

        else:
            raise ValueError(f"Unknown annotation format in item {sample_id}")

    return samples


In [None]:
def split_train_dev(samples, dev_ratio=0.1, seed=42):
    """Shuffle ``samples`` and divide them into a train part and a dev part.

    Args:
        samples: Sequence of samples to partition.
        dev_ratio: Forwarded to ``train_test_split`` as ``test_size``.
        seed: Forwarded to ``train_test_split`` as ``random_state`` so the
            partition is repeatable.

    Returns:
        tuple: ``(train_samples, dev_samples)``.

    NOTE(review): the split is made over individual samples. If several
    samples originate from the same source text they can end up on both
    sides of the split — confirm that this is intended.
    """
    split_options = {
        "test_size": dev_ratio,
        "random_state": seed,
        "shuffle": True,
    }
    train_part, dev_part = train_test_split(samples, **split_options)
    return train_part, dev_part

In [None]:
def build_model_input(text, aspect):
    """Join a text and an aspect into one "<text> [SEP] <aspect>" string.

    Surrounding whitespace is removed from both parts. The placeholder
    aspect "NULL" is replaced by the word "overall".

    Args:
        text: The sentence or review text.
        aspect: The aspect term, or the placeholder "NULL".

    Returns:
        str: The stripped text, the literal " [SEP] ", then the stripped
        (and possibly substituted) aspect.
    """
    aspect = aspect.strip()
    # Compare after stripping so a padded placeholder such as " NULL " is
    # still recognised instead of leaking into the model input verbatim.
    if aspect == "NULL":
        aspect = "overall"
    return text.strip() + " [SEP] " + aspect

In [None]:

# Notebook-level tokenizer loaded for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
class DimASRDataset(Dataset):
    """Map-style dataset that tokenises (text, aspect) samples on access.

    Args:
        samples: Sequence of dicts with "text" and "aspect" entries and,
            unless ``is_test`` is true, numeric "valence" and "arousal".
        tokenizer: Callable tokenizer accepting ``text`` / ``text_pair`` plus
            the ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors`` keyword arguments, and returning a mapping
            with "input_ids" and "attention_mask" tensors with a leading
            dimension of 1.
        max_length: Length every encoded example is padded/truncated to.
        is_test: When true, no "labels" entry is produced.
    """

    def __init__(self, samples, tokenizer, max_length=128, is_test=False):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Encode sample ``idx``.

        Returns:
            dict: "input_ids" and "attention_mask" with the leading dimension
            removed, plus a float "labels" tensor ``[valence, arousal]`` when
            ``is_test`` is false.
        """
        sample = self.samples[idx]

        text = sample["text"].strip()
        aspect = sample["aspect"].strip()
        # Same placeholder rule as build_model_input: "NULL" means the
        # sentence as a whole.
        if aspect == "NULL":
            aspect = "overall"

        # Encode text and aspect as a sequence pair rather than as one
        # pre-joined string. With a single string, truncation removes tokens
        # from the end, which is exactly where the aspect sits; with a pair
        # the tokenizer inserts the separator itself and truncation=True
        # shortens the longer segment first.
        encoding = self.tokenizer(
            text,
            text_pair=aspect,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        item = {
            # return_tensors="pt" yields a leading dimension of 1; drop it so
            # a DataLoader can stack examples into a batch.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0)
        }

        if not self.is_test:
            item["labels"] = torch.tensor(
                [sample["valence"], sample["arousal"]],
                dtype=torch.float
            )

        return item


In [None]:
# Read train.jsonl, flatten it to one sample per (text, aspect) annotation,
# then hold out part of those samples as a dev set using the defaults of
# split_train_dev (dev_ratio=0.1, seed=42).
train_raw = read_jsonl("train.jsonl")
all_samples = extract_aspect_level_samples(train_raw)
train_samples, dev_samples = split_train_dev(all_samples)
# test.jsonl goes through the same extraction. Whether its samples carry
# valence/arousal depends on which annotation key its records use
# (aspect-only records yield None for both).
test_raw = read_jsonl("test.jsonl")
test_samples = extract_aspect_level_samples(test_raw)