In [1]:
import torch
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

!pip install -q transformers datasets accelerate

import json, pandas as pd, numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
class Config:
    TRAIN_PATH = "eng_laptop_train_alltasks.jsonl"
    DEV_PATH = "eng_laptop_dev_task2.jsonl"

    MODEL_NAME = "t5-small"

    MAX_INPUT_LEN = 128   # Reduced from 256
    MAX_OUTPUT_LEN = 256  # Reduced from 512

    BATCH_SIZE = 4

    EPOCHS = 3

    LR = 1e-4
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01
    OUTPUT_FILE = "submission_task2_colab.jsonl"
    MODEL_SAVE_PATH = "best_model_task2.pt"
    DEVICE = DEVICE
    SEED = 42

config = Config()
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

print("=" * 60)
print("=" * 60)
print(f"Model: {config.MODEL_NAME} (60M params)")
print(f"Batch Size: {config.BATCH_SIZE}")
print(f"Epochs: {config.EPOCHS}")
print(f"Max Lengths: Input={config.MAX_INPUT_LEN}, Output={config.MAX_OUTPUT_LEN}")
print("=" * 60)

GPU: Tesla T4
Model: t5-small (60M params)
Batch Size: 4
Epochs: 3
Max Lengths: Input=128, Output=256


In [2]:
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line.strip()) for line in f]

def convert_quadruplets_to_triplets(data):
    converted = []
    for item in data:
        new_item = {
            "ID": item["ID"],
            "Text": item["Text"],
            "Triplet": [{"Aspect": q["Aspect"], "Opinion": q["Opinion"], "VA": q["VA"]}
                       for q in item.get("Quadruplet", [])]
        }
        converted.append(new_item)
    return converted

train_raw = convert_quadruplets_to_triplets(load_jsonl(config.TRAIN_PATH))
dev_raw = load_jsonl(config.DEV_PATH)

print(f"Train: {len(train_raw)} | Triplets: {sum(len(i['Triplet']) for i in train_raw)}")
print(f"Dev: {len(dev_raw)}")

Train: 4076 | Triplets: 5773
Dev: 200


In [3]:
def triplets_to_text(triplets):
    if not triplets:
        return "none"
    return " [SEP] ".join([f"{t['Aspect']} | {t['Opinion']} | {t['VA']}" for t in triplets])

def text_to_triplets(text):
    if text.strip().lower() == "none":
        return []
    triplets = []
    for part in text.split("[SEP]"):
        components = [c.strip() for c in part.split("|")]
        if len(components) >= 3 and "#" in components[2]:
            try:
                v, a = map(float, components[2].split("#"))
                triplets.append({
                    "Aspect": components[0],
                    "Opinion": components[1],
                    "VA": f"{np.clip(v, 1, 9):.2f}#{np.clip(a, 1, 9):.2f}"
                })
            except:
                pass
    return triplets

class TripletDataset(Dataset):
    def __init__(self, data, tokenizer, max_in, max_out):
        self.data = data
        self.tokenizer = tokenizer
        self.max_in = max_in
        self.max_out = max_out

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inp = self.tokenizer(
            f"Extract aspect, opinion, and VA triplets: {item['Text']}",
            max_length=self.max_in, padding="max_length", truncation=True, return_tensors="pt"
        )
        out = self.tokenizer(
            triplets_to_text(item.get("Triplet", [])),
            max_length=self.max_out, padding="max_length", truncation=True, return_tensors="pt"
        )
        return {
            "input_ids": inp["input_ids"].squeeze(0),
            "attention_mask": inp["attention_mask"].squeeze(0),
            "labels": out["input_ids"].squeeze(0)
        }

print("Functions ready")

Functions ready


In [4]:
tokenizer = T5Tokenizer.from_pretrained(config.MODEL_NAME, legacy=False)
train_data, val_data = train_test_split(train_raw, test_size=0.1, random_state=42, shuffle=True)

train_dataset = TripletDataset(train_data, tokenizer, config.MAX_INPUT_LEN, config.MAX_OUTPUT_LEN)
val_dataset = TripletDataset(val_data, tokenizer, config.MAX_INPUT_LEN, config.MAX_OUTPUT_LEN)

train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=0)

print(f"Train: {len(train_loader)} batches | Val: {len(val_loader)} batches")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Train: 917 batches | Val: 102 batches


In [5]:
print(f"Loading {config.MODEL_NAME}...")
model = T5ForConditionalGeneration.from_pretrained(config.MODEL_NAME)
model = model.to(config.DEVICE)

total = sum(p.numel() for p in model.parameters())
print(f"Model loaded: {total:,} params (~{total*4/1e9:.2f} GB)")

Loading t5-small...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded: 60,506,624 params (~0.24 GB)


In [6]:
def train_epoch(model, loader, opt, sched, device, tok):
    model.train()
    total = 0
    for batch in tqdm(loader, desc="Training"):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        labels[labels == tok.pad_token_id] = -100

        loss = model(input_ids=ids, attention_mask=mask, labels=labels).loss
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        sched.step()
        total += loss.item()
    return total / len(loader)

def evaluate(model, loader, device, tok):
    model.eval()
    total = 0
    with torch.no_grad():
        for batch in tqdm(loader, desc="Eval", leave=False):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            labels[labels == tok.pad_token_id] = -100
            total += model(input_ids=ids, attention_mask=mask, labels=labels).loss.item()
    return total / len(loader)

print("Functions ready")

Functions ready


In [7]:
print("TRAINING")
optimizer = torch.optim.AdamW(model.parameters(), lr=config.LR, weight_decay=config.WEIGHT_DECAY)
steps = len(train_loader) * config.EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, int(0.1*steps), steps)

best = float("inf")
history = []

for epoch in range(config.EPOCHS):
    print(f"\nEpoch {epoch+1}/{config.EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, config.DEVICE, tokenizer)
    val_loss = evaluate(model, val_loader, config.DEVICE, tokenizer)

    history.append({"epoch": epoch+1, "train_loss": train_loss, "val_loss": val_loss})
    print(f"Train: {train_loss:.4f} | Val: {val_loss:.4f}")

    if val_loss < best:
        best = val_loss
        torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
        print(f"Saved! Best: {best:.4f}")

print(f"\nDone! Best: {best:.4f}")
print(pd.DataFrame(history))

TRAINING

Epoch 1/3


Training:   0%|          | 0/917 [00:00<?, ?it/s]

Eval:   0%|          | 0/102 [00:00<?, ?it/s]

Train: 1.9140 | Val: 0.8892
Saved! Best: 0.8892

Epoch 2/3


Training:   0%|          | 0/917 [00:00<?, ?it/s]

Eval:   0%|          | 0/102 [00:00<?, ?it/s]

Train: 0.9346 | Val: 0.7984
Saved! Best: 0.7984

Epoch 3/3


Training:   0%|          | 0/917 [00:00<?, ?it/s]

Eval:   0%|          | 0/102 [00:00<?, ?it/s]

Train: 0.8549 | Val: 0.7699
Saved! Best: 0.7699

Done! Best: 0.7699
   epoch  train_loss  val_loss
0      1    1.914011  0.889156
1      2    0.934643  0.798398
2      3    0.854881  0.769906


In [8]:
def generate_triplets(text, model, tok, device):
    model.eval()
    inp = tok(f"Extract aspect, opinion, and VA triplets: {text}",
              max_length=config.MAX_INPUT_LEN, return_tensors="pt")
    with torch.no_grad():
        out = model.generate(inp["input_ids"].to(device),
                            max_length=config.MAX_OUTPUT_LEN, num_beams=4)
    return text_to_triplets(tok.decode(out[0], skip_special_tokens=True))

model.load_state_dict(torch.load(config.MODEL_SAVE_PATH))
predictions = [{"ID": item["ID"], "Triplet": generate_triplets(item["Text"], model, tokenizer, config.DEVICE)}
               for item in tqdm(dev_raw, desc="Predicting")]

with open(config.OUTPUT_FILE, "w") as f:
    for p in predictions:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print(f"Saved: {config.OUTPUT_FILE}")
print(f"Triplets: {sum(len(p['Triplet']) for p in predictions)}")

from google.colab import files
files.download(config.OUTPUT_FILE)
print("Downloaded!")

Predicting:   0%|          | 0/200 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Saved: submission_task2_colab.jsonl
Triplets: 340


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded!
