In [1]:
"""
DimABSA - Subtask 3: Dimensional Aspect-Category Sentiment Quadruplet Extraction
================================================================================
Task: Extract (Aspect, Category, Opinion, VA) quadruplets from text
Example: "battery is great" → ("battery", "LAPTOP#BATTERY", "great", "7.50#6.80")
"""

import torch
print("CHECKING ENVIRONMENT")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print("GPU is ready!")
else:
    print("WARNING: No GPU detected!")
    print("Go to: Runtime → Change runtime type → Select T4 GPU → Save")

CHECKING ENVIRONMENT
PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: Tesla T4
GPU is ready!


In [2]:
# ============================================================================
# CELL 2: Install & Import
# ============================================================================

print("Installing packages...")
!pip install -q transformers datasets accelerate

import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("ALL IMPORTS LOADED")
print(f"Using device: {DEVICE}")

Installing packages...
ALL IMPORTS LOADED
Using device: cuda


In [3]:
# ============================================================================
# CELL 3: Configuration for Subtask 3
# ============================================================================

class Config:
    # Data files
    TRAIN_PATH = "eng_laptop_train_alltasks.jsonl"
    DEV_PATH = "eng_laptop_dev_task3.jsonl"

    # Model - Using T5-small
    MODEL_NAME = "t5-small"

    # Sequence lengths
    MAX_INPUT_LEN = 160   # Input text length
    MAX_OUTPUT_LEN = 320  # Output quadruplets (longer than triplets!)

    # Training parameters
    BATCH_SIZE = 4
    EPOCHS = 4  # Slightly more for quadruplets
    LR = 1e-4
    WARMUP_RATIO = 0.1
    WEIGHT_DECAY = 0.01

    # Output
    OUTPUT_FILE = "submission_task3_colab.jsonl"
    MODEL_SAVE_PATH = "best_model_task3.pt"

    # System
    DEVICE = DEVICE
    SEED = 42

config = Config()

# Set seeds
np.random.seed(config.SEED)
torch.manual_seed(config.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config.SEED)

print("CONFIGURATION - SUBTASK 3")
print(f"Model: {config.MODEL_NAME}")
print(f"Device: {config.DEVICE}")
print(f"Batch Size: {config.BATCH_SIZE}")
print(f"Epochs: {config.EPOCHS}")
print(f"Max Input: {config.MAX_INPUT_LEN}")
print(f"Max Output: {config.MAX_OUTPUT_LEN}")
print("Task: Extract (Aspect, Category, Opinion, VA) quadruplets")

CONFIGURATION - SUBTASK 3
Model: t5-small
Device: cuda
Batch Size: 4
Epochs: 4
Max Input: 160
Max Output: 320
Task: Extract (Aspect, Category, Opinion, VA) quadruplets


In [4]:
# ============================================================================
# CELL 4: Load Data
# ============================================================================

def load_jsonl(path):
    """Load JSONL file"""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line.strip()) for line in f]

print("LOADING DATA")

# Load data files
train_raw = load_jsonl(config.TRAIN_PATH)
dev_raw = load_jsonl(config.DEV_PATH)

print(f"✓ Loaded {len(train_raw)} training instances")
print(f"✓ Loaded {len(dev_raw)} dev instances")

# Count quadruplets
total_quads = sum(len(item.get("Quadruplet", [])) for item in train_raw)
print(f"\nTraining Statistics:")
print(f"  Total quadruplets: {total_quads}")
print(f"  Average per instance: {total_quads/len(train_raw):.2f}")

# Show example
print("SAMPLE TRAINING DATA")
example = train_raw[0]
print(f"ID: {example['ID']}")
print(f"Text: {example['Text'][:100]}...")
print(f"\nQuadruplets:")
for i, q in enumerate(example.get("Quadruplet", [])[:3], 1):
    print(f"  {i}. Aspect: {q['Aspect']}")
    print(f"     Category: {q['Category']}")
    print(f"     Opinion: {q['Opinion']}")
    print(f"     VA: {q['VA']}")

print("SAMPLE DEV DATA (to predict)")
dev_example = dev_raw[0]
print(f"ID: {dev_example['ID']}")
print(f"Text: {dev_example['Text']}")

LOADING DATA
✓ Loaded 4076 training instances
✓ Loaded 200 dev instances

Training Statistics:
  Total quadruplets: 5773
  Average per instance: 1.42
SAMPLE TRAINING DATA
ID: laptop_quad_dev_1
Text: this unit is ` ` pretty ` ` and stylish , so my high school daughter was attracted to it for that re...

Quadruplets:
  1. Aspect: unit
     Category: LAPTOP#DESIGN_FEATURES
     Opinion: pretty
     VA: 7.12#7.12
  2. Aspect: unit
     Category: LAPTOP#DESIGN_FEATURES
     Opinion: stylish
     VA: 7.12#7.12
SAMPLE DEV DATA (to predict)
ID: lap26_asqp_dev_1
Text: Great perforemce at a great price


In [5]:
# ============================================================================
# CELL 5: Preprocessing Functions for Quadruplets
# ============================================================================

def quadruplets_to_text(quadruplets):
    """
    Convert quadruplets to text format for T5

    Format: "aspect | category | opinion | VA"
    Multiple quadruplets separated by [SEP]

    Example:
    Input: [{"Aspect": "battery", "Category": "LAPTOP#BATTERY",
             "Opinion": "great", "VA": "7.50#6.80"}]
    Output: "battery | LAPTOP#BATTERY | great | 7.50#6.80"
    """
    if not quadruplets:
        return "none"

    quad_texts = []
    for q in quadruplets:
        aspect = q.get("Aspect", "NULL")
        category = q.get("Category", "NULL")
        opinion = q.get("Opinion", "NULL")
        va = q.get("VA", "5.00#5.00")
        quad_texts.append(f"{aspect} | {category} | {opinion} | {va}")

    return " [SEP] ".join(quad_texts)

def text_to_quadruplets(text):
    """
    Convert T5 output back to quadruplet list

    Input: "battery | LAPTOP#BATTERY | great | 7.50#6.80 [SEP] ..."
    Output: [{"Aspect": "battery", "Category": "LAPTOP#BATTERY", ...}]
    """
    if text.strip().lower() == "none":
        return []

    quadruplets = []
    parts = text.split("[SEP]")

    for part in parts:
        part = part.strip()
        if not part:
            continue

        # Split by | to get components
        components = [c.strip() for c in part.split("|")]

        # Need at least 4 components: aspect, category, opinion, VA
        if len(components) >= 4:
            aspect = components[0]
            category = components[1]
            opinion = components[2]
            va = components[3]

            # Validate VA format
            if "#" in va:
                try:
                    v_str, a_str = va.split("#")
                    v = float(v_str)
                    a = float(a_str)

                    # Clip to valid range
                    v = np.clip(v, 1.0, 9.0)
                    a = np.clip(a, 1.0, 9.0)

                    quadruplets.append({
                        "Aspect": aspect,
                        "Category": category,
                        "Opinion": opinion,
                        "VA": f"{v:.2f}#{a:.2f}"
                    })
                except:
                    continue

    return quadruplets

# Test the functions
print("TESTING CONVERSION FUNCTIONS")

test_quads = [
    {
        "Aspect": "battery",
        "Category": "LAPTOP#BATTERY",
        "Opinion": "great",
        "VA": "7.50#6.80"
    },
    {
        "Aspect": "screen",
        "Category": "LAPTOP#DISPLAY",
        "Opinion": "terrible",
        "VA": "2.30#7.20"
    }
]

test_text = quadruplets_to_text(test_quads)
test_back = text_to_quadruplets(test_text)

print(f"Original quadruplets:")
for q in test_quads:
    print(f"  ({q['Aspect']}, {q['Category']}, {q['Opinion']}, {q['VA']})")

print(f"\nConverted to text:\n{test_text}")

print(f"\nConverted back:")
for q in test_back:
    print(f"  ({q['Aspect']}, {q['Category']}, {q['Opinion']}, {q['VA']})")

print("\nConversion functions working correctly!")

TESTING CONVERSION FUNCTIONS
Original quadruplets:
  (battery, LAPTOP#BATTERY, great, 7.50#6.80)
  (screen, LAPTOP#DISPLAY, terrible, 2.30#7.20)

Converted to text:
battery | LAPTOP#BATTERY | great | 7.50#6.80 [SEP] screen | LAPTOP#DISPLAY | terrible | 2.30#7.20

Converted back:
  (battery, LAPTOP#BATTERY, great, 7.50#6.80)
  (screen, LAPTOP#DISPLAY, terrible, 2.30#7.20)

Conversion functions working correctly!


In [6]:
# ============================================================================
# CELL 5: Preprocessing Functions for Quadruplets
# ============================================================================

def quadruplets_to_text(quadruplets):
    """
    Convert quadruplets to text format for T5

    Format: "aspect | category | opinion | VA"
    Multiple quadruplets separated by [SEP]

    Example:
    Input: [{"Aspect": "battery", "Category": "LAPTOP#BATTERY",
             "Opinion": "great", "VA": "7.50#6.80"}]
    Output: "battery | LAPTOP#BATTERY | great | 7.50#6.80"
    """
    if not quadruplets:
        return "none"

    quad_texts = []
    for q in quadruplets:
        aspect = q.get("Aspect", "NULL")
        category = q.get("Category", "NULL")
        opinion = q.get("Opinion", "NULL")
        va = q.get("VA", "5.00#5.00")
        quad_texts.append(f"{aspect} | {category} | {opinion} | {va}")

    return " [SEP] ".join(quad_texts)

def text_to_quadruplets(text):
    """
    Convert T5 output back to quadruplet list

    Input: "battery | LAPTOP#BATTERY | great | 7.50#6.80 [SEP] ..."
    Output: [{"Aspect": "battery", "Category": "LAPTOP#BATTERY", ...}]
    """
    if text.strip().lower() == "none":
        return []

    quadruplets = []
    parts = text.split("[SEP]")

    for part in parts:
        part = part.strip()
        if not part:
            continue

        # Split by | to get components
        components = [c.strip() for c in part.split("|")]

        # Need at least 4 components: aspect, category, opinion, VA
        if len(components) >= 4:
            aspect = components[0]
            category = components[1]
            opinion = components[2]
            va = components[3]

            # Validate VA format
            if "#" in va:
                try:
                    v_str, a_str = va.split("#")
                    v = float(v_str)
                    a = float(a_str)

                    # Clip to valid range
                    v = np.clip(v, 1.0, 9.0)
                    a = np.clip(a, 1.0, 9.0)

                    quadruplets.append({
                        "Aspect": aspect,
                        "Category": category,
                        "Opinion": opinion,
                        "VA": f"{v:.2f}#{a:.2f}"
                    })
                except:
                    continue

    return quadruplets

# Test the functions
print("TESTING CONVERSION FUNCTIONS")

test_quads = [
    {
        "Aspect": "battery",
        "Category": "LAPTOP#BATTERY",
        "Opinion": "great",
        "VA": "7.50#6.80"
    },
    {
        "Aspect": "screen",
        "Category": "LAPTOP#DISPLAY",
        "Opinion": "terrible",
        "VA": "2.30#7.20"
    }
]

test_text = quadruplets_to_text(test_quads)
test_back = text_to_quadruplets(test_text)

print(f"Original quadruplets:")
for q in test_quads:
    print(f"  ({q['Aspect']}, {q['Category']}, {q['Opinion']}, {q['VA']})")

print(f"\nConverted to text:\n{test_text}")

print(f"\nConverted back:")
for q in test_back:
    print(f"  ({q['Aspect']}, {q['Category']}, {q['Opinion']}, {q['VA']})")

print("\nConversion functions working correctly!")

TESTING CONVERSION FUNCTIONS
Original quadruplets:
  (battery, LAPTOP#BATTERY, great, 7.50#6.80)
  (screen, LAPTOP#DISPLAY, terrible, 2.30#7.20)

Converted to text:
battery | LAPTOP#BATTERY | great | 7.50#6.80 [SEP] screen | LAPTOP#DISPLAY | terrible | 2.30#7.20

Converted back:
  (battery, LAPTOP#BATTERY, great, 7.50#6.80)
  (screen, LAPTOP#DISPLAY, terrible, 2.30#7.20)

Conversion functions working correctly!


In [7]:
# ============================================================================
# CELL 6: Dataset Class for Quadruplets
# ============================================================================

class QuadrupletDataset(Dataset):
    """
    PyTorch Dataset for T5 quadruplet extraction

    Converts data to T5 format:
    Input: "Extract aspect, category, opinion, and VA quadruplets: <sentence>"
    Output: "aspect1 | category1 | opinion1 | VA1 [SEP] aspect2 | ..."
    """

    def __init__(self, data, tokenizer, max_input_len=160, max_output_len=320):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Input: Task instruction + sentence
        input_text = f"Extract aspect, category, opinion, and VA quadruplets: {item['Text']}"

        # Output: Quadruplets in text format
        quadruplets = item.get("Quadruplet", [])
        output_text = quadruplets_to_text(quadruplets)

        # Tokenize input
        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_input_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize output
        output_encoding = self.tokenizer(
            output_text,
            max_length=self.max_output_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": input_encoding["input_ids"].squeeze(0),
            "attention_mask": input_encoding["attention_mask"].squeeze(0),
            "labels": output_encoding["input_ids"].squeeze(0)
        }

print("DATASET CLASS CREATED")
print("QuadrupletDataset class ready")
print("Handles tokenization for T5 quadruplet extraction")

DATASET CLASS CREATED
QuadrupletDataset class ready
Handles tokenization for T5 quadruplet extraction


In [8]:
# ============================================================================
# CELL 7: Create DataLoaders
# ============================================================================

print("CREATING DATALOADERS")

print("Loading T5 tokenizer...")
tokenizer = T5Tokenizer.from_pretrained(config.MODEL_NAME, legacy=False)
print("Tokenizer loaded")

# Split training data into train and validation (90/10)
print("\nSplitting data...")
train_data, val_data = train_test_split(
    train_raw,
    test_size=0.1,
    random_state=config.SEED,
    shuffle=True
)

print(f"Train: {len(train_data)} instances")
print(f"Val: {len(val_data)} instances")
print(f"Dev: {len(dev_raw)} instances")

# Create datasets
print("\nCreating datasets...")
train_dataset = QuadrupletDataset(
    train_data,
    tokenizer,
    config.MAX_INPUT_LEN,
    config.MAX_OUTPUT_LEN
)
val_dataset = QuadrupletDataset(
    val_data,
    tokenizer,
    config.MAX_INPUT_LEN,
    config.MAX_OUTPUT_LEN
)

print(f"Train dataset: {len(train_dataset)} samples")
print(f"Val dataset: {len(val_dataset)} samples")

# Create dataloaders
print("\nCreating dataloaders...")
train_loader = DataLoader(
    train_dataset,
    batch_size=config.BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_loader = DataLoader(
    val_dataset,
    batch_size=config.BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

print("\nDataLoaders ready!")

CREATING DATALOADERS
Loading T5 tokenizer...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Tokenizer loaded

Splitting data...
Train: 3668 instances
Val: 408 instances
Dev: 200 instances

Creating datasets...
Train dataset: 3668 samples
Val dataset: 408 samples

Creating dataloaders...
Train batches: 917
Val batches: 102

DataLoaders ready!


In [9]:
# ============================================================================
# CELL 8: Load T5 Model
# ============================================================================

print("LOADING T5 MODEL")
print(f"Model: {config.MODEL_NAME}")
print("Loading...")

model = T5ForConditionalGeneration.from_pretrained(config.MODEL_NAME)
model = model.to(config.DEVICE)

total_params = sum(p.numel() for p in model.parameters())

print("\nModel loaded successfully!")
print("MODEL STATISTICS")
print(f"Total parameters: {total_params:,}")
print(f"Model size: ~{total_params * 4 / 1e9:.2f} GB")
print(f"Device: {config.DEVICE}")

LOADING T5 MODEL
Model: t5-small
Loading...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Model loaded successfully!
MODEL STATISTICS
Total parameters: 60,506,624
Model size: ~0.24 GB
Device: cuda


In [10]:
# ============================================================================
# CELL 9: Training Functions
# ============================================================================

def train_epoch(model, loader, optimizer, scheduler, device, tokenizer):
    """Train for one epoch"""
    model.train()
    total_loss = 0

    progress_bar = tqdm(loader, desc="Training")

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Replace padding token id with -100 (ignored in loss)
        labels[labels == tokenizer.pad_token_id] = -100

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return total_loss / len(loader)

def evaluate(model, loader, device, tokenizer):
    """Evaluate on validation set"""
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            labels[labels == tokenizer.pad_token_id] = -100

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            total_loss += outputs.loss.item()

    return total_loss / len(loader)

print("TRAINING FUNCTIONS READY")
print("train_epoch() - Trains model for one epoch")
print("evaluate() - Evaluates on validation set")

TRAINING FUNCTIONS READY
train_epoch() - Trains model for one epoch
evaluate() - Evaluates on validation set


In [11]:
# ============================================================================
# CELL 10: TRAINING LOOP
# ============================================================================

print("STARTING TRAINING - SUBTASK 3")
print(f"Epochs: {config.EPOCHS}")
print(f"Train batches per epoch: {len(train_loader)}")
print(f"Expected time per epoch: ~2-3 minutes")
print(f"Total expected time: ~8-12 minutes")

# Setup optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.LR,
    weight_decay=config.WEIGHT_DECAY
)

# Setup scheduler with warmup
num_training_steps = len(train_loader) * config.EPOCHS
num_warmup_steps = int(config.WARMUP_RATIO * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

print(f"Optimizer: AdamW (lr={config.LR})")
print(f"Total training steps: {num_training_steps}")
print(f"Warmup steps: {num_warmup_steps}")

# Training loop
best_val_loss = float("inf")
history = []

for epoch in range(config.EPOCHS):
    print(f"EPOCH {epoch + 1}/{config.EPOCHS}")

    # Train
    train_loss = train_epoch(
        model,
        train_loader,
        optimizer,
        scheduler,
        config.DEVICE,
        tokenizer
    )

    # Evaluate
    val_loss = evaluate(model, val_loader, config.DEVICE, tokenizer)

    # Log results
    history.append({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "val_loss": val_loss
    })

    print(f"\n Results:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss:   {val_loss:.4f}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), config.MODEL_SAVE_PATH)
        print(f" Best model saved! (Val Loss: {best_val_loss:.4f})")

print("\n" + "=" * 60)
print(" TRAINING COMPLETE!")
print(f"Best Validation Loss: {best_val_loss:.4f}")
print(f"Model saved to: {config.MODEL_SAVE_PATH}")

# Show training history
print("\n Training History:")
history_df = pd.DataFrame(history)
print(history_df.to_string(index=False))

STARTING TRAINING - SUBTASK 3
Epochs: 4
Train batches per epoch: 917
Expected time per epoch: ~2-3 minutes
Total expected time: ~8-12 minutes
Optimizer: AdamW (lr=0.0001)
Total training steps: 3668
Warmup steps: 366
EPOCH 1/4


Training:   0%|          | 0/917 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/102 [00:00<?, ?it/s]


 Results:
  Train Loss: 1.8610
  Val Loss:   0.6482
 Best model saved! (Val Loss: 0.6482)
EPOCH 2/4


Training:   0%|          | 0/917 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/102 [00:00<?, ?it/s]


 Results:
  Train Loss: 0.6905
  Val Loss:   0.5459
 Best model saved! (Val Loss: 0.5459)
EPOCH 3/4


Training:   0%|          | 0/917 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/102 [00:00<?, ?it/s]


 Results:
  Train Loss: 0.6078
  Val Loss:   0.5077
 Best model saved! (Val Loss: 0.5077)
EPOCH 4/4


Training:   0%|          | 0/917 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/102 [00:00<?, ?it/s]


 Results:
  Train Loss: 0.5717
  Val Loss:   0.5004
 Best model saved! (Val Loss: 0.5004)

 TRAINING COMPLETE!
Best Validation Loss: 0.5004
Model saved to: best_model_task3.pt

 Training History:
 epoch  train_loss  val_loss
     1    1.860960  0.648240
     2    0.690497  0.545938
     3    0.607789  0.507665
     4    0.571691  0.500385


In [12]:
# ============================================================================
# CELL 11: Generate Predictions on Dev Set
# ============================================================================

print("GENERATING PREDICTIONS")

def generate_quadruplets(text, model, tokenizer, device):
    """Generate quadruplets for a single text"""
    model.eval()

    input_text = f"Extract aspect, category, opinion, and VA quadruplets: {text}"

    input_encoding = tokenizer(
        input_text,
        max_length=config.MAX_INPUT_LEN,
        truncation=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model.generate(
            input_encoding["input_ids"].to(device),
            attention_mask=input_encoding["attention_mask"].to(device),
            max_length=config.MAX_OUTPUT_LEN,
            num_beams=5,  # Beam search for better quality
            early_stopping=True,
            no_repeat_ngram_size=2,
            length_penalty=1.0
        )

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text_to_quadruplets(output_text)

# Load best model
print("Loading best model...")
model.load_state_dict(torch.load(config.MODEL_SAVE_PATH))
model.eval()

# Generate predictions
print(f"Predicting on {len(dev_raw)} examples...")
predictions = []

for item in tqdm(dev_raw, desc="Predicting"):
    quadruplets = generate_quadruplets(item["Text"], model, tokenizer, config.DEVICE)
    predictions.append({
        "ID": item["ID"],
        "Quadruplet": quadruplets
    })

# Statistics
total_quads = sum(len(p["Quadruplet"]) for p in predictions)
empty = sum(1 for p in predictions if len(p["Quadruplet"]) == 0)

print("\n Prediction Statistics:")
print(f"  Total quadruplets: {total_quads}")
print(f"  Average per example: {total_quads/len(predictions):.2f}")
print(f"  Empty predictions: {empty} ({empty/len(predictions)*100:.1f}%)")

# Show sample predictions
print("\n Sample Predictions:")
for i, pred in enumerate(predictions[:5], 1):
    original = next(item for item in dev_raw if item["ID"] == pred["ID"])
    print(f"\n{i}. {pred['ID']}")
    print(f"   Text: {original['Text'][:60]}...")
    if pred["Quadruplet"]:
        for q in pred["Quadruplet"]:
            print(f"   → ({q['Aspect']}, {q['Category']}, {q['Opinion']}, {q['VA']})")
    else:
        print("   (No quadruplets)")

GENERATING PREDICTIONS
Loading best model...
Predicting on 200 examples...


Predicting:   0%|          | 0/200 [00:00<?, ?it/s]


 Prediction Statistics:
  Total quadruplets: 250
  Average per example: 1.25
  Empty predictions: 0 (0.0%)

 Sample Predictions:

1. lap26_asqp_dev_1
   Text: Great perforemce at a great price...
   → (perforemce, LAPTOP#GENERAL, great, 7.67#7.67)

2. lap26_asqp_dev_2
   Text: Very bright display , and wide color gamut...
   → (display, DISPLAY#DESIGN_FEATURES, very bright, 7.67#7.67)
   → (color gamut, LAPTOP#OPERATION_PERFORMANCE, wide, 7.00#8.00)

3. lap26_asqp_dev_3
   Text: Battery life is bad enough as it is...
   → (battery life, BATTERY#OPERATION_PERFORMANCE, bad, 3.50#6.50)

4. lap26_asqp_dev_4
   Text: The Chromebook is very clean looking and it doesn't weigh a ...
   → (Chromebook, LAPTOP#DESIGN_FEATURES, very clean, 7.67#7.67)

5. lap26_asqp_dev_5
   Text: This laptop's screen is also very bright and clear , brighte...
   → (screen, DISPLAY#DESIGN_FEATURES, very bright, 7.67#7.67)


In [13]:
# ============================================================================
# CELL 12: Save Submission & Download
# ============================================================================

print("SAVING SUBMISSION")

# Save to file
with open(config.OUTPUT_FILE, "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(json.dumps(pred, ensure_ascii=False) + "\n")

print(f" Saved: {config.OUTPUT_FILE}")

# Show first few lines
print("\n First 3 submission lines:")
with open(config.OUTPUT_FILE, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i < 3:
            data = json.loads(line)
            print(f"\n{i+1}. ID: {data['ID']}")
            for q in data["Quadruplet"]:
                print(f"   - ({q['Aspect']}, {q['Category']}, {q['Opinion']}, {q['VA']})")

# Validation
print("VALIDATING FORMAT")

errors = []
for pred in predictions:
    pred_id = pred["ID"]

    if "ID" not in pred:
        errors.append(f"{pred_id}: Missing ID")
    if "Quadruplet" not in pred:
        errors.append(f"{pred_id}: Missing Quadruplet")
        continue

    for j, q in enumerate(pred["Quadruplet"], 1):
        if "Aspect" not in q:
            errors.append(f"{pred_id}, Q{j}: Missing Aspect")
        if "Category" not in q:
            errors.append(f"{pred_id}, Q{j}: Missing Category")
        if "Opinion" not in q:
            errors.append(f"{pred_id}, Q{j}: Missing Opinion")
        if "VA" not in q:
            errors.append(f"{pred_id}, Q{j}: Missing VA")
            continue

        va = q["VA"]
        if "#" not in va:
            errors.append(f"{pred_id}, Q{j}: Bad VA format")
            continue

        try:
            v, a = map(float, va.split("#"))
            if not (1.0 <= v <= 9.0) or not (1.0 <= a <= 9.0):
                errors.append(f"{pred_id}, Q{j}: VA out of range")
        except:
            errors.append(f"{pred_id}, Q{j}: Invalid VA")

if errors:
    print("ERRORS:")
    for e in errors[:10]:
        print(f"  {e}")
else:
    print("ALL VALIDATIONS PASSED!")
    print("Format is correct")
    print("All required fields present")
    print("VA values in valid range")

# Download file
print("DOWNLOADING FILE")

from google.colab import files
files.download(config.OUTPUT_FILE)

print("SUBTASK 3 COMPLETE!")
print(f"Training complete! (Best Val Loss: {best_val_loss:.4f})")
print(f"Predictions generated! ({total_quads} quadruplets)")
print(f"File downloaded: {config.OUTPUT_FILE}")
print(f"Ready to submit to competition!")

SAVING SUBMISSION
 Saved: submission_task3_colab.jsonl

 First 3 submission lines:

1. ID: lap26_asqp_dev_1
   - (perforemce, LAPTOP#GENERAL, great, 7.67#7.67)

2. ID: lap26_asqp_dev_2
   - (display, DISPLAY#DESIGN_FEATURES, very bright, 7.67#7.67)
   - (color gamut, LAPTOP#OPERATION_PERFORMANCE, wide, 7.00#8.00)

3. ID: lap26_asqp_dev_3
   - (battery life, BATTERY#OPERATION_PERFORMANCE, bad, 3.50#6.50)
VALIDATING FORMAT
ALL VALIDATIONS PASSED!
Format is correct
All required fields present
VA values in valid range
DOWNLOADING FILE


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

SUBTASK 3 COMPLETE!
Training complete! (Best Val Loss: 0.5004)
Predictions generated! (250 quadruplets)
File downloaded: submission_task3_colab.jsonl
Ready to submit to competition!
