In [None]:
import sys
from pathlib import Path
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

import torch
from transformers import AutoTokenizer
import pandas as pd
from src.data_loader import load_dimabsa_dataset, create_dataloaders
from src.models.baselines import BERTDimABSA, RoBERTaDimABSA
from src.models.deberta_dimabsa import DeBERTaDimABSA, DeBERTaDimABSAConfig
from src.trainer import TrainingConfig, train_model
from src.lexicon import create_lexicon

# --- Filesystem layout (everything is resolved relative to PROJECT_ROOT) ---
DATA_DIR = PROJECT_ROOT.joinpath('DimABSA2026', 'task-dataset', 'track_a', 'subtask_1')
OUTPUT_DIR = PROJECT_ROOT.joinpath('outputs', 'baselines')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run: existing dirs are kept

# --- Baselines to train: short key -> Hugging Face checkpoint id ---
MODEL_NAMES = {
    'bert': 'bert-base-uncased',
    'roberta': 'roberta-base',
    'deberta': 'microsoft/deberta-v3-base',
}

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- Data: lang='eng', domain='laptop'; the loader is asked for a dev split ---
raw = load_dimabsa_dataset(DATA_DIR, lang='eng', domain='laptop', split_dev=True)
train_df, dev_df = raw['train'], raw['dev']

# Build one shared `lex_extractor(texts, aspects)` callable for every baseline.
# If the lexicon cannot be created for any reason, the callable instead returns
# an all-zero float32 tensor with one row per text and 8 columns.
try:
    lex = create_lexicon(PROJECT_ROOT / 'NRC-VAD-Lexicon-v2.1', use_dependency_parsing=False)

    def lex_extractor(texts, aspects):
        """Delegate feature extraction to the loaded lexicon."""
        return lex.extract_batch_features(texts, aspects)

    print('Lexicon loaded for baselines')
except Exception as e:
    print('Lexicon unavailable, using zero-features fallback:', e)

    def lex_extractor(texts, aspects):
        """Fallback: zero features, shape (len(texts), 8); `aspects` is ignored."""
        return torch.zeros((len(texts), 8), dtype=torch.float32)

# Train every baseline listed in MODEL_NAMES on the same training subset,
# collect each run's best metric, and write both the results and the plan to disk.

SEED = 42

# Draw the training subset once. With a fixed random_state every iteration
# would sample exactly the same rows, so re-sampling per model is wasted work.
train_subset = train_df.sample(n=min(len(train_df), 1000), random_state=SEED)

results = []
for name, model_name in MODEL_NAMES.items():
    print('Preparing baseline:', name)

    # Re-seed before each run so torch-side randomness (e.g. weight init,
    # shuffling) starts from the same state for every model and every re-run.
    torch.manual_seed(SEED)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if name == 'bert':
        model = BERTDimABSA(model_name=model_name)
    elif name == 'roberta':
        model = RoBERTaDimABSA(model_name=model_name)
    else:
        # Kept under its own name so it is not confused with the training config below.
        deberta_cfg = DeBERTaDimABSAConfig(
            model_name=model_name,
            use_lexicon=True,
            lexicon_feature_dim=8,
            output_scaling='tanh',
        )
        model = DeBERTaDimABSA(deberta_cfg)

    train_loader, dev_loader = create_dataloaders(
        train_subset,
        dev_df,
        tokenizer,
        batch_size=16,
        max_length=128,
        use_aspect_aware=True
    )

    cfg = TrainingConfig(learning_rate=2e-5, num_epochs=3, checkpoint_dir=str(OUTPUT_DIR / 'checkpoints'), device=str(DEVICE))
    # lex_extractor is handed to every run. NOTE(review): whether the BERT/RoBERTa
    # baselines actually consume lexicon features depends on train_model and the
    # model classes, which are not visible here -- confirm.
    res = train_model(model, train_loader, dev_loader, lexicon_extractor=lex_extractor, config=cfg, model_name=f'baseline_{name}')
    results.append({'model': name, 'best_metric': res['best_metric']})

# Persist the collected metrics. The plotting cells of this notebook read
# 'baselines_results.csv' with columns 'model' and 'best_metric'; previously
# `results` was built but never written, so those cells never found the file.
results_path = OUTPUT_DIR / 'baselines_results.csv'
pd.DataFrame(results, columns=['model', 'best_metric']).to_csv(results_path, index=False)
print('Baseline results saved to', results_path)

# Save plan
pd.DataFrame([{'model': k, 'model_name': v} for k, v in MODEL_NAMES.items()]).to_csv(OUTPUT_DIR / 'baselines_plan.csv', index=False)
print('Baselines plan saved to', OUTPUT_DIR / 'baselines_plan.csv')

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Bar chart of the best metric per baseline model, saved as a PNG.
# Reads 'baselines_results.csv' from the baselines output directory; if the file
# or the expected columns are missing, a message is printed and nothing is drawn.
OUT_DIR = Path.cwd().parent / 'outputs' / 'baselines'
OUT_DIR.mkdir(parents=True, exist_ok=True)
results_path = OUT_DIR / 'baselines_results.csv'
if not results_path.exists():
    print('No baselines results file found at', results_path)
    print('Run baseline experiments to produce', results_path)
else:
    df = pd.read_csv(results_path)
    if not {'model','best_metric'}.issubset(df.columns):
        print('Expected columns missing in', results_path)
    else:
        # Explicit figure/axes instead of the implicit pyplot "current figure".
        fig, ax = plt.subplots(figsize=(6, 4))
        # `errorbar='sd'` is the current spelling of the `ci='sd'` argument,
        # which seaborn deprecated in v0.12.
        sns.barplot(data=df, x='model', y='best_metric', errorbar='sd', palette='muted', ax=ax)
        # The em dash is written as an escape so the title cannot be corrupted
        # by a wrong file encoding (it previously rendered as mojibake).
        ax.set_title('Baseline Models \u2014 Best Metric (lower is better)')
        fig.tight_layout()
        out_file = OUT_DIR / 'baselines_barplot.png'
        fig.savefig(out_file, dpi=200)
        plt.show()
        print('Saved', out_file)


In [None]:
# Box plot (with the individual points overlaid) of the best metric per
# baseline model; the figure is written next to the results CSV as a PNG.
results_path = OUT_DIR / 'baselines_results.csv'
required_cols = {'model', 'best_metric'}

if results_path.exists():
    df = pd.read_csv(results_path)
    if required_cols.issubset(df.columns):
        fig, ax = plt.subplots(figsize=(8, 5))
        # Distribution summary first, then every observation drawn on top of it.
        sns.boxplot(data=df, x='model', y='best_metric', palette='pastel', ax=ax)
        sns.swarmplot(data=df, x='model', y='best_metric', color='k', alpha=0.6, ax=ax)
        ax.set_title('Baseline Metric Distributions')
        fig.tight_layout()

        out_file = OUT_DIR / 'baselines_boxplot.png'
        fig.savefig(out_file, dpi=200)
        plt.show()
        print('Saved', out_file)
    else:
        print('Expected columns missing in', results_path)
else:
    print('No baselines results file found at', results_path)
