# PCL Detection — Binary Classification Pipeline

A systematic approach to binary classification of patronising and condescending language (PCL):
1. True baseline (RoBERTa-base, unweighted CE, t=0.5)
2. Incremental improvements: weighted CE, threshold optimisation, multi-task learning
3. Ablation studies showing contribution of each component
4. Error analysis and custom metrics

## 1. Imports & Setup

In [None]:
import os
import ast
import re
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: fix the seed of every RNG this notebook draws from
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Device: prefer CUDA, then Apple MPS (when this torch build exposes it), else CPU
DEVICE = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    else 'cpu'
)

print(f'Device: {DEVICE}')

# Environment profile: a CUDA device or either Colab environment variable
# selects the remote profile; anything else is treated as a local run.
ON_GPUDOJO = (
    DEVICE.type == 'cuda'
    or any(var in os.environ for var in ('COLAB_GPU', 'COLAB_RELEASE_TAG'))
)

# Both profiles share the micro-batch size and the accumulation factor;
# only the project root and the evaluation batch size differ.
BATCH_SIZE = 2
GRAD_ACCUM = 16

if not ON_GPUDOJO:
    BASE_DIR = '/Users/alexanderchow/Documents/Y3/60035_NLP/PCL_Detection'
    EVAL_BATCH_SIZE = 4
    print('Running locally (MPS/CPU) — batch_size=2, grad_accum=16')
else:
    BASE_DIR = '/home/azureuser/PCL_Detection'
    EVAL_BATCH_SIZE = 16
    print('Running on GPUDOJO (CUDA) — batch_size=2, grad_accum=16')

print(f'Effective batch size: {BATCH_SIZE * GRAD_ACCUM}')

# Project sub-directories; the checkpoint directory is created on demand
DATA_DIR = f'{BASE_DIR}/data'
SPLITS_DIR = f'{BASE_DIR}/practice splits'
CHECKPOINT_DIR = f'{BASE_DIR}/checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

## 2. Data Loading & Preprocessing

In [3]:
# Load the main PCL dataset. The TSV has no column header row and its first
# 4 lines are skipped, so column names are supplied explicitly.
pcl_df = pd.read_csv(
    f'{DATA_DIR}/dontpatronizeme_pcl.tsv',
    sep='\t', skiprows=4, header=None,
    names=['par_id', 'art_id', 'keyword', 'country_code', 'text', 'label'],
    quoting=3  # 3 == csv.QUOTE_NONE: quote characters in the text are kept as literal characters
)
# Cast ids and labels to int so later joins and comparisons work on integers
pcl_df['par_id'] = pcl_df['par_id'].astype(int)
pcl_df['label'] = pcl_df['label'].astype(int)

# Binary label: {0,1}->0, {2,3,4}->1
pcl_df['binary_label'] = (pcl_df['label'] >= 2).astype(int)

# Clean text: strip <h> tags and HTML artifacts
def clean_text(text):
    """Return ``str(text)`` with markup removed and whitespace collapsed.

    Three substitutions run in order, each replacing its match with a single
    space: anything shaped like ``<...>``, lowercase named entities such as
    ``&amp;``, and finally every run of whitespace. Leading and trailing
    whitespace is stripped from the result.
    """
    cleaned = str(text)
    # Order matters: whitespace is collapsed last so the spaces introduced by
    # the first two passes are merged as well.
    for pattern in (r'<[^>]+>', r'&[a-z]+;', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

# Clean every paragraph once, up front, so all later datasets see the same text
pcl_df['text'] = pcl_df['text'].apply(clean_text)

# Load train/dev splits (CSV files keyed by par_id)
train_splits = pd.read_csv(f'{SPLITS_DIR}/train_semeval_parids-labels.csv')
dev_splits = pd.read_csv(f'{SPLITS_DIR}/dev_semeval_parids-labels.csv')
# Cast par_id to int so it matches the dtype of pcl_df['par_id']
train_splits['par_id'] = train_splits['par_id'].astype(int)
dev_splits['par_id'] = dev_splits['par_id'].astype(int)

# Parse category labels from split files (7-dim multi-label vectors)
def parse_category_label(label_str):
    """Parse a stringified category vector such as ``'[1, 0, 0, 0, 0, 0, 0]'``.

    Returns the Python literal that ``ast.literal_eval`` produces for
    ``label_str``. When the value cannot be parsed (malformed text, or a
    non-string such as NaN) a 7-element zero vector is returned instead.
    """
    try:
        return ast.literal_eval(label_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are literal_eval's documented failure modes. Anything else
        # (e.g. KeyboardInterrupt) is deliberately left to propagate.
        return [0, 0, 0, 0, 0, 0, 0]

# Turn the raw 'label' strings of each split file into Python lists
train_splits['category_labels'] = train_splits['label'].apply(parse_category_label)
dev_splits['category_labels'] = dev_splits['label'].apply(parse_category_label)

# Merge with main data: select the paragraphs that belong to each split
train_ids = set(train_splits['par_id'].values)
dev_ids = set(dev_splits['par_id'].values)

# Boolean-mask selection keeps the row order of pcl_df, not the order in which
# par_ids appear in the split files.
# NOTE(review): dev.txt is later written in this order — confirm that is the
# order the consumer of that file expects.
train_df = pcl_df[pcl_df['par_id'].isin(train_ids)].copy()
dev_df = pcl_df[pcl_df['par_id'].isin(dev_ids)].copy()

# Merge category labels (only the two columns needed for the join)
cat_train = train_splits[['par_id', 'category_labels']].copy()
cat_dev = dev_splits[['par_id', 'category_labels']].copy()

# Left join: every selected paragraph is kept even if it has no category row
train_df = train_df.merge(cat_train, on='par_id', how='left')
dev_df = dev_df.merge(cat_dev, on='par_id', how='left')

# Fill missing category labels with zeros (anything that is not a list,
# e.g. the NaN produced by an unmatched left join)
train_df['category_labels'] = train_df['category_labels'].apply(
    lambda x: x if isinstance(x, list) else [0]*7
)
dev_df['category_labels'] = dev_df['category_labels'].apply(
    lambda x: x if isinstance(x, list) else [0]*7
)

# Summary of split sizes and class balance
print(f'Train: {len(train_df)} samples ({train_df["binary_label"].sum()} PCL)')
print(f'Dev:   {len(dev_df)} samples ({dev_df["binary_label"].sum()} PCL)')
print(f'\nTrain class distribution:')
print(train_df['binary_label'].value_counts().sort_index())

Train: 8375 samples (794 PCL)
Dev:   2094 samples (199 PCL)

Train class distribution:
binary_label
0    7581
1     794
Name: count, dtype: int64


## 3. Dataset & DataLoader

In [None]:
# Pretrained checkpoint name, and the token length passed to the tokenizer as
# max_length when PCLDataset pads/truncates each example
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 256

# Tokenizer for MODEL_NAME, loaded once at notebook level
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class PCLDataset(Dataset):
    """Map-style dataset that tokenises one paragraph per item on access.

    Every item is a dict holding ``input_ids``, ``attention_mask``, the
    ``binary_label`` (long tensor) and the ``category_labels`` vector
    (float tensor).
    """

    def __init__(self, texts, binary_labels, category_labels, tokenizer, max_length):
        self.texts, self.binary_labels, self.category_labels = texts, binary_labels, category_labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Pad/truncate every example to exactly max_length tokens
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) drops the leading axis of the returned tensors
        # (presumably a batch axis of size one for a single input text)
        item = {key: enc[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        item['binary_label'] = torch.tensor(self.binary_labels[idx], dtype=torch.long)
        item['category_labels'] = torch.tensor(self.category_labels[idx], dtype=torch.float)
        return item

def create_datasets(train_df, dev_df, tokenizer, max_length):
    """Wrap the train and dev dataframes in PCLDataset objects.

    Both frames must provide ``text``, ``binary_label`` and
    ``category_labels`` columns. Returns ``(train_dataset, dev_dataset)``.
    """
    def _wrap(frame):
        # Columns are converted to plain lists so items are indexed by position
        return PCLDataset(
            texts=frame['text'].tolist(),
            binary_labels=frame['binary_label'].tolist(),
            category_labels=frame['category_labels'].tolist(),
            tokenizer=tokenizer,
            max_length=max_length,
        )

    return _wrap(train_df), _wrap(dev_df)

print(f'Tokenizer loaded: {MODEL_NAME}')
print(f'Max length: {MAX_LENGTH}')

## 4. Model Architecture

In [None]:
class PCLMultiTaskModel(nn.Module):
    """Shared pretrained encoder with two linear heads.

    ``forward`` returns ``(binary_logits, category_logits)``: 2-way logits for
    the PCL decision and ``num_categories`` logits for the category labels,
    both computed from the encoder output at sequence position 0.
    """

    def __init__(self, model_name, num_categories=7, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        width = self.encoder.config.hidden_size

        # Heads are created binary-first so parameter initialisation order
        # (and therefore RNG consumption) is unchanged.
        self.binary_head = nn.Sequential(nn.Dropout(dropout), nn.Linear(width, 2))
        self.category_head = nn.Sequential(nn.Dropout(dropout), nn.Linear(width, num_categories))

    def forward(self, input_ids, attention_mask):
        hidden = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        first_token = hidden[:, 0, :]  # representation at sequence position 0
        return self.binary_head(first_token), self.category_head(first_token)


class BaselineModel(nn.Module):
    """Single-head binary classifier on top of a pretrained encoder (baseline).

    ``forward`` returns ``(logits, None)`` so it can be used wherever a
    ``(binary_logits, category_logits)`` pair is expected.
    """

    def __init__(self, model_name='roberta-base', dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        width = self.encoder.config.hidden_size
        self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(width, 2))

    def forward(self, input_ids, attention_mask):
        hidden = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        first_token = hidden[:, 0, :]  # representation at sequence position 0
        # The second element is None because this model has no category head
        return self.classifier(first_token), None

print('Model classes defined: PCLMultiTaskModel, BaselineModel')

## 5. Training Loop

In [None]:
import gc
print_every_updates = 20

def free_gpu():
    """Run the garbage collector, then release PyTorch's cached CUDA memory when CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def evaluate(model, dataloader, device, threshold=0.5):
    """Score ``model`` on ``dataloader`` and compute positive-class metrics.

    The model is switched to eval mode and run without gradients. A sample is
    predicted positive when its softmax probability for class 1 is at least
    ``threshold``.

    Returns a dict with ``f1``, ``precision``, ``recall`` (all for the
    positive class), the hard ``preds``, the gold ``labels``, the positive
    class ``probs`` and the ``threshold`` that was applied.
    """
    model.eval()
    positive_probs, gold = [], []

    with torch.no_grad():
        for batch in dataloader:
            logits, _ = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))
            # Softmax over the two classes; column 1 is the positive class
            positive_probs += F.softmax(logits, dim=1)[:, 1].cpu().tolist()
            gold += batch['binary_label'].tolist()

    hard_preds = [int(p >= threshold) for p in positive_probs]

    return {
        'f1': f1_score(gold, hard_preds, pos_label=1),
        'precision': precision_score(gold, hard_preds, pos_label=1, zero_division=0),
        'recall': recall_score(gold, hard_preds, pos_label=1, zero_division=0),
        'preds': hard_preds, 'labels': gold, 'probs': positive_probs,
        'threshold': threshold,
    }


def find_best_threshold(probs, labels):
    """Grid-search the decision threshold that maximises positive-class F1.

    Candidates come from ``np.arange(0.05, 0.95, 0.01)``. Only a strictly
    better F1 replaces the incumbent, so ties keep the lowest threshold, and
    ``(0.5, 0.0)`` is returned when no candidate scores above zero.

    Returns ``(best_threshold, best_f1)``.
    """
    top_f1, top_threshold = 0.0, 0.5
    for cutoff in np.arange(0.05, 0.95, 0.01):
        score = f1_score(labels, [int(p >= cutoff) for p in probs], pos_label=1)
        if score > top_f1:
            top_f1, top_threshold = score, cutoff
    return top_threshold, top_f1


def train_model(config_name, train_df, dev_df, tokenizer,
                use_weighted_ce=True, use_threshold_opt=True, use_multitask=False,
                num_epochs=10, batch_size=BATCH_SIZE, grad_accum_steps=GRAD_ACCUM,
                lr=2e-5, weight_decay=0.01, patience=3, category_weight=0.3,
                model_class=PCLMultiTaskModel, model_name='roberta-base'):
    """Fine-tune one configuration and evaluate its best checkpoint on the dev set.

    Args:
        config_name: Label used in the log output and as the checkpoint file
            stem (``{CHECKPOINT_DIR}/{config_name}_best.pt``).
        train_df: Training dataframe with ``text``, ``binary_label`` and
            ``category_labels`` columns.
        dev_df: Dev dataframe with the same columns.
        tokenizer: Unused; accepted for call compatibility. The tokenizer that
            matches ``model_name`` is loaded here and returned.
        use_weighted_ce: Weight the positive class by ``n_neg / n_pos`` in the
            cross-entropy loss.
        use_threshold_opt: After training, pick the dev threshold that
            maximises F1 and report metrics at that threshold.
        use_multitask: Add ``category_weight`` times the BCE-with-logits
            category loss (only when the model returns category logits).
        num_epochs: Maximum number of epochs.
        batch_size: Micro-batch size of the training loader.
        grad_accum_steps: Micro-batches accumulated per optimizer update.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        patience: Consecutive epochs without a dev-F1 improvement (at t=0.5)
            before training stops early.
        category_weight: Weight of the auxiliary category loss.
        model_class: Model constructor, called as
            ``model_class(model_name=model_name)``. Its forward pass must
            return ``(binary_logits, category_logits_or_None)``.
        model_name: Pretrained checkpoint name for the encoder and tokenizer.

    Returns:
        ``(final_metrics, thresh_metrics, history, best_thresh, tok)``:
        metrics at t=0.5, metrics at the reported threshold, the per-epoch
        history, the reported threshold, and the tokenizer that was used.
    """
    free_gpu()

    print(f'\n{"="*60}')
    print(f'Training Config: {config_name}')
    print(f'  Model: {model_name}')
    print(f'  Weighted CE: {use_weighted_ce} | Multi-task: {use_multitask}')
    print(f'  Threshold Opt: {use_threshold_opt}')
    print(f'  Epochs: {num_epochs} | LR: {lr} | Patience: {patience}')
    print(f'  Batch: {batch_size} x {grad_accum_steps} = {batch_size * grad_accum_steps} effective')
    print(f'{"="*60}')

    # Create tokenizer for this model
    tok = AutoTokenizer.from_pretrained(model_name)

    train_dataset, dev_dataset = create_datasets(train_df, dev_df, tok, MAX_LENGTH)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    dev_loader = DataLoader(dev_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False, num_workers=0)
    num_batches = len(train_loader)

    # Model: instantiate whichever class the caller asked for
    model = model_class(model_name=model_name).to(DEVICE).float()

    # Loss function
    if use_weighted_ce:
        n_neg = (train_df['binary_label'] == 0).sum()
        n_pos = (train_df['binary_label'] == 1).sum()
        weight = torch.tensor([1.0, n_neg / n_pos], dtype=torch.float).to(DEVICE)
        binary_criterion = nn.CrossEntropyLoss(weight=weight)
        print(f'  CE class weights: [{weight[0]:.3f}, {weight[1]:.3f}]')
    else:
        binary_criterion = nn.CrossEntropyLoss()
        print(f'  Unweighted CE')

    category_criterion = nn.BCEWithLogitsLoss()

    # Optimizer & scheduler. Each epoch performs one update per full
    # accumulation window plus one for a partial tail window, i.e.
    # ceil(num_batches / grad_accum_steps) updates.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    updates_per_epoch = (num_batches + grad_accum_steps - 1) // grad_accum_steps
    total_steps = updates_per_epoch * num_epochs
    warmup_steps = int(0.1 * total_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    checkpoint_path = f'{CHECKPOINT_DIR}/{config_name}_best.pt'
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint; otherwise a run whose F1 stays at 0 would later load a
    # missing or stale file.
    best_f1 = float('-inf')
    patience_counter = 0
    history = []

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()

        for seen, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            binary_labels = batch['binary_label'].to(DEVICE)
            category_labels = batch['category_labels'].to(DEVICE)

            binary_logits, category_logits = model(input_ids, attention_mask)

            loss = binary_criterion(binary_logits, binary_labels)
            if use_multitask and category_logits is not None:
                loss = loss + category_weight * category_criterion(category_logits, category_labels)
            # Scale so the accumulated gradient is the mean over the window
            loss = loss / grad_accum_steps

            loss.backward()
            total_loss += loss.item() * grad_accum_steps  # undo the scaling for logging

            if seen % grad_accum_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                update = seen // grad_accum_steps
                if update % print_every_updates == 0:
                    avg_recent = total_loss / seen
                    print(f"    step {seen}/{num_batches} "
                          f"(update {update}) | avg loss so far: {avg_recent:.4f}")

        # Handle remaining gradients from a partial accumulation window
        if num_batches % grad_accum_steps != 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        avg_loss = total_loss / num_batches

        # Evaluate on dev at t=0.5
        metrics = evaluate(model, dev_loader, DEVICE, threshold=0.5)
        history.append({
            'epoch': epoch + 1, 'loss': avg_loss,
            'f1': metrics['f1'], 'precision': metrics['precision'], 'recall': metrics['recall']
        })

        print(f'  Epoch {epoch+1}/{num_epochs} — Loss: {avg_loss:.4f} | '
              f'F1: {metrics["f1"]:.4f} | P: {metrics["precision"]:.4f} | R: {metrics["recall"]:.4f}')

        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            print(f'  -> New best F1! Saved.')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'  Early stopping at epoch {epoch+1}')
                break

    # Load best model
    model.load_state_dict(torch.load(checkpoint_path, weights_only=True))

    # Evaluate at t=0.5
    final_metrics = evaluate(model, dev_loader, DEVICE, threshold=0.5)

    # Threshold optimisation (if enabled): the threshold is chosen on the dev
    # probabilities and the dev set is then re-scored at that threshold.
    if use_threshold_opt:
        best_thresh, _ = find_best_threshold(final_metrics['probs'], final_metrics['labels'])
        thresh_metrics = evaluate(model, dev_loader, DEVICE, threshold=best_thresh)
    else:
        best_thresh = 0.5
        thresh_metrics = final_metrics

    print(f'\n  Dev F1 @ t=0.50: {final_metrics["f1"]:.4f}')
    if use_threshold_opt:
        print(f'  Dev F1 @ t={best_thresh:.2f} (optimised): {thresh_metrics["f1"]:.4f}')
    print(classification_report(
        thresh_metrics['labels'], thresh_metrics['preds'],
        target_names=['No PCL', 'PCL'], digits=4
    ))

    # Move to CPU and free GPU
    model = model.cpu()
    del model
    free_gpu()

    return final_metrics, thresh_metrics, history, best_thresh, tok

print('Training function defined.')

## 6. True Baseline: RoBERTa-base + Unweighted CE + t=0.5

In [None]:
# True baseline: RoBERTa-base, unweighted CE, no tricks, fixed threshold=0.5
BASELINE_MODEL = 'roberta-base'

# Every optional component is switched off; the checkpoint is written under
# the 'baseline' stem and is reloaded later for the error analysis.
metrics_bl, thresh_metrics_bl, history_bl, thresh_bl, tok_bl = train_model(
    config_name='baseline',
    train_df=train_df, dev_df=dev_df, tokenizer=tokenizer,
    model_class=BaselineModel, model_name=BASELINE_MODEL,
    use_weighted_ce=False, use_multitask=False, use_threshold_opt=False,
    lr=2e-5, patience=3
)

## 7. Incremental Improvements

In [None]:
# Config A: Weighted CE + Threshold Optimisation + Multi-task Learning
# All three components are enabled at once. Note the larger early-stopping
# patience (5) compared with the baseline run (3).
metrics_a, thresh_metrics_a, history_a, thresh_a, tok_a = train_model(
    config_name='config_A_weighted_ce_thresh_mt',
    train_df=train_df, dev_df=dev_df, tokenizer=tokenizer,
    model_class=PCLMultiTaskModel, model_name='roberta-base',
    use_weighted_ce=True, use_multitask=True, use_threshold_opt=True,
    lr=2e-5, patience=5
)

## 8. Ablation Studies

In [None]:
# Each ablation retrains Config A with exactly one component switched off.
# NOTE(review): the RNGs are seeded once at the top of the notebook and
# train_model does not reseed, so each run appears to start from a different
# RNG state — differences between runs may include seed noise.

# Ablation 1: Config A without multi-task (isolate multi-task contribution)
metrics_abl_nomt, thresh_metrics_abl_nomt, history_abl_nomt, thresh_abl_nomt, tok_abl_nomt = train_model(
    config_name='ablation_no_multitask',
    train_df=train_df, dev_df=dev_df, tokenizer=tokenizer,
    model_class=BaselineModel, model_name='roberta-base',
    use_weighted_ce=True, use_multitask=False, use_threshold_opt=True,
    lr=2e-5, patience=5
)

# Ablation 2: Config A without threshold opt (isolate threshold contribution)
# use_threshold_opt only affects the evaluation after training, so this run
# trains with the same settings as Config A and reports metrics at t=0.5.
metrics_abl_nothresh, thresh_metrics_abl_nothresh, history_abl_nothresh, thresh_abl_nothresh, tok_abl_nothresh = train_model(
    config_name='ablation_no_thresh',
    train_df=train_df, dev_df=dev_df, tokenizer=tokenizer,
    model_class=PCLMultiTaskModel, model_name='roberta-base',
    use_weighted_ce=True, use_multitask=True, use_threshold_opt=False,
    lr=2e-5, patience=5
)

# Ablation 3: Config A without weighted CE (isolate weighted CE contribution)
metrics_abl_nowe, thresh_metrics_abl_nowe, history_abl_nowe, thresh_abl_nowe, tok_abl_nowe = train_model(
    config_name='ablation_no_weighted_ce',
    train_df=train_df, dev_df=dev_df, tokenizer=tokenizer,
    model_class=PCLMultiTaskModel, model_name='roberta-base',
    use_weighted_ce=False, use_multitask=True, use_threshold_opt=True,
    lr=2e-5, patience=5
)

## 9. Results Comparison & Best Model Selection

In [None]:
# ---- Results comparison table ----
# One (row label, displayed threshold, metrics dict) entry per run, in display
# order. Runs without threshold optimisation are reported at the fixed 0.50.
comparison_rows = [
    ('Baseline (unweighted CE, t=0.5)', '0.50', thresh_metrics_bl),
    ('A: + Weighted CE + Thresh Opt + Multi-task', f'{thresh_a:.2f}', thresh_metrics_a),
    ('Ablation: A w/o Multi-task', f'{thresh_abl_nomt:.2f}', thresh_metrics_abl_nomt),
    ('Ablation: A w/o Threshold Opt', '0.50', thresh_metrics_abl_nothresh),
    ('Ablation: A w/o Weighted CE', f'{thresh_abl_nowe:.2f}', thresh_metrics_abl_nowe),
]

results = pd.DataFrame([
    {
        'Config': row_label,
        'Threshold': shown_threshold,
        'F1': run['f1'],
        'Precision': run['precision'],
        'Recall': run['recall'],
    }
    for row_label, shown_threshold, run in comparison_rows
])

print('\n' + '='*80)
print('RESULTS COMPARISON (all models evaluated on dev set)')
print('='*80)
print(results.to_string(index=False, float_format='{:.4f}'.format))

# Best model is Config A
# NOTE(review): this is fixed by hand rather than derived from `results`;
# update it if another row ever scores higher.
best_metrics = thresh_metrics_a
best_threshold = thresh_a
best_tok = tok_a
best_ckpt_name = 'config_A_weighted_ce_thresh_mt'
best_model_class = PCLMultiTaskModel
best_model_name = 'roberta-base'
best_key = 'A'

improvement = best_metrics['f1'] - thresh_metrics_bl['f1']
print(f'\n** Best model: Config A (F1={best_metrics["f1"]:.4f} @ t={best_threshold:.2f}) **')
# Signed format so a regression prints as "-0.0123" instead of "+-0.0123"
print(f'   Improvement over baseline: {improvement:+.4f} F1')

## 10. Generate dev.txt and test.txt Predictions

In [None]:
# ---- Dev predictions ----
# One 0/1 prediction per line, in the row order of dev_df
dev_preds = best_metrics['preds']
dev_pred_path = f'{BASE_DIR}/dev.txt'
with open(dev_pred_path, 'w') as f:
    f.writelines(f'{p}\n' for p in dev_preds)
print(f'Dev predictions saved to {dev_pred_path}')
print(f'  {len(dev_preds)} predictions, {sum(dev_preds)} predicted PCL')

# ---- Test predictions ----
# quoting=3 (csv.QUOTE_NONE) matches how the training TSV is read: quote
# characters stay literal, so a paragraph containing a quote cannot swallow
# the following rows.
test_df = pd.read_csv(f'{DATA_DIR}/task4_test.tsv', sep='\t', header=None,
                       names=['par_id', 'art_id', 'keyword', 'country_code', 'text'],
                       quoting=3)
test_df['text'] = test_df['text'].apply(clean_text)
print(f'\nTest set: {len(test_df)} samples')

# The test set has no labels; placeholder zeros satisfy the dataset interface
# and are never read by the inference loop below.
test_dataset = PCLDataset(
    texts=test_df['text'].tolist(),
    binary_labels=[0] * len(test_df),
    category_labels=[[0]*7] * len(test_df),
    tokenizer=best_tok, max_length=MAX_LENGTH
)
test_loader = DataLoader(test_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False, num_workers=0)

# Reload best model from checkpoint, using the class recorded with the best
# configuration so the architecture always matches the saved weights
best_model = best_model_class(model_name=best_model_name).to(DEVICE)
best_model.load_state_dict(torch.load(f'{CHECKPOINT_DIR}/{best_ckpt_name}_best.pt', weights_only=True, map_location=DEVICE))
best_model.eval()

test_probs = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        binary_logits, _ = best_model(input_ids, attention_mask)
        # Positive-class probability (column 1 of the 2-way softmax)
        test_probs.extend(F.softmax(binary_logits, dim=1)[:, 1].cpu().tolist())

del best_model
free_gpu()

# Apply the threshold selected for the best configuration
test_preds = [1 if p >= best_threshold else 0 for p in test_probs]

test_pred_path = f'{BASE_DIR}/test.txt'
with open(test_pred_path, 'w') as f:
    f.writelines(f'{p}\n' for p in test_preds)
print(f'Test predictions saved to {test_pred_path}')
print(f'  {len(test_preds)} predictions, {sum(test_preds)} predicted PCL')
print(f'  Using threshold: {best_threshold:.2f}')

## 11. Error Analysis

In [None]:
# ---- Error analysis: compare baseline vs best model ----
# Rebuild the baseline network, restore its best checkpoint and re-score the dev set
model_baseline = BaselineModel(model_name=BASELINE_MODEL).to(DEVICE)
model_baseline.load_state_dict(
    torch.load(f'{CHECKPOINT_DIR}/baseline_best.pt', weights_only=True, map_location=DEVICE)
)

baseline_dev_metrics = evaluate(
    model_baseline,
    DataLoader(
        PCLDataset(
            dev_df['text'].tolist(),
            dev_df['binary_label'].tolist(),
            dev_df['category_labels'].tolist(),
            tok_bl,
            MAX_LENGTH,
        ),
        batch_size=EVAL_BATCH_SIZE,
        shuffle=False,
    ),
    DEVICE,
    threshold=0.5,
)

del model_baseline
gc.collect()
torch.cuda.empty_cache()

# Both prediction lists follow dev_df row order, so they align index-by-index
dev_texts = dev_df['text'].tolist()
true_labels = best_metrics['labels']
best_preds = best_metrics['preds']
baseline_preds = baseline_dev_metrics['preds']

print('='*60)
print('ERROR ANALYSIS: Baseline vs Best Model on Dev Set')
print('='*60)

# Confusion counts; summing booleans counts the matching (truth, prediction) pairs
baseline_tp = sum(t == 1 and p == 1 for t, p in zip(true_labels, baseline_preds))
baseline_fp = sum(t == 0 and p == 1 for t, p in zip(true_labels, baseline_preds))
baseline_fn = sum(t == 1 and p == 0 for t, p in zip(true_labels, baseline_preds))
best_tp = sum(t == 1 and p == 1 for t, p in zip(true_labels, best_preds))
best_fp = sum(t == 0 and p == 1 for t, p in zip(true_labels, best_preds))
best_fn = sum(t == 1 and p == 0 for t, p in zip(true_labels, best_preds))

print(f'\n{"Metric":<25} {"Baseline":>10} {"Best Model":>12}')
print('-' * 50)
print(f'{"True Positives":<25} {baseline_tp:>10} {best_tp:>12}')
print(f'{"False Positives":<25} {baseline_fp:>10} {best_fp:>12}')
print(f'{"False Negatives":<25} {baseline_fn:>10} {best_fn:>12}')

# Dev indices where the baseline was wrong and the best model is right
triples = list(zip(true_labels, baseline_preds, best_preds))
fixed_fn = [pos for pos, (gold, base, best) in enumerate(triples)
            if gold == 1 and base == 0 and best == 1]
fixed_fp = [pos for pos, (gold, base, best) in enumerate(triples)
            if gold == 0 and base == 1 and best == 0]

print(f'\nBest model fixes {len(fixed_fn)} FN and {len(fixed_fp)} FP from baseline')

print(f'\n--- PCL missed by baseline but caught by best model ({min(5, len(fixed_fn))} shown) ---')
for idx in fixed_fn[:5]:
    print(f'  [{idx}] {dev_texts[idx][:150]}...')

# Errors the best model still makes
remaining_fn = [pos for pos, (gold, pred) in enumerate(zip(true_labels, best_preds))
                if gold == 1 and pred == 0]
print(f'\n--- Remaining false negatives ({min(5, len(remaining_fn))}/{len(remaining_fn)} shown) ---')
for idx in remaining_fn[:5]:
    print(f'  [{idx}] {dev_texts[idx][:150]}...')

fp_indices = [pos for pos, (gold, pred) in enumerate(zip(true_labels, best_preds))
              if gold == 0 and pred == 1]

# Whitespace-token lengths: overall versus the two remaining error groups
all_lengths = [len(text.split()) for text in dev_texts]
fn_lengths = [len(dev_texts[pos].split()) for pos in remaining_fn]
fp_lengths = [len(dev_texts[pos].split()) for pos in fp_indices]

print(f'\n--- Text length analysis ---')
print(f'  Overall mean length:     {np.mean(all_lengths):.1f} words')
if fn_lengths:
    print(f'  False negative mean:     {np.mean(fn_lengths):.1f} words')
else:
    print('  No false negatives')
if fp_lengths:
    print(f'  False positive mean:     {np.mean(fp_lengths):.1f} words')
else:
    print('  No false positives')

## 12. Custom Metrics

In [None]:
# ---- Precision-Recall Curve: Best Model vs Baseline ----
# One figure with two panels: PR curves on the left, confusion matrix on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# --- Left: Precision-Recall Curves ---
ax = axes[0]

# Best model PR curve (computed from the dev probabilities of the positive class)
prec_best, rec_best, thresholds_best = precision_recall_curve(
    best_metrics['labels'], best_metrics['probs'], pos_label=1)
ax.plot(rec_best, prec_best, label=f'Best model ({best_key})', color='tab:blue', linewidth=2)

# Baseline PR curve (dashed, so the two curves stay distinguishable)
prec_base, rec_base, thresholds_base = precision_recall_curve(
    baseline_dev_metrics['labels'], baseline_dev_metrics['probs'], pos_label=1)
ax.plot(rec_base, prec_base, label='RoBERTa-base baseline', color='tab:orange', linewidth=2, linestyle='--')

# Mark the operating points: the (recall, precision) each model achieves at
# the threshold it is reported at
ax.scatter([best_metrics['recall']], [best_metrics['precision']],
           marker='*', s=200, color='tab:blue', zorder=5, label=f'Best @ t={best_threshold:.2f}')
ax.scatter([baseline_dev_metrics['recall']], [baseline_dev_metrics['precision']],
           marker='*', s=200, color='tab:orange', zorder=5, label=f'Baseline @ t={thresh_bl:.2f}')

ax.set_xlabel('Recall', fontsize=12)
ax.set_ylabel('Precision', fontsize=12)
ax.set_title('Precision-Recall Curve', fontsize=13)
ax.legend(fontsize=9)
# Upper limits slightly above 1 so points on the border are not clipped
ax.set_xlim([0, 1.02])
ax.set_ylim([0, 1.02])
ax.grid(True, alpha=0.3)

# --- Right: Confusion Matrix Heatmap (Best Model) ---
ax = axes[1]

# labels=[0, 1] fixes the row/column order: index 0 = No PCL, index 1 = PCL
cm = confusion_matrix(best_metrics['labels'], best_metrics['preds'], labels=[0, 1])
im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

classes = ['No PCL (0)', 'PCL (1)']
tick_marks = [0, 1]
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes, fontsize=11)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes, fontsize=11)

# Annotate each cell with count; white text on dark cells, black on light ones
for i in range(2):
    for j in range(2):
        color = 'white' if cm[i, j] > cm.max() / 2 else 'black'
        ax.text(j, i, f'{cm[i, j]}', ha='center', va='center', fontsize=16, fontweight='bold', color=color)

ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title(f'Confusion Matrix — Best Model ({best_key}, t={best_threshold:.2f})', fontsize=13)

# Save before show so the file is written from the fully laid-out figure
plt.tight_layout()
plt.savefig(f'{BASE_DIR}/custom_metrics.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Figure saved to {BASE_DIR}/custom_metrics.png')

## 13. Ablation Study Summary

In [None]:
# ---- Ablation Study Summary ----
# Which components each run used, next to its dev F1 (formatted to 4 decimals)
ablation_table = pd.DataFrame([
    {
        'Config': run_label,
        'Weighted CE': weighted_ce,
        'Multi-task': multitask,
        'Thresh Opt': thresh_opt,
        'Dev F1': f'{run["f1"]:.4f}',
    }
    for run_label, weighted_ce, multitask, thresh_opt, run in [
        ('Baseline (unweighted CE, t=0.5)', 'No', 'No', 'No', thresh_metrics_bl),
        ('Config A (full)', 'Yes', 'Yes', 'Yes', thresh_metrics_a),
        ('A w/o Multi-task', 'Yes', 'No', 'Yes', thresh_metrics_abl_nomt),
        ('A w/o Threshold Opt', 'Yes', 'Yes', 'No', thresh_metrics_abl_nothresh),
        ('A w/o Weighted CE', 'No', 'Yes', 'Yes', thresh_metrics_abl_nowe),
    ]
])

print('='*80)
print('ABLATION STUDY')
print('='*80)
print(ablation_table.to_string(index=False))

# Component contributions: full Config A F1 minus the F1 of the run that drops the component
full_f1 = thresh_metrics_a['f1']
print(f'\nComponent contributions (F1 drop when removed from Config A):')
print(f'  Multi-task learning:    {full_f1 - thresh_metrics_abl_nomt["f1"]:+.4f}')
print(f'  Threshold optimisation: {full_f1 - thresh_metrics_abl_nothresh["f1"]:+.4f}')
print(f'  Weighted CE:            {full_f1 - thresh_metrics_abl_nowe["f1"]:+.4f}')
print(f'\nTotal improvement over baseline: {full_f1 - thresh_metrics_bl["f1"]:+.4f} F1')