In [1]:
from pathlib import Path
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import sys
import torch
from transformers import (
    AutoModelForSequenceClassification,
    RobertaTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import json
import joblib
from datetime import datetime
import zipfile
from collections import Counter
from sklearn.model_selection import train_test_split


# Default (local) output directory for model artifacts.
MODELS_DIR = Path("models/roberta")
# Project root on Google Drive; only used when running inside Colab.
BASE_DRIVE_DIR = Path("/content/drive/MyDrive/NLP-Clarity")

# When the Colab runtime is loaded, mount Drive and redirect model output there.
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    MODELS_DIR = BASE_DRIVE_DIR / "models" / "roberta"

# Create the output directory (and any missing parents); no error if it exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)


Mounted at /content/drive


In [2]:
# Default (local) data directory.
DATA_DIR = Path("data")

# Inside Colab, keep the data under the Drive project folder instead.
if 'google.colab' in sys.modules:
    DATA_DIR = BASE_DRIVE_DIR / "data"

DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV files that load_qevasion_dataset() reads when present and writes otherwise.
train_path = DATA_DIR / "train_cleaned.csv"
val_path = DATA_DIR / "val.csv"

def load_qevasion_dataset():
    """
    Return the QEvasion train and test frames, using the CSV cache on disk.

    If either ``train_path`` or ``val_path`` is missing, both splits are
    downloaded from the ``ailsntua/QEvasion`` dataset and written to those
    paths first. The frames are then ALWAYS read back from the CSV files, so
    a first (downloading) run and every later (cached) run return identically
    parsed data. Previously the first run returned the raw Hugging Face frames
    while later runs returned CSV-parsed frames, in which empty cells are read
    back as NaN by pandas' default NA handling.

    NOTE(review): the fallback writes the unmodified Hugging Face train split
    to ``train_cleaned.csv``; if a separately cleaned file is expected at that
    path, it must be placed there before running this function.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (train frame, test frame)
    """
    if not (train_path.exists() and val_path.exists()):
        dataset = load_dataset("ailsntua/QEvasion")
        dataset["train"].to_pandas().to_csv(train_path, index=False)
        dataset["test"].to_pandas().to_csv(val_path, index=False)

    # Single read path: both cache-hit and cache-miss go through read_csv.
    df_train = pd.read_csv(train_path)
    df_val = pd.read_csv(val_path)
    return df_train, df_val

# The second frame returned is the dataset's "test" split (cached as val.csv);
# it is referred to as df_test from here on.
df_train, df_test = load_qevasion_dataset()

print(f"Train: {len(df_train)}, Test: {len(df_test)}")

Train: 3403, Test: 308


In [3]:
# f1_for_class is the exact function used by the authors (they posted it in the Discord group); keep its logic unchanged.
def f1_for_class(gold_annotations, predictions, target_class):
    """
    One-vs-rest precision, recall and F1 for a single class.

    Each sample may carry several acceptable gold labels (one per annotator).

    gold_annotations: list of lists (or sets) with the gold labels per sample
    predictions: list with exactly one predicted label per sample
    target_class: the class being scored

    Returns a dict with keys "precision", "recall", "f1", "tp", "fp", "fn".
    """
    counts = {"tp": 0, "fp": 0, "fn": 0}

    for gold_labels, predicted in zip(gold_annotations, predictions):
        accepted = set(gold_labels)
        target_is_gold = target_class in accepted

        if predicted == target_class:
            # Predicting the target is a hit only when an annotator chose it.
            counts["tp" if target_is_gold else "fp"] += 1
        elif target_is_gold and predicted not in accepted:
            # A miss counts only when the prediction matches no gold label at
            # all; predicting a different accepted label is not penalised.
            counts["fn"] += 1

    tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
    predicted_total = tp + fp
    relevant_total = tp + fn

    precision = tp / predicted_total if predicted_total > 0 else 0.0
    recall = tp / relevant_total if relevant_total > 0 else 0.0

    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "fp": fp,
        "fn": fn,
    }


def compute_macro_f1(gold_annotations, predictions):
    """
    Compute Macro-F1 score (same as CodaBench leaderboard).

    The class set is the union of all labels that appear in the gold
    annotations; the score is the unweighted mean of the per-class F1 values
    returned by ``f1_for_class``.

    Args:
        gold_annotations: list of lists - each inner list contains valid labels from annotators
        predictions: list of strings - one prediction per sample

    Returns:
        float: Macro F1 score. Returns 0.0 when the gold annotations contain
        no labels at all (empty input or only empty label lists), instead of
        the NaN plus RuntimeWarning that ``np.mean([])`` would produce.
    """
    all_classes = set()
    for gold in gold_annotations:
        all_classes.update(gold)

    # No classes to average over: avoid np.mean of an empty list (NaN).
    if not all_classes:
        return 0.0

    f1_scores = [
        f1_for_class(gold_annotations, predictions, cls)["f1"]
        for cls in sorted(all_classes)
    ]

    return float(np.mean(f1_scores))


def compute_metrics_custom(eval_pred):
    """
    Metric callback for the Trainer: multi-annotator Macro-F1.

    The label ids supplied in ``eval_pred`` are placeholders and are ignored.
    Gold labels are read from the module-level ``df_val_eval`` frame
    (columns annotator1..annotator3) and predictions are decoded with the
    module-level ``label_encoder``.

    Returns:
        dict: {"macro_f1": float}
    """
    logits, _placeholder_labels = eval_pred

    # Highest-scoring class id per sample, decoded back to its string label.
    predicted_labels = label_encoder.inverse_transform(np.argmax(logits, axis=-1))

    annotator_rows = df_val_eval[['annotator1', 'annotator2', 'annotator3']].values.tolist()

    # Keep only real labels: drop non-string entries (e.g. NaN) and empty strings.
    valid_gold = [
        [label for label in row if isinstance(label, str) and label != '']
        for row in annotator_rows
    ]

    return {"macro_f1": compute_macro_f1(valid_gold, predicted_labels)}

In [4]:
# Pretrained checkpoint name; used here for the tokenizer and later for the classifier.
MODEL_NAME = "roberta-base"

# Learn the mapping between string evasion labels and integer class ids
# from the training frame.
label_encoder = LabelEncoder()
label_encoder.fit(df_train['evasion_label'])
num_labels = len(label_encoder.classes_)

print(f"Classes ({num_labels}): {label_encoder.classes_}")

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)


Classes (9): ['Claims ignorance' 'Clarification' 'Declining to answer' 'Deflection'
 'Dodging' 'Explicit' 'General' 'Implicit' 'Partial/half-answer']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [5]:
def format_input(row):
    """Build the model input text: a ``Q:`` line with the question, then an ``A:`` line with the answer."""
    question = row['question']
    answer = row['interview_answer']
    return "Q: {}\nA: {}".format(question, answer)


def prepare_dataset(df, label_encoder):
    """
    Build a labelled ``Dataset`` with "text" and "label" columns.

    Rows whose 'evasion_label' is NaN or an empty string are dropped; the
    remaining labels are converted to integer ids with ``label_encoder``.
    """
    label_column = df['evasion_label']
    has_label = label_column.notna() & (label_column != '')
    labelled = df[has_label]

    texts = []
    for _, row in labelled.iterrows():
        texts.append(format_input(row))

    encoded_labels = label_encoder.transform(labelled['evasion_label']).tolist()

    return Dataset.from_dict({"text": texts, "label": encoded_labels})

def prepare_test_dataset(df):
    """
    Build a ``Dataset`` for evaluation with placeholder labels.

    Every row of ``df`` is kept. The "label" column is filled with zeros and
    is meant to be IGNORED during metric calculation.
    """
    texts = []
    for _, row in df.iterrows():
        texts.append(format_input(row))

    # One dummy label per text.
    placeholder_labels = [0 for _ in texts]

    return Dataset.from_dict({"text": texts, "label": placeholder_labels})

def tokenize_fn(examples):
    """Tokenize the "text" field with the module-level tokenizer: truncate to 512 tokens, no padding."""
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": False,
    }
    return tokenizer(examples["text"], **tokenizer_options)


# Labelled training set: build text/label pairs, then tokenize and drop the raw text.
train_dataset = prepare_dataset(df_train, label_encoder)
train_dataset = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

# df_val_eval is read by compute_metrics_custom for the gold annotations; the
# index is reset so its rows are positionally aligned with val_dataset.
# NOTE(review): this "validation" set is the dataset's test split (df_test).
# It is later used as eval_dataset with load_best_model_at_end=True, so the
# checkpoint is selected on the same data the final score is reported on.
df_val_eval = df_test.reset_index(drop=True)
val_dataset = prepare_test_dataset(df_val_eval)
val_dataset = val_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

print(f"Train samples: {len(train_dataset)}")
print(f"Val (Test) samples: {len(val_dataset)}")

Map:   0%|          | 0/3403 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Train samples: 3403
Val (Test) samples: 308


In [6]:
# 'balanced' class weights from sklearn, computed over the encoded training
# labels and stored as a float32 tensor for use in the weighted loss.
class_weights = torch.tensor(
    compute_class_weight(
        class_weight='balanced',
        classes=np.arange(num_labels),
        y=label_encoder.transform(df_train['evasion_label']),
    ),
    dtype=torch.float32,
)

print("Class weights:")
for cls, weight in zip(label_encoder.classes_, class_weights):
    print(f"  {cls}: {weight:.3f}")


Class weights:
  Claims ignorance: 3.204
  Clarification: 4.110
  Declining to answer: 2.626
  Deflection: 1.003
  Dodging: 0.542
  Explicit: 0.365
  General: 0.998
  Implicit: 0.788
  Partial/half-answer: 4.786


In [7]:
class WeightedTrainer(Trainer):
    """Trainer whose training loss is cross-entropy with per-class weights."""

    def __init__(self, class_weights, *args, **kwargs):
        """Initialise the base Trainer, then keep ``class_weights`` on ``self.args.device``."""
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model and return the class-weighted cross-entropy loss.

        "labels" is removed from ``inputs`` before the forward pass, so the
        model does not compute its own loss. Extra keyword arguments are
        accepted and ignored. Returns ``(loss, outputs)`` when
        ``return_outputs`` is true, otherwise just ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        loss = torch.nn.functional.cross_entropy(
            outputs.logits,
            targets,
            weight=self.class_weights,
        )

        if return_outputs:
            return (loss, outputs)
        return loss


In [8]:
# Load the pretrained checkpoint with a sequence-classification head that has
# one output per evasion label.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

# `device` is only reported here; this cell does not move the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda


In [9]:
# Training hyperparameters; read when building TrainingArguments and written
# to metadata.json next to the saved model.
config = {
    "model": MODEL_NAME,
    "epochs": 15,  # num_train_epochs
    "batch_size": 16,  # used for both train and eval per-device batch size
    "learning_rate": 1e-5,
    "weight_decay": 0.01,
    "warmup_ratio": 0.1,
}


In [10]:

# Evaluate and checkpoint once per epoch; at the end of training the checkpoint
# with the highest "macro_f1" (from compute_metrics_custom) is reloaded.
training_args = TrainingArguments(
    output_dir=str(MODELS_DIR / "checkpoints"),
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    warmup_ratio=config["warmup_ratio"],
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without a limit, every epoch leaves a full checkpoint in output_dir
    # (on Google Drive when running in Colab). Keep at most two; with
    # load_best_model_at_end the best checkpoint is always retained.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    logging_steps=50,
    report_to="none",
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

In [11]:
# Build the class-weighted trainer. The eval dataset carries placeholder
# labels; the real metric is computed in compute_metrics_custom.
trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
    # NOTE(review): requires a transformers release that supports
    # `processing_class` — confirm against the installed version.
    processing_class=tokenizer,
    # Pads each batch dynamically, since tokenize_fn does not pad.
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics_custom
)

trainer.train()

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Macro F1
1,2.1841,2.159296,0.069487
2,2.0378,3.424102,0.290519
3,1.7952,3.547039,0.320051
4,1.6811,3.613606,0.376123
5,1.4194,4.458251,0.368957
6,1.311,4.924813,0.367342
7,1.1912,4.761642,0.442009
8,1.0867,4.7395,0.437024
9,0.99,4.860992,0.453197
10,0.8384,4.820635,0.477076


TrainOutput(global_step=3195, training_loss=1.1815602865204191, metrics={'train_runtime': 308.014, 'train_samples_per_second': 165.723, 'train_steps_per_second': 10.373, 'total_flos': 1.3423596972674724e+16, 'train_loss': 1.1815602865204191, 'epoch': 15.0})

In [12]:
def evaluate_model(model, tokenizer, label_encoder, df_val):
    """
    Predict one evasion label per row of ``df_val`` and score with Macro-F1.

    Rows are processed one at a time (no batching) with gradients disabled,
    on whatever device the model's parameters live on.

    Gold labels are taken from the annotator1..annotator3 columns. Entries
    that are not strings (e.g. NaN after a CSV round trip) or are empty
    strings are dropped before scoring, matching ``compute_metrics_custom``;
    previously they were passed through and could enter the class set.

    Args:
        model: sequence-classification model exposing ``.logits`` on its output
        tokenizer: tokenizer matching ``model``
        label_encoder: fitted encoder used to decode class ids to labels
        df_val: frame with 'question', 'interview_answer' and the three
            annotator columns

    Returns:
        tuple: (macro_f1 as float, list with one predicted label per row)
    """
    model.eval()
    device = next(model.parameters()).device

    predictions = []

    with torch.no_grad():
        for _, row in df_val.iterrows():
            text = format_input(row)
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)
            pred_idx = outputs.logits.argmax(dim=-1).item()
            predictions.append(label_encoder.inverse_transform([pred_idx])[0])

    annotator_rows = df_val[['annotator1', 'annotator2', 'annotator3']].values.tolist()

    # Keep only real labels so missing annotations never count as a class.
    gold_annotations = [
        [lbl for lbl in row if isinstance(lbl, str) and lbl != '']
        for row in annotator_rows
    ]

    macro_f1 = compute_macro_f1(gold_annotations, predictions)

    return macro_f1, predictions


# Score the in-memory model (after trainer.train()) on df_test, row by row.
macro_f1, predictions = evaluate_model(model, tokenizer, label_encoder, df_test)
print(f"\n{'='*50}")
print(f"Macro F1 on Test: {macro_f1:.4f}")
print(f"{'='*50}")

# How often each label was predicted.
print(f"\nPrediction distribution:")
print(Counter(predictions))



Macro F1 on Test: 0.4770

Prediction distribution:
Counter({'General': 87, 'Explicit': 52, 'Deflection': 49, 'Implicit': 47, 'Dodging': 37, 'Claims ignorance': 13, 'Declining to answer': 10, 'Partial/half-answer': 8, 'Clarification': 5})


In [15]:
# Directory holding everything needed to reload the model for inference.
BEST_MODEL_PATH = MODELS_DIR / "best_model"

# Persist the in-memory model and tokenizer, plus the fitted label encoder.
model.save_pretrained(BEST_MODEL_PATH)
tokenizer.save_pretrained(BEST_MODEL_PATH)
joblib.dump(label_encoder, BEST_MODEL_PATH / "label_encoder.pkl")

# Record the hyperparameters, the macro-F1 computed by evaluate_model on
# df_test, the save time and the training CSV path alongside the weights.
with open(BEST_MODEL_PATH / "metadata.json", "w") as f:
    json.dump({
        "config": config,
        "macro_f1": macro_f1,
        "timestamp": datetime.now().isoformat(),
        "train_path": str(train_path)
    }, f, indent=2)

print(f"Model saved to {BEST_MODEL_PATH}")


Model saved to /content/drive/MyDrive/NLP-Clarity/models/roberta/best_model


**Run this cell ONLY to generate submission files for CodaBench.**

This pipeline will:
1. Load your **best saved RoBERTa model** (`best_model/`) from the models directory.
2. Download the **"test" dataset** from HuggingFace.
3. Generate predictions for both:
   - **Task 2 (Evasion)**: Direct predictions from the model (9 labels).
   - **Task 1 (Clarity)**: Derived by mapping evasion labels to clarity categories (3 labels).
4. Save formatted `.zip` files ready for upload to CodaBench.

The best checkpoint (highest Macro F1 on the evaluation split, which in this notebook is the HuggingFace "test" split) is reloaded at the end of training and then saved to `best_model/` by the preceding cell.


In [16]:
# Maps each of the 9 evasion labels (Task 2) to one of the 3 clarity
# categories (Task 1).
EVASION_TO_CLARITY = {
    'Explicit': 'Clear Reply',
    'Implicit': 'Ambivalent',
    'Dodging': 'Ambivalent',
    'General': 'Ambivalent',
    'Deflection': 'Ambivalent',
    'Partial/half-answer': 'Ambivalent',
    'Declining to answer': 'Clear Non-Reply',
    'Claims ignorance': 'Clear Non-Reply',
    'Clarification': 'Clear Non-Reply',
}

# Output directory for the prediction files and submission zips.
SUBMISSIONS_DIR = MODELS_DIR / "submissions"


def load_best_model():
    """
    Reload the saved model, tokenizer and label encoder from BEST_MODEL_PATH.

    The model is moved to CUDA when available (CPU otherwise) and put in
    eval mode.

    Returns:
        tuple: (model, tokenizer, label_encoder)
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
    model.to(target_device)
    model.eval()

    tokenizer = RobertaTokenizer.from_pretrained(BEST_MODEL_PATH)
    label_encoder = joblib.load(BEST_MODEL_PATH / "label_encoder.pkl")

    return model, tokenizer, label_encoder


def load_test_data():
    """Load the "test" split of ailsntua/QEvasion via ``load_dataset`` and return it as a DataFrame."""
    test_split = load_dataset("ailsntua/QEvasion")["test"]
    return test_split.to_pandas()


def evasion_to_clarity(y_evasion):
    """Translate each evasion label into its clarity label via EVASION_TO_CLARITY (unknown labels raise KeyError)."""
    clarity_labels = []
    for evasion_label in y_evasion:
        clarity_labels.append(EVASION_TO_CLARITY[evasion_label])
    return clarity_labels


def save_submission(predictions, task_name):
    """
    Write predictions to disk and package them as a submission zip.

    The labels are written one per line (no trailing newline) to
    ``prediction_<task_name>`` inside SUBMISSIONS_DIR; that file is then added
    to ``submission_<task_name>.zip`` under the archive name "prediction".

    Returns:
        Path: location of the created zip file
    """
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    pred_path = SUBMISSIONS_DIR / f"prediction_{task_name}"
    with pred_path.open('w') as handle:
        handle.write('\n'.join(predictions))

    zip_path = SUBMISSIONS_DIR / f"submission_{task_name}.zip"
    with zipfile.ZipFile(zip_path, 'w') as archive:
        # The file is stored as "prediction" regardless of the task name.
        archive.write(pred_path, arcname="prediction")

    return zip_path

def generate_submissions():
    """
    Run the submission pipeline: reload the saved model, predict on the test
    split, derive clarity labels, and write both submission zips.

    Returns:
        dict with the two zip paths ("task1_zip", "task2_zip") and the label
        counts of the evasion and clarity predictions ("evasion_dist",
        "clarity_dist").
    """
    model, tokenizer, encoder = load_best_model()
    test_frame = load_test_data()

    # Only the predictions are needed here; the score is discarded.
    _score, evasion_preds = evaluate_model(model, tokenizer, encoder, test_frame)
    clarity_preds = evasion_to_clarity(evasion_preds)

    # Task 2 is written first, then Task 1.
    task2_zip = save_submission(evasion_preds, "task2")
    task1_zip = save_submission(clarity_preds, "task1")

    summary = {
        "task1_zip": task1_zip,
        "task2_zip": task2_zip,
        "evasion_dist": Counter(evasion_preds),
        "clarity_dist": Counter(clarity_preds),
    }
    return summary


# Run the full submission pipeline and display the zip paths and label counts.
results = generate_submissions()
results


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

{'task1_zip': PosixPath('/content/drive/MyDrive/NLP-Clarity/models/roberta/submissions/submission_task1.zip'),
 'task2_zip': PosixPath('/content/drive/MyDrive/NLP-Clarity/models/roberta/submissions/submission_task2.zip'),
 'evasion_dist': Counter({'Dodging': 36,
          'General': 88,
          'Implicit': 47,
          'Deflection': 49,
          'Explicit': 52,
          'Declining to answer': 10,
          'Partial/half-answer': 8,
          'Claims ignorance': 13,
          'Clarification': 5}),
 'clarity_dist': Counter({'Ambivalent': 228,
          'Clear Reply': 52,
          'Clear Non-Reply': 28})}