In [1]:
from pathlib import Path
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
import sys
import torch
from transformers import (
    AutoModelForSequenceClassification,
    RobertaTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import json
import joblib
from datetime import datetime
import zipfile
from collections import Counter
from sklearn.model_selection import train_test_split


# Where model artifacts live: ./models/roberta for local runs, or the project
# folder on the mounted Google Drive when the notebook runs inside Colab.
BASE_DRIVE_DIR = Path("/content/drive/MyDrive/NLP-Clarity")
MODELS_DIR = Path("models") / "roberta"

if "google.colab" in sys.modules:
    from google.colab import drive

    drive.mount("/content/drive")
    MODELS_DIR = BASE_DRIVE_DIR.joinpath("models", "roberta")

# Create the directory (and any missing parents); a no-op if it already exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm
W0108 23:08:03.823000 7620 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [13]:
# Dataset cache location: ./data locally, the Drive project folder on Colab.
DATA_DIR = (BASE_DRIVE_DIR / "data") if "google.colab" in sys.modules else Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# CSV snapshots of the two splits used by this notebook.
train_path = DATA_DIR.joinpath("train.csv")
val_path = DATA_DIR.joinpath("val.csv")


# - The 'test' split on HuggingFace (308 samples) IS the public leaderboard set.
# - We treat this as our VALIDATION set ('df_val') to select the best model.
# - We also save the train and val splits to disk, in case the dataset on HuggingFace is updated (e.g., when the evaluation phase starts).
def load_qevasion_dataset():
    """Return ``(df_train, df_val)`` for the QEvasion dataset.

    When both ``train_path`` and ``val_path`` exist, the frames are read back
    from those CSV files. Otherwise the dataset is fetched with
    ``load_dataset("ailsntua/QEvasion")``, its ``train`` and ``test`` splits are
    converted to pandas, written to the two CSV paths (without the index), and
    the freshly downloaded frames are returned.
    """
    cache_is_complete = train_path.exists() and val_path.exists()
    if not cache_is_complete:
        hub_dataset = load_dataset("ailsntua/QEvasion")
        df_train = hub_dataset["train"].to_pandas()
        # The hub's "test" split plays the role of the validation frame here.
        df_val = hub_dataset["test"].to_pandas()
        df_train.to_csv(train_path, index=False)
        df_val.to_csv(val_path, index=False)
        return df_train, df_val

    return pd.read_csv(train_path), pd.read_csv(val_path)

# The second frame returned by the loader (the HuggingFace 'test' split, or its
# cached CSV) is bound to df_test here.
df_train, df_test = load_qevasion_dataset()

# df_val is a copy of df_test, so both names hold the same rows.
df_val = df_test.copy()

print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

Train: 3448, Val: 308, Test: 308


In [3]:
# f1_for_class follows the scoring function shared by the task authors (posted
# on the Discord group); the TP / FP / FN counting rules are unchanged.
def f1_for_class(gold_annotations, predictions, target_class):
    """
    Precision, recall and F1 restricted to a single class.

    gold_annotations: iterable of label collections, one per sample (every
        label in a sample's collection counts as correct)
    predictions: iterable with exactly one predicted label per sample
    target_class: the label being scored

    Returns a dict with keys "precision", "recall", "f1", "tp", "fp", "fn".
    Any ratio whose denominator is zero is reported as 0.0.
    """
    tp = fp = fn = 0

    for gold_labels, predicted in zip(gold_annotations, predictions):
        accepted = set(gold_labels)

        if predicted == target_class:
            # Predicting the target is a hit only if some annotator chose it.
            if target_class in accepted:
                tp += 1
            else:
                fp += 1
        elif target_class in accepted and predicted not in accepted:
            # A miss counts against the target only when the prediction
            # matches none of the gold labels for this sample.
            fn += 1

    predicted_total = tp + fp
    relevant_total = tp + fn
    precision = tp / predicted_total if predicted_total > 0 else 0.0
    recall = tp / relevant_total if relevant_total > 0 else 0.0

    pr_sum = precision + recall
    f1 = 2 * precision * recall / pr_sum if pr_sum > 0 else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "fp": fp,
        "fn": fn,
    }


def compute_macro_f1(gold_annotations, predictions):
    """
    Compute Macro-F1 score (same as CodaBench leaderboard).

    The set of classes is the union of all labels found in
    ``gold_annotations``; the per-class F1 from ``f1_for_class`` is averaged
    with equal weight per class.

    Args:
        gold_annotations: list of lists - each inner list contains valid labels from annotators
        predictions: list of strings - one prediction per sample

    Returns:
        float: Macro F1 score. 0.0 when ``gold_annotations`` contains no label
        at all (e.g. an empty evaluation set), instead of the ``nan`` (plus a
        RuntimeWarning) that averaging an empty list would produce.
    """
    all_classes = set()
    for gold in gold_annotations:
        all_classes.update(gold)

    # Nothing to average: use the same "undefined -> 0.0" convention as
    # f1_for_class rather than letting np.mean([]) return nan.
    if not all_classes:
        return 0.0

    # Sorted so the averaging order is deterministic across runs.
    f1_scores = [
        f1_for_class(gold_annotations, predictions, cls)["f1"]
        for cls in sorted(all_classes)
    ]

    return float(np.mean(f1_scores))

In [4]:
MODEL_NAME = "roberta-base"

# Fit the label <-> integer-id mapping on the training labels; fit() returns
# the encoder itself, so it can be chained onto the constructor.
label_encoder = LabelEncoder().fit(df_train['evasion_label'])
num_labels = len(label_encoder.classes_)
print(f"Classes ({num_labels}): {label_encoder.classes_}")

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)


Classes (9): ['Claims ignorance' 'Clarification' 'Declining to answer' 'Deflection'
 'Dodging' 'Explicit' 'General' 'Implicit' 'Partial/half-answer']


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
def format_input(row):
    """Render one sample as ``"Q: <question>\\nA: <interview_answer>"``.

    ``row`` is anything indexable by the keys ``'question'`` and
    ``'interview_answer'`` (e.g. a DataFrame row or a dict).
    """
    question = row['question']
    answer = row['interview_answer']
    return f"Q: {question}\nA: {answer}"


def prepare_dataset(df, label_encoder):
    """Build a ``Dataset`` with ``text`` and ``label`` columns from ``df``.

    ``text`` is ``format_input`` applied to every row; ``label`` is the
    ``evasion_label`` column encoded with ``label_encoder.transform``.
    """
    texts = []
    for _, record in df.iterrows():
        texts.append(format_input(record))

    encoded_labels = label_encoder.transform(df['evasion_label']).tolist()
    return Dataset.from_dict({"text": texts, "label": encoded_labels})


def tokenize_fn(examples):
    """Tokenize ``examples["text"]`` with the module-level ``tokenizer``.

    Sequences are truncated to 512 tokens and returned unpadded
    (``padding=False``).
    """
    tokenizer_options = {"truncation": True, "max_length": 512, "padding": False}
    return tokenizer(examples["text"], **tokenizer_options)


# Build each split and tokenize it in one chain; the raw "text" column is
# dropped once it has been tokenized.
train_dataset = prepare_dataset(df_train, label_encoder).map(
    tokenize_fn, batched=True, remove_columns=["text"]
)
val_dataset = prepare_dataset(df_val, label_encoder).map(
    tokenize_fn, batched=True, remove_columns=["text"]
)

print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")


Map: 100%|██████████| 3103/3103 [00:05<00:00, 615.61 examples/s]
Map: 100%|██████████| 345/345 [00:00<00:00, 546.51 examples/s]

Train samples: 3103
Val samples: 345





In [6]:
# 'balanced' class weights computed from the encoded training labels, stored
# directly as a float32 tensor so they can be passed to the loss.
class_weights = torch.tensor(
    compute_class_weight(
        class_weight='balanced',
        classes=np.arange(num_labels),
        y=label_encoder.transform(df_train['evasion_label']),
    ),
    dtype=torch.float32,
)

print("Class weights:")
for i, cls in enumerate(label_encoder.classes_):
    print(f"  {cls}: {class_weights[i]:.3f}")


Class weights:
  Claims ignorance: 3.222
  Clarification: 4.154
  Declining to answer: 2.632
  Deflection: 1.005
  Dodging: 0.543
  Explicit: 0.364
  General: 0.994
  Implicit: 0.785
  Partial/half-answer: 4.856


In [7]:
class WeightedTrainer(Trainer):
    """``Trainer`` whose loss is cross-entropy with a per-class weight vector."""

    def __init__(self, class_weights, *args, **kwargs):
        """Forward all other arguments to ``Trainer`` and keep the weights.

        ``class_weights`` is moved to ``self.args.device`` once, after the
        parent constructor has run.
        """
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the class-weighted cross-entropy loss for one batch.

        ``"labels"`` is popped out of ``inputs`` before the forward pass, so
        the model receives only the remaining keys. Returns ``(loss, outputs)``
        when ``return_outputs`` is true, otherwise just ``loss``. Extra keyword
        arguments are accepted and ignored.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        loss = torch.nn.functional.cross_entropy(
            outputs.logits, targets, weight=self.class_weights
        )

        if return_outputs:
            return loss, outputs
        return loss


In [8]:
# Pretrained checkpoint with a classification head sized for our label set.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu


In [10]:
# Training hyperparameters; read when building TrainingArguments and stored in
# metadata.json next to the saved model.
config = dict(
    model=MODEL_NAME,
    epochs=10,
    batch_size=16,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
)


In [11]:

# Evaluate and checkpoint once per epoch; after training, reload the checkpoint
# with the lowest validation loss.
training_args = TrainingArguments(
    output_dir=str(MODELS_DIR / "checkpoints"),
    num_train_epochs=config["epochs"],
    # The same batch size is used for training and evaluation.
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    learning_rate=config["learning_rate"],
    weight_decay=config["weight_decay"],
    warmup_ratio=config["warmup_ratio"],
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without a limit every epoch's checkpoint is kept in output_dir (on Google
    # Drive when running in Colab). With load_best_model_at_end the best
    # checkpoint is still retained alongside the most recent one.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=50,
    report_to="none",
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)


In [12]:
trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer (it triggers the warning raised at
    # the super().__init__ call); `processing_class=` is its replacement.
    processing_class=tokenizer,
    # Datasets were tokenized with padding=False, so pad per batch here.
    data_collator=DataCollatorWithPadding(tokenizer),
)

trainer.train()


  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
def evaluate_model(model, tokenizer, label_encoder, df_val):
    """Predict a label for every row of ``df_val`` and score the predictions.

    The model is switched to eval mode and run one row at a time (no padding,
    inputs truncated to 512 tokens) on whatever device its parameters live on.
    The arg-max class id is mapped back to its label with ``label_encoder``.
    Gold labels are read from the ``annotator1``, ``annotator2`` and
    ``annotator3`` columns.

    Returns:
        ``(macro_f1, predictions)``: the ``compute_macro_f1`` score and the
        list of predicted labels, in row order.
    """
    model.eval()
    device = next(model.parameters()).device

    def predict_label(row):
        # Single-example forward pass; tensors are moved to the model's device.
        encoded = tokenizer(format_input(row), return_tensors="pt", truncation=True, max_length=512)
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        best_idx = model(**on_device).logits.argmax(dim=-1).item()
        return label_encoder.inverse_transform([best_idx])[0]

    with torch.no_grad():
        predictions = [predict_label(row) for _, row in df_val.iterrows()]

    annotator_columns = ['annotator1', 'annotator2', 'annotator3']
    gold_annotations = df_val[annotator_columns].values.tolist()

    return compute_macro_f1(gold_annotations, predictions), predictions


# Score the current in-memory model on df_test (the same rows as df_val, which
# was created as a copy of it).
macro_f1, predictions = evaluate_model(model, tokenizer, label_encoder, df_test)
print(f"\n{'='*50}")
print(f"Macro F1 on Test: {macro_f1:.4f}")
print(f"{'='*50}")

# How often each label was predicted.
print(f"\nPrediction distribution:")
print(Counter(predictions))


In [None]:
BEST_MODEL_PATH = MODELS_DIR / "best_model"

# Persist everything needed to reload the classifier later: model, tokenizer
# and the fitted label encoder.
model.save_pretrained(BEST_MODEL_PATH)
tokenizer.save_pretrained(BEST_MODEL_PATH)
joblib.dump(label_encoder, BEST_MODEL_PATH / "label_encoder.pkl")

# Record the hyperparameters, the score and the save time next to the model.
run_metadata = {
    "config": config,
    "macro_f1": macro_f1,
    "timestamp": datetime.now().isoformat(),
}
with open(BEST_MODEL_PATH / "metadata.json", "w") as f:
    f.write(json.dumps(run_metadata, indent=2))

print(f"Model saved to {BEST_MODEL_PATH}")


**Run this cell ONLY to generate submission files for CodaBench.**

This pipeline will:
1. Load your **best saved RoBERTa model** (`best_model/`) from the models directory.
2. Download the **"test" dataset** from HuggingFace.
3. Generate predictions for both:
   - **Task 2 (Evasion)**: Direct predictions from the model (9 labels).
   - **Task 1 (Clarity)**: Derived by mapping evasion labels to clarity categories (3 labels).
4. Save formatted `.zip` files ready for upload to CodaBench.

The best checkpoint is selected during training by the lowest validation loss (`metric_for_best_model="eval_loss"`), and the resulting model is saved to `best_model/` by the save cell above.


In [None]:
# Task 1 (clarity) labels are derived from Task 2 (evasion) labels: every
# evasion label belongs to exactly one clarity category.
EVASION_TO_CLARITY = {
    evasion: clarity
    for clarity, evasions in (
        ('Clear Reply', ('Explicit',)),
        ('Ambivalent', ('Implicit', 'Dodging', 'General', 'Deflection', 'Partial/half-answer')),
        ('Clear Non-Reply', ('Declining to answer', 'Claims ignorance', 'Clarification')),
    )
    for evasion in evasions
}

# Directory where save_submission() writes the prediction files and zips.
SUBMISSIONS_DIR = MODELS_DIR / "submissions"


def load_best_model():
    """Reload the saved model, tokenizer and label encoder from ``BEST_MODEL_PATH``.

    The model is moved to CUDA when available (CPU otherwise) and put in eval
    mode. Returns ``(model, tokenizer, label_encoder)``.
    """
    best_model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
    best_tokenizer = RobertaTokenizer.from_pretrained(BEST_MODEL_PATH)
    # NOTE: joblib.load unpickles the file; only load encoders written by this notebook.
    best_label_encoder = joblib.load(BEST_MODEL_PATH / "label_encoder.pkl")

    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    best_model.to(target_device)
    best_model.eval()

    return best_model, best_tokenizer, best_label_encoder


def load_test_data():
    """Fetch the QEvasion dataset from HuggingFace and return its ``test`` split as a DataFrame."""
    return load_dataset("ailsntua/QEvasion")["test"].to_pandas()


def evasion_to_clarity(y_evasion):
    """Translate each evasion label into its clarity label via ``EVASION_TO_CLARITY``.

    Returns a new list in the same order; a label missing from the mapping
    raises ``KeyError``.
    """
    clarity_labels = []
    for evasion_label in y_evasion:
        clarity_labels.append(EVASION_TO_CLARITY[evasion_label])
    return clarity_labels


def save_submission(predictions, task_name):
    """Save predictions as a properly formatted zip for CodaBench.

    Writes one prediction per line to ``SUBMISSIONS_DIR/prediction_<task_name>``
    and packs that file into ``SUBMISSIONS_DIR/submission_<task_name>.zip``
    under the archive name ``prediction``.

    Args:
        predictions: iterable of label strings, one per sample, in order.
        task_name: suffix used for the two output file names (e.g. "task1").

    Returns:
        Path of the created zip file.
    """
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    pred_path = SUBMISSIONS_DIR / f"prediction_{task_name}"
    zip_path = SUBMISSIONS_DIR / f"submission_{task_name}.zip"

    # newline='\n' turns off platform newline translation, so the file uses LF
    # line endings on Windows too (the default would write CRLF there);
    # encoding is pinned so the bytes do not depend on the machine's locale.
    with open(pred_path, 'w', encoding='utf-8', newline='\n') as f:
        f.write('\n'.join(predictions))

    with zipfile.ZipFile(zip_path, 'w') as zf:
        zf.write(pred_path, "prediction")

    return zip_path

def generate_submissions():
    """Full pipeline: load model → predict → save submissions.

    Loads the saved model, predicts evasion labels for the freshly downloaded
    test split, derives the clarity labels from them, and writes one zip per
    task (task2 first, then task1).

    Returns a dict with the two zip paths (``task1_zip``, ``task2_zip``) and a
    ``Counter`` of predicted labels per task (``evasion_dist``, ``clarity_dist``).
    """
    best_model, best_tokenizer, best_label_encoder = load_best_model()
    test_frame = load_test_data()

    # Only the predictions are needed here; the score is discarded.
    evasion_preds = evaluate_model(best_model, best_tokenizer, best_label_encoder, test_frame)[1]
    clarity_preds = evasion_to_clarity(evasion_preds)

    task2_archive = save_submission(evasion_preds, "task2")
    task1_archive = save_submission(clarity_preds, "task1")

    summary = {
        "task1_zip": task1_archive,
        "task2_zip": task2_archive,
        "evasion_dist": Counter(evasion_preds),
        "clarity_dist": Counter(clarity_preds),
    }
    return summary


# Run the whole submission pipeline; the bare name on the last line makes the
# notebook display the returned summary dict.
results = generate_submissions()
results
