In [None]:
%%capture
# Dependency setup. On Colab the packages are installed one group at a time
# (mostly with --no-deps); in any other environment a plain `unsloth` install
# is used instead.
import os, re, sys
if 'google.colab' in sys.modules:
    # Take the "major.minor" prefix of the installed torch version and choose the
    # xformers pin for it: 2.9 -> 0.0.33.post1, 2.8 -> 0.0.32.post2,
    # anything else -> 0.0.29.post3.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # {xformers} is interpolated from the Python variable defined above.
    %pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    %pip install --no-deps unsloth
else:
    %pip install unsloth

# These two pins run in every environment and run last, so they override
# whatever transformers / trl versions the installs above selected.
%pip install transformers==4.56.2
%pip install --no-deps trl==0.22.2

In [None]:
from unsloth import FastLanguageModel
from trl import SFTConfig, SFTTrainer
from pathlib import Path
import sys
from datasets import load_dataset
import pandas as pd
from datasets import Dataset
from tqdm import tqdm
import numpy as np
import torch
import zipfile
from collections import Counter

# Google Drive folder that holds the project when the notebook runs on Colab.
BASE_DRIVE_DIR = Path("/content/drive/MyDrive/NLP-Clarity")

if 'google.colab' in sys.modules:
    # On Colab: mount Drive and keep all model artifacts inside the project folder.
    from google.colab import drive
    drive.mount('/content/drive')
    MODELS_DIR = BASE_DRIVE_DIR / "models" / "llama31_8b_lora"
else:
    # Elsewhere: use a directory relative to the current working directory.
    MODELS_DIR = Path("models/llama31_8b_lora")

# Later cells write logs, checkpoints and adapters here, so create it up front.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# === MODEL CONFIG ===
# These three globals are reused later: max_seq_length is passed to SFTTrainer in
# run_experiment(), and all three are passed again in load_best_model().
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# === LOAD MODEL ===
# Loads the base checkpoint and its tokenizer; both are stored as notebook-level
# globals that the training and evaluation cells read directly.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# Wrap the loaded base model with LoRA adapters; `model` is rebound to the wrapped
# model, so every later cell works with the adapter-augmented version.
# Note: the rank is the literal 16 below. The "r" entry of the experiment `config`
# dict is not read by this call.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Adapters are attached to the attention projections and the MLP projections.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # same value as the trainer seed used in run_experiment()
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# Dataset CSVs live in the Drive project folder on Colab, otherwise under ./data.
DATA_DIR = BASE_DRIVE_DIR / "data" if 'google.colab' in sys.modules else Path("data")

# Cache locations for the train and validation splits.
train_path, val_path = DATA_DIR / "train.csv", DATA_DIR / "val.csv"


# - The 'test' split on HuggingFace (308 samples) IS the public leaderboard set.
# - We treat it as our VALIDATION set ('df_val') and use it to select the best model.
# - Both splits are also saved to disk on first download, in case the HuggingFace dataset is updated later (e.g., when the evaluation phase starts).
def load_qevasion_dataset():
    """Return the ``(df_train, df_val)`` DataFrames, caching them as CSV files.

    If both ``train_path`` and ``val_path`` exist, they are read with
    ``pd.read_csv``. Otherwise the "ailsntua/QEvasion" dataset is loaded with
    ``load_dataset``, its "train" and "test" splits are converted to pandas,
    written to those two paths, and returned.

    Returns:
        tuple: ``(df_train, df_val)``, where ``df_val`` comes from the dataset's
        "test" split.
    """
    if train_path.exists() and val_path.exists():
        return pd.read_csv(train_path), pd.read_csv(val_path)

    dataset = load_dataset("ailsntua/QEvasion")
    df_train = dataset["train"].to_pandas()
    df_val = dataset["test"].to_pandas()

    # to_csv does not create missing directories; without this, a first run in an
    # environment that has no data folder yet fails right after the download.
    for csv_path in (train_path, val_path):
        csv_path.parent.mkdir(parents=True, exist_ok=True)

    df_train.to_csv(train_path, index=False)
    df_val.to_csv(val_path, index=False)
    return df_train, df_val

# Load (or download and cache) the two splits once. df_train feeds the training
# dataset built below; df_val is what run_experiment() evaluates on.
df_train, df_val = load_qevasion_dataset()

In [None]:
# Task description placed in the "### Instruction:" slot of every prompt, both when
# building training texts and when predicting. It is also stored in the experiment
# config under "instruction".
INSTRUCTION = """You are an expert in political discourse analysis. Analyze the following question-answer pair from a political interview and classify the evasion strategy.

Context: The question is a specific sub-question, but the answer is the full response. Focus only on the portion of the answer relevant to the sub-question.

The taxonomy of responses consists of 3 main clarity levels, which are further divided into 9 specific evasion types:

1. Clear Reply (Unambiguous)
   - 'Explicit': The information requested is explicitly stated (in the requested form)

2. Ambivalent Reply
   - 'Implicit': The information requested is given, but without being explicitly stated (not in the expected form)
   - 'General': The information provided is too general/lacks the requested specificity
   - 'Partial/half-answer': Offers only a specific component of the requested information
   - 'Dodging': Ignoring the question altogether
   - 'Deflection': Starts on topic but shifts focus and makes a different point than asked

3. Clear Non-Reply
   - 'Declining to answer': Acknowledge the question but directly or indirectly refusing to answer at the moment
   - 'Claims ignorance': The answerer claims/admits not to know the answer themselves
   - 'Clarification': Does not provide the requested information and asks for clarification

Task: Output ONLY the specific evasion type (e.g., 'Explicit', 'Deflection', etc.)."""

# Prompt template with three positional slots, filled via str.format in this order:
# instruction, input (question + answer), response. The response slot holds the gold
# label for training texts and an empty string at prediction time.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Appended to every training text by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

In [None]:
def formatting_prompts_func(examples):
    """Turn a batch of dataset columns into full training texts.

    Args:
        examples: batch mapping with parallel ``question``, ``interview_answer``
            and ``evasion_label`` sequences (the shape ``Dataset.map`` passes
            when ``batched=True``).

    Returns:
        dict: ``{"text": [...]}`` with one string per example — the Alpaca
        prompt filled with the instruction, the question/answer pair and the
        gold label, followed by ``EOS_TOKEN``.
    """
    rows = zip(
        examples["question"],
        examples["interview_answer"],
        examples["evasion_label"],
    )
    rendered = [
        alpaca_prompt.format(INSTRUCTION, f"Question: {q}\nAnswer: {a}", gold) + EOS_TOKEN
        for q, a, gold in rows
    ]
    return {"text": rendered}

# Convert the training frame to a datasets.Dataset, then add the "text" column that
# SFTTrainer is told to read (dataset_text_field = "text" in run_experiment).
train_dataset = Dataset.from_pandas(df_train)
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)

In [None]:
# Record the properties of GPU 0 and the reserved-memory baseline before training.
# print_training_stats() later subtracts start_gpu_memory from the peak and divides
# by max_memory. Both values are divided by 1024**3 and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

def print_training_stats(trainer_stats):
    """Print training runtime and peak reserved GPU memory.

    Args:
        trainer_stats: object whose ``metrics`` mapping contains
            ``'train_runtime'`` (as returned by ``trainer.train()`` in
            ``run_experiment``).

    Relies on the notebook-level ``start_gpu_memory`` and ``max_memory`` values
    captured before training.
    """
    peak_gb = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    training_gb = round(peak_gb - start_gpu_memory, 3)
    peak_share = round(peak_gb / max_memory * 100, 3)
    training_share = round(training_gb / max_memory * 100, 3)
    runtime = trainer_stats.metrics['train_runtime']

    report = (
        f"{runtime} seconds used for training.",
        f"{round(runtime/60, 2)} minutes used for training.",
        f"Peak reserved memory = {peak_gb} GB.",
        f"Peak reserved memory for training = {training_gb} GB.",
        f"Peak reserved memory % of max memory = {peak_share} %.",
        f"Peak reserved memory for training % of max memory = {training_share} %.",
    )
    for line in report:
        print(line)

In [None]:
# f1_for_class applies the same counting rules as the function used by the authors (they posted it on discord group)
def f1_for_class(gold_annotations, predictions, target_class):
    """
    Precision / recall / F1 for a single class under multi-annotator gold labels.

    Counting rules, per sample:
      - TP: the prediction is target_class and target_class is among the gold labels.
      - FP: the prediction is target_class but target_class is not among the gold labels.
      - FN: target_class is among the gold labels and the prediction matches none
            of the sample's gold labels.

    gold_annotations: list of lists (or sets) with the accepted labels per sample
    predictions: list with one prediction per sample
    target_class: the class to score

    Returns a dict with keys "precision", "recall", "f1", "tp", "fp", "fn";
    each ratio is 0.0 when its denominator is zero.
    """
    tally = {"tp": 0, "fp": 0, "fn": 0}

    for labels, guess in zip(gold_annotations, predictions):
        accepted = set(labels)
        target_is_gold = target_class in accepted

        if guess == target_class:
            # Predicted the target: a hit if any annotator agrees, otherwise a false alarm.
            tally["tp" if target_is_gold else "fp"] += 1
        elif target_is_gold and guess not in accepted:
            # Target was acceptable and the prediction matched no annotator at all.
            tally["fn"] += 1

    tp, fp, fn = tally["tp"], tally["fp"], tally["fn"]

    precision = 0.0
    if tp + fp > 0:
        precision = tp / (tp + fp)

    recall = 0.0
    if tp + fn > 0:
        recall = tp / (tp + fn)

    f1 = 0.0
    if precision + recall > 0:
        f1 = 2 * precision * recall / (precision + recall)

    return {"precision": precision, "recall": recall, "f1": f1, "tp": tp, "fp": fp, "fn": fn}


def compute_macro_f1(gold_annotations, predictions):
    """
    Macro-averaged F1 over every class that appears in the gold annotations.

    The class set is the union of all labels found in ``gold_annotations``;
    ``f1_for_class`` is evaluated for each class (in sorted order) and the
    unweighted mean of the per-class F1 values is returned.

    Args:
        gold_annotations: list of lists - accepted labels per sample
        predictions: list of strings - one prediction per sample

    Returns:
        float: Macro F1 score
    """
    label_space = set().union(*gold_annotations)

    per_class_f1 = [
        f1_for_class(gold_annotations, predictions, label)["f1"]
        for label in sorted(label_space)
    ]

    return float(np.mean(per_class_f1))

In [None]:
def predict_evasion_labels(model, tokenizer, df):
    """Generate one evasion label per row of ``df``.

    For each row, the ``question`` and ``interview_answer`` values are placed in
    the same Alpaca-style prompt used for training (with an empty response
    slot), up to 20 new tokens are generated, and the first line of the
    generated text is taken as the label.

    Args:
        model: model switched to inference mode via
            ``FastLanguageModel.for_inference`` and used for ``generate``.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        df: DataFrame with ``question`` and ``interview_answer`` columns.

    Returns:
        list[str]: one prediction per row, in row order. The strings are the
        model's raw first output line and are not validated against the label set.
    """
    FastLanguageModel.for_inference(model)

    predictions = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Predicting"):
        input_text = f"Question: {row['question']}\nAnswer: {row['interview_answer']}"
        prompt = alpaca_prompt.format(INSTRUCTION, input_text, "")

        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,
            use_cache=True,
            # Greedy decoding. Without this, generate() follows the checkpoint's
            # generation config, which may enable sampling and would make the
            # predictions (and the model-selection score) vary between runs.
            do_sample=False,
        )

        # generate() returns prompt tokens followed by the new tokens; decode only
        # the new ones so the label never has to be recovered by searching the
        # echoed prompt for "### Response:".
        prompt_length = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Keep just the first line of the completion as the label.
        predictions.append(completion.strip().split("\n")[0].strip())

    return predictions

def evaluate_model(model, tokenizer, df_val):
    """Score ``model`` on ``df_val`` with the multi-annotator Macro-F1.

    Predictions come from ``predict_evasion_labels``; the accepted labels for
    each row are the values of its ``annotator1``..``annotator3`` columns.

    Returns:
        The value returned by ``compute_macro_f1``.
    """
    annotator_columns = ['annotator1', 'annotator2', 'annotator3']
    predicted = predict_evasion_labels(model, tokenizer, df_val)
    accepted_per_row = df_val[annotator_columns].values.tolist()
    return compute_macro_f1(accepted_per_row, predicted)

In [None]:
import json
from datetime import datetime

# JSON file holding the list of all experiment records (appended to by log_experiment).
RESULTS_LOG_PATH = MODELS_DIR / "experiment_log.json"
# JSON file holding the config and macro-F1 of the best run so far (see save_if_best).
BEST_SCORE_PATH = MODELS_DIR / "best_score.json"


def log_experiment(config, macro_f1):
    """Append one experiment record to the JSON log at ``RESULTS_LOG_PATH``.

    The record contains the current timestamp, ``config``, the training CSV
    path (as a string) and ``macro_f1``.

    Args:
        config: JSON-serializable dict describing the run.
        macro_f1: score of the run.

    Raises:
        TypeError: if ``config`` contains values that cannot be serialized to
            JSON. The log file on disk is left untouched in that case.
    """
    experiment_log = []
    if RESULTS_LOG_PATH.exists():
        try:
            with open(RESULTS_LOG_PATH, 'r') as f:
                experiment_log = json.load(f)
        except json.JSONDecodeError:
            # An unreadable log (e.g. left behind by an interrupted write) must not
            # abort a finished training run: keep the bad file for inspection and
            # start a new list.
            backup_path = RESULTS_LOG_PATH.with_name(RESULTS_LOG_PATH.name + ".corrupt")
            RESULTS_LOG_PATH.replace(backup_path)
            print(f"Could not parse {RESULTS_LOG_PATH}; moved it to {backup_path} and started a new log.")
            experiment_log = []

    experiment_log.append({
        "timestamp": datetime.now().isoformat(),
        "config": config,
        # train_path is a pathlib.Path, which json cannot serialize directly.
        "train_data": str(train_path),
        "macro_f1": macro_f1
    })

    # Serialize first, then write: if serialization fails, the existing log is
    # not truncated.
    serialized = json.dumps(experiment_log, indent=2)
    with open(RESULTS_LOG_PATH, 'w') as f:
        f.write(serialized)


def save_if_best(model, tokenizer, config, macro_f1):
    """Save the model and tokenizer if ``macro_f1`` beats the stored best score.

    The previous best is read from ``BEST_SCORE_PATH`` (0 when the file does not
    exist or has no "macro_f1" entry). When the new score is strictly higher,
    the model and tokenizer are saved to ``MODELS_DIR / "best_lora"`` and the
    score file is rewritten with the timestamp, ``config`` and ``macro_f1``.

    Args:
        model: object exposing ``save_pretrained(path)``.
        tokenizer: object exposing ``save_pretrained(path)``.
        config: JSON-serializable dict describing the run.
        macro_f1: score of the run.

    Returns:
        bool: True if a new best was saved, False otherwise.
    """
    if BEST_SCORE_PATH.exists():
        with open(BEST_SCORE_PATH, 'r') as f:
            best_f1 = json.load(f).get("macro_f1", 0)
    else:
        best_f1 = 0

    if not macro_f1 > best_f1:
        return False

    # Build the score record before touching the disk so that a serialization
    # error cannot leave a half-updated state.
    score_record = json.dumps({
        "timestamp": datetime.now().isoformat(),
        "config": config,
        "macro_f1": macro_f1
    }, indent=2)

    # Persist the weights first and record the score only once they are on disk.
    # Otherwise a failed save would leave best_score.json claiming a best model
    # that was never written, and later runs would be compared against it.
    best_dir = MODELS_DIR / "best_lora"
    model.save_pretrained(best_dir)
    tokenizer.save_pretrained(best_dir)

    with open(BEST_SCORE_PATH, 'w') as f:
        f.write(score_record)

    print(f"New Best! F1: {macro_f1:.4f} (prev: {best_f1:.4f})")
    return True

**Use this cell to train models, track experiments, and save the best performing one.**

This pipeline will:
1. **Train Llama 3.1 8B** with LoRA adapters using the specified configuration.
2. **Evaluate** performance on the validation set using the official Macro-F1 metric.
3. **Log** every experiment to `experiment_log.json` (so you don't lose history).
4. **Auto-Save Best Model**: If the current model beats the previous best F1 score, it automatically saves LoRA adapters to `best_lora/`.

**How to use:**
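- Change the `config` dictionary below. Only `epochs`, `learning_rate`, `batch_size`, `grad_accum`, `warmup_steps`, and `weight_decay` affect training; the other keys (`model`, `adapter`, `r`, `instruction`) are only recorded in the log.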
- Run the cell.

In [None]:
def run_experiment(config):
    """Fine-tune the shared model with ``config``, score it, and record the run.

    Keys read from ``config``: ``epochs`` and ``learning_rate`` (required);
    ``batch_size`` (default 2), ``grad_accum`` (default 4), ``warmup_steps``
    (default 5) and ``weight_decay`` (default 0.001). Any other keys are not
    used for training; the whole dict is passed to ``log_experiment`` and
    ``save_if_best``.

    Uses the notebook-level ``model``, ``tokenizer``, ``train_dataset``,
    ``max_seq_length`` and ``df_val``.

    NOTE(review): every call trains the same global ``model`` object, so a
    second call presumably continues from the adapters left by the previous
    one — confirm whether a fresh model should be loaded between experiments.

    Returns:
        tuple: ``(macro_f1, trainer_stats)`` — the validation score and the
        object returned by ``trainer.train()``.
    """
    training_args = SFTConfig(
        per_device_train_batch_size = config.get("batch_size", 2),
        gradient_accumulation_steps = config.get("grad_accum", 4),
        warmup_steps = config.get("warmup_steps", 5),
        num_train_epochs = config["epochs"],
        learning_rate = config["learning_rate"],
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = config.get("weight_decay", 0.001),
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = str(MODELS_DIR / "checkpoints"),
        report_to = "none",  # no external experiment tracker
    )

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        packing = False,  # one example per sequence
        args = training_args,
    )

    # Train, then report runtime and memory usage.
    trainer_stats = trainer.train()
    print_training_stats(trainer_stats)

    # Score on the validation frame, then log the run and keep the model if it is the best.
    score = evaluate_model(model, tokenizer, df_val)
    log_experiment(config, score)
    save_if_best(model, tokenizer, config, score)

    return score, trainer_stats


In [None]:
# Experiment description. run_experiment() reads "epochs", "learning_rate",
# "batch_size" and "grad_accum" from it (plus the optional "warmup_steps" and
# "weight_decay", which are not set here and fall back to their defaults).
config = {
    "model": "Llama-3.1-8B",   # recorded in the log only
    "adapter": "LoRA",         # recorded in the log only
    "r": 16,                   # recorded in the log only; the LoRA rank itself is set in the get_peft_model cell
    "epochs": 5,
    "learning_rate": 2e-4,
    "instruction": INSTRUCTION, # recorded so the log shows which prompt was used
    "batch_size": 16,
    "grad_accum": 2,
}

# Train, evaluate, log, and save the model if it is the best so far.
macro_f1, stats = run_experiment(config)

Run this cell ONLY to generate submission files for CodaBench.

This pipeline will:
1. Load your **best saved LoRA model** (`best_lora/`) from the models directory.
2. Download the **`test` split** of the `ailsntua/QEvasion` dataset from HuggingFace.
3. Generate predictions for both:
   - **Task 2 (Evasion)**: Direct predictions from the model (9 labels).
   - **Task 1 (Clarity)**: Derived by mapping evasion labels to clarity categories (3 labels).
4. Save formatted `.zip` files ready for upload to CodaBench.

In [None]:
# Maps each of the 9 evasion labels (Task 2) to its clarity category (Task 1).
# evasion_to_clarity() uses it to derive the Task 1 predictions.
EVASION_TO_CLARITY = {
    # Clear Reply
    'Explicit': 'Clear Reply',
    # Ambivalent
    'Implicit': 'Ambivalent',
    'Dodging': 'Ambivalent',
    'General': 'Ambivalent',
    'Deflection': 'Ambivalent',
    'Partial/half-answer': 'Ambivalent',
    # Clear Non-Reply
    'Declining to answer': 'Clear Non-Reply',
    'Claims ignorance': 'Clear Non-Reply',
    'Clarification': 'Clear Non-Reply',
}

# Folder that receives the prediction files and submission zips.
SUBMISSIONS_DIR = MODELS_DIR / "submissions"


def load_best_model():
    """Load the model saved under ``MODELS_DIR / "best_lora"`` in inference mode.

    The notebook-level ``max_seq_length``, ``dtype`` and ``load_in_4bit``
    settings are reused for loading.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by
        ``FastLanguageModel.from_pretrained``.
    """
    adapter_dir = str(MODELS_DIR / "best_lora")
    best_model, best_tokenizer = FastLanguageModel.from_pretrained(
        model_name = adapter_dir,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(best_model)
    return best_model, best_tokenizer


def load_test_data():
    """Load "ailsntua/QEvasion" via ``load_dataset`` and return its "test" split as a DataFrame."""
    return load_dataset("ailsntua/QEvasion")["test"].to_pandas()


def evasion_to_clarity(y_evasion, default="Ambivalent"):
    """Map evasion labels to clarity labels.

    Exact keys of ``EVASION_TO_CLARITY`` are mapped directly. Because the
    labels come from free-form model output, anything else is first normalized
    (surrounding whitespace, quotes and a trailing period removed, compared
    case-insensitively). Labels that still cannot be matched are mapped to
    ``default`` and reported in a single warning, so one unexpected generation
    does not abort the run after inference has finished.

    Args:
        y_evasion: iterable of predicted evasion labels.
        default: clarity label used for unrecognized predictions. Defaults to
            "Ambivalent", the category most evasion labels map to.

    Returns:
        list[str]: one clarity label per input label, in order.
    """
    import warnings

    normalized_lookup = {
        name.casefold(): clarity for name, clarity in EVASION_TO_CLARITY.items()
    }

    clarity_labels = []
    unrecognized = []
    for label in y_evasion:
        if label in EVASION_TO_CLARITY:
            clarity_labels.append(EVASION_TO_CLARITY[label])
            continue

        cleaned = str(label).strip().strip("'\"`").rstrip(".").strip().casefold()
        if cleaned in normalized_lookup:
            clarity_labels.append(normalized_lookup[cleaned])
        else:
            unrecognized.append(label)
            clarity_labels.append(default)

    if unrecognized:
        warnings.warn(
            f"{len(unrecognized)} prediction(s) are not known evasion labels and were "
            f"mapped to {default!r}: {sorted(set(map(str, unrecognized)))}"
        )

    return clarity_labels


def save_submission(predictions, task_name):
    """Write ``predictions`` to a text file and wrap it in a submission zip.

    The labels are joined with newlines into
    ``SUBMISSIONS_DIR / "prediction_<task_name>"``; that file is then stored in
    ``SUBMISSIONS_DIR / "submission_<task_name>.zip"`` under the archive name
    ``prediction``.

    Args:
        predictions: iterable of label strings, one per sample.
        task_name: suffix used in both file names (e.g. "task1").

    Returns:
        Path: location of the created zip file.
    """
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    prediction_file = SUBMISSIONS_DIR / f"prediction_{task_name}"
    archive_file = SUBMISSIONS_DIR / f"submission_{task_name}.zip"

    prediction_file.write_text('\n'.join(predictions))

    # The member is always named "prediction", regardless of the task suffix.
    with zipfile.ZipFile(archive_file, 'w') as archive:
        archive.write(prediction_file, "prediction")

    return archive_file


def generate_submissions():
    """Run the submission pipeline: load best model, predict, derive clarity, zip.

    Returns:
        dict: ``task1_zip`` / ``task2_zip`` (values returned by
        ``save_submission``) and ``evasion_dist`` / ``clarity_dist``
        (``Counter`` objects over the predicted labels).
    """
    best_model, best_tokenizer = load_best_model()
    test_frame = load_test_data()

    # Task 2 labels come straight from the model; Task 1 labels are derived from them.
    evasion_preds = predict_evasion_labels(best_model, best_tokenizer, test_frame)
    clarity_preds = evasion_to_clarity(evasion_preds)

    task2_archive = save_submission(evasion_preds, "task2")
    task1_archive = save_submission(clarity_preds, "task1")

    summary = {}
    summary["task1_zip"] = task1_archive
    summary["task2_zip"] = task2_archive
    summary["evasion_dist"] = Counter(evasion_preds)
    summary["clarity_dist"] = Counter(clarity_preds)
    return summary

# Run the full submission pipeline and show its summary (zip paths plus the
# predicted label distributions) as the cell output.
results = generate_submissions()
results