In [None]:
from pathlib import Path
from datasets import load_dataset
import pandas as pd
import numpy as np
import sys
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
import xgboost as xgb
import json
import joblib
from datetime import datetime
import zipfile
from collections import Counter
from itertools import product


# Default (local) directory for trained models, experiment logs and submissions.
MODELS_DIR = Path("models/embeddings_xgboost")
# Project folder on Google Drive; only used when the notebook runs inside Colab.
BASE_DRIVE_DIR = Path("/content/drive/MyDrive/NLP-Clarity")

# When running on Colab, mount Drive and point MODELS_DIR at the Drive project folder.
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    MODELS_DIR = BASE_DRIVE_DIR / "models" / "embeddings_xgboost"

# Create the models directory (and any missing parents); a no-op if it already exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Local directory holding the cached dataset CSV files.
DATA_DIR = Path("data")

# On Colab the cached CSVs live under the Drive project folder instead.
if 'google.colab' in sys.modules:
    DATA_DIR = BASE_DRIVE_DIR / "data"

# Cache files read and written by load_qevasion_dataset().
train_path = DATA_DIR / "train.csv"
val_path = DATA_DIR / "val.csv"


# - The 'test' split on HuggingFace (308 samples) IS the public leaderboard set.
# - We treat this as our VALIDATION set ('df_val') to select the best model.
# - We also save train and val to disk in case the HuggingFace dataset is updated (e.g., when the evaluation phase starts).
def load_qevasion_dataset():
    """
    Load the QEvasion train/validation DataFrames, caching them as CSV.

    If both `train_path` and `val_path` exist, they are read from disk.
    Otherwise the "ailsntua/QEvasion" dataset is downloaded from HuggingFace:
    its 'train' split becomes the training frame, its 'test' split becomes
    the validation frame, and both are written to the cache paths.

    Returns:
        tuple: (df_train, df_val) as pandas DataFrames
    """
    if train_path.exists() and val_path.exists():
        return pd.read_csv(train_path), pd.read_csv(val_path)

    dataset = load_dataset("ailsntua/QEvasion")
    df_train = dataset["train"].to_pandas()
    df_val = dataset["test"].to_pandas()

    # Nothing else in the visible notebook creates the cache directory, and
    # to_csv does not create missing parents, so make sure it exists first.
    for cache_path in (train_path, val_path):
        cache_path.parent.mkdir(parents=True, exist_ok=True)

    df_train.to_csv(train_path, index=False)
    df_val.to_csv(val_path, index=False)
    return df_train, df_val

# Training frame and validation frame (the HuggingFace 'test' split) used by the cells below.
df_train, df_val = load_qevasion_dataset()

In [None]:
# f1_for_class implements the exact scoring logic used by the authors (they posted it in the Discord group)
def f1_for_class(gold_annotations, predictions, target_class):
    """
    Precision, recall and F1 restricted to a single class.

    A sample may carry several acceptable gold labels (one per annotator).

    gold_annotations: iterable of label collections, one collection per sample
    predictions: iterable with exactly one predicted label per sample
    target_class: the label being scored

    Returns a dict with keys "precision", "recall", "f1", "tp", "fp", "fn".
    """
    tp = fp = fn = 0

    for sample_labels, predicted in zip(gold_annotations, predictions):
        accepted = set(sample_labels)
        target_is_gold = target_class in accepted

        if predicted == target_class:
            # The target was predicted: a hit if any annotator agrees, else a false alarm.
            if target_is_gold:
                tp += 1
            else:
                fp += 1
        elif target_is_gold and predicted not in accepted:
            # The target was acceptable, but the prediction matches no gold label at all.
            fn += 1

    predicted_total = tp + fp
    relevant_total = tp + fn

    precision = tp / predicted_total if predicted_total > 0 else 0.0
    recall = tp / relevant_total if relevant_total > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {"precision": precision, "recall": recall, "f1": f1, "tp": tp, "fp": fp, "fn": fn}


def compute_macro_f1(gold_annotations, predictions):
    """
    Macro-averaged F1 over every class that appears in the gold annotations.

    Args:
        gold_annotations: list of lists - the acceptable labels for each sample
        predictions: list of strings - one predicted label per sample

    Returns:
        float: unweighted mean of the per-class F1 scores
    """
    # Only labels seen in the gold data are scored; prediction-only labels are ignored.
    label_pool = set().union(*gold_annotations)

    per_class_f1 = [
        f1_for_class(gold_annotations, predictions, label)["f1"]
        for label in sorted(label_pool)
    ]

    return float(np.mean(per_class_f1))

In [None]:
# Sentence-transformers checkpoint used to embed both questions and answers.
model_name = 'sentence-transformers/all-mpnet-base-v2'
# Loaded once at module level; extract_features() reads this global.
embedder = SentenceTransformer(model_name)


def extract_features(df):
    """
    Build the feature matrix for a frame of question/answer pairs.

    Features (concatenated per row):
        - Question embedding (768 dims)
        - Answer embedding (768 dims)
        - Cosine similarity between the two embeddings (1 dim)

    Args:
        df: DataFrame with 'question' and 'interview_answer' text columns

    Returns:
        np.ndarray: one row per sample, in the same order as `df`
    """
    q_emb = np.asarray(embedder.encode(df['question'].tolist(), show_progress_bar=True))
    a_emb = np.asarray(embedder.encode(df['interview_answer'].tolist(), show_progress_bar=True))

    # Only the similarity of each question with its OWN answer is needed, so
    # compute it row by row instead of building the full N x N pairwise matrix
    # and reading its diagonal.
    dot = np.einsum('ij,ij->i', q_emb, a_emb)
    denom = np.linalg.norm(q_emb, axis=1) * np.linalg.norm(a_emb, axis=1)
    # A zero vector gets similarity 0 rather than NaN.
    denom[denom == 0] = 1.0
    cos_sim = (dot / denom).reshape(-1, 1)

    return np.hstack([q_emb, a_emb, cos_sim])


# Feature matrices for the training and validation frames (rows follow the frames' order).
X_train = extract_features(df_train)
X_val = extract_features(df_val)

# Experiment Tracking & Training

**Use this cell to train models, track experiments, and save the best performing one.**

This pipeline will:
1. **Train XGBoost** with the specified configuration (hyperparameters).
2. **Evaluate** performance on the validation set using the official Macro-F1 metric.
3. **Log** every experiment to `experiment_log.json` (so you don't lose history).
4. **Auto-Save Best Model**: If the current model beats the previous best F1 score, it automatically overwrites `best_model.pkl`.

**How to use:**
- Edit the `params_grid` dictionary at the bottom (every combination of its values is trained and evaluated).
- Run the cell.

In [None]:
# JSON list with one entry (timestamp, config, macro_f1) per experiment run.
RESULTS_LOG_PATH = MODELS_DIR / "experiment_log.json"
# Timestamp, config and macro_f1 of the best model saved so far.
BEST_SCORE_PATH = MODELS_DIR / "best_score.json"
# joblib bundle holding the best model, its label encoder and its config.
BEST_MODEL_PATH = MODELS_DIR / "best_model.pkl"


def train_xgboost(config, X_train, y_train):
    """
    Train an XGBoost classifier with balanced sample weights.

    Args:
        config: dict with the required keys "n_estimators", "max_depth" and
            "learning_rate". Optional keys: "device" (default "cuda") and
            "random_state" (default 42), so the same code can run on a
            CPU-only machine by passing e.g. {"device": "cpu"}.
        X_train: feature matrix, one row per training sample
        y_train: raw (string) class labels aligned with X_train

    Returns:
        tuple: (fitted XGBClassifier, fitted LabelEncoder)
    """
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y_train)

    # 'balanced' sample weights compensate for class imbalance in the labels.
    weights = compute_sample_weight('balanced', y_encoded)

    clf = xgb.XGBClassifier(
        n_estimators=config["n_estimators"],
        max_depth=config["max_depth"],
        learning_rate=config["learning_rate"],
        objective='multi:softmax',
        num_class=len(label_encoder.classes_),
        random_state=config.get("random_state", 42),
        n_jobs=-1,
        eval_metric='mlogloss',
        # Defaults to the previous hard-coded value; override via config["device"].
        device=config.get("device", "cuda"),
    )
    clf.fit(X_train, y_encoded, sample_weight=weights)

    return clf, label_encoder


def evaluate_model(clf, label_encoder, X_val, df_val):
    """
    Score a fitted classifier on the validation set.

    Predictions are decoded back to label strings and compared against the
    three annotator columns of `df_val` with the macro-F1 metric.

    Returns:
        float: macro F1
    """
    predicted_labels = label_encoder.inverse_transform(clf.predict(X_val))

    annotator_columns = ['annotator1', 'annotator2', 'annotator3']
    gold_per_sample = df_val[annotator_columns].values.tolist()

    return compute_macro_f1(gold_per_sample, predicted_labels)


def log_experiment(config, macro_f1):
    """
    Append one experiment record to the JSON log at RESULTS_LOG_PATH.

    Args:
        config: JSON-serialisable dict describing the run
        macro_f1: score achieved by the run

    Raises:
        TypeError: if the record cannot be serialised to JSON; the existing
            log file is left untouched in that case.
    """
    if RESULTS_LOG_PATH.exists():
        with open(RESULTS_LOG_PATH, 'r') as f:
            experiment_log = json.load(f)
    else:
        experiment_log = []

    experiment_log.append({
        "timestamp": datetime.now().isoformat(),
        "config": config,
        "macro_f1": macro_f1
    })

    # Serialise BEFORE opening the file for writing: opening with 'w' truncates
    # it, so a serialisation error mid-dump would otherwise wipe the history.
    serialized_log = json.dumps(experiment_log, indent=2)

    with open(RESULTS_LOG_PATH, 'w') as f:
        f.write(serialized_log)


def save_if_best(clf, label_encoder, config, macro_f1):
    """
    Persist the model if it beats the best score recorded so far.

    The previous best is read from BEST_SCORE_PATH (0 if the file or its
    "macro_f1" key is missing). On improvement the model bundle is written to
    BEST_MODEL_PATH and the score record to BEST_SCORE_PATH.

    Args:
        clf: fitted classifier
        label_encoder: fitted label encoder matching `clf`
        config: JSON-serialisable dict describing the run
        macro_f1: score achieved by the run

    Returns:
        bool: True if the model was saved, False otherwise
    """
    if BEST_SCORE_PATH.exists():
        with open(BEST_SCORE_PATH, 'r') as f:
            best_f1 = json.load(f).get("macro_f1", 0)
    else:
        best_f1 = 0

    if macro_f1 > best_f1:
        # Serialise the score record first so a non-serialisable config fails
        # before anything on disk is modified.
        score_record = json.dumps({
            "timestamp": datetime.now().isoformat(),
            "config": config,
            "macro_f1": macro_f1
        }, indent=2)

        # Write the model BEFORE the score: if dumping the model fails, the
        # recorded best score must not claim a model that was never saved.
        joblib.dump({
            "model": clf,
            "label_encoder": label_encoder,
            "config": config
        }, BEST_MODEL_PATH)

        with open(BEST_SCORE_PATH, 'w') as f:
            f.write(score_record)

        print(f"New Best! F1: {macro_f1:.4f} (prev: {best_f1:.4f})")
        return True

    return False


def run_experiment(config):
    """
    Train, evaluate, log and (if it is the best so far) save one configuration.

    Uses the module-level X_train / df_train / X_val / df_val.

    Returns:
        float: macro F1 on the validation set
    """
    fitted_model, fitted_encoder = train_xgboost(config, X_train, df_train['evasion_label'])
    score = evaluate_model(fitted_model, fitted_encoder, X_val, df_val)

    log_experiment(config, score)
    save_if_best(fitted_model, fitted_encoder, config, score)

    return score

# === RUN GRID SEARCH ===
# Hyperparameter values to sweep; every combination is trained (3 * 4 * 4 = 48 runs).
params_grid = {
    "n_estimators": [200, 500, 1000],
    "max_depth": [4, 6, 8, 10],
    "learning_rate": [0.05, 0.1, 0.2, 0.3],
}

keys = params_grid.keys()
# Cartesian product of the value lists; each tuple is ordered like `keys`.
combinations = list(product(*params_grid.values()))
# Collects one (config, macro_f1) pair per run.
results = []
for i, values in enumerate(combinations):
    # Fixed metadata plus this run's hyperparameters.
    config = {
        "model": "XGBoost",
        "embedding_model": "all-mpnet-base-v2",
        **dict(zip(keys, values))
    }

    print(f"\n[{i+1}/{len(combinations)}] {config}")
    # Trains, evaluates, logs and saves the model if it is the best so far.
    f1 = run_experiment(config)
    results.append((config, f1))

# Cell output: runs ordered from best to worst macro F1 (the sorted list is not stored).
sorted(results, key=lambda x: x[1], reverse=True)

Run this cell ONLY to generate submission files for CodaBench.

This pipeline will:
1. Load your **best saved model** (`best_model.pkl`) from the models directory.
2. Download the **"test" dataset** from HuggingFace.
3. Generate predictions for both:
   - **Task 2 (Evasion)**: Direct predictions from the model.
   - **Task 1 (Clarity)**: Derived by mapping evasion labels to clarity categories.
4. Save formatted `.zip` files ready for upload to CodaBench.

In [None]:
# Maps each evasion label (Task 2) to the clarity category (Task 1) derived from it.
EVASION_TO_CLARITY = {
    'Explicit': 'Clear Reply',
    'Implicit': 'Ambivalent',
    'Dodging': 'Ambivalent',
    'General': 'Ambivalent',
    'Deflection': 'Ambivalent',
    'Partial/half-answer': 'Ambivalent',
    'Declining to answer': 'Clear Non-Reply',
    'Claims ignorance': 'Clear Non-Reply',
    'Clarification': 'Clear Non-Reply',
}

# Directory where prediction files and submission zips are written.
SUBMISSIONS_DIR = MODELS_DIR / "submissions"


def load_best_model():
    """Read the saved bundle and return (model, label_encoder, config)."""
    # NOTE: joblib.load unpickles the file; only load bundles this notebook produced.
    bundle = joblib.load(BEST_MODEL_PATH)
    model = bundle["model"]
    encoder = bundle["label_encoder"]
    saved_config = bundle["config"]
    return model, encoder, saved_config


def load_test_data():
    """Fetch the dataset from HuggingFace and return its 'test' split as a DataFrame."""
    test_split = load_dataset("ailsntua/QEvasion")["test"]
    return test_split.to_pandas()


def predict_evasion(clf, label_encoder, X):
    """Run the classifier on X and decode the encoded predictions back to evasion labels."""
    return label_encoder.inverse_transform(clf.predict(X))


def evasion_to_clarity(y_evasion):
    """Translate each evasion label into its clarity label (KeyError on an unknown label)."""
    clarity_labels = []
    for evasion_label in y_evasion:
        clarity_labels.append(EVASION_TO_CLARITY[evasion_label])
    return clarity_labels


def save_submission(predictions, task_name):
    """
    Write predictions (one label per line) and zip them for CodaBench.

    The plain-text file is stored as `prediction_<task_name>` in SUBMISSIONS_DIR
    and added to `submission_<task_name>.zip` under the archive name "prediction".

    Returns:
        Path: location of the created zip file
    """
    SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)

    zip_path = SUBMISSIONS_DIR / f"submission_{task_name}.zip"
    pred_path = SUBMISSIONS_DIR / f"prediction_{task_name}"

    with pred_path.open('w') as handle:
        handle.write('\n'.join(predictions))

    with zipfile.ZipFile(zip_path, 'w') as archive:
        # The archive member is always called "prediction", whatever the task.
        archive.write(pred_path, arcname="prediction")

    return zip_path


def generate_submissions():
    """
    End-to-end submission pipeline.

    Loads the saved best model, featurises the freshly downloaded test split,
    predicts evasion labels (Task 2), derives clarity labels (Task 1) and
    writes one zip per task.

    Returns:
        dict: zip paths for both tasks plus the predicted label distributions
    """
    # The stored config is not needed for prediction.
    clf, label_encoder, _ = load_best_model()

    X_test = extract_features(load_test_data())

    y_evasion = predict_evasion(clf, label_encoder, X_test)
    y_clarity = evasion_to_clarity(y_evasion)

    zip_task2 = save_submission(y_evasion, "task2")
    zip_task1 = save_submission(y_clarity, "task1")

    summary = {}
    summary["task1_zip"] = zip_task1
    summary["task2_zip"] = zip_task2
    summary["evasion_dist"] = Counter(y_evasion)
    summary["clarity_dist"] = Counter(y_clarity)
    return summary


# Run the submission pipeline and display the zip paths and label distributions.
# NOTE(review): this rebinds `results`, which the grid-search cell uses for its
# list of (config, macro_f1) pairs — consider a distinct name.
results = generate_submissions()
results