# Baseline evaluation

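This notebook runs two baselines on a random sample of up to 1,000 rows from the processed dataset (falling back to `data/sample_sanitized.csv` when no processed parquet file is found), using an 80/20 train/test split: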
1. TF-IDF + LogisticRegression (classical baseline)
2. Mock LLM via `PromptRunner` (simulated LLM baseline)

Metrics: Quadratic Weighted Kappa (QWK), ±1 accuracy, and confusion matrices for both baselines, plus a classification report for the TF-IDF baseline.

In [None]:
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split

# Locate the processed parquet; fall back to the sample CSV when none exists.
# Two patterns are searched so that both "data/processed.parquet" (the name the
# fallback message refers to) and suffixed files such as
# "data/processed.v1.parquet" are found. glob returns matches in arbitrary
# order, so the candidates are sorted to make the chosen file reproducible.
parquet_candidates = sorted(
    set(glob.glob("data/processed.parquet"))
    | set(glob.glob("data/processed.*.parquet"))
)
if parquet_candidates:
    path = parquet_candidates[0]
    print("Using processed parquet:", path)
    df = pd.read_parquet(path)
else:
    print("No processed.parquet found; falling back to data/sample_sanitized.csv")
    df = pd.read_csv("data/sample_sanitized.csv")

print("Rows:", len(df))
# Rows missing either the text or the label cannot be used by the baselines.
df = df.dropna(subset=["full_text", "score"])
df["score"] = df["score"].astype(int)

# Use a small subset for quick runs
df_small = df.sample(n=min(1000, len(df)), random_state=42).reset_index(drop=True)
X = df_small["full_text"].astype(str).tolist()
y = df_small["score"].astype(int).tolist()

# A stratified split raises ValueError when any class has fewer than two
# samples, or when the test split holds fewer rows than there are classes.
# Both are plausible on the small sample CSV, so stratify only when feasible
# and otherwise fall back to a plain random split.
test_size = 0.2
class_counts = df_small["score"].value_counts()
can_stratify = bool(
    len(class_counts) > 0
    and class_counts.min() >= 2
    and len(class_counts) <= int(test_size * len(y))
)
if not can_stratify:
    print("Warning: too few samples per class to stratify; using a random split")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
    stratify=y if can_stratify else None,
)
print("Train size:", len(X_train), "Test size:", len(X_test))

In [None]:
from src.baseline.classical import TFIDFLogistic
from src.eval.metrics import (
    qwk,
    within_one_accuracy,
    confusion,
    classification_report_dict,
)
from src.baseline.llm_client import MockLLMClient
from src.baseline.prompt_runner import PromptRunner

import json

# Fit the TF-IDF + logistic-regression baseline on the training split and
# predict scores for the held-out texts.
clf = TFIDFLogistic(max_features=5000, ngram_range=(1, 2))
clf.fit(X_train, y_train)
preds_clf = clf.predict(X_test)

# Headline agreement metrics.
tfidf_qwk = qwk(y_test, preds_clf)
print("TFIDF Logistic QWK:", tfidf_qwk)
tfidf_within_one = within_one_accuracy(y_test, preds_clf)
print("TFIDF Logistic ±1 acc:", tfidf_within_one)

# Confusion matrix over every score value present in the sampled data.
score_values = sorted(set(y))
cm_clf, labels = confusion(y_test, preds_clf, labels=score_values)
print("Confusion matrix (rows=true, cols=pred):")
print(labels)
print(cm_clf)

# Classification report, pretty-printed as JSON.
print("\nClassification report (TFIDF):")
tfidf_report = classification_report_dict(y_test, preds_clf)
print(json.dumps(tfidf_report, indent=2))

In [None]:
# Score every held-out text with the mock LLM through the prompt runner.
mock = MockLLMClient()
runner = PromptRunner(llm_client=mock)
llm_outputs = [runner.run_single(essay) for essay in X_test]
# Each output is indexed by "score"; coerce that value to int for the metrics.
preds_llm = [int(result["score"]) for result in llm_outputs]

llm_qwk = qwk(y_test, preds_llm)
print("\nMock LLM QWK:", llm_qwk)
llm_within_one = within_one_accuracy(y_test, preds_llm)
print("Mock LLM ±1 acc:", llm_within_one)

# Use the same label set as the data sample; the returned labels are not needed.
cm_llm, _ = confusion(y_test, preds_llm, labels=sorted(set(y)))
print("Mock LLM confusion matrix:")
print(cm_llm)

# Show the first few raw outputs for a quick sanity check.
print("\nExample mock outputs (first 5):")
for i in range(min(5, len(llm_outputs))):
    print(i, llm_outputs[i])