# RLHF Evaluation Notebook

This notebook evaluates a base model vs an RLHF model using:
1) RM-based scoring
2) GPT-based preference judging
3) Style/structure metrics

Outputs: a markdown report and raw CSV/JSON artifacts.


In [1]:
# Install dependencies (Colab)
!pip -q install transformers accelerate sentencepiece openai tiktoken


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import json
import math
import re
from glob import glob
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForSequenceClassification

# ---- Config ----
DATA_ROOT = "/content/drive/Mydrive/LikeLion/실전 프로젝트 2/train"
DATA_GLOB = "**/RMlabel.json"
MAX_SAMPLES = 50  # adjust
SEED = 42

# Use the same model for base/RLHF for now (as requested)
BASE_MODEL_ID = "LGAI-EXAONE/EXAONE-4.0-1.2B"
RLHF_MODEL_ID = "LGAI-EXAONE/EXAONE-4.0-1.2B"
# LLaMA-family reward model (change if needed)
REWARD_MODEL_ID = "NoahBSchwartz/llama-2-7b-Reward-Model"

GEN_KWARGS = dict(max_new_tokens=512, temperature=0.7, top_p=0.9, do_sample=True)

RUN_GPT_EVAL = True  # set True if you have API access
OPENAI_MODEL = "gpt-4o-mini"

ANSWER_KEY = "a"  # store answer in key 'a' in JSONL

TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
REPORT_PATH = f"/content/drive/MyDrive/LikeLion/실전 프로젝트 2/evaluation/rlhf_eval_report_{TIMESTAMP}.md"
ARTIFACT_DIR = f"/content/drive/MyDrive/LikeLion/실전 프로젝트 2/evaluation/artifacts_{TIMESTAMP}"
GEN_DIR = os.path.join(ARTIFACT_DIR, "generations")
EVAL_DIR = os.path.join(ARTIFACT_DIR, "eval")

os.makedirs(GEN_DIR, exist_ok=True)
os.makedirs(EVAL_DIR, exist_ok=True)

torch.manual_seed(SEED)
np.random.seed(SEED)


In [4]:
def load_rm_label_single(path):
    # Try utf-8 first, then cp949 to recover Korean in legacy files
    data = None
    # try:
    with open(path, "r") as f:
        data = json.load(f)
        print(len(data))
    # except Exception:
    #     data = None
    # if data is None:
    #     raise ValueError(f"Failed to decode {path}")

    items = data.get("data_info", [])
    rows = []
    for item in items[:10]:
        q = item.get("question", "")
        if not q:
            continue
        rows.append({
            "data_id": item.get("data_id"),
            "question": q.strip()
        })
    return rows


DATA_ROOT = "/content/drive/MyDrive/LikeLion/실전 프로젝트 2/train/RMlabel.json"

rows = load_rm_label_single(DATA_ROOT)
print(f"Loaded {len(rows)} questions")

df = pd.DataFrame(rows).drop_duplicates(subset=["data_id"])
df.head()

2
Loaded 10 questions


Unnamed: 0,data_id,question
0,0827c2bf-592d-4048-8f95-3c5fe336c1d1,동물에 대한 인식을 높이고 교육하는 데 관심 있어?
1,e14a45b5-9e2c-44a1-ba37-e1015116e665,언론이 사회에 미치는 영향은 어때?
2,6a5d63bf-ca9e-46a9-8e5c-0d6c5ea7ce81,일본 항공권 예약하는 방법 알려 줘.
3,c6c9ba7c-116d-4b07-8815-00e0826044ad,노래를 배워본 적 있어?
4,081afbe1-8324-4650-9fc2-e5987f37cd1e,홈트레이닝 운동 영상을 추천해 줘.


In [5]:
def load_causal_model(model_id):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
    )
    return model, tok

def load_reward_model(model_id):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
    )
    return model, tok

base_model, base_tok = load_causal_model(BASE_MODEL_ID)
rlhf_model, rlhf_tok = load_causal_model(RLHF_MODEL_ID)
rm_model, rm_tok = load_reward_model(REWARD_MODEL_ID)

base_model.eval()
rlhf_model.eval()
rm_model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NoahBSchwartz/llama-2-7b-Reward-Model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRM

In [6]:
def build_prompt(question, tokenizer):
    if hasattr(tokenizer, "apply_chat_template"):
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": question},
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return f"Question:\n{question}\n\nAnswer:\n"

@torch.inference_mode()
def generate_answer(model, tokenizer, question, **gen_kwargs):
    prompt = build_prompt(question, tokenizer)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
    outputs = model.generate(**inputs, **gen_kwargs)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if text.startswith(prompt):
        text = text[len(prompt):]
    return text.strip()


In [13]:
def rm_format(question, answer):
    return f"Human: {question}\nAssistant: {answer}"

@torch.inference_mode()
def rm_score(question, answer, model, tokenizer):
    text = rm_format(question, answer)
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(model.device)
    out = model(**inputs)
    # score = out.logits.squeeze().float().item()
    logits = out.logits.squeeze().float()
    score = (logits[1]-logits[0]).item()
    return score


In [22]:
GPT_PROMPT = '''You are an evaluator judging which answer better matches human preference.

Question:
{Q}

Answer A:
{A}

Answer B:
{B}

Evaluation criteria:
- Clear and well-structured explanation
- Formal written style
- Minimal redundancy
- Appropriate professional and safe tone
- Directly addresses the question

Choose which answer is better overall.
Output strictly in the following format:
Winner: A
Reasons:
- Clear and well-structured explanation : A
- Formal written style : B
- Minimal redundancy : B
- Appropriate professional and safe tone : A
- Directly addresses the question : A
Reason: one short sentence
'''

def parse_gpt_judge(text):
    winner = None
    m = re.search(r"Winner:\s*([AB])", text)
    if m:
        winner = m.group(1)
    return winner, text

def gpt_judge(question, ans_a, ans_b, client, model=OPENAI_MODEL):
    prompt = GPT_PROMPT.format(Q=question, A=ans_a, B=ans_b)
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
    )
    text = resp.choices[0].message.content
    return parse_gpt_judge(text)


In [23]:
KOR_STOPWORDS = set([
    "이", "그", "저", "것", "수", "등", "및", "에서", "으로", "에게", "으로써",
    "하다", "되다", "있다", "없다", "그리고", "하지만", "또한", "그러나", "때문", "때문에", "및", "대한", "관련"
])
EN_STOPWORDS = set(["the", "a", "an", "and", "or", "but", "to", "of", "in", "for", "on", "with"])

FORMAL_ENDINGS = ["입니다", "합니다", "됩니다", "하십시오", "습니다"]
COLLOQUIAL_ENDINGS = ["요", "죠", "음"]
IMPERATIVE_ENDINGS = ["하세요", "하십시오", "하라", "하세요"]
SPECULATIVE = ["일 수", "가능", "추정", "보입니다", "같습니다"]
APOLOGY = ["죄송", "유감"]
HEDGE = ["약간", "어느 정도", "아마", "대체로"]
DEFENSIVE = ["도움이 되길 바랍니다", "참고 바랍니다"]
ASSERTIVE = ["반드시", "항상", "절대", "무조건"]
RISKY = ["자살", "살해", "폭탄", "불법", "마약"]
CONDITIONAL = ["만약", "경우", "때", "라면"]
NEUTRAL = ["일반적으로", "대개", "보통", "중립", "권장"]
CONJ = ["그리고", "하지만", "또한", "그러나", "즉", "따라서"]
SUBORD = ["때문에", "면서", "도록", "므로", "는데"]

def tokenize(text):
    # simple mixed tokenizer (Korean/English/digits)
    return re.findall(r"[A-Za-z]+|[0-9]+|[가-힣]+", text)

def split_sentences(text):
    parts = re.split(r"(?<=[\.\!\?])\s+|\n+", text.strip())
    return [p.strip() for p in parts if p.strip()]

def split_paragraphs(text):
    paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    return paras if paras else [text.strip()]

def ngram_repetition(tokens, n):
    if len(tokens) < n:
        return 0.0
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    counts = Counter(ngrams)
    repeated = sum(c for c in counts.values() if c > 1)
    return repeated / max(1, len(ngrams))

def cosine_sim(a, b):
    common = set(a) | set(b)
    if not common:
        return 0.0
    va = np.array([a.get(k, 0) for k in common])
    vb = np.array([b.get(k, 0) for k in common])
    denom = (np.linalg.norm(va) * np.linalg.norm(vb))
    if denom == 0:
        return 0.0
    return float(np.dot(va, vb) / denom)

def sentence_similarity_avg(sentences):
    if len(sentences) < 2:
        return 0.0
    vecs = []
    for s in sentences:
        toks = tokenize(s)
        vecs.append(Counter(toks))
    sims = []
    for i in range(len(vecs)):
        for j in range(i+1, len(vecs)):
            sims.append(cosine_sim(vecs[i], vecs[j]))
    return float(np.mean(sims)) if sims else 0.0

def keyword_overuse_ratio(tokens):
    if not tokens:
        return 0.0
    counts = Counter(tokens)
    top = sum(c for _, c in counts.most_common(5))
    return top / len(tokens)

def list_usage_ratio(text):
    lines = [l for l in text.splitlines() if l.strip()]
    if not lines:
        return 0.0
    list_lines = sum(1 for l in lines if re.match(r"\s*(\d+\.|-|\*)\s+", l))
    return list_lines / len(lines)

def intro_body_conclusion(text):
    paras = split_paragraphs(text)
    if len(paras) < 2:
        return 0
    intro = bool(re.search(r"서론|개요|먼저", paras[0]))
    concl = bool(re.search(r"결론|요약|마무리", paras[-1]))
    return int(intro and concl)

def last_sentence_summary(sentences):
    if not sentences:
        return 0
    return int(bool(re.search(r"요약|정리|결론", sentences[-1])))

def ending_ratio(text, endings):
    sentences = split_sentences(text)
    if not sentences:
        return 0.0
    hits = 0
    for s in sentences:
        if any(s.endswith(e) for e in endings):
            hits += 1
    return hits / len(sentences)

def count_ratio(text, patterns):
    if not text:
        return 0.0
    count = sum(text.count(p) for p in patterns)
    return count / max(1, len(split_sentences(text)))

def comma_density(text):
    sentences = split_sentences(text)
    if not sentences:
        return 0.0
    commas = sum(s.count(",") for s in sentences)
    return commas / len(sentences)

def conjunction_density(text):
    sentences = split_sentences(text)
    if not sentences:
        return 0.0
    hits = sum(1 for s in sentences if any(c in s for c in CONJ))
    return hits / len(sentences)

def fragment_ratio(text):
    sentences = split_sentences(text)
    if not sentences:
        return 0.0
    endings = FORMAL_ENDINGS + COLLOQUIAL_ENDINGS
    fragments = sum(1 for s in sentences if not any(s.endswith(e) for e in endings))
    return fragments / len(sentences)

def entity_clarity_ratio(text):
    sentences = split_sentences(text)
    if not sentences:
        return 0.0
    hits = sum(1 for s in sentences if re.search(r"(은|는|이|가)", s))
    return hits / len(sentences)

def keyword_coverage(question, answer):
    q_tokens = [t for t in tokenize(question) if t not in KOR_STOPWORDS and t not in EN_STOPWORDS]
    if not q_tokens:
        return 0.0
    a_tokens = set(tokenize(answer))
    covered = sum(1 for t in set(q_tokens) if t in a_tokens)
    return covered / len(set(q_tokens))

def proper_noun_ratio(tokens):
    if not tokens:
        return 0.0
    # heuristic: English capitalized tokens or tokens with digits
    hits = sum(1 for t in tokens if re.search(r"[A-Z]", t) or re.search(r"\d", t))
    return hits / len(tokens)

def info_units_per_sentence(text):
    sentences = split_sentences(text)
    if not sentences:
        return 0.0
    units = []
    for s in sentences:
        toks = [t for t in tokenize(s) if t not in KOR_STOPWORDS and t not in EN_STOPWORDS]
        units.append(len(set(toks)))
    return float(np.mean(units))

def stopword_ratio(tokens):
    if not tokens:
        return 0.0
    hits = sum(1 for t in tokens if t in KOR_STOPWORDS or t in EN_STOPWORDS)
    return hits / len(tokens)

def redundant_sentence_ratio(sentences, threshold=0.8):
    if len(sentences) < 2:
        return 0.0
    vecs = [Counter(tokenize(s)) for s in sentences]
    redundant = 0
    total = 0
    for i in range(len(vecs)):
        for j in range(i+1, len(vecs)):
            total += 1
            if cosine_sim(vecs[i], vecs[j]) >= threshold:
                redundant += 1
    return redundant / total if total else 0.0

def viewpoint_changes(text):
    pronouns = ["저", "나", "우리", "당신", "그", "그녀", "그들"]
    tokens = tokenize(text)
    seq = [t for t in tokens if t in pronouns]
    if len(seq) < 2:
        return 0
    changes = sum(1 for i in range(1, len(seq)) if seq[i] != seq[i-1])
    return changes

def order_stability(text):
    nums = [int(n) for n in re.findall(r"\b(\d+)\.", text)]
    if len(nums) < 2:
        return 1.0
    stable = all(nums[i] <= nums[i+1] for i in range(len(nums)-1))
    return 1.0 if stable else 0.0

def style_features(question, answer):
    tokens = tokenize(answer)
    sentences = split_sentences(answer)
    paras = split_paragraphs(answer)

    feats = {}
    # 1) length/structure
    feats["token_count"] = len(tokens)
    feats["paragraph_count"] = len(paras)
    feats["avg_sentences_per_paragraph"] = float(np.mean([len(split_sentences(p)) for p in paras])) if paras else 0.0
    feats["list_usage_ratio"] = list_usage_ratio(answer)
    feats["intro_body_conclusion"] = intro_body_conclusion(answer)
    feats["first_sentence_length"] = len(tokenize(sentences[0])) if sentences else 0
    feats["last_sentence_summary"] = last_sentence_summary(sentences)

    # 2) repetition
    feats["ngram_rep_2"] = ngram_repetition(tokens, 2)
    feats["ngram_rep_3"] = ngram_repetition(tokens, 3)
    feats["ngram_rep_4"] = ngram_repetition(tokens, 4)
    feats["sentence_similarity_avg"] = sentence_similarity_avg(sentences)
    feats["phrase_repeat_rate"] = answer.count("다음과 같습니다") / max(1, len(sentences))
    feats["keyword_overuse_ratio"] = keyword_overuse_ratio(tokens)

    # 3) style/tone
    feats["formal_ending_ratio"] = ending_ratio(answer, FORMAL_ENDINGS)
    feats["colloquial_ratio"] = ending_ratio(answer, COLLOQUIAL_ENDINGS)
    feats["exclaim_ratio"] = answer.count("!") / max(1, len(sentences))
    feats["question_mark_ratio"] = answer.count("?") / max(1, len(sentences))
    feats["first_person_ratio"] = count_ratio(answer, ["저는", "제가", "나는", "우리"])
    feats["imperative_ratio"] = count_ratio(answer, IMPERATIVE_ENDINGS)
    feats["speculative_ratio"] = count_ratio(answer, SPECULATIVE)

    # 4) apology/hedge/defensive
    feats["apology_ratio"] = count_ratio(answer, APOLOGY)
    feats["defensive_ratio"] = count_ratio(answer, DEFENSIVE)
    feats["hedge_ratio"] = count_ratio(answer, HEDGE)

    # 5) clarity
    feats["avg_sentence_length"] = float(np.mean([len(tokenize(s)) for s in sentences])) if sentences else 0.0
    feats["subordinate_ratio"] = count_ratio(answer, SUBORD)
    feats["comma_density"] = comma_density(answer)
    feats["conjunction_density"] = conjunction_density(answer)
    feats["fragment_ratio"] = fragment_ratio(answer)
    feats["entity_clarity_ratio"] = entity_clarity_ratio(answer)

    # 6) info density
    feats["proper_noun_ratio"] = proper_noun_ratio(tokens)
    feats["keyword_coverage"] = keyword_coverage(question, answer)
    feats["info_units_per_sentence"] = info_units_per_sentence(answer)
    feats["stopword_ratio"] = stopword_ratio(tokens)
    feats["redundant_sentence_ratio"] = redundant_sentence_ratio(sentences)

    # 7) safety/neutral
    feats["assertive_ratio"] = count_ratio(answer, ASSERTIVE)
    feats["risky_keyword_ratio"] = count_ratio(answer, RISKY)
    feats["conditional_ratio"] = count_ratio(answer, CONDITIONAL)
    feats["neutral_vocab_ratio"] = count_ratio(answer, NEUTRAL)

    # 8) consistency
    feats["question_keyword_reuse"] = keyword_coverage(question, answer)
    feats["viewpoint_changes"] = viewpoint_changes(answer)
    feats["logic_connector_consistency"] = conjunction_density(answer)
    feats["order_stability"] = order_stability(answer)

    return feats


In [16]:
# Stage 1: generate and save JSONL per model
base_out = os.path.join(GEN_DIR, f"base_generations_{TIMESTAMP}.jsonl")
rlhf_out = os.path.join(GEN_DIR, f"rlhf_generations_{TIMESTAMP}.jsonl")

def write_jsonl(path, rows):
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

base_rows = []
rlhf_rows = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    q = row["question"]
    data_id = row["data_id"]
    base_ans = generate_answer(base_model, base_tok, q, **GEN_KWARGS)
    rlhf_ans = generate_answer(rlhf_model, rlhf_tok, q, **GEN_KWARGS)

    base_rows.append({"data_id": data_id, "question": q, ANSWER_KEY: base_ans})
    rlhf_rows.append({"data_id": data_id, "question": q, ANSWER_KEY: rlhf_ans})

write_jsonl(base_out, base_rows)
write_jsonl(rlhf_out, rlhf_rows)
print(base_out)
print(rlhf_out)


  0%|          | 0/10 [00:02<?, ?it/s]


KeyboardInterrupt: 

In [24]:
# Stage 2: evaluate from saved JSONL
from dotenv import load_dotenv

load_dotenv()

def read_jsonl(path):
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                rows.append(json.loads(line))
    return rows

base_rows = read_jsonl(base_out)
rlhf_rows = read_jsonl(rlhf_out)
base_map = {r["data_id"]: r for r in base_rows}
rlhf_map = {r["data_id"]: r for r in rlhf_rows}

results = []
if RUN_GPT_EVAL:
    from openai import OpenAI
    client = OpenAI()
else:
    client = None

for data_id in tqdm(sorted(set(base_map) & set(rlhf_map))):
    q = base_map[data_id]["question"]
    base_ans = base_map[data_id][ANSWER_KEY]
    rlhf_ans = rlhf_map[data_id][ANSWER_KEY]

    base_rm = rm_score(q, base_ans, rm_model, rm_tok)
    rlhf_rm = rm_score(q, rlhf_ans, rm_model, rm_tok)

    gpt_winner = None
    gpt_raw = None
    if RUN_GPT_EVAL:
        gpt_winner, gpt_raw = gpt_judge(q, rlhf_ans, base_ans, client)

    base_feats = style_features(q, base_ans)
    rlhf_feats = style_features(q, rlhf_ans)

    results.append({
        "data_id": data_id,
        "question": q,
        "base_answer": base_ans,
        "rlhf_answer": rlhf_ans,
        "base_rm_score": base_rm,
        "rlhf_rm_score": rlhf_rm,
        "gpt_winner": gpt_winner,
        "gpt_raw": gpt_raw,
        "base_features": base_feats,
        "rlhf_features": rlhf_feats,
    })

len(results)


100%|██████████| 10/10 [00:24<00:00,  2.44s/it]


10

In [25]:
# Flatten results for analysis
rows = []
for r in results:
    base = {"model": "base", "data_id": r["data_id"], "rm_score": r["base_rm_score"], **r["base_features"]}
    rlhf = {"model": "rlhf", "data_id": r["data_id"], "rm_score": r["rlhf_rm_score"], **r["rlhf_features"]}
    rows.extend([base, rlhf])

feat_df = pd.DataFrame(rows)
feat_df.to_csv(os.path.join(EVAL_DIR, f"style_metrics_{TIMESTAMP}.csv"), index=False)

raw_path = os.path.join(EVAL_DIR, f"raw_results_{TIMESTAMP}.json")
with open(raw_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# GPT win rate
gpt_wins = [r["gpt_winner"] for r in results if r["gpt_winner"]]
gpt_win_rate = None
if gpt_wins:
    gpt_win_rate = gpt_wins.count("A") / len(gpt_wins)

numeric_df = feat_df.select_dtypes(include="number")

summary = (
    pd.concat([feat_df["model"], numeric_df], axis=1)
      .groupby("model")
      .agg(["mean", "std"])
)
summary.to_csv(os.path.join(EVAL_DIR, f"style_metrics_summary_{TIMESTAMP}.csv"))
summary.head()


Unnamed: 0_level_0,rm_score,rm_score,token_count,token_count,paragraph_count,paragraph_count,avg_sentences_per_paragraph,avg_sentences_per_paragraph,list_usage_ratio,list_usage_ratio,...,neutral_vocab_ratio,neutral_vocab_ratio,question_keyword_reuse,question_keyword_reuse,viewpoint_changes,viewpoint_changes,logic_connector_consistency,logic_connector_consistency,order_stability,order_stability
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
base,-0.437073,1.420052,155.4,60.842602,10.4,2.01108,2.415438,0.451884,0.418135,0.183763,...,0.011334,0.026521,1.0,0.0,0.1,0.316228,0.0,0.0,1.0,0.0
rlhf,-1.242139,1.294531,178.6,60.457882,11.2,2.097618,2.621059,0.495769,0.492323,0.210087,...,0.009073,0.014785,1.0,0.0,0.0,0.0,0.00625,0.019764,1.0,0.0


In [26]:
def write_report(path, feat_df, gpt_win_rate, dataset_paths):
    models = feat_df["model"].unique().tolist()
    metrics = [c for c in feat_df.columns if c not in ["model", "data_id"]]

    with open(path, "w", encoding="utf-8") as f:
        f.write("# RLHF Evaluation Report\n\n")
        f.write("## Dataset\n")
        f.write(f"- Source files: {len(dataset_paths)}\n")
        f.write(f"- Samples used: {feat_df['data_id'].nunique()}\n\n")

        f.write("## RM Score\n")
        for m in models:
            mean = feat_df[feat_df["model"] == m]["rm_score"].mean()
            std = feat_df[feat_df["model"] == m]["rm_score"].std()
            f.write(f"- {m}: mean={mean:.4f}, std={std:.4f}\n")
        f.write("\n")

        f.write("## GPT Preference (A=RLHF, B=Base)\n")
        if gpt_win_rate is None:
            f.write("- GPT eval not run.\n")
        else:
            f.write(f"- RLHF win rate: {gpt_win_rate:.3f}\n")
        f.write("\n")

        f.write("## Style Metrics (mean ± std)\n")
        for metric in metrics:
            if metric == "rm_score":
                continue
            f.write(f"### {metric}\n")
            for m in models:
                s = feat_df[feat_df["model"] == m][metric]
                f.write(f"- {m}: {s.mean():.4f} ± {s.std():.4f}\n")
            f.write("\n")

        f.write("## Notes\n")
        f.write("- Style metrics use lightweight heuristics. Replace with a richer tokenizer or domain-specific analyzer if needed.\n")

write_report(REPORT_PATH, feat_df, gpt_win_rate, DATA_ROOT)
REPORT_PATH


'/content/drive/MyDrive/LikeLion/실전 프로젝트 2/evaluation/rlhf_eval_report_20260119_065000.md'