In [1]:
import json
from typing import List, Dict
from sentence_transformers import SentenceTransformer, util
from bert_score import score
import pandas as pd

import os
tensor_parallel_size=2
os.environ["CUDA_VISIBLE_DEVICES"] = "3, 4, 5, 6"
os.environ["VLLM_CACHE_ROOT"] = "/data/ydh/nlp/model/vllm_cache"
os.environ["HF_HOME"] = "/data/ydh/nlp/model/huggingface_model"

sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm
2025-05-27 15:23:51.067548: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748327031.094227  537397 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748327031.102366  537397 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748327031.124733  537397 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748327031.124752  537397 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1748327031.124755  537397

In [None]:
# 유사도 계산 함수
def compute_similarity_scores(output_just: str, gt_just: str) -> Dict[str, float]:
    emb_out = sbert_model.encode(output_just, convert_to_tensor=True)
    emb_gt = sbert_model.encode(gt_just, convert_to_tensor=True)
    cosine_sim = float(util.cos_sim(emb_out, emb_gt))

    P, R, F1 = score([output_just], [gt_just], lang="en", verbose=False)
    return {
        "cosine_similarity": cosine_sim,
        "bertscore_f1": float(F1[0])
    }

# 평가 함수
def evaluate_justifications(gt_path: str, output_path: str) -> List[Dict]:
    with open(gt_path, "r", encoding="utf-8") as f:
        gt_items = [json.loads(line) for line in f]
    with open(output_path, "r", encoding="utf-8") as f:
        out_items = [json.loads(line) for line in f]

    gt_map = {item["id"]: item for item in gt_items}
    results = []

    for out in out_items:
        qid = out["id"]
        if qid not in gt_map:
            continue

        gt = gt_map[qid]
        out_cands = out["candidate_answers"]
        gt_cands = gt["candidates"]

        for option, out_info in out_cands.items():
            if option not in gt_cands:
                continue

            out_just = out_info["justification"]
            gt_just = gt_cands[option]["justification"]
            gt_label = gt_cands[option]["groundtruth"]
            is_chosen = out_info["predicted"]

            sim_scores = compute_similarity_scores(out_just, gt_just)

            results.append({
                "id": qid,
                "choice": option,
                "chosen_by_gpt": is_chosen,
                "is_groundtruth": gt_label,
                "cosine_similarity": sim_scores["cosine_similarity"],
                "bertscore_f1": sim_scores["bertscore_f1"],
                "output_justification": out_just,
                "gt_justification": gt_just
            })

    return results

In [8]:
# 실행 예시
gt_path = "/data/ydh/nlp/dataset/test.jsonl"
output_path = "/data/ydh/nlp/dataset/output.jsonl"
results = evaluate_justifications(gt_path, output_path)

# 결과 저장 (선택사항)
df = pd.DataFrame(results)
df.to_csv("evaluation_results.csv", index=False)

KeyError: 'predicted'