In [19]:
import json
import re
import pandas as pd

# ================= Configuration =================
# Path of the JSONL results file that the script entry point checks and
# then passes to evaluate_vibe().
INPUT_FILE = "results.jsonl"  
# Alternative input file (disabled):
# INPUT_FILE = "results_original_data+prompt_gaze.jsonl"
# =================================================

def extract_model_answer(text):
    """
    Extract the chosen option ("A" or "B") from a model's raw output.

    Strategy:
      1. Prefer an explicit "Answer: X" marker (case-insensitive). The letter
         may be wrapped in common decorations such as "<A>", "(A)", "[A]",
         "**A**" or quotes, and a full-width colon is accepted.
      2. Otherwise fall back to the last standalone uppercase "A" or "B"
         found anywhere in the text.

    Args:
        text: Raw model output. Any non-string value is treated as
            unparseable.

    Returns:
        "A", "B", or "Unknown" when no option can be identified.
    """
    if not isinstance(text, str):
        return "Unknown"

    # Explicit marker first. Skipping brackets/markdown/quotes here matters:
    # without it "Answer: (A) because B is wrong" would fall through to the
    # last-letter fallback below and be read as "B".
    explicit = re.search(
        r"Answer\s*[:：][\s:<(\[*\"']*([AB])\b", text, re.IGNORECASE
    )
    if explicit:
        return explicit.group(1).upper()

    # Fallback heuristic: the last standalone uppercase option letter.
    standalone = re.findall(r"\b([AB])\b", text)
    if standalone:
        return standalone[-1].upper()

    return "Unknown"

def detect_question_type(item):
    """
    Classify a sample as NT, NF or Standard from the text of its two options.

    - "NT" (Negation True): the correct option contains a negation word.
    - "NF" (Negation False): only the incorrect option contains a negation word.
    - "Standard": neither option contains a negation word.
    - "Error": ``ground_truth`` is missing, not a string, or not "A"/"B".

    Args:
        item: Mapping with the keys "ground_truth", "inference_A" and
            "inference_B". Missing, null or non-string option texts are
            treated as empty text.

    Returns:
        One of "NT", "NF", "Standard" or "Error".
    """
    raw_truth = item.get("ground_truth")
    # A null / non-string ground truth is reported as "Error" instead of
    # raising AttributeError on .strip().
    ground_truth = raw_truth.strip().upper() if isinstance(raw_truth, str) else ""
    if ground_truth not in ("A", "B"):
        return "Error"

    inference_a = item.get("inference_A") or ""
    inference_b = item.get("inference_B") or ""

    # 1. Decide which text is the correct option and which is the incorrect one.
    if ground_truth == "A":
        correct_text, incorrect_text = inference_a, inference_b
    else:
        correct_text, incorrect_text = inference_b, inference_a

    # 2. Negation keywords, matched as whole words. A plain substring test
    #    would also fire on "another", "notes", "notice", "knot", ...
    # NOTE(review): whole-word matching also stops "nothing" from counting as
    # a negation (it contained the substring "not") - confirm this matches
    # the intended benchmark definition.
    negation_pattern = re.compile(
        r"\b(?:not|don't|doesn't|didn't|cannot|never)\b", re.IGNORECASE
    )

    def has_negation(text):
        if not isinstance(text, str):
            return False
        # Normalise the typographic apostrophe so "don’t" is matched like "don't".
        return negation_pattern.search(text.replace("\u2019", "'")) is not None

    # 3. Classification: a negated correct option takes priority.
    if has_negation(correct_text):
        return "NT"
    if has_negation(incorrect_text):
        return "NF"
    return "Standard"

def evaluate_vibe(file_path, model_name="Qwen2.5-VL-7B"):
    """
    Score a JSONL results file and print per-category accuracy.

    Each non-blank line is parsed as a JSON object. The prediction is
    extracted from its "model_output" field, compared with "ground_truth",
    and the sample is typed via detect_question_type().

    Args:
        file_path: Path to the UTF-8 encoded JSONL results file.
        model_name: Label shown in the "Model" column of the printed
            summary table.

    Returns:
        pandas.DataFrame with one row per sample and the columns
        "Task ID", "Type", "Correct", "Model Output" and "Ground Truth".
        The frame is empty (but has these columns) when the file holds no
        samples.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    columns = ["Task ID", "Type", "Correct", "Model Output", "Ground Truth"]
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)

            # 1. Extract the predicted option.
            prediction = extract_model_answer(item.get("model_output"))
            truth = item.get("ground_truth")

            # 2. Score it. The label is normalised the same way
            #    detect_question_type() normalises it, so "a" or "A " is not
            #    typed as a valid sample and yet always scored as wrong.
            expected = truth.strip().upper() if isinstance(truth, str) else truth
            is_correct = (prediction == expected)

            # 3. Determine the question type.
            q_type = detect_question_type(item)

            rows.append({
                "Task ID": item.get("task_id"),
                "Type": q_type,
                "Correct": is_correct,
                "Model Output": prediction,
                "Ground Truth": truth
            })

    # Passing the columns explicitly keeps the lookups below valid even when
    # the file contained no samples.
    df = pd.DataFrame(rows, columns=columns)

    def accuracy(frame):
        # An empty selection is reported as 0.0.
        return frame['Correct'].mean() * 100 if len(frame) > 0 else 0.0

    categories = ['NT', 'NF', 'Standard']
    subsets = {q_type: df[df['Type'] == q_type] for q_type in categories}

    # "All" covers every row, including rows typed "Error", which belong to
    # none of the sub-categories.
    results = {'All': accuracy(df)}
    for q_type in categories:
        results[q_type] = accuracy(subsets[q_type])

    # Formatted summary table.
    print("\n" + "="*50)
    print("VIBE Evaluation Results (Accuracy %)")
    print("="*50)

    # 'Standard' stands in for the Fixed/Distractor split; telling those two
    # apart would need an extra tag field in the JSONL.
    summary_df = pd.DataFrame([{
        "Model": model_name,
        "Standard": f"{results['Standard']:.1f}",
        "NF": f"{results['NF']:.1f}",
        "NT": f"{results['NT']:.1f}",
        "All": f"{results['All']:.1f}"
    }])

    print(summary_df.to_string(index=False))
    print("="*50)

    # Per-category sample counts.
    print(f"\n样本统计:\nTotal: {len(df)}\nNT count: {len(subsets['NT'])}\nNF count: {len(subsets['NF'])}\nStandard count: {len(subsets['Standard'])}")

    return df

if __name__ == "__main__":
    # The results file must carry the inference_A / inference_B fields;
    # if it does not, merge the original dataset fields into it first.
    # Peek at the first non-blank line (the evaluator skips blank lines too),
    # reading as UTF-8 like evaluate_vibe() and closing the handle afterwards.
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        first_record = next((ln for ln in f if ln.strip()), "")

    if "inference_A" not in first_record:
        print("错误：JSONL 文件中缺少 'inference_A/B' 字段。无法进行 NT/NF 分类。")
        print("请修改之前的脚本，在写入 results.jsonl 时把原始 item 的所有字段都写进去。")
    else:
        df = evaluate_vibe(INPUT_FILE)

In [None]:

# ================= Configuration =================
# Path of the JSONL results file that the script entry point checks and
# then passes to evaluate_vibe().
INPUT_FILE = "results.jsonl"  
# Alternative input file (disabled):
# INPUT_FILE = "results_original_data+prompt_gaze.jsonl"
# =================================================

def extract_model_answer(text):
    """
    Extract the chosen option ("A" or "B") from a model's raw output.

    Strategy:
      1. Prefer an explicit "Answer: X" marker (case-insensitive). The letter
         may be wrapped in common decorations such as "<A>", "(A)", "[A]",
         "**A**" or quotes, and a full-width colon is accepted.
      2. Otherwise fall back to the last standalone uppercase "A" or "B"
         found anywhere in the text.

    Args:
        text: Raw model output. Any non-string value is treated as
            unparseable.

    Returns:
        "A", "B", or "Unknown" when no option can be identified.
    """
    if not isinstance(text, str):
        return "Unknown"

    # Explicit marker first. Skipping brackets/markdown/quotes here matters:
    # without it "Answer: (A) because B is wrong" would fall through to the
    # last-letter fallback below and be read as "B".
    explicit = re.search(
        r"Answer\s*[:：][\s:<(\[*\"']*([AB])\b", text, re.IGNORECASE
    )
    if explicit:
        return explicit.group(1).upper()

    # Fallback heuristic: the last standalone uppercase option letter.
    standalone = re.findall(r"\b([AB])\b", text)
    if standalone:
        return standalone[-1].upper()

    return "Unknown"

def detect_question_type(item):
    """
    Classify a sample as NT, NF or Standard from the text of its two options.

    - "NT" (Negation True): the correct option contains a negation word.
    - "NF" (Negation False): only the incorrect option contains a negation word.
    - "Standard": neither option contains a negation word.
    - "Error": ``ground_truth`` is missing, not a string, or not "A"/"B".

    Args:
        item: Mapping with the keys "ground_truth", "inference_A" and
            "inference_B". Missing, null or non-string option texts are
            treated as empty text.

    Returns:
        One of "NT", "NF", "Standard" or "Error".
    """
    raw_truth = item.get("ground_truth")
    # A null / non-string ground truth is reported as "Error" instead of
    # raising AttributeError on .strip().
    ground_truth = raw_truth.strip().upper() if isinstance(raw_truth, str) else ""
    if ground_truth not in ("A", "B"):
        return "Error"

    inference_a = item.get("inference_A") or ""
    inference_b = item.get("inference_B") or ""

    # 1. Decide which text is the correct option and which is the incorrect one.
    if ground_truth == "A":
        correct_text, incorrect_text = inference_a, inference_b
    else:
        correct_text, incorrect_text = inference_b, inference_a

    # 2. Negation keywords, matched as whole words. A plain substring test
    #    would also fire on "another", "notes", "notice", "knot", ...
    # NOTE(review): whole-word matching also stops "nothing" from counting as
    # a negation (it contained the substring "not") - confirm this matches
    # the intended benchmark definition.
    negation_pattern = re.compile(
        r"\b(?:not|don't|doesn't|didn't|cannot|never)\b", re.IGNORECASE
    )

    def has_negation(text):
        if not isinstance(text, str):
            return False
        # Normalise the typographic apostrophe so "don’t" is matched like "don't".
        return negation_pattern.search(text.replace("\u2019", "'")) is not None

    # 3. Classification: a negated correct option takes priority.
    if has_negation(correct_text):
        return "NT"
    if has_negation(incorrect_text):
        return "NF"
    return "Standard"

def evaluate_vibe(file_path, model_name="Qwen2.5-VL-7B"):
    """
    Score a JSONL results file and print per-category accuracy.

    Each non-blank line is parsed as a JSON object. The prediction is
    extracted from its "model_output" field, compared with "ground_truth",
    and the sample is typed via detect_question_type().

    Args:
        file_path: Path to the UTF-8 encoded JSONL results file.
        model_name: Label shown in the "Model" column of the printed
            summary table.

    Returns:
        pandas.DataFrame with one row per sample and the columns
        "Task ID", "Type", "Correct", "Model Output" and "Ground Truth".
        The frame is empty (but has these columns) when the file holds no
        samples.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    columns = ["Task ID", "Type", "Correct", "Model Output", "Ground Truth"]
    rows = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)

            # 1. Extract the predicted option.
            prediction = extract_model_answer(item.get("model_output"))
            truth = item.get("ground_truth")

            # 2. Score it. The label is normalised the same way
            #    detect_question_type() normalises it, so "a" or "A " is not
            #    typed as a valid sample and yet always scored as wrong.
            expected = truth.strip().upper() if isinstance(truth, str) else truth
            is_correct = (prediction == expected)

            # 3. Determine the question type.
            q_type = detect_question_type(item)

            rows.append({
                "Task ID": item.get("task_id"),
                "Type": q_type,
                "Correct": is_correct,
                "Model Output": prediction,
                "Ground Truth": truth
            })

    # Passing the columns explicitly keeps the lookups below valid even when
    # the file contained no samples.
    df = pd.DataFrame(rows, columns=columns)

    def accuracy(frame):
        # An empty selection is reported as 0.0.
        return frame['Correct'].mean() * 100 if len(frame) > 0 else 0.0

    categories = ['NT', 'NF', 'Standard']
    subsets = {q_type: df[df['Type'] == q_type] for q_type in categories}

    # "All" covers every row, including rows typed "Error", which belong to
    # none of the sub-categories.
    results = {'All': accuracy(df)}
    for q_type in categories:
        results[q_type] = accuracy(subsets[q_type])

    # Formatted summary table.
    print("\n" + "="*50)
    print("VIBE Evaluation Results (Accuracy %)")
    print("="*50)

    # 'Standard' stands in for the Fixed/Distractor split; telling those two
    # apart would need an extra tag field in the JSONL.
    summary_df = pd.DataFrame([{
        "Model": model_name,
        "Standard": f"{results['Standard']:.1f}",
        "NF": f"{results['NF']:.1f}",
        "NT": f"{results['NT']:.1f}",
        "All": f"{results['All']:.1f}"
    }])

    print(summary_df.to_string(index=False))
    print("="*50)

    # Per-category sample counts.
    print(f"\n样本统计:\nTotal: {len(df)}\nNT count: {len(subsets['NT'])}\nNF count: {len(subsets['NF'])}\nStandard count: {len(subsets['Standard'])}")

    return df

if __name__ == "__main__":
    # The results file must carry the inference_A / inference_B fields;
    # if it does not, merge the original dataset fields into it first.
    # Peek at the first non-blank line (the evaluator skips blank lines too),
    # reading as UTF-8 like evaluate_vibe() and closing the handle afterwards.
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        first_record = next((ln for ln in f if ln.strip()), "")

    if "inference_A" not in first_record:
        print("错误：JSONL 文件中缺少 'inference_A/B' 字段。无法进行 NT/NF 分类。")
        print("请修改之前的脚本，在写入 results.jsonl 时把原始 item 的所有字段都写进去。")
    else:
        df = evaluate_vibe(INPUT_FILE)