In [2]:
import json
import re
import pandas as pd

# Results file to analyse; this path assumes the file sits under the current directory.
jsonl_path = "results/qwen-3b-07.jsonl"
# jsonl_path = "/mnt/data/qwen7b.jsonl"  # alternative location when running in that environment

# Negation cues, matched case-insensitively:
#   * the standalone words not / no / never / cannot (multi-word forms such as
#     "does not" or "is not" are covered because they contain "not");
#   * any contraction ending in n't ("doesn't", "isn't", "can't"), written with
#     a straight or a typographic apostrophe.
# The n't suffix must NOT be preceded by \b: there is no word boundary between
# the stem and the "n" of "doesn't", so a pattern like \bn't\b can never match.
NEG_PAT = re.compile(r"(\b(?:not|no|never|cannot)\b|n['\u2019]t\b)", re.I)

def extract_pred_letter(model_output: str):
    """
    Extract the predicted option letter ("A" or "B") from a model response.

    The expected response format carries an explicit marker such as
    "... Answer: B". The marker is matched case-insensitively and the first
    occurrence wins.

    When no marker is present, fall back to the last standalone capital
    "A" or "B" in the text. The fallback is deliberately case-sensitive:
    upper-casing the text first would turn every English article "a" into a
    vote for option A.

    Args:
        model_output: Raw text produced by the model.

    Returns:
        "A" or "B", or None when the input is not a string or no option
        letter can be found.
    """
    if not isinstance(model_output, str):
        return None
    m = re.search(r"Answer\s*:\s*([AB])\b", model_output, flags=re.I)
    if m:
        return m.group(1).upper()
    # Fallback: last standalone capital A/B, matched on the original casing.
    letters = re.findall(r"\b([AB])\b", model_output)
    return letters[-1] if letters else None

def is_negation(s: str) -> bool:
    """Return True when the text contains a negation cue matched by NEG_PAT.

    Falsy input (None or an empty string) is treated as empty text.
    """
    text = s or ""
    return NEG_PAT.search(text) is not None

def infer_is_NT(item: dict):
    """
    Classify one record as a negation item and, if so, as "NT" (negation true).

    Reads the "inference_A", "inference_B" and "ground_truth" keys of `item`.

    Returns a tuple (is_neg_task, neg_choice, is_NT):
    - is_neg_task: True when exactly one of the two inferences contains a negation.
    - neg_choice: "A" or "B", the option that contains the negation; None when
      is_neg_task is False.
    - is_NT: True when the negated option is also the ground truth.
    """
    statement_a = item.get("inference_A", "")
    statement_b = item.get("inference_B", "")
    truth = (item.get("ground_truth") or "").strip().upper()

    negated = {"A": is_negation(statement_a), "B": is_negation(statement_b)}

    # A negation item needs exactly one negated option; both or neither disqualifies it.
    if negated["A"] == negated["B"]:
        return False, None, False

    neg_choice = "B" if negated["B"] else "A"
    return True, neg_choice, truth == neg_choice

# Collect the NT items the model got wrong: negation items whose negated option
# is the ground truth, but for which the extracted prediction is the other option.
bad_nt_columns = [
    "line_idx",
    "task_id",
    "pred",
    "ground_truth",
    "neg_choice",
    "inference_A",
    "inference_B",
    "model_output",
]
bad_nt_rows = []
skipped_lines = 0  # non-blank lines that are not a valid JSON object

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line_idx, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort scan: skip the malformed line, but keep count so the
            # loss is visible instead of silent.
            skipped_lines += 1
            continue
        if not isinstance(item, dict):
            # Valid JSON but not an object (e.g. a list or a number): item.get would fail.
            skipped_lines += 1
            continue

        pred = extract_pred_letter(item.get("model_output", ""))
        gt = (item.get("ground_truth") or "").strip().upper()
        if pred not in ("A", "B") or gt not in ("A", "B"):
            continue

        is_neg_task, neg_choice, is_NT = infer_is_NT(item)
        if not is_neg_task or not is_NT:
            continue  # keep NT items only

        if pred != gt:  # NT item answered incorrectly
            bad_nt_rows.append({
                "line_idx": line_idx,
                "task_id": item.get("task_id"),
                "pred": pred,
                "ground_truth": gt,
                "neg_choice": neg_choice,
                "inference_A": item.get("inference_A"),
                "inference_B": item.get("inference_B"),
                "model_output": item.get("model_output"),
            })

if skipped_lines:
    print(f"skipped {skipped_lines} malformed line(s) in {jsonl_path}")

# Passing the column list keeps the schema stable even when no row qualifies.
df_bad_nt = pd.DataFrame(bad_nt_rows, columns=bad_nt_columns)
df_bad_nt

Unnamed: 0,line_idx,task_id,pred,ground_truth,neg_choice,inference_A,inference_B,model_output
0,2,video_1,A,B,B,Her gestures suggest she is feeling anger.,Her gestures do not suggest she is feeling anger.,The woman in the red dress is standing with he...
1,24,video_12,B,A,A,Her eyes being focused downward does not signi...,Her eyes being focused downward often signifie...,Thinking out loud: The fact that her eyes are ...
2,39,video_20,B,A,A,Her relaxed posture does not suggest that she ...,Her relaxed posture suggests that she is calm ...,Thinking out loud: The fact states that her po...
3,43,video_24,B,A,A,"Based on these observations, the speaker is li...","Based on these observations, the speaker is li...",Thinking out loud: The woman's slight smile co...
4,53,video_32,A,B,B,This indicates that she is feeling down or upset.,This does not indicate that she is feeling dow...,"I see two people in the frame, one of whom is ..."
...,...,...,...,...,...,...,...,...
74,906,video_397,B,A,A,His slightly bowed head does not suggest a sen...,His slightly bowed head suggests a sense of me...,Thinking out loud: The fact that his head is s...
75,914,video_399,A,B,B,The downcast facial expression suggests a feel...,The downcast facial expression does not sugges...,I'll analyze the situation and provide a respo...
76,915,video_399,A,B,B,Her posture suggests that she is feeling confi...,Her posture does not suggest that she is feeli...,Thinking out loud: The woman's posture in the ...
77,926,video_405,B,A,A,The furrowed eyebrows do not indicate that he ...,The furrowed eyebrows indicate that he is spea...,Thinking out loud: The furrowed eyebrows can i...


In [3]:
out_path = "qwen3b_NT_wrong.jsonl"

# Re-scan the results file and write every mispredicted NT record (negation
# option is the ground truth, model chose the other option) to out_path, one
# JSON object per line.
with open(jsonl_path, "r", encoding="utf-8") as f_in, open(out_path, "w", encoding="utf-8") as f_out:
    for line in f_in:
        line = line.strip()
        if not line:
            continue  # json.loads raises on blank lines
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            continue  # skip malformed lines instead of aborting with a half-written file
        if not isinstance(item, dict):
            continue  # valid JSON but not an object; item.get would fail

        pred = extract_pred_letter(item.get("model_output", ""))
        gt = (item.get("ground_truth") or "").strip().upper()

        is_neg_task, neg_choice, is_NT = infer_is_NT(item)
        if is_neg_task and is_NT and pred in ("A", "B") and gt in ("A", "B") and pred != gt:
            f_out.write(json.dumps(item, ensure_ascii=False) + "\n")

print("saved:", out_path)

saved: qwen3b_NT_wrong.jsonl
