Convert the student Excel exports (CSV/TSV) and save all dialogues as JSON:

In [None]:
import os
import pandas as pd
import json
import re

def find_matching_column(columns, keyword):
    """Return the first column whose name contains ``keyword``, ignoring case.

    Labels are compared on their ``str()`` form, so non-string headers
    (for example integer labels) no longer raise ``AttributeError``.

    Args:
        columns: Iterable of column labels.
        keyword: Substring to look for.

    Returns:
        The original label of the first matching column, or ``None`` when
        no column matches.
    """
    needle = keyword.lower()
    for col in columns:
        if needle in str(col).lower():
            return col
    return None

def parse_dialogue(text):
    """Split a raw ``C:`` / ``P:`` transcript into a list of turn dicts.

    Each speaker label is located directly and the text up to the next label
    becomes that turn's content. The label itself is not part of the content,
    labels that follow a newline do not produce an extra empty turn, and turns
    with no text are skipped. Text before the first label is ignored.

    Args:
        text: Raw cell content; anything that is not a non-blank string
            yields an empty list.

    Returns:
        A list of ``{"role", "speaker", "content"}`` dicts in transcript
        order. ``C`` maps to assistant/doctor and ``P`` to user/patient;
        content is prefixed with ``"** "``.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # The negative lookbehind keeps words that merely end in C/P (e.g. "BP:")
    # from being treated as speaker labels.
    labels = list(re.finditer(r'(?<![A-Za-z])(C|P):', text))

    turns = []
    for pos, label in enumerate(labels):
        # A turn runs from the end of its label to the start of the next one
        next_start = labels[pos + 1].start() if pos + 1 < len(labels) else len(text)
        content = text[label.end():next_start].strip()
        if not content:
            continue
        is_doctor = label.group(1) == "C"
        turns.append({
            "role": "assistant" if is_doctor else "user",
            "speaker": "doctor" if is_doctor else "patient",
            "content": f"** {content}"
        })
    return turns

# Batch process files in a folder
input_folder = "test"
output_file = "all_dialogues.json"
all_dialogues = []
seen_dialogues = set()  # JSON strings of dialogues already kept (de-duplication)

# os.listdir() returns entries in arbitrary order; sort them so the order of
# all_dialogues (and any position-based IDs derived from it) is reproducible.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith((".csv", ".tsv")):
        continue

    filepath = os.path.join(input_folder, filename)
    sep = "\t" if filename.endswith(".tsv") else ","

    try:
        df = pd.read_csv(
            filepath,
            sep=sep,
            encoding='utf-8',
            quotechar='"',
            doublequote=True,
            escapechar="\\",
            on_bad_lines='skip',
            engine="python"
        ).fillna("")
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"❌ Failed to load {filename}: {e}")
        continue

    # Header labels may be non-strings (e.g. numeric); normalise their text form
    df.columns = [str(col).strip() for col in df.columns]
    col_basic = find_matching_column(df.columns, "basic conversation")
    col_physical = find_matching_column(df.columns, "physical function")
    col_emotional = find_matching_column(df.columns, "emotional feedback")

    for _, row in df.iterrows():
        # The three sections are parsed separately and concatenated in order
        basic = parse_dialogue(row.get(col_basic, ""))
        physical = parse_dialogue(row.get(col_physical, ""))
        emotional = parse_dialogue(row.get(col_emotional, ""))
        full = basic + physical + emotional

        if not full:
            continue

        # Serialise the whole turn list to get a hashable de-duplication key
        dialogue_str = json.dumps(full, ensure_ascii=False, sort_keys=True)
        if dialogue_str not in seen_dialogues:
            seen_dialogues.add(dialogue_str)
            all_dialogues.append(full)

# Save all unique dialogues
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_dialogues, f, ensure_ascii=False, indent=2)

print(f"✅ Processed {len(all_dialogues)} unique dialogues and saved to {output_file}")


Corpus statistics:

In [14]:
import os
import re
import json
import csv
from collections import Counter, defaultdict
import numpy as np
import pandas as pd

# =========================
# Config
# =========================
# Settings for the corpus-statistics cell: input folder, CSV export switch and
# path, and how many parsed dialogues to echo for debugging.
input_folder = "test"
EXPORT_CSV = True
CSV_PATH = "test_corpus_overview_stats.csv"
DEBUG_SHOW_SAMPLES = 3  # Print the first N parsed sample dialogues (set to 0 to disable)

# Accept several header spellings per section: basic/physical/emotional as
# well as "part one/two/three". Each entry is a phrase looked up as a
# substring of the column header.
CANDIDATE_KEYS = {
    "basic": [
        "basic conversation", "basic consultation", "basic", "intro",
        "introduction", "opening", "greeting",
        "part one", "part 1", "section one", "section 1"
    ],
    "physical": [
        "physical function", "functional follow-up", "functional details",
        "functional section", "function", "follow-up", "physical",
        "part two", "part 2", "section two", "section 2"
    ],
    "emotional": [
        "emotional feedback", "emotional", "affective", "emotion",
        "emotional perspective", "psychological",
        "part three", "part 3", "section three", "section 3"
    ],
}

# Keywords that suggest a cell holds template instructions rather than a
# dialogue (intended for discarding segments that have no speaker labels).
INSTRUCTION_HINTS = [
    "rounds", "greetings", "small talk", "variations", "inherent order",
    "logically follow", "function disability", "make it natural",
    "severity level", "give more variations"
]

# =========================
# Column helpers
# =========================
def find_matching_column(columns, key_phrase):
    """Return the first column whose normalised name contains ``key_phrase``.

    Column names are stringified, whitespace-collapsed, trimmed and
    lower-cased before the case-insensitive substring test. The original
    label is returned, or ``None`` when nothing matches.
    """
    normalised = {}
    for label in columns:
        squeezed = re.sub(r"\s+", " ", str(label))
        normalised[label] = squeezed.strip().lower()

    return next(
        (label for label, text in normalised.items() if key_phrase.lower() in text),
        None,
    )

def find_first_hit(columns, candidates):
    """Try each candidate phrase in order and return the first matching column.

    Returns ``None`` when no candidate matches any column.
    """
    matches = (find_matching_column(columns, phrase) for phrase in candidates)
    return next((hit for hit in matches if hit), None)

# =========================
# Dialogue parsing
# =========================
def normalize_spaces(s: str) -> str:
    """Collapse every whitespace run in ``s`` to one space and trim both ends.

    The argument is passed through ``str()`` first, so non-strings are
    accepted.
    """
    collapsed = re.sub(r"\s+", " ", str(s))
    return collapsed.strip()

# Unified speaker-label detector. A label must sit at the start of a line or
# be preceded by a non-letter character; that boundary character is part of
# the match (group 1), while the label itself is captured in the named group
# "C" (clinician) or "P" (patient).
# DOTALL has no effect here: the pattern has no "." outside a character class.
# NOTE(review): with IGNORECASE and "." / "-" accepted as delimiters, ordinary
# text such as " p.m." or " c-" would presumably match as a label — confirm
# against the data.
TAG_PATTERN = re.compile(
    r'(^|[^A-Za-z])(?:'
    r'(?P<C>'                       # Clinician
        r'C\s*\d*\s*[:：.\-]'       # C:, C1., C2-
        r'|Doctor\s*[:：]'          # Doctor:
        r'|Dr\s*[:：]'              # Dr:
        r'|Clinician\s*[:：]'       # Clinician:
        r'|医生\s*[:：]'            # Chinese label for "doctor"
        r'|Q\s*[:：]'               # Q: is treated as Clinician
    r')'
    r'|'
    r'(?P<P>'                       # Patient
        r'P\s*\d*\s*[:：.\-]'       # P:, P1., P2-
        r'|Patient\s*[:：]'         # Patient:
        r'|患者\s*[:：]'             # Chinese label for "patient"
        r'|病人\s*[:：]'             # Chinese label for "patient" (alternative word)
        r'|A\s*[:：]'               # A: is treated as Patient
    r'))',
    flags=re.IGNORECASE | re.MULTILINE | re.DOTALL
)

def has_speaker_labels(text: str) -> bool:
    """Return True when ``text`` contains at least one speaker label.

    Falsy input (``None``, empty string) is treated as an empty string.
    """
    haystack = text if text else ""
    return TAG_PATTERN.search(haystack) is not None

def looks_like_dialogue(text: str) -> bool:
    """Return True only for non-blank strings that contain a speaker label.

    Text without any speaker label is always rejected. The previous keyword
    scan for template instructions returned False on both of its paths, so
    it has been removed; results are unchanged.

    Args:
        text: Candidate cell content.

    Returns:
        True when ``text`` is a non-blank string with at least one speaker
        label, otherwise False.
    """
    if not isinstance(text, str):
        return False
    stripped = text.strip()
    if not stripped:
        return False
    return has_speaker_labels(stripped)

def extract_turns_by_speaker(text: str):
    """Split ``text`` into speaker turns using the label positions found by TAG_PATTERN.

    Labels are located on the original text (no substitution beforehand), and
    each turn's content is the span between its label and the next one. Text
    before the first label is ignored, and turns whose content is empty after
    whitespace normalisation are dropped.

    Args:
        text: Raw dialogue text; non-strings yield an empty list.

    Returns:
        A list of ``{"speaker": "doctor" | "patient", "content": str}`` dicts
        in order of appearance. Empty when no label is found, so unlabeled
        instruction text is never returned as a dialogue.
    """
    if not isinstance(text, str):
        return []

    # Only unify line endings; the text itself is not rewritten
    t = text.replace("\r\n", "\n").replace("\r", "\n")

    turns = []
    last_role = None
    last_end = None

    for m in TAG_PATTERN.finditer(t):
        is_doctor = m.group("C") is not None
        role = "doctor" if is_doctor else "patient"
        # The overall match also consumes the single boundary character in
        # front of the label. Cut the previous turn at the label itself so a
        # character such as "?" or "." stays with the previous content.
        label_start = m.start("C") if is_doctor else m.start("P")

        if last_role is not None:
            content = normalize_spaces(t[last_end:label_start])
            if content:
                turns.append({"speaker": last_role, "content": content})

        last_role = role
        last_end = m.end()  # content starts right after the label

    # Whatever follows the final label belongs to the last speaker
    if last_role is not None and last_end is not None:
        tail = normalize_spaces(t[last_end:])
        if tail:
            turns.append({"speaker": last_role, "content": tail})

    return turns

# =========================
# Stats
# =========================
def words_in(text: str) -> int:
    """Count whitespace-separated tokens in ``text``; non-strings count as 0."""
    if not isinstance(text, str):
        return 0
    tokens = normalize_spaces(text).split()
    return len(tokens)

def compute_stats(dialogues):
    """Summarise a list of dialogues (each a list of turn dicts).

    Returns a dict with the number of conversations, the total number of
    turns, and three averages rounded to two decimals: turns per
    conversation, words per turn, and words per dialogue. An empty input
    gives all-zero statistics.
    """
    n_dialogues = len(dialogues)
    if n_dialogues == 0:
        return {
            "nr_conversations": 0,
            "total_turns": 0,
            "avg_turns_per_conversation": 0.0,
            "avg_turn_length": 0.0,
            "avg_words_per_dialogue": 0.0,
        }

    # Word counts grouped by dialogue, one inner list per conversation
    words_by_dialogue = [
        [words_in(turn.get("content", "")) for turn in dlg]
        for dlg in dialogues
    ]
    turns_per_dialogue = [len(dlg) for dlg in dialogues]
    words_per_dialogue = [sum(counts) for counts in words_by_dialogue]
    words_per_turn = [wc for counts in words_by_dialogue for wc in counts]

    return {
        "nr_conversations": n_dialogues,
        "total_turns": int(np.sum(turns_per_dialogue)),
        "avg_turns_per_conversation": round(float(np.mean(turns_per_dialogue)), 2),
        "avg_turn_length": round(float(np.mean(words_per_turn)), 2),
        "avg_words_per_dialogue": round(float(np.mean(words_per_dialogue)), 2),
    }

# =========================
# Main: read → parse → stats
# =========================
all_dialogues = []
seen_dialogues = set()
category_dialogue_tracker = defaultdict(list)
category_counter = Counter()

# De-duplication is global, so a dialogue that appears in several files is
# attributed to the first file read. Sort the listing (os.listdir() order is
# arbitrary) to keep the per-category statistics reproducible.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith((".csv", ".tsv")):
        continue

    filepath = os.path.join(input_folder, filename)
    sep = "\t" if filename.endswith(".tsv") else ","

    try:
        df = pd.read_csv(
            filepath, sep=sep, encoding="utf-8",
            quotechar='"', doublequote=True, escapechar="\\",
            on_bad_lines="skip", engine="python"
        ).fillna("")
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"❌ Failed to load {filename}: {e}")
        continue

    df.columns = [str(col).strip() for col in df.columns]

    # Prefer the three section columns; if none is found, a row-level
    # fallback concatenates dialogue-like cells instead (see below).
    col_basic = find_first_hit(df.columns, CANDIDATE_KEYS["basic"])
    col_physical = find_first_hit(df.columns, CANDIDATE_KEYS["physical"])
    col_emotional = find_first_hit(df.columns, CANDIDATE_KEYS["emotional"])

    # The ICF category is taken from the file name
    m = re.search(r"(D420|D445|D465|D470)", filename, flags=re.I)
    icf_category = m.group(1).upper() if m else "UNKNOWN"

    for _, row in df.iterrows():
        parts = []
        # Use the section columns that were found, in basic/physical/emotional order
        if col_basic:
            parts.append(str(row.get(col_basic, "")))
        if col_physical:
            parts.append(str(row.get(col_physical, "")))
        if col_emotional:
            parts.append(str(row.get(col_emotional, "")))

        # Fallback when no section column exists: join every cell in this row
        # that looks like a dialogue.
        if not any([col_basic, col_physical, col_emotional]):
            row_texts = []
            for col in df.columns:
                val = row.get(col, "")
                if isinstance(val, str) and val.strip() and looks_like_dialogue(val):
                    row_texts.append(val)
            parts = row_texts

        # Merge the non-blank parts into one dialogue text
        full_text = "\n".join([p for p in parts if isinstance(p, str) and p.strip()])

        # Speaker labels are required; otherwise skip the row so that
        # instruction rows are not counted as dialogues.
        if not has_speaker_labels(full_text):
            continue

        turns = extract_turns_by_speaker(full_text)
        if not turns:
            continue

        # De-duplicate on the JSON form of the turn sequence
        key = json.dumps(turns, ensure_ascii=False, sort_keys=True)
        if key in seen_dialogues:
            continue
        seen_dialogues.add(key)

        all_dialogues.append(turns)
        category_dialogue_tracker[icf_category].append(turns)

# Per-category counts
for cat, dialogues in category_dialogue_tracker.items():
    category_counter[cat] = len(dialogues)

# Compute stats
per_category_stats = {cat: compute_stats(dialogues)
                      for cat, dialogues in sorted(category_dialogue_tracker.items())}
overall_stats = compute_stats(all_dialogues)

# =========================
# Print
# =========================
def print_block(title, stats):
    """Print a titled summary of one statistics dict to stdout.

    ``stats`` must provide the five keys produced by ``compute_stats``;
    they are printed one per line in a fixed order after the title line.
    """
    rows = (
        ("Total nr. of conversations:   ", "nr_conversations"),
        ("Total nr. of turns:           ", "total_turns"),
        ("Average turns/Conversation:   ", "avg_turns_per_conversation"),
        ("Average turn length (words):  ", "avg_turn_length"),
        ("Avg words/Dialogue:           ", "avg_words_per_dialogue"),
    )
    print(f"\n📊 {title}")
    for label, key in rows:
        print(f"{label}{stats[key]}")

# Per-category summaries in alphabetical order, then the overall summary
for cat in sorted(per_category_stats):
    print_block(f"ICF Category {cat}", per_category_stats[cat])
print_block("All Categories (Overall)", overall_stats)

# =========================
# Debug samples
# =========================
# Echo at most DEBUG_SHOW_SAMPLES dialogues in total, walking the categories
# alphabetically; each sample shows up to 8 turns, 140 characters per turn.
if DEBUG_SHOW_SAMPLES > 0:
    shown = 0
    for cat in sorted(category_dialogue_tracker):
        remaining = DEBUG_SHOW_SAMPLES - shown
        for dlg in category_dialogue_tracker[cat][:remaining]:
            print(f"\n--- SAMPLE ({cat}) turns={len(dlg)} ---")
            for i, t in enumerate(dlg[:8], 1):
                print(f"[{i}] {t['speaker']}: {t['content'][:140]}")
            shown += 1
        if shown >= DEBUG_SHOW_SAMPLES:
            break

# =========================
# CSV export (optional)
# =========================
# One row per ICF category (alphabetical) followed by an "OVERALL" row
if EXPORT_CSV:
    fieldnames = [
        "icf_category",
        "nr_conversations",
        "total_turns",
        "avg_turns_per_conversation",
        "avg_turn_length",
        "avg_words_per_dialogue",
    ]
    with open(CSV_PATH, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        rows = [
            {"icf_category": cat, **stats}
            for cat, stats in sorted(per_category_stats.items())
        ]
        rows.append({"icf_category": "OVERALL", **overall_stats})
        writer.writerows(rows)
    print(f"\n[INFO] CSV exported to: {os.path.abspath(CSV_PATH)}")



📊 ICF Category D420
Total nr. of conversations:   21
Total nr. of turns:           379
Average turns/Conversation:   18.05
Average turn length (words):  16.94
Avg words/Dialogue:           305.81

📊 ICF Category D445
Total nr. of conversations:   18
Total nr. of turns:           335
Average turns/Conversation:   18.61
Average turn length (words):  15.44
Avg words/Dialogue:           287.44

📊 ICF Category D465
Total nr. of conversations:   21
Total nr. of turns:           406
Average turns/Conversation:   19.33
Average turn length (words):  14.38
Avg words/Dialogue:           277.95

📊 ICF Category D470
Total nr. of conversations:   25
Total nr. of turns:           487
Average turns/Conversation:   19.48
Average turn length (words):  13.8
Avg words/Dialogue:           268.76

📊 All Categories (Overall)
Total nr. of conversations:   85
Total nr. of turns:           1607
Average turns/Conversation:   18.91
Average turn length (words):  15.03
Avg words/Dialogue:           284.14

--- SAM

Extract all follow-up questions, starting from turn 3 (the second C turn), and save them as JSON:

In [None]:
import json

# Input and output file paths
input_file = "data/test data/all_dialogues.json"
output_file = "data/test data/followup_references_from_second_c_flat.json"

# Read the parsed dialogues (a list of turn lists)
with open(input_file, "r", encoding="utf-8") as f:
    dialogues = json.load(f)

# One flat record per follow-up question
followup_data = []

for dialogue_id, dialogue in enumerate(dialogues, start=1):
    # Doctor utterances in order, without the leading "** " marker
    doctor_turns = []
    for turn in dialogue:
        if turn.get("speaker", "").lower() == "doctor":
            doctor_turns.append(turn["content"].lstrip("* ").strip())

    # Skip the opening doctor turn. Numbering from 2 gives each follow-up its
    # 1-based position among the doctor's turns.
    for turn_index, q in enumerate(doctor_turns[1:], start=2):
        followup_data.append({
            "dialogue_id": dialogue_id,
            "turn_index": turn_index,
            "followup_question": q
        })

# Write the flat list as JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(followup_data, f, ensure_ascii=False, indent=2)

print(f"✅ Flattened follow-up questions saved to {output_file}")


In [None]:
import json

# Input and output file paths
input_file = "data/test data/all_dialogues.json"
output_file = "data/test data/followup_references_from_second_c.json"

# Read the parsed dialogues (a list of turn lists)
with open(input_file, "r", encoding="utf-8") as f:
    dialogues = json.load(f)

# One record per dialogue, holding every doctor turn after the first
followup_data = []

for dialogue_id, dialogue in enumerate(dialogues, start=1):
    # Doctor utterances in order, without the leading "** " marker
    doctor_turns = []
    for turn in dialogue:
        if turn.get("speaker", "").lower() == "doctor":
            doctor_turns.append(turn["content"].lstrip("* ").strip())

    # Slicing from index 1 is already empty when there are fewer than two turns
    followup_data.append({
        "dialogue_id": dialogue_id,
        "followup_questions": doctor_turns[1:]
    })

# Write the grouped list as JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(followup_data, f, ensure_ascii=False, indent=2)

print(f"✅ Extracted follow-up questions from the 2nd doctor turn onward. Saved to {output_file}")



Generate 10 variants for each follow-up question:

In [None]:
import json
import re
import time
from tqdm import tqdm
from openai import OpenAI

# ========== Config ==========
input_file = "data/test data/all_dialogues.json"
output_file = "data/test data/doctor_paraphrase_references_with_context.json"

# Local LLM API
client = OpenAI(base_url="http://localhost:8000/v1", api_key="cltl")
model_name = "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf"  # Replace with your registered local model name

# Read the parsed dialogues (a list of turn lists)
with open(input_file, "r", encoding="utf-8") as f:
    dialogues = json.load(f)

# For every doctor turn after the first, record the question together with
# all preceding turns rendered as "Speaker: text" and joined by spaces.
question_records = []
for dialogue_id, dialogue in enumerate(dialogues, start=1):
    context_turns = []
    doctor_turn_count = 0

    for turn in dialogue:
        speaker = turn.get("speaker", "").lower()
        content = turn.get("content", "").lstrip("* ").strip()

        is_doctor = speaker == "doctor"
        if is_doctor:
            doctor_turn_count += 1
        if is_doctor and doctor_turn_count >= 2:
            # question_index is 0 for the second doctor turn
            question_records.append({
                "dialogue_id": dialogue_id,
                "question_index": doctor_turn_count - 2,
                "context": " ".join(context_turns).strip(),
                "question": content
            })

        # Appended after the record is built, so a question never appears in
        # its own context.
        context_turns.append(f"{speaker.capitalize()}: {content}")

print(f"✅ Loaded {len(question_records)} questions with context")

# Build Prompt + Call Model
def build_prompt(context, question):
    """Build the paraphrasing prompt for one doctor question.

    The prompt consists of a fixed instruction, the conversation context,
    the quoted question, and a trailing "Paraphrased versions:" cue.
    """
    sections = [
        "You are a helpful and professional assistant. ",
        "Below is a clinical conversation followed by a question asked by the doctor. ",
        "Rewrite this doctor's question into 10 different semantically equivalent variants, using the provided context.\n\n",
        f"Conversation context:\n{context}\n\n",
        f"Doctor's question: \"{question}\"\n\n",
        "Paraphrased versions:\n",
    ]
    return "".join(sections)

def generate_variants(context, question):
    """Ask the local model for paraphrases of ``question`` and parse its reply.

    The reply is split into lines. A leading "N." / "N)" / "N" numbering is
    removed, blank lines are dropped, and at most the first 10 resulting
    entries are returned. Any exception raised while calling the model or
    parsing the reply is printed, and an empty list is returned.

    NOTE(review): a non-numbered, non-blank line (such as an introductory
    sentence) is kept as an entry and counts toward the limit of 10 —
    confirm this is intended.
    """
    prompt = build_prompt(context, question)
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8,
            top_p=0.95,
            max_tokens=512,
        )
        reply = response.choices[0].message.content.strip()

        variants = []
        for raw_line in reply.split("\n"):
            line = raw_line.strip()
            numbered = re.match(r"^\d+[\.\)]?\s*(.*)", line)
            if numbered:
                # Keep only the text after the list number
                variants.append(numbered.group(1))
            elif line:
                variants.append(line)

        return variants[:10]

    except Exception as e:
        print(f"❌ Failed for question: {question}\nError: {e}")
        return []

# Generate paraphrases and save
# One output record per question: IDs, the original question, its context
# and the parsed paraphrases.
output = []
for record in tqdm(question_records, desc="Generating paraphrases with context"):
    variants = generate_variants(record["context"], record["question"])
    output.append({
        "dialogue_id": record["dialogue_id"],
        "question_index": record["question_index"],
        "original": record["question"],
        "context": record["context"],
        "paraphrases": variants
    })
    time.sleep(1)  # To prevent API overload

# Everything is written in one go after the loop finishes
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f"✅ Contextual paraphrase generation complete. Saved to {output_file}")


Evaluation attempt:

Calculate automatic metrics：

base model——0shot：

In [28]:
import json
import pandas as pd
from evaluate import load
from tqdm import tqdm
import sacrebleu

# === File paths ===
pred_file = "prediction/basemodel_0shot_output.json"
para_file = "doctor_paraphrase_references_with_context.json"

# === Load data ===
with open(pred_file, "r", encoding="utf-8") as f:
    predictions = json.load(f)

with open(para_file, "r", encoding="utf-8") as f:
    para_list = json.load(f)

# === Align data ===
# Predictions and reference records are paired purely by list position.
# NOTE(review): this assumes both files list the questions in the same order — confirm.
aligned_preds = []
gold_refs = []
variant_refs = []
aligned_ids = []
skipped = 0

for i, item in enumerate(para_list):
    # A reference record with no prediction at the same index is skipped
    if i >= len(predictions):
        skipped += 1
        continue

    pred_item = predictions[i]
    # Accept either output key; missing, non-string or blank predictions are skipped
    pred = pred_item.get("generated_followup") or pred_item.get("prediction")
    if not isinstance(pred, str) or not pred.strip():
        skipped += 1
        continue

    prediction = pred.strip()
    original = item["original"].strip()
    paraphrases = item.get("paraphrases", [])

    # Exclude the first variant
    # NOTE(review): presumably because the first parsed entry is an
    # introductory line rather than a paraphrase — confirm.
    variants = paraphrases[1:]

    aligned_preds.append(prediction)
    gold_refs.append(original)
    variant_refs.append(variants)
    aligned_ids.append(f'{item["dialogue_id"]}_{i}')

print(f"\n✅ Aligned samples: {len(aligned_preds)}, Skipped: {skipped}")

# Placeholders for tracking validity (not referenced again in this cell)
valid_gold = []
valid_variants = []

# === compute_metrics (revised) ===
def compute_metrics(preds, refs):
    """Score predictions against reference groups with BLEU, ROUGE-L and BERTScore.

    Samples whose reference list is empty are left unscored (``None`` in the
    per-sample lists) and are excluded from the averages.

    Args:
        preds: Prediction strings.
        refs: One list of reference strings per prediction, in the same order.

    Returns:
        A 4-tuple ``(metrics, bleu, rouge_l, bert_f1)``. ``metrics`` maps
        "BLEU", "ROUGE-L" and "BERTScore_F1" to the mean over the scored
        samples, or NaN when no sample could be scored. The three lists are
        aligned with ``preds``.
    """
    bleu_scores_all = [None] * len(preds)
    rouge_scores_all = [None] * len(preds)
    bert_scores_all = [None] * len(preds)

    # Keep only samples that have references, remembering their positions
    valid_preds, valid_refs, valid_idx = [], [], []
    for idx, (p, r) in enumerate(zip(preds, refs)):
        if r:
            valid_preds.append(p)
            valid_refs.append(r)
            valid_idx.append(idx)

    if not valid_idx:
        # Nothing to score: report NaN averages instead of dividing by zero,
        # and skip loading the metric models.
        nan = float("nan")
        return (
            {"BLEU": nan, "ROUGE-L": nan, "BERTScore_F1": nan},
            bleu_scores_all,
            rouge_scores_all,
            bert_scores_all,
        )

    rouge = load("rouge")
    bertscore = load("bertscore")

    # Sentence-level BLEU against the whole reference group, divided by 100
    bleu_scores = [sacrebleu.sentence_bleu(p, r).score / 100 for p, r in zip(valid_preds, valid_refs)]

    # Per-sample ROUGE (no aggregation); only "rougeL" is read below
    rouge_result = rouge.compute(
        predictions=valid_preds,
        references=valid_refs,
        use_stemmer=True,
        use_aggregator=False,
        rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"]
    )

    # BERTScore: compare the prediction with each reference, keep the best F1
    F1_max = []
    for pred, ref_group in tqdm(zip(valid_preds, valid_refs), total=len(valid_preds)):
        best_F1 = max(bertscore.compute(
            predictions=[pred] * len(ref_group),
            references=ref_group,
            lang="en",
            rescale_with_baseline=False,
            use_fast_tokenizer=True
        )["f1"])
        F1_max.append(best_F1)

    # Put the scores back at their original positions; unscored ones stay None
    for i, idx in enumerate(valid_idx):
        bleu_scores_all[idx] = bleu_scores[i]
        rouge_scores_all[idx] = rouge_result["rougeL"][i]
        bert_scores_all[idx] = F1_max[i]

    metrics = {
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-L": sum(rouge_result['rougeL']) / len(rouge_result['rougeL']),
        "BERTScore_F1": sum(F1_max) / len(F1_max)
    }

    return metrics, bleu_scores_all, rouge_scores_all, bert_scores_all

# === Metrics: Prediction vs Gold ===
# Each gold question is wrapped in a one-element list to form a reference group
gold_metrics, gold_bleu, gold_rouge, gold_f1 = compute_metrics(aligned_preds, [[ref] for ref in gold_refs])

# === Metrics: Prediction vs Variants ===
variant_metrics, variant_bleu, variant_rouge, variant_f1 = compute_metrics(aligned_preds, variant_refs)

# === Print results ===
print("\n📊 Evaluation (Prediction vs Gold):")
print(f"BLEU (avg)      : {gold_metrics['BLEU']:.4f}")
print(f"ROUGE-L (avg)   : {gold_metrics['ROUGE-L']:.4f}")
print(f"BERTScore F1    : {gold_metrics['BERTScore_F1']:.4f}")

print("\n📊 Evaluation (Prediction vs Variants, excluding first):")
print(f"BLEU (avg)      : {variant_metrics['BLEU']:.4f}")
print(f"ROUGE-L (avg)   : {variant_metrics['ROUGE-L']:.4f}")
print(f"BERTScore F1    : {variant_metrics['BERTScore_F1']:.4f}")

# === Save CSV (every column has one entry per aligned sample) ===
df = pd.DataFrame({
    "sample_id": aligned_ids,
    "prediction": aligned_preds,
    "gold_reference": gold_refs,
    "variant_references": [" • " + "\n • ".join(v) for v in variant_refs],
    "BLEU_gold": gold_bleu,
    "ROUGE-L_gold": gold_rouge,
    "BERTScore_F1_gold": gold_f1,
    "BLEU_variants": variant_bleu,
    "ROUGE-L_variants": variant_rouge,
    "BERTScore_F1_variants": variant_f1,
})

# Unscored positions may hold None; they could be filled with NaN or another value
df.to_csv("prediction/base_zero_automatic_scores_separated.csv", index=False)
print("📁 Saved to 'base_zero_automatic_scores_separated.csv'")



✅ Aligned samples: 730, Skipped: 0


  0%|                                                   | 0/730 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 730/730 [01:19<00:00,  9.16it/s]
  0%|                                                   | 0/729 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 729/729 [04:14<00:00,  2.86it/s]


📊 Evaluation (Prediction vs Gold):
BLEU (avg)      : 0.0334
ROUGE-L (avg)   : 0.1706
BERTScore F1    : 0.8572

📊 Evaluation (Prediction vs Variants, excluding first):
BLEU (avg)      : 0.0656
ROUGE-L (avg)   : 0.2469
BERTScore F1    : 0.8662
📁 Saved to 'base_zero_automatic_scores_separated.csv'





base model——fewshot：

In [16]:
import json
import pandas as pd
from evaluate import load
from tqdm import tqdm
import sacrebleu

# === File paths ===
pred_file = "prediction/basemodel_fewshot_output.json"
para_file = "doctor_paraphrase_references_with_context.json"

# === Load data ===
with open(pred_file, "r", encoding="utf-8") as f:
    predictions = json.load(f)

with open(para_file, "r", encoding="utf-8") as f:
    para_list = json.load(f)

# === Align data ===
# Predictions and reference records are paired purely by list position.
# NOTE(review): this assumes both files list the questions in the same order — confirm.
aligned_preds = []
gold_refs = []
variant_refs = []
aligned_ids = []
skipped = 0

for i, item in enumerate(para_list):
    # A reference record with no prediction at the same index is skipped
    if i >= len(predictions):
        skipped += 1
        continue

    pred_item = predictions[i]
    # Accept either output key; missing, non-string or blank predictions are skipped
    pred = pred_item.get("generated_followup") or pred_item.get("prediction")
    if not isinstance(pred, str) or not pred.strip():
        skipped += 1
        continue

    prediction = pred.strip()
    original = item["original"].strip()
    paraphrases = item.get("paraphrases", [])

    # Exclude the first variant
    # NOTE(review): presumably because the first parsed entry is an
    # introductory line rather than a paraphrase — confirm.
    variants = paraphrases[1:]

    aligned_preds.append(prediction)
    gold_refs.append(original)
    variant_refs.append(variants)
    aligned_ids.append(f'{item["dialogue_id"]}_{i}')

print(f"\n✅ Aligned samples: {len(aligned_preds)}, Skipped: {skipped}")

# Placeholders for tracking validity (not referenced again in this cell)
valid_gold = []
valid_variants = []

# === compute_metrics (revised) ===
def compute_metrics(preds, refs):
    """Score predictions against reference groups with BLEU, ROUGE-L and BERTScore.

    Samples whose reference list is empty are left unscored (``None`` in the
    per-sample lists) and are excluded from the averages.

    Args:
        preds: Prediction strings.
        refs: One list of reference strings per prediction, in the same order.

    Returns:
        A 4-tuple ``(metrics, bleu, rouge_l, bert_f1)``. ``metrics`` maps
        "BLEU", "ROUGE-L" and "BERTScore_F1" to the mean over the scored
        samples, or NaN when no sample could be scored. The three lists are
        aligned with ``preds``.
    """
    bleu_scores_all = [None] * len(preds)
    rouge_scores_all = [None] * len(preds)
    bert_scores_all = [None] * len(preds)

    # Keep only samples that have references, remembering their positions
    valid_preds, valid_refs, valid_idx = [], [], []
    for idx, (p, r) in enumerate(zip(preds, refs)):
        if r:
            valid_preds.append(p)
            valid_refs.append(r)
            valid_idx.append(idx)

    if not valid_idx:
        # Nothing to score: report NaN averages instead of dividing by zero,
        # and skip loading the metric models.
        nan = float("nan")
        return (
            {"BLEU": nan, "ROUGE-L": nan, "BERTScore_F1": nan},
            bleu_scores_all,
            rouge_scores_all,
            bert_scores_all,
        )

    rouge = load("rouge")
    bertscore = load("bertscore")

    # Sentence-level BLEU against the whole reference group, divided by 100
    bleu_scores = [sacrebleu.sentence_bleu(p, r).score / 100 for p, r in zip(valid_preds, valid_refs)]

    # Per-sample ROUGE (no aggregation); only "rougeL" is read below
    rouge_result = rouge.compute(
        predictions=valid_preds,
        references=valid_refs,
        use_stemmer=True,
        use_aggregator=False,
        rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"]
    )

    # BERTScore: compare the prediction with each reference, keep the best F1
    F1_max = []
    for pred, ref_group in tqdm(zip(valid_preds, valid_refs), total=len(valid_preds)):
        best_F1 = max(bertscore.compute(
            predictions=[pred] * len(ref_group),
            references=ref_group,
            lang="en",
            rescale_with_baseline=False,
            use_fast_tokenizer=True
        )["f1"])
        F1_max.append(best_F1)

    # Put the scores back at their original positions; unscored ones stay None
    for i, idx in enumerate(valid_idx):
        bleu_scores_all[idx] = bleu_scores[i]
        rouge_scores_all[idx] = rouge_result["rougeL"][i]
        bert_scores_all[idx] = F1_max[i]

    metrics = {
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-L": sum(rouge_result['rougeL']) / len(rouge_result['rougeL']),
        "BERTScore_F1": sum(F1_max) / len(F1_max)
    }

    return metrics, bleu_scores_all, rouge_scores_all, bert_scores_all

# === Metrics: Prediction vs Gold ===
# Each gold question is wrapped in a one-element list to form a reference group
gold_metrics, gold_bleu, gold_rouge, gold_f1 = compute_metrics(aligned_preds, [[ref] for ref in gold_refs])

# === Metrics: Prediction vs Variants ===
variant_metrics, variant_bleu, variant_rouge, variant_f1 = compute_metrics(aligned_preds, variant_refs)

# === Print results ===
print("\n📊 Evaluation (Prediction vs Gold):")
print(f"BLEU (avg)      : {gold_metrics['BLEU']:.4f}")
print(f"ROUGE-L (avg)   : {gold_metrics['ROUGE-L']:.4f}")
print(f"BERTScore F1    : {gold_metrics['BERTScore_F1']:.4f}")

print("\n📊 Evaluation (Prediction vs Variants, excluding first):")
print(f"BLEU (avg)      : {variant_metrics['BLEU']:.4f}")
print(f"ROUGE-L (avg)   : {variant_metrics['ROUGE-L']:.4f}")
print(f"BERTScore F1    : {variant_metrics['BERTScore_F1']:.4f}")

# === Save CSV (every column has one entry per aligned sample) ===
df = pd.DataFrame({
    "sample_id": aligned_ids,
    "prediction": aligned_preds,
    "gold_reference": gold_refs,
    "variant_references": [" • " + "\n • ".join(v) for v in variant_refs],
    "BLEU_gold": gold_bleu,
    "ROUGE-L_gold": gold_rouge,
    "BERTScore_F1_gold": gold_f1,
    "BLEU_variants": variant_bleu,
    "ROUGE-L_variants": variant_rouge,
    "BERTScore_F1_variants": variant_f1,
})

# Unscored positions may hold None; they could be filled with NaN or another value
df.to_csv("prediction/base_few_automatic_scores_separated.csv", index=False)
print("📁 Saved to 'base_few_automatic_scores_separated.csv'")



✅ Aligned samples: 730, Skipped: 0


  0%|                                                   | 0/730 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 730/730 [01:21<00:00,  9.01it/s]
  0%|                                                   | 0/729 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 729/729 [04:24<00:00,  2.76it/s]


📊 Evaluation (Prediction vs Gold):
BLEU (avg)      : 0.0404
ROUGE-L (avg)   : 0.1916
BERTScore F1    : 0.8578

📊 Evaluation (Prediction vs Variants, excluding first):
BLEU (avg)      : 0.0708
ROUGE-L (avg)   : 0.2578
BERTScore F1    : 0.8651
📁 Saved to 'base_few_automatic_scores_separated.csv'





fine-tuned qwen3——0shot：

In [25]:
import json
import pandas as pd
from evaluate import load
from tqdm import tqdm
import sacrebleu

# === File paths ===
pred_file = "prediction/finetuned_0shot_output.json"
para_file = "doctor_paraphrase_references_with_context.json"

# === Load data ===
with open(pred_file, "r", encoding="utf-8") as f:
    predictions = json.load(f)

with open(para_file, "r", encoding="utf-8") as f:
    para_list = json.load(f)

# === Align data ===
# Predictions and reference records are paired purely by list position.
# NOTE(review): this assumes both files list the questions in the same order — confirm.
aligned_preds = []
gold_refs = []
variant_refs = []
aligned_ids = []
skipped = 0

for i, item in enumerate(para_list):
    # A reference record with no prediction at the same index is skipped
    if i >= len(predictions):
        skipped += 1
        continue

    pred_item = predictions[i]
    # Accept either output key; missing, non-string or blank predictions are skipped
    pred = pred_item.get("generated_followup") or pred_item.get("prediction")
    if not isinstance(pred, str) or not pred.strip():
        skipped += 1
        continue

    prediction = pred.strip()
    original = item["original"].strip()
    paraphrases = item.get("paraphrases", [])

    # Exclude the first variant
    # NOTE(review): presumably because the first parsed entry is an
    # introductory line rather than a paraphrase — confirm.
    variants = paraphrases[1:]

    aligned_preds.append(prediction)
    gold_refs.append(original)
    variant_refs.append(variants)
    aligned_ids.append(f'{item["dialogue_id"]}_{i}')

print(f"\n✅ Aligned samples: {len(aligned_preds)}, Skipped: {skipped}")

# Placeholders for tracking validity (not referenced again in the visible part of this cell)
valid_gold = []
valid_variants = []

# === compute_metrics (revised) ===
def compute_metrics(preds, refs):
    """Score predictions against per-sample reference groups.

    Samples whose reference group is empty are left unscored. For every
    other sample three scores are computed: sentence-level sacreBLEU
    divided by 100, ROUGE-L (with stemming) from the ``evaluate`` "rouge"
    metric, and the highest BERTScore F1 over the sample's references.

    Args:
        preds: Prediction strings.
        refs: Reference groups aligned with ``preds``; each group is a list
            of reference strings and may be empty.

    Returns:
        A tuple ``(metrics, bleu, rouge_l, bert_f1)``. ``metrics`` maps
        ``"BLEU"``, ``"ROUGE-L"`` and ``"BERTScore_F1"`` to the mean over
        the scored samples, or NaN when no sample has references. The three
        lists have ``len(preds)`` entries in input order and hold ``None``
        for unscored samples.
    """
    bleu_scores_all = [None] * len(preds)
    rouge_scores_all = [None] * len(preds)
    bert_scores_all = [None] * len(preds)

    # Keep only samples with at least one reference, remembering the
    # original position so scores can be written back in input order.
    valid_preds, valid_refs, valid_idx = [], [], []
    for idx, (p, r) in enumerate(zip(preds, refs)):
        if r:
            valid_preds.append(p)
            valid_refs.append(r)
            valid_idx.append(idx)

    # Nothing to score: report NaN averages instead of dividing by zero,
    # and do not load the metrics at all.
    if not valid_idx:
        nan = float("nan")
        metrics = {"BLEU": nan, "ROUGE-L": nan, "BERTScore_F1": nan}
        return metrics, bleu_scores_all, rouge_scores_all, bert_scores_all

    rouge = load("rouge")
    bertscore = load("bertscore")

    # BLEU: sacreBLEU reports 0-100, so divide by 100.
    bleu_scores = [
        sacrebleu.sentence_bleu(p, r).score / 100
        for p, r in zip(valid_preds, valid_refs)
    ]

    # ROUGE: only ROUGE-L is used below, so it is the only type requested.
    rouge_l = rouge.compute(
        predictions=valid_preds,
        references=valid_refs,
        use_stemmer=True,
        use_aggregator=False,
        rouge_types=["rougeL"],
    )["rougeL"]

    # BERTScore: compare the prediction with each reference in its group
    # and keep the best F1.
    F1_max = []
    for pred, ref_group in tqdm(zip(valid_preds, valid_refs), total=len(valid_preds)):
        f1_per_ref = bertscore.compute(
            predictions=[pred] * len(ref_group),
            references=ref_group,
            lang="en",
            rescale_with_baseline=False,
            use_fast_tokenizer=True,
        )["f1"]
        F1_max.append(max(f1_per_ref))

    # Write the scores back to their original positions; unscored positions
    # keep None.
    for i, idx in enumerate(valid_idx):
        bleu_scores_all[idx] = bleu_scores[i]
        rouge_scores_all[idx] = rouge_l[i]
        bert_scores_all[idx] = F1_max[i]

    metrics = {
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-L": sum(rouge_l) / len(rouge_l),
        "BERTScore_F1": sum(F1_max) / len(F1_max),
    }

    return metrics, bleu_scores_all, rouge_scores_all, bert_scores_all

# === Metrics: prediction vs gold (each gold sentence is a one-item reference group) ===
single_gold_groups = [[ref] for ref in gold_refs]
gold_metrics, gold_bleu, gold_rouge, gold_f1 = compute_metrics(aligned_preds, single_gold_groups)

# === Metrics: prediction vs paraphrase variants ===
variant_metrics, variant_bleu, variant_rouge, variant_f1 = compute_metrics(aligned_preds, variant_refs)

# === Print the averaged results ===
for header, summary in (
    ("\n📊 Evaluation (Prediction vs Gold):", gold_metrics),
    ("\n📊 Evaluation (Prediction vs Variants, excluding first):", variant_metrics),
):
    print(header)
    print(f"BLEU (avg)      : {summary['BLEU']:.4f}")
    print(f"ROUGE-L (avg)   : {summary['ROUGE-L']:.4f}")
    print(f"BERTScore F1    : {summary['BERTScore_F1']:.4f}")

# === Save per-sample scores to CSV (every column has one entry per aligned sample) ===
variant_bullets = [" • " + "\n • ".join(v) for v in variant_refs]
columns = {
    "sample_id": aligned_ids,
    "prediction": aligned_preds,
    "gold_reference": gold_refs,
    "variant_references": variant_bullets,
    "BLEU_gold": gold_bleu,
    "ROUGE-L_gold": gold_rouge,
    "BERTScore_F1_gold": gold_f1,
    "BLEU_variants": variant_bleu,
    "ROUGE-L_variants": variant_rouge,
    "BERTScore_F1_variants": variant_f1,
}
df = pd.DataFrame(columns)

# Score columns may hold None for samples that compute_metrics left unscored.
df.to_csv("prediction/fine_zero_automatic_scores_separated.csv", index=False)
print("📁 Saved to 'fine_zero_automatic_scores_separated.csv'")



✅ Aligned samples: 729, Skipped: 1


  0%|                                                   | 0/729 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 729/729 [01:05<00:00, 11.10it/s]
  0%|                                                   | 0/728 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 728/728 [02:48<00:00,  4.32it/s]


📊 Evaluation (Prediction vs Gold):
BLEU (avg)      : 0.0553
ROUGE-L (avg)   : 0.1986
BERTScore F1    : 0.8846

📊 Evaluation (Prediction vs Variants, excluding first):
BLEU (avg)      : 0.0966
ROUGE-L (avg)   : 0.2690
BERTScore F1    : 0.8848
📁 Saved to 'fine_zero_automatic_scores_separated.csv'





fine-tuned qwen3——few shot：

In [24]:
import json
import pandas as pd
from evaluate import load
from tqdm import tqdm
import sacrebleu

# === File paths ===
pred_file = "prediction/finetuned_fewshot_output.json"
para_file = "doctor_paraphrase_references_with_context.json"

# === Load predictions and paraphrase references ===
with open(pred_file, "r", encoding="utf-8") as f:
    predictions = json.load(f)

with open(para_file, "r", encoding="utf-8") as f:
    para_list = json.load(f)

# === Align predictions with references by list position ===
aligned_preds, gold_refs, variant_refs, aligned_ids = [], [], [], []
skipped = 0
num_predictions = len(predictions)

for i, item in enumerate(para_list):
    # A reference entry is skipped when there is no prediction at the same
    # index, or when that prediction is missing, not a string, or blank.
    pred = None
    if i < num_predictions:
        pred_item = predictions[i]
        pred = pred_item.get("generated_followup") or pred_item.get("prediction")
    if not (isinstance(pred, str) and pred.strip()):
        skipped += 1
        continue

    prediction = pred.strip()
    original = item["original"].strip()
    # Exclude the first paraphrase; the remaining ones are the variants.
    variants = item.get("paraphrases", [])[1:]

    aligned_preds.append(prediction)
    gold_refs.append(original)
    variant_refs.append(variants)
    aligned_ids.append(f'{item["dialogue_id"]}_{i}')

print(f"\n✅ Aligned samples: {len(aligned_preds)}, Skipped: {skipped}")

# Markers intended for tracking validity; not read by the code in this cell.
valid_gold, valid_variants = [], []

# === compute_metrics: per-sample scores and their averages ===
def compute_metrics(preds, refs):
    """Score predictions against per-sample reference groups.

    Samples whose reference group is empty are left unscored. For every
    other sample three scores are computed: sentence-level sacreBLEU
    divided by 100, ROUGE-L (with stemming) from the ``evaluate`` "rouge"
    metric, and the highest BERTScore F1 over the sample's references.

    Args:
        preds: Prediction strings.
        refs: Reference groups aligned with ``preds``; each group is a list
            of reference strings and may be empty.

    Returns:
        A tuple ``(metrics, bleu, rouge_l, bert_f1)``. ``metrics`` maps
        ``"BLEU"``, ``"ROUGE-L"`` and ``"BERTScore_F1"`` to the mean over
        the scored samples, or NaN when no sample has references. The three
        lists have ``len(preds)`` entries in input order and hold ``None``
        for unscored samples.
    """
    bleu_scores_all = [None] * len(preds)
    rouge_scores_all = [None] * len(preds)
    bert_scores_all = [None] * len(preds)

    # Keep only samples with at least one reference, remembering the
    # original position so scores can be written back in input order.
    valid_preds, valid_refs, valid_idx = [], [], []
    for idx, (p, r) in enumerate(zip(preds, refs)):
        if r:
            valid_preds.append(p)
            valid_refs.append(r)
            valid_idx.append(idx)

    # Nothing to score: report NaN averages instead of dividing by zero,
    # and do not load the metrics at all.
    if not valid_idx:
        nan = float("nan")
        metrics = {"BLEU": nan, "ROUGE-L": nan, "BERTScore_F1": nan}
        return metrics, bleu_scores_all, rouge_scores_all, bert_scores_all

    rouge = load("rouge")
    bertscore = load("bertscore")

    # BLEU: sacreBLEU reports 0-100, so divide by 100.
    bleu_scores = [
        sacrebleu.sentence_bleu(p, r).score / 100
        for p, r in zip(valid_preds, valid_refs)
    ]

    # ROUGE: only ROUGE-L is used below, so it is the only type requested.
    rouge_l = rouge.compute(
        predictions=valid_preds,
        references=valid_refs,
        use_stemmer=True,
        use_aggregator=False,
        rouge_types=["rougeL"],
    )["rougeL"]

    # BERTScore: compare the prediction with each reference in its group
    # and keep the best F1.
    F1_max = []
    for pred, ref_group in tqdm(zip(valid_preds, valid_refs), total=len(valid_preds)):
        f1_per_ref = bertscore.compute(
            predictions=[pred] * len(ref_group),
            references=ref_group,
            lang="en",
            rescale_with_baseline=False,
            use_fast_tokenizer=True,
        )["f1"]
        F1_max.append(max(f1_per_ref))

    # Write the scores back to their original positions; unscored positions
    # keep None.
    for i, idx in enumerate(valid_idx):
        bleu_scores_all[idx] = bleu_scores[i]
        rouge_scores_all[idx] = rouge_l[i]
        bert_scores_all[idx] = F1_max[i]

    metrics = {
        "BLEU": sum(bleu_scores) / len(bleu_scores),
        "ROUGE-L": sum(rouge_l) / len(rouge_l),
        "BERTScore_F1": sum(F1_max) / len(F1_max),
    }

    return metrics, bleu_scores_all, rouge_scores_all, bert_scores_all

# === Metrics: prediction vs gold (each gold sentence is a one-item reference group) ===
single_gold_groups = [[ref] for ref in gold_refs]
gold_metrics, gold_bleu, gold_rouge, gold_f1 = compute_metrics(aligned_preds, single_gold_groups)

# === Metrics: prediction vs paraphrase variants ===
variant_metrics, variant_bleu, variant_rouge, variant_f1 = compute_metrics(aligned_preds, variant_refs)

# === Print the averaged results ===
for header, summary in (
    ("\n📊 Evaluation (Prediction vs Gold):", gold_metrics),
    ("\n📊 Evaluation (Prediction vs Variants, excluding first):", variant_metrics),
):
    print(header)
    print(f"BLEU (avg)      : {summary['BLEU']:.4f}")
    print(f"ROUGE-L (avg)   : {summary['ROUGE-L']:.4f}")
    print(f"BERTScore F1    : {summary['BERTScore_F1']:.4f}")

# === Save per-sample scores to CSV (every column has one entry per aligned sample) ===
variant_bullets = [" • " + "\n • ".join(v) for v in variant_refs]
columns = {
    "sample_id": aligned_ids,
    "prediction": aligned_preds,
    "gold_reference": gold_refs,
    "variant_references": variant_bullets,
    "BLEU_gold": gold_bleu,
    "ROUGE-L_gold": gold_rouge,
    "BERTScore_F1_gold": gold_f1,
    "BLEU_variants": variant_bleu,
    "ROUGE-L_variants": variant_rouge,
    "BERTScore_F1_variants": variant_f1,
}
df = pd.DataFrame(columns)

# Score columns may hold None for samples that compute_metrics left unscored.
df.to_csv("prediction/fine_few_automatic_scores_separated.csv", index=False)
print("📁 Saved to 'fine_few_automatic_scores_separated.csv'")



✅ Aligned samples: 716, Skipped: 14


  0%|                                                   | 0/716 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 716/716 [01:03<00:00, 11.25it/s]
  0%|                                                   | 0/715 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 715/715 [02:42<00:00,  4.39it/s]


📊 Evaluation (Prediction vs Gold):
BLEU (avg)      : 0.0628
ROUGE-L (avg)   : 0.2124
BERTScore F1    : 0.8878

📊 Evaluation (Prediction vs Variants, excluding first):
BLEU (avg)      : 0.0931
ROUGE-L (avg)   : 0.2724
BERTScore F1    : 0.8865
📁 Saved to 'fine_zero_automatic_scores_separated.csv'





In [45]:
import json
import pandas as pd
from evaluate import load
from tqdm import tqdm
import sacrebleu

# === File path ===
para_file = "doctor_paraphrase_references_with_context.json"

# === Load the reference data ===
with open(para_file, "r", encoding="utf-8") as f:
    para_list = json.load(f)

# === Collect gold sentences and their paraphrase variants (first paraphrase excluded) ===
gold_refs, variant_refs, aligned_ids = [], [], []
skipped = 0

for i, item in enumerate(para_list):
    original = item.get("original", "").strip()
    paraphrases = item.get("paraphrases", [])

    # An entry is usable only with a non-blank gold sentence and at least
    # two paraphrases, so that one variant remains after dropping the first.
    if original and len(paraphrases) >= 2:
        gold_refs.append(original)
        # Drop the first paraphrase (assumed to be the auto-generated one
        # closest to gold).
        variant_refs.append(paraphrases[1:])
        aligned_ids.append(f'{item["dialogue_id"]}_{i}')
    else:
        skipped += 1

print(f"\n✅ Aligned samples: {len(gold_refs)}, Skipped: {skipped}")

# === Compute evaluation metrics ===
def compute_similarity(gold_refs, variant_groups):
    """Measure how close each gold sentence is to its paraphrase variants.

    For every (gold, variants) pair the gold sentence is treated as the
    hypothesis and the variants as references.

    Args:
        gold_refs: Gold sentences.
        variant_groups: Lists of variant strings aligned with ``gold_refs``.

    Returns:
        Three lists with one entry per pair, in input order: sentence-level
        sacreBLEU divided by 100, the mean ROUGE-L over the variants, and
        the maximum BERTScore F1 over the variants.
    """
    rouge_metric = load("rouge")
    bert_metric = load("bertscore")

    per_sample_bleu, per_sample_rouge, per_sample_bert = [], [], []

    pairs = tqdm(zip(gold_refs, variant_groups), total=len(gold_refs))
    for reference, group in pairs:
        # The gold sentence repeated once per variant, for pairwise scoring.
        repeated_gold = [reference] * len(group)

        # BLEU: one score against all variants; sacreBLEU reports 0-100.
        per_sample_bleu.append(sacrebleu.sentence_bleu(reference, group).score / 100)

        # ROUGE-L: scored against each variant separately, then averaged.
        rouge_l_values = rouge_metric.compute(
            predictions=repeated_gold,
            references=group,
            use_stemmer=True,
            use_aggregator=False,
            rouge_types=["rougeL"],
        )["rougeL"]
        per_sample_rouge.append(sum(rouge_l_values) / len(rouge_l_values))

        # BERTScore: scored against each variant separately, best F1 kept.
        f1_values = bert_metric.compute(
            predictions=repeated_gold,
            references=group,
            lang="en",
            rescale_with_baseline=False,
            use_fast_tokenizer=True,
        )["f1"]
        per_sample_bert.append(max(f1_values))

    return per_sample_bleu, per_sample_rouge, per_sample_bert

# === Run the evaluation ===
bleu_scores, rouge_scores, bert_f1_scores = compute_similarity(gold_refs, variant_refs)

# === Print the averaged results ===
print("\n📊 Evaluation (Gold vs Variants):")
mean_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"BLEU (avg)      : {mean_bleu:.4f}")
mean_rouge = sum(rouge_scores) / len(rouge_scores)
print(f"ROUGE-L (avg)   : {mean_rouge:.4f}")
mean_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
print(f"BERTScore F1    : {mean_bert_f1:.4f}")

# === Save the per-sample results ===
variant_bullets = [" • " + "\n • ".join(v) for v in variant_refs]
columns = {
    "sample_id": aligned_ids,
    "gold_reference": gold_refs,
    "variant_references": variant_bullets,
    "BLEU_gold_vs_variants": bleu_scores,
    "ROUGE-L_gold_vs_variants": rouge_scores,
    "BERTScore_F1_gold_vs_variants": bert_f1_scores,
}
df = pd.DataFrame(columns)

df.to_csv("prediction/gold_vs_variants_similarity_scores.csv", index=False)
print("📁 Saved to 'prediction/gold_vs_variants_similarity_scores.csv'")



✅ Aligned samples: 729, Skipped: 1


  0%|                                                   | 0/729 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|█████████████████████████████████████████| 729/729 [02:34<00:00,  4.73it/s]


📊 Evaluation (Gold vs Variants):
BLEU (avg)      : 0.2494
ROUGE-L (avg)   : 0.2776
BERTScore F1    : 0.9191
📁 Saved to 'prediction/gold_vs_variants_similarity_scores.csv'



