Convert the student Excel data and save all dialogues as JSON:

In [65]:
import os
import pandas as pd
import json
import re

def find_matching_column(columns, keyword):
    """Return the first column name that contains ``keyword``, ignoring case.

    Args:
        columns: Iterable of column-name strings, searched in order.
        keyword: Substring to look for.

    Returns:
        The first matching column name, or ``None`` when no column matches.
    """
    return next(
        (name for name in columns if keyword.lower() in name.lower()),
        None,
    )

def parse_dialogue(text):
    """Split a raw transcript into a list of speaker turns.

    A turn starts at every ``C:`` or ``P:`` label and runs up to the next
    label (or the end of the text). Text before the first label is ignored.

    Args:
        text: Raw transcript. Any value that is not a non-blank string
            yields an empty list.

    Returns:
        A list of dicts with the keys ``"role"`` (``"assistant"`` for ``C``,
        ``"user"`` for ``P``), ``"speaker"`` (``"doctor"`` / ``"patient"``)
        and ``"content"``. The content is ``"** "`` followed by the stripped
        turn text, which still begins with its ``C:`` / ``P:`` label.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    transcript = text.strip()

    # Locate the labels directly instead of using
    # re.split(r'\n*(?=(C|P):)', ...): on Python 3.7+ that pattern matches a
    # second time (zero-width) right after each newline run it consumes, so
    # every newline-separated turn was followed by a spurious empty turn.
    labels = list(re.finditer(r'(C|P):', transcript))
    boundaries = [label.start() for label in labels] + [len(transcript)]

    turns = []
    for label, start, end in zip(labels, boundaries, boundaries[1:]):
        is_clinician = label.group(1) == "C"
        # The slice starts at the label itself, so the label stays in the text.
        content = transcript[start:end].strip()
        turns.append({
            "role": "assistant" if is_clinician else "user",
            "speaker": "doctor" if is_clinician else "patient",
            "content": f"** {content}"
        })
    return turns

# Batch process files in a folder: read every CSV/TSV file, parse the three
# conversation sections of each row, and save the unique dialogues as JSON.
input_folder = "test"
output_file = "all_dialogues.json"
all_dialogues = []
seen_dialogues = set()  # JSON-serialised dialogues already kept, for de-duplication

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# all_dialogues (and any position-based ids derived from it) reproducible.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith((".csv", ".tsv")):
        continue

    filepath = os.path.join(input_folder, filename)
    sep = "\t" if filename.endswith(".tsv") else ","

    try:
        df = pd.read_csv(
            filepath,
            sep=sep,
            encoding='utf-8',
            quotechar='"',
            doublequote=True,
            escapechar="\\",
            on_bad_lines='skip',
            engine="python"
        ).fillna("")
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f"❌ Failed to load {filename}: {e}")
        continue

    df.columns = [col.strip() for col in df.columns]
    col_basic = find_matching_column(df.columns, "basic conversation")
    col_physical = find_matching_column(df.columns, "physical function")
    col_emotional = find_matching_column(df.columns, "emotional feedback")

    for _, row in df.iterrows():
        # Concatenate the sections in a fixed order: basic, physical, emotional.
        full = []
        for column in (col_basic, col_physical, col_emotional):
            # A section whose column is absent from this file adds no turns.
            if column is not None:
                full.extend(parse_dialogue(row.get(column, "")))

        if not full:
            continue

        # The serialised form is the de-duplication key.
        dialogue_str = json.dumps(full, ensure_ascii=False, sort_keys=True)
        if dialogue_str not in seen_dialogues:
            all_dialogues.append(full)
            seen_dialogues.add(dialogue_str)

# Save all unique dialogues
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(all_dialogues, f, ensure_ascii=False, indent=2)

print(f"✅ Processed {len(all_dialogues)} unique dialogues and saved to {output_file}")


✅ Processed 85 unique dialogues and saved to all_dialogues.json


Convert all test dialogues into the zero-shot input format for the fine-tuned Qwen3 model:

In [None]:
import json
import re
from tqdm import tqdm

# === File path settings ===
# input_file is read by this cell; the two output files are written by it
# (structured JSON messages and a plain-text ChatML rendering).
input_file = "all_dialogues.json"
json_output_file = "test_inputs_for_model_fixed.json"
txt_output_file = "test_inputs_for_model_chatml_fixed.txt"

# === Markdown cleanup helper ===
def clean_markdown(text):
    """Trim ``text`` and remove a leading run of asterisks plus the whitespace after it."""
    leading_marker = re.compile(r'^\*+\s*')
    trimmed = text.strip()
    return leading_marker.sub('', trimmed)

# === ChatML text rendering (for debugging / manual inspection) ===
def convert_to_chatml(system_prompt, turns):
    """Render a system prompt and dialogue turns as one ChatML string.

    Args:
        system_prompt: Text of the leading system segment.
        turns: Dicts with ``"speaker"`` and ``"content"`` keys; the
            ``"patient"`` speaker becomes ``user``, any other ``assistant``.

    Returns:
        The segments joined by newlines, ending with an open assistant
        header so that generation continues from there.
    """
    def render(role, body):
        return f"<|im_start|>{role}\n{body}\n<|im_end|>"

    segments = [render("system", system_prompt)]
    segments.extend(
        render(
            "user" if turn["speaker"] == "patient" else "assistant",
            clean_markdown(turn["content"]),
        )
        for turn in turns
    )
    segments.append("<|im_start|>assistant\n")  # generation starts here
    return "\n".join(segments)

# === OpenAI-style messages construction ===
def convert_to_messages(system_prompt, turns):
    """Build a chat ``messages`` list: the system prompt, then one message per turn.

    Args:
        system_prompt: Content of the leading system message.
        turns: Dicts with ``"speaker"`` and ``"content"`` keys; the
            ``"patient"`` speaker becomes ``user``, any other ``assistant``.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts.
    """
    head = [{"role": "system", "content": system_prompt}]
    body = [
        {
            "role": "user" if turn["speaker"] == "patient" else "assistant",
            "content": clean_markdown(turn["content"]),
        }
        for turn in turns
    ]
    return head + body

# === System prompt ===
# The fragments are joined with single spaces to form one paragraph.
system_prompt = " ".join([
    "You are an attentive and empathetic clinician engaging in a mobility-focused consultation.",
    "The conversation pertains to one of the following ICF categories:",
    "D420 (Transferring Oneself), D445 (Hand and Arm Use), D465 (Moving Around Using Equipment),",
    "and D470 (Using Transportation).",
    "Based on the patient's previous response, generate a clinically appropriate follow-up question",
    "that helps clarify functional limitations or explore emotional and contextual factors.",
    "The question should be relevant, concise, and aligned with clinical reasoning. /no_think.",
])

# === Load data ===
with open(input_file, "r", encoding="utf-8") as f:
    dialogues = json.load(f)

generation_inputs = []
chatml_blocks = []

# Dialogue ids are 1-based positions in the input file.
progress = tqdm(
    enumerate(dialogues, start=1),
    total=len(dialogues),
    desc="Generating test prompts",
)
for dialogue_id, dialogue in progress:
    doctor_positions = [
        pos for pos, turn in enumerate(dialogue) if turn.get("speaker") == "doctor"
    ]

    # One prompt per doctor turn, starting from the second doctor turn; the
    # prompt holds every turn that precedes the doctor turn in question.
    for target_idx in doctor_positions[1:]:
        history = dialogue[:target_idx]
        if history:
            # Structured output
            generation_inputs.append({
                "dialogue_id": dialogue_id,
                "turn_index": target_idx,
                "messages": convert_to_messages(system_prompt, history)
            })

            # ChatML text for display
            chatml_blocks.append(convert_to_chatml(system_prompt, history))

# === Save JSON for inference ===
with open(json_output_file, "w", encoding="utf-8") as f_json:
    json.dump(generation_inputs, f_json, ensure_ascii=False, indent=2)

# === Save plain-text ChatML for manual inspection ===
with open(txt_output_file, "w", encoding="utf-8") as f_txt:
    f_txt.write("\n\n".join(chatml_blocks))

print(f"✅ JSON file saved to: {json_output_file}")
print(f"✅ ChatML .txt file saved to: {txt_output_file}")
print(f"✅ Total test prompts: {len(generation_inputs)}")


Test with JSON format as input:

In [85]:
import json
from tqdm import tqdm
from openai import OpenAI

# Local OpenAI API proxy address
client = OpenAI(
    api_key="cltl",  # Replace with your actual key if needed
    base_url="http://localhost:8000/v1",
)

# Input/output paths
input_path = "test_inputs_for_model_fixed.json"
output_path = "finetuned_qwen3_0shout_test_output.json"
# Fill in according to your deployed model name
model_name = "Qwen3_8B_Merged-8.2B-Q8_0.gguf"

# Load the test data
with open(input_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)

results = []
for item in tqdm(test_data, desc="Generating"):
    messages = item["messages"]

    # Any failure of the request is recorded on the item instead of aborting
    # the run, so every input still gets a row in the output.
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.7,
            max_tokens=256
        )
        followup = response.choices[0].message.content
    except Exception as exc:
        followup = f"[ERROR] {str(exc)}"

    item["generated_followup"] = followup
    results.append(item)

# Save the results
with open(output_path, "w", encoding="utf-8") as f_out:
    json.dump(results, f_out, ensure_ascii=False, indent=2)

print(f"✅ Output saved to {output_path}")


Generating: 100%|███████████████████████████| 730/730 [1:57:35<00:00,  9.66s/it]

✅ Output saved to finetuned_qwen3_0shout_test_output.json





Test with ChatML as input:

In [17]:
import json
from tqdm import tqdm
from openai import OpenAI

# Local OpenAI API proxy address
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="cltl"  # Replace if needed
)

# File paths
input_path = "test_inputs_for_model_chatml_fixed.txt"
output_path = "3finetuned_qwen3_chatml_test_output.json"
model_name = "Qwen3_8B_Merged-8.2B-Q8_0.gguf"

# Every prompt in the file starts with this system header.
prompt_header = "<|im_start|>system\n"

# Read the ChatML test prompts.
with open(input_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Prompts are joined by "\n\n" and each one ends with "<|im_start|>assistant\n",
# so a boundary is a run of three newlines. Splitting on a bare "\n\n" cut that
# run in the wrong place (the prompt lost the newline that ends the assistant
# header and the next prompt gained a leading newline) and also split on any
# blank line inside a turn. Split on the full boundary instead.
first_chunk, *other_chunks = raw_text.split("\n\n" + prompt_header)
blocks = [first_chunk] + [prompt_header + chunk for chunk in other_chunks]
blocks = [block for block in blocks if block.strip()]  # skip empty input

results = []
for i, prompt_text in enumerate(tqdm(blocks, desc="Generating")):
    # A failed request is recorded as an "[ERROR] ..." row so that ids stay
    # aligned with the prompts in the input file.
    try:
        response = client.completions.create(
            model=model_name,
            prompt=prompt_text,
            temperature=0.7,
            max_tokens=256,
        )
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": response.choices[0].text.strip()
        }
    except Exception as e:
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": f"[ERROR] {str(e)}"
        }

    results.append(result)

# Save the results
with open(output_path, "w", encoding="utf-8") as f_out:
    json.dump(results, f_out, ensure_ascii=False, indent=2)

print(f"✅ Output saved to {output_path}")


Generating: 100%|███████████████████████████| 730/730 [2:02:18<00:00, 10.05s/it]

✅ Output saved to 3finetuned_qwen3_chatml_test_output.json





Convert all test dialogues into the few-shot input format:

In [46]:
import json
import re
from tqdm import tqdm

# === File path settings ===
# input_file is read by this cell; the two output files are written by it
# (plain-text ChatML prompts and structured JSON messages).
input_file = "all_dialogues.json"
output_txt_file = "test_inputs_with_fewshot_chatml.txt"
output_json_file = "test_inputs_with_fewshot.json"

# === Example dialogues (few-shot) ===
# Each example is a list of chat messages whose content keeps the "C:" / "P:"
# speaker label. Both lists are inserted into every few-shot prompt.
# NOTE(review): the blank lines inside each list appear to separate the
# basic / physical-function / emotional parts of the conversation — confirm.
example1 = [
    {"role": "assistant", "content": "C: Hi, how are you doing today?"},
    {"role": "user", "content": "P: I'm doing pretty well. I walked my dog to the park this morning, and we played for a bit."},
    {"role": "assistant", "content": "C: That sounds lovely! What's your dog’s name?"},
    {"role": "user", "content": "P: His name is Timo."},
    {"role": "assistant", "content": "C: Timo—what a great name! What did you two play?"},
    {"role": "user", "content": "P: We played frisbee fetch. It was fun."},
    {"role": "assistant", "content": "C: That’s good to hear. Sounds like your arms are feeling better lately."},
    {"role": "user", "content": "P: Yeah, I’m really able to throw the frisbee now, which is great."},

    {"role": "assistant", "content": "C: That’s encouraging. Did you feel any discomfort when throwing the frisbee?"},
    {"role": "user", "content": "P: Just a little. After a few throws, my shoulder felt a bit sore, but nothing like before."},
    {"role": "assistant", "content": "C: Did you notice any limitation in your range of motion when reaching back to throw?"},
    {"role": "user", "content": "P: A little when I raised my arm overhead. It's not painful, but it feels a bit tight."},
    {"role": "assistant", "content": "C: I see. How about twisting your arm—for example, opening bottles or turning door handles?"},
    {"role": "user", "content": "P: Doorknobs are fine, but opening bottles is still a bit hard. My wrist doesn’t feel very strong."},
    {"role": "assistant", "content": "C: Got it. What about carrying things like shopping bags or pouring from a kettle?"},
    {"role": "user", "content": "P: Heavier stuff is still a challenge. I had some trouble lifting a full kettle yesterday."},
    {"role": "assistant", "content": "C: And how about reaching into high cabinets or stretching your arm out to grab things?"},
    {"role": "user", "content": "P: If I reach too far, especially upward, I feel a pulling near my elbow. So I try not to overdo it."},
    {"role": "assistant", "content": "C: That’s wise. Are there daily activities you’ve had to adapt or take slower?"},
    {"role": "user", "content": "P: Folding laundry takes longer. Lifting and twisting both arms just wears me out faster."},

    {"role": "assistant", "content": "C: How do you feel about the progress you’ve made so far?"},
    {"role": "user", "content": "P: Honestly, it’s a relief. A few weeks ago I couldn’t even lift a mug comfortably, so this feels like a big win."},
    {"role": "assistant", "content": "C: That’s great to hear. Do you ever feel discouraged on harder days?"},
    {"role": "user", "content": "P: Yeah, definitely. Some days my arms feel weaker, and I start worrying that I might be slipping backwards."},
    {"role": "assistant", "content": "C: That’s totally understandable. Have you found anything that helps you stay motivated when that happens?"},
    {"role": "user", "content": "P: I try to focus on small wins, like managing the laundry or playing with Timo. It reminds me that I’m getting stronger."},
    {"role": "assistant", "content": "C: That’s a great approach. And it sounds like Timo is good at keeping your spirits up."},
    {"role": "user", "content": "P: He really is. He keeps me moving—and smiling."}
]
example2 = [
    {"role": "assistant", "content": "C: Hi there! Before we get started, could I have your name and age, please?"},
    {"role": "user", "content": "P: My name is Voila. I’m 70 years old."},
    {"role": "assistant", "content": "C: No way! You don’t look 70."},
    {"role": "user", "content": "P: Aw, thank you! That’s so sweet of you to say."},
    {"role": "assistant", "content": "C: You're very welcome. Now, how can I help you today?"},
    {"role": "user", "content": "P: Well, my walker isn’t really working for me anymore."},
    {"role": "assistant", "content": "C: I see. Do you want to get a new one, or are you thinking of trying something else?"},
    {"role": "user", "content": "P: Actually, I think I might need a wheelchair now."},

    {"role": "assistant", "content": "C: I'm sorry to hear that. I know it can be tough. How have you been managing with the walker so far?"},
    {"role": "user", "content": "P: It’s been okay, but lately I’ve needed more support, especially when I’m outside or walking longer distances."},
    {"role": "assistant", "content": "C: What kind of surfaces are most challenging with the walker—like uneven ground, slopes, or indoors?"},
    {"role": "user", "content": "P: Pavement with cracks and any uphill sections are really hard. Indoors is a bit easier, but carpets can make the wheels stick."},
    {"role": "assistant", "content": "C: That makes sense. Do you ever find yourself getting stuck or needing to stop and rest?"},
    {"role": "user", "content": "P: Yes, quite often. I get tired halfway through my block, and my hands start to ache from gripping the handles."},
    {"role": "assistant", "content": "C: Have you tried using any other mobility devices before—like a rollator with a seat or a powered wheelchair?"},
    {"role": "user", "content": "P: I’ve used a rollator once during rehab, but it was still hard to push. I haven’t tried powered devices yet."},
    {"role": "assistant", "content": "C: What about turning or navigating tight spaces? Has that become more difficult recently?"},
    {"role": "user", "content": "P: Definitely. I used to be able to make sharp turns in the hallway, but now I bump into the wall or have to back up and try again."},
    {"role": "assistant", "content": "C: And when you go out—like to the grocery store—do you feel comfortable using the walker, or do you hesitate?"},
    {"role": "user", "content": "P: I’ve started avoiding longer outings unless someone’s with me. I just don’t trust myself to get through them anymore."},
    {"role": "assistant", "content": "C: That’s completely understandable. Based on what you’re describing, a self-propelled or even lightweight wheelchair might offer more stability and less strain."},
    {"role": "user", "content": "P: I’ve been thinking the same. I want more freedom to move around, not less."},

    {"role": "assistant", "content": "C: How are you feeling about needing to make that switch to a wheelchair?"},
    {"role": "user", "content": "P: Honestly, it’s a little overwhelming. I do not like the fact that I need a wheelchair."},
    {"role": "assistant", "content": "C: That’s a really common feeling. But a lot of people find the right equipment actually helps them feel more in control and confident."},
    {"role": "user", "content": "P: I hope so. I’m just afraid it’ll make people see me differently, like I’m fragile."},
    {"role": "assistant", "content": "C: I hear you. But using the right tool for your needs doesn’t change who you are—it just helps you move through life more safely."},
    {"role": "user", "content": "P: Yeah, I guess that’s true."},
    {"role": "assistant", "content": "C: Do you have support around you for this transition—family, friends, maybe someone who could help you test out different options?"},
    {"role": "user", "content": "P: Yes, I have two daughters. They’ve been very encouraging."},
    {"role": "assistant", "content": "C: That’s wonderful to hear. You’re not alone in this, and we’ll work together to find the setup that best supports your independence."}
]

# === System prompt ===
# Adjacent string literals are concatenated with nothing in between, so every
# fragment except the last ends with a space. Without those spaces the text
# contained "structure.Then", "factors.The" and "reasoning./no_think", fusing
# sentences together and gluing the trailing "/no_think" marker to a word.
system_prompt = (
    "You are an attentive and empathetic clinician engaging in a mobility-focused consultation. "
    "The conversation pertains to one of the following ICF categories: "
    "D420 (Transferring Oneself), D445 (Hand and Arm Use), D465 (Moving Around Using Equipment), "
    "and D470 (Using Transportation). "
    "Below are two example conversations between clinician (C) and patient (P). Use them as reference for tone and structure. "
    "Then you will see a real consultation. Based on the patient's previous response, generate a clinically appropriate follow-up question "
    "that helps clarify functional limitations or explore emotional and contextual factors. "
    "The question should be relevant, concise, and aligned with clinical reasoning. "
    "/no_think"
)

# === Markdown cleanup helper ===
def clean_markdown(text):
    """Trim ``text`` and remove a leading run of asterisks plus the whitespace after it."""
    leading_marker = re.compile(r'^\*+\s*')
    trimmed = text.strip()
    return leading_marker.sub('', trimmed)

# === ChatML text construction ===
def build_chatml(system_prompt, fewshot_turns, real_turns):
    """Render a few-shot prompt as one ChatML string.

    Args:
        system_prompt: Text of the leading system segment.
        fewshot_turns: Sequence whose first two items are example dialogues
            (lists of dicts with ``"role"`` and ``"content"`` keys).
        real_turns: Turns of the real consultation (dicts with ``"speaker"``
            and ``"content"`` keys); ``"patient"`` becomes ``user``, any
            other speaker ``assistant``.

    Returns:
        The segments joined by newlines, ending with an open assistant
        header so that the model starts generating from there.
    """
    def segment(role, body):
        return f"<|im_start|>{role}\n{body}\n<|im_end|>"

    lines = [segment("system", system_prompt)]

    # Exactly two examples are rendered, each introduced by a system segment.
    for number in (1, 2):
        lines.append(segment("system", f"Example {number}:"))
        for turn in fewshot_turns[number - 1]:
            role = "user" if turn["role"] == "user" else "assistant"
            lines.append(segment(role, clean_markdown(turn['content'])))

    lines.append(segment("system", "Now here is the real consultation:"))
    for turn in real_turns:
        role = "user" if turn["speaker"] == "patient" else "assistant"
        lines.append(segment(role, clean_markdown(turn['content'])))

    lines.append("<|im_start|>assistant\n")  # the model starts generating here
    return "\n".join(lines)

# === OpenAI-style structured format ===
def build_messages(system_prompt, fewshot_turns, real_turns):
    """Build the few-shot chat ``messages`` list.

    Args:
        system_prompt: Content of the leading system message.
        fewshot_turns: Sequence whose first two items are example dialogues
            (lists of message dicts); they are inserted as they are.
        real_turns: Turns of the real consultation (dicts with ``"speaker"``
            and ``"content"`` keys); ``"patient"`` becomes ``user``, any
            other speaker ``assistant``.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts.
    """
    def system_note(text):
        return {"role": "system", "content": text}

    messages = [system_note(system_prompt)]

    # Exactly two examples are inserted, each introduced by a system message.
    for number in (1, 2):
        messages.append(system_note(f"Example {number}:"))
        messages.extend(fewshot_turns[number - 1])

    messages.append(system_note("Now here is the real consultation:"))
    messages += [
        {
            "role": "user" if turn["speaker"] == "patient" else "assistant",
            "content": clean_markdown(turn["content"]),
        }
        for turn in real_turns
    ]
    return messages

# === Load data ===
with open(input_file, "r", encoding="utf-8") as f:
    dialogues = json.load(f)

chatml_blocks = []
json_blocks = []

# Dialogue ids are 1-based positions in the input file.
progress = tqdm(
    enumerate(dialogues, start=1),
    total=len(dialogues),
    desc="Generating few-shot prompts",
)
for dialogue_id, dialogue in progress:
    doctor_positions = [
        pos for pos, turn in enumerate(dialogue) if turn.get("speaker") == "doctor"
    ]

    # Start from the second doctor turn; each prompt holds every turn that
    # precedes the doctor turn in question.
    for target_idx in doctor_positions[1:]:
        history = dialogue[:target_idx]
        if history:
            fewshot_examples = [example1, example2]
            chatml_text = build_chatml(system_prompt, fewshot_examples, history)
            json_structured = {
                "dialogue_id": dialogue_id,
                "turn_index": target_idx,
                "messages": build_messages(system_prompt, fewshot_examples, history)
            }

            chatml_blocks.append(chatml_text)
            json_blocks.append(json_structured)

# === Save ChatML and JSON ===
with open(output_txt_file, "w", encoding="utf-8") as f_txt:
    f_txt.write("\n\n".join(chatml_blocks))

with open(output_json_file, "w", encoding="utf-8") as f_json:
    json.dump(json_blocks, f_json, ensure_ascii=False, indent=2)

print(f"✅ ChatML saved to: {output_txt_file}")
print(f"✅ JSON saved to: {output_json_file}")
print(f"✅ Total prompts: {len(chatml_blocks)}")


Generating few-shot prompts: 100%|████████████| 85/85 [00:00<00:00, 2438.73it/s]

✅ ChatML saved to: test_inputs_with_fewshot_chatml.txt
✅ JSON saved to: test_inputs_with_fewshot.json
✅ Total prompts: 730





Test with ChatML format as input:

In [None]:
import json
from tqdm import tqdm
from openai import OpenAI

# Local OpenAI API proxy address
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="cltl"  # Replace if needed
)

# File paths
input_path = "test_inputs_with_fewshot_chatml.txt"
output_path = "fewshot_finetuned_qwen3_output.json"
model_name = "Qwen3_8B_Merged-8.2B-Q8_0.gguf"

# Every prompt in the file starts with this system header.
prompt_header = "<|im_start|>system\n"

# Read the ChatML test prompts.
with open(input_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Prompts are joined by "\n\n" and each one ends with "<|im_start|>assistant\n",
# so a boundary is a run of three newlines. Splitting on a bare "\n\n" cut that
# run in the wrong place (the prompt lost the newline that ends the assistant
# header and the next prompt gained a leading newline) and also split on any
# blank line inside a turn. The system segments inside a prompt follow a
# single newline, so only real boundaries match the separator used here.
first_chunk, *other_chunks = raw_text.split("\n\n" + prompt_header)
blocks = [first_chunk] + [prompt_header + chunk for chunk in other_chunks]
blocks = [block for block in blocks if block.strip()]  # skip empty input

results = []
for i, prompt_text in enumerate(tqdm(blocks, desc="Generating")):
    # A failed request is recorded as an "[ERROR] ..." row so that ids stay
    # aligned with the prompts in the input file.
    try:
        response = client.completions.create(
            model=model_name,
            prompt=prompt_text,
            temperature=0.7,
            max_tokens=256,
        )
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": response.choices[0].text.strip()
        }
    except Exception as e:
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": f"[ERROR] {str(e)}"
        }

    results.append(result)

# Save the results
with open(output_path, "w", encoding="utf-8") as f_out:
    json.dump(results, f_out, ensure_ascii=False, indent=2)

print(f"✅ Output saved to {output_path}")

After changing one of the examples:

In [None]:
import json
from tqdm import tqdm
from openai import OpenAI

# Local OpenAI API proxy address
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="cltl"  # Replace if needed
)

# File paths
input_path = "test_inputs_with_fewshot_chatml.txt"
output_path = "fewshot_finetuned_qwen3_chatml_test_output.json"
model_name = "Qwen3_8B_Merged-8.2B-Q8_0.gguf"

# Every prompt in the file starts with this system header.
prompt_header = "<|im_start|>system\n"

# Read the ChatML test prompts.
with open(input_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Prompts are joined by "\n\n" and each one ends with "<|im_start|>assistant\n",
# so a boundary is a run of three newlines. Splitting on a bare "\n\n" cut that
# run in the wrong place (the prompt lost the newline that ends the assistant
# header and the next prompt gained a leading newline) and also split on any
# blank line inside a turn. The system segments inside a prompt follow a
# single newline, so only real boundaries match the separator used here.
first_chunk, *other_chunks = raw_text.split("\n\n" + prompt_header)
blocks = [first_chunk] + [prompt_header + chunk for chunk in other_chunks]
blocks = [block for block in blocks if block.strip()]  # skip empty input

results = []
for i, prompt_text in enumerate(tqdm(blocks, desc="Generating")):
    # A failed request is recorded as an "[ERROR] ..." row so that ids stay
    # aligned with the prompts in the input file.
    try:
        response = client.completions.create(
            model=model_name,
            prompt=prompt_text,
            temperature=0.7,
            max_tokens=256,
        )
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": response.choices[0].text.strip()
        }
    except Exception as e:
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": f"[ERROR] {str(e)}"
        }

    results.append(result)

# Save the results
with open(output_path, "w", encoding="utf-8") as f_out:
    json.dump(results, f_out, ensure_ascii=False, indent=2)

print(f"✅ Output saved to {output_path}")


Run it once more:

In [23]:
import json
from tqdm import tqdm
from openai import OpenAI

# Local OpenAI API proxy address
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="cltl"  # Replace if needed
)

# File paths
input_path = "test_inputs_with_fewshot_chatml.txt"
output_path = "22fewshot_finetuned_qwen3_chatml_test_output.json"
model_name = "Qwen3_8B_Merged-8.2B-Q8_0.gguf"

# Every prompt in the file starts with this system header.
prompt_header = "<|im_start|>system\n"

# Read the ChatML test prompts.
with open(input_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Prompts are joined by "\n\n" and each one ends with "<|im_start|>assistant\n",
# so a boundary is a run of three newlines. Splitting on a bare "\n\n" cut that
# run in the wrong place (the prompt lost the newline that ends the assistant
# header and the next prompt gained a leading newline) and also split on any
# blank line inside a turn. The system segments inside a prompt follow a
# single newline, so only real boundaries match the separator used here.
first_chunk, *other_chunks = raw_text.split("\n\n" + prompt_header)
blocks = [first_chunk] + [prompt_header + chunk for chunk in other_chunks]
blocks = [block for block in blocks if block.strip()]  # skip empty input

results = []
for i, prompt_text in enumerate(tqdm(blocks, desc="Generating")):
    # A failed request is recorded as an "[ERROR] ..." row so that ids stay
    # aligned with the prompts in the input file.
    try:
        response = client.completions.create(
            model=model_name,
            prompt=prompt_text,
            temperature=0.7,
            max_tokens=256,
        )
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": response.choices[0].text.strip()
        }
    except Exception as e:
        result = {
            "id": i + 1,
            "chatml_prompt": prompt_text,
            "generated_followup": f"[ERROR] {str(e)}"
        }

    results.append(result)

# Save the results
with open(output_path, "w", encoding="utf-8") as f_out:
    json.dump(results, f_out, ensure_ascii=False, indent=2)

print(f"✅ Output saved to {output_path}")


Generating: 100%|███████████████████████████| 730/730 [1:45:15<00:00,  8.65s/it]


✅ Output saved to 22fewshot_finetuned_qwen3_chatml_test_output.json
