In [1]:
import json

def format_rjua_entry(record):
    """Turn one RJUA record into a prompt/chosen training pair.

    Args:
        record (dict): One parsed JSON object. The fields "question",
            "context", "answer", "disease" and "advice" are read; a field
            that is missing or JSON null is treated as an empty string.

    Returns:
        dict: {"prompt": <question>, "chosen": <formatted text>} where the
        context is wrapped in <think> tags and followed by the answer,
        disease and advice.
    """
    def text(key):
        # dict.get(key, "") still returns None for an explicit JSON null, and
        # the f-string below would then embed the literal text "None".
        value = record.get(key)
        return "" if value is None else value

    return {
        "prompt": text("question"),
        "chosen": (
            f"<think>{text('context')}</think>，{text('answer')}，"
            f"症狀為{text('disease')}，建議：{text('advice')}"
        ),
    }


# Accumulates the formatted entries from every input file
output_data = []

# Inputs are read line by line, one JSON object per non-empty line
file_paths = ["RJUA_train.json", "RJUA_valid.json", "RJUA_test.json"]

for file_path in file_paths:
    skipped = 0
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                stripped = line.strip()
                if not stripped:
                    continue  # blank lines carry no record

                # A single bad record is reported with its line number and
                # skipped, so it no longer discards the rest of the file.
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"Skipping {file_path} line {line_number}: invalid JSON ({e})")
                    skipped += 1
                    continue
                if not isinstance(record, dict):
                    print(f"Skipping {file_path} line {line_number}: expected a JSON object")
                    skipped += 1
                    continue

                output_data.append(format_rjua_entry(record))

        if skipped:
            print(f"Processed {file_path} with {skipped} invalid line(s) skipped")
        else:
            print(f"Successfully processed {file_path}")

    except (OSError, UnicodeError) as e:
        # Only I/O and decoding failures are expected here; anything else is a
        # programming error and should surface instead of being printed away.
        print(f"Error processing file {file_path}: {e}")

# Write the collected entries as one pretty-printed UTF-8 JSON array
try:
    with open("output.json", 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)
    print("Successfully created output.json")
except OSError as e:
    print(f"Error writing output file: {e}")


Successfully processed RJUA_train.json
Successfully processed RJUA_valid.json
Successfully processed RJUA_test.json
Successfully created output.json


In [3]:
import json
import os

# Input and output locations
input_path = "./complete_data/medical.json"
output_path = "./complete_data/medical_modified.json"

# Fail early with an explicit message when the input file is absent
if not os.path.exists(input_path):
    raise FileNotFoundError(f"找不到檔案：{input_path}")

# Load the raw records
with open(input_path, "r", encoding="utf-8") as src:
    data = json.load(src)

# Reshape every record into prompt / chosen / reject:
#   - "chosen" is the Complex_CoT wrapped in <think> tags, a space, then the Response
#   - "reject" is carried over unchanged (empty string when absent)
modified_data = [
    {
        "prompt": item.get("Question", ""),
        "chosen": f"<think>{item.get('Complex_CoT', '')}</think> {item.get('Response', '')}",
        "reject": item.get("reject", ""),
    }
    for item in data
]

# Save the reshaped records as pretty-printed UTF-8 JSON
with open(output_path, "w", encoding="utf-8") as dst:
    json.dump(modified_data, dst, ensure_ascii=False, indent=2)

print(f"✅ 處理完成，結果已儲存到：{output_path}")

✅ 處理完成，結果已儲存到：./complete_data/medical_modified.json


In [6]:
import json

# Load the two datasets to merge; both are expected to use prompt/chosen keys.
with open('./complete_data/RJUA.json', 'r', encoding='utf-8') as f1:
    rujas = json.load(f1)

# medical.json is the raw file with Question / Complex_CoT / Response keys, as
# read by the earlier conversion cell. The prompt/chosen/reject version is the
# medical_modified.json that cell writes. Reading the raw file here would give
# every medical record an empty "prompt" and "chosen".
# NOTE(review): confirm medical_modified.json has not been renamed or moved by hand.
with open('./complete_data/medical_modified.json', 'r', encoding='utf-8') as f2:
    medicals = json.load(f2)

# Concatenate the two record lists
combined_data = rujas + medicals

# Normalise every record to exactly three keys: prompt, chosen, reject
formatted_data = []
for item in combined_data:
    new_item = {
        'prompt': item.get('prompt', ''),
        'chosen': item.get('chosen', ''),
        'reject': item.get('reject', '')  # kept when present, otherwise an empty string
    }
    formatted_data.append(new_item)

# Write the merged dataset to a new JSON file
with open('combined_output.json', 'w', encoding='utf-8') as f:
    json.dump(formatted_data, f, ensure_ascii=False, indent=4)

print("已成功合併並儲存為 combined_output.json")


已成功合併並儲存為 combined_output.json


In [11]:
import json
import re

def extract_and_add_cot(data):
    """Move the first <think>...</think> section of each "chosen" text into "COT".

    For every item whose "chosen" value is a string containing a
    <think>...</think> section, the text between the tags (tags excluded) is
    stored under "COT". That section, plus a full-width comma "，" directly
    after the closing tag if there is one, is cut out of "chosen", which is
    then stripped of surrounding whitespace.

    Items without such a section, or whose "chosen" is missing or not a
    string, get "COT" set to None and their "chosen" is left untouched.

    The items are modified in place.

    Args:
        data (list): List of dicts, each expected to carry "prompt", "chosen"
            and "reject" fields.

    Returns:
        list: The same list object that was passed in, with a "COT" key added
        to every item.
    """
    for item in data:
        chosen_text = item.get("chosen", "")

        # A JSON null (or any other non-string) cannot be searched; treat it
        # like text without a <think> section instead of raising TypeError.
        if isinstance(chosen_text, str):
            # DOTALL lets the reasoning span multiple lines; the lazy
            # quantifier stops at the first closing tag.
            match = re.search(r"<think>(.*?)</think>", chosen_text, re.DOTALL)
        else:
            match = None

        if match is None:
            item["COT"] = None
            continue

        # Keep only the text between the tags
        item["COT"] = match.group(1)

        # Also swallow a full-width comma that immediately follows </think>
        cut_end = match.end()
        if chosen_text.startswith("，", cut_end):
            cut_end += len("，")

        # Slice on the match offsets so only the matched span is removed;
        # str.replace would also delete any later identical text.
        item["chosen"] = (chosen_text[:match.start()] + chosen_text[cut_end:]).strip()
    return data

# Load the combined dataset
try:
    with open('./complete_data/combined.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError:
    print("錯誤：找不到文件 './complete_data/combined.json'")
    print("請確認文件路徑是否正確，或提供正確的文件路徑")
    # Re-raise rather than call exit(): exit() is the interactive-shell helper,
    # not meant for program code, and in a notebook it acts on the kernel
    # session instead of simply stopping this cell.
    raise

# Extract the <think> text of every record into a new "COT" field
updated_data = extract_and_add_cot(data)

# Save the updated records as pretty-printed UTF-8 JSON
output_filename = "updated_data.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(updated_data, f, ensure_ascii=False, indent=4)

print(f"已成功將處理後的數據保存至 {output_filename}")


已成功將處理後的數據保存至 updated_data.json


In [12]:
import json
from opencc import OpenCC

# OpenCC converter configured with the 's2t' (Simplified -> Traditional) mode
cc = OpenCC('s2t')

input_file = './updated_data.json'  # replace with your own file name

try:
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
except FileNotFoundError:
    print(f"找不到檔案 {input_file}")
    # Stop here. Falling back to an empty list would let the rest of the cell
    # overwrite the output file with [] and then report success.
    raise

def recursive_convert(obj):
    """Return ``obj`` with every string in it passed through ``cc.convert``.

    Lists and dicts are rebuilt recursively. Dict keys are kept as they are
    and only the values are converted. Any other kind of value is returned
    unchanged.
    """
    if isinstance(obj, dict):
        return {name: recursive_convert(child) for name, child in obj.items()}
    if isinstance(obj, list):
        return [recursive_convert(child) for child in obj]
    if isinstance(obj, str):
        return cc.convert(obj)
    return obj

# Destination for the converted dataset
output_file = 'traditional_chinese_data.json'

# Convert every string in the loaded data from Simplified to Traditional Chinese
converted_data = recursive_convert(data)

# Persist the result as pretty-printed UTF-8 JSON (non-ASCII characters kept as-is)
with open(output_file, 'w', encoding='utf-8') as out_handle:
    json.dump(converted_data, out_handle, ensure_ascii=False, indent=4)

print(f'已成功將簡體中文轉換為繁體中文並保存至 {output_file}')

已成功將簡體中文轉換為繁體中文並保存至 traditional_chinese_data.json
