# **智慧医疗赛道：颈椎核磁影像多序列多任务分析**

本代码主要对比不同的多模态大模型与不同的prompt在赛题任务上的表现，并据此进行优化。

#### **参数预设、导包**

需要修改以下API key, url以及数据路径

In [None]:
# API settings
import os

# Read the key from the environment so it never has to be written into the
# notebook; the placeholder is only used when the variable is unset and must
# be replaced before any request is sent.
API_KEY = os.environ.get("SILICONFLOW_API_KEY", "YOUR_API_KEY")
API_URL = "https://api.siliconflow.cn/v1"

# Data paths (the author evaluates on 50 samples taken from the train split).
train_dir = "D:\\比赛\\通用人工智能大赛\\test_train"
train_label_dir = "D:\\比赛\\通用人工智能大赛\\cervai_challenge-main\\cervai_challenge-main\\data\\train.json"

In [None]:
# 导包
from PIL import Image
import io
import base64
import json  
from openai import OpenAI
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

#### **图像转webp base64**

In [None]:
def convert_image_to_webp_base64(input_image_path):
    """Re-encode an image file as WebP and return it as a base64 string.

    Args:
        input_image_path: Path of the image file to open.

    Returns:
        The base64 text (UTF-8 decoded) of the WebP bytes, or ``None`` when
        opening or saving the image raises ``IOError``; in that case an
        error message is printed.
    """
    buffer = io.BytesIO()
    try:
        with Image.open(input_image_path) as picture:
            picture.save(buffer, format='webp')
    except IOError:
        print(f"Error: Unable to open or convert the image {input_image_path}")
        return None
    webp_bytes = buffer.getvalue()
    return base64.b64encode(webp_bytes).decode('utf-8')

In [None]:
# Scratch cell for trying out how a single prompt behaves on one image.

image_path = "D:/比赛/通用人工智能大赛/test/1744810399870.jpg"  # image to send
base64_image = convert_image_to_webp_base64(image_path)
if base64_image is None:
    # Without this guard the literal text "None" would be sent as image data.
    raise RuntimeError(f"Unable to load image: {image_path}")

client = OpenAI(
    api_key=API_KEY,
    base_url=API_URL
)

prompt_text = '''请根据提供的颈椎MRI矢状位图像，判断该患者颈椎的生理曲度状态：请从以下三类中选择最符合的分类结果，并输出对应的标签数字（0 / 1 / 2）：
                - 标签 0：正常（颈椎自然前凸，呈C形曲度）
                - 标签 1：曲度变直（颈椎前凸曲度消失）
                - 标签 2：反弓（颈椎向后弯曲，呈反向曲度）
                仅输出最终的分类标签数字。'''

# NOTE(review): the payload is WebP-encoded while the data URL declares
# image/jpeg; kept as in the original request - confirm the provider accepts it.
response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-72B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                }
            },
            {
                "type": "text",
                "text": prompt_text
            }
        ]
    }],
    stream=True
)

# Print the streamed answer as it arrives. Chunks with no choices or no text
# are skipped so that "None" is never printed.
for chunk in response:
    if not chunk.choices:
        continue
    chunk_message = chunk.choices[0].delta.content
    if chunk_message:
        print(chunk_message, end='', flush=True)

### **调用大模型直接对任务进行实现和测试**

任务一、二使用矢状位3张图像进行判断，任务三、四使用对应位置的横轴位图像进行判断

每个model和prompt，分别调用API跑3次，得到3个结果，使用统计投票法选出最常见的分类结果

分别测试不同model和prompt的测试结果

##### **计算指标**

测试指标：准确性、Macro-F1、Weighted-F1

Macro-F1 是对**每个类别的 F1 分数取平均值**，它对每个类别一视同仁，即使某个类别样本很少也会被平等计算。若预测结果偏向某一类，Macro-F1 可能低于 Accuracy。比赛使用的是 Macro-F1 而非 Weighted-F1，是为了鼓励模型在所有类别上表现均衡。

Weighted-F1 也会对每个类别计算 F1 分数，但不是直接平均，而是**按每个类别的样本数量（support）加权平均**。样本多的类别对最终结果贡献更大。

#### **调用不同大模型对分类任务进行实现**

##### **OpenAI接口**

In [None]:
# Multimodal model names that work on SiliconFlow; some of them are
# occasionally unavailable. Currently disabled candidates:
#   'qwen2.5_7b': "Pro/Qwen/Qwen2.5-VL-7B-Instruct"
#   'qwen2.5_32b': "Qwen/Qwen2.5-VL-32B-Instruct"
#   'qwen2.5_72b': "Qwen/Qwen2.5-VL-72B-Instruct"
#   'deepseek': "deepseek-ai/deepseek-vl2"

# Maps a short display name to the model identifier sent to the API.
model_names = dict(
    qwen2_72b="Qwen/Qwen2-VL-72B-Instruct",
)

In [None]:
# 图像分类函数，使用openai接口
from openai import OpenAI

def classify_image(base64_images, prompt, model_name, labels, vote_times=3):
    """Classify one case with a vision-language model using majority voting.

    The same request (all images followed by the prompt) is sent
    ``vote_times`` times through the OpenAI-compatible client; a label is
    extracted from each streamed answer and the most frequent one wins.

    Args:
        base64_images: List of base64-encoded images; a single string is
            treated as a one-image list.
        prompt: Prompt text sent after the images.
        model_name: Model identifier passed to the API.
        labels: Valid label strings, e.g. ``['0', '1', '2']``.
        vote_times: Number of API calls that take part in the vote.

    Returns:
        The winning label as ``int``, or ``-1`` when no answer contained a
        valid label.
    """
    # Imported locally so the function does not depend on a later cell
    # having imported these names.
    import re
    from collections import Counter

    client = OpenAI(
        api_key=API_KEY,
        base_url=API_URL
    )

    # A bare string would otherwise be iterated character by character.
    if isinstance(base64_images, str):
        base64_images = [base64_images]

    # The request body is identical for every vote, so build it once.
    # NOTE(review): images produced by convert_image_to_webp_base64 are WebP
    # while the data URL declares image/jpeg; kept unchanged - confirm.
    content = [
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}
        for img in base64_images
    ]
    content.append({"type": "text", "text": prompt})
    messages = [{"role": "user", "content": content}]

    valid_labels = {str(label) for label in labels}
    votes = []

    for _ in range(vote_times):
        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            stream=True
        )

        # Collect the streamed answer; chunks without choices or text are skipped.
        pieces = []
        for chunk in response:
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece:
                pieces.append(piece)
        result_text = "".join(pieces)

        # Take the first whole number in the answer that is a valid label.
        # Matching whole digit runs keeps "10°" from being read as label 0 or 1.
        for token in re.findall(r"[0-9]+", result_text):
            if token in valid_labels:
                votes.append(int(token))
                break

    # Answers without a valid label cast no vote.
    if not votes:
        return -1
    return Counter(votes).most_common(1)[0][0]

##### **ZhipuAI 接口**

In [None]:
import os

# Replaces the API key set earlier in the file with the ZhipuAI key. It is
# read from the environment; the placeholder is only used when the variable
# is unset and must be replaced before any request is sent.
API_KEY = os.environ.get("ZHIPUAI_API_KEY", "YOUR_ZHIPUAI_API_KEY")

# Multimodal models currently offered by Zhipu
model_names = {
    'glm_4v_plus': "glm-4v-plus",
    'glm_4v': "glm-4v",
    # 'glm_4v_flash': "glm-4v-flash"
}

In [None]:
# 图像分类函数，使用ZhipuAI接口
from zhipuai import ZhipuAI

def classify_image(base64_images, prompt, model_name, labels, vote_times=3):
    """Classify one case with a ZhipuAI vision model using majority voting.

    The same request (all images followed by the prompt) is sent
    ``vote_times`` times through the ZhipuAI client; a label is extracted
    from each streamed answer and the most frequent one wins.

    Args:
        base64_images: List of base64-encoded images; a single string is
            treated as a one-image list.
        prompt: Prompt text sent after the images.
        model_name: Model identifier passed to the API.
        labels: Valid label strings, e.g. ``['0', '1', '2']``.
        vote_times: Number of API calls that take part in the vote.

    Returns:
        The winning label as ``int``, or ``-1`` when no answer contained a
        valid label.
    """
    # Imported locally so the function does not depend on a later cell
    # having imported these names.
    import re
    from collections import Counter

    client = ZhipuAI(api_key=API_KEY)

    # A bare string would otherwise be iterated character by character.
    if isinstance(base64_images, str):
        base64_images = [base64_images]

    # The request body is identical for every vote, so build it once.
    # NOTE(review): images produced by convert_image_to_webp_base64 are WebP
    # while the data URL declares image/jpeg; kept unchanged - confirm.
    content = [
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}
        for img in base64_images
    ]
    content.append({"type": "text", "text": prompt})
    messages = [{"role": "user", "content": content}]

    valid_labels = {str(label) for label in labels}
    votes = []

    for _ in range(vote_times):
        response = client.chat.completions.create(
            model=model_name,
            messages=messages,
            stream=True
        )

        # Collect the streamed answer; chunks without choices or text are skipped.
        pieces = []
        for chunk in response:
            if not chunk.choices:
                continue
            piece = chunk.choices[0].delta.content
            if piece:
                pieces.append(piece)
        result_text = "".join(pieces)

        # Take the first whole number in the answer that is a valid label.
        # Matching whole digit runs keeps "10°" from being read as label 0 or 1.
        for token in re.findall(r"[0-9]+", result_text):
            if token in valid_labels:
                votes.append(int(token))
                break

    # Answers without a valid label cast no vote.
    if not votes:
        return -1
    return Counter(votes).most_common(1)[0][0]

### **任务一：颈椎曲度评估**

目标：判断颈椎生理曲度状态，分类为：
- 正常（标签 0）
- 曲度变直（标签 1）
- 反弓（标签 2）

说明：颈椎曲度是指颈椎自然的生理弯曲，通常呈前凸（C 形）。曲度评估分为三种：直（生理曲度消失）、正常（前凸曲度良好）、反弓（曲度反向，呈后凸）。

##### **prompt设计说明**
- prompt_1：简单说明任务
- prompt_2：在说明任务的基础上，添加医学背景信息（量化，基于Cobb角测量的医学背景知识）
- prompt_3：在2的基础上，添加任务流程
- prompt_4：在说明任务的基础上，添加医学背景信息（非量化，描述性医学背景知识）
- prompt_5：在4的基础上，添加任务流程

In [None]:
# Bilingual prompts

# Chinese prompt variants for task 1 (cervical curvature, labels 0 / 1 / 2).
# The task-1 evaluation loop iterates over the keys of this dict and passes
# each value unchanged to classify_image as the prompt text.
prompt_task_1 = {
    # prompt_1: bare task statement.
    'prompt_1': '''请根据提供的颈椎MRI矢状位图像，判断该患者颈椎的生理曲度状态：请从以下三类中选择最符合的分类结果，并输出对应的标签数字（0 / 1 / 2）：
    - 标签 0：正常（颈椎自然前凸，呈C形曲度）
    - 标签 1：曲度变直（颈椎前凸曲度消失）
    - 标签 2：反弓（颈椎向后弯曲，呈反向曲度）
    仅输出最终的分类标签数字。''',

    # prompt_2: task statement plus Cobb-angle based medical background.
    'prompt_2': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI矢状位图像，判断该患者颈椎的生理曲度状态。
    
    医学背景信息：  
    颈椎生理曲度是指颈椎自然的生理弯曲，通常呈前凸（C形）。我们可以利用Cobb角测量法来对颈椎生理曲度进行评估：
    Cobb角测量法通常选择C2（枢椎）和C7（第七颈椎）作为测量的基准椎体，在MRI的矢状位图像上，找到C2椎体下缘和C7椎体下缘，从C2椎体下缘的后部画一条水平线，从C7椎体下缘的后部画另一条水平线，这两条水平线之间的夹角即为Cobb角。
    Cobb角的正常范围及分类：
    正常范围：一般认为C2-C7的Cobb角在22°±5°之间为正常，即大约在17°到27°之间。
    曲度变直：如果Cobb角小于17°，则被认为是颈椎曲度变直。这种情况下，颈椎的生理前凸减少，几乎呈直线状。
    反弓：如果Cobb角小于0°，即出现负角度，这表明颈椎出现了后凸，也就是反弓。这种情况下，颈椎的生理前凸完全消失，并且出现了向后的弯曲。
    
    输出要求：请根据你通过MRI分析的患者颈椎的生理曲度状态，按照下述标签格式进行输出：
    - 标签 0：正常（颈椎自然前凸，呈C形曲度）
    - 标签 1：曲度变直（颈椎前凸曲度消失）
    - 标签 2：反弓（颈椎向后弯曲，呈反向曲度）
    要求仅输出最终的分类标签数字。''',

    # prompt_3: prompt_2 plus an explicit step-by-step procedure.
    'prompt_3': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI矢状位图像，判断该患者颈椎的生理曲度状态。
    
    医学背景信息：  
    颈椎生理曲度是指颈椎自然的生理弯曲，通常呈前凸（C形）。我们可以利用Cobb角测量法来对颈椎生理曲度进行评估：
    Cobb角测量法通常选择C2（枢椎）和C7（第七颈椎）作为测量的基准椎体，在MRI的矢状位图像上，找到C2椎体下缘和C7椎体下缘，从C2椎体下缘的后部画一条水平线，从C7椎体下缘的后部画另一条水平线，这两条水平线之间的夹角即为Cobb角。
    Cobb角的正常范围及分类：
    正常范围：一般认为C2-C7的Cobb角在22°±5°之间为正常，即大约在17°到27°之间。
    曲度变直：如果Cobb角小于17°，则被认为是颈椎曲度变直。这种情况下，颈椎的生理前凸减少，几乎呈直线状。
    反弓：如果Cobb角小于0°，即出现负角度，这表明颈椎出现了后凸，也就是反弓。这种情况下，颈椎的生理前凸完全消失，并且出现了向后的弯曲。
    
    输出要求：请根据你通过MRI分析的患者颈椎的生理曲度状态，按照下述标签格式进行输出：
    - 标签 0：正常（颈椎自然前凸，呈C形曲度）
    - 标签 1：曲度变直（颈椎前凸曲度消失）
    - 标签 2：反弓（颈椎向后弯曲，呈反向曲度）
    要求仅输出最终的分类标签数字。
    
    现在，我们要根据上面给出的任务、医学背景信息及输出要求，按照以下步骤来根据颈椎MRI矢状位图像判断颈椎的生理曲度状态：
    - 获取颈椎MRI矢状位图；
    - 定位基准椎体：在MRI矢状位图像上，找到C2（枢椎）和C7（第七颈椎）椎体。C2椎体是颈椎中第二个椎体，有一个明显的齿状突；C7椎体是颈椎中最大的椎体；
    - 画线测量：从C2椎体下缘的后部画一条水平线，这条线应该与C2椎体下缘平行；从C7椎体下缘的后部画另一条水平线，这条线应该与C7椎体下缘平行；测量估计这两条线之间的夹角；
    - 根据预估的夹角给出诊断: 正常(17°-27°，表现为自然前凸)；曲度变直(小于17°，表现为前凸曲度消失)；反弓：(小于0°，表现为向后弯曲)；
    - 按照诊断和输出要求进行输出，仅输出标签数字。''',

    # prompt_4: task statement plus descriptive (non-quantitative) background.
    'prompt_4': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI矢状位图像，判断该患者颈椎的生理曲度状态。
    
    医学背景信息：  
    颈椎生理曲度是指颈椎自然的生理弯曲，通常呈前凸（C形）。我们可以通过观察MRI图像上颈椎的整体形态来评估颈椎生理曲度：
    - 正常颈椎曲度：在MRI矢状位图像上，颈椎呈现自然的前凸曲线，各椎体排列整齐，椎间隙均匀，未见明显狭窄或增宽。椎间盘保持良好的水分和弹性，未见明显退变、突出或膨出。
    - 颈椎曲度变直：在MRI矢状位图像上，颈椎的前凸曲线减少，几乎呈直线状。各椎体排列基本整齐，但椎间隙略显不均匀，C4-C5、C5-C6椎间盘轻度膨出，但未见明显压迫硬膜囊或神经根。
    - 颈椎反弓：在MRI矢状位图像上，颈椎出现明显的后凸曲线，C2到C7椎体的前凸曲度完全消失，并且出现了向后的弯曲。C3-C4、C4-C5椎间盘明显突出，压迫硬膜囊和神经根。C3、C4椎体轻度骨质增生。
    
    输出要求：请根据你通过MRI分析的患者颈椎的生理曲度状态，按照下述标签格式进行输出：
    - 标签 0：正常（颈椎自然前凸，呈C形曲度）
    - 标签 1：曲度变直（颈椎前凸曲度消失）
    - 标签 2：反弓（颈椎向后弯曲，呈反向曲度）
    要求仅输出最终的分类标签数字。''',

    # prompt_5: prompt_4 plus an explicit step-by-step procedure.
    'prompt_5': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI矢状位图像，判断该患者颈椎的生理曲度状态。
    
    医学背景信息：  
    颈椎生理曲度是指颈椎自然的生理弯曲，通常呈前凸（C形）。我们可以通过观察MRI图像上颈椎的整体形态来评估颈椎生理曲度：
    - 正常颈椎曲度：在MRI矢状位图像上，颈椎呈现自然的前凸曲线，各椎体排列整齐，椎间隙均匀，未见明显狭窄或增宽。椎间盘保持良好的水分和弹性，未见明显退变、突出或膨出。
    - 颈椎曲度变直：在MRI矢状位图像上，颈椎的前凸曲线减少，几乎呈直线状。各椎体排列基本整齐，但椎间隙略显不均匀，C4-C5、C5-C6椎间盘轻度膨出，但未见明显压迫硬膜囊或神经根。
    - 颈椎反弓：在MRI矢状位图像上，颈椎出现明显的后凸曲线，C2到C7椎体的前凸曲度完全消失，并且出现了向后的弯曲。C3-C4、C4-C5椎间盘明显突出，压迫硬膜囊和神经根。C3、C4椎体轻度骨质增生。
    
    输出要求：请根据你通过MRI分析的患者颈椎的生理曲度状态，按照下述标签格式进行输出：
    - 标签 0：正常（颈椎自然前凸，呈C形曲度）
    - 标签 1：曲度变直（颈椎前凸曲度消失）
    - 标签 2：反弓（颈椎向后弯曲，呈反向曲度）
    要求仅输出最终的分类标签数字。
    
    现在，我们要根据上面给出的任务、医学背景信息及输出要求，按照以下步骤来根据颈椎MRI矢状位图像判断颈椎的生理曲度状态：
    - 获取颈椎MRI矢状位图；
    - 定位基准椎体：在MRI矢状位图像上，找到C2（枢椎）到 C7（第七颈椎）的几节椎体【C2椎体是颈椎中第二个椎体，有一个明显的齿状突；C7椎体是颈椎中最大的椎体】，识别并判断C2-C7几节椎体的位置；
    - 已知图片左边是人脸（也即前面），判断椎体连接成线的弯曲方向：如果呈现比较明显的C形曲度（也即弯曲呈现左凸右凹），则为正常；如果呈现很小幅度C形曲度或近乎直线，则为曲度变直；如果呈现倒C曲度（也即弯曲呈现左凹右凸）, 则为颈椎反弓；
    - 根据椎体弯曲方向和程度给出诊断；
    - 按照诊断和输出要求进行输出，仅输出标签数字。'''
                 }

# English prompt variants for task 1 (cervical curvature, labels 0 / 1 / 2),
# keyed by the same prompt ids as prompt_task_1.
# NOTE(review): the visible evaluation code iterates prompt_task_1 only; this
# dict is not referenced there.
prompt_task_1_en = {
    # prompt_1: bare task statement.
    'prompt_1': '''Please analyze the provided cervical spine MRI sagittal image and determine the physiological curvature status of the patient's cervical spine. Choose the most appropriate classification from the following three categories and output the corresponding label number (0 / 1 / 2):
    - Label 0: Normal (natural cervical lordosis, C-shaped curvature)
    - Label 1: Straightened (loss of cervical lordosis)
    - Label 2: Kyphosis (reversed cervical curvature)
    Only output the final classification label number.''',

    # prompt_2: task statement plus Cobb-angle based medical background.
    'prompt_2': '''
    Task: You are an experienced medical imaging analysis expert. Based on the information provided below and the given cervical spine MRI sagittal image, determine the physiological curvature status of the patient's cervical spine.

    Medical background information:
    Cervical physiological curvature refers to the natural forward curve (lordosis) of the cervical spine, typically C-shaped. The Cobb angle measurement method can be used to evaluate this curvature:
    - The Cobb angle is usually measured between the inferior endplates of C2 (axis) and C7 (seventh cervical vertebra).
    - On the sagittal MRI image, draw a horizontal line along the inferior endplate of C2 and another horizontal line along the inferior endplate of C7. The angle between these two lines is the Cobb angle.

    Classification based on Cobb angle:
    - Normal: Cobb angle between 17° and 27° (22°±5°) is considered normal.
    - Straightened: Cobb angle less than 17° indicates a straightened cervical curve, with loss of natural forward curvature.
    - Kyphosis: Cobb angle less than 0° (negative angle) indicates cervical kyphosis, where the spine curves backward abnormally.

    Output requirement: Based on your MRI analysis, output the classification label as follows:
    - Label 0: Normal (natural cervical lordosis, C-shaped curvature)
    - Label 1: Straightened (loss of cervical lordosis)
    - Label 2: Kyphosis (reversed cervical curvature)
    Only output the final classification label number.''',

    # prompt_3: prompt_2 plus an explicit step-by-step procedure.
    'prompt_3': '''
    Task: You are an experienced medical imaging analysis expert. Based on the information provided below and the given cervical spine MRI sagittal image, determine the physiological curvature status of the patient's cervical spine.

    Medical background information:
    Cervical physiological curvature refers to the natural forward curve (lordosis) of the cervical spine, typically C-shaped. The Cobb angle measurement method can be used to evaluate this curvature:
    - Measure the angle between the inferior endplates of C2 and C7 vertebrae by drawing horizontal lines along them on the sagittal MRI image. The angle between these lines is the Cobb angle.

    Classification based on Cobb angle:
    - Normal: Cobb angle between 17° and 27°.
    - Straightened: Cobb angle less than 17°, indicating loss of cervical lordosis.
    - Kyphosis: Cobb angle less than 0°, indicating reversed curvature.

    Output requirement: Based on your MRI analysis, output the classification label as follows:
    - Label 0: Normal (natural cervical lordosis, C-shaped curvature)
    - Label 1: Straightened (loss of cervical lordosis)
    - Label 2: Kyphosis (reversed cervical curvature)
    Only output the final classification label number.

    Now, based on the above task, medical background, and output requirement, follow these steps to determine the cervical spine curvature from the MRI sagittal image:
    - Acquire the cervical MRI sagittal image;
    - Locate the reference vertebrae: identify C2 (axis) and C7 vertebrae (C2 has a distinctive odontoid process; C7 is the largest cervical vertebra);
    - Draw horizontal lines along the inferior endplates of C2 and C7 and estimate the Cobb angle;
    - Diagnose based on the estimated angle: normal (17°-27°, natural lordosis), straightened (less than 17°, reduced lordosis), or kyphosis (less than 0°, reversed curve);
    - Output only the classification label number.''',

    # prompt_4: task statement plus descriptive (non-quantitative) background.
    'prompt_4': '''
    Task: You are an experienced medical imaging analysis expert. Based on the information provided below and the given cervical spine MRI sagittal image, determine the physiological curvature status of the patient's cervical spine.

    Medical background information:
    Cervical physiological curvature refers to the natural forward curve (lordosis) of the cervical spine, typically C-shaped. Evaluation can be based on overall morphology in MRI images:
    - Normal curvature: Natural C-shaped curve, well-aligned vertebrae, uniform intervertebral spaces, no significant narrowing or widening. Discs retain good hydration and elasticity, no obvious degeneration or protrusion.
    - Straightened curvature: Decreased forward curve, spine appears almost straight. Vertebrae alignment is generally good, but slight unevenness in intervertebral spaces. Mild disc bulges at C4-C5 and C5-C6 without significant compression of the dural sac or nerve roots.
    - Kyphosis: Reversed curve visible, loss of normal lordosis from C2 to C7. Significant disc protrusions at C3-C4 and C4-C5 compressing the dural sac and nerve roots, mild bony overgrowth at C3 and C4.

    Output requirement: Based on your MRI analysis, output the classification label as follows:
    - Label 0: Normal (natural cervical lordosis, C-shaped curvature)
    - Label 1: Straightened (loss of cervical lordosis)
    - Label 2: Kyphosis (reversed cervical curvature)
    Only output the final classification label number.''',

    # prompt_5: prompt_4 plus an explicit step-by-step procedure.
    'prompt_5': '''
    Task: You are an experienced medical imaging analysis expert. Based on the information provided below and the given cervical spine MRI sagittal image, determine the physiological curvature status of the patient's cervical spine.

    Medical background information:
    Cervical physiological curvature refers to the natural forward curve (lordosis) of the cervical spine, typically C-shaped. Evaluation can be based on overall morphology in MRI images:
    - Normal curvature: Natural C-shaped curve, well-aligned vertebrae, uniform intervertebral spaces, no significant narrowing or widening. Discs retain good hydration and elasticity, no obvious degeneration or protrusion.
    - Straightened curvature: Decreased forward curve, spine appears almost straight. Vertebrae alignment is generally good, slight unevenness in intervertebral spaces, mild disc bulges at C4-C5 and C5-C6 without significant compression of the dural sac or nerve roots.
    - Kyphosis: Reversed curve visible, loss of normal lordosis from C2 to C7. Significant disc protrusions at C3-C4 and C4-C5 compressing the dural sac and nerve roots, mild bony overgrowth at C3 and C4.

    Output requirement: Based on your MRI analysis, output the classification label as follows:
    - Label 0: Normal (natural cervical lordosis, C-shaped curvature)
    - Label 1: Straightened (loss of cervical lordosis)
    - Label 2: Kyphosis (reversed cervical curvature)
    Only output the final classification label number.

    Now, based on the above task, medical background, and output requirement, follow these steps to determine the cervical spine curvature from the MRI sagittal image:
    - Acquire the cervical MRI sagittal image;
    - Locate the reference vertebrae: identify vertebrae from C2 to C7 (C2 has a distinctive odontoid process; C7 is the largest cervical vertebra);
    - Knowing that the left side of the image corresponds to the anterior (front) of the patient, judge the bending direction of the spinal alignment:
      - If a clear C-shaped curve (convex anteriorly) is observed, it's normal;
      - If a very mild curve or nearly straight line is observed, it's straightened;
      - If an inverted C-shape (convex posteriorly) is observed, it's kyphosis;
    - Diagnose based on the bending direction and degree;
    - Output only the classification label number.'''
}

In [None]:
import base64, os, time, json
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import pandas as pd
import time

# Load the annotations; the 'qd' field is the ground-truth label for task 1.
with open(train_label_dir, 'r', encoding='utf-8') as f:
    raw_list = json.load(f)

# NOTE(review): assumes item['id'] equals the patient directory name under
# train_dir and that 'qd' is comparable with the int predictions - confirm.
label_dict = {item['id']: item['qd'] for item in raw_list}

labels = ['0', '1', '2']

# Directory for the per-run prediction files (change it as needed).
# Created once instead of on every (model, prompt) iteration.
output_dir = "./results/"
os.makedirs(output_dir, exist_ok=True)

# Run every (model, prompt) combination over all patients.
results = []

for model in model_names:
    model_name = model_names[model]

    for p in prompt_task_1:
        prompt = prompt_task_1[p]

        print(f"[运行] Model - {model}, Prompt - {p} ......")
        start_time = time.time()

        y_true, y_pred = [], []

        for patient_id in sorted(os.listdir(train_dir)):
            sag_dir = os.path.join(train_dir, patient_id, "sag")
            if not os.path.exists(sag_dir):
                continue

            # Skip patients whose sagittal folder holds no image at all.
            image_files = [f for f in os.listdir(sag_dir) if f.lower().endswith(('.jpg', '.png'))]
            if not image_files:
                continue

            # Three fixed sagittal slices (5.png, 6.png, 7.png) are used per patient.
            image_paths = [os.path.join(sag_dir, f"{i}.png") for i in [5, 6, 7]]

            base64_images = [convert_image_to_webp_base64(path) for path in image_paths]
            if any(img is None for img in base64_images):
                # Report the slices of THIS patient that failed to load.
                failed_paths = [path for path, img in zip(image_paths, base64_images) if img is None]
                print(f"[跳过] 无法处理图像{patient_id}：{failed_paths}")
                continue

            # Pass the list built above, not a leftover single-image variable.
            pred_label = classify_image(base64_images, prompt, model_name, labels)
            true_label = label_dict.get(patient_id, -1)

            # Only samples with both a known label and a parsed prediction are scored.
            if true_label != -1 and pred_label != -1:
                y_true.append(true_label)
                y_pred.append(pred_label)

        # Evaluation metrics
        acc = accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        weighted_f1 = f1_score(y_true, y_pred, average='weighted')

        runtime = time.time() - start_time

        results.append({
            "model": model,
            "prompt": p,
            "accuracy": acc,
            "macro_f1": macro_f1,
            "weighted_f1": weighted_f1,
            "Valid sample": len(y_true),
            "runtime_sec": round(runtime, 2)  # wall-clock time of this run in seconds
        })

        # Save the raw predictions together with the metrics.
        filename = f"task_1_{model}_{p}.json".replace(" ", "_")  # no spaces in file names
        save_path = os.path.join(output_dir, filename)

        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({
                "model": model,
                "prompt": p,
                "y_true": y_true,
                "y_pred": y_pred,
                "accuracy": acc,
                "macro_f1": macro_f1,
                "weighted_f1": weighted_f1,
                "Valid sample": len(y_true),
                "runtime_sec": round(runtime, 2)  # wall-clock time of this run in seconds
            }, f, ensure_ascii=False, indent=2)

        print(f"[保存] 已保存结果到 {save_path}")

# Show the summary table with pandas.
df = pd.DataFrame(results)
df

### **任务二：颈椎顺列评估**

目标：判断颈椎顺列状态，分类为：
- 顺列差（标签 0）
- 顺列可（标签 1）

说明：顺列是指颈椎椎体之间的排列关系。顺列评估分为顺列差（椎体排列不齐，可能存在脱位或滑脱）和顺列可（排列基本正常）。

##### **prompt设计说明**
- prompt_1：简单说明任务
- prompt_2：在说明任务的基础上，添加医学背景信息
- prompt_3：在2的基础上，添加任务流程

In [None]:
# Bilingual prompts

# Chinese prompt variants for task 2 (vertebral alignment, labels 0 / 1).
# The task-2 evaluation loop iterates over the keys of this dict and passes
# each value unchanged to classify_image as the prompt text.
prompt_task_2 = {
    # prompt_1: bare task statement.
    'prompt_1': '''请根据提供的颈椎MRI矢状位图像，判断该患者颈椎椎体的排列（顺列）状态。
    请从以下两类中选择最符合的分类结果，并输出对应的标签数字（0 或 1）：
    - 标签 0：顺列差（椎体排列不齐，可能存在脱位或滑脱）
    - 标签 1：顺列可（椎体排列基本正常）
    仅输出最终的分类标签数字。''',

    # prompt_2: task statement plus medical background.
    'prompt_2': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI矢状位图像，判断该患者颈椎椎体的排列（顺列）状态。
    
    医学背景信息：  
    在正常情况下，颈椎各椎体的后缘应在矢状位图像上形成一条平滑的曲线。如果椎体后缘出现阶梯状错位或不连续，可能提示椎体排列异常，如脱位或滑脱。
    前纵韧带在矢状位图像上应呈现为一条连续的低信号带，紧贴椎体前缘。如果该韧带显示不连续或与椎体前缘的关系异常，可能提示椎体排列异常。
    C2-C7 SVA是评估颈椎矢状位平衡的重要参数。正常情况下，C2-C7 SVA值较小，表示颈椎排列良好。如果该值增大，可能提示颈椎前移或排列异常。
    C2-C7 Cobb角用于评估颈椎的生理曲度。虽然主要用于评估曲度，但在判断椎体排列时也有参考价值。异常的Cobb角可能与椎体排列异常相关。
    
    输出要求：请根据你通过MRI分析的患者颈椎椎体的排列状态，请从以下两类中选择最符合的分类结果，并输出对应的标签数字（0 或 1）：
    - 标签 0：顺列差（椎体排列不齐，可能存在脱位或滑脱）
    - 标签 1：顺列可（椎体排列基本正常）
    仅输出最终的分类标签数字。''',

    # prompt_3: prompt_2 plus an explicit step-by-step procedure.
    # The task sentence now asks for the alignment status, as in prompt_2 and
    # in the English prompt_3; it previously asked for the curvature status.
    'prompt_3': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI矢状位图像，判断该患者颈椎椎体的排列（顺列）状态。
    
    医学背景信息：  
    在正常情况下，颈椎各椎体的后缘应在矢状位图像上形成一条平滑的曲线。如果椎体后缘出现阶梯状错位或不连续，可能提示椎体排列异常，如脱位或滑脱。
    前纵韧带在矢状位图像上应呈现为一条连续的低信号带，紧贴椎体前缘。如果该韧带显示不连续或与椎体前缘的关系异常，可能提示椎体排列异常。
    C2-C7 SVA是评估颈椎矢状位平衡的重要参数。正常情况下，C2-C7 SVA值较小，表示颈椎排列良好。如果该值增大，可能提示颈椎前移或排列异常。
    C2-C7 Cobb角用于评估颈椎的生理曲度。虽然主要用于评估曲度，但在判断椎体排列时也有参考价值。异常的Cobb角可能与椎体排列异常相关。
    
    输出要求：请根据你通过MRI分析的患者颈椎椎体的排列状态，请从以下两类中选择最符合的分类结果，并输出对应的标签数字（0 或 1）：
    - 标签 0：顺列差（椎体排列不齐，可能存在脱位或滑脱）
    - 标签 1：顺列可（椎体排列基本正常）
    仅输出最终的分类标签数字。
    
    现在，我们要根据上面给出的任务、医学背景信息及输出要求，按照以下步骤来根据颈椎MRI矢状位图像判断颈椎椎体的排列状态：
    - 获取颈椎MRI矢状位图；
    - 定位基准椎体：在MRI矢状位图像上，识别C2（枢椎）至C7（第七颈椎）椎体的位置。C2椎体是颈椎中第二个椎体，有一个明显的齿状突；C7椎体是颈椎中最大的椎体；
    - 评估椎体后缘排列：观察椎体后缘是否形成平滑的曲线，是否存在阶梯状错位或不连续；
    - 检查前纵韧带的连续性：评估前纵韧带是否连续，是否与椎体前缘的关系正常；
    - 综合判断椎体排列状态：根据上述评估结果，判断椎体排列是否正常；
    - 按照诊断和输出要求进行输出，仅输出标签数字。'''
                 }

# English prompt variants for task 2 (vertebral alignment, labels 0 / 1),
# keyed by the same prompt ids as prompt_task_2.
# NOTE(review): the visible evaluation code iterates prompt_task_2 only; this
# dict is not referenced there.
prompt_task_2_en = {
    # prompt_1: bare task statement.
    'prompt_1': '''Please judge the alignment status of the cervical vertebrae based on the provided cervical spine sagittal MRI images.
    Choose the classification that best fits from the two categories below, and output the corresponding label number (0 or 1):
    - Label 0: Poor alignment (vertebrae are irregularly arranged, possible dislocation or slippage)
    - Label 1: Good alignment (vertebrae are basically normally arranged)
    Output only the final classification label number.''',
    
    # prompt_2: task statement plus medical background.
    'prompt_2': '''
    Task: You are an experienced medical imaging expert. Please judge the alignment status of the cervical vertebrae of the patient based on the information below and the provided cervical spine sagittal MRI images.
    
    Medical background information:  
    Under normal conditions, the posterior edges of the cervical vertebrae should form a smooth curve on the sagittal MRI images. Step-like misalignments or discontinuities in the posterior edges may indicate vertebral alignment abnormalities such as dislocation or slippage.
    The anterior longitudinal ligament should appear as a continuous low-signal band closely attached to the anterior edge of the vertebrae on sagittal MRI images. Discontinuity or abnormal relationships between this ligament and the vertebral anterior edge may indicate alignment abnormalities.
    The C2-C7 SVA (sagittal vertical axis) is an important parameter for evaluating cervical sagittal balance. Normally, a small C2-C7 SVA value indicates good vertebral alignment. An increased value may indicate forward displacement or alignment abnormalities.
    The C2-C7 Cobb angle is used to evaluate the physiological curvature of the cervical spine. Although mainly used for curvature assessment, it can also provide reference information when judging vertebral alignment. Abnormal Cobb angles may be related to vertebral alignment issues.
    
    Output requirements: Based on your MRI analysis of the patient's cervical vertebral alignment, please select the classification that best fits from the two categories below and output the corresponding label number (0 or 1):
    - Label 0: Poor alignment (vertebrae are irregularly arranged, possible dislocation or slippage)
    - Label 1: Good alignment (vertebrae are basically normally arranged)
    Output only the final classification label number.''',
    
    # prompt_3: prompt_2 plus an explicit step-by-step procedure.
    'prompt_3': '''
    Task: You are an experienced medical imaging expert. Please judge the alignment status of the cervical vertebrae of the patient based on the information below and the provided cervical spine sagittal MRI images.
    
    Medical background information:  
    Under normal conditions, the posterior edges of the cervical vertebrae should form a smooth curve on the sagittal MRI images. Step-like misalignments or discontinuities in the posterior edges may indicate vertebral alignment abnormalities such as dislocation or slippage.
    The anterior longitudinal ligament should appear as a continuous low-signal band closely attached to the anterior edge of the vertebrae on sagittal MRI images. Discontinuity or abnormal relationships between this ligament and the vertebral anterior edge may indicate alignment abnormalities.
    The C2-C7 SVA (sagittal vertical axis) is an important parameter for evaluating cervical sagittal balance. Normally, a small C2-C7 SVA value indicates good vertebral alignment. An increased value may indicate forward displacement or alignment abnormalities.
    The C2-C7 Cobb angle is used to evaluate the physiological curvature of the cervical spine. Although mainly used for curvature assessment, it can also provide reference information when judging vertebral alignment. Abnormal Cobb angles may be related to vertebral alignment issues.
    
    Output requirements: Based on your MRI analysis of the patient's cervical vertebral alignment, please select the classification that best fits from the two categories below and output the corresponding label number (0 or 1):
    - Label 0: Poor alignment (vertebrae are irregularly arranged, possible dislocation or slippage)
    - Label 1: Good alignment (vertebrae are basically normally arranged)
    Output only the final classification label number.
    
    Now, following the above task, medical background information, and output requirements, please judge the cervical vertebral alignment status from the sagittal cervical spine MRI images according to the steps below:
    - Obtain the cervical spine sagittal MRI images;
    - Locate the reference vertebrae: identify the positions of C2 (axis) through C7 (seventh cervical vertebra) on the sagittal MRI images. The C2 vertebra is the second cervical vertebra with a prominent odontoid process; the C7 vertebra is the largest cervical vertebra;
    - Assess the posterior vertebral edge alignment: observe whether the posterior edges form a smooth curve and check for step-like misalignments or discontinuities;
    - Examine the continuity of the anterior longitudinal ligament: assess whether the ligament is continuous and if its relationship with the anterior vertebral edges is normal;
    - Make a comprehensive judgment of vertebral alignment status based on the above assessments;
    - Output only the label number according to the diagnosis and output requirements.'''
}

In [None]:
from openai import OpenAI
import base64, os, time, json
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import pandas as pd
import time

# Load the annotations; the 'sl' field is the ground-truth label for task 2.
with open(train_label_dir, 'r', encoding='utf-8') as f:
    raw_list = json.load(f)

# NOTE(review): assumes item['id'] equals the patient directory name under
# train_dir and that 'sl' is comparable with the int predictions - confirm.
label_dict = {item['id']: item['sl'] for item in raw_list}

labels = ['0', '1']

# Directory for the per-run prediction files (change it as needed).
# Created once instead of on every (model, prompt) iteration.
output_dir = "./results/"
os.makedirs(output_dir, exist_ok=True)

# Run every (model, prompt) combination over all patients.
results = []

for model in model_names:
    model_name = model_names[model]

    for p in prompt_task_2:
        prompt = prompt_task_2[p]

        print(f"[运行] Model - {model}, Prompt - {p} ......")
        start_time = time.time()

        y_true, y_pred = [], []

        for patient_id in sorted(os.listdir(train_dir)):
            sag_dir = os.path.join(train_dir, patient_id, "sag")
            if not os.path.exists(sag_dir):
                continue

            # Skip patients whose sagittal folder holds no image at all.
            image_files = [f for f in os.listdir(sag_dir) if f.lower().endswith(('.jpg', '.png'))]
            if not image_files:
                continue

            # Three fixed sagittal slices (5.png, 6.png, 7.png) are used per patient.
            image_paths = [os.path.join(sag_dir, f"{i}.png") for i in [5, 6, 7]]

            base64_images = [convert_image_to_webp_base64(path) for path in image_paths]
            if any(img is None for img in base64_images):
                # Report the slices of THIS patient that failed to load.
                failed_paths = [path for path, img in zip(image_paths, base64_images) if img is None]
                print(f"[跳过] 无法处理图像{patient_id}：{failed_paths}")
                continue

            pred_label = classify_image(base64_images, prompt, model_name, labels)
            true_label = label_dict.get(patient_id, -1)

            # Only samples with both a known label and a parsed prediction are scored.
            if true_label != -1 and pred_label != -1:
                y_true.append(true_label)
                y_pred.append(pred_label)

        # Evaluation metrics
        acc = accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        weighted_f1 = f1_score(y_true, y_pred, average='weighted')

        runtime = time.time() - start_time

        results.append({
            "model": model,
            "prompt": p,
            "accuracy": acc,
            "macro_f1": macro_f1,
            "weighted_f1": weighted_f1,
            "Valid sample": len(y_true),
            "runtime_sec": round(runtime, 2)  # wall-clock time of this run in seconds
        })

        # Save the raw predictions together with the metrics.
        filename = f"task_2_{model}_{p}.json".replace(" ", "_")  # no spaces in file names
        save_path = os.path.join(output_dir, filename)

        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({
                "model": model,
                "prompt": p,
                "y_true": y_true,
                "y_pred": y_pred,
                "accuracy": acc,
                "macro_f1": macro_f1,
                "weighted_f1": weighted_f1,
                "Valid sample": len(y_true),
                "runtime_sec": round(runtime, 2)  # wall-clock time of this run in seconds
            }, f, ensure_ascii=False, indent=2)

        print(f"[保存] 已保存结果到 {save_path}")

# Show the summary table with pandas.
df = pd.DataFrame(results)
df

### **任务三：椎间盘膨突评估**

目标：对 C2-C3 至 C6-C7 共五个椎间位置进行状态分类：
- 正常（标签 0）
- 膨出（标签 1）
- 突出（标签 2）
- 脱出（标签 3）

**说明：** 颈椎椎间盘膨突是指颈椎间盘的外层变弱或破裂，导致内部物质向外凸出，可能会压迫附近的神经或脊髓。评估分为四种情况：正常（椎间盘没有异常），膨出（椎间盘整体轻微外凸，但外层没有破裂），突出（外层部分破裂，内部物质局部凸出），脱出（外层完全破裂，内部物质可能掉出并移位）。

##### **prompt设计说明**
- prompt_1：简单说明任务
- prompt_2：在说明任务的基础上，添加医学背景信息

In [None]:
# Chinese prompt variants for task 3 (disc bulge/herniation, labels 0-3).
# The task-3 evaluation loop iterates over the keys of this dict and passes
# each value unchanged to classify_image as the prompt text.
# Wording fixes relative to the earlier text: the duplicated "状态状态" is
# removed, the truncated "高信" is completed to "高信号", and the output
# requirement of prompt_2 now refers to the disc status instead of the
# vertebral alignment status.
prompt_task_3 = {
    # prompt_1: bare task statement.
    'prompt_1': '''请根据提供的颈椎MRI横轴位图像，判断对应椎间盘的状态。
    请从以下四类中选择最符合的分类结果，并输出对应的标签数字：
    - 标签 0：正常（椎间盘没有异常）
    - 标签 1：膨出（椎间盘整体轻微外凸，但外层没有破裂）
    - 标签 2：突出（外层部分破裂，内部物质局部凸出）
    - 标签 3：脱出（外层完全破裂，内部物质可能掉出并移位）
    仅输出最终的分类标签数字。''',

    # prompt_2: task statement plus medical background.
    'prompt_2': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI横轴位图像，判断对应椎间盘的状态。
    
    医学背景信息：  
    椎间盘位于相邻椎体之间，正常情况下，呈椭圆形或近似方形，边缘光滑，无明显变窄或膨出。如果椎间盘异常，可能会出现以下几种情况：
    膨出：椎间盘的髓核部分膨胀，尚未突破纤维环，膨出的椎间盘通常呈现为均匀的高信号，且椎间盘后缘可能变得不规则；
    突出：椎间盘的髓核部分突破纤维环，形成一个局部的突出，突出部分可能呈现为椎间盘与椎体之间的间隙增宽，突出的髓核区域呈现高信号；
    脱出：髓核完全脱离纤维环，形成游离体，脱出的髓核可能出现在椎管内，与硬膜囊或神经根接触。
    
    输出要求：请根据你通过MRI分析的患者颈椎椎间盘的状态，请从以下四类中选择最符合的分类结果，并输出对应的标签数字：
    - 标签 0：正常（椎间盘没有异常）
    - 标签 1：膨出（椎间盘整体轻微外凸，但外层没有破裂）
    - 标签 2：突出（外层部分破裂，内部物质局部凸出）
    - 标签 3：脱出（外层完全破裂，内部物质可能掉出并移位）
    仅输出最终的分类标签数字。'''
                 }

# English prompt variants for task 3 (disc bulge/herniation, labels 0-3),
# keyed by the same prompt ids as prompt_task_3.
# NOTE(review): the visible evaluation code iterates prompt_task_3 only; this
# dict is not referenced there.
prompt_task_3_en = {
    # prompt_1: bare task statement.
    'prompt_1': '''Please judge the status of the corresponding intervertebral disc based on the provided cervical spine axial MRI images.
    Choose the classification that best fits from the following four categories, and output the corresponding label number:
    - Label 0: Normal (no abnormalities in the disc)
    - Label 1: Bulging (the disc is slightly protruding outward overall, but the outer layer is intact)
    - Label 2: Protrusion (the outer layer is partially torn, and the internal material is locally bulging out)
    - Label 3: Extrusion (the outer layer is completely torn, and internal material may have escaped and displaced)
    Output only the final classification label number.''',
    
    # prompt_2: task statement plus medical background.
    'prompt_2': '''
    Task: You are an experienced medical imaging expert. Please judge the status of the corresponding intervertebral disc based on the information below and the provided cervical spine axial MRI images.
    
    Medical background information:  
    Intervertebral discs are located between adjacent vertebral bodies. Under normal conditions, they appear elliptical or roughly square-shaped, with smooth edges and no obvious narrowing or bulging. When abnormalities occur, the following situations may be present:
    Bulging: The nucleus pulposus part of the disc expands but does not breach the annulus fibrosus. A bulging disc typically shows uniform high signal intensity, and the posterior disc edge may appear irregular;
    Protrusion: The nucleus pulposus partially breaks through the annulus fibrosus, forming a localized protrusion. The protruded area may present as a widened gap between the disc and vertebral body, and the protruding nucleus region shows high signal intensity;
    Extrusion: The nucleus pulposus completely separates from the annulus fibrosus, forming a free fragment. The extruded nucleus may be found within the spinal canal, contacting the dural sac or nerve roots.
    
    Output requirements: Based on your MRI analysis of the patient’s cervical spine, please select the classification that best fits from the following four categories and output the corresponding label number:
    - Label 0: Normal (no abnormalities in the disc)
    - Label 1: Bulging (the disc is slightly protruding outward overall, but the outer layer is intact)
    - Label 2: Protrusion (the outer layer is partially torn, and the internal material is locally bulging out)
    - Label 3: Extrusion (the outer layer is completely torn, and internal material may have escaped and displaced)
    Output only the final classification label number.'''
}

In [None]:
from openai import OpenAI
import base64, os, time, json
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import pandas as pd
import time

# Load the ground-truth labels: patient id -> the 'zjppt' label list from train.json
# (presumably one disc label per cervical level — confirm against the data spec)
with open(train_label_dir, 'r', encoding='utf-8') as f:
    raw_list = json.load(f)

label_dict = {item['id']: item['zjppt'] for item in raw_list}

labels = ['0','1','2','3']
CERVICAL_LEVELS = ["2-3", "3-4", "4-5", "5-6", "6-7"]  # the five intervertebral levels

# Run every (model, prompt) combination and collect one metrics row per run
results = []

for model in model_names:
    model_name = model_names[model]

    for p in prompt_task_3:
        prompt = prompt_task_3[p]

        print(f"[运行] Model - {model}, Prompt - {p} ......")
        start_time = time.time()

        y_true_all, y_pred_all = [], []
        patient_preds = {}  # per patient: one prediction per entry of CERVICAL_LEVELS

        for patient_id in sorted(os.listdir(train_dir)):

            # Skip patients without a complete 5-level label list
            patient_true = label_dict.get(patient_id, [])
            if not patient_true or len(patient_true) != 5:
                continue

            patient_pred = []

            for i, level in enumerate(CERVICAL_LEVELS):

                tra_img = os.path.join(train_dir, patient_id, "tra", f"{level}.png")
                if not os.path.exists(tra_img):
                    patient_pred.append(-1)
                    continue

                # Keep the plain return value: wrapping it in a set literal made the
                # None check below unreachable and handed a set to classify_image.
                base64_image = convert_image_to_webp_base64(tra_img)
                if base64_image is None:
                    print(f"[跳过] 无法处理图像{patient_id}：{tra_img}")
                    # Keep patient_pred aligned with CERVICAL_LEVELS
                    patient_pred.append(-1)
                    continue

                pred_label = classify_image(base64_image, prompt, model_name, labels)
                patient_pred.append(pred_label)

                # Only score levels where both the label and the prediction are valid
                if patient_true[i] != -1 and pred_label != -1:
                    y_true_all.append(patient_true[i])
                    y_pred_all.append(pred_label)

            patient_preds[patient_id] = patient_pred

        # Compute the evaluation metrics over all scored levels
        acc = accuracy_score(y_true_all, y_pred_all)
        macro_f1 = f1_score(y_true_all, y_pred_all, average='macro')
        weighted_f1 = f1_score(y_true_all, y_pred_all, average='weighted')

        runtime = time.time() - start_time

        results.append({
            "model": model,
            "prompt": p,
            "accuracy": acc,
            "macro_f1": macro_f1,
            "weighted_f1": weighted_f1,
            "Valid sample": len(y_true_all),
            "runtime_sec": round(runtime, 2)  # wall-clock runtime in seconds
        })

        # Save the predictions of this run
        output_dir = "./results/"  # custom output directory
        os.makedirs(output_dir, exist_ok=True)
        filename = f"task_3_{model}_{p}.json".replace(" ", "_")  # avoid spaces in the file name
        save_path = os.path.join(output_dir, filename)

        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({
                "model": model,
                "prompt": p,
                "patient_preds": patient_preds,
                "y_true_flattened": y_true_all,
                "y_pred_flattened": y_pred_all,
                "accuracy": acc,
                "macro_f1": macro_f1,
                "weighted_f1": weighted_f1,
                "Valid sample": len(y_true_all),
                "runtime_sec": round(runtime, 2)  # wall-clock runtime in seconds
            }, f, ensure_ascii=False, indent=2)

        print(f"[保存] 已保存结果到 {save_path}")

# Show the summary as a pandas table
df = pd.DataFrame(results)
df

### **任务四：中央椎管评估**

目标：对 C2、C2-C3、C3、C3-C4、...、C6-C7、C7 共 11 个位置
进行分级：
- 0 级（标签 0）
- 1 级（标签 1）
- 2 级（标签 2）
- 3 级（标签 3）

**说明：** 颈椎中央椎管是指颈椎椎管的中枢部分，包含脊髓、脑脊液及其周围的硬膜等结构，是保护脊髓和神经的重要通道。评估颈椎中央椎管通常分为 0-3 级：0 级表示椎管正常，无狭窄或压迫；1 级为轻度狭窄，脊髓无明显受压；2 级为中度狭窄，脊髓受压但无明显信号改变；3 级为重度狭窄，脊髓明显受压并伴有信号改变。

##### **prompt设计说明**
- prompt_1：简单说明任务
- prompt_2：在说明任务的基础上，添加医学背景信息

In [None]:
# Chinese prompts for task 4 (central spinal canal grading on axial MRI).
# prompt_1 only states the task; prompt_2 adds medical background for each grade.
# The "输出要求" sentence of prompt_2 now refers to central canal stenosis; it
# previously asked about vertebral alignment, which is a different task.
prompt_task_4 = {
    'prompt_1': '''请根据提供的颈椎MRI横轴位图像，对颈椎中央椎管进行分级评估。
    请从以下四类中选择最符合的分类结果，并输出对应的标签数字：
    - 标签 0：0 级（椎管正常，无狭窄或压迫）
    - 标签 1：1 级（轻度狭窄，脊髓无明显受压）
    - 标签 2：2 级（中度狭窄，脊髓受压但无明显信号改变）
    - 标签 3：3 级（重度狭窄，脊髓明显受压并伴有信号改变）
    仅输出最终的分类标签数字。''',

    'prompt_2': '''
    任务：你是一名经验丰富的医学影像分析专家，请你根据下述给你的信息和提供的颈椎MRI横轴位图像，对颈椎中央椎管进行分级评估。
    
    医学背景信息：  
    颈椎中央椎管是指颈椎椎管的中枢部分，包含脊髓、脑脊液及其周围的硬膜等结构，是保护脊髓和神经的重要通道。我们可以通过MRI来评估颈椎中央椎管的狭窄情况：
    0级（无狭窄）：颈椎的中央椎管看起来很宽敞，没有被任何东西压迫。在MRI图像上，可以看到脊髓周围有明显的空隙，这些空隙是正常的蛛网膜下腔，看起来像是脊髓周围有一圈“空隙”。
    1级（轻度狭窄）：椎管变窄，但脊髓本身还没有变形。在MRI图像上，可以看到脊髓周围的空隙（蛛网膜下腔）减少了一半以上，但脊髓本身看起来还是正常的，没有被压扁或变形。
    2级（中度狭窄）：椎管进一步变窄，脊髓开始变形，但脊髓内部还没有出现损伤。在MRI图像上，脊髓被压扁或变形，但脊髓内部的信号（颜色或亮度）看起来还是正常的，没有出现异常的信号变化。
    3级（重度狭窄）：椎管严重变窄，脊髓不仅变形，而且内部出现了损伤。在MRI图像上，脊髓被严重压扁或变形，而且脊髓内部的信号发生了变化，比如出现高信号（通常是亮色），这表明脊髓内部有损伤或病变。
    
    输出要求：请根据你通过MRI分析的患者颈椎中央椎管的狭窄情况，请从以下四类中选择最符合的分类结果，并输出对应的标签数字：
    - 标签 0：0 级（椎管正常，无狭窄或压迫）
    - 标签 1：1 级（轻度狭窄，脊髓无明显受压）
    - 标签 2：2 级（中度狭窄，脊髓受压但无明显信号改变）
    - 标签 3：3 级（重度狭窄，脊髓明显受压并伴有信号改变）
    仅输出最终的分类标签数字。'''
}

# English prompts for task 4 (central spinal canal grading on axial MRI).
# prompt_1 only states the task; prompt_2 adds per-grade medical background.
prompt_task_4_en = dict(
    prompt_1='''Please perform a graded assessment of the cervical spinal canal based on the provided axial cervical MRI images.
    Select the classification that best fits from the following four categories and output the corresponding label number:
    - Label 0: Grade 0 (Normal spinal canal, no stenosis or compression)
    - Label 1: Grade 1 (Mild stenosis, spinal cord not obviously compressed)
    - Label 2: Grade 2 (Moderate stenosis, spinal cord compressed but no obvious signal change)
    - Label 3: Grade 3 (Severe stenosis, spinal cord obviously compressed with signal changes)
    Output only the final classification label number.''',
    prompt_2='''
    Task: You are an experienced medical imaging expert. Based on the information below and the provided axial cervical MRI images, please perform a graded assessment of the cervical spinal canal.
    
    Medical background information:
    The cervical spinal canal refers to the central part of the cervical vertebral canal, containing the spinal cord, cerebrospinal fluid, and surrounding dura mater. It is an important passage protecting the spinal cord and nerves. MRI can be used to evaluate stenosis of the cervical spinal canal:
    - Grade 0 (No stenosis): The central spinal canal appears spacious with no compression. On MRI images, clear spaces can be seen around the spinal cord, representing the normal subarachnoid space, appearing like a “gap” surrounding the spinal cord.
    - Grade 1 (Mild stenosis): The canal is narrowed, but the spinal cord itself remains undeformed. On MRI images, the subarachnoid space around the spinal cord is reduced by more than half, but the spinal cord looks normal without compression or deformation.
    - Grade 2 (Moderate stenosis): The canal is further narrowed, and the spinal cord begins to deform, but no internal injury is seen. On MRI images, the spinal cord is compressed or deformed, but the internal signal (color or brightness) of the spinal cord looks normal with no abnormal changes.
    - Grade 3 (Severe stenosis): The canal is severely narrowed, with the spinal cord deformed and showing internal injury. On MRI images, the spinal cord is severely compressed or deformed, and the internal signal changes (e.g., high signal intensity, usually bright), indicating injury or pathology inside the spinal cord.
    
    Output requirements: Based on your MRI analysis of the patient’s cervical vertebral canal, please select the classification that best fits from the following four categories and output the corresponding label number:
    - Label 0: Grade 0 (Normal spinal canal, no stenosis or compression)
    - Label 1: Grade 1 (Mild stenosis, spinal cord not obviously compressed)
    - Label 2: Grade 2 (Moderate stenosis, spinal cord compressed but no obvious signal change)
    - Label 3: Grade 3 (Severe stenosis, spinal cord obviously compressed with signal changes)
    Output only the final classification label number.
    ''',
)

In [None]:
from openai import OpenAI
import base64, os, time, json
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
import pandas as pd
import time

# Load the ground-truth labels: patient id -> the 'zyzg' label list from train.json
# (presumably one central-canal grade per position — confirm against the data spec)
with open(train_label_dir, 'r', encoding='utf-8') as f:
    raw_list = json.load(f)

label_dict = {item['id']: item['zyzg'] for item in raw_list}

labels = ['0','1','2','3']
# The 11 evaluated positions: vertebral bodies and the discs between them
CERVICAL_LEVELS = ["2", "2-3", "3", "3-4", "4", "4-5", "5", "5-6", "6", "6-7", "7"]

# Run every (model, prompt) combination and collect one metrics row per run
results = []

for model in model_names:
    model_name = model_names[model]

    for p in prompt_task_4:
        prompt = prompt_task_4[p]

        print(f"[运行] Model - {model}, Prompt - {p} ......")
        start_time = time.time()

        y_true_all, y_pred_all = [], []
        patient_preds = {}  # per patient: one prediction per entry of CERVICAL_LEVELS

        for patient_id in sorted(os.listdir(train_dir)):

            # Skip patients without a complete 11-position label list
            patient_true = label_dict.get(patient_id, [])
            if not patient_true or len(patient_true) != 11:
                continue

            patient_pred = []

            for i, level in enumerate(CERVICAL_LEVELS):

                tra_img = os.path.join(train_dir, patient_id, "tra", f"{level}.png")
                if not os.path.exists(tra_img):
                    patient_pred.append(-1)
                    continue

                # Keep the plain return value: wrapping it in a set literal made the
                # None check below unreachable and handed a set to classify_image.
                base64_image = convert_image_to_webp_base64(tra_img)
                if base64_image is None:
                    print(f"[跳过] 无法处理图像{patient_id}：{tra_img}")
                    # Keep patient_pred aligned with CERVICAL_LEVELS
                    patient_pred.append(-1)
                    continue

                pred_label = classify_image(base64_image, prompt, model_name, labels)
                patient_pred.append(pred_label)

                # Only score positions where both the label and the prediction are valid
                if patient_true[i] != -1 and pred_label != -1:
                    y_true_all.append(patient_true[i])
                    y_pred_all.append(pred_label)

            patient_preds[patient_id] = patient_pred

        # Compute the evaluation metrics over all scored positions
        acc = accuracy_score(y_true_all, y_pred_all)
        macro_f1 = f1_score(y_true_all, y_pred_all, average='macro')
        weighted_f1 = f1_score(y_true_all, y_pred_all, average='weighted')

        runtime = time.time() - start_time

        results.append({
            "model": model,
            "prompt": p,
            "accuracy": acc,
            "macro_f1": macro_f1,
            "weighted_f1": weighted_f1,
            "Valid sample": len(y_true_all),
            "runtime_sec": round(runtime, 2)  # wall-clock runtime in seconds
        })

        # Save the predictions of this run
        output_dir = "./results/"  # custom output directory
        os.makedirs(output_dir, exist_ok=True)
        filename = f"task_4_{model}_{p}.json".replace(" ", "_")  # avoid spaces in the file name
        save_path = os.path.join(output_dir, filename)

        with open(save_path, "w", encoding="utf-8") as f:
            json.dump({
                "model": model,
                "prompt": p,
                "patient_preds": patient_preds,
                "y_true_flattened": y_true_all,
                "y_pred_flattened": y_pred_all,
                "accuracy": acc,
                "macro_f1": macro_f1,
                "weighted_f1": weighted_f1,
                "Valid sample": len(y_true_all),
                "runtime_sec": round(runtime, 2)  # wall-clock runtime in seconds
            }, f, ensure_ascii=False, indent=2)

        print(f"[保存] 已保存结果到 {save_path}")

# Show the summary as a pandas table
df = pd.DataFrame(results)
df

#### **读取并整理结果**

In [None]:
# 分别整理不同task的结果进行整合

import os
import json
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

def load_metric_rows(input_dir, prefix=""):
    """Collect one summary row per saved result file in *input_dir*.

    Only ``.json`` files whose name starts with *prefix* are read. Each row
    carries the pre-computed summary fields stored in the file (model, prompt,
    accuracy, macro_f1, weighted_f1, "Valid sample", runtime_sec); a missing
    field becomes an empty string. Files are visited in sorted name order so
    the resulting table is deterministic.

    Returns an empty list when *input_dir* does not exist.
    """
    rows = []
    if not os.path.isdir(input_dir):
        return rows

    for filename in sorted(os.listdir(input_dir)):
        if not (filename.startswith(prefix) and filename.endswith(".json")):
            continue

        with open(os.path.join(input_dir, filename), "r", encoding="utf-8") as f:
            data = json.load(f)

        rows.append({
            "model": data.get("model", ""),
            "prompt": data.get("prompt", ""),
            "accuracy": data.get("accuracy", ""),
            "macro_f1": data.get("macro_f1", ""),
            "weighted_f1": data.get("weighted_f1", ""),
            "Valid sample": data.get("Valid sample", ""),
            "runtime_sec": data.get("runtime_sec", "")
        })
    return rows


# Read the saved task-4 results and gather their stored metrics
input_dir = "./results/task_4"
results = load_metric_rows(input_dir)

if not results:
    # The evaluation cell saves its files as ./results/task_4_<model>_<prompt>.json,
    # so fall back to that flat layout when the per-task sub-directory is absent or empty.
    results = load_metric_rows("./results/", prefix="task_4_")

if not results:
    print(f"[警告] 未在 {input_dir} 或 ./results/ 下找到 task_4 的结果文件")

# Arrange the rows as a table
df = pd.DataFrame(results)

# Print the table
print(df)

# Save the summary
os.makedirs("./metrics/", exist_ok=True)
df.to_csv("./metrics/metrics_summary_task_4.csv", index=False, encoding='utf-8-sig')

In [None]:
# Analyse the consolidated results table

file_path = './metrics/result.xlsx'
df = pd.read_excel(file_path)

# Mean Accuracy / Macro-F1 / Weighted-F1 of every model within each task
grouped_by_model = df.groupby(['任务', '模型'])
model_task_avg = grouped_by_model[['Accuracy', 'Macro-F1', 'Weighted-F1']].mean()
model_task_avg = model_task_avg.reset_index()

# Mean Accuracy / Macro-F1 / Weighted-F1 of every prompt within each task
grouped_by_prompt = df.groupby(['任务', 'Prompt'])
prompt_task_avg = grouped_by_prompt[['Accuracy', 'Macro-F1', 'Weighted-F1']].mean()
prompt_task_avg = prompt_task_avg.reset_index()

# Optional: store both averages as separate sheets of one Excel workbook
with pd.ExcelWriter('taskwise_model_prompt_avg.xlsx') as writer:
    for sheet_name, table in (
        ('Model_Average', model_task_avg),
        ('Prompt_Average', prompt_task_avg),
    ):
        table.to_excel(writer, sheet_name=sheet_name, index=False)
