### 1. 预处理
#### 1.1. JSON转为TSV

In [40]:
import json

In [41]:
def json2tsv(input_file = "./NER/IMCS-V2_dev.json",output_file = "./NER/dev.tsv"):
    with open(input_file, "r", encoding="utf-8") as src,open(output_file,"w", encoding="utf-8") as dst:
        print("dialogue_id\tsent_id\ttext\tlabel\n",file=dst)
        # 使用json.load将文件内容解析为Python对象
        data = json.load(src)
        for key,value in data.items():
            dialogue_id = key
            dialogue = value["dialogue"]
            for sentence in dialogue:
                sent_id = sentence["sentence_id"]
                text = sentence["sentence"]
                label = sentence["BIO_label"]
                print(f"{dialogue_id}\t{sent_id}\t{text}\t{label}\n",file=dst)


In [42]:
json_train = "./NER/IMCS-V2_train.json"
tsv_train =  "./NER/IMCS-V2_train.tsv"
json2tsv(input_file=json_train,output_file=tsv_train)

json_dev = "./NER/IMCS-V2_dev.json"
tsv_dev = "./NER/IMCS-V2_dev.tsv"
json2tsv(input_file=json_dev,output_file=tsv_dev)

#### 1.2. 使用Seqeval进行评估

In [43]:
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

def get_metrics(y_true,y_pred):
    """
    单独计算 Precision, Recall, F1-score
    """
    copy_pred = y_pred.copy()
    if len(y_true) != len(copy_pred):
        raise ValueError("y_true is not the same size with y_pred ")
    problematic = 0
    for i in range(len(y_true)):
        pred = copy_pred[i]
        label = y_true[i]
        if len(pred) != len(label):
            # print(f"not the same length,\n pred length: {len(pred)},text = {pred}\n label length: {len(label)},text={label}\n")
            # pred = ["X"]*len(label)
            # print(pred,["O"]*len(label))
            pred = (pred + ["O"]*len(label))[0:len(label)]
            # print(pred)
            copy_pred[i] = pred
            problematic += 1

    precision = precision_score(y_true, copy_pred)
    accuracy = accuracy_score(y_true, copy_pred,)
    recall = recall_score(y_true, copy_pred)
    f1 = f1_score(y_true, copy_pred)

    print(f"# Precision: {precision}")
    print(f"# Accuracy: {accuracy}")
    print(f"# Recall: {recall}")
    print(f"# F1 Score: {f1}")

    # 使用 strict 模式计算 F1-score
    f1_strict = f1_score(y_true, copy_pred, mode='strict',average="micro")
    print(f"# F1-micro Score (Strict): {f1_strict}")
    print(f"# problematic records: {problematic}")

    # 使用 partial 模式计算 F1-score
    # f1_partial = f1_score(y_true, y_pred, mode='partial',average="micro")
    # print(f"F1 Score (Partial): {f1_partial}")
    return precision,accuracy,recall,f1,f1_strict

In [44]:
def get_acc(y_pred,y_true):
    if len(y_true) != len(y_pred):
        raise ValueError("y_true is not the same size with y_pred ")
    results = []
    for i in range(len(y_true)):
        pred = " ".join(y_pred[i])
        label = " ".join(y_true[i])
        results.append(pred == label)
    return sum(results)/len(results)

In [45]:
# 示例标签
y_true = [
    ["O", "B-Symptom", "I-Symptom", "O", "B-Drug", "I-Drug"],
    ["O", "B-Symptom", "I-Symptom", "O"]
]
y_pred = [
    ["O", "B-Symptom", "I-Symptom", "O", "B-Drug", "O"],
    ["O", "B-Symptom", "I-Symptom", "O"]
]
print("*"*20)
print(get_metrics(y_pred=y_pred,y_true=y_true))
print("*"*20)
print(get_acc(y_pred=y_pred,y_true=y_true))
print("*"*20)
# 打印分类报告
print("Classification Report:")
print(classification_report(y_true, y_pred))

********************
# Precision: 0.6666666666666666
# Accuracy: 0.9
# Recall: 0.6666666666666666
# F1 Score: 0.6666666666666666
# F1-micro Score (Strict): 0.6666666666666666
# problematic records: 0
(0.6666666666666666, 0.9, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666)
********************
0.5
********************
Classification Report:
              precision    recall  f1-score   support

        Drug       0.00      0.00      0.00         1
     Symptom       1.00      1.00      1.00         2

   micro avg       0.67      0.67      0.67         3
   macro avg       0.50      0.50      0.50         3
weighted avg       0.67      0.67      0.67         3



### 2. 使用原始预训练模型

#### 2.1. 加载原始模型

In [46]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import pandas as pd
from tqdm import tqdm

In [8]:
model_name = "/gemini/pretrain/Qwen2.5-0.5B-Instruct"


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Using config file: /etc/orion/env/env.conf


#### 2.2. 使用Prompt和预训练模型

In [9]:
def get_bio_lable(text):
    """
        调用大模型进行NER
    """
    sys_prompt = """
    你是一个医疗领域的命名实体识别专家，请将用户的输入中的命名实体识别出来并以严格的BIO规范标注，即B-X开头，以I-X结尾，
    其中X代表命名实体类别Symptom，Drug，Drug_Category，Medical_Examination，Operation中的一种，O代表不属于任何一个类别，
    命名实体的类型包括：
    Symptom，病人因患病而表现出来的异常状况，如 发热、呼吸困难、鼻塞 等。
    Drug，具体的药物名称，如 妈咪爱、蒙脱石散、蒲地蓝 等。
    Drug_Category，根据药物功能进行划分的药物种类，如 消炎药、感冒药、益生菌 等。
    Medical_Examination，医学检验，如 血常规、x光片、CRP分析 等。
    Operation，相关的医疗操作，如 输液、雾化、接种疫苗 等。
    比如用户输入为： “你好，咳嗽是连声咳吗？有痰吗？有没流鼻涕，鼻塞？”，
    你的输出为："O O O B-Symptom I-Symptom O O O B-Symptom O O O B-Symptom O O O O B-Symptom I-Symptom I-Symptom O B-Symptom I-Symptom O".
    """
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": text}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        top_p = 0.3,
        temperature = 0.1
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

def get_batch_label(texts,labels):
    results = []
    with open("./pred_resultv3_1.tsv","w", encoding="utf-8") as pred_result:
        for i in tqdm(range(len(texts))):
            text = texts[i]
            label = labels[i]
            result = get_bio_lable(text = text).replace(" <END>", "")
            print(f"{i}\t{text}\t{label}\t{result}\t{' '.join(label)==result}",file=pred_result)
            result = result.split(" ")
            # print(f"================={i}=======================")
            # print(f"input ({len(text)}):", text)
            # print(f"output({len(result)}):", result)
            # print(f"label ({len(label)}):", label)
            # if len(label) != len(result):
            #     print("===WARNING: length unmatched ====")
            results.append(result)
    return results

In [10]:
#get_bio_lable("咳嗽有几天了？")

#### 2.3. 获取训练与开发数据集

In [47]:
import pandas as pd
train_data = pd.read_csv(tsv_train,sep="\t",encoding="utf-8");
X_train = train_data["text"].to_numpy()
y_train = train_data["label"].apply(lambda x: x.split(" ")).to_numpy()
print(X_train.shape,y_train.shape)

(98529,) (98529,)


In [48]:
import pandas as pd
dev_data = pd.read_csv(tsv_dev,sep="\t",encoding="utf-8")
# .sample(n=10)
X_dev = dev_data["text"].to_numpy()
y_dev = dev_data["label"].apply(lambda x: x.split(" ")).to_numpy()
print(X_dev.shape,y_dev.shape)

(33267,) (33267,)


In [49]:
n = 8
X_dev[n]," ".join(y_dev[n])

('大便怎么样？干不干？胃口怎么样？', 'O O O O O O O O O O O O O O O O')

In [39]:
# 练前的测试，0.5B和7B的准确率都接近0
y_pred = get_batch_label(X_dev,y_dev)
print( get_acc( y_pred, y_dev ) )
print( get_metrics( y_pred, y_dev ) )

100%|██████████| 10/10 [00:16<00:00,  1.70s/it]

0.7
not the same length,
 pred length: 14,text = ['O', 'B-Symptom', 'I-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'O', 'O', 'O']
 label length: 13,text=['O', 'B-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'O', 'O', 'O']

['O', 'B-Symptom', 'I-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'O', 'O', 'O'] ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'B-Symptom', 'I-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'B-Symptom', 'I-Symptom', 'O', 'O', 'O']
not the same length,
 pred length: 15,text = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Symptom', 'I-Symptom']
 label length: 40,text=['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Symptom', 'I-Symptom', 'O', 'B-Operation', 'I-Operation', 'I-Operation', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',




### 3. 微调准备

#### 3.1 准备训练数据集

In [15]:
def json2Alpaca(src,dst):
    sys_prompt = "请将用户的输入中的命名实体识别出来并以严格的BIO规范标注输出"
        # """
        # 请将用户的输入中的命名实体识别出来并以严格的BIO规范标注，即B-X开头，以I-X结尾，
        # 其中X代表命名实体类别Symptom，Drug，Drug_Category，Medical_Examination，Operation中的一种，O代表不属于任何一个类别，
        # 命名实体的类型包括：
        # Symptom:病人因患病而表现出来的异常状况，如 发热、呼吸困难、鼻塞 等。
        # Drug:具体的药物名称，如 妈咪爱、蒙脱石散、蒲地蓝 等。
        # Drug_Category:根据药物功能进行划分的药物种类，如 消炎药、感冒药、益生菌 等。
        # Medical_Examination:医学检验，如 血常规、x光片、CRP分析 等。
        # Operation:相关的医疗操作，如 输液、雾化、接种疫苗 等。
        # 比如用户输入为： "你好，咳嗽是连声咳吗？有痰吗？有没流鼻涕，鼻塞？"，
        # 你的输出为："O O O B-Symptom I-Symptom O O O B-Symptom O O O B-Symptom O O O O B-Symptom I-Symptom I-Symptom O B-Symptom I-Symptom O"
        # """
    entries = []

    with open(src, "r", encoding="utf-8") as src,open(dst,"w", encoding="utf-8") as dst:
        # 使用json.load将文件内容解析为Python对象
        data = json.load(src)
        for key,value in data.items():
            # dialogue_id = key
            dialogue = value["dialogue"]
            for sentence in dialogue:
                # sent_id = sentence["sentence_id"]
                text = sentence["sentence"]
                label = sentence["BIO_label"]

                entry = {"instruction": sys_prompt,
                         "input": text,
                         "output": label,
                         "system": "你是一个医疗领域的命名实体识别(NER)专家。"
                        }
                # print(entry)  # 打印每一行的数据
                entries.append(entry)
        json.dump(obj=entries, ensure_ascii=False, indent=4, fp=dst)


In [16]:
json2Alpaca(src=json_train, dst="./NER/IMCS-V2_train_alpaca.json")
json2Alpaca(src=json_dev, dst="./NER/IMCS-V2_dev_alpaca.json")

#### 3.2. 使用训练过的0.5B模型

In [50]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import pandas as pd
from tqdm import tqdm

model_name = "./Qwen2.5-0.5B-Instruct-jinm"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Using config file: /etc/orion/env/env.conf


In [51]:
def get_bio_lable2(text):
    """
        调用大模型进行NER
    """
    sys_prompt = "你是一个医疗命名实体专家！请根据当前对话文本内容，识别出每一句话中的BIO实体标签"
    # """
    # 你是一个医疗领域的命名实体识别专家，请将用户的输入中的命名实体识别出来并以严格的BIO规范标注，即B-X开头，以I-X结尾，
    # 其中X代表命名实体类别Symptom，Drug，Drug_Category，Medical_Examination，Operation中的一种，O代表不属于任何一个类别，
    # """
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": text}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        top_p = 0.3,
        temperature = 0.1
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

def get_batch_label2(texts,labels):
    results = []
    with open("./pred_resultv3_2.tsv","w", encoding="utf-8") as pred_result:
        for i in tqdm(range(len(texts))):
            text = texts[i]
            label = labels[i]
            result = get_bio_lable2(text = text).replace(" <END>", "")
            print(f"{i}\t{text}\t{label}\t{result}\t{' '.join(label)==result}",file=pred_result)
            result = result.split(" ")
            # print(f"================={i}=======================")
            # print(f"input ({len(text)}):", text)
            # print(f"output({len(result)}):", result)
            # print(f"label ({len(label)}):", label)
            # if len(label) != len(result):
            #     print("===WARNING: length unmatched ====")
            
            results.append(result)
    return results

In [52]:
# get_bio_lable2("咳嗽有几天了？喉咙有痰吗？大便稀？")

In [53]:
y_pred = get_batch_label2(X_dev,y_dev)
print( get_acc( y_pred, y_dev ) )
print( get_metrics( y_pred, y_dev ) )

 78%|███████▊  | 26073/33267 [3:43:04<51:45,  2.32it/s]   IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100%|██████████| 33267/33267 [5:02:23<00:00,  1.83it/s]  


0.8868849009528963
# Precision: 0.867416207042851
# Accuracy: 0.9639835317681095
# Recall: 0.8403551317357886
# F1 Score: 0.8536712666235203
# F1-micro Score (Strict): 0.8536712666235203
# problematic records: 734
(0.867416207042851, 0.9639835317681095, 0.8403551317357886, 0.8536712666235203, 0.8536712666235203)
