In [1]:
# Lowercase vocabulary searched by extract_medical_info, one list per category.
SYMPTOMS = [
    "pain",
    "tooth pain",
    "sensitivity",
]
DISEASES = [
    "caries",
    "dental caries",
    "decay",
]
BODY_PARTS = [
    "tooth",
    "molar",
    "molar 17",
]

import re

import spacy
from sentence_transformers import SentenceTransformer, util

# Load the spaCy English pipeline and the sentence-embedding model once at
# module level; the code below refers to them as the globals `nlp` and
# `embed_model`.
nlp = spacy.load("en_core_web_sm")
embed_model = SentenceTransformer('all-MiniLM-L6-v2')  # small, lightweight model

def extract_medical_info(text, embedding_threshold=0.75):
    """Extract symptom, disease and body-part terms from a clinical note.

    Two independent passes run over the module-level vocabularies
    ``SYMPTOMS``, ``DISEASES`` and ``BODY_PARTS``:

    1. Exact pass ("Day4"): every vocabulary term is searched for in the
       lower-cased text as a whole word/phrase, so "pain" no longer fires
       inside "painting" or "Spain".
    2. Embedding pass ("Day5"): the whole sentence is embedded and compared
       by cosine similarity with every vocabulary term. A term scoring at or
       above ``embedding_threshold`` that the exact pass did not already
       find is added to its category with a ``"_sim"`` suffix.

    Args:
        text: Raw note text.
        embedding_threshold: Minimum cosine similarity (range -1..1) needed
            for the embedding pass to accept a term. A loose value such as
            0.6 tends to admit unrelated terms as symptoms or diseases.

    Returns:
        dict with the keys ``"text"`` (the input, unchanged), ``"symptoms"``,
        ``"diseases"`` and ``"body_parts"`` (lists of matched terms in
        vocabulary order, exact matches first).

    Notes:
        * Only vocabulary terms can ever be returned. The exact pass misses
          misspellings and abbreviations; possible remedies are updating the
          vocabulary regularly, using a pretrained medical NER model (for
          example a scispaCy pipeline such as ``en_core_sci_lg``) or
          fine-tuning a small model.
        * Nested terms are all reported: "tooth pain" in the text yields
          both "pain" and "tooth pain".
    """
    # The spaCy pipeline is deliberately not run here: nothing below needs
    # tokens, POS tags or entities, only the raw string.
    lowered = text.lower()

    vocabulary = (
        ("symptoms", SYMPTOMS),
        ("diseases", DISEASES),
        ("body_parts", BODY_PARTS),
    )
    result = {"text": text, "symptoms": [], "diseases": [], "body_parts": []}

    # ---------- Day4: exact matching ----------
    exact_hits = set()
    for category, terms in vocabulary:
        for term in terms:
            # Lookarounds instead of \b so that terms starting or ending with
            # a non-word character are still anchored correctly.
            pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
            if re.search(pattern, lowered):
                result[category].append(term)
                exact_hits.add(term)

    # ---------- Day5: embedding similarity matching ----------
    # This pass embeds the full sentence on its own; it does not consume the
    # Day4 spans, so the semantic match covers the whole text.
    candidates = [
        (category, term) for category, terms in vocabulary for term in terms
    ]
    if not candidates:
        return result

    text_emb = embed_model.encode(text, convert_to_tensor=True)
    # Encode all terms in a single batch (one Transformer forward pass over
    # the batch) instead of one encode() call per term.
    term_embs = embed_model.encode(
        [term for _, term in candidates], convert_to_tensor=True
    )
    # cos_sim returns a (1, n_terms) matrix for a single query vector.
    similarities = util.cos_sim(text_emb, term_embs)[0].tolist()

    for (category, term), similarity in zip(candidates, similarities):
        # Skip anything the exact pass already reported, in any category.
        if similarity >= embedding_threshold and term not in exact_hits:
            result[category].append(term + "_sim")

    return result
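# Feeding the Day4 results into Day5 does not cost much precision and usually improves recall: precision stays roughly unchanged while recall goes up.
# If a Day4 span is not identified precisely enough, the embedding step may wrongly "fill in" a synonym under the wrong category.
# Core idea: Day4 gives high precision, Day5 gives high recall; the two filters are independent.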
# Sample clinical notes used to exercise the extractor.
raw_texts = [
    "Pt presents with tooth pain, no caries.",
    "Patient reports dental decay and sensitivity.",
    "Patient complains of molar 17 pain."
]

# extract_medical_info takes a plain string, so the notes are passed in
# directly. Routing them through nlp.pipe() only to read Doc.text back ran
# the whole spaCy pipeline for no benefit (spaCy tokenization is
# non-destructive, so Doc.text is the original string).
results = [extract_medical_info(note) for note in raw_texts]

for r in results:
    print(r)

{'text': 'Pt presents with tooth pain, no caries.', 'symptoms': ['pain', 'tooth pain'], 'diseases': ['caries'], 'body_parts': ['tooth']}
{'text': 'Patient reports dental decay and sensitivity.', 'symptoms': ['sensitivity'], 'diseases': ['decay'], 'body_parts': []}
{'text': 'Patient complains of molar 17 pain.', 'symptoms': ['pain'], 'diseases': [], 'body_parts': ['molar', 'molar 17']}


In [2]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Hypothetical hand-annotated gold labels; they are paired with `results`
# by position when the metrics are computed.
gold_labels = [
    {
        "symptoms": ["tooth pain"],
        "diseases": ["caries"],
        "body_parts": ["tooth"],
    },
    {
        "symptoms": ["sensitivity"],
        "diseases": ["decay"],
        "body_parts": [],
    },
    {
        "symptoms": ["pain"],
        "diseases": [],
        "body_parts": ["molar 17"],
    },
]

# Simple binary scoring: an entity counts as 1 when present and 0 when absent.
def flatten_entities(entity_list):
    """Return the distinct entities of ``entity_list`` as a set."""
    unique_entities = set()
    unique_entities.update(entity_list)
    return unique_entities

# Build aligned binary label vectors by entity *identity*.
#
# For every (note, category) pair, each entity in the union of gold and
# predicted entities contributes one row: y_true is 1 if the entity is in
# the gold annotation, y_pred is 1 if the extractor returned it. Padding
# both sides with counts instead would line up unrelated entities and score
# a wrong prediction as a true positive; it would also produce vectors of
# different lengths once a list held more than the assumed maximum.
SIM_SUFFIX = "_sim"  # marker extract_medical_info appends to embedding-only matches

assert len(gold_labels) == len(results), "gold_labels and results must align one-to-one"

y_true = []
y_pred = []

for g, r in zip(gold_labels, results):
    for key in ["symptoms", "diseases", "body_parts"]:
        gold_entities = flatten_entities(g[key])
        # Strip the provenance suffix so an embedding match for "pain" is
        # compared with the gold entity "pain" rather than "pain_sim".
        pred_entities = flatten_entities(
            e[:-len(SIM_SUFFIX)] if e.endswith(SIM_SUFFIX) else e
            for e in r[key]
        )
        # sorted() only makes the row order deterministic; the scores do not
        # depend on it.
        for entity in sorted(gold_entities | pred_entities):
            y_true.append(int(entity in gold_entities))
            y_pred.append(int(entity in pred_entities))

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

Precision: 0.78, Recall: 1.00, F1: 0.88
