In [1]:
SYMPTOMS = ["pain", "tooth pain", "sensitivity"]
DISEASES = ["caries", "dental caries", "decay"]
BODY_PARTS = ["tooth", "molar", "molar 17"]

raw_docs = [
    "Patient presents with tooth pain.",
    "No evidence of caries.",
    "Molar 17 has decay, patient reports pain.",
    "Patient is healthy.",
    "Mild sensitivity in upper molar."
]

from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np

# 加载轻量模型
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # 取所有 token 的平均向量
    return outputs.last_hidden_state.mean(dim=1)[0].numpy()

    # 生成词表的向量表示
def get_word_embeddings(word_list):
    embeddings = {}
    for word in word_list:
        embeddings[word] = get_embedding(word)
    return embeddings

symptom_emb = get_word_embeddings(SYMPTOMS)
disease_emb = get_word_embeddings(DISEASES)
body_emb = get_word_embeddings(BODY_PARTS)

In [2]:
from numpy.linalg import norm

def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b) + 1e-8)

def extract_medical_info(text, top_n=1, threshold=0.7):
    # 文本向量
    text_emb = get_embedding(text)

    result = {"text": text, "symptoms": [], "diseases": [], "body_parts": []}

    # 1️⃣ 匹配症状
    for s, emb in symptom_emb.items():
        sim = cosine_similarity(text_emb, emb)
        if sim >= threshold:
            result["symptoms"].append(s)

    # 2️⃣ 匹配疾病
    for d, emb in disease_emb.items():
        sim = cosine_similarity(text_emb, emb)
        if sim >= threshold:
            result["diseases"].append(d)

    # 3️⃣ 匹配部位
    for b, emb in body_emb.items():
        sim = cosine_similarity(text_emb, emb)
        if sim >= threshold:
            result["body_parts"].append(b)

    return result

In [3]:
for text in raw_docs:
    info = extract_medical_info(text)
    print(info)

{'text': 'Patient presents with tooth pain.', 'symptoms': ['tooth pain'], 'diseases': ['dental caries'], 'body_parts': []}
{'text': 'No evidence of caries.', 'symptoms': [], 'diseases': ['caries', 'dental caries'], 'body_parts': []}
{'text': 'Molar 17 has decay, patient reports pain.', 'symptoms': [], 'diseases': ['dental caries'], 'body_parts': ['molar 17']}
{'text': 'Patient is healthy.', 'symptoms': [], 'diseases': [], 'body_parts': []}
{'text': 'Mild sensitivity in upper molar.', 'symptoms': [], 'diseases': ['dental caries'], 'body_parts': ['molar 17']}
