In [1]:
import nltk
from nltk.tokenize import word_tokenize

text = "Patient presents with dental caries and gingival inflammation."
tokens = word_tokenize(text)
print(tokens)


['Patient', 'presents', 'with', 'dental', 'caries', 'and', 'gingival', 'inflammation', '.']


In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Patient diagnosed with dental caries.")

for token in doc:
    print(token.text, token.pos_)

Patient PROPN
diagnosed VERB
with ADP
dental ADJ
caries NOUN
. PUNCT


In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Patient diagnosed with dental caries.")

# 分词
print([token.text for token in doc])

# POS & NER 只是看效果
print([(token.text, token.pos_) for token in doc])
print([(ent.text, ent.label_) for ent in doc.ents])

['Patient', 'diagnosed', 'with', 'dental', 'caries', '.']
[('Patient', 'PROPN'), ('diagnosed', 'VERB'), ('with', 'ADP'), ('dental', 'ADJ'), ('caries', 'NOUN'), ('.', 'PUNCT')]
[]


In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

text = "Pt presents with dental caries, no pain."
doc = nlp(text)

tokens = [
    token.lemma_.lower()
    for token in doc
    if not token.is_stop and not token.is_punct
]

print(tokens)

['pt', 'present', 'dental', 'carie', 'pain']


In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

text = "Pt presents with dental caries, no pain."
doc = nlp(text)

tokens = []
for token in doc:
    if not token.is_stop and not token.is_punct:
        tokens.append(token.lemma_.lower())

print(tokens)

['pt', 'present', 'dental', 'carie', 'pain']


In [6]:
import spacy
nlp = spacy.load("en_core_web_sm")

text = "Pt presents with dental caries, no pain."
doc = nlp(text)

clean_tokens = []

for token in doc:
    if token.is_punct:
        continue
    if token.is_stop:
        continue

    lemma = token.lemma_
    lemma = lemma.lower()

    clean_tokens.append(lemma)

print(clean_tokens)

['pt', 'present', 'dental', 'carie', 'pain']


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "patient has dental caries",
    "patient has no dental caries",
    "patient reports tooth pain"
]

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs)

In [8]:
print(tfidf.get_feature_names_out())
print(X.toarray())

['caries' 'dental' 'has' 'no' 'pain' 'patient' 'reports' 'tooth']
[[0.52682017 0.52682017 0.52682017 0.         0.         0.40912286
  0.         0.        ]
 [0.43306685 0.43306685 0.43306685 0.56943086 0.         0.33631504
  0.         0.        ]
 [0.         0.         0.         0.         0.54645401 0.32274454
  0.54645401 0.54645401]]


In [10]:
raw_docs = [
    "Pt presents with dental caries, no pain.",
    "Patient has no evidence of caries.",
    "Patient reports tooth pain."
]
import spacy
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    doc = nlp(text)
    tokens = []

    for token in doc:
        if token.is_punct:
            continue
        if token.is_stop:
            continue

        lemma = token.lemma_.lower()
        tokens.append(lemma)

    return " ".join(tokens)

clean_docs = []

for text in raw_docs:
    clean_docs.append(clean_text(text))

print(clean_docs)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # 单词，双词
    min_df=1,   #只要出现过一次就保留
    max_df=0.9  #出现在 >90% 文档的词会被丢掉
)

X = tfidf.fit_transform(clean_docs)
print(tfidf.get_feature_names_out())   #全局词表（vocabulary）
print(X.toarray())

['pt present dental carie pain', 'patient evidence carie', 'patient report tooth pain']
['carie' 'carie pain' 'dental' 'dental carie' 'evidence' 'evidence carie'
 'pain' 'patient' 'patient evidence' 'patient report' 'present'
 'present dental' 'pt' 'pt present' 'report' 'report tooth' 'tooth'
 'tooth pain']
[[0.26628951 0.35013871 0.35013871 0.35013871 0.         0.
  0.26628951 0.         0.         0.         0.35013871 0.35013871
  0.35013871 0.35013871 0.         0.         0.         0.        ]
 [0.37302199 0.         0.         0.         0.49047908 0.49047908
  0.         0.37302199 0.49047908 0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.30650422 0.30650422 0.         0.40301621 0.         0.
  0.         0.         0.40301621 0.40301621 0.40301621 0.40301621]]


In [11]:
labels = [
    0,  # "Pt presents with dental caries, no pain." → no pain
    0,  # "Patient has no evidence of caries." → no pain
    1   # "Patient reports tooth pain." → pain
]
y = labels    #不是“必须”人工打 label？
from sklearn.linear_model import LogisticRegression        

model = LogisticRegression(max_iter=1000)    #score = Σ (TF-IDF值 × 词权重) + bias
model.fit(X, y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [12]:
import numpy as np

feature_names = tfidf.get_feature_names_out()   #看最重要的词（非常关键）
coefficients = model.coef_[0]

top_positive = np.argsort(coefficients)[-5:]
top_negative = np.argsort(coefficients)[:5]

print("Most pain-related words:")
for i in top_positive:
    print(feature_names[i], coefficients[i])

print("\nMost non-pain words:")
for i in top_negative:
    print(feature_names[i], coefficients[i])

Most pain-related words:
tooth 0.22303266954736461
patient report 0.22303266954736461
report 0.22303266954736461
report tooth 0.22303266954736461
tooth pain 0.22303266954736461

Most non-pain words:
carie -0.17695882766092352
evidence carie -0.13639140163995433
evidence -0.13639140163995433
patient evidence -0.13639140163995433
dental -0.0962881910477085


In [13]:
test_texts = [
    "Patient complains of severe tooth pain.",
    "Dental caries noted, patient asymptomatic."
]

test_clean = []
for t in test_texts:
    test_clean.append(clean_text(t))

X_test = tfidf.transform(test_clean)   #complains / severe / asymptomatic 在 TF-IDF 世界里 = 不存在

preds = model.predict(X_test)
print(preds)

[0 0]


In [14]:
print(X_test.toarray())

[[0.         0.         0.         0.         0.         0.
  0.42804604 0.42804604 0.         0.         0.         0.
  0.         0.         0.         0.         0.5628291  0.5628291 ]
 [0.42804604 0.         0.5628291  0.5628291  0.         0.
  0.         0.42804604 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]]


In [17]:
for word, coef in zip(feature_names, coefficients):
    if word in ["pain", "tooth pain", "caries", "dental"]:
        print(word, coef)       #pain 和 dental 的信号差不多强， 训练数据太少

print(model.intercept_)

dental -0.0962881910477085
pain 0.09639244279536455
tooth pain 0.22303266954736461
[-0.71206627]


In [18]:
#进阶， Negation Detection
NEGATION_WORDS = {"no", "not", "denies", "without"}

def clean_text(text):
    doc = nlp(text)
    tokens = []

    for token in doc:
        if token.is_punct:
            continue

        # ⚠️ 否定词不能删
        if token.is_stop and token.text.lower() not in NEGATION_WORDS:
            continue

        lemma = token.lemma_.lower()
        tokens.append(lemma)

    return " ".join(tokens)

clean_docs = []

for text in raw_docs:
    clean_docs.append(clean_text(text))

print(clean_docs)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # 单词，双词
    min_df=1,   #只要出现过一次就保留
    max_df=0.9  #出现在 >90% 文档的词会被丢掉
)

X = tfidf.fit_transform(clean_docs)
print(tfidf.get_feature_names_out())   #全局词表（vocabulary）
print(X.toarray())

['pt present dental carie no pain', 'patient no evidence carie', 'patient report tooth pain']
['carie' 'carie no' 'dental' 'dental carie' 'evidence' 'evidence carie'
 'no' 'no evidence' 'no pain' 'pain' 'patient' 'patient no'
 'patient report' 'present' 'present dental' 'pt' 'pt present' 'report'
 'report tooth' 'tooth' 'tooth pain']
[[0.24374827 0.32049968 0.32049968 0.32049968 0.         0.
  0.24374827 0.         0.32049968 0.24374827 0.         0.
  0.         0.32049968 0.32049968 0.32049968 0.32049968 0.
  0.         0.         0.        ]
 [0.31757018 0.         0.         0.         0.41756662 0.41756662
  0.31757018 0.41756662 0.         0.         0.31757018 0.41756662
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.30650422 0.30650422 0.
  0.40301621 0.         0.         0.         0.         0.40301621
  0.40301621 0.40301621 0.40

In [22]:
labels = [
    0,  # "Pt presents with dental caries, no pain." → no pain
    0,  # "Patient has no evidence of caries." → no pain
    1   # "Patient reports tooth pain." → pain
]
y = labels    #不是“必须”人工打 label？
from sklearn.linear_model import LogisticRegression        

model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced"    #给模型“平权”（非常常用）自动放大 pain 类影响 bias 不再严重偏向 0
)
model.fit(X, y)

#修法 2：人为增加 pain 样本（教学最直观）
#修法 3：关闭正则化（仅教学用）


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [23]:
test_texts = [
    "Patient complains of severe tooth pain.",
    "Dental caries noted, patient asymptomatic."
]

test_clean = []
for t in test_texts:
    test_clean.append(clean_text(t))

X_test = tfidf.transform(test_clean)   #complains / severe / asymptomatic 在 TF-IDF 世界里 = 不存在

preds = model.predict(X_test)
print(preds)

[1 0]


In [24]:
#在高维、稀疏、小样本文本特征下，Linear SVM 往往比 Logistic Regression 稳定
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X, y)

svm_preds = svm.predict(X_test)
print(svm_preds)

[1 0]


In [25]:
coef = svm.coef_[0]
for word, c in zip(feature_names, coef):
    if word in ["pain", "tooth pain", "caries"]:
        print(word, c)
#SVM 不输出概率，所以系数只是决定 哪边离超平面更近

pain -0.29751787254424794
tooth pain 0.34649050898148154
