In [27]:
# 加载模型
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import pandas as pd

# Load the spaCy pipeline shared by every later cell.
# The medium English model is requested because the similarity cell relies on
# word vectors; per the spaCy model docs the small model (en_core_web_sm)
# ships without static vectors.
# Manual install: python -m spacy download en_core_web_md
MODEL_NAME = "en_core_web_md"
try:
    nlp = spacy.load(MODEL_NAME)
except OSError:
    # The model package is not installed: fetch the SAME model that was
    # requested, so the notebook never silently falls back to a different
    # pipeline than the one the rest of the cells assume.
    print("Downloading model...")
    from spacy.cli import download
    download(MODEL_NAME)
    nlp = spacy.load(MODEL_NAME)

# import warnings

# # 核心：全局忽略所有类型的警告
# warnings.filterwarnings('ignore')

# # 额外适配 Jupyter Notebook 的输出设置
# import os
# os.environ['PYTHONWARNINGS'] = 'ignore'

In [28]:
# Sample text; the parsed `doc` is reused by later cells.
text = (
    "Elon Musk bought Twitter for $44 billion in 2022. "
    "He plans to change the verified system."
)
doc = nlp(text)

# Interactive named-entity visualisation.
# jupyter=True renders the markup directly in the notebook output.
print("=== 命名实体识别 (NER) ===")
displacy.render(doc, style="ent", jupyter=True)

# One row per entity: surface text, label, and spaCy's description of the label.
entities = [
    (ent.text, ent.label_, spacy.explain(ent.label_))
    for ent in doc.ents
]
df_ent = pd.DataFrame(
    entities,
    columns=["Text", "Label", "Explanation"],
)
display(df_ent)

=== 命名实体识别 (NER) ===


Unnamed: 0,Text,Label,Explanation
0,Elon Musk,PERSON,"People, including fictional"
1,Twitter,PRODUCT,"Objects, vehicles, foods, etc. (not services)"
2,$44 billion,MONEY,"Monetary values, including unit"
3,2022,CARDINAL,Numerals that do not fall under another type


In [29]:
# Dependency-parse visualisation.
# Rendering options: compact mode plus custom background colour, text colour and font.
options = dict(
    compact=True,
    bg="#09a3d5",
    color="white",
    font="Source Sans Pro",
)
print("\n=== 句法依存关系 (Dependency Parse) ===")
# Render sentence by sentence so that no single diagram becomes too long.
for sent in doc.sents:
    displacy.render(sent, style="dep", options=options, jupyter=True)


=== 句法依存关系 (Dependency Parse) ===


In [31]:
# Word-vector similarity.
# NOTE(review): each name below holds the object returned by nlp(...), not a
# single token. Scores are only meaningful if the loaded pipeline provides
# word vectors - confirm which model `nlp` actually loaded.
token1, token2, token3 = (nlp(word) for word in ("dog", "cat", "apple"))

print("\n=== 语义相似度 (Similarity) ===")
# Compare "dog" against each of the other two words in turn.
for other_label, other_doc in (("Cat", token2), ("Apple", token3)):
    print(f"Dog <-> {other_label}: {token1.similarity(other_doc):.4f}")


=== 语义相似度 (Similarity) ===
Dog <-> Cat: 1.0000
Dog <-> Apple: 0.2334


In [26]:
# Rule-based matching (Matcher) - advanced feature.
# Looks for spans shaped like "buying ... startup" or "bought ... company".
matcher = Matcher(nlp.vocab)
pattern = [
    dict(POS="VERB"),                                       # a verb (bought/buy)
    dict(OP="*"),                                           # any tokens in between
    dict(LOWER={"IN": ["twitter", "company", "startup"]}),  # the target word
]
matcher.add("AcquisitionPattern", [pattern])

matches = matcher(doc)
print("\n=== 自定义规则匹配结果 ===")
# Each match is (match_id, start, end); start/end are used to slice `doc`.
for match_id, start, end in matches:
    span = doc[start:end]
    # Look the match id up in the vocab's string store to recover the rule name.
    string_id = nlp.vocab.strings[match_id]
    print(f"Match ID: {string_id}, Text: '{span.text}'")


=== 自定义规则匹配结果 ===
Match ID: AcquisitionPattern, Text: 'bought Twitter'
