In [2]:
import io
import os
import re
from collections import Counter

import fitz  # PyMuPDF
import pandas as pd
import pytesseract
import spacy
from PIL import Image

# ================= Configuration =================
# Path to the Tesseract executable. The original Windows install location is
# kept as the default so existing setups behave exactly as before; set the
# TESSERACT_CMD environment variable to point at a different binary without
# editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'd:\Tesseract-OCR\tesseract.exe'
)

# Load the spaCy English pipeline; later cells use it through the global `nlp`.
nlp = spacy.load("en_core_web_sm")

print("环境加载完成。OCR 引擎已就绪。")

环境加载完成。OCR 引擎已就绪。


In [3]:
def smart_extract_text_from_pdf(pdf_path, min_chars=50, zoom=2, ocr_lang='eng'):
    """
    Extract the text of a PDF, falling back to OCR for image-like pages.

    For every page the embedded text layer is read first. If the stripped
    text has fewer than ``min_chars`` characters, the page is rendered to a
    PNG at ``zoom`` times the default resolution and passed to Tesseract.

    Args:
        pdf_path: Path of the PDF file to open.
        min_chars: Pages whose stripped text is shorter than this are OCR'd.
        zoom: Scale factor applied on both axes when rendering a page for OCR.
        ocr_lang: Tesseract language code passed to ``image_to_string``.

    Returns:
        The text of all pages joined with newline characters.
    """
    page_texts = []

    # The context manager releases the document handle even when text
    # extraction or OCR raises part-way through the file.
    with fitz.open(pdf_path) as doc:
        print(f"正在处理文件: {pdf_path} (共 {len(doc)} 页)...")

        for page_num, page in enumerate(doc):
            # First choice: the text layer embedded in the PDF.
            text = page.get_text()

            # Very little text suggests a scanned page or a pure figure.
            if len(text.strip()) < min_chars:
                print(f"  -> 第 {page_num + 1} 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...")

                # Render the page and hand the PNG bytes to PIL; the image is
                # closed as soon as Tesseract has read it.
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                    ocr_text = pytesseract.image_to_string(img, lang=ocr_lang)

                # Keep the (short) embedded text when OCR recognises nothing,
                # so a page number or caption is not thrown away.
                if ocr_text.strip():
                    text = ocr_text

            page_texts.append(text)

    print("PDF 读取与识别完成！")
    return "\n".join(page_texts)

# === Demo run ===
# Set this to the name of the PDF you uploaded; both text-based and scanned
# PDFs can be used to try the extractor.
pdf_filename = "paper.pdf"

# Number of leading characters echoed below as a quick sanity check.
PREVIEW_CHARS = 500

if os.path.exists(pdf_filename):
    extracted_text = smart_extract_text_from_pdf(pdf_filename)
else:
    print(f"【警告】找不到 {pdf_filename}，请将你的论文 PDF 拖入左侧文件栏并重命名。")
    # Fall back to a short stand-in text so the following cells still run.
    extracted_text = "This is a placeholder text for generative artificial intelligence and neural networks."

# Preview the start of the text; the ellipsis is added only when the text is
# actually longer than the preview.
print("\n=== 提取内容预览 ===")
print(extracted_text[:PREVIEW_CHARS] + ("..." if len(extracted_text) > PREVIEW_CHARS else ""))

正在处理文件: paper.pdf (共 25 页)...
  -> 第 19 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...
  -> 第 20 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...
  -> 第 21 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...
  -> 第 22 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...
  -> 第 23 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...
  -> 第 24 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...
  -> 第 25 页看起来像图片/扫描件，正在进行 OCR 识别 (可能稍慢)...
PDF 读取与识别完成！

=== 提取内容预览 ===
Nature Health | Volume 1 | January 2026 | 78–89
78
nature health
Article
https://doi.org/10.1038/s44360-025-00010-z
A videogame for perceived risk of harm from 
opioid misuse in adolescents: a randomized 
controlled trial
 
Tyra Boomer 
  1,2 
, Lily Hoerner 
  3, Kaitlyn Larkin4, Kaitlin Maciejewski5, 
Tassos C. Kyriakides5 & Lynn E. Fiellin 
  1,2
The opioid epidemic greatly impacts adolescents, especially those with low 
perceived risk of harm—an important predictor of misuse initiation. Here...


In [4]:
def clean_academic_text(text):
    """
    Flatten extracted PDF text into a single whitespace-normalised line.

    Steps:
    1. Repair words split by a hyphen at a line break ("net-\\nwork" ->
       "network"). Spaces/tabs around the break and CRLF line endings are
       tolerated.
    2. When either side of such a break is not a letter (e.g. "COVID-\\n19",
       "3-\\nmonth"), keep the hyphen and only remove the break.
    3. Collapse every run of whitespace, including newlines, to one space.

    Limitation: a genuinely hyphenated compound broken at its hyphen
    ("well-\\nknown") is indistinguishable from a split word and is joined.

    Args:
        text: Raw text, typically one page per line block.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    print("正在清洗文本...")

    # Hyphen followed by a line break, with optional horizontal whitespace.
    line_break = r'-[ \t]*\r?\n[ \t]*'

    # 1. Letter on both sides: a word was split, so drop hyphen and break.
    text = re.sub(r'(?<=[^\W\d_])' + line_break + r'(?=[^\W\d_])', '', text)

    # 2. Remaining word-character pairs involve a digit or underscore; the
    #    hyphen is part of the token, so keep it and drop only the break.
    text = re.sub(r'(?<=\w)' + line_break + r'(?=\w)', '-', text)

    # 3. \s+ also matches newlines, so this both joins lines and squeezes
    #    repeated spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Normalise the raw extraction; later cells read the result from cleaned_text.
cleaned_text = clean_academic_text(text=extracted_text)
print("文本清洗完成。")


正在清洗文本...
文本清洗完成。


In [7]:
def extract_terms_with_context(text, top_n=50, max_chars=1000000):
    """
    Build a glossary of frequent multi-word noun phrases with example sentences.

    The text is parsed with the global spaCy pipeline ``nlp``. Noun chunks are
    lower-cased and kept as candidate terms when they are longer than two
    characters, contain a space, contain no digit, and do not start with a
    pronoun. The ``top_n`` most frequent candidates are returned, each with
    the first sentence whose lower-cased text contains the term.

    Args:
        text: Cleaned source text.
        top_n: Maximum number of terms to return.
        max_chars: Only the first ``max_chars`` characters are analysed; a
            notice is printed when the text is cut. NOTE(review): spaCy
            pipelines reject inputs longer than ``nlp.max_length`` — confirm
            that limit before raising this value.

    Returns:
        A DataFrame with the columns "Term (Source)", "Frequency",
        "Context (Example Sentence)" and "Translation (CN)" (left empty for
        the translator). The columns are present even when no term is found.
    """
    columns = [
        "Term (Source)",
        "Frequency",
        "Context (Example Sentence)",
        "Translation (CN)",
    ]

    print("正在进行 NLP 分析与术语提取...")
    if len(text) > max_chars:
        # Make the truncation visible instead of silently dropping the tail.
        print(f"【警告】文本共 {len(text)} 个字符，仅分析前 {max_chars} 个字符。")
    doc = nlp(text[:max_chars])

    # Collect candidate terms from the noun chunks.
    candidates = []
    for chunk in doc.noun_chunks:
        term = chunk.text.lower().strip()
        # Compound terms only: "machine learning" qualifies, "learning" does not.
        is_compound = len(term) > 2 and " " in term
        has_digit = any(char.isdigit() for char in term)
        # Skip phrases that open with a pronoun, e.g. "our model".
        if is_compound and not has_digit and chunk[0].pos_ != "PRON":
            candidates.append(term)

    term_counts = Counter(candidates)

    # Lower-case every sentence once rather than once per term.
    sentences = [(sent.text.lower(), sent.text.strip()) for sent in doc.sents]

    results = []
    for term, freq in term_counts.most_common(top_n):
        # Example sentence: the first one containing the term as a substring.
        example_sentence = next(
            (original for lowered, original in sentences if term in lowered),
            "Not found",
        )
        results.append({
            "Term (Source)": term,
            "Frequency": freq,
            "Context (Example Sentence)": example_sentence,
            "Translation (CN)": "",  # left blank for the translator
        })

    # Explicit columns keep the schema stable when nothing was extracted.
    return pd.DataFrame(results, columns=columns)

# Build the glossary from the cleaned text with the default number of terms.
df_glossary = extract_terms_with_context(text=cleaned_text)

# Show a preview (at most 100 rows) as a rich table; display() is provided by
# the Jupyter/IPython session.
print("\n=== 提取的术语表预览 ===")
display(df_glossary.head(n=100))

正在进行 NLP 分析与术语提取...

=== 提取的术语表预览 ===


Unnamed: 0,Term (Source),Frequency,Context (Example Sentence),Translation (CN)
0,opioid misuse,36,A videogame for perceived risk of harm from op...,
1,perceived risk,22,A videogame for perceived risk of harm from op...,
2,the study,22,Withdrawn: 1 3 months Completed 3-month assess...,
3,great risk,18,"At 3 months, 29% of PlaySmart versus 23% of co...",
4,the playsmart group,14,"Primary outcome At baseline, 16% (43/269) of t...",
5,the control group,13,"Primary outcome At baseline, 16% (43/269) of t...",
6,nature health | volume,11,Nature Health | Volume 1,
7,controlled trial,10,A videogame for perceived risk of harm from op...,
8,the game,10,They answered agree/strongly agree to the foll...,
9,substance use,9,"Eligible students—16–19 years old, no prior op...",


In [None]:
# Workbook written next to the notebook.
output_filename = "Paper_Glossary_Analysis.xlsx"

# One-cell frame holding the start of the cleaned text for side-by-side
# reference. Only the first 30000 characters are stored because a single
# Excel cell cannot hold unlimited text.
df_raw = pd.DataFrame([cleaned_text[:30000]], columns=["Full Text Content"])

# The writer is used as a context manager so the file is saved on exit.
with pd.ExcelWriter(output_filename) as writer:
    # Sheet 1: the glossary; sheet 2: the reference text.
    df_glossary.to_excel(writer, sheet_name="Terminology", index=False)
    df_raw.to_excel(writer, sheet_name="Source Text", index=False)

print(f"\n成功！文件已生成: {output_filename}")
print("请在左侧文件浏览器中右键点击该文件，选择 'Download' 进行下载。")