In [1]:
import os
# These variables are assigned before the Hugging Face imports further down,
# presumably so the libraries see them when first loaded — keep this ordering.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"  # Important: use the hf-mirror.com mirror (faster from mainland China)
os.environ["HF_HUB_DOWNLOAD_TIMEOUT"] = "300"  # raise the download timeout to 5 minutes
os.environ["HUGGINGFACE_HUB_CACHE"] = os.path.expanduser("~/huggingface_cache")  # hub cache lives under ~/huggingface_cache
import json
from typing import List

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
# Load variables from a .env file into the environment. ChatOpenAI is constructed
# later without explicit credentials, so they presumably come from here — verify.
load_dotenv()

True

In [4]:
def partition_document(path: str):
    """Split a PDF into ``unstructured`` elements and report how many were found.

    Args:
        path: Location of the PDF file to partition.

    Returns:
        The elements produced by ``partition_pdf`` for that file.
    """
    pdf_options = {
        "filename": path,
        # High-resolution strategy: slower, but more accurate.
        "strategy": "hi_res",
        # Extract tables as structured HTML.
        "infer_table_structure": True,
        # Image blocks are the element type to extract.
        "extract_image_block_types": ["Image"],
        # Embed each extracted image in the element payload as base64.
        "extract_image_block_to_payload": True,
    }
    parsed = partition_pdf(**pdf_options)
    print("Elements Num: ", len(parsed))
    return parsed

# Partition the sample paper; the path is relative to the notebook's working directory.
file_path = "./docs/Concept than Document.pdf"
elements = partition_document(file_path)

Elements Num:  734


In [7]:
# Display the partitioned elements (the cell's last expression is rendered as its output).
elements

[<unstructured.documents.elements.Text at 0x7a1b221e3380>,
 <unstructured.documents.elements.Text at 0x7a1b22b55710>,
 <unstructured.documents.elements.Text at 0x7a1b221e38c0>,
 <unstructured.documents.elements.Text at 0x7a1b221e3b60>,
 <unstructured.documents.elements.Text at 0x7a1b221e3ee0>,
 <unstructured.documents.elements.Header at 0x7a1b4aa6b360>,
 <unstructured.documents.elements.Text at 0x7a1b22b54050>,
 <unstructured.documents.elements.Text at 0x7a1b22b542f0>,
 <unstructured.documents.elements.Text at 0x7a1b22b544b0>,
 <unstructured.documents.elements.Text at 0x7a1b22b54670>,
 <unstructured.documents.elements.Text at 0x7a1b22b54c90>,
 <unstructured.documents.elements.Text at 0x7a1b22b54e50>,
 <unstructured.documents.elements.Title at 0x7a1b221e1a20>,
 <unstructured.documents.elements.Title at 0x7a1b221e1f60>,
 <unstructured.documents.elements.NarrativeText at 0x7a1b221e2f20>,
 <unstructured.documents.elements.Title at 0x7a1b221e31c0>,
 <unstructured.documents.elements.Text at 

In [14]:
# Inspect a single element (index 12) as a plain dict: type, text and metadata.
elements[12].to_dict()

{'type': 'Title',
 'element_id': 'ccd34e67898751c9d16386c39bd74c99',
 'text': 'Concept than Document: Context Compression via AMR-based Conceptual Entropy',
 'metadata': {'detection_class_prob': 0.5847083330154419,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(372.43353271484375),
     np.float64(217.78654388888867)),
    (np.float64(372.43353271484375), np.float64(301.91487722222234)),
    (np.float64(1274.264404296875), np.float64(301.91487722222234)),
    (np.float64(1274.264404296875), np.float64(217.78654388888867))),
   'system': 'PixelSpace',
   'layout_width': 1654,
   'layout_height': 2339},
  'last_modified': '2025-11-25T17:13:34',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'file_directory': './docs',
  'filename': 'Concept than Document.pdf',
  'parent_id': '6cd49bf08086157d27a846b1e1a2d5ab'}}

In [15]:
# Collect the elements categorised as images, then show how many there are.
images = [el for el in elements if el.category == "Image"]
len(images)

3

In [16]:
# Inspect the first image element; its metadata carries the base64 payload under `image_base64`.
images[0].to_dict()

{'type': 'Image',
 'element_id': '26a3762c15ba266c1781daae23f388a7',
 'text': '9 What is George Rankin\'s occupation? fan) Conceptual Entropy-based Compression Major General George James Rankin, was an Australian soldier and politician. He served in both the House of Representatives and the... Entropy — Pruning "8 Assessment soldier, Australian, politician, George James Rankin, major-general. serve, House of Representatives, Senate, Country Party of Australia egw Politician ©',
 'metadata': {'coordinates': {'points': ((np.float64(850.3944444444444),
     np.float64(610.8533541000002)),
    (np.float64(850.3944444444444), np.float64(1012.525)),
    (np.float64(1456.6782608444444), np.float64(1012.525)),
    (np.float64(1456.6782608444444), np.float64(610.8533541000002))),
   'system': 'PixelSpace',
   'layout_width': 1654,
   'layout_height': 2339},
  'last_modified': '2025-11-25T17:13:34',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'image_base64': '/

In [17]:
# Collect the elements categorised as tables, then show how many there are.
tables = [el for el in elements if el.category == "Table"]
len(tables)

7

In [18]:
# Inspect the first table element; its metadata includes an HTML rendering under `text_as_html`.
tables[0].to_dict()

{'type': 'Table',
 'element_id': '6025f49371cfc81c60f320d520cb8ab2',
 'text': 'K= 1 2 3 4 5 6 7 8 9 PopQA 280 298 174 172 160 153 149 155 135 EQ 489 572 373 295 239 199 179 169 130 10 125 113',
 'metadata': {'detection_class_prob': 0.8351184725761414,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(855.9657592773438),
     np.float64(505.3922424316406)),
    (np.float64(855.9657592773438), np.float64(589.4478759765625)),
    (np.float64(1451.3629150390625), np.float64(589.4478759765625)),
    (np.float64(1451.3629150390625), np.float64(505.3922424316406))),
   'system': 'PixelSpace',
   'layout_width': 1654,
   'layout_height': 2339},
  'last_modified': '2025-11-25T17:13:34',
  'text_as_html': '<table><thead><tr><th>K=</th><th>1</th><th>2</th><th>3</th><th>4</th><th>5</th><th>6</th><th>7</th><th>8</th><th>9</th><th>10</th></tr></thead><tbody><tr><td>PopQA |</td><td>280</td><td>| 298</td><td>| 174</td><td>| 172</td><td>| 160</td><td>| 153</td><td>| 149</td><td>| 155</

In [19]:
def create_chunk_by_title(
    elements,
    max_characters=3000,
    new_after_n_chars=2400,
    combine_text_under_n_chars=500,
):
    """Group partitioned elements into chunks with ``chunk_by_title``.

    The three size limits were previously hard-coded; they are now keyword
    parameters whose defaults reproduce the original values.

    Args:
        elements: Elements to chunk (e.g. the output of ``partition_document``).
        max_characters: Forwarded unchanged to ``chunk_by_title``.
        new_after_n_chars: Forwarded unchanged to ``chunk_by_title``.
        combine_text_under_n_chars: Forwarded unchanged to ``chunk_by_title``;
            chunks shorter than this many characters are merged into a
            neighbouring chunk.

    Returns:
        The chunks returned by ``chunk_by_title``.
    """
    chunks = chunk_by_title(
        elements,
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
        combine_text_under_n_chars=combine_text_under_n_chars,
    )
    # Message previously read "creared" (typo).
    print(f"created {len(chunks)} chunks.")
    return chunks

# Chunk the partitioned elements; `chunks` feeds summarise_chunks further down.
chunks = create_chunk_by_title(elements)

creared 36 chunks.


In [20]:
# Display the chunk objects produced above.
chunks

[<unstructured.documents.elements.CompositeElement at 0x7a1b221416a0>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b47b0ac10>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b4abe3250>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b49f329e0>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b49f303e0>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b49f02d50>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b4994de10>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b22b58270>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b4a0fa350>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b4a0f9e50>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b4dc206e0>,
 <unstructured.documents.elements.CompositeElement at 0x7a1b4dc21040>,
 <unstructured.documents.elements.CompositeElement at 0x7a1afabd4c90>,
 <unstructured.documents.elements.CompositeElement at 0x7a1afabd51d0>,
 <unst

In [21]:
# Inspect the second chunk as a plain dict.
chunks[1].to_dict()

{'type': 'CompositeElement',
 'element_id': '6412bffe-66ee-4900-beab-0ebfbb15ba68',
 'text': 'Introduction\n\nLarge Language Models (LLMs) are increasingly equipped with mechanisms to incorporate long con- texts, allowing them to leverage external informa- tion beyond their training data (Lewis et al., 2020; Karpukhin et al., 2020). However, as the context length grows, LLMs often struggle to effectively identify and utilize truly relevant information, lead- ing to performance degradation and inefficiency. This challenge, reflecting the trade-off between re- trieval recall and precision, becomes particularly acute in scenarios such as Retrieval-Augmented Generation (RAG), where the inclusion of more\n\n9 What is George Rankin\'s occupation? fan) Conceptual Entropy-based Compression Major General George James Rankin, was an Australian soldier and politician. He served in both the House of Representatives and the... Entropy — Pruning "8 Assessment soldier, Australian, politician, George 

In [22]:
# List the original elements this chunk was assembled from.
chunks[1].metadata.orig_elements

[<unstructured.documents.elements.Title at 0x7a1b22b55010>,
 <unstructured.documents.elements.NarrativeText at 0x7a1b22b551d0>,
 <unstructured.documents.elements.Image at 0x7a1b49f31220>,
 <unstructured.documents.elements.FigureCaption at 0x7a1b4a943570>,
 <unstructured.documents.elements.NarrativeText at 0x7a1b22b57c40>,
 <unstructured.documents.elements.NarrativeText at 0x7a1b22b549f0>,
 <unstructured.documents.elements.NarrativeText at 0x7a1b22b55390>]

In [31]:
def separate_content_types(chunk):
    """Split a chunk into its text, table HTML and base64 image payloads.

    Args:
        chunk: Object exposing ``text`` and, optionally,
            ``metadata.orig_elements`` (the elements the chunk was built from).

    Returns:
        dict with the keys:
            "text":   ``chunk.text`` unchanged.
            "tables": one entry per ``Table`` element — its ``text_as_html``
                      when set, otherwise the element's plain text.
            "images": the ``image_base64`` payload of every ``Image`` element
                      that actually has one.
            "types":  the content kinds present, de-duplicated in first-seen
                      order and always starting with "text".
    """
    content_data = {
        "text": chunk.text,
        "tables": [],
        "images": [],
        "types": ["text"],
    }

    # A metadata object can expose an attribute that is simply set to None, so
    # check the values themselves instead of relying on hasattr().
    chunk_metadata = getattr(chunk, "metadata", None)
    orig_elements = getattr(chunk_metadata, "orig_elements", None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, "metadata", None)

        if element_type == "Table":
            content_data["types"].append("table")
            # Fall back to the plain text when no HTML rendering is available.
            table_html = getattr(element_metadata, "text_as_html", None) or element.text
            content_data["tables"].append(table_html)
        elif element_type == "Image":
            image_base64 = getattr(element_metadata, "image_base64", None)
            # Skip images without a payload rather than storing None.
            if image_base64:
                content_data["types"].append("image")
                content_data["images"].append(image_base64)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    content_data["types"] = list(dict.fromkeys(content_data["types"]))
    return content_data

In [33]:
def create_ai_enhanced_summary(
    text: str,
    images: List[str],
    tables: List[str],
    model: str = "doubao-seed-1-6-lite-251015",
) -> str:
    """Ask a chat model for a searchable description of a chunk's content.

    The previous version had its ``return`` inside the ``except`` branch, so a
    successful call returned ``None`` and a failed call referenced an unbound
    ``response``. The summary is now returned on success and the raw ``text``
    is returned on failure.

    Args:
        text: Text of the chunk.
        images: Base64-encoded images attached to the request as image parts.
        tables: Table renderings appended to the prompt.
        model: Chat model name passed to ``ChatOpenAI``; defaults to the model
            that was previously hard-coded.

    Returns:
        ``str(response.content)`` on success, otherwise ``text`` unchanged.
    """
    try:
        llm = ChatOpenAI(model=model)
        prompt = f"""你正在创建一个可供搜索的描述信息用于文档内容的检索
        需要分析的内容如下：
        文本内容：
        {text}
        """
        # NOTE(review): the task instructions are only appended when tables are
        # present; image-only chunks get no explicit instructions — confirm intent.
        if tables:
            prompt += "表格内容如下：\n"
            for i, table in enumerate(tables, start=1):
                prompt += f"table-{i}:\n{table}\n\n"
            prompt += """针对表格内容，你的任务是：生成一个全面的，详细的，可供搜索的表格阐述，包含以下内容：
            1. 表格中的关键数据以及文本内容
            2. 主要讨论的主题以及概念
            3. 创建针对该内容可能需要回答的问题
            4. 如果涉及到相关的其他图像或者表格内容，需要进行联合分析并进行总结
            确保你给出的内容详尽并且可以用于用户搜索
            """
        message_content = [{"type": "text", "text": prompt}]
        for image_base64 in images:
            # Payload is labelled as JPEG; assumes the extracted images are JPEG — TODO confirm.
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
            })
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        return str(response.content)
    except Exception as e:
        # Best effort: report the cause and let the caller index the raw text.
        print(f"ai summary error! {type(e).__name__}: {e}")
        return text

In [37]:
def summarise_chunks(chunks):
    """Turn chunks into LangChain ``Document`` objects ready for indexing.

    For each chunk the text, tables and images are separated. Chunks that carry
    tables or images get their ``page_content`` from
    ``create_ai_enhanced_summary``; text-only chunks, and chunks whose summary
    comes back as ``None``, keep the raw text. The raw text, table HTML and
    base64 images are stored as a JSON string under the ``original_content``
    metadata key.

    Args:
        chunks: Iterable of chunks accepted by ``separate_content_types``.

    Returns:
        A list with one ``Document`` per chunk, in input order.
    """

    def _to_document(chunk):
        """Build the Document for a single chunk."""
        parts = separate_content_types(chunk)

        # Guard against None values in the separated content.
        raw_text = parts.get("text") or ""
        tables = parts.get("tables") or []
        images = parts.get("images") or []

        has_rich_content = bool(tables or images)
        page_content = (
            create_ai_enhanced_summary(raw_text, images, tables)
            if has_rich_content
            else raw_text
        )

        if page_content is None:
            page_content = raw_text
        elif not isinstance(page_content, str):
            page_content = str(page_content)

        original_payload = {
            "raw_text": raw_text,
            "tables_html": tables,
            "images_base64": images,
        }
        return Document(
            page_content=page_content,
            metadata={
                "original_content": json.dumps(original_payload, ensure_ascii=False)
            },
        )

    langchain_documents = [_to_document(chunk) for chunk in chunks]

    print(f"Processed {len(langchain_documents)} chunks")
    return langchain_documents

In [38]:
# Build one Document per chunk; chunks with tables or images go through the LLM summary step.
processed_chunks = summarise_chunks(chunks)

Processed 36 chunks


In [39]:
# Display the generated Documents. Their metadata embeds the base64 images, so this output can be large.
processed_chunks

[Document(metadata={'original_content': '{"raw_text": "5\\n\\n2025\\n\\n2\\n\\n0\\n\\n2\\n\\nv o N 4 2 ] L C . s c [ 1 v 2 3 8 8 1 . 1 1 5 2\\n\\n:\\n\\nv\\n\\ni\\n\\nX\\n\\nr\\n\\na\\n\\nConcept than Document: Context Compression via AMR-based Conceptual Entropy\\n\\nKaize Shi ♡, Xueyao Sun♣,♠, Xiaohui Tao ♡, Lin Li ‹, Qika Lin ■, Guandong Xu♢\\n\\n♡ University of Southern Queensland, ♣ University of Technology Sydney ♠ The Hong Kong Polytechnic University, ‹ Wuhan University of Technology ■ National University of Singapore, ♢ The Education University of Hong Kong\\n\\nAbstract\\n\\nLarge Language Models (LLMs) face informa-\\n\\ntion overload when handling long contexts, par- ticularly in Retrieval-Augmented Generation (RAG) where extensive supporting documents often introduce redundant content. This issue not only weakens reasoning accuracy but also increases computational overhead. We propose an unsupervised context compression frame- work that exploits Abstract Meaning Represen- t

In [41]:
def create_vector_store(documents, persist_directory="./dbv1/chroma_db"):
    """Embed the documents and store them in a Chroma vector store.

    Args:
        documents: Documents to embed and index.
        persist_directory: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    # NOTE(review): re-running this against an existing persist_directory
    # presumably adds the documents again — confirm before re-executing.
    embedding_options = {
        "model_name": "mixedbread-ai/mxbai-embed-large-v1",
        # Embeddings are normalised and the collection uses cosine space.
        "encode_kwargs": {"normalize_embeddings": True},
    }
    store_options = {
        "documents": documents,
        "embedding": HuggingFaceEmbeddings(**embedding_options),
        "persist_directory": persist_directory,
        "collection_metadata": {"hnsw:space": "cosine"},
    }
    store = Chroma.from_documents(**store_options)
    print("finished vector store.")
    return store

In [42]:
# Embed the processed Documents and store them under the default persist directory.
db = create_vector_store(processed_chunks)

modules.json: 0.00B [00:00, ?B/s]

config_sentence_transformers.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

finished vector store.


In [43]:
# Retrieve the 3 stored Documents closest to the query.
# NOTE: this rebinds `chunks`, which earlier held the output of create_chunk_by_title.
# Re-running the summarise_chunks(chunks) cell after this one would receive these
# retrieved Documents instead of the original chunks.
query = "What is the AMR Graph in this aritcle?" 
retriever = db.as_retriever(search_kwargs={"k": 3})
chunks = retriever.invoke(query)
chunks

[Document(id='fdd729ae-73c7-4262-818f-04d408b439cd', metadata={'original_content': '{"raw_text": "3.2 AMR Graph Construction\\n\\nFor each document di P D, we construct it to the sentence-level AMR graphs with an mBart-based parser 1 trained in the AMR 3.0 corpus 2 to address potential multilingual concerns. Let Gi “ pVi,Eiq denote the AMR graph for document di, where Vi represents the set of concept nodes and Ei rep- resents the semantic relations between concepts. Each concept node v P Vi corresponds to a seman- tic concept (e.g., entities, predicates, or modifiers) and is associated with its textual realization in the raw document. The edges in Ei represent semantic relationships such as agent-of (ARG0), patient-of (ARG1), and various semantic roles.\\n\\nOur approach is grounded in the cognitive hy- pothesis that both human comprehension and LLM inference can effectively reconstruct semantic sce- narios from discrete informative concepts without explicit relational encoding (Xu et 

In [46]:
def get_final_anwer(chunks, query, model="doubao-seed-1-6-lite-251015"):
    """Answer ``query`` with a chat model, using the retrieved documents as context.

    Each document's ``original_content`` metadata (a JSON string) is parsed
    once; its raw text and tables are appended to the prompt and its base64
    images are attached as image parts after the text part.

    Args:
        chunks: Retrieved documents exposing a ``metadata`` mapping.
        query: The user's question.
        model: Chat model name passed to ``ChatOpenAI``; defaults to the model
            that was previously hard-coded.

    Returns:
        ``response.content`` from the model, or a fixed apology string when
        anything in the process raises.
    """
    try:
        llm = ChatOpenAI(model=model)
        prompt = f"""Based on the following documents, Please answer my question: {query}
        documents content as follows:
        """
        images_base64 = []
        for i, chunk in enumerate(chunks, start=1):
            prompt += f"---Document {i}---\n"
            if "original_content" not in chunk.metadata:
                continue
            # Parse the stored payload once and reuse it for text, tables and images.
            original_data = json.loads(chunk.metadata["original_content"])
            raw_text = original_data.get("raw_text", "")
            if raw_text:
                prompt += f"Text Content:\n{raw_text}\n\n"
            tables_html = original_data.get("tables_html") or []
            if tables_html:
                prompt += "Tables content:\n"
                for j, table in enumerate(tables_html, start=1):
                    prompt += f"---table {j} --- \n{table}\n\n"
            images_base64.extend(original_data.get("images_base64") or [])
        prompt += """请根据上述给定的文档内容，例如文本，表格以及图片，使用中文回答我的问题，如果文档中的内容无法回答提出的问题，请你回答：信息不足，无法给出回答！"""

        message_content = [{"type": "text", "text": prompt}]
        # Images follow the text part, in document order.
        message_content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{img}"}
            }
            for img in images_base64
        )
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        return response.content
    except Exception as e:
        # Keep the best-effort contract (callers still get a string back), but
        # report the cause instead of discarding it.
        print(f"get_final_anwer failed: {type(e).__name__}: {e}")
        return "sorry, I encountered a error while generating this question."

In [47]:
# Generate the answer from the retrieved Documents (`chunks` was rebound by the retrieval cell) and print it.
final_answer = get_final_anwer(chunks, query)
print(final_answer)

文中的AMR图（Abstract Meaning Representation Graph，抽象意义表示图）是对文档构建的**句子级语义结构**，核心信息如下：  

### 1. 构建方式  
对每个文档\( d_i \)，使用**基于mBart的解析器**（在AMR 3.0语料库中训练）生成句子级AMR图，以应对潜在的多语言问题。  


### 2. 结构定义  
AMR图记为\( G_i = (V_i, E_i) \)，包含两部分：  
- **节点\( V_i \)**：集合中的每个节点对应一个**语义概念**（如实体、谓词、修饰语等），且关联原始文档中的文本实现（即该概念在原文中的表述）。  
- **边\( E_i \)**：表示概念间的**语义关系**（如施事\( \text{ARG0} \)、受事\( \text{ARG1} \)等语义角色关系）。  


### 3. 核心特点  
- 抽象句法变异：不依赖具体句法形式，适合跨语言、跨领域应用；  
- 节点编码高熵语义：捕捉丰富概念信息，可用于结构化上下文工程；  
- 文中后续设计中会**丢弃显式边\( E_i \)**，仅保留概念节点\( V_i \)，以避免人工关系符号干扰大模型理解，同时利用模型对概念间隐式关系的推理能力。  


（结合图中示例，AMR图包含带语义标签的节点（如“soldier”“politician”）和带关系标签的边（如\( \text{:mod} \)“country”、\( \text{:name} \)“George”等），直观呈现语义概念及关联。）
