In [1]:
from llama_parse import LlamaParse
from llama_index.core import Document
from llama_index.core import SimpleDirectoryReader, Settings, PromptTemplate
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.llms.openai import OpenAI
import chromadb
from chromadb.config import Settings
import os
from dotenv import load_dotenv
import json
import re

load_dotenv()

True

In [2]:
# Đặt OpenAI API key
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv("LLAMA_API_KEY")
embedding = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.1)

# Định dạng marker
PAGE_MARKER_TEMPLATE = "[[__PAGE_{page}__]]"
PAGE_MARKER_RE = re.compile(r"\[\[__PAGE_(\d+)__\]\]")
HEADING_RE = re.compile(r'^\s*(#{1,6})\s+(.*\S)\s*$')

In [18]:
# 1. Parse PDF sang Markdown
parser = LlamaParse(
    result_type="markdown",
    auto_mode=True,
    auto_mode_trigger_on_image_in_page=True,
    auto_mode_trigger_on_table_in_page=True,
    skip_diagonal_text=True,
    preserve_layout_alignment_across_pages=True,
    num_workers=4,
    max_timeout=180,
    )  # hoặc "md"
pdf_path = r"D:\\study\\LammaIndex\\documents\\NetSure_732_Brochure.pdf"
file_name = os.path.splitext(os.path.basename(pdf_path))[0]
print("Đang parse PDF sang Markdown...")
parsed_docs = parser.load_data(pdf_path)  # Mỗi trang PDF -> 1 Document dạng markdown

Đang parse PDF sang Markdown...
Started parsing the file under job_id 6619678f-0830-4724-8a85-2c90eba865d4


In [19]:
product_name = "Unknown Product"

In [20]:
parts = []
for i, d in enumerate(parsed_docs, start=1):
    if i==1:
        content = d.text.split('\n')
        # Tìm dòng đầu tiên bắt đầu bằng '#'
        first_hash_line = next((line for line in content if line.startswith("#")), None)
        print(first_hash_line)
        if first_hash_line.startswith("# "):
            product_name = first_hash_line.strip("# ").strip()
    parts.append(d.text)
    parts.append(f"\n{PAGE_MARKER_TEMPLATE.format(page=i)}\n")

merged_text = "".join(parts)

# NetSure™ 732 A41


In [21]:
product_name

'NetSure™ 732 A41'

In [22]:
big_document = Document(
    text=merged_text,
    metadata={
        "file_name": file_name,
        "product_name": product_name,
    },
)

In [23]:
md_parser = MarkdownNodeParser(include_metadata=True, include_prev_next_rel=True)
nodes = md_parser.get_nodes_from_documents([big_document])

In [24]:
def find_all_pages_in_text(text: str):
    """Trả về list số trang xuất hiện trong text theo thứ tự. Loại bỏ trùng lặp liên tiếp."""
    pages = [int(m.group(1)) for m in PAGE_MARKER_RE.finditer(text)]
    dedup = []
    for p in pages:
        if not dedup or dedup[-1] != p:
            dedup.append(p)
    return dedup
def strip_page_markers(text: str) -> str:
    """Xoá marker trang khỏi text để không ảnh hưởng embedding."""
    cleaned = PAGE_MARKER_RE.sub("", text)
    return cleaned.strip("\n\r ")

In [25]:
def extract_levels_from_text(text: str):
    """
    Duyệt tất cả dòng trong node (đã sạch marker) để tìm heading.
    Trả về (level1, level2) — nếu không thấy thì None.
    Khi gặp # mới -> reset level2 (vì bối cảnh đổi).
    Lấy heading cuối cùng của từng cấp xuất hiện trong node.
    """
    level1 = None
    level2 = None
    for line in text.splitlines():
        m = HEADING_RE.match(line)
        if not m:
            continue
        hashes, title = m.groups()
        depth = len(hashes)
        title = title.strip()
        if depth == 1:
            level1 = title
            level2 = None  # reset khi vào level1 mới
        elif depth == 2:
            level2 = title
        # depth >= 3: bỏ qua (có thể lưu sau)
    return level1, level2

In [27]:
def get_llm_summary(node_content: str) -> dict:
    """
    Gọi LLM để lấy thông tin từ node_content:
    - summary: 1 câu tóm tắt ngắn gọn.
    - table_name: tên bảng (hoặc 'None' nếu không có).
    - figure_name: tên hình (hoặc 'None' nếu không có).
    Trả về dict.
    """
    template = (
        "Analyze the following text and provide a JSON object with the fields:\n"
        "- summary: A one-sentence summary of the text.\n"
        "- table_name: If a table is present, provide the table name along with a brief summary of it; otherwise, return 'None'\n"
        "- figure_name: The name of the figure if present, otherwise 'None'.\n\n"
        "Text:\n---\n{node_content}\n---\n\n"
        "Return ONLY the JSON object, nothing else."
    )
    prompt_template = PromptTemplate(template)
    response = Settings.llm.predict(prompt_template, node_content=node_content).strip()
    # Ví dụ response: {"summary": "This text describes ...", "table_name": "Table 1", "figure_name": "None"}
    response = re.sub(r"^```(json)?|```$", "", response, flags=re.MULTILINE).strip()
    result = json.loads(response)
    print(result)
    # Đảm bảo luôn có 3 key
    for key in ["summary", "table_name", "figure_name"]:
        result.setdefault(key, "None")
    return result

In [28]:
metadata_dict = {}
current_page = 0  # bắt đầu theo yêu cầu của bạn

for idx, node in enumerate(nodes, start=1):
    pages_found = find_all_pages_in_text(node.text)

    if not pages_found:
        node_page = current_page + 1
        
    elif len(pages_found) == 1:
        # Node chứa đúng 1 marker
        node_page = pages_found[0]
        current_page = pages_found[0]

    else:
        # Node chứa Nhiều marker (bắc cầu nhiều trang)
        node_page = pages_found[0]        # page của node = page đầu tiên
        current_page = pages_found[-1]    # cập nhật current sang page cuối cùng

    # Gắn page chính cho node
    node.metadata["page"] = node_page

    # Làm sạch text
    cleaned_text = strip_page_markers(node.text)
    node.text = cleaned_text

        # --- 3. Trích heading cấp # và ## từ text đã sạch ---
    lvl1_local, lvl2_local = extract_levels_from_text(cleaned_text)

    # Cập nhật ngữ cảnh heading
    if lvl1_local is not None:
        current_level1 = lvl1_local
        current_level2 = None  # reset khi đổi level1
    if lvl2_local is not None:
        current_level2 = lvl2_local

    # --- 4. Gán heading ngữ cảnh vào metadata ---
    node.metadata["level1"] = current_level1
    node.metadata["level2"] = current_level2

    summary_info = get_llm_summary(node.text)
    node.metadata["summary"] = summary_info["summary"]
    node.metadata["table_name"] = summary_info["table_name"]
    node.metadata["figure_name"] = summary_info["figure_name"]
    # Lưu metadata
    metadata_dict[idx] = dict(node.metadata)

{'summary': 'The text describes the NetSure™ 732 A41, a 48V DC embedded power supply.', 'table_name': 'None', 'figure_name': 'None'}
{'summary': 'The text describes the key features of an ultra-compact power system that offers high efficiency, adaptability, and multiple communication interfaces.', 'table_name': 'None', 'figure_name': 'None'}
{'summary': 'NetSure™ 732 A41 is a versatile rack-mounted power system designed for telecom sites, featuring advanced technology for high energy efficiency and remote monitoring.', 'table_name': {'name': 'System Configuration', 'summary': 'The table outlines the specifications of the NetSure™ 732 A41 power system, detailing its AC and DC input capacities, control and power modules, input voltages, output voltages, and optional items.'}, 'figure_name': 'None'}
{'summary': 'The text provides detailed specifications for the NetSure™ 732 A41 power converter, including electrical, mechanical, and controller parameters.', 'table_name': 'Configuration and

In [29]:
with open("NetSure_732_Brochure.json", "w", encoding="utf-8") as f:
    json.dump(metadata_dict, f, ensure_ascii=False, indent=4)

In [15]:
output_file = "NetSure_732_User_Manual.md"
with open(output_file, "w", encoding="utf-8") as f:
    for idx, node in enumerate(nodes, start=1):
        f.write(f"# Chunk {idx}\n")
        f.write(node.text + "\n\n\n\n")

print(f"Đã ghi {len(nodes)} chunk vào {output_file}")

Đã ghi 104 chunk vào NetSure_732_User_Manual.md
