In [1]:
from llama_parse import LlamaParse
from llama_index.core import Document
from llama_index.core import SimpleDirectoryReader, Settings, PromptTemplate
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.node_parser import MarkdownNodeParser
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.llms.openai import OpenAI
import chromadb
from chromadb.config import Settings
import os
from dotenv import load_dotenv
import json
import re

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [64]:
# Re-import llama_index's global Settings. The import cell binds the name
# `Settings` twice (llama_index.core first, chromadb.config afterwards), so
# without this line the LLM below would be attached to chromadb's Settings
# class and llama_index's global configuration would stay untouched.
from llama_index.core import Settings

# API keys are expected in the environment (load_dotenv() runs in the import cell).
# Fail early with a readable message instead of the TypeError that
# `os.environ[...] = None` would raise.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in the .env file."
    )

# The key is exported as LLAMA_CLOUD_API_KEY. LLAMA_API_KEY is the name this
# notebook reads; an already exported LLAMA_CLOUD_API_KEY is accepted as well.
_llama_cloud_key = os.getenv("LLAMA_API_KEY") or os.getenv("LLAMA_CLOUD_API_KEY")
if not _llama_cloud_key:
    raise RuntimeError(
        "LLAMA_API_KEY (or LLAMA_CLOUD_API_KEY) is not set; "
        "define it in the environment or in the .env file."
    )
os.environ["LLAMA_CLOUD_API_KEY"] = _llama_cloud_key

embedding = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.1)

# Page marker format: the template writes a marker, the regex finds it again
# and captures the page number.
PAGE_MARKER_TEMPLATE = "[[__PAGE_{page}__]]"
PAGE_MARKER_RE = re.compile(r"\[\[__PAGE_(\d+)__\]\]")
# Markdown ATX heading: group 1 = the '#' run (1-6), group 2 = the title text.
HEADING_RE = re.compile(r'^\s*(#{1,6})\s+(.*\S)\s*$')

In [105]:
# 1. Convert the PDF into Markdown with LlamaParse
pdf_path = r"D:/study/LammaIndex/documents/NetSure_732_User_Manual.pdf"
# Base name of the PDF without its extension; reused later as metadata.
file_name = os.path.splitext(os.path.basename(pdf_path))[0]

llama_parse_options = dict(
    result_type="markdown",  # author's note: "md" is an alternative value
    auto_mode=True,
    auto_mode_trigger_on_image_in_page=True,
    auto_mode_trigger_on_table_in_page=True,
    skip_diagonal_text=True,
    preserve_layout_alignment_across_pages=True,
    num_workers=4,
    max_timeout=180,
)
parser = LlamaParse(**llama_parse_options)

print("Đang parse PDF sang Markdown...")
# Per the author's note, every PDF page comes back as one Markdown Document.
parsed_docs = parser.load_data(pdf_path)

Đang parse PDF sang Markdown...
Started parsing the file under job_id 60454ee0-9108-451b-a4ec-71358dbacef2


In [106]:
# Display the first parsed Document as a quick sanity check of the parser output.
parsed_docs[0]

Document(id_='317eeb45-bb6d-4d0d-aa97-ab44601c696d', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='\n# NetSure 732 Subrack Power System User Manual\n\n资料版本 V1.1 Version V1.1\n\n归档日期 2019-12-22 Revision date December 22, 2019\n\nBOM 编码\n\n艾默生网络能源有限公司为客户提供全方位的 Vertiv provides customers with technical support.\n\n用户可与就近的艾默生网络能源有限公司办事处或客户服务中心联系，也可直接与公司总部联系。\n\n艾默生网络能源有限公司 Copyright © 2014 by Vertiv., Ltd.\n\n版权所有，保留一切权利。内容如有改动，恕不另行通知。 All rights reserved. The contents in this document are subject to change without notice.\n\n艾默生网络能源有限公司 Vertiv Co., Ltd.\n\nAddress: Block B2, Nanshan I Park, No.1001\n\n地址：深圳市南山区学苑大道 1001 号南山智园 B2 栋\n\n邮编：518055 Homepage: www.vertiv.com.cn\n\nE-mail: Support@vertiv.com\n\n客户服务热线：4008876510\n\nE-mail：Enpc.service@vertiv.com\n\n', path=None, url=None, mimetype=None), image_re

In [107]:
# Fallback product name; replaced further down when the first parsed page
# contains a non-empty first line.
product_name = "Unknown Product"

In [108]:
def clean_line(line: str) -> str:
    """
    Extract a plain title from a line that may be a Markdown heading.

    Surrounding whitespace is removed first, then everything from the first
    run of five or more whitespace characters onward is discarded, and
    finally the leading ``#`` heading markers are stripped.

    Args:
        line: A single line of text.

    Returns:
        The cleaned title, or an empty string when nothing remains.
    """
    # Strip before splitting: an indented line would otherwise split on its
    # own leading whitespace (yielding ""), and whitespace in front of the
    # '#' markers would stop lstrip('#') from removing them.
    head = re.split(r'\s{5,}', line.strip(), maxsplit=1)[0]
    return head.lstrip('#').strip()
def remove_duplicate_headers(markdown_content):
    """
    Drop repeated level-1 headings from a Markdown string.

    Only lines that start with '# ' are considered. The first occurrence of
    each heading title (compared after stripping whitespace) is kept and
    later occurrences are removed. Every other line is kept unchanged.

    Returns the remaining lines joined with '\n'.
    """
    h1_prefix = '# '
    titles_seen = set()

    def _should_keep(row):
        # Non-H1 lines always pass through.
        if not row.startswith(h1_prefix):
            return True
        title = row[len(h1_prefix):].strip()
        if title in titles_seen:
            return False
        titles_seen.add(title)
        return True

    kept_rows = [row for row in markdown_content.splitlines() if _should_keep(row)]
    return '\n'.join(kept_rows)

In [109]:
# Derive the product name from the first non-blank line of the first page.
if parsed_docs:
    first_page_lines = parsed_docs[0].text.split('\n')
    title_candidate = next((row for row in first_page_lines if row.strip()), None)
    if title_candidate:
        product_name = clean_line(title_candidate)

    print("Product name:", product_name)

# Concatenate all pages, closing each page with its own page marker.
parts = []
for page_no, page_doc in enumerate(parsed_docs, start=1):
    parts.extend((page_doc.text, f"\n{PAGE_MARKER_TEMPLATE.format(page=page_no)}\n"))

# Repeated level-1 headings are dropped from the merged text.
merged_text = remove_duplicate_headers("".join(parts))

Product name: NetSure 732 Subrack Power System User Manual


In [110]:
# Save the merged, de-duplicated Markdown so it can be inspected on disk.
cleaned_md_path = 'cleaned_markdown_file.md'
with open(cleaned_md_path, mode='w', encoding='utf-8') as md_out:
    md_out.write(merged_text)

In [111]:
# Display the product name that ends up in the document metadata.
product_name

'NetSure 732 Subrack Power System User Manual'

In [112]:
# Wrap the merged Markdown in a single Document carrying file-level metadata.
document_metadata = {"file_name": file_name, "product_name": product_name}
big_document = Document(text=merged_text, metadata=document_metadata)

In [113]:
# Split the merged document into nodes with llama_index's MarkdownNodeParser.
# NOTE(review): include_metadata / include_prev_next_rel presumably copy the
# document metadata onto each node and link neighbouring nodes — confirm
# against the llama_index documentation.
md_parser = MarkdownNodeParser(include_metadata=True, include_prev_next_rel=True)
nodes = md_parser.get_nodes_from_documents([big_document])

In [114]:
def find_all_pages_in_text(text: str):
    """Return the page numbers of all page markers in text, in order of
    appearance, with consecutive repeats collapsed into one entry."""
    collapsed = []
    previous = None
    for marker in PAGE_MARKER_RE.finditer(text):
        page_no = int(marker.group(1))
        # Only consecutive repeats are dropped; a page number may reappear
        # later in the list.
        if page_no != previous:
            collapsed.append(page_no)
            previous = page_no
    return collapsed
def strip_page_markers(text: str) -> str:
    """Remove every page marker from text so the markers do not affect the
    embedding, then trim newlines, carriage returns and spaces at both ends."""
    marker_free = re.sub(PAGE_MARKER_RE, "", text)
    return marker_free.strip(" \r\n")

In [115]:
def extract_levels_from_text(text: str):
    """
    Scan every line of a node (page markers already removed) for headings.

    Returns (level1, level2): the last '#' title and the last '##' title
    found, each None when absent. A new '#' heading resets level2 because
    the context has changed. Headings of depth 3 or more are ignored.
    """
    latest_h1 = latest_h2 = None
    for found in map(HEADING_RE.match, text.splitlines()):
        if found is None:
            continue
        depth = len(found.group(1))
        heading = found.group(2).strip()
        if depth == 1:
            # Entering a new level-1 section discards the old level-2 context.
            latest_h1, latest_h2 = heading, None
        elif depth == 2:
            latest_h2 = heading
        # depth >= 3: skipped (could be recorded later)
    return latest_h1, latest_h2

In [116]:
def _parse_llm_json(raw: str) -> dict:
    """Decode the JSON object contained in a raw LLM reply.

    Markdown code fences are stripped first. When the remaining text is not
    valid JSON on its own (e.g. the model wrapped the object in prose), the
    span from the first '{' to the last '}' is decoded instead.

    Raises:
        json.JSONDecodeError: no decodable JSON could be found.
        ValueError: the decoded JSON is not an object.
    """
    text = re.sub(r"^```(json)?|```$", "", raw, flags=re.MULTILINE).strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a JSON object from the LLM, got {type(parsed).__name__}: {text[:200]!r}"
        )
    return parsed


def _metadata_text(value) -> str:
    """Coerce one LLM-provided field to a string ('None' when it is missing or null)."""
    if value is None:
        return "None"
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value)
    return str(value)


def get_llm_summary(node_content: str) -> dict:
    """
    Ask the LLM (Settings.llm) to describe node_content.

    The returned dict always contains these keys, each holding a string:
    - summary: a one-sentence summary.
    - table_name: the table name, or 'None' when there is none.
    - figure_name: the figure name, or 'None' when there is none.

    Raises:
        json.JSONDecodeError: the reply contains no decodable JSON.
        ValueError: the reply decodes to something other than a JSON object.
    """
    template = (
        "Analyze the following text and provide a JSON object with the fields:\n"
        "- summary: A one-sentence summary of the text.\n"
        "- table_name: If a table is present, provide its name; if the table is only named like 'Table 1.1' or has no name, infer a meaningful name like 'Table 1-1 Configuration of power system'; otherwise, return 'None'.\n"
        "- figure_name: The name of the figure if present, otherwise 'None'.\n\n"
        "Text:\n---\n{node_content}\n---\n\n"
        "Return ONLY the JSON object, nothing else."
    )
    prompt_template = PromptTemplate(template)
    response = Settings.llm.predict(prompt_template, node_content=node_content).strip()
    # Example reply: {"summary": "This text describes ...", "table_name": "Table 1", "figure_name": "None"}
    result = _parse_llm_json(response)
    print(result)
    # Guarantee the three keys exist and hold strings: a JSON null or a
    # missing key becomes 'None', so callers can concatenate the values.
    for key in ["summary", "table_name", "figure_name"]:
        result[key] = _metadata_text(result.get(key))
    return result

In [117]:
def get_llm_summary_document(document_content: str) -> str:
    """Ask the LLM (Settings.llm) for a comprehensive summary of
    document_content and return the reply with surrounding whitespace removed."""
    summary_prompt = PromptTemplate(
        "Based on the following text, create a comprehensive summary for the entire document.\n"
        "Document:\n---\n{document_content}\n---\nSummary:"
    )
    return Settings.llm.predict(summary_prompt, document_content=document_content).strip()

In [118]:
import uuid

# Identifier stored in the metadata of every chunk and of the summary document.
# uuid4 (random) is used instead of uuid1: a version-1 UUID embeds the host's
# network address and a timestamp, which would be persisted with the metadata.
document_id = uuid.uuid4()

In [119]:
metadata_dict = {}
current_page = 0  # number of the last page whose closing marker has been seen
# Heading context carried from node to node. Both names are initialised up
# front: a first node without a level-1 heading would otherwise raise
# NameError when the context is written into the metadata below.
current_level1 = None
current_level2 = None
summary_document = ""
documents = []
for idx, node in enumerate(nodes, start=1):
    # --- 1. Work out the page of this node from the page markers ---
    pages_found = find_all_pages_in_text(node.text)

    if not pages_found:
        # No marker inside the node. Each marker is written after its page's
        # text, so the node sits on the page following the last closed one.
        node_page = current_page + 1
    else:
        # One or more markers (the node may span several pages): the node's
        # page is the first marked page, and the last marker becomes the most
        # recently closed page.
        node_page = pages_found[0]
        current_page = pages_found[-1]

    # Attach the main page to the node
    node.metadata["page"] = node_page

    # --- 2. Remove the markers from the text ---
    cleaned_text = strip_page_markers(node.text)
    node.text = cleaned_text

    # --- 3. Extract '#' and '##' headings from the cleaned text ---
    lvl1_local, lvl2_local = extract_levels_from_text(cleaned_text)

    # Update the heading context
    if lvl1_local is not None:
        current_level1 = lvl1_local
        current_level2 = None  # reset when the level-1 heading changes
    if lvl2_local is not None:
        current_level2 = lvl2_local

    # --- 4. Store the heading context in the metadata ---
    node.metadata["level1"] = current_level1
    node.metadata["level2"] = current_level2

    # --- 5. LLM-derived fields and identifiers ---
    summary_info = get_llm_summary(node.text)
    node.metadata["summary"] = summary_info["summary"]
    node.metadata["table_name"] = summary_info["table_name"]
    node.metadata["figure_name"] = summary_info["figure_name"]
    node.metadata["document_id"] = str(document_id)
    # Single quotes inside the f-string: reusing the outer double quote is a
    # SyntaxError on Python versions before 3.12.
    node.metadata["chunk_id"] = f"{node.metadata['file_name']}_{idx}"
    node.metadata["type"] = "chunk_document"
    summary_document += node.metadata["summary"] + "\n"
    documents.append(
        Document(
            text=cleaned_text,
            # Pass a copy so the Document does not share the node's dict.
            metadata=dict(node.metadata),
        )
    )
    # Keep a snapshot of the metadata
    metadata_dict[idx] = dict(node.metadata)

# One extra Document holding a summary of the whole document, built from the
# per-chunk summaries collected above.
summary_metadata = {
    "file_name": file_name,
    "product_name": product_name,
    "document_id": str(document_id),
    "type": "summary_document",
}
documents.append(
    Document(
        text=get_llm_summary_document(summary_document),
        metadata=dict(summary_metadata),
    )
)
metadata_dict[str(document_id)] = dict(summary_metadata)

{'summary': 'This document is the user manual for the NetSure 732 Subrack Power System, providing contact information and copyright details.', 'table_name': 'None', 'figure_name': 'None'}
{'summary': 'The text emphasizes the importance of understanding and adhering to safety precautions before operating Emerson products.', 'table_name': 'None', 'figure_name': 'None'}
{'summary': 'The text discusses the topic of electrical safety.', 'table_name': 'None', 'figure_name': 'None'}
{'summary': 'The text outlines safety precautions and regulations for handling hazardous voltage in power systems to prevent fatal injuries and accidents.', 'table_name': 'None', 'figure_name': 'None'}
{'summary': 'The text warns against operating high voltage equipment during thunderstorms due to the risk of lightning strikes.', 'table_name': 'None', 'figure_name': 'None'}
{'summary': 'The text emphasizes the importance of wearing an ESD wrist strap to prevent static electricity damage to sensitive electronic com

In [120]:
# Write the collected metadata (chunk entries plus the summary-document entry)
# as indented JSON, keeping non-ASCII characters unescaped.
metadata_json_path = "D:/study/LammaIndex/test_data/NetSure_732_User_Manual.json"
serialized_metadata = json.dumps(metadata_dict, ensure_ascii=False, indent=4)
with open(metadata_json_path, "w", encoding="utf-8") as json_out:
    json_out.write(serialized_metadata)

In [121]:
# Write every chunk under its own "# Chunk N" heading for manual review.
output_file = "D:/study/LammaIndex/test_data/NetSure_732_User_Manual.md"
chunk_sections = [
    f"# Chunk {idx}\n{node.text}\n\n\n\n"
    for idx, node in enumerate(nodes, start=1)
]
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(chunk_sections)

print(f"Đã ghi {len(nodes)} chunk vào {output_file}")

Đã ghi 93 chunk vào D:/study/LammaIndex/test_data/NetSure_732_User_Manual.md


In [122]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.core import Document, VectorStoreIndex, StorageContext, Settings

# Connection settings are read from the environment so that no credential is
# stored in the notebook. QDRANT_URL falls back to the cluster used so far.
# NOTE(review): the API key that used to be hard-coded in this cell should be
# treated as compromised and rotated.
QDRANT_URL = os.getenv(
    "QDRANT_URL",
    "https://a8bcf78f-0147-411f-aa58-079f863fcd6d.us-west-1-0.aws.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
if not QDRANT_API_KEY:
    raise RuntimeError(
        "QDRANT_API_KEY is not set; define it in the environment or in the .env file."
    )

# The sync and async clients share the same connection settings.
qdrant_connection = {"url": QDRANT_URL, "api_key": QDRANT_API_KEY}
client = QdrantClient(**qdrant_connection)
aclient = AsyncQdrantClient(**qdrant_connection)

Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
embedding = OpenAIEmbedding(model="text-embedding-3-small")

# Vector store on the "thong_tin_san_pham" collection with hybrid search
# enabled and the Qdrant/bm25 sparse model.
vector_store = QdrantVectorStore(
    "thong_tin_san_pham",
    client=client,
    aclient=aclient,
    enable_hybrid=True,
    fastembed_sparse_model="Qdrant/bm25",
    batch_size=20,
)

# Index the chunk documents and the summary document in the vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)