In [61]:
import camelot
import json
import os
import re

In [62]:
def get_continued_tables(tables, threshold, page_height=792):
    """Group tables that continue from one page onto the next.

    A table is treated as the continuation of the table before it when all of
    the following hold:

    * it sits on the page immediately after the previous table's page,
    * both tables have the same number of columns,
    * the previous table ends within the bottom ``threshold`` percent of the
      page and the current table starts within the top ``threshold`` percent.

    Args:
        tables: iterable of table objects exposing ``page``, ``cols`` and
            ``_bbox`` (read as ``(x0, y0, x1, y1)``; PDF coordinates start at
            the bottom-left corner, so ``y0`` is the bottom edge and ``y1``
            the top edge).
        threshold: size of the top/bottom band, as a percentage of the page
            height (e.g. ``15``).
        page_height: page height in points used for the band computation.
            Defaults to 792, the value that used to be hard-coded; pass the
            real height for other page sizes.

    Returns:
        A list of groups; each group is the list of tables (in reading order)
        that form one multi-page table. Tables that are not continued are not
        included.
    """
    continued_tables = {}
    previous_table = None
    group_counter = 0

    # Band limits are loop-invariant, so compute them once.
    bottom_limit = (threshold / 100) * page_height
    top_limit = (1 - threshold / 100) * page_height

    for table in tables:
        is_continuation = (
            previous_table is not None
            and table.page == previous_table.page + 1
            and len(table.cols) == len(previous_table.cols)
            # previous table reaches into the bottom band of its page ...
            and previous_table._bbox[1] < bottom_limit
            # ... and the current table starts in the top band of its page
            and table._bbox[3] > top_limit
        )

        if is_continuation:
            # The first continuation opens the group with the previous table.
            continued_tables.setdefault(group_counter, [previous_table]).append(table)
        else:
            # Any break in the chain starts a new group number.
            group_counter += 1

        previous_table = table

    return list(continued_tables.values())

In [3]:
def table_to_json(table_data, table_info):
    """Turn raw table rows into a JSON-serialisable dictionary.

    Args:
        table_data: list of rows (each a list of cells); the first row is
            treated as the header row.
        table_info: dict with the keys ``source_file``, ``page`` and ``order``.

    Returns:
        ``{}`` when ``table_data`` is empty, otherwise a dict with
        ``metadata``, ``headers`` and ``data`` (one record per non-header row,
        numbered from 1).
    """
    if not table_data:
        return {}

    header_row = table_data[0]
    headers = [str(cell).strip() for cell in header_row]

    # The first three columns always receive fixed names, whatever the PDF says.
    for position, fixed_name in enumerate(("STT", "hang_hoa", "yeu_cau_ky_thuat")):
        if position < len(headers):
            headers[position] = fixed_name

    def column_key(index):
        # Fall back to a positional name when the header is missing or blank.
        if index < len(headers) and headers[index]:
            return headers[index]
        return f"column_{index}"

    records = [
        {
            "row_index": row_number,
            "values": {
                column_key(column): str(cell).strip()
                for column, cell in enumerate(row)
            },
        }
        for row_number, row in enumerate(table_data[1:], 1)
    ]

    return {
        "metadata": {
            "source_file": table_info["source_file"],
            "page": table_info["page"],
            "table_order": table_info["order"],
            "total_rows": len(table_data),
            "total_columns": len(header_row),
        },
        "headers": headers,
        "data": records,
    }

In [4]:
def get_biggest_table(pdf_path, threshold):
    """Extract every table from a PDF and return the one with the most rows.

    Tables are read with camelot (lattice flavor, all pages). Tables that
    ``get_continued_tables`` reports as one multi-page table are merged: the
    rows of each follow-up table are appended to the first one, minus their
    first (header) row.

    Args:
        pdf_path: path of the PDF file to read.
        threshold: percentage passed through to ``get_continued_tables``.

    Returns:
        The ``table_to_json`` dict of the largest table (also printed as
        indented JSON), or ``None`` when the PDF yields no tables.
    """
    tables = camelot.read_pdf(pdf_path, flavor='lattice', pages='all')
    continued_groups = get_continued_tables(tables, threshold)

    # File name without directory and extension, stored in the metadata.
    source_name = os.path.splitext(os.path.basename(pdf_path))[0]

    handled = []
    table_jsons = []

    for table in tables:
        # Already merged into an earlier table of its group.
        if table in handled:
            continue

        merged_rows = list(table.data)

        # First group (if any) that this table belongs to.
        owning_group = next(
            (group for group in continued_groups if table in group), None
        )
        if owning_group is not None:
            for follower in owning_group:
                if follower == table or follower in handled:
                    continue
                # Drop the repeated header row of the continuation.
                merged_rows.extend(follower.data[1:])
                handled.append(follower)

        table_jsons.append(
            table_to_json(
                merged_rows,
                {
                    "source_file": source_name,
                    "page": table.parsing_report['page'],
                    "order": table.parsing_report['order'],
                },
            )
        )
        handled.append(table)

    if not table_jsons:
        print("No tables found in the PDF.")
        return None

    def row_count(table_json):
        return table_json.get('metadata', {}).get('total_rows', 0)

    largest_table = max(table_jsons, key=row_count)
    print(json.dumps(largest_table, ensure_ascii=False, indent=2))
    return largest_table

In [None]:
# Extract the largest table of the source PDF; 15 is the `threshold` percentage
# forwarded to get_continued_tables.
# NOTE(review): the path is an absolute local path, so this cell only runs on
# the author's machine.
hello = get_biggest_table("D:/study/LammaIndex/documents/Chuong_V_Yeu_cau_ky_thuat.pdf",15)

In [6]:
# Row records of the largest table (the "data" list built by table_to_json).
data = hello["data"]

In [7]:
import uuid
def clean_text(text):
    """Trim the text and delete every run of newline characters.

    Newlines are removed, not replaced, so words on adjacent lines are joined
    without a separating space.
    """
    newline_runs = re.compile(r'\n+')
    trimmed = text.strip()
    return newline_runs.sub('', trimmed)

def split_requirements(text):
    """Split a text block into requirements using "- " bullet markers.

    A line starting with "- " opens a new requirement (marker removed). Any
    other non-empty line is appended, after a space, to the requirement before
    it; if there is none yet it becomes the first requirement. Blank lines are
    ignored.
    """
    requirements = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('- '):
            requirements.append(line[2:].strip())
            continue
        if requirements:
            requirements[-1] += ' ' + line
        else:
            requirements.append(line)
    return requirements

def generate_random_key():
    """Return a random 5-character uppercase hexadecimal key taken from a UUID4."""
    return uuid.uuid4().hex[:5].upper()

def convert_to_new_format(data):
    """Convert flat table rows into a product -> category -> specification tree.

    Args:
        data: list of row records; each record has a ``'values'`` dict with the
            keys ``'STT'``, ``'hang_hoa'`` and ``'yeu_cau_ky_thuat'``.

    Returns:
        A list of products, each shaped as::

            {"ten_san_pham": str,
             "cac_muc": [{"ten_hang_hoa": str,
                          "thong_so_ky_thuat": {key: str | [name, requirement]}}]}

        where ``key`` is a random key from ``generate_random_key``.

    Row classification (first matching rule wins):
        1. STT holds "<roman numeral> <name>" and the other columns are empty:
           a new product named ``<name>``.
        2. The same pattern sits in ``hang_hoa`` while STT and the requirement
           are empty: a new product.
        3. STT is a bare roman numeral: a new product named after ``hang_hoa``.
        4. STT is a number: a new category; its requirement text is split into
           individual specifications.
        5. STT is empty and ``hang_hoa`` is set: a ``[name, requirement]``
           specification in the current category.
        6. STT and ``hang_hoa`` are empty: continuation text for the current
           category.
    """
    roman_pattern = r'^(VII|VIII|IX|X|XI|XII|I{1,3}|IV|V|VI)\s+(.+)'
    bare_roman_numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
                           'IX', 'X', 'XI', 'XII', 'XIII', 'XIV']

    result = []
    current_product = None
    current_category = None

    for item in data:
        values = item['values']
        stt_raw = values['STT']
        hang_hoa = clean_text(values['hang_hoa'])
        yeu_cau = values['yeu_cau_ky_thuat']
        stt = stt_raw.strip()

        roman_match = re.match(roman_pattern, stt)
        hang_hoa_roman_match = re.match(roman_pattern, hang_hoa)

        # --- Rules 1-3: rows that open a new product -------------------------
        if roman_match and not hang_hoa and not yeu_cau:
            new_product_name = roman_match.group(2)
        elif hang_hoa_roman_match and not stt_raw and not yeu_cau:
            new_product_name = hang_hoa_roman_match.group(2)
        elif stt in bare_roman_numerals:
            new_product_name = hang_hoa
        else:
            new_product_name = None

        if new_product_name is not None:
            # Flush the product that was being built before starting a new one.
            if current_product:
                result.append(current_product)
            current_product = {
                "ten_san_pham": new_product_name,
                "cac_muc": []
            }
            current_category = None
            continue

        # --- Rule 4: a numbered row opens a new category ---------------------
        if stt.isdigit():
            current_category = {
                "ten_hang_hoa": hang_hoa,
                "thong_so_ky_thuat": {}
            }
            if yeu_cau.strip():
                for req in split_requirements(yeu_cau):
                    key = generate_random_key()
                    current_category["thong_so_ky_thuat"][key] = clean_text(req)
            if current_product:
                current_product["cac_muc"].append(current_category)
            continue

        # Remaining rules only apply to rows without STT inside a category.
        if stt != '' or not current_category:
            continue

        specs = current_category["thong_so_ky_thuat"]

        # --- Rule 5: named specification -> [name, requirement] --------------
        if hang_hoa:
            key = generate_random_key()
            specs[key] = [clean_text(hang_hoa), clean_text(yeu_cau)]
            continue

        # --- Rule 6: continuation text ---------------------------------------
        if not yeu_cau.strip():
            continue

        # Most recently inserted key, if the category already has entries.
        last_key = list(specs)[-1] if specs else None

        for req in split_requirements(yeu_cau):
            clean_req = clean_text(req)

            # A new entry starts on an uppercase first letter or a dash marker.
            has_dash = req.strip().startswith('- ')
            has_uppercase = clean_req and clean_req[0].isupper()

            if has_uppercase or has_dash or last_key is None or last_key not in specs:
                key = generate_random_key()
                specs[key] = clean_req
                last_key = key
                continue

            # Lowercase start without a dash: the text continues the last entry.
            previous = specs[last_key]
            if isinstance(previous, list):
                # `list += str` would extend the list one character at a time,
                # so append to the requirement text (last element) instead.
                previous[-1] += " " + clean_req
            else:
                specs[last_key] = previous + " " + clean_req

    # Add the last product
    if current_product:
        result.append(current_product)

    return result

# Convert the extracted rows into the product/category structure
converted_data = convert_to_new_format(data)

In [None]:
# Display the converted structure for inspection
converted_data

In [44]:
# Flat lookup: specification key -> details of that specification
context_queries = {}
# Nested index: ten_san_pham -> ten_hang_hoa -> list[key]
product_key = {}

for item in converted_data:
    ten_san_pham = item['ten_san_pham']
    for muc in item['cac_muc']:
        ten_hang_hoa = muc['ten_hang_hoa']
        thong_so_ky_thuat = muc['thong_so_ky_thuat']
        for key, value in thong_so_ky_thuat.items():
            # A list value is a [name, requirement] pair; a plain string is
            # just the requirement text.
            if isinstance(value, list):
                q, k = value[0], value[1]
                value_str = ' '.join(value)
            else:
                q, k, value_str = None, value, value

            # Record the details under the specification key
            context_queries[key] = {
                "ten_san_pham": ten_san_pham,
                "ten_hang_hoa": ten_hang_hoa,
                "value": value_str,
                "yeu_cau_ky_thuat_chi_tiet": k,
                "yeu_cau_ky_thuat": q
            }

            # Register the key under its product and category
            product_key.setdefault(ten_san_pham, {}).setdefault(ten_hang_hoa, []).append(key)



In [None]:
# Display the key -> details lookup for inspection
context_queries

In [None]:
# Display the product -> category -> keys index for inspection
product_key

In [None]:
# Save under the default file name.
# The dict written here is `context_queries`; the name `context_prompts` is
# not defined anywhere in this notebook and raised NameError on a fresh kernel.
with open('context_prompts.json', 'w', encoding='utf-8') as f:
    json.dump(context_queries, f, ensure_ascii=False, indent=2)

print(f"‚úÖ ƒê√£ l∆∞u {len(context_queries)} context prompts v√†o file: context_prompts.json")

In [47]:
def prompt_create_query(context_prompt):
    """Build the LLM prompt that turns a terse specification into a query.

    The template is a fixed instruction text with three Input/Output examples;
    ``context_prompt`` is interpolated verbatim between the ``---`` markers
    near the end.

    Args:
        context_prompt: the specification text to be rewritten as a query.

    Returns:
        The complete prompt string.
    """
    prompt =  f"""
    B·∫°n s·∫Ω nh·∫≠n ƒë∆∞·ª£c m·ªôt ƒëo·∫°n m√¥ t·∫£ k·ªπ thu·∫≠t ng·∫Øn g·ªçn (context_prompt), th∆∞·ªùng l√† c√°c m·∫£nh th√¥ng tin k·ªπ thu·∫≠t r·ªùi r·∫°c. Nhi·ªám v·ª• c·ªßa b·∫°n l√† chuy·ªÉn ƒë·ªïi ƒëo·∫°n m√¥ t·∫£ ƒë√≥ th√†nh m·ªôt truy v·∫•n (query) ho√†n ch·ªânh b·∫±ng ng√¥n ng·ªØ t·ª± nhi√™n, v·ªõi m·ª•c ti√™u:

- Di·ªÖn ƒë·∫°t l·∫°i ƒëo·∫°n m√¥ t·∫£ d∆∞·ªõi d·∫°ng m·ªôt c√¢u h·ªèi ho·∫∑c c√¢u truy v·∫•n r√µ r√†ng, d·ªÖ hi·ªÉu.
- Gi·ªØ nguy√™n ƒë·∫ßy ƒë·ªß t·∫•t c·∫£ c√°c th√¥ng tin k·ªπ thu·∫≠t c√≥ trong c√¢u g·ªëc (ch·∫ø ƒë·ªô, d√≤ng ƒëi·ªán, ƒëi·ªán √°p, b∆∞·ªõc c√¥ng su·∫•t,...).
- Kh√¥ng ƒë∆∞·ª£c lo·∫°i b·ªè ho·∫∑c l√†m m·ªù b·∫•t k·ª≥ chi ti·∫øt k·ªπ thu·∫≠t n√†o.
- Truy v·∫•n c·∫ßn ph√π h·ª£p ƒë·ªÉ t√¨m ki·∫øm th√¥ng tin t∆∞∆°ng ƒë·ªìng v·ªÅ m·∫∑t ng·ªØ nghƒ©a trong m·ªôt h·ªá th·ªëng truy xu·∫•t d·ªØ li·ªáu k·ªπ thu·∫≠t.

D∆∞·ªõi ƒë√¢y l√† c√°c v√≠ d·ª•:

**Input:** "T·∫£i gi·∫£ x·∫£ acquy Th√¥ng s·ªë k·ªπ thu·∫≠t D√≤ng x·∫£ l·ªõn nh·∫•t v·ªõi d·∫£i ƒëi·ªán √°p 220V DC √∑ 240V DC ‚Äì t∆∞∆°ng ·ª©ng v·ªõi 1 b·ªô ‚â• 70 A"

**Output:** "D√≤ng x·∫£ l·ªõn nh·∫•t c·ªßa t·∫£i gi·∫£ x·∫£ acquy l√† bao nhi√™u khi d·∫£i ƒëi·ªán √°p n·∫±m trong kho·∫£ng 220V DC ƒë·∫øn 240V DC, v√† t∆∞∆°ng ·ª©ng v·ªõi 1 b·ªô th√¨ c√≥ ƒë·∫°t d√≤ng x·∫£ ‚â• 70A kh√¥ng?"

---
**Input:** "T·∫£i gi·∫£ x·∫£ acquy Th√¥ng s·ªë k·ªπ thu·∫≠t C√°c ch·∫ø ƒë·ªô x·∫£ - G·ªìm 4 ch·∫ø ƒë·ªô: d√≤ng kh√¥ng ƒë·ªïi, c√¥ng su·∫•t kh√¥ng ƒë·ªïi, theo ƒë·∫∑c t√≠nh d√≤ng cho tr∆∞·ªõc, ƒëi·ªÅu ch·ªânh th·ªß c√¥ng."

**Output:** "T·∫£i gi·∫£ x·∫£ acquy h·ªó tr·ª£ nh·ªØng ch·∫ø ƒë·ªô x·∫£ n√†o? C√≥ ph·∫£i g·ªìm d√≤ng kh√¥ng ƒë·ªïi, c√¥ng su·∫•t kh√¥ng ƒë·ªïi, theo ƒë·∫∑c t√≠nh d√≤ng ƒë·ªãnh tr∆∞·ªõc, v√† ƒëi·ªÅu ch·ªânh th·ªß c√¥ng kh√¥ng?"

---
**Input:** "T·∫£i gi·∫£ x·∫£ acquy Th√¥ng s·ªë k·ªπ thu·∫≠t B∆∞·ªõc c√¥ng su·∫•t ƒëi·ªÅu ch·ªânh x·∫£ t·∫£i 100 W"

**Output:** "B∆∞·ªõc ƒëi·ªÅu ch·ªânh c√¥ng su·∫•t x·∫£ t·∫£i nh·ªè nh·∫•t c·ªßa t·∫£i gi·∫£ x·∫£ acquy l√† bao nhi√™u? C√≥ ph·∫£i l√† 100W kh√¥ng?"

---
Tr·∫£ l·ªùi t·ª´ng `context_prompt` theo c·∫•u tr√∫c nh∆∞ tr√™n.


    D·ªÆ LI·ªÜU ƒê·∫¶U V√ÄO:
    ---
    {context_prompt}
    ---

    CH·ªà C·∫¶N TR·∫¢ V·ªÄ C√ÇU TRUY V·∫§N.
    """
    return prompt

In [48]:
from openai import OpenAI
from dotenv import load_dotenv
import re
import os
# Load environment variables via python-dotenv (presumably from a local .env file)
load_dotenv()

# OpenAI SDK client; the API key is read from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Trial slice: the five entries at positions 70-74 of context_queries
test = dict(list(context_queries.items())[70:75])
test

In [82]:
# Ask the LLM to rewrite every trial entry as a natural-language query and
# store the answer under "query".
# The loop variable is `entry`, not `data`, so the row list kept in `data`
# is not overwritten and the conversion cell stays re-runnable.
for key, entry in test.items():
    prompt = prompt_create_query(
        f"{entry['ten_san_pham']} {entry['ten_hang_hoa']} {entry['value']}"
    )
    response = client.responses.create(
        model="gpt-4o-mini",
        input=prompt,
        temperature=0
    )
    entry["query"] = response.output_text.strip()

In [None]:
# Display the trial entries (now with queries) next to the product index
test, product_key

In [84]:
import os
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import VectorStoreInfo
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.settings import Settings
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition
)

# Configure the LLM and the embedding model used by LlamaIndex
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-4o-mini")

# Qdrant connection settings, defined once and shared by both clients
QDRANT_URL = "https://a8bcf78f-0147-411f-aa58-079f863fcd6d.us-west-1-0.aws.cloud.qdrant.io:6333"
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

# Qdrant clients. The sync client has its own name so that the global
# `client` (the OpenAI SDK client used by the assistant helpers) is not
# rebound to a Qdrant object.
qdrant_sync_client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)
aclient = AsyncQdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)

# Initialise the vector store
vector_store = QdrantVectorStore(
    collection_name="thong_tin_san_pham",
    client=qdrant_sync_client,
    aclient=aclient,
)

def retrieve_document(query_str):
    """Return the file names of the summary documents closest to a query.

    Searches the shared ``vector_store`` for the top 5 nodes whose ``type``
    metadata equals ``"summary_document"`` and returns the ``file_name``
    metadata of each hit, in retrieval order.
    """
    summary_filter = MetadataFilters(
        filters=[
            MetadataFilter(key="type", operator=FilterOperator.EQ, value="summary_document"),
        ],
        condition=FilterCondition.AND,
    )

    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    retriever = index.as_retriever(similarity_top_k=5, verbose=True, filters=summary_filter)

    return [hit.metadata["file_name"] for hit in retriever.retrieve(query_str)]

def retrieve_chunk(file_names, query_str):
    """Retrieve the best-matching chunks from the given files as one text block.

    Searches the shared ``vector_store`` for the top 5 nodes whose ``type``
    metadata is ``"chunk_document"`` and whose ``file_name`` is one of
    ``file_names``. Each hit is rendered as a numbered section containing its
    file name (with ``.pdf`` appended), page, table name, figure name and text.

    Args:
        file_names: file names to restrict the search to.
        query_str: the query text.

    Returns:
        The concatenated sections as a single string (empty when nothing matches).
    """
    chunk_filter = MetadataFilters(
        filters=[
            MetadataFilter(key="file_name", operator=FilterOperator.IN, value=file_names),
            MetadataFilter(key="type", operator=FilterOperator.EQ, value="chunk_document"),
        ],
        condition=FilterCondition.AND,
    )

    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    retriever = index.as_retriever(similarity_top_k=5, verbose=True, filters=chunk_filter)

    sections = []
    for position, hit in enumerate(retriever.retrieve(query_str), start=1):
        meta = hit.metadata
        file_name = meta["file_name"]+ ".pdf"
        page = meta["page"]
        table = meta["table_name"]
        # figure_name is optional in the metadata; the others are required
        figure_name = meta.get("figure_name")
        text = hit.text.strip()
        sections.append(
            f"Chunk {position} trong file {file_name} t·∫°i trang {page}, c√≥ ch·ª©a b·∫£ng {table} v√† h√¨nh {figure_name} c√≥ n·ªôi dung:\n{text}\n\n"
        )

    return "".join(sections)


        




In [85]:
# Retrieve supporting chunks for every trial entry, product by product.
for product, categories in product_key.items():
    # Keys of this product that belong to the trial slice
    pending = [item for keys in categories.values() for item in keys if item in test]
    if not pending:
        continue

    # Candidate documents are looked up per product; without this line the
    # loop depended on a `file_names` left behind by another cell.
    file_names = retrieve_document(product)

    for item in pending:
        test[item]["content"] = retrieve_chunk(file_names, test[item]["query"])
            

In [86]:
# Retrieve supporting chunks for every trial entry, product by product.
# product_key[product] maps category name -> list of keys, so the keys have to
# be collected from the lists; iterating the dict itself only yields category
# names, which never appear in `test`.
for product, categories in product_key.items():
    pending = [item for keys in categories.values() for item in keys if item in test]
    if not pending:
        continue

    # Candidate documents for this product
    file_names = retrieve_document(product)

    for item in pending:
        test[item]["content"] = retrieve_chunk(file_names, test[item]["query"])

In [87]:
# List the keys of the trial entries
for key in test:
    print(key)

5F1A0
97D75
49234
59016
3CD8F


In [88]:
# Instructions given to the evaluator assistant (passed as `instructions` when
# the assistant is created or updated). The text describes the inputs, the
# expected answer, the output fields and one worked example.
SYSTEM_PROMPT = """
B·∫°n ƒë∆∞·ª£c cung c·∫•p:
- M·ªôt ho·∫∑c nhi·ªÅu ƒëo·∫°n vƒÉn b·∫£n (chunk) t·ª´ t√†i li·ªáu k·ªπ thu·∫≠t, k√®m metadata: t√™n file, m·ª•c, b·∫£ng/h√¨nh (n·∫øu c√≥), s·ªë trang
- M·ªôt y√™u c·∫ßu k·ªπ thu·∫≠t c·ª• th·ªÉ.
- M·ªôt ƒëo·∫°n vƒÉn m·∫´u.

#Y√™u c·∫ßu tr·∫£ l·ªùi b·∫±ng ti·∫øng vi·ªát:
# 1. T√¨m th√¥ng tin k·ªπ thu·∫≠t li√™n quan tr·ª±c ti·∫øp ƒë·∫øn y√™u c·∫ßu k·ªπ thu·∫≠t.
# 2. Tr√≠ch xu·∫•t gi√° tr·ªã th√¥ng s·ªë ƒë·ªÉ x√°c ƒë·ªãnh kh·∫£ nƒÉng ƒë√°p ·ª©ng theo y√™u c·∫ßu v√† tr·∫£ v·ªÅ ƒëo·∫°n vƒÉn t∆∞∆°ng t·ª± gi·ªëng ƒëo·∫°n vƒÉn m·∫´u kh√¥ng th√™m b·ªõt nh∆∞ng th√¥ng s·ªë ph·∫£i ch√≠nh x√°c trong t√†i li·ªáu.
# 3. D·∫´n ch·ª©ng r√µ: file, section, table/figure name (n·∫øu c√≥), page, n·ªôi dung tr√≠ch d·∫´n c·ªßa nh·ªØng t√†i li·ªáu li√™n quan, nh·ªØng t√†i li·ªáu kh√°c kh√¥ng li√™n quan th√¨ b·ªè qua.

# #Output: JSON g·ªìm c√°c tr∆∞·ªùng:
- yeu_cau_ky_thuat
- kha_nang_dap_ung
- tai_lieu_tham_chieu" 

# V√≠ d·ª•:
Input:
Y√™u c·∫ßu: "S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 4"  
Chunk: "...NetSure 731 A41-S8: 4 rectifier slots (standard), expandable to 6..."  
Metadata:  
- file: "Netsure-731-A41-user-manual.pdf"  
- section: "Table 1-1 Configuration of power system"  
- page: 2"
ƒêo·∫°n vƒÉn m·∫´u: S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 4 (v√≠ d·ª• t√¨m trong t√†i li·ªáu s·ªë l∆∞·ª£ng l√† 5 th√¨ tr·∫£ v·ªÅ "S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): 5")
"""

# Define the function schema: the JSON shape the assistant must fill in when
# it calls the `danh_gia_ky_thuat` tool.
FUNCTION_SCHEMA = {
    "name": "danh_gia_ky_thuat",
    "description": "ƒê√°nh gi√° kh·∫£ nƒÉng ƒë√°p ·ª©ng c·ªßa s·∫£n ph·∫©m theo y√™u c·∫ßu k·ªπ thu·∫≠t t·ª´ chunk t√†i li·ªáu.",
    "parameters": {
        "type": "object",
        "properties": {
            "yeu_cau_ky_thuat": {"type": "string"},
            "kha_nang_dap_ung": {"type": "string"},
            # Source reference backing the evaluation
            "tai_lieu_tham_chieu": {
                "type": "object",
                "properties": {
                    "file": {"type": "string"},
                    "section": {"type": "string"},
                    "table_or_figure": {"type": "string"},
                    "page": {"type": "integer"},
                    "evidence": {"type": "string"}
                },
                # table_or_figure is the only optional reference field
                "required": ["file", "section", "page", "evidence"]
            }
        },
        "required": ["yeu_cau_ky_thuat", "kha_nang_dap_ung", "tai_lieu_tham_chieu"]
    }
}

In [89]:
# === CREATE ASSISTANT ===
def create_assistant():
    """Create the "Technical Document Evaluator" assistant and return its id.

    The assistant uses ``SYSTEM_PROMPT`` as instructions, the ``gpt-4o-mini``
    model and a single function tool described by ``FUNCTION_SCHEMA``.
    """
    evaluator_tool = {"type": "function", "function": FUNCTION_SCHEMA}
    return client.beta.assistants.create(
        name="Technical Document Evaluator",
        instructions=SYSTEM_PROMPT,
        model="gpt-4o-mini",
        tools=[evaluator_tool],
    ).id

# === CREATE THREAD ===
def create_thread():
    """Create a new, empty assistant thread and return its id."""
    return client.beta.threads.create().id

# === UPDATE ASSISTANT ===
def update_assistant(assistant_id):
    """Re-apply the current instructions, model and tool to an existing assistant.

    Args:
        assistant_id: id of the assistant to update.

    Returns:
        The id of the updated assistant.
    """
    evaluator_tool = {"type": "function", "function": FUNCTION_SCHEMA}
    return client.beta.assistants.update(
        assistant_id=assistant_id,
        instructions=SYSTEM_PROMPT,
        model="gpt-4o-mini",
        tools=[evaluator_tool],
    ).id

# === EVALUATE TECHNICAL REQUIREMENT ===
def evaluate_technical_requirement(user_prompt, assistant_id, max_polls=20, poll_interval=1):
    """Run one evaluation through the assistant and return the tool-call arguments.

    A fresh thread is created for every call, the prompt is posted as a user
    message, and a run is started that forces the ``danh_gia_ky_thuat`` tool.
    The run is then polled until it leaves the ``queued`` / ``in_progress``
    states or the polling budget is used up.

    Args:
        user_prompt: message text sent to the assistant.
        assistant_id: id of the assistant to run.
        max_polls: maximum number of status checks (default 20).
        poll_interval: seconds to sleep between checks (default 1), so the
            defaults wait roughly 20 seconds at most.

    Returns:
        The JSON string with the tool-call arguments when the run stops in
        ``requires_action``; ``None`` in every other case (the thread messages
        or the run status are printed instead).
    """
    # Imported locally so the helper works no matter which notebook cells
    # have already been executed.
    import time

    # 1. Create a separate thread for each call
    thread_id = client.beta.threads.create().id

    # 2. Send the message to the thread
    client.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=user_prompt
    )

    # 3. Create the run, forcing the evaluation tool
    run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        tool_choice={"type": "function", "function": {"name": "danh_gia_ky_thuat"}}
    )

    # 4. Wait for the assistant (bounded by max_polls * poll_interval seconds)
    for _ in range(max_polls):
        run = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
        if run.status not in ["queued", "in_progress"]:
            break
        time.sleep(poll_interval)

    # 5. Read the arguments directly from the pending tool call
    if run.status == "requires_action":
        call = run.required_action.submit_tool_outputs.tool_calls[0]
        print(f"üëâ Assistant ƒë√£ g·ªçi tool: {call.function.name}")
        print("üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:")
        print(call.function.arguments)
        return call.function.arguments

    if run.status == "completed":
        # The assistant answered in plain text instead of calling the tool.
        messages = client.beta.threads.messages.list(thread_id=thread_id)
        for msg in messages.data:
            print(f"[{msg.role}] {msg.content[0].text.value}")
        return None

    # Still running after the polling budget, or failed/cancelled/expired.
    print(f"Run status: {run.status}")
    return None



In [90]:
# Display the trial entries, now including the retrieved content
test

{'5F1A0': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)',
  'ten_hang_hoa': 'Y√™u c·∫ßu chung',
  'value': 'Th·ªùi gian b·∫£o h√†nh: theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu 12 th√°ng.',
  'yeu_cau_ky_thuat_chi_tiet': 'Th·ªùi gian b·∫£o h√†nh: theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu 12 th√°ng.',
  'yeu_cau_ky_thuat': None,
  'query': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/48VDC k√®m theo 02 d√†n acquy 200Ah c√≥ th·ªùi gian b·∫£o h√†nh l√† bao l√¢u? C√≥ ph·∫£i theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu l√† 12 th√°ng kh√¥ng?',
  'content': 'Chunk 1 trong file NetSure_732_User_Manual.pdf t·∫°i trang 29, c√≥ ch·ª©a b·∫£ng Technical and Engineering Data for NetSure 732 Subrack Power System v√† h√¨nh None c√≥ n·ªôi dung:\nIt can withstand five times of simulated lightning surge current of 20Ka at 8/20Œºs, for the positive and negative polarities respectively. The test interval is not smaller than 1

In [91]:
import openai
from openai import OpenAI
import json
# OpenAI SDK client used by the assistant helpers; no api_key is passed, so the
# SDK resolves credentials itself (presumably from OPENAI_API_KEY — confirm).
client = OpenAI()

In [92]:
# Example usage
import time
assistant_id = create_assistant()
print(f"Assistant ID: {assistant_id}")


# Evaluate every trial entry that has retrieved content.
# The loop variable is `entry`, not `data`, so the row list kept in `data`
# is not overwritten.

for key, entry in test.items():
    content = entry.get('content')
    if not content:
        continue
    value = entry['value']
    form = entry['yeu_cau_ky_thuat_chi_tiet']

    user_prompt = f'''
    Chunk v√† metadata: {content}
    Y√™u c·∫ßu: {value}
    ƒêo·∫°n vƒÉn m·∫´u: {form}
    '''

    # Each evaluation runs in its own thread
    entry['response'] = evaluate_technical_requirement(user_prompt, assistant_id)




Assistant ID: asst_sRiLQPBCsGOii4EAPGi49Nml


  thread = client.beta.threads.create()
  client.beta.threads.messages.create(
  run = client.beta.threads.runs.create(
  run = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)


üëâ Assistant ƒë√£ g·ªçi tool: danh_gia_ky_thuat
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"yeu_cau_ky_thuat":"Th·ªùi gian b·∫£o h√†nh: theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu 12 th√°ng.","kha_nang_dap_ung":"Th·ªùi gian b·∫£o h√†nh c·ªßa s·∫£n ph·∫©m l√† theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t.","tai_lieu_tham_chieu":{"file":"NetSure_732_User_Manual.pdf","section":"Appendix 1  Technical And Engineering Data","table_or_figure":"None","page":29,"evidence":"Th·ªùi gian b·∫£o h√†nh c·ªßa s·∫£n ph·∫©m l√† theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t."}}
üëâ Assistant ƒë√£ g·ªçi tool: danh_gia_ky_thuat
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"yeu_cau_ky_thuat":"S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 4","kha_nang_dap_ung":"S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): 4","tai_lieu_tham_chieu":{"file":"NetSure_732_User_Manual.pdf","section":"Table 1-1 Configuration of power system","page":8,"evidence":"Maximum configur

In [93]:
# Unpack the assistant's JSON answer into flat fields on each trial entry.
for key, entry in test.items():
    entry.pop("content", None)

    # Entries without an answer (never evaluated, failed run, or already
    # unpacked by an earlier execution of this cell) are skipped.
    response = entry.get("response")
    if response is None:
        entry.pop("response", None)
        continue

    if isinstance(response, str):
        response = json.loads(response)

    reference = response['tai_lieu_tham_chieu']
    entry["kha_nang_dap_ung"] = response['kha_nang_dap_ung']
    entry["tai_lieu_tham_chieu"] = {
        "file": reference['file'],
        "section": reference.get('section', ''),
        "table_or_figure": reference.get('table_or_figure', ''),
        "page": reference.get('page', 0),
        "evidence": reference.get('evidence', '')
    }

    # Drop the raw answer only after it has been unpacked successfully.
    entry.pop("response", None)

In [96]:
# Retrieve supporting chunks for every trial entry, product by product.
for product, categories in product_key.items():
    # Keys of this product that belong to the trial slice
    pending = [item for keys in categories.values() for item in keys if item in test]
    if not pending:
        continue

    # Look the candidate documents up for this product; reusing a
    # `file_names` value left over from another cell would search the
    # documents of whichever product happened to be processed last.
    file_names = retrieve_document(product)

    for item in pending:
        test[item]["content"] = retrieve_chunk(file_names, test[item]["query"])

({'T·∫£i gi·∫£ x·∫£ acquy': {'Y√™u c·∫ßu chung': ['9F506', '656FA', '665C0', '146FA'],
   'Th√¥ng s·ªë k·ªπ thu·∫≠t': ['C315E',
    'C8261',
    'F46D0',
    '2A2FB',
    'C3465',
    '4A8FB',
    '96B0B',
    '54BC9',
    '8699B',
    '83051',
    'F2306',
    '329B9',
    'E7263',
    '6E7E6',
    '76366',
    'DC048',
    'F830C',
    '7B53A',
    '4A38E',
    '98209',
    '8C8F1',
    '7DD5E',
    '888BC',
    '8481C',
    '5B7E9',
    '2394F',
    '44934',
    '5F7A6',
    'BA835',
    '649E2',
    'BF81A',
    '2D1BC',
    'EEE2A',
    'A57D8',
    '18E7C',
    '97A68',
    '85D16',
    '8D24F',
    'D108B',
    '7453C']},
  'ƒê·ªìng h·ªì ƒëo n·ªôi tr·ªü acquy': {'Y√™u c·∫ßu chung': ['C0D49', '5E94A', 'BA0A9'],
   'C√°c t√≠nh nƒÉng': ['359DC', '98335', 'EA16D'],
   'ƒê·∫∑c t√≠nh k·ªπ thu·∫≠t': ['C0973', '1BD01', '7F6DD'],
   'Th√¢n m√°y': ['AFE62', '72955', '6A67B', 'D999D'],
   'Pin': ['2DF59', '6F170'],
   'Ph·ª• ki·ªán': ['CC377', '7E288', '24351', '32B8A', '550A3']},
  'B·ªô 

In [122]:
def prompt_adapt_or_not(dap_ung_ky_thuat: str, tai_lieu_tham_chieu: str) -> str:
    """Build the LLM prompt that judges a category's requirement/response pairs.

    Args:
        dap_ung_ky_thuat: text inserted as the input list; the prompt describes
            it as one "requirement || response" pair per line.
        tai_lieu_tham_chieu: text inserted as the list of reference documents.

    Returns:
        The prompt string. It asks for a JSON object with two fields: an
        overall verdict and a de-duplicated list of reference documents.
    """
    prompt = f"""
T√¥i c√≥ m·ªôt danh s√°ch c√°c th√¥ng s·ªë k·ªπ thu·∫≠t v√† kh·∫£ nƒÉng ƒë√°p ·ª©ng t∆∞∆°ng ·ª©ng, m·ªói d√≤ng ƒë∆∞·ª£c ngƒÉn c√°ch b·∫±ng '||' theo ƒë·ªãnh d·∫°ng:

    y√™u c·∫ßu k·ªπ thu·∫≠t || kh·∫£ nƒÉng ƒë√°p ·ª©ng

D∆∞·ªõi ƒë√¢y l√† danh s√°ch ƒë·∫ßu v√†o:
{dap_ung_ky_thuat}

Ngo√†i ra, t√¥i c≈©ng c√≥ c√°c t√†i li·ªáu tham chi·∫øu nh∆∞ sau:
{tai_lieu_tham_chieu}

Y√™u c·∫ßu c·ªßa b·∫°n l√†:
1. Ki·ªÉm tra xem kh·∫£ nƒÉng ƒë√°p ·ª©ng c√≥ th·ªèa m√£n t·ª´ng y√™u c·∫ßu k·ªπ thu·∫≠t hay kh√¥ng.
2. N·∫øu t·∫•t c·∫£ c√°c y√™u c·∫ßu ƒë·ªÅu ƒë∆∞·ª£c ƒë√°p ·ª©ng, tr·∫£ v·ªÅ "ƒë√°p ·ª©ng", ng∆∞·ª£c l·∫°i tr·∫£ v·ªÅ "kh√¥ng ƒë√°p ·ª©ng".
3. G·ªôp l·∫°i c√°c t√†i li·ªáu tham chi·∫øu b·ªã tr√πng ƒë·ªÉ tr√°nh l·∫∑p l·∫°i(v√≠ d·ª• nh∆∞ t√™n file, trang, b·∫£ng).

Ch·ªâ tr·∫£ v·ªÅ k·∫øt qu·∫£ theo ƒë·ªãnh d·∫°ng JSON nh∆∞ sau:

{{
    "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "ƒê√°p ·ª©ng" ho·∫∑c "Kh√¥ng ƒë√°p ·ª©ng",
    "t√†i li·ªáu": "danh s√°ch t√†i li·ªáu ƒë√£ r√∫t g·ªçn vi·∫øt d∆∞·ªõi d·∫°ng txt"
}}
"""
    return prompt

In [104]:
from openai import OpenAI
from dotenv import load_dotenv
import re
import os
# Load environment variables via python-dotenv (presumably from a local .env file)
load_dotenv()

# OpenAI SDK client; the API key is read from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [119]:
def parse_output_text(output_text: str) -> dict:
    """Extract the JSON payload from a model reply and decode it.

    The reply may be bare JSON, JSON wrapped in a Markdown code fence (with or
    without a language tag such as ``json``/``JSON``, with LF or CRLF line
    endings), or JSON surrounded by a little explanatory prose.

    Args:
        output_text: Raw text returned by the model.

    Returns:
        The decoded JSON value (a dict for the prompts used in this notebook).

    Raises:
        json.JSONDecodeError: If no decodable JSON can be found in the reply.
    """
    text = output_text.strip()

    # Step 1: drop a surrounding Markdown fence. The opening fence may carry a
    # language tag, but the tag is only consumed when it is followed by a line
    # break, so payload text directly after the backticks is never eaten.
    text = re.sub(r"^```(?:[A-Za-z0-9_+-]*[ \t]*\r?\n)?", "", text)
    text = re.sub(r"```$", "", text).strip()

    # Step 2: decode. json.loads already handles escapes such as \n and \",
    # so no separate un-escaping (or byte encoding) step is needed.
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Step 3: the model added text around the object; retry on the span
        # from the first "{" to the last "}". Re-raise the original error when
        # there is no such span.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])

In [120]:
# Ask the model for one compliance verdict per (product, requirement group) and
# store the answer back into `product_key`.
# NOTE(review): `results` is created here but never written to in this cell.
results = []
for product in product_key:
    items = product_key[product]
    for key in items:
        # Prompt inputs for this group: one "requirement || response" line and
        # one reference line per item ID.
        dap_ung_ky_thuat = ""
        tai_lieu_tham_chieu = ""
        for item in items[key]:
            # IDs without an entry in `test` contribute nothing to the prompt.
            if item not in test:
                continue
            yeu_cau_ky_thuat = test[item]['yeu_cau_ky_thuat_chi_tiet']
            kha_nang_dap_ung = test[item]['kha_nang_dap_ung']
            dap_ung_ky_thuat += f"{yeu_cau_ky_thuat} || {kha_nang_dap_ung}\n"
            tai_lieu = test[item]['tai_lieu_tham_chieu']
            tai_lieu_tham_chieu += f"file {tai_lieu['file']},trang: {tai_lieu['page']}, trong b·∫£ng(figure):{tai_lieu['table_or_figure']},evidence :{tai_lieu['evidence']} \n"
        # Both strings are non-empty only when at least one ID of the group was
        # found in `test`; groups with no known ID are not sent to the model.
        if dap_ung_ky_thuat != "" and tai_lieu_tham_chieu != "":
            prompt = prompt_adapt_or_not(dap_ung_ky_thuat, tai_lieu_tham_chieu)
            response = client.responses.create(
                model="gpt-4o-mini",
                input=prompt,
                temperature=0
            )
            # `output_text` is first the raw reply string, then the parsed dict.
            output_text = response.output_text.strip()
            # NOTE(review): nothing here catches a parse failure or a reply
            # whose keys differ from the two looked up below (KeyError).
            output_text = parse_output_text(output_text)
            print(output_text)
            # The verdict and the merged references are appended to the same
            # list that holds the item IDs, so they become its last two entries.
            # NOTE(review): re-running this cell appends another pair to each
            # evaluated group — looks non-idempotent; confirm before re-running.
            product_key[product][key].append(output_text['ƒë√°p ·ª©ng k·ªπ thu·∫≠t'])
            product_key[product][key].append(output_text['t√†i li·ªáu'])

{'ƒë√°p ·ª©ng k·ªπ thu·∫≠t': 'ƒë√°p ·ª©ng', 't√†i li·ªáu': 'file NetSure_732_User_Manual.pdf, trang: 29'}
{'ƒë√°p ·ª©ng k·ªπ thu·∫≠t': 'ƒë√°p ·ª©ng', 't√†i li·ªáu': 'file NetSure_732_User_Manual.pdf, trang: 8, trong b·∫£ng(figure):, evidence: Maximum configuration: 4 pieces; file Converter_Brochure.pdf, trang: 2, trong b·∫£ng(figure): None, evidence: Output Power: 3000W Maximum; file NetSure_732_User_Manual.pdf, trang: 8, trong b·∫£ng(figure): Table 1-1, evidence: DC distribution | PL: | 32A/1P√ó2;16A/1P√ó2 MCB | NPL: 63A/1P√ó2;32A/1P√ó4;16A/1P√ó2 MCB'}


In [121]:
# Display the current contents of `product_key` as the cell output.
product_key 

{'T·∫£i gi·∫£ x·∫£ acquy': {'Y√™u c·∫ßu chung': ['9F506', '656FA', '665C0', '146FA'],
  'Th√¥ng s·ªë k·ªπ thu·∫≠t': ['C315E',
   'C8261',
   'F46D0',
   '2A2FB',
   'C3465',
   '4A8FB',
   '96B0B',
   '54BC9',
   '8699B',
   '83051',
   'F2306',
   '329B9',
   'E7263',
   '6E7E6',
   '76366',
   'DC048',
   'F830C',
   '7B53A',
   '4A38E',
   '98209',
   '8C8F1',
   '7DD5E',
   '888BC',
   '8481C',
   '5B7E9',
   '2394F',
   '44934',
   '5F7A6',
   'BA835',
   '649E2',
   'BF81A',
   '2D1BC',
   'EEE2A',
   'A57D8',
   '18E7C',
   '97A68',
   '85D16',
   '8D24F',
   'D108B',
   '7453C']},
 'ƒê·ªìng h·ªì ƒëo n·ªôi tr·ªü acquy': {'Y√™u c·∫ßu chung': ['C0D49', '5E94A', 'BA0A9'],
  'C√°c t√≠nh nƒÉng': ['359DC', '98335', 'EA16D'],
  'ƒê·∫∑c t√≠nh k·ªπ thu·∫≠t': ['C0973', '1BD01', '7F6DD'],
  'Th√¢n m√°y': ['AFE62', '72955', '6A67B', 'D999D'],
  'Pin': ['2DF59', '6F170'],
  'Ph·ª• ki·ªán': ['CC377', '7E288', '24351', '32B8A', '550A3']},
 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo

In [123]:
# Display the `test` mapping (item ID -> record) as the cell output.
test

{'5F1A0': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)',
  'ten_hang_hoa': 'Y√™u c·∫ßu chung',
  'value': 'Th·ªùi gian b·∫£o h√†nh: theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu 12 th√°ng.',
  'yeu_cau_ky_thuat_chi_tiet': 'Th·ªùi gian b·∫£o h√†nh: theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu 12 th√°ng.',
  'yeu_cau_ky_thuat': None,
  'query': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/48VDC k√®m theo 02 d√†n acquy 200Ah c√≥ th·ªùi gian b·∫£o h√†nh l√† bao l√¢u? C√≥ ph·∫£i theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu l√† 12 th√°ng kh√¥ng?',
  'kha_nang_dap_ung': 'Th·ªùi gian b·∫£o h√†nh c·ªßa s·∫£n ph·∫©m l√† theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t.',
  'tai_lieu_tham_chieu': {'file': 'NetSure_732_User_Manual.pdf',
   'section': 'Appendix 1  Technical And Engineering Data',
   'table_or_figure': 'None',
   'page': 29,
   'evidence': 'Th·ªùi gian b·∫£o h√†nh c·ªßa s·∫£n ph·∫©m l√† theo ti√™u chu·∫©n c·ªßa 

In [124]:
# Hand-written sample: product name -> requirement group -> list whose last two
# entries are the verdict and the reference text, preceded by the item IDs.
product_key_test = {
    'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)': {
        'Y√™u c·∫ßu chung': [
            '5F1A0',
            'ƒë√°p ·ª©ng',
            'file NetSure_732_User_Manual.pdf, trang: 29',
        ],
        'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn': [
            '97D75',
            '49234',
            '59016',
            'ƒë√°p ·ª©ng',
            # One string, split across lines only for readability.
            (
                'file NetSure_732_User_Manual.pdf, trang: 8, trong b·∫£ng(figure):, evidence: Maximum configuration: 4 pieces; '
                'file Converter_Brochure.pdf, trang: 2, trong b·∫£ng(figure): None, evidence: Output Power: 3000W Maximum; '
                'file NetSure_732_User_Manual.pdf, trang: 8, trong b·∫£ng(figure): Table 1-1, evidence: DC distribution | PL: | 32A/1P√ó2;16A/1P√ó2 MCB | NPL: 63A/1P√ó2;32A/1P√ó4;16A/1P√ó2 MCB'
            ),
        ],
    },
}

In [125]:
# Display `product_key_test` as the cell output.
product_key_test

{'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)': {'Y√™u c·∫ßu chung': ['5F1A0',
   'ƒë√°p ·ª©ng',
   'file NetSure_732_User_Manual.pdf, trang: 29'],
  'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn': ['97D75',
   '49234',
   '59016',
   'ƒë√°p ·ª©ng',
   'file NetSure_732_User_Manual.pdf, trang: 8, trong b·∫£ng(figure):, evidence: Maximum configuration: 4 pieces; file Converter_Brochure.pdf, trang: 2, trong b·∫£ng(figure): None, evidence: Output Power: 3000W Maximum; file NetSure_732_User_Manual.pdf, trang: 8, trong b·∫£ng(figure): Table 1-1, evidence: DC distribution | PL: | 32A/1P√ó2;16A/1P√ó2 MCB | NPL: 63A/1P√ó2;32A/1P√ó4;16A/1P√ó2 MCB']}}

In [126]:
# Display the `test` mapping (item ID -> record) as the cell output.
test

{'5F1A0': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)',
  'ten_hang_hoa': 'Y√™u c·∫ßu chung',
  'value': 'Th·ªùi gian b·∫£o h√†nh: theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu 12 th√°ng.',
  'yeu_cau_ky_thuat_chi_tiet': 'Th·ªùi gian b·∫£o h√†nh: theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu 12 th√°ng.',
  'yeu_cau_ky_thuat': None,
  'query': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/48VDC k√®m theo 02 d√†n acquy 200Ah c√≥ th·ªùi gian b·∫£o h√†nh l√† bao l√¢u? C√≥ ph·∫£i theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t, t·ªëi thi·ªÉu l√† 12 th√°ng kh√¥ng?',
  'kha_nang_dap_ung': 'Th·ªùi gian b·∫£o h√†nh c·ªßa s·∫£n ph·∫©m l√† theo ti√™u chu·∫©n c·ªßa nh√† s·∫£n xu·∫•t.',
  'tai_lieu_tham_chieu': {'file': 'NetSure_732_User_Manual.pdf',
   'section': 'Appendix 1  Technical And Engineering Data',
   'table_or_figure': 'None',
   'page': 29,
   'evidence': 'Th·ªùi gian b·∫£o h√†nh c·ªßa s·∫£n ph·∫©m l√† theo ti√™u chu·∫©n c·ªßa 

In [127]:
!pip install python-docx



In [128]:
from docx import Document
from docx.shared import Pt
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL
# === Build the Word document ===
def _style_cell(cell, alignment, size, bold=False):
    """Set a cell's vertical alignment and the font size (optionally bold) of all its runs."""
    cell.vertical_alignment = alignment
    for paragraph in cell.paragraphs:
        for run in paragraph.runs:
            if bold:
                run.font.bold = True
            run.font.size = size


doc = Document()
doc.add_heading("B·∫¢NG TUY√äN B·ªê ƒê√ÅP ·ª®NG V·ªÄ K·ª∏ THU·∫¨T", level=1)

# Six-column grid table, centred; its single initial row carries the headers.
table = doc.add_table(rows=1, cols=6)
table.style = 'Table Grid'
table.alignment = WD_TABLE_ALIGNMENT.CENTER

# Column captions, left to right.
headers = [
    "H·∫°ng m·ª•c s·ªë",
    "T√™n h√†ng ho√°",
    "Th√¥ng s·ªë k·ªπ thu·∫≠t v√† c√°c ti√™u chu·∫©n c·ªßa h√†ng ho√° trong E-HSMT",
    "Th√¥ng s·ªë k·ªπ thu·∫≠t v√† c√°c ti√™u chu·∫©n c·ªßa h√†ng ho√° ch√†o trong E-HSDT",
    "H·ªì s∆° tham chi·∫øu",
    "T√¨nh ƒë√°p ·ª©ng c·ªßa h√†ng ho√°",
]

# Header row: bold 10pt text, vertically centred.
header_cells = table.rows[0].cells
for i, text in enumerate(headers):
    header_cells[i].text = text
    _style_cell(header_cells[i], WD_ALIGN_VERTICAL.CENTER, Pt(10), bold=True)

# One table row per requirement group; numbering restarts at 1 for each product.
for product, hang_hoa_dict in product_key_test.items():
    for idx, (ten_hang_hoa, items) in enumerate(hang_hoa_dict.items(), start=1):
        ho_so = items[-1]    # reference documents (last entry)
        dap_ung = items[-2]  # verdict, e.g. "ƒë√°p ·ª©ng" (second-to-last entry)
        ma_ids = items[:-2]  # everything before those two: the item IDs

        # Collect the requirement / response texts of every ID known to `test`
        # as "- ..." bullet lines; unknown IDs are skipped.
        known_ids = [ma for ma in ma_ids if ma in test]
        eh_smt = "".join(
            f"- {test[ma]['yeu_cau_ky_thuat_chi_tiet']}\n" for ma in known_ids
        )
        eh_hsdt = "".join(
            f"- {test[ma]['kha_nang_dap_ung']}\n" for ma in known_ids
        )

        # Append the row and fill its six cells (9pt text, top-aligned).
        row = table.add_row().cells
        values = (
            str(idx),
            ten_hang_hoa,
            eh_smt.strip(),
            eh_hsdt.strip(),
            ho_so,
            dap_ung,
        )
        for cell, value in zip(row, values):
            cell.text = value
            _style_cell(cell, WD_ALIGN_VERTICAL.TOP, Pt(9))

# Save the file
doc.save("bang_tuyen_bo_dap_ung.docx")