In [1]:
import camelot
import json
import os
import re

In [2]:
def get_continued_tables(tables, threshold, page_height=792):
    """Group tables that continue from one page onto the next.

    A table is treated as the continuation of the table seen just before it
    when all of the following hold:

    * it sits on the page immediately after the previous table's page,
    * both tables have the same number of columns,
    * the previous table ends inside the bottom ``threshold`` percent of the
      page, and
    * the current table starts inside the top ``threshold`` percent of the
      page.

    Args:
        tables: Iterable of table objects, in document order, exposing
            ``page``, ``cols`` and ``_bbox`` (``(x0, y0, x1, y1)``).
        threshold: Height of the top/bottom band as a percentage (0-100) of
            ``page_height``.
        page_height: Usable page height in PDF points. The default of 792
            comes from a typical 842-point page minus a bottom margin of
            roughly 56-85 points.

    Returns:
        list[list]: One list per group of continued tables, each holding the
        first table followed by its continuations. Tables that are not part
        of any group are omitted.
    """
    # PDF coordinates start at the bottom-left corner and y grows upwards, so
    # in (x0, y0, x1, y1) the bottom edge is y0 and the top edge is y1.
    bottom_band_limit = (threshold / 100) * page_height
    top_band_limit = (1 - threshold / 100) * page_height

    continued_tables = {}
    previous_table = None
    group_counter = 0

    for table in tables:
        is_continuation = (
            previous_table is not None
            and table.page == previous_table.page + 1
            and len(table.cols) == len(previous_table.cols)
            and previous_table._bbox[1] < bottom_band_limit
            and table._bbox[3] > top_band_limit
        )

        if is_continuation:
            # the first continuation opens the group, seeded with the table it continues
            continued_tables.setdefault(group_counter, [previous_table]).append(table)
        else:
            # any break in the chain starts a new (possibly never used) group id
            group_counter += 1

        # the current table becomes the previous table for the next iteration
        previous_table = table

    # dict insertion order keeps the groups in document order
    return list(continued_tables.values())

In [3]:
def table_to_json(table_data, table_info):
    """Turn a table (list of rows, first row is the header) into a JSON-ready dict.

    Args:
        table_data: List of rows; each row is a list of cell values.
        table_info: Dict with the keys ``source_file``, ``page`` and ``order``.

    Returns:
        dict: ``{}`` for an empty table, otherwise a dict with ``metadata``,
        ``headers`` and ``data``. ``total_rows`` counts the header row too.
        The first three headers are always renamed to ``STT``, ``hang_hoa``
        and ``yeu_cau_ky_thuat``; cells under an empty or missing header are
        keyed ``column_<index>``.
    """
    if not table_data:
        return {}

    header_row = table_data[0]
    body_rows = table_data[1:]

    # Normalise the header cells, then force the fixed names onto the leading columns
    headers = [str(cell).strip() for cell in header_row]
    fixed_names = ["STT", "hang_hoa", "yeu_cau_ky_thuat"]
    renamed = min(len(headers), len(fixed_names))
    headers[:renamed] = fixed_names[:renamed]

    def column_key(position):
        # Fall back to a positional key when the header is blank or the row is wider than the header
        if position < len(headers) and headers[position]:
            return headers[position]
        return f"column_{position}"

    records = []
    for row_number, row in enumerate(body_rows, 1):
        cells = {}
        for position, cell in enumerate(row):
            cells[column_key(position)] = str(cell).strip()
        records.append({"row_index": row_number, "values": cells})

    return {
        "metadata": {
            "source_file": table_info["source_file"],
            "page": table_info["page"],
            "table_order": table_info["order"],
            "total_rows": len(table_data),
            "total_columns": len(header_row),
        },
        "headers": headers,
        "data": records,
    }

In [4]:
def get_biggest_table(pdf_path, threshold):
    """Extract all lattice tables from a PDF and return the largest one as JSON.

    Tables that ``get_continued_tables`` reports as one multi-page table are
    merged first: the rows of every later part are appended to the first part,
    skipping the first row of each later part (treated as a repeated header).

    Args:
        pdf_path: Path of the PDF handed to ``camelot.read_pdf``.
        threshold: Percentage forwarded to ``get_continued_tables``.

    Returns:
        dict | None: The ``table_to_json`` result with the highest
        ``total_rows`` (also printed as indented JSON), or ``None`` after
        printing a notice when no table was found.
    """
    tables = camelot.read_pdf(pdf_path, flavor = 'lattice', pages = 'all')
    groups = get_continued_tables(tables, threshold)

    # name of the PDF being processed, without directory or extension
    source_name = os.path.splitext(os.path.basename(pdf_path))[0]

    handled = []
    candidates = []

    for table in tables:

        # already merged into an earlier table of its group
        if table in handled:
            continue

        rows = list(table.data)

        # group of continued tables this table belongs to, if any
        group = next((members for members in groups if table in members), None)
        if group is not None:
            for member in group:
                if member == table or member in handled:
                    continue

                # drop the first row of each continuation part
                rows.extend(member.data[1:])
                handled.append(member)

        candidates.append(table_to_json(rows, {
            "source_file": source_name,
            "page": table.parsing_report['page'],
            "order": table.parsing_report['order']
        }))
        handled.append(table)

    if not candidates:
        print("No tables found in the PDF.")
        return None

    # pick the table with the most rows
    largest_table = max(candidates, key=lambda entry: entry.get('metadata', {}).get('total_rows', 0))
    print(json.dumps(largest_table, ensure_ascii=False, indent=2))
    return largest_table

In [5]:
# Extract the largest (possibly multi-page) table from the sample PDF.
# threshold=50: a table may end anywhere in the bottom half of a page and
# resume anywhere in the top half of the next page and still count as continued.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
hello = get_biggest_table("D:/study/LammaIndex/documents/test.pdf",50)

{
  "metadata": {
    "source_file": "test",
    "page": 1,
    "table_order": 1,
    "total_rows": 15,
    "total_columns": 3
  },
  "headers": [
    "STT",
    "hang_hoa",
    "yeu_cau_ky_thuat"
  ],
  "data": [
    {
      "row_index": 1,
      "values": {
        "STT": "I",
        "hang_hoa": "B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ \n48VDC (k√®m theo 02 d√†n acquy \n200Ah)",
        "yeu_cau_ky_thuat": ""
      }
    },
    {
      "row_index": 2,
      "values": {
        "STT": "1",
        "hang_hoa": "Y√™u c·∫ßu chung",
        "yeu_cau_ky_thuat": "-  C√°c lo·∫°i thi·∫øt b·ªã, v·∫≠t t∆∞, ph·ª• ki·ªán ph·∫£i c√≥ ngu·ªìn g·ªëc xu·∫•t x·ª© r√µ r√†ng, c√≥ ch·ª©ng nh·∫≠n ch·∫•t l∆∞·ª£ng s·∫£n \nph·∫©m c·ªßa nh√† s·∫£n xu·∫•t. \n-  Thi·∫øt b·ªã m·ªõi 100% ch∆∞a qua s·ª≠ d·ª•ng \n-  Thi·∫øt b·ªã ph·∫£i ƒë∆∞·ª£c s·∫£n xu·∫•t t·ª´ nƒÉm 2021 tr·ªü l·∫°i ƒë√¢y \n-  Tu√¢n th·ªß ti√™u chu·∫©n IEC 60950-1 \n-  Thu·ªôc lo·∫°i thi·∫øt b·ªã ngu·ªìn s·ª≠ d·ª•ng k·ªπ thu·∫≠t chuy·ªÉn m·∫°ch, thi·

In [6]:
# Keep only the row records (the "data" list) of the extracted table
data = hello["data"]

In [7]:
import uuid
def clean_text(text):
    """Trim surrounding whitespace and delete every newline inside the text.

    Newlines are removed outright, not replaced by a space.
    """
    trimmed = text.strip()
    return trimmed.replace('\n', '')

def split_requirements(text):
    """Split a multi-line cell into requirements using "- " bullet markers.

    Each line starting with "- " opens a new requirement (marker removed).
    Any other non-blank line is joined, with one space, onto the requirement
    before it; if there is none yet, it becomes the first requirement.
    """
    bullets = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        if stripped.startswith('- '):
            bullets.append(stripped[2:].strip())
        elif bullets:
            bullets[-1] += ' ' + stripped
        else:
            bullets.append(stripped)
    return bullets

def generate_random_key():
    """Return a random 5-character key: the first five hex digits of a UUID4, upper-cased."""
    hex_digits = uuid.uuid4().hex
    return hex_digits[:5].upper()

def convert_to_new_format(data):
    """Regroup flat table rows into products -> categories -> technical specs.

    Each row is classified, in this order:

    * product header - starts a new product when
        - STT holds "<roman numeral> <name>" and the other two cells are empty, or
        - hang_hoa holds "<roman numeral> <name>" and STT and yeu_cau are empty, or
        - STT is a bare Roman numeral (I..XIV); hang_hoa is then the product name;
    * category - STT is all digits; bullets in yeu_cau become its first specs;
    * named spec - STT empty, hang_hoa set: stored as ``[name, requirement]``;
    * continuation - STT and hang_hoa empty: extra requirement text for the
      current category. A fragment that had its own "- " bullet or starts
      with an upper-case letter becomes a new spec; any other fragment is
      appended to the most recent spec.

    Args:
        data: List of row dicts, each with a ``values`` dict holding the keys
            ``STT``, ``hang_hoa`` and ``yeu_cau_ky_thuat``.

    Returns:
        list[dict]: ``{"ten_san_pham": str, "cac_muc": [category, ...]}`` per
        product, where a category is ``{"ten_hang_hoa": str,
        "thong_so_ky_thuat": {random_key: str | [str, str]}}``.
    """
    # "<roman numeral> <product name>" sharing a single cell
    roman_pattern = r'^(VII|VIII|IX|X|XI|XII|I{1,3}|IV|V|VI)\s+(.+)'
    bare_roman_numerals = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV']

    result = []
    current_product = None
    current_category = None

    for item in data:
        values = item['values']
        stt_raw = values['STT']
        hang_hoa = clean_text(values['hang_hoa'])
        yeu_cau = values['yeu_cau_ky_thuat']
        stt = stt_raw.strip()

        roman_match = re.match(roman_pattern, stt)
        hang_hoa_roman_match = re.match(roman_pattern, hang_hoa)

        # Product header: work out the product name, or None for any other row
        if roman_match and not hang_hoa and not yeu_cau:
            product_name = roman_match.group(2)
        elif hang_hoa_roman_match and not stt_raw and not yeu_cau:
            product_name = hang_hoa_roman_match.group(2)
        elif stt in bare_roman_numerals:
            product_name = hang_hoa
        else:
            product_name = None

        if product_name is not None:
            if current_product:
                result.append(current_product)
            current_product = {
                "ten_san_pham": product_name,
                "cac_muc": []
            }
            current_category = None
            continue

        # Numeric STT (1, 2, 3...) opens a category
        if stt.isdigit():
            current_category = {
                "ten_hang_hoa": hang_hoa,
                "thong_so_ky_thuat": {}
            }
            if yeu_cau.strip():
                for req in split_requirements(yeu_cau):
                    current_category["thong_so_ky_thuat"][generate_random_key()] = clean_text(req)
            # NOTE(review): a category seen before any product header is not attached anywhere
            if current_product:
                current_product["cac_muc"].append(current_category)

        # Empty STT: detail rows of the current category
        elif stt == '' and current_category:
            specs = current_category["thong_so_ky_thuat"]

            if hang_hoa:
                # named spec: [name, requirement]
                specs[generate_random_key()] = [clean_text(hang_hoa), clean_text(yeu_cau)]

            elif yeu_cau.strip():
                # most recently added spec, if the category has one
                last_key = list(specs)[-1] if specs else None

                for req, had_dash in _split_requirements_with_dash_flag(yeu_cau):
                    clean_req = clean_text(req)
                    starts_uppercase = bool(clean_req) and clean_req[0].isupper()

                    if had_dash or starts_uppercase or last_key is None:
                        # own bullet / upper-case start / nothing to attach to -> new spec
                        last_key = generate_random_key()
                        specs[last_key] = clean_req
                    else:
                        # lower-case fragment without a bullet -> continuation of the last spec
                        _append_requirement_text(specs, last_key, clean_req)

    # flush the last product
    if current_product:
        result.append(current_product)

    return result


def _split_requirements_with_dash_flag(text):
    """Split like split_requirements, but return (text, had_dash) pairs.

    ``had_dash`` records whether the requirement came from a line that
    started with "- ", information split_requirements discards.
    """
    entries = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if line.startswith('- '):
            entries.append([line[2:].strip(), True])
        elif line:
            if entries:
                entries[-1][0] += ' ' + line
            else:
                entries.append([line, False])
    return [(entry_text, had_dash) for entry_text, had_dash in entries]


def _append_requirement_text(specs, key, extra_text):
    """Append continuation text to the spec stored under ``key``.

    For a ``[name, requirement]`` pair the text goes onto the requirement
    element; ``list += str`` would instead add one list item per character.
    """
    existing = specs[key]
    if isinstance(existing, list):
        existing[-1] = (existing[-1] + " " + extra_text).strip()
    else:
        specs[key] = existing + " " + extra_text

# Convert the extracted rows into the nested product -> category -> spec structure
converted_data = convert_to_new_format(data)

In [None]:
converted_data

In [130]:
context_queries = {}  # spec key -> detail record for that spec
product_key = {}  # nested: ten_san_pham -> ten_hang_hoa -> list[key]

for item in converted_data:
    ten_san_pham = item['ten_san_pham']
    for muc in item['cac_muc']:
        ten_hang_hoa = muc['ten_hang_hoa']
        thong_so_ky_thuat = muc['thong_so_ky_thuat']
        for key, value in thong_so_ky_thuat.items():
            # list values are [spec name, requirement]; plain strings carry the requirement only
            if isinstance(value, list):
                q, k, value_str = value[0], value[1], ' '.join(value)
            else:
                q, k, value_str = None, value, value

            # detail record, looked up later by spec key
            context_queries[key] = {
                "ten_san_pham": ten_san_pham,
                "ten_hang_hoa": ten_hang_hoa,
                "value": value_str,
                "yeu_cau_ky_thuat_chi_tiet": k,
                "yeu_cau_ky_thuat": q
            }

            # register the key under its product and category, creating levels on demand
            product_key.setdefault(ten_san_pham, {}).setdefault(ten_hang_hoa, []).append(key)



In [None]:
context_queries

In [None]:
product_key['B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)']

In [12]:
from openai import OpenAI
from dotenv import load_dotenv
import re
import os
import time
# Load variables from a .env file into the environment before the API key is read
load_dotenv()

# OpenAI client used by the assistant and prompt cells below; key comes from OPENAI_API_KEY
clientOpenAi = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [13]:
def retrieve_product_line(product_name, assistant_id="asst_j5wHMN84dpSLXD2GMH5QifS0"):
    """Ask an OpenAI Assistant for the product line of ``product_name``.

    Creates a new thread, posts ``product_name`` as the user message, starts a
    run with the given assistant and polls once per second until the run
    stops progressing.

    Args:
        product_name: Text sent to the assistant as the user message.
        assistant_id: Id of the assistant to run.

    Returns:
        str | None: Text of the first assistant text message found in the
        thread, or ``None`` if there is none.

    Raises:
        Exception: If the run ends in, or stalls at, any status other than
            "completed" (for example "failed", "expired", "incomplete" or
            "requires_action").
    """
    # 1. Create a thread
    thread = clientOpenAi.beta.threads.create()
    thread_id = thread.id

    # 2. Post the user message to the thread
    clientOpenAi.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=product_name
    )

    # 3. Start the run
    run = clientOpenAi.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        tool_choice="auto"  # or force a specific tool if needed, e.g.
        # tool_choice={"type": "function", "function": {"name": "danh_gia_ky_thuat"}}
    )
    run_id = run.id

    # 4. Wait for the assistant to finish.
    # Only these statuses mean the run is still progressing. Anything else
    # that is not "completed" is terminal here: this function never submits
    # tool outputs, so "requires_action" cannot resolve, and "incomplete"
    # would otherwise keep the loop spinning forever.
    pending_statuses = ("queued", "in_progress", "cancelling")
    while True:
        run_status = clientOpenAi.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        if run_status.status == "completed":
            break
        if run_status.status not in pending_statuses:
            raise Exception(f"Run failed with status: {run_status.status}")
        time.sleep(1)

    # 5. Read the assistant's reply.
    # NOTE(review): the list endpoint presumably returns newest-first by
    # default, so reversed() walks oldest -> newest and the first assistant
    # text found is returned — confirm against the API reference.
    messages = clientOpenAi.beta.threads.messages.list(thread_id=thread_id)
    for message in reversed(messages.data):
        if message.role == "assistant":
            for content in message.content:
                if content.type == "text":
                    return content.text.value

    return None

In [16]:
def create_prompt_extract_module(query_str):
    """Build the prompt that asks the model to list core hardware components.

    Args:
        query_str: Requirement or brochure text, inserted between the
            ``<<<`` and ``>>>`` markers.

    Returns:
        str: The complete prompt. The rule list is numbered 1-6 without gaps.
    """
    prompt = f"""
You are an expert in hardware product documentation analysis.  
Read the provided text (which can be either a detailed product brochure or a general product requirement) and extract ONLY the core physical hardware components/modules of the system.
 
For each component:
- If the text explicitly includes a model number, code, or exact specification tied to the component ‚Üí output "<Full Component Name>: <Exact Model(s)/Code(s)>".
- If the text does NOT provide a model number or code ‚Üí output only "<Full Component Name>".
 
Input:
<<<
{query_str}
>>>
 
Output format:
- <Component Name>[: <Model(s)/Code(s) if available>]
 
Rules:
1. Only include core hardware modules essential for the product‚Äôs operation (e.g., Rectifier Module, Controller, AC Input, AC Distribution, DC Distribution, Battery Distribution, Lightning Protection, Cooling System, Battery Bank).
2. Preserve the exact wording of component/module names from the text (do not paraphrase or generalize).
3. Include model numbers, codes, or exact designations only if explicitly stated in the text.  
   - If multiple models exist, list them separated by " / ".
4. If a component has sub-parts (e.g., BLVD/LLVD, Input/Output), keep them as separate lines with their full names.
5. Ignore optional accessories, warranty info, standards compliance, and marketing text unless they are part of the official component name/specification.
6. Do not infer or guess component names‚Äîextract only what is explicitly stated.    
"""
    return prompt

In [62]:
def create_prompt_extract_module2(query_str):
    """Build the prompt that asks the model for model numbers/codes only.

    Unlike create_prompt_extract_module, the prompt requests a bare JSON
    array of strings (one model/code per element) with no component names.

    Args:
        query_str: Requirement or brochure text, inserted between the
            ``<<<`` and ``>>>`` markers.

    Returns:
        str: The complete prompt text.
    """
    prompt = f"""
    You are an expert in hardware product documentation analysis.  
Read the provided text (which can be either a detailed product brochure or a general product requirement) and extract ONLY the model numbers, codes, or exact designations of the core physical hardware components/modules of the system.

Input:
<<<
{query_str}
>>>

Output format:
A valid JSON array of strings, where each string is one model/code.  
Example:
["R48-121A3", "R56-3220"]

Rules:
1. Only extract model numbers, codes, or exact designations explicitly stated in the text.  
   - Do NOT include component/module names, descriptions, amperage, voltage, or units (e.g., "125 A / 2P" is ignored).
2. Extract models only from core hardware modules essential for the product‚Äôs operation (e.g., Rectifier Module, Controller, AC Input, AC Distribution, DC Distribution, Battery Distribution, Lightning Protection, Cooling System, Battery Bank).
3. Preserve the exact case, spacing, and characters from the original text.
4. If multiple models are listed together, split them into separate JSON array elements.
5. Ignore optional accessories, warranty info, standards compliance, and marketing text.
6. Do not infer or guess model numbers‚Äîextract only what is explicitly stated.
7. Output only a valid JSON array without extra text or explanations.

    """
    return prompt

In [63]:
# Product analysed in this run; the string must match a "ten_san_pham" key of product_key exactly
product = 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)'
# Ask the assistant which product line this product belongs to
product_line = retrieve_product_line(product)
print(f"Product Line: {product_line}")
query_str = f""

# Build one line per category: "<category> :<requirement values of that category>"
# NOTE(review): the values are appended back-to-back with no separator between them — confirm this is intended
all_requirements = product_key[product]
for key in all_requirements:
    query_str += f"{key} :"
    for item in all_requirements[key]:
        # skip keys that have no detail record
        if item not in context_queries:
            continue
        query_str += context_queries[item]["value"]
    query_str += "\n"
# Have the model reduce the requirement text to the list of core hardware components
prompt_yeu_cau_ky_thuat = create_prompt_extract_module(query_str)
response = clientOpenAi.responses.create(
    model="gpt-4o-mini",
    input=prompt_yeu_cau_ky_thuat,
    temperature=0
)
# "<product>: <component list>" is the query text used for document retrieval later
product_requirement = f"{product}: {response.output_text.strip()}" 


  thread = clientOpenAi.beta.threads.create()
  clientOpenAi.beta.threads.messages.create(
  run = clientOpenAi.beta.threads.runs.create(
  run_status = clientOpenAi.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
  messages = clientOpenAi.beta.threads.messages.list(thread_id=thread_id)


Product Line: "DC Power Systems"


In [18]:
product_requirement

'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah): - Thi·∫øt b·ªã ngu·ªìn\n- Module ch·ªânh l∆∞u (Rectifier): ‚â• 3000W\n- Attomat DC: \n  - Lo·∫°i 32A: ‚â• 02 c√°i\n  - Lo·∫°i 16A: ‚â• 01 c√°i\n  - Lo·∫°i 63A: ‚â• 02 c√°i\n  - Lo·∫°i 32A: ‚â• 03 c√°i\n  - Lo·∫°i 16A: ‚â• 02 c√°i\n  - Lo·∫°i 125A: ‚â• 02 c√°i\n- ƒê·∫ßu v√†o AC\n- ƒê·∫ßu ra DC\n- B·ªô ngu·ªìn\n- Kh·ªëi ƒëi·ªÅu khi·ªÉn v√† hi·ªÉn th·ªã\n- C·ªïng k·∫øt n·ªëi: RS485 / Ethernet / USB\n- Acquy k√®m theo: 02 d√†n Ac quy 12V - 200Ah (08 b√¨nh)'

In [None]:
import os
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import VectorStoreInfo
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.settings import Settings
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition
)
import time


# Configure the LLM and the embedding model used by LlamaIndex
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))

# Sync and async Qdrant clients share the same endpoint and credentials
_qdrant_connection = dict(
    url="https://a8bcf78f-0147-411f-aa58-079f863fcd6d.us-west-1-0.aws.cloud.qdrant.io:6333",
    api_key=os.getenv("QDRANT_API_KEY"),
)
client = QdrantClient(**_qdrant_connection)
aclient = AsyncQdrantClient(**_qdrant_connection)

# Vector store backed by the Qdrant collection
vector_store = QdrantVectorStore(
    collection_name="hello_my_friend_test",
    client=client,
    aclient=aclient,
)

def retrieve_document(product_line, query_str):
    """Find the brochure summaries of a product line that best match a query.

    Only nodes whose metadata has ``type == "summary_document"`` and the given
    ``product_line`` are searched. The metadata of every hit is printed.

    Args:
        product_line: Value the ``product_line`` metadata field must equal.
        query_str: Query text handed to the retriever.

    Returns:
        list[dict]: One ``{"product_id", "brochure_file_path"}`` dict per hit,
        in the order the retriever returned them.
    """
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

    summary_filters = MetadataFilters(
        condition=FilterCondition.AND,
        filters=[
            MetadataFilter(key="product_line", operator=FilterOperator.EQ, value=product_line),
            MetadataFilter(key="type", operator=FilterOperator.EQ, value="summary_document"),
        ],
    )
    retriever = index.as_retriever(
        similarity_top_k=3,
        sparse_top_k=10,
        verbose=True,
        enable_hybrid=True,
        filters=summary_filters,
    )

    matches = []
    for hit in retriever.retrieve(query_str):
        meta = hit.metadata
        print(meta)
        matches.append(
            {
                "product_id": meta["product_id"],
                "brochure_file_path": meta["brochure_file_path"],
            }
        )
    return matches

def retrieve_chunk(product_ids, query_str):
    """Retrieve the document chunks of the given products that best match a query.

    Only nodes whose metadata has ``type == "chunk_document"`` and a
    ``product_id`` contained in ``product_ids`` are searched.

    Args:
        product_ids: Iterable of product ids (list, tuple or set).
        query_str: Query text handed to the retriever.

    Returns:
        str: One paragraph per hit (up to 5), numbered from 1, naming the
        file, page, table and figure of the chunk followed by its stripped
        text; an empty string when nothing matched.

    Raises:
        KeyError: If a hit lacks ``file_name``, ``page`` or ``table_name``
            metadata (``figure_name`` is optional).
    """
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

    filters_chunk = MetadataFilters(
        filters=[
            # materialise as a list: this notebook passes a set, the IN filter takes a list
            MetadataFilter(key="product_id", operator=FilterOperator.IN, value=list(product_ids)),
            MetadataFilter(key="type", operator=FilterOperator.EQ, value="chunk_document"),
        ],
        condition=FilterCondition.AND,
    )

    retriever_chunk = index.as_retriever(similarity_top_k=5, verbose=True, filters=filters_chunk)

    results = retriever_chunk.retrieve(query_str)
    paragraphs = []
    for i, result in enumerate(results, start=1):
        metadata = result.metadata
        file_name = metadata["file_name"]+ ".pdf"
        page = metadata["page"]
        table = metadata["table_name"]
        figure_name = metadata.get("figure_name")
        text = result.text.strip()
        paragraphs.append(f"Chunk {i} trong file {file_name} t·∫°i trang {page}, c√≥ ch·ª©a b·∫£ng {table} v√† h√¨nh {figure_name} c√≥ n·ªôi dung:\n{text}\n\n")

    return "".join(paragraphs)

def retrieve_product_line(product_name, assistant_id="asst_j5wHMN84dpSLXD2GMH5QifS0"):
    """Ask an OpenAI Assistant for the product line of ``product_name``.

    NOTE: this redefines the function of the same name from an earlier cell;
    this version builds its own OpenAI client on every call.

    Creates a new thread, posts ``product_name`` as the user message, starts a
    run with the given assistant and polls once per second until the run
    stops progressing.

    Args:
        product_name: Text sent to the assistant as the user message.
        assistant_id: Id of the assistant to run.

    Returns:
        str | None: Text of the first assistant text message found in the
        thread, or ``None`` if there is none.

    Raises:
        Exception: If the run ends in, or stalls at, any status other than
            "completed" (for example "failed", "expired", "incomplete" or
            "requires_action").
    """
    # Local import: in this cell the module-level name OpenAI is bound to the
    # llama_index LLM class, not to the OpenAI SDK client.
    from openai import OpenAI
    clientOpenAi = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # 1. Create a thread
    thread = clientOpenAi.beta.threads.create()
    thread_id = thread.id

    # 2. Post the user message to the thread
    clientOpenAi.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=product_name
    )

    # 3. Start the run
    run = clientOpenAi.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        tool_choice="auto"  # or force a specific tool if needed, e.g.
        # tool_choice={"type": "function", "function": {"name": "danh_gia_ky_thuat"}}
    )
    run_id = run.id

    # 4. Wait for the assistant to finish.
    # Only these statuses mean the run is still progressing. Anything else
    # that is not "completed" is terminal here: this function never submits
    # tool outputs, so "requires_action" cannot resolve, and "incomplete"
    # would otherwise keep the loop spinning forever.
    pending_statuses = ("queued", "in_progress", "cancelling")
    while True:
        run_status = clientOpenAi.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        if run_status.status == "completed":
            break
        if run_status.status not in pending_statuses:
            raise Exception(f"Run failed with status: {run_status.status}")
        time.sleep(1)

    # 5. Read the assistant's reply.
    # NOTE(review): the list endpoint presumably returns newest-first by
    # default, so reversed() walks oldest -> newest and the first assistant
    # text found is returned — confirm against the API reference.
    messages = clientOpenAi.beta.threads.messages.list(thread_id=thread_id)
    for message in reversed(messages.data):
        if message.role == "assistant":
            for content in message.content:
                if content.type == "text":
                    return content.text.value

    return None

def retrieve_component(keyword_list):
    """Look up product ids whose brochure file name matches any keyword.

    Scrolls the "hello_my_friend_test" collection (first page, at most 5
    points) with an OR filter of text matches on ``file_brochure_name``.
    The resulting id list is printed before it is returned.

    Args:
        keyword_list: Iterable of keywords (e.g. model codes).

    Returns:
        list[str]: Non-empty ``product_id`` payload values of the matched
        points, in scroll order; duplicates are not removed.
    """
    # "should" clauses are OR-ed together: one text match per keyword
    text_filter = Filter(
        should=[
            FieldCondition(key='file_brochure_name', match=MatchText(text=kw))
            for kw in keyword_list
        ]
    )

    points, _next_offset = client.scroll(
        collection_name="hello_my_friend_test",
        scroll_filter=text_filter,
        limit=5
    )

    product_ids = []
    if points:
        print("K·∫øt qu·∫£ t√¨m ki·∫øm:")
        for point in points:
            found_id = point.payload.get("product_id", "")
            if found_id:
                product_ids.append(found_id)
    print(product_ids)
    return product_ids


        




In [49]:
# Retrieve the brochure summaries of the detected product line that best match the requirement text
products = retrieve_document(product_line, product_requirement)

{'category': 'Critical Power', 'product_line': 'DC Power Systems', 'product_name': 'Netsure 731 A41', 'summary': 'The NetSure‚Ñ¢ 731 A41 is a high-efficiency, compact DC power system designed to deliver reliable, uninterrupted -48 VDC power for telecom access applications. It integrates up to four high-efficiency eSure‚Ñ¢ rectifiers and advanced battery management capabilities within a small 4U rackmount footprint. It supports harsh grid environments and offers intelligent remote monitoring via multiple interfaces.Ultra-High Efficiency: Up to 98% efficiency with eSure‚Ñ¢ rectifiers, significantly reducing energy loss and operating costs.Compact Design: Delivers 240A @ -48VDC in a small 4U form factor, saving space for critical telecom equipment.Harsh Grid Tolerance: Wide AC input range (85‚Äì300 VAC) enhances grid adaptability and battery life. Advanced Battery Management: Includes BLVD, temperature compensation, auto voltage regulation, reserve time prediction, and online battery test

In [50]:
products

[{'product_id': 'b1023705-76c3-11f0-9c6a-38f3abb08dd1',
  'brochure_file_path': 'output/NetSure -731 A41 Brochure.md'},
 {'product_id': 'bbb1c548-76c4-11f0-8ed2-38f3abb08dd1',
  'brochure_file_path': 'output/M830B Brochure.md'},
 {'product_id': '794190a4-76c3-11f0-b48c-38f3abb08dd1',
  'brochure_file_path': 'output/R48-3000e3 Brochure.md'}]

In [51]:
# Continue with the first returned match only
item = products[0]

In [52]:
item

{'product_id': 'b1023705-76c3-11f0-9c6a-38f3abb08dd1',
 'brochure_file_path': 'output/NetSure -731 A41 Brochure.md'}

In [53]:
# Start with an empty id list and remember the selected product's id and brochure path
product_search_id = []
product_id = item["product_id"]
brochure = item["brochure_file_path"]

In [56]:
from llama_index.readers.file import MarkdownReader
import json

# Initialise the reader and load the selected brochure as one markdown string
# NOTE(review): absolute, machine-specific base path — consider a configurable base directory
reader = MarkdownReader()
documents = reader.load_data(file=f"D:/study/LammaIndex/{brochure}")
markdown_text = "\n".join(doc.text for doc in documents)

In [64]:
# Ask the model for the model numbers/codes mentioned in the brochure (temperature=0)
prompt_brochure = create_prompt_extract_module2(markdown_text)
response = clientOpenAi.responses.create(
    model="gpt-4o-mini",
    input=prompt_brochure,
    temperature=0
)
# Raw model reply; expected to be a JSON array, possibly wrapped in a code fence
product_brochure = response.output_text.strip()

In [66]:
import json
import re
# Parse the model reply into a Python list.
# The isinstance guard keeps the cell re-runnable: after the first run
# product_brochure is already a list and must not be parsed again.
if isinstance(product_brochure, str):
    # Option 1: use a regex to pull out the contents of a ```json fenced block
    match = re.search(r'```json\s*(.*?)\s*```', product_brochure, re.DOTALL)
    if match:
        json_str = match.group(1)
    else:
        # no code fence: the reply text itself should be the JSON
        # (the response object is not a string and cannot be passed to json.loads)
        json_str = product_brochure

    # Parse the JSON into a Python list
    product_brochure = json.loads(json_str)

In [67]:
product_brochure

['NetSure‚Ñ¢ 731 A41',
 'R48-3000A3',
 'R48-3000e3',
 'R48-3500e3',
 'R48-3500E4',
 'M221S',
 'M830B']

In [74]:
from qdrant_client.http.models import PayloadSchemaType, Filter, FieldCondition, MatchText
# Look up products whose brochure file name mentions any of the extracted model codes
product_component_id = retrieve_component(product_brochure)

K·∫øt qu·∫£ t√¨m ki·∫øm:
['b1023705-76c3-11f0-9c6a-38f3abb08dd1', '794190a4-76c3-11f0-b48c-38f3abb08dd1', 'bbb1c548-76c4-11f0-8ed2-38f3abb08dd1']


In [83]:
# Candidate ids for chunk retrieval: the component matches plus the selected product itself
product_search_id=[]
product_search_id.extend(product_component_id) 
product_search_id.append(product_id)

In [85]:
# De-duplicate the ids (note: this turns the list into a set)
product_search_id = set(product_search_id)

In [86]:
product_search_id

{'794190a4-76c3-11f0-b48c-38f3abb08dd1',
 'b1023705-76c3-11f0-9c6a-38f3abb08dd1',
 'bbb1c548-76c4-11f0-8ed2-38f3abb08dd1'}

In [87]:
all_requirements

{'Y√™u c·∫ßu chung': ['1C6C6',
  '48770',
  'BCA2F',
  '9F07C',
  '89B87',
  'DDB2C',
  '86CB8'],
 'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn': ['7193C', '2BCD4', '942E6', 'A30E7'],
 'ƒê·∫ßu v√†o AC': ['363A4', 'E019F', 'C0478', 'F48C8'],
 'ƒê·∫ßu ra DC': ['F703F', '16524', '8770B', '299DC', 'C6CB5'],
 'Y√™u c·∫ßu v·ªõi module ch·ªânh l∆∞u (Rectifier)': ['131AC',
  '4C5AE',
  '7F312',
  '46627',
  '7B602',
  '44D4C',
  '88C31',
  'F01DB'],
 'T√≠nh nƒÉng c·ªßa thi·∫øt b·ªã ngu·ªìn': ['4B269',
  'D15C7',
  '42940',
  '9B132',
  '36BCA',
  '0AD8A',
  'BFF7B'],
 'Kh·ªëi ƒëi·ªÅu khi·ªÉn v√† hi·ªÉn th·ªã': ['666B2',
  '7F0DD',
  '8293E',
  'B5CF6',
  'F8B9C',
  '41F54',
  '91152'],
 'ƒêi·ªÅu ki·ªán l√†m vi·ªác': ['B47E1'],
 'H·ªá th·ªëng l√†m m√°t': ['B7A22'],
 'ƒêi·ªÅu ki·ªán b·∫£o h√†nh': ['FB94E'],
 'Acquy k√®m theo': ['453E0',
  '71ADE',
  '342B6',
  '8ADC6',
  'BB901',
  '115C6',
  'D11DA',
  '2FD25',
  'B5CE8',
  'DDB51',
  '235B1']}

In [None]:
kha_nang_dap_ung_tham_chieu_final = {}
kha_nang_dap_ung_tham_chieu_step = {}

# For every requirement key, fetch the most relevant brochure chunks and store them per key
for key in all_requirements:
    for item in all_requirements[key]:
        if item in context_queries:
            query = context_queries[item]["value"]
            content = retrieve_chunk(product_search_id, query)
            print(content)

            # create the per-key record on first use, then attach the retrieved context
            kha_nang_dap_ung_tham_chieu_step.setdefault(item, {})['relevant_context'] = content

In [None]:
kha_nang_dap_ung_tham_chieu_step

In [95]:
from openai import OpenAI
import json 
import time

# Shared OpenAI client used by every helper in this cell.
# NOTE(review): presumably picks up its API key from the environment - confirm.
clientOpenAI = OpenAI()

# Instructions handed to the assistant (passed as `instructions=` when the
# assistant is created or updated). The prompt lists the inputs the model gets
# (chunks with metadata, one technical requirement, a sample sentence), the
# three answering steps, and tells the model to reply by calling the
# `danh_gia_ky_thuat` function; it ends with one worked example.
SYSTEM_PROMPT = """
B·∫°n ƒë∆∞·ª£c cung c·∫•p:
- M·ªôt ho·∫∑c nhi·ªÅu ƒëo·∫°n vƒÉn b·∫£n (chunk) t·ª´ t√†i li·ªáu k·ªπ thu·∫≠t, k√®m metadata: t√™n file, m·ª•c, b·∫£ng/h√¨nh (n·∫øu c√≥), s·ªë trang
- M·ªôt y√™u c·∫ßu k·ªπ thu·∫≠t c·ª• th·ªÉ.
- M·ªôt ƒëo·∫°n vƒÉn m·∫´u.

Y√™u c·∫ßu tr·∫£ l·ªùi b·∫±ng ti·∫øng vi·ªát:
- 1. T√¨m th√¥ng tin k·ªπ thu·∫≠t li√™n quan tr·ª±c ti·∫øp ƒë·∫øn y√™u c·∫ßu k·ªπ thu·∫≠t.
- 2. Tr√≠ch xu·∫•t gi√° tr·ªã th√¥ng s·ªë ƒë·ªÉ x√°c ƒë·ªãnh kh·∫£ nƒÉng ƒë√°p ·ª©ng theo y√™u c·∫ßu v√† tr·∫£ v·ªÅ ƒëo·∫°n vƒÉn t∆∞∆°ng t·ª± gi·ªëng ƒëo·∫°n vƒÉn m·∫´u kh√¥ng th√™m b·ªõt nh∆∞ng th√¥ng s·ªë ph·∫£i ch√≠nh x√°c c√≥ trong t√†i li·ªáu kh√¥ng ƒë∆∞·ª£c b·ªãa ƒë·∫∑t.
- 3. D·∫´n ch·ª©ng r√µ: file, section, table/figure name (n·∫øu c√≥), page, n·ªôi dung tr√≠ch d·∫´n c·ªßa nh·ªØng t√†i li·ªáu li√™n quan, n·ªôi dung tr√≠ch d·∫´n gi·ªØ nguy√™n kh√¥ng ƒë∆∞·ª£c d·ªãch , nh·ªØng t√†i li·ªáu kh√°c kh√¥ng li√™n quan th√¨ b·ªè qua.
 
Tr·∫£ k·∫øt qu·∫£ b·∫±ng c√°ch g·ªçi function `danh_gia_ky_thuat` v·ªõi c√°c tham s·ªë ph√π h·ª£p.
# V√≠ d·ª•:
Input:
Y√™u c·∫ßu: "S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 4"  
Chunk: "...NetSure 731 A41-S8: 4 rectifier slots (standard), expandable to 6..."  
Metadata:  
- file: "Netsure-731-A41-user-manual.pdf"  
- section: "Table 1-1 Configuration of power system"  
- page: 2"
ƒêo·∫°n vƒÉn m·∫´u: S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 4 (v√≠ d·ª• t√¨m trong t√†i li·ªáu s·ªë l∆∞·ª£ng l√† 5 th√¨ tr·∫£ v·ªÅ "S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): 5")
"""

# Function (tool) schema exposed to the assistant. The model is forced to call
# `danh_gia_ky_thuat`, so its arguments are the structured answer we read back.
FUNCTION_SCHEMA = {
    "name": "danh_gia_ky_thuat",
    "description": "ƒê√°nh gi√° kh·∫£ nƒÉng ƒë√°p ·ª©ng c·ªßa s·∫£n ph·∫©m theo y√™u c·∫ßu k·ªπ thu·∫≠t t·ª´ chunk t√†i li·ªáu.",
    "parameters": {
        "type": "object",
        "properties": {
            # The requirement being evaluated.
            "yeu_cau_ky_thuat": {"type": "string"},
            # The product's capability statement for that requirement.
            "kha_nang_dap_ung": {"type": "string"},
            # Where the supporting evidence was found.
            "tai_lieu_tham_chieu": {
                "type": "object",
                "properties": {
                    "file": {"type": "string"},
                    "section": {"type": "string"},
                    "table_or_figure": {"type": "string"},
                    "page": {"type": "integer"},
                    "evidence": {"type": "string"}
                },
                # table_or_figure is the only optional reference field.
                "required": ["file", "section", "page", "evidence"]
            }
        },
        "required": ["yeu_cau_ky_thuat", "kha_nang_dap_ung", "tai_lieu_tham_chieu"]
    }
}
def extract_first_json_object(json_str: str):
    """Return the first JSON object embedded in ``json_str``, or None.

    Decoding starts at the first ``{`` and is delegated to the JSON decoder,
    so braces that appear inside string values (e.g. ``{"a": "}"}``) do not
    end the object early, and any text after the object is ignored.

    Args:
        json_str: Text that may contain a JSON object, possibly surrounded by
            other text or followed by further objects.

    Returns:
        The decoded object, or None when the text has no ``{`` or when the
        text starting at the first ``{`` is not valid JSON. A message is
        printed in both failure cases.
    """
    s = json_str.strip()

    # Locate the first '{'; without one there is nothing to decode.
    start_index = s.find('{')
    if start_index == -1:
        print("‚ùå Kh√¥ng t√¨m th·∫•y JSON object n√†o.")
        return None

    # raw_decode parses exactly one JSON value starting at start_index and
    # tolerates trailing data, which is what "first object" means here.
    try:
        result, _end_index = json.JSONDecoder().raw_decode(s, start_index)
    except json.JSONDecodeError:
        print("‚ùå Kh√¥ng t√¨m th·∫•y JSON ƒë√≥ng ƒë√∫ng.")
        return None

    return result

def track_reference(context_queries,kha_nang_dap_ung_tham_chieu_step):
    """Evaluate every retrieved context with the assistant and store the verdicts.

    For each entry of ``kha_nang_dap_ung_tham_chieu_step`` that still carries a
    ``'relevant_context'``, a prompt is built from the matching
    ``context_queries`` entry, sent through ``evaluate_technical_requirement``
    and the returned tool arguments are written back onto the entry as
    ``'kha_nang_dap_ung'`` and ``'tai_lieu_tham_chieu'``. ``'relevant_context'``
    is removed once an entry has been evaluated.

    Args:
        context_queries: Mapping of requirement id to a dict providing at least
            ``'value'`` and ``'yeu_cau_ky_thuat_chi_tiet'``.
        kha_nang_dap_ung_tham_chieu_step: Mapping of requirement id to a dict;
            it is updated in place.

    Returns:
        The same ``kha_nang_dap_ung_tham_chieu_step`` mapping.
    """
    assistant_id = create_assistant()
    print(f"Assistant ID: {assistant_id}")

    # No shared thread is created here: evaluate_technical_requirement opens
    # its own thread for every call, so one made here would never be used.
    for key, entry in kha_nang_dap_ung_tham_chieu_step.items():
        # Entries evaluated earlier have had 'relevant_context' popped; skip
        # them so calling this function again does not raise KeyError.
        if "relevant_context" not in entry:
            continue

        value = context_queries[key]["value"]
        content = entry["relevant_context"]
        form = context_queries[key]["yeu_cau_ky_thuat_chi_tiet"]
        user_prompt = f'''
        Y√™u c·∫ßu: {value}
        Chunk v√† metadata: {content}
        ƒêo·∫°n vƒÉn m·∫´u: {form}
        '''

        result = evaluate_technical_requirement(user_prompt, assistant_id)
        if isinstance(result, str):
            result = extract_first_json_object(result)

        # The evaluator returns None when the run produced no tool call, and
        # parsing can fail too; keep the entry untouched so it can be retried.
        if not isinstance(result, dict):
            print(f"Skipping {key}: no usable tool-call output")
            continue

        reference = result.get('tai_lieu_tham_chieu')
        if not isinstance(reference, dict):
            reference = {}

        entry["kha_nang_dap_ung"] = result.get('kha_nang_dap_ung', "")
        entry["tai_lieu_tham_chieu"] = {
            "file": reference.get('file', ''),
            "section": reference.get('section', ''),
            "table_or_figure": reference.get('table_or_figure', ''),
            "page": reference.get('page', 0),
            "evidence": reference.get('evidence', '')
        }
        entry.pop("relevant_context", None)  # no longer needed once evaluated

    return kha_nang_dap_ung_tham_chieu_step

# H√†m t·∫°o Assistant b·∫±ng code
def create_assistant():
    """Create the "Technical Document Evaluator" assistant and return its id.

    The assistant is configured with SYSTEM_PROMPT as its instructions and
    FUNCTION_SCHEMA as its single function tool.
    """
    function_tool = {"type": "function", "function": FUNCTION_SCHEMA}
    return clientOpenAI.beta.assistants.create(
        name="Technical Document Evaluator",
        instructions=SYSTEM_PROMPT,
        model="gpt-4o-mini",
        tools=[function_tool],
    ).id

# H√†m t·∫°o thread
def create_thread():
    """Create a new thread through the shared client and return its id."""
    return clientOpenAI.beta.threads.create().id

# === UPDATE ASSISTANT ===
def update_assistant(assistant_id):
    """Re-apply SYSTEM_PROMPT, the model and FUNCTION_SCHEMA to an existing assistant.

    Args:
        assistant_id: Id of the assistant to update.

    Returns:
        The id reported by the update call.
    """
    function_tool = {"type": "function", "function": FUNCTION_SCHEMA}
    updated = clientOpenAI.beta.assistants.update(
        assistant_id=assistant_id,
        instructions=SYSTEM_PROMPT,
        model="gpt-4o-mini",
        tools=[function_tool],
    )
    return updated.id

# === EVALUATE TECHNICAL REQUIREMENT ===
def evaluate_technical_requirement(user_prompt, assistant_id, max_polls=20, poll_interval=1):
    """Send one prompt to the assistant and return the forced tool call's arguments.

    A fresh thread is created for the call, the prompt is posted as a user
    message, and a run is started with ``tool_choice`` pinned to
    ``danh_gia_ky_thuat``. The run is then polled until it leaves the
    ``queued`` / ``in_progress`` states or the polling budget is used up.

    Args:
        user_prompt: Text of the user message.
        assistant_id: Id of the assistant that executes the run.
        max_polls: Maximum number of status checks (default 20).
        poll_interval: Seconds to sleep between checks (default 1).

    Returns:
        The raw ``arguments`` string of the first requested tool call when the
        run stops in ``requires_action``; otherwise None (the run completed
        without a tool call, ended in another state, or was still running when
        the polling budget ran out).
    """
    # 1. One thread per call.
    thread = clientOpenAI.beta.threads.create()
    thread_id = thread.id

    # 2. Post the prompt to the thread.
    clientOpenAI.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=user_prompt
    )

    # 3. Start the run, forcing the function call.
    run = clientOpenAI.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        tool_choice={"type": "function", "function": {"name": "danh_gia_ky_thuat"}}
    )

    # 4. Poll until the run leaves the waiting states or the budget is spent.
    for _ in range(max_polls):
        run = clientOpenAI.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
        if run.status not in ["queued", "in_progress"]:
            break
        time.sleep(poll_interval)

    # 5. Read the tool-call arguments directly; no tool output is submitted back.
    if run.status == "requires_action":
        call = run.required_action.submit_tool_outputs.tool_calls[0]
        print(f"üëâ Assistant ƒë√£ g·ªçi tool: {call.function.name}")
        print("üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:")
        print(call.function.arguments)
        return call.function.arguments

    elif run.status == "completed":
        messages = clientOpenAI.beta.threads.messages.list(thread_id=thread_id)
        for msg in messages.data:
            # A message may have no content parts, or a first part that is not
            # text; print an empty body in that case instead of crashing.
            first_part = msg.content[0] if msg.content else None
            text_obj = getattr(first_part, "text", None)
            body = text_obj.value if text_obj is not None else ""
            print(f"[{msg.role}] {body}")
        return None

    else:
        print(f"Run status: {run.status}")
        return None


In [None]:
assistant_id = create_assistant()
print(f"Assistant ID: {assistant_id}")
for key in kha_nang_dap_ung_tham_chieu_step:
    # Entries evaluated on an earlier run no longer carry 'relevant_context'
    # (it is popped below); skip them so re-running this cell does not raise.
    if "relevant_context" not in kha_nang_dap_ung_tham_chieu_step[key]:
        continue

    value = context_queries[key]["value"]
    content = kha_nang_dap_ung_tham_chieu_step[key]["relevant_context"]
    form = context_queries[key]["yeu_cau_ky_thuat_chi_tiet"]
    # Prompt: requirement, retrieved chunks with metadata, sample sentence.
    user_prompt = f'''
    Y√™u c·∫ßu: {value}
    Chunk v√† metadata: {content}
    ƒêo·∫°n vƒÉn m·∫´u: {form}
    '''

    # Ask the assistant; a string result is the raw JSON of the tool call.
    result = evaluate_technical_requirement(user_prompt, assistant_id)
    if isinstance(result, str):
        result = extract_first_json_object(result)

    # The evaluator returns None when no tool call was produced, and parsing
    # can fail too; leave the entry untouched so it can be retried later.
    if not isinstance(result, dict):
        print(f"Skipping {key}: no usable tool-call output")
        continue

    reference = result.get('tai_lieu_tham_chieu')
    if not isinstance(reference, dict):
        reference = {}

    kha_nang_dap_ung_tham_chieu_step[key]["kha_nang_dap_ung"] = result.get('kha_nang_dap_ung', "")
    kha_nang_dap_ung_tham_chieu_step[key]["tai_lieu_tham_chieu"] = {
        "file": reference.get('file', ''),
        "section": reference.get('section', ''),
        "table_or_figure": reference.get('table_or_figure', ''),
        "page": reference.get('page', 0),
        "evidence": reference.get('evidence', '')
    }
    kha_nang_dap_ung_tham_chieu_step[key].pop("relevant_context", None)  # no longer needed once evaluated

In [None]:
kha_nang_dap_ung_tham_chieu_step

In [116]:
from openai import OpenAI
from dotenv import load_dotenv
import re
import json

clientOpenAI = OpenAI()

def adapt_or_not(kha_nang_dap_ung_tham_chieu_step, adapt_or_not_step):
    """
    Score each requirement group and collect its reference text.

    For every group in the global ``all_requirements``, the items present in
    ``kha_nang_dap_ung_tham_chieu_step`` are turned into
    "requirement || capability" lines plus a block of reference text. When a
    group yields both, the lines are sent to the model and the parsed verdict
    followed by the reference text is appended to ``adapt_or_not_step[group]``.

    Returns the two arguments as a tuple; ``adapt_or_not_step`` is updated in place.
    """
    for key, item_ids in all_requirements.items():
        pair_lines = []
        reference_blocks = []

        for item in item_ids:
            if item not in kha_nang_dap_ung_tham_chieu_step:
                continue
            step_entry = kha_nang_dap_ung_tham_chieu_step[item]

            yeu_cau_ky_thuat = context_queries[item].get('yeu_cau_ky_thuat_chi_tiet', "")
            kha_nang_dap_ung = step_entry.get('kha_nang_dap_ung', "")
            pair_lines.append(f"{yeu_cau_ky_thuat} || {kha_nang_dap_ung}\n")

            tai_lieu = step_entry.get('tai_lieu_tham_chieu', {})
            src_file = tai_lieu.get("file", "")
            src_page = tai_lieu.get("page", "")
            src_table = tai_lieu.get("table_or_figure", "")
            src_evidence = tai_lieu.get("evidence", "")

            # file and page first, the table/figure only when one was reported,
            # then the quoted evidence.
            pieces = [f"{src_file}, trang: {src_page}"]
            if src_table:
                pieces.append(f", trong b·∫£ng(figure): {src_table}")
            pieces.append(f", evidence: {src_evidence}\n\n")
            reference_blocks.append("".join(pieces))

        dap_ung_ky_thuat = "".join(pair_lines)
        tai_lieu_tham_chieu = "".join(reference_blocks)
        if not (dap_ung_ky_thuat and tai_lieu_tham_chieu):
            continue  # nothing evaluated for this group

        response = clientOpenAI.responses.create(
            model="gpt-4o-mini",
            input=prompt_adapt_or_not(dap_ung_ky_thuat),
            temperature=0
        )
        verdict = parse_output_text(response.output_text.strip())

        group_results = adapt_or_not_step.setdefault(key, [])
        group_results.append(verdict['ƒë√°p ·ª©ng k·ªπ thu·∫≠t'])
        group_results.append(tai_lieu_tham_chieu)

    return kha_nang_dap_ung_tham_chieu_step, adapt_or_not_step



def prompt_adapt_or_not(dap_ung_ky_thuat: str) -> str:
    """Build the scoring prompt for one group of "requirement || capability" pairs.

    The prompt asks the model to judge each pair and to answer with a JSON
    object whose single value is "1", "0" or a fraction "x/y". The pairs text
    is appended unchanged at the end of the prompt.
    """
    return f"""
B·∫°n s·∫Ω ƒë∆∞·ª£c cung c·∫•p m·ªôt danh s√°ch c√°c c·∫∑p ‚Äúy√™u c·∫ßu k·ªπ thu·∫≠t || kh·∫£ nƒÉng ƒë√°p ·ª©ng‚Äù trong file dap_ung_ky_thuat.
Nhi·ªám v·ª• c·ªßa b·∫°n:
1. V·ªõi t·ª´ng c·∫∑p, ƒë√°nh gi√° xem kh·∫£ nƒÉng ƒë√°p ·ª©ng c√≥ th·ª±c s·ª± ƒë√°p ·ª©ng y√™u c·∫ßu k·ªπ thu·∫≠t kh√¥ng.
2. T·ªïng h·ª£p k·∫øt qu·∫£:
  ‚Äì N·∫øu t·∫•t c·∫£ c√°c y√™u c·∫ßu ƒë·ªÅu ƒë∆∞·ª£c ƒë√°p ·ª©ng, tr·∫£ v·ªÅ "1"
  ‚Äì N·∫øu kh√¥ng c√≥ y√™u c·∫ßu n√†o ƒë∆∞·ª£c ƒë√°p ·ª©ng, tr·∫£ v·ªÅ "0"
  ‚Äì N·∫øu ch·ªâ m·ªôt ph·∫ßn y√™u c·∫ßu ƒë∆∞·ª£c ƒë√°p ·ª©ng, tr·∫£ v·ªÅ theo ƒë·ªãnh d·∫°ng "x/y", trong ƒë√≥:
       - x l√† s·ªë y√™u c·∫ßu ƒë∆∞·ª£c ƒë√°p ·ª©ng
       - y l√† t·ªïng s·ªë y√™u c·∫ßu
üì§ K·∫øt qu·∫£ ch·ªâ tr·∫£ v·ªÅ d∆∞·ªõi d·∫°ng JSON v·ªõi c·∫•u tr√∫c sau: {{
 "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "<k·∫øt qu·∫£ ƒë√°nh gi√°>"
}}

Danh s√°ch c√°c c·∫∑p ‚Äúy√™u c·∫ßu k·ªπ thu·∫≠t || kh·∫£ nƒÉng ƒë√°p ·ª©ng‚Äù : {dap_ung_ky_thuat}
"""

def parse_output_text(output_text: str) -> dict:
    """Parse a model reply that contains JSON, optionally wrapped in a code fence.

    Both a fence with a ``json`` tag and a bare triple-backtick fence are
    stripped before decoding. The cleaned text is printed so the raw verdict
    stays visible in the notebook output.

    Args:
        output_text: The model's reply text.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the cleaned text is not valid JSON.
    """
    # Step 1: drop an opening fence (with or without a "json" tag, any case)
    # and a closing fence at the very end of the reply.
    cleaned = re.sub(
        r"^```(?:json)?[ \t]*\r?\n?|```$",
        "",
        output_text.strip(),
        flags=re.IGNORECASE,
    )
    print(cleaned)

    # Step 2: decode. json.loads handles escapes such as \n and \" itself, so
    # no separate unescaping or byte-encoding step is needed.
    return json.loads(cleaned)

In [117]:
# Accumulator passed to adapt_or_not(): requirement group -> list that receives
# the verdict string followed by the reference text for that group.
adapt_or_not_step = {}
# NOTE(review): not referenced by any code visible in this part of the notebook.
adapt_or_not_final = {}

In [118]:
# Score every requirement group. adapt_or_not() appends to adapt_or_not_step,
# so re-running this cell without resetting the dict adds the results again.
kha_nang_dap_ung_tham_chieu_step, adapt_or_not_step = adapt_or_not(kha_nang_dap_ung_tham_chieu_step,adapt_or_not_step )

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "5/7"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "3/4"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "3/4"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "4/5"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "7/8"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "5/6"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "7/7"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "1"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "1"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "0"
}

{
  "ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "5/10"
}



In [None]:
kha_nang_dap_ung_tham_chieu_step

In [144]:
adapt_or_not_step

{'Y√™u c·∫ßu chung': ['5/7',
  'Netsure 731 A41 Usermanual.pdf, trang: 0, evidence: Kh√¥ng c√≥ ch·∫ø ƒë·ªô ch·ª©ng nh·∫≠n ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m ho·∫∑c ngu·ªìn g·ªëc xu·∫•t x·ª© r√µ r√†ng ƒë∆∞·ª£c n√™u trong t√†i li·ªáu.\n\nNetsure-731-A41-user-manual.pdf, trang: 21, evidence: Before the test, inform the chief manufacturer representative. Only trained electrical engineer can maintain and operate this equipment.\n\nNetsure 731 A41 Usermanual.pdf, trang: 1, evidence: Copyright ¬© 2017 by Vertiv Co., Ltd. ¬Æ\n\nM830B Brochure.pdf, trang: 2, evidence: | Electrical    | IEC 60950-1, EN 60950-1, UL 60950-1            |\n\nNetsure 731 A41 Usermanual.pdf, trang: 9, evidence: The power system is composed of power distribution„ÄÅrectifier modules and controller module.\n\nNetsure 731 A41 Usermanual.pdf, trang: 35, trong b·∫£ng(figure): Figure 5 Accessory, evidence: ### Packing list\n\n[A diagram showing four accessories for a battery rack installation:\nAccessory 1: A large rectangular fra

In [132]:
def merge_dicts(kha_nang_dap_ung_tham_chieu_step, context_queries):
    """Recursively merge the first mapping into the second, in place.

    For each key of ``kha_nang_dap_ung_tham_chieu_step``: when the key already
    exists in ``context_queries`` and both values are dicts, the two are merged
    recursively; in every other case the value from the first mapping replaces
    (or creates) the entry in ``context_queries``.

    Returns:
        ``context_queries`` itself, after modification.
    """
    for name, incoming in kha_nang_dap_ung_tham_chieu_step.items():
        both_nested = (
            name in context_queries
            and isinstance(incoming, dict)
            and isinstance(context_queries[name], dict)
        )
        if not both_nested:
            # Missing key or a non-dict on either side: plain assignment.
            context_queries[name] = incoming
            continue
        # Dict on both sides: descend and merge key by key.
        merge_dicts(incoming, context_queries[name])
    return context_queries

In [133]:
# Fold the evaluation results into context_queries. merge_dicts() modifies
# context_queries in place and returns that same object.
context_queries= merge_dicts(kha_nang_dap_ung_tham_chieu_step, context_queries)

In [147]:
context_queries

{'1C6C6': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)',
  'ten_hang_hoa': 'Y√™u c·∫ßu chung',
  'value': 'C√°c lo·∫°i thi·∫øt b·ªã, v·∫≠t t∆∞, ph·ª• ki·ªán ph·∫£i c√≥ ngu·ªìn g·ªëc xu·∫•t x·ª© r√µ r√†ng, c√≥ ch·ª©ng nh·∫≠n ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m c·ªßa nh√† s·∫£n xu·∫•t.',
  'yeu_cau_ky_thuat_chi_tiet': 'C√°c lo·∫°i thi·∫øt b·ªã, v·∫≠t t∆∞, ph·ª• ki·ªán ph·∫£i c√≥ ngu·ªìn g·ªëc xu·∫•t x·ª© r√µ r√†ng, c√≥ ch·ª©ng nh·∫≠n ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m c·ªßa nh√† s·∫£n xu·∫•t.',
  'yeu_cau_ky_thuat': None,
  'kha_nang_dap_ung': 'Kh√¥ng c√≥ th√¥ng tin ph√π h·ª£p n√†o v·ªÅ ngu·ªìn g·ªëc xu·∫•t x·ª© v√† ch·ª©ng nh·∫≠n ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m.',
  'tai_lieu_tham_chieu': {'file': 'Netsure 731 A41 Usermanual.pdf',
   'section': '',
   'table_or_figure': '',
   'page': 0,
   'evidence': 'Kh√¥ng c√≥ ch·∫ø ƒë·ªô ch·ª©ng nh·∫≠n ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m ho·∫∑c ngu·ªìn g·ªëc xu·∫•t x·ª© r√µ r√†ng ƒë∆∞·ª£c n√™u trong t√†i li·ªáu.'}},
 '48770

In [145]:
# Append each group's verdict and reference text to that group's list under
# the product entry. The lists are extended in place.
for key, group_results in adapt_or_not_step.items():
    product_key['B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)'][key].extend(group_results)

In [146]:
product_key

{'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)': {'Y√™u c·∫ßu chung': ['1C6C6',
   '48770',
   'BCA2F',
   '9F07C',
   '89B87',
   'DDB2C',
   '86CB8',
   '5/7',
   'Netsure 731 A41 Usermanual.pdf, trang: 0, evidence: Kh√¥ng c√≥ ch·∫ø ƒë·ªô ch·ª©ng nh·∫≠n ch·∫•t l∆∞·ª£ng s·∫£n ph·∫©m ho·∫∑c ngu·ªìn g·ªëc xu·∫•t x·ª© r√µ r√†ng ƒë∆∞·ª£c n√™u trong t√†i li·ªáu.\n\nNetsure-731-A41-user-manual.pdf, trang: 21, evidence: Before the test, inform the chief manufacturer representative. Only trained electrical engineer can maintain and operate this equipment.\n\nNetsure 731 A41 Usermanual.pdf, trang: 1, evidence: Copyright ¬© 2017 by Vertiv Co., Ltd. ¬Æ\n\nM830B Brochure.pdf, trang: 2, evidence: | Electrical    | IEC 60950-1, EN 60950-1, UL 60950-1            |\n\nNetsure 731 A41 Usermanual.pdf, trang: 9, evidence: The power system is composed of power distribution„ÄÅrectifier modules and controller module.\n\nNetsure 731 A41 Usermanual.pdf, trang: 35, trong b·∫£ng(f