In [1]:
import camelot
import json
import os
import re

In [2]:
def get_continued_tables(tables, threshold):

    continued_tables = {}
    previous_table = False
    group_counter = 0

    # typical height of a pdf is 842 points and bottom margins are anywhere between 56 and 85 points
    # therefore, accounting for margins, 792
    page_height = 792

    # iterate over the tables
    for i, table in enumerate(tables):

        # if a previous table exists (remember, we start with this as false)
        # and the previous table was on the previous page
        # and the number of columns of both tables is the same
        if previous_table and table.page == previous_table.page + 1 and len(table.cols) == len(previous_table.cols):

            # get the bottom coordinate of the previous table
            # note that for pdfs the origin (0, 0) typically starts from the bottom-left corner of the page,
            # with the y-coordinate increasing as you move upwards
            # this is why for {x0, y0, x1, y1} we need the y0 as the bottom
            previous_table_bottom = previous_table._bbox[1]

            # get the top coordinate of the current table
            # for {x0, y0, x1, y1} we need the y1 as the top
            current_table_top = table._bbox[3]

            # if the previous table ends in the last 15% of the page and the current table starts in the first 15% of the page
            if previous_table_bottom < (threshold / 100) * page_height and current_table_top > (1 - threshold / 100) * page_height:

                # if we don't have started this group of tables
                if (continued_tables.get(group_counter) is None):

                    # start by adding the first table
                    continued_tables[group_counter] = [previous_table]

                # add any of the sunsequent tables to the group
                continued_tables[group_counter].append(table)

            # if this is not a continuation of the previous table
            else:

                # increment the group number
                group_counter += 1;

        # if this is not a continuation of the previous table
        else:

            # increment the group number
            group_counter += 1;

        # the current table becomes the previous table for the next iteration
        previous_table = table

    # transform the dictionary into an array of arrays
    continued_tables = [value for value in continued_tables.values()]

    # return the combined tables
    return continued_tables

In [3]:
def table_to_json(table_data, table_info):
    """Convert table data to JSON format"""
    if not table_data:
        return {}
    
    # Create JSON structure
    json_data = {
        "metadata": {
            "source_file": table_info["source_file"],
            "page": table_info["page"],
            "table_order": table_info["order"],
            "total_rows": len(table_data),
            "total_columns": len(table_data[0]) if table_data else 0
        },
        "headers": [],
        "data": []
    }
    
    # Add headers (first row)
    if len(table_data) > 0:
        headers = [str(cell).strip() for cell in table_data[0]]
        
        # Replace first 3 headers with fixed names
        if len(headers) >= 1:
            headers[0] = "STT"
        if len(headers) >= 2:
            headers[1] = "hang_hoa"
        if len(headers) >= 3:
            headers[2] = "yeu_cau_ky_thuat"
            
        json_data["headers"] = headers
        
        # Add data rows (skip header)
        for i, row in enumerate(table_data[1:], 1):
            row_dict = {}
            for j, cell in enumerate(row):
                # Use header as key, fallback to column index if header is empty
                key = json_data["headers"][j] if j < len(json_data["headers"]) and json_data["headers"][j] else f"column_{j}"
                row_dict[key] = str(cell).strip()
            
            json_data["data"].append({
                "row_index": i,
                "values": row_dict
            })
    
    return json_data

In [4]:
def get_biggest_table(pdf_path, threshold):
    tables = camelot.read_pdf(pdf_path, flavor = 'lattice', pages = 'all')
    continued_tables = get_continued_tables(tables, threshold)

    # get the name of the PDF file we are processing (without the extension)
    pdf_file_name = os.path.splitext(os.path.basename(pdf_path))[0]

    processed = []
    all_table_jsons = []

    # iterate over found tables
    for i, table in enumerate(tables):

        # if table was already processed as part of a group
        if table in processed: continue

        # check if the current table is a continued table
        is_continued = any(table in sublist for sublist in continued_tables)

        # collect all table data (current table + continued tables if any)
        all_table_data = list(table.data)

        # if the current table is a continued table, append all subsequent continued tables data
        if is_continued:

            # get the index of the group in "continued_tables" associated with the current table
            group_index = next(index for index, sublist in enumerate(continued_tables) if table in sublist)

            # iterate over the tables in said group and append their data
            for continued_table in continued_tables[group_index]:

                # skip the current table as it's already added
                if continued_table == table or continued_table in processed: continue

                # append the data of the continued table (skip header for subsequent tables)
                all_table_data.extend(continued_table.data[1:] if len(continued_table.data) > 1 else [])

                # keep track of processed tables
                processed.append(continued_table)

        # convert to JSON
        table_info = {
            "source_file": pdf_file_name,
            "page": table.parsing_report['page'],
            "order": table.parsing_report['order']
        }
        
        json_data = table_to_json(all_table_data, table_info)
        all_table_jsons.append(json_data)
        
        # mark current table as processed
        processed.append(table)

    # find the table with the most rows
    if all_table_jsons:
        largest_table = max(all_table_jsons, key=lambda x: x.get('metadata', {}).get('total_rows', 0))
        
        # return the JSON of the largest table
        print(json.dumps(largest_table, ensure_ascii=False, indent=2))
        return largest_table
    else:
        print("No tables found in the PDF.")
        return None

In [16]:
hello = get_biggest_table("D:/study/LammaIndex/documents/test1.pdf",50)

{
  "metadata": {
    "source_file": "test1",
    "page": 1,
    "table_order": 1,
    "total_rows": 16,
    "total_columns": 3
  },
  "headers": [
    "STT",
    "hang_hoa",
    "yeu_cau_ky_thuat"
  ],
  "data": [
    {
      "row_index": 1,
      "values": {
        "STT": "IV",
        "hang_hoa": "B·ªô chuy·ªÉn ƒë·ªïi \nngu·ªìn AC/DC \nlo·∫°i nh·ªè (1U)",
        "yeu_cau_ky_thuat": "-  C√°c lo·∫°i thi·∫øt b·ªã, v·∫≠t t∆∞, ph·ª• ki·ªán \nph·∫£i c√≥ ngu·ªìn g·ªëc xu·∫•t x·ª© r√µ r√†ng, \nc√≥  ch·ª©ng  nh·∫≠n  ch·∫•t \nl∆∞·ª£ng  s·∫£n \nph·∫©m c·ªßa nh√† s·∫£n xu·∫•t. \n-  Thi·∫øt b·ªã m·ªõi 100% ch∆∞a qua s·ª≠ \nd·ª•ng \n-  Thi·∫øt  b·ªã  ph·∫£i  ƒë∆∞·ª£c  s·∫£n  xu·∫•t  t·ª´ \nnƒÉm 2021 tr·ªü l·∫°i ƒë√¢y \n-  Thu·ªôc \nlo·∫°i \nthi·∫øt  b·ªã  ngu·ªìn  s·ª≠ \nd·ª•ng k·ªπ thu·∫≠t chuy·ªÉn m·∫°ch, thi·∫øt \nk·∫ø  d·∫°ng  Module  theo  ch·ª©c  nƒÉng \nc·ªßa t·ª´ng kh·ªëi. \n-  T·∫•t c·∫£ c√°c kh·ªëi ƒë·∫∑t trong t·ªß li√™n \nho√†n, ƒë·ªìng b·ªô c·ªßa h√£ng, t·∫°o th√†nh \nthi·∫øt \nb·ª

In [17]:
data = hello["data"]

In [18]:
import uuid
def clean_text(text):
    """L√†m s·∫°ch text, lo·∫°i b·ªè k√Ω t·ª± xu·ªëng d√≤ng th·ª´a"""
    return re.sub(r'\n+', '', text.strip())

def split_requirements(text):
    """T√°ch c√°c y√™u c·∫ßu d·ª±a tr√™n d·∫•u g·∫°ch ƒë·∫ßu d√≤ng"""
    requirements = []
    lines = text.split('\n')
    for line in lines:
        line = line.strip()
        if line.startswith('- '):
            requirements.append(line[2:].strip())
        elif line and not any(line.startswith(prefix) for prefix in ['- ']):
            if requirements:
                requirements[-1] += ' ' + line
            else:
                requirements.append(line)
    return requirements

def generate_random_key():
    """T·∫°o key random 5 k√Ω t·ª± t·ª´ UUID"""
    return str(uuid.uuid4()).replace('-', '')[:5].upper()

def convert_to_new_format(data):
    result = []
    current_product = None
    current_category = None
    
    for item in data:
        values = item['values']
        stt_raw  = values['STT']
        hang_hoa = clean_text(values['hang_hoa'])
        yeu_cau = values['yeu_cau_ky_thuat']


        stt = stt_raw.strip()

        roman_pattern = r'^(VII|VIII|IX|X|XI|XII|I{1,3}|IV|V|VI)\s+(.+)'
        roman_match = re.match(roman_pattern, stt)
        # N·∫øu STT l√† s·ªë La M√£ (I, II, III...) th√¨ ƒë√¢y l√† t√™n s·∫£n ph·∫©m
        hang_hoa_roman_match = re.match(roman_pattern, hang_hoa)
        if roman_match and not hang_hoa and not yeu_cau:
            if current_product:
                result.append(current_product)
            
            roman_num = roman_match.group(1)  # S·ªë La M√£
            product_name = roman_match.group(2)  # T√™n s·∫£n ph·∫©m
            
            current_product = {
                "ten_san_pham": product_name,
                "cac_muc": []
            }
            current_category = None
        elif hang_hoa_roman_match and not stt_raw and not yeu_cau:
            if current_product:
                result.append(current_product)
            
            roman_num = hang_hoa_roman_match.group(1)  # S·ªë La M√£
            product_name = hang_hoa_roman_match.group(2)  # T√™n s·∫£n ph·∫©m
            
            current_product = {
                "ten_san_pham": product_name,
                "cac_muc": []
            }
            current_category = None        
        
        elif stt in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV']:
            if current_product:
                result.append(current_product)
            
            current_product = {
                "ten_san_pham": hang_hoa,
                "cac_muc": []
            }
            current_category = None
            
        # N·∫øu STT l√† s·ªë (1, 2, 3...) th√¨ ƒë√¢y l√† danh m·ª•c
        elif stt.isdigit():
            current_category = {
                "ten_hang_hoa": hang_hoa,
                "thong_so_ky_thuat": {}
            }
            
            # X·ª≠ l√Ω y√™u c·∫ßu k·ªπ thu·∫≠t cho danh m·ª•c
            if yeu_cau.strip():
                requirements = split_requirements(yeu_cau)
                for req in requirements:
                    key = generate_random_key()  # T·∫°o key random
                    current_category["thong_so_ky_thuat"][key] = clean_text(req)
            if current_product:
                current_product["cac_muc"].append(current_category)
                
        # N·∫øu STT tr·ªëng th√¨ ƒë√¢y l√† th√¥ng s·ªë k·ªπ thu·∫≠t chi ti·∫øt
        elif stt == '' and current_category and hang_hoa:
            # T·∫°o key random cho th√¥ng s·ªë k·ªπ thu·∫≠t
            key = generate_random_key()
            
            # L√†m s·∫°ch t√™n h√†ng h√≥a v√† y√™u c·∫ßu k·ªπ thu·∫≠t
            clean_hang_hoa = clean_text(hang_hoa)
            clean_yeu_cau = clean_text(yeu_cau)
            
            current_category["thong_so_ky_thuat"][key] = [clean_hang_hoa, clean_yeu_cau]
        elif stt == '' and current_category and not hang_hoa:
            if yeu_cau.strip():
                requirements = split_requirements(yeu_cau)
                
                # L·∫•y key cu·ªëi c√πng trong thong_so_ky_thuat (n·∫øu c√≥)
                existing_keys = list(current_category["thong_so_ky_thuat"].keys())
                last_key = existing_keys[-1] if existing_keys else None
                
                for req in requirements:
                    clean_req = clean_text(req)
                    
                    # Ki·ªÉm tra ch·ªØ c√°i ƒë·∫ßu c√≥ vi·∫øt hoa HO·∫∂C c√≥ g·∫°ch ƒë·∫ßu d√≤ng kh√¥ng
                    has_dash = req.strip().startswith('- ')
                    has_uppercase = clean_req and clean_req[0].isupper()
                    
                    if has_uppercase or has_dash:
                        # Ch·ªØ ƒë·∫ßu vi·∫øt hoa HO·∫∂C c√≥ g·∫°ch ƒë·∫ßu d√≤ng -> t·∫°o key m·ªõi
                        key = generate_random_key()
                        current_category["thong_so_ky_thuat"][key] = clean_req
                        last_key = key
                    else:
                        # Ch·ªØ ƒë·∫ßu kh√¥ng vi·∫øt hoa V√Ä kh√¥ng c√≥ g·∫°ch ƒë·∫ßu d√≤ng -> n·ªëi v√†o key tr∆∞·ªõc ƒë√≥
                        if last_key and last_key in current_category["thong_so_ky_thuat"]:
                            current_category["thong_so_ky_thuat"][last_key] += " " + clean_req
                        else:
                            # N·∫øu kh√¥ng c√≥ key tr∆∞·ªõc ƒë√≥ th√¨ v·∫´n t·∫°o key m·ªõi
                            key = generate_random_key()
                            current_category["thong_so_ky_thuat"][key] = clean_req
                            last_key = key
    
    # Th√™m s·∫£n ph·∫©m cu·ªëi c√πng
    if current_product:
        result.append(current_product)
    
    return result

# Chuy·ªÉn ƒë·ªïi d·ªØ li·ªáu
converted_data = convert_to_new_format(data)

In [19]:
converted_data

[{'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U)',
  'cac_muc': [{'ten_hang_hoa': 'Y√™u c·∫ßu chung', 'thong_so_ky_thuat': {}},
   {'ten_hang_hoa': 'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn',
    'thong_so_ky_thuat': {'94C74': 'K√≠ch th∆∞·ªõc: < 2U',
     '48499': 'S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 3',
     '34DE8': 'C√¥ng su·∫•t m·ªói module ch·ªânh l∆∞u: ‚â• 1000W',
     '976D6': 'S·ªë l∆∞·ª£ng module ch·ªânh l∆∞u trang b·ªã k√®m t·ªß ngu·ªìn: ‚â• 3 module',
     'CA0B1': 'Attomat DC: + Attomat cho t·∫£i: ‚óè  Lo·∫°i 30A: ‚â• 02 c√°i ‚óè  Lo·∫°i 10A: ‚â• 01 c√°i ‚óè  Lo·∫°i 03A: ‚â• 01 c√°i + Attomat cho acquy: ‚óè  Lo·∫°i 50A: ‚â• 01 c√°i'}},
   {'ten_hang_hoa': 'ƒê·∫ßu v√†o AC',
    'thong_so_ky_thuat': {'AE067': 'S·ª≠ d·ª•ng ƒë∆∞·ª£c c√°c ƒëi·ªán √°p LÔºãN ÔºãPE/220VAC',
     '181CA': 'D·∫£i ƒëi·ªán √°p ƒë·∫ßu v√†o t·ª´ 85VAC √∑ 300VAC',
     '2A970': 'D·∫£i t·∫ßn s·ªë AC ƒë·∫ßu v√†o: t·ª´ 45Hz √∑ 65Hz.'}},
   {'ten_hang_hoa': 'ƒê·∫ßu ra D

In [20]:
context_queries = {}  # Dict ch·ª©a th√¥ng tin chi ti·∫øt theo key
product_key = {}  # Dict l·ªìng: ten_san_pham -> ten_hang_hoa -> list[key]

for item in converted_data:
    ten_san_pham = item['ten_san_pham']
    for muc in item['cac_muc']:
        ten_hang_hoa = muc['ten_hang_hoa']
        thong_so_ky_thuat = muc['thong_so_ky_thuat']
        for key, value in thong_so_ky_thuat.items():
            if isinstance(value, list):
                q = value[0]
                k = value[1]
                value_str = ' '.join(value)
            else:
                q = None
                k = value
                value_str = value

            # Ghi v√†o context_queries
            context_queries[key] = {
                "ten_san_pham": ten_san_pham,
                "ten_hang_hoa": ten_hang_hoa,
                "value": value_str,
                "yeu_cau_ky_thuat_chi_tiet": k,
                "yeu_cau_ky_thuat": q
            }

            # Ghi v√†o product_key
            if ten_san_pham not in product_key:
                product_key[ten_san_pham] = {}
            if ten_hang_hoa not in product_key[ten_san_pham]:
                product_key[ten_san_pham][ten_hang_hoa] = []
            product_key[ten_san_pham][ten_hang_hoa].append(key)



In [21]:
context_queries

{'94C74': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U)',
  'ten_hang_hoa': 'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn',
  'value': 'K√≠ch th∆∞·ªõc: < 2U',
  'yeu_cau_ky_thuat_chi_tiet': 'K√≠ch th∆∞·ªõc: < 2U',
  'yeu_cau_ky_thuat': None},
 '48499': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U)',
  'ten_hang_hoa': 'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn',
  'value': 'S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 3',
  'yeu_cau_ky_thuat_chi_tiet': 'S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 3',
  'yeu_cau_ky_thuat': None},
 '34DE8': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U)',
  'ten_hang_hoa': 'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn',
  'value': 'C√¥ng su·∫•t m·ªói module ch·ªânh l∆∞u: ‚â• 1000W',
  'yeu_cau_ky_thuat_chi_tiet': 'C√¥ng su·∫•t m·ªói module ch·ªânh l∆∞u: ‚â• 1000W',
  'yeu_cau_ky_thuat': None},
 '976D6': {'ten_san_pham': 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U)',
  'ten_

In [22]:
product_key

{'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U)': {'C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn': ['94C74',
   '48499',
   '34DE8',
   '976D6',
   'CA0B1'],
  'ƒê·∫ßu v√†o AC': ['AE067', '181CA', '2A970'],
  'ƒê·∫ßu ra DC': ['8B9FD', 'CA203', 'ACAE0', '79EA6', '958BA', '3C2CD'],
  'Y√™u c·∫ßu v·ªõi module ch·ªânh l∆∞u (Rectifier)': ['80076',
   '618FD',
   'DCB9E',
   '4827E',
   '5A21D',
   'BDFB1',
   '35BC0',
   'D3686'],
  'T√≠nh nƒÉng c·ªßa thi·∫øt b·ªã ngu·ªìn': ['9E7AD',
   'BBAB8',
   '73ADB',
   '1C0EE',
   '59689',
   '49F9E',
   'B2F1A'],
  'Kh·ªëi ƒëi·ªÅu khi·ªÉn v√† hi·ªÉn th·ªã': ['76F42', 'AAA98', 'D0049', '0581D', '2795C'],
  'ƒêi·ªÅu ki·ªán l√†m vi·ªác': ['DCED8'],
  'H·ªá th·ªëng l√†m m√°t': ['0D04B'],
  'ƒêi·ªÅu ki·ªán b·∫£o h√†nh': ['37398']}}

In [23]:
from openai import OpenAI
from dotenv import load_dotenv
import re
import os
import time
load_dotenv()

clientOpenAi = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [24]:
def retrieve_product_line(product_name, assistant_id="asst_CkhqaSBGeaIlLO5mY7puPybD"):
    thread = clientOpenAi.beta.threads.create()
    thread_id = thread.id
    # 2. G·ª≠i message v√†o thread
    clientOpenAi.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=product_name
    )
    run = clientOpenAi.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        tool_choice="auto"  # ho·∫∑c thay b·∫±ng tool c·ª• th·ªÉ n·∫øu c·∫ßn
        # tool_choice={"type": "function", "function": {"name": "danh_gia_ky_thuat"}}
    )
    run_id = run.id
    # 4. ƒê·ª£i assistant x·ª≠ l√Ω xong
    while True:
        run_status = clientOpenAi.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        if run_status.status == "completed":
            break
        elif run_status.status in ["failed", "cancelled", "expired"]:
            raise Exception(f"Run failed with status: {run_status.status}")
        time.sleep(1)

    # 5. L·∫•y k·∫øt qu·∫£ tr·∫£ v·ªÅ t·ª´ Assistant
    messages = clientOpenAi.beta.threads.messages.list(thread_id=thread_id)
    for message in reversed(messages.data):  # ƒë·∫£o ng∆∞·ª£c ƒë·ªÉ l·∫•y k·∫øt qu·∫£ m·ªõi nh·∫•t tr∆∞·ªõc
        if message.role == "assistant":
            for content in message.content:
                if content.type == "text":
                    return content.text.value

    return None

In [25]:
def create_prompt_extract_module(query_str):
    prompt = f"""
You are an expert in hardware product documentation analysis.  
Read the provided text (which can be either a detailed product brochure or a general product requirement) and extract ONLY the core physical hardware components/modules of the system.
 
For each component:
- If the text explicitly includes a model number, code, or exact specification tied to the component ‚Üí output "<Full Component Name>: <Exact Model(s)/Code(s)>".
- If the text does NOT provide a model number or code ‚Üí output only "<Full Component Name>".
 
Input:
<<<
{query_str}
>>>
 
Output format:
- <Component Name>[: <Model(s)/Code(s) if available>]
 
Rules:
1. Only include core hardware modules essential for the product‚Äôs operation (e.g., Rectifier Module, Controller, AC Input, AC Distribution, DC Distribution, Battery Distribution, Lightning Protection, Cooling System, Battery Bank).
2. Preserve the exact wording of component/module names from the text (do not paraphrase or generalize).
3. Include model numbers, codes, or exact designations only if explicitly stated in the text.  
   - If multiple models exist, list them separated by " / ".
4. If a component has sub-parts (e.g., BLVD/LLVD, Input/Output), keep them as separate lines with their full names.
5. Ignore optional accessories, warranty info, standards compliance, and marketing text unless they are part of the official component name/specification.
7. Do not infer or guess component names‚Äîextract only what is explicitly stated.    
"""
    return prompt

In [26]:
def create_prompt_extract_module2(query_str):
    prompt = f"""
    You are an expert in hardware product documentation analysis.  
Read the provided text (which can be either a detailed product brochure or a general product requirement) and extract ONLY the model numbers, codes, or exact designations of the core physical hardware components/modules of the system.

Input:
<<<
{query_str}
>>>

Output format:
A valid JSON array of strings, where each string is one model/code.  
Example:
["R48-121A3", "R56-3220"]

Rules:
1. Only extract model numbers, codes, or exact designations explicitly stated in the text.  
   - Do NOT include component/module names, descriptions, amperage, voltage, or units (e.g., "125 A / 2P" is ignored).
2. Extract models only from core hardware modules essential for the product‚Äôs operation (e.g., Rectifier Module, Controller, AC Input, AC Distribution, DC Distribution, Battery Distribution, Lightning Protection, Cooling System, Battery Bank).
3. Preserve the exact case, spacing, and characters from the original text.
4. If multiple models are listed together, split them into separate JSON array elements.
5. Ignore optional accessories, warranty info, standards compliance, and marketing text.
6. Do not infer or guess model numbers‚Äîextract only what is explicitly stated.
7. Output only a valid JSON array without extra text or explanations.

    """
    return prompt

In [27]:
product = 'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U)'
product_line = retrieve_product_line(product)
print(f"Product Line: {product_line}")
query_str = f""

all_requirements = product_key[product]
for key in all_requirements:
    query_str += f"{key} :"
    for item in all_requirements[key]:
        if item not in context_queries:
            continue
        query_str += context_queries[item]["value"]
    query_str += "\n"
    print(query_str)
prompt_yeu_cau_ky_thuat = create_prompt_extract_module(query_str)
response = clientOpenAi.responses.create(
    model="gpt-4o-mini",
    input=prompt_yeu_cau_ky_thuat,
    temperature=0
)
product_requirement = f"{product}: {response.output_text.strip()}" 


  thread = clientOpenAi.beta.threads.create()
  clientOpenAi.beta.threads.messages.create(
  run = clientOpenAi.beta.threads.runs.create(
  run_status = clientOpenAi.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
  messages = clientOpenAi.beta.threads.messages.list(thread_id=thread_id)


Product Line: "DC Power Systems"
C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn :K√≠ch th∆∞·ªõc: < 2US·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 3C√¥ng su·∫•t m·ªói module ch·ªânh l∆∞u: ‚â• 1000WS·ªë l∆∞·ª£ng module ch·ªânh l∆∞u trang b·ªã k√®m t·ªß ngu·ªìn: ‚â• 3 moduleAttomat DC: + Attomat cho t·∫£i: ‚óè  Lo·∫°i 30A: ‚â• 02 c√°i ‚óè  Lo·∫°i 10A: ‚â• 01 c√°i ‚óè  Lo·∫°i 03A: ‚â• 01 c√°i + Attomat cho acquy: ‚óè  Lo·∫°i 50A: ‚â• 01 c√°i

C·∫•u h√¨nh thi·∫øt b·ªã ngu·ªìn :K√≠ch th∆∞·ªõc: < 2US·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier): ‚â• 3C√¥ng su·∫•t m·ªói module ch·ªânh l∆∞u: ‚â• 1000WS·ªë l∆∞·ª£ng module ch·ªânh l∆∞u trang b·ªã k√®m t·ªß ngu·ªìn: ‚â• 3 moduleAttomat DC: + Attomat cho t·∫£i: ‚óè  Lo·∫°i 30A: ‚â• 02 c√°i ‚óè  Lo·∫°i 10A: ‚â• 01 c√°i ‚óè  Lo·∫°i 03A: ‚â• 01 c√°i + Attomat cho acquy: ‚óè  Lo·∫°i 50A: ‚â• 01 c√°i
ƒê·∫ßu v√†o AC :S·ª≠ d·ª•ng ƒë∆∞·ª£c c√°c ƒëi·ªán √°p LÔºãN ÔºãPE/220VACD·∫£i ƒëi·ªán √°p ƒë·∫ßu v√†o t·ª´ 85VAC √∑ 300VACD·∫£i t·∫ßn s·ªë AC ƒë

In [28]:
product_requirement

'B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn AC/DC lo·∫°i nh·ªè (1U): - K√≠ch th∆∞·ªõc\n- S·ªë l∆∞·ª£ng khe c·∫Øm module ch·ªânh l∆∞u (Rectifier)\n- C√¥ng su·∫•t m·ªói module ch·ªânh l∆∞u\n- S·ªë l∆∞·ª£ng module ch·ªânh l∆∞u trang b·ªã k√®m t·ªß ngu·ªìn\n- Attomat DC\n- Attomat cho t·∫£i\n- Attomat cho acquy\n- ƒê·∫ßu v√†o AC\n- D·∫£i ƒëi·ªán √°p ƒë·∫ßu v√†o\n- D·∫£i t·∫ßn s·ªë AC ƒë·∫ßu v√†o\n- ƒê·∫ßu ra DC\n- ƒêi·ªán √°p ƒë·∫ßu ra danh ƒë·ªãnh\n- D·∫£i ƒëi·ªán √°p ƒë·∫ßu ra\n- D√≤ng t·∫£i max\n- ƒê·ªô ·ªïn ƒë·ªãnh ƒëi·ªán √°p ƒë·∫ßu ra\n- Nhi·ªÖu bƒÉng r·ªông (Wide band noise)\n- ƒê·ªô g·ª£n s√≥ng ƒë·∫ßu ra (ƒë·ªânh ‚Äì ƒë·ªânh)\n- Y√™u c·∫ßu v·ªõi module ch·ªânh l∆∞u (Rectifier)\n- H·ªá s·ªë c√¥ng su·∫•t c·ªßa Rectifier\n- Hi·ªáu su·∫•t ƒë·ªânh module rectifier\n- T·ªïng ƒë·ªô m√©o h√†i THD c·ªßa Rectifier\n- Rectifier thay th·∫ø n√≥ng\n- Tr·∫°ng th√°i ho·∫°t ƒë·ªông c·ªßa Rectifier\n- T√≠nh nƒÉng c·ªßa thi·∫øt b·ªã ngu·ªìn\n- B·ªô ngu·ªìn\n- H·ªá th·ªëng c√≥ trang b·ªã ch·ª©c nƒÉng LLVD (Load Low

In [45]:
import os
from dotenv import load_dotenv
from llama_index.core import VectorStoreIndex
from llama_index.core.vector_stores import VectorStoreInfo
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.settings import Settings
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition
)
import time


# C·∫•u h√¨nh LLM v√† Embedding
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
# C·∫•u h√¨nh client Qdrant
client = QdrantClient(
    url="https://a8bcf78f-0147-411f-aa58-079f863fcd6d.us-west-1-0.aws.cloud.qdrant.io:6333",
    api_key=os.getenv("QDRANT_API_KEY"),
)
aclient = AsyncQdrantClient(
    url="https://a8bcf78f-0147-411f-aa58-079f863fcd6d.us-west-1-0.aws.cloud.qdrant.io:6333",
    api_key=os.getenv("QDRANT_API_KEY"),
)
# Kh·ªüi t·∫°o Vector Store
vector_store = QdrantVectorStore(
    collection_name="hello_my_friend",
    client=client,
    aclient=aclient,
)

def retrieve_document(product_line, query_str):
    product_ids = []
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
    
    filters_document = MetadataFilters(
        filters=[
            MetadataFilter(key="product_line", operator=FilterOperator.EQ, value=product_line),
            MetadataFilter(key="type", operator=FilterOperator.EQ, value="summary_document"),
        ],
    condition=FilterCondition.AND,
    )
    retriever_document = index.as_retriever(similarity_top_k=5, sparse_top_k=10, verbose=True, enable_hybrid=True, filters=filters_document,alpha=0.7)
    
    results = retriever_document.retrieve(query_str)
    # print("product: ", results)
    # print("results: ", results)
    for result in results:
        metadata = result.metadata
        print(metadata)
        product_ids.append(
            {
                "product_id": metadata["product_id"],
                "brochure_file_path": metadata["brochure_file_path"],
            }
        )

    return product_ids

async def retrieve_chunk_async(product_ids, query_str):
    """
    Phi√™n b·∫£n async c·ªßa retrieve_chunk
    """
    # Wrap c√°c operations ƒë·ªìng b·ªô trong executor ƒë·ªÉ kh√¥ng block event loop
    loop = asyncio.get_event_loop()
    
    def _retrieve_sync():
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
        
        filters_chunk = MetadataFilters(
            filters=[
                MetadataFilter(key="product_id", operator=FilterOperator.IN, value=product_ids),
                MetadataFilter(key="type", operator=FilterOperator.EQ, value="chunk_document"),
            ],
            condition=FilterCondition.AND,
        )
        retriever_chunk = index.as_retriever(similarity_top_k=5, verbose=True, filters=filters_chunk)
        
        results = retriever_chunk.retrieve(query_str)
        content = ""
        for i, result in enumerate(results, start=1):
            metadata = result.metadata
            file_name = metadata["file_name"] + ".pdf"
            page = metadata["page"]
            table = metadata["table_name"]
            figure_name = metadata.get("figure_name")
            text = result.text.strip()
            content += f"Chunk {i} trong file {file_name} t·∫°i trang {page}, c√≥ ch·ª©a b·∫£ng {table} v√† h√¨nh {figure_name} c√≥ n·ªôi dung:\n{text}\n\n"
        return content
    
    # Ch·∫°y function ƒë·ªìng b·ªô trong thread pool ƒë·ªÉ kh√¥ng block
    return await loop.run_in_executor(None, _retrieve_sync)

def retrieve_product_line(product_name, assistant_id="asst_j5wHMN84dpSLXD2GMH5QifS0"):
    from openai import OpenAI 
    clientOpenAi = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    thread = clientOpenAi.beta.threads.create()
    thread_id = thread.id
    # 2. G·ª≠i message v√†o thread
    clientOpenAi.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=product_name
    )
    run = clientOpenAi.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        tool_choice="auto"  # ho·∫∑c thay b·∫±ng tool c·ª• th·ªÉ n·∫øu c·∫ßn
        # tool_choice={"type": "function", "function": {"name": "danh_gia_ky_thuat"}}
    )
    run_id = run.id
    # 4. ƒê·ª£i assistant x·ª≠ l√Ω xong
    while True:
        run_status = clientOpenAi.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run_id)
        if run_status.status == "completed":
            break
        elif run_status.status in ["failed", "cancelled", "expired"]:
            raise Exception(f"Run failed with status: {run_status.status}")
        time.sleep(1)

    # 5. L·∫•y k·∫øt qu·∫£ tr·∫£ v·ªÅ t·ª´ Assistant
    messages = clientOpenAi.beta.threads.messages.list(thread_id=thread_id)
    for message in reversed(messages.data):  # ƒë·∫£o ng∆∞·ª£c ƒë·ªÉ l·∫•y k·∫øt qu·∫£ m·ªõi nh·∫•t tr∆∞·ªõc
        if message.role == "assistant":
            for content in message.content:
                if content.type == "text":
                    # print("product_line: ", content.text.value)
                    return content.text.value

    return None

def retrieve_component(keyword_list):
    # T·∫°o danh s√°ch ƒëi·ªÅu ki·ªán OR
    should_conditions = [
        FieldCondition(
            key='file_brochure_name',
            match=MatchText(text=kw)
        )
        for kw in keyword_list
    ]

    text_filter = Filter(
        should=should_conditions  # OR search
    )

    scroll_result, next_page = client.scroll(
        collection_name="hello_my_friend",
        scroll_filter=text_filter,
        limit=5
    )

    product_ids = []
    if scroll_result:
        print("K·∫øt qu·∫£ t√¨m ki·∫øm:")
        for result in scroll_result:
            metadata = result.payload
            product_id = metadata.get("product_id", "")
            if product_id:
                product_ids.append(product_id)
    print(product_ids)
    return product_ids


        




In [46]:
product_line = product_line.replace('"', '').replace("'", "")
print(product_line)  # DC Power Systems

DC Power Systems


In [47]:
products = retrieve_document(product_line, product_requirement)

{'category': 'Critical Power', 'product_line': 'DC Power Systems', 'product_name': 'Vertiv‚Ñ¢ Mini NetSure‚Ñ¢ Control Unit', 'summary': 'A compact, pluggable DC power system controller designed for dense power environments, enabling remote monitoring and control of rectifiers, converters, batteries, and site conditions. It manages the full DC power chain.Integrated control of AC mains, rectifiers, batteries, and environment. Remote management via web (HTTPS), SNMP, Modbus, TL1. Advanced battery management: thermal protection, reserve time prediction. Hot-pluggable and supports multilingual interfaces. Expandable via optional interface boards for more I/Os.', 'product_id': '7866b3f6-78bd-11f0-928b-38f3abb08dd1', 'type': 'summary_document', 'brochure_file_path': 'output/M831A Controller Datasheet.md', 'file_brochure_name': 'M831A Controller Datasheet.pdf'}
{'category': 'Critical Power', 'product_line': 'DC Power Systems', 'product_name': '\u200bNetSure 701', 'summary': 'The NetSure‚Ñ¢ 70

In [48]:
products

[{'product_id': '7866b3f6-78bd-11f0-928b-38f3abb08dd1',
  'brochure_file_path': 'output/M831A Controller Datasheet.md'},
 {'product_id': '5ac3396c-78c0-11f0-a69c-38f3abb08dd1',
  'brochure_file_path': 'output/netsure-701-series_datasheet_en-asia.md'},
 {'product_id': '967d4216-78c2-11f0-8aad-38f3abb08dd1',
  'brochure_file_path': 'output/esure-rectifier-1r483000e3b-data-sheet.md'},
 {'product_id': '6f65920b-78c0-11f0-8850-38f3abb08dd1',
  'brochure_file_path': 'output/netsure-801-brochure.md'},
 {'product_id': 'e3b19345-78bc-11f0-85a4-38f3abb08dd1',
  'brochure_file_path': 'output/netsure-2100-brochure.md'}]

In [68]:
item = products[4]

In [69]:
item

{'product_id': 'c9ab50bc-78bb-11f0-86ca-38f3abb08dd1',
 'brochure_file_path': 'output/NetSure -731 A41 Brochure.md'}

In [70]:
product_search_id = []
product_id = item["product_id"]
brochure = item["brochure_file_path"]

In [71]:
from llama_index.readers.file import MarkdownReader
import json

# Kh·ªüi t·∫°o reader
reader = MarkdownReader()
documents = reader.load_data(file=f"D:/study/LammaIndex/{brochure}")
markdown_text = "\n".join(doc.text for doc in documents)

In [72]:
prompt_brochure = create_prompt_extract_module2(markdown_text)
response = clientOpenAi.responses.create(
    model="gpt-4o-mini",
    input=prompt_brochure,
    temperature=0
)
product_brochure = response.output_text.strip()

In [73]:
import json
import re
# C√°ch 1: D√πng regex ƒë·ªÉ l·∫•y ph·∫ßn b√™n trong code block
match = re.search(r'```json\s*(.*?)\s*```', product_brochure, re.DOTALL)
if match:
    json_str = match.group(1)
else:
    json_str = response  # n·∫øu kh√¥ng c√≥ code block th√¨ d√πng nguy√™n vƒÉn

# Parse JSON th√†nh list Python
product_brochure = json.loads(json_str)

In [86]:
product_brochure

['NetSure‚Ñ¢ 731 A41',
 'R48-3000A3',
 'R48-3000e3',
 'R48-3500e3',
 'R48-3500E4',
 'M221S',
 'M830B']

In [75]:
from qdrant_client.http.models import PayloadSchemaType, Filter, FieldCondition, MatchText
product_component_id = retrieve_component(product_brochure)

K·∫øt qu·∫£ t√¨m ki·∫øm:
['a78fd7ff-78bb-11f0-a0c0-38f3abb08dd1', '58875529-78bc-11f0-9d8f-38f3abb08dd1', 'c9ab50bc-78bb-11f0-86ca-38f3abb08dd1']


In [76]:
product_search_id=[]
product_search_id.extend(product_component_id) 
product_search_id.append(product_id)

In [77]:
product_search_id = set(product_search_id)

In [84]:
product_search_id

{'58875529-78bc-11f0-9d8f-38f3abb08dd1',
 'a78fd7ff-78bb-11f0-a0c0-38f3abb08dd1',
 'c9ab50bc-78bb-11f0-86ca-38f3abb08dd1'}

In [None]:
context_queries

In [88]:
kha_nang_dap_ung_tham_chieu_final = {}
kha_nang_dap_ung_tham_chieu_step = {}
async def process_single_item(item: str, key: str, context_queries: Dict, product_search_id: Any) -> tuple:
    """
    X·ª≠ l√Ω m·ªôt item ƒë∆°n l·∫ª m·ªôt c√°ch b·∫•t ƒë·ªìng b·ªô
    """
    if item not in context_queries:
        return item, None
    
    query = context_queries[item]["value"]
    content = await retrieve_chunk_async(product_search_id, query)
    print(f"Processed item: {item}")  # In ra ƒë·ªÉ theo d√µi ti·∫øn tr√¨nh
    
    return item, {
        'relevant_context': content
    }
# Phi√™n b·∫£n v·ªõi gi·ªõi h·∫°n s·ªë l∆∞·ª£ng concurrent tasks (khuy·∫øn ngh·ªã)
async def process_requirements_async_with_semaphore(all_requirements: Dict, context_queries: Dict, product_search_id: Any, max_concurrent: int = 10) -> Dict:
    """
    Phi√™n b·∫£n async v·ªõi gi·ªõi h·∫°n s·ªë l∆∞·ª£ng concurrent tasks
    """
    kha_nang_dap_ung_tham_chieu_step = {}
    semaphore = asyncio.Semaphore(max_concurrent)
    
    async def process_with_semaphore(item: str, key: str):
        async with semaphore:
            return await process_single_item(item, key, context_queries, product_search_id)
    
    # T·∫°o danh s√°ch t·∫•t c·∫£ c√°c tasks v·ªõi semaphore
    tasks = []
    for key in all_requirements:
        for item in all_requirements[key]:
            task = process_with_semaphore(item, key)
            tasks.append(task)
    
    print(f"B·∫Øt ƒë·∫ßu x·ª≠ l√Ω {len(tasks)} tasks v·ªõi t·ªëi ƒëa {max_concurrent} concurrent...")
    
    # Ch·∫°y t·∫•t c·∫£ tasks ƒë·ªìng th·ªùi (nh∆∞ng gi·ªõi h·∫°n concurrent)
    results = await asyncio.gather(*tasks, return_exceptions=True)
    
    # X·ª≠ l√Ω k·∫øt qu·∫£
    for result in results:
        if isinstance(result, Exception):
            print(f"L·ªói khi x·ª≠ l√Ω: {result}")
            continue
        
        item, data = result
        if data is not None:
            kha_nang_dap_ung_tham_chieu_step[item] = data
    
    return kha_nang_dap_ung_tham_chieu_step
kha_nang_dap_ung_tham_chieu_step = await process_requirements_async_with_semaphore(
        all_requirements, context_queries, product_search_id, max_concurrent=5
    )

B·∫Øt ƒë·∫ßu x·ª≠ l√Ω 45 tasks v·ªõi t·ªëi ƒëa 5 concurrent...
Processed item: B6B13
Processed item: A13CF
Processed item: 53F0F
Processed item: 6266E
Processed item: EF540
Processed item: 48E25
Processed item: 79BBE
Processed item: 46045
Processed item: 9C791
Processed item: 53211
Processed item: FCB4C
Processed item: 3D45D
Processed item: EC8D4
Processed item: 660E9
Processed item: E3A93
Processed item: 602C6
Processed item: FDD3C
Processed item: FFE72
Processed item: 7C3EA
Processed item: AFECD
Processed item: 76B1A
Processed item: 5F427
Processed item: 6E617
Processed item: 3F0D7
Processed item: FC55E
Processed item: 31B97
Processed item: 1D174
Processed item: 13DCF
Processed item: 95399
Processed item: 981D1
Processed item: F23C4
Processed item: 5EF9F
Processed item: 94585
Processed item: 6C105
Processed item: FF2ED
Processed item: 336B2
Processed item: 158E4
Processed item: C60F3
Processed item: DD0A8
Processed item: E5B19
Processed item: 8949D
Processed item: A2B89
Processed ite

In [89]:
kha_nang_dap_ung_tham_chieu_step

 '53F0F': {'relevant_context': 'Chunk 1 trong file Netsure 731 A41 Usermanual.pdf t·∫°i trang 30, c√≥ ch·ª©a b·∫£ng Technical Data of Power System v√† h√¨nh None c√≥ n·ªôi dung:\n|\n|                                               | Voltage set-point accuracy                       | ‚â§1ÔºÖ                                                                                                                             |\n|                                               | Hold up time                                     | 30 ms                                                                                                                           |\n|                                               | Output noise                                     | Wide band noise ‚â§20mV rms (5Hz - 1MHz)<br/>Peak-Peak noise ‚â§250mV p-p (0 - 20MHz)                                               |\n|                                               | Psophometric noise                               | ‚â§2mVÔºà300Ô

In [41]:
from openai import OpenAI
import json 
import time
import asyncio
from typing import Dict, Any

clientOpenAI = OpenAI()


DEFAULT_OBJECT = {
    "yeu_cau_ky_thuat": "",
    "kha_nang_dap_ung": "",
    "tai_lieu_tham_chieu": {
        "file": "",
        "section": "",
        "table_or_figure": "",
        "page": 0,
        "evidence": ""
    }
}

def fill_defaults(data: dict, defaults: dict) -> dict:
    """
    ƒê·ªá quy b·ªï sung c√°c tr∆∞·ªùng m·∫∑c ƒë·ªãnh v√†o data n·∫øu thi·∫øu.
    """
    result = defaults.copy()
    for key, default_value in defaults.items():
        if key in data:
            if isinstance(default_value, dict) and isinstance(data[key], dict):
                result[key] = fill_defaults(data[key], default_value)
            else:
                result[key] = data[key]
    return result


def extract_first_json_object(json_str: str):
    s = json_str.strip()
    
    # T√¨m d·∫•u '{' ƒë·∫ßu ti√™n
    start_index = s.find('{')
    if start_index == -1:
        print("‚ùå Kh√¥ng t√¨m th·∫•y JSON object n√†o.")
        return DEFAULT_OBJECT

    # Duy·ªát t·ª´ ƒë√≥ ƒë·ªÉ t√¨m d·∫•u '}' k·∫øt th√∫c object ƒë·∫ßu ti√™n
    brace_count = 0
    end_index = -1
    for i in range(start_index, len(s)):
        if s[i] == '{':
            brace_count += 1
        elif s[i] == '}':
            brace_count -= 1
            if brace_count == 0:
                end_index = i + 1  # C·∫Øt ƒë·∫øn sau d·∫•u '}'
                break
    if end_index == -1:
        print("‚ùå Kh√¥ng t√¨m th·∫•y JSON ƒë√≥ng ƒë√∫ng.")
        return DEFAULT_OBJECT

    first_json_str = s[start_index:end_index]
    # Ki·ªÉm tra xem c√≥ parse ƒë∆∞·ª£c kh√¥ng
    try:
        result = json.loads(first_json_str)
        # B·ªï sung field m·∫∑c ƒë·ªãnh n·∫øu thi·∫øu
        return fill_defaults(result, DEFAULT_OBJECT)
    except json.JSONDecodeError:
        return DEFAULT_OBJECT

# === ASYNC VERSION OF TRACK_REFERENCE ===
async def track_reference_async(context_queries: Dict, kha_nang_dap_ung_tham_chieu_step: Dict, 
                               max_concurrent: int = 5) -> Dict:
    """
    Phi√™n b·∫£n async c·ªßa track_reference v·ªõi gi·ªõi h·∫°n s·ªë requests ƒë·ªìng th·ªùi
    """
    assistant_id = "asst_FZIBIfjPM3kCoxURARvM27UV"
    
    # T·∫°o semaphore ƒë·ªÉ gi·ªõi h·∫°n s·ªë requests ƒë·ªìng th·ªùi
    semaphore = asyncio.Semaphore(max_concurrent)
    
    async def process_single_item(key: str):
        async with semaphore:  # Gi·ªõi h·∫°n s·ªë requests ƒë·ªìng th·ªùi
            try:
                value = context_queries[key]["value"]
                content = kha_nang_dap_ung_tham_chieu_step[key]["relevant_context"]
                
                user_prompt = f"""
                Chunk v√† metadata: {content}
                Y√™u c·∫ßu: {value}
                """
                
                print(f"üöÄ ƒêang x·ª≠ l√Ω key: {key}")
                result = await evaluate_technical_requirement_async(user_prompt, assistant_id)
                
                if isinstance(result, str):
                    result = extract_first_json_object(result)
                
                return key, result
                
            except Exception as e:
                print(f"‚ùå L·ªói x·ª≠ l√Ω key {key}: {str(e)}")
                return key, DEFAULT_OBJECT
    
    # T·∫°o tasks cho t·∫•t c·∫£ items
    tasks = [process_single_item(key) for key in kha_nang_dap_ung_tham_chieu_step]
    
    # Ch·∫°y t·∫•t c·∫£ tasks song song
    print(f"üèÉ‚Äç‚ôÇÔ∏è B·∫Øt ƒë·∫ßu x·ª≠ l√Ω {len(tasks)} items v·ªõi {max_concurrent} requests ƒë·ªìng th·ªùi...")
    results = await asyncio.gather(*tasks, return_exceptions=True)
    
    # X·ª≠ l√Ω k·∫øt qu·∫£
    for result in results:
        if isinstance(result, Exception):
            print(f"‚ùå Task failed: {result}")
            continue
            
        key, processed_result = result
        
        kha_nang_dap_ung_tham_chieu_step[key]["kha_nang_dap_ung"] = processed_result.get("kha_nang_dap_ung", "")
        kha_nang_dap_ung_tham_chieu_step[key]["tai_lieu_tham_chieu"] = {
            "file": processed_result["tai_lieu_tham_chieu"]["file"],
            "section": processed_result["tai_lieu_tham_chieu"].get("section", ""),
            "table_or_figure": processed_result["tai_lieu_tham_chieu"].get("table_or_figure", ""),
            "page": processed_result["tai_lieu_tham_chieu"].get("page", 0),
            "evidence": processed_result["tai_lieu_tham_chieu"].get("evidence", "")
        }
        kha_nang_dap_ung_tham_chieu_step[key].pop("relevant_context", None)
        print(f"‚úÖ Ho√†n th√†nh key: {key}")
    
    print("üéâ Ho√†n th√†nh t·∫•t c·∫£!")
    return kha_nang_dap_ung_tham_chieu_step
# === ASYNC VERSION CHO ƒêO·∫†N CODE C·ª¶A B·∫†N ===
async def process_requirements_async(context_queries: Dict, kha_nang_dap_ung_tham_chieu_step: Dict,
                                   max_concurrent: int = 5):
    """
    Phi√™n b·∫£n async cho ƒëo·∫°n code g·ªëc c·ªßa b·∫°n
    """
    assistant_id = "asst_FZIBIfjPM3kCoxURARvM27UV"
    semaphore = asyncio.Semaphore(max_concurrent)
    
    async def process_item(key: str):
        async with semaphore:
            try:
                value = context_queries[key]["value"]
                content = kha_nang_dap_ung_tham_chieu_step[key]["relevant_context"]
                form = context_queries[key]["yeu_cau_ky_thuat_chi_tiet"]
                
                user_prompt = f'''
                Chunk v√† metadata: {content}
                Y√™u c·∫ßu: {value}
                '''
                
                print(f"üöÄ Processing: {key}")
                
                # G·ªçi h√†m ƒë√°nh gi√° (async version)
                result = await evaluate_technical_requirement_async(user_prompt, assistant_id)
                
                if isinstance(result, str):
                    result = extract_first_json_object(result)
                
                return key, result
                
            except Exception as e:
                print(f"‚ùå Error processing {key}: {e}")
                return key, DEFAULT_OBJECT
    
    # T·∫°o v√† ch·∫°y t·∫•t c·∫£ tasks
    tasks = [process_item(key) for key in kha_nang_dap_ung_tham_chieu_step]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    
    # C·∫≠p nh·∫≠t k·∫øt qu·∫£
    for result in results:
        if isinstance(result, Exception):
            continue
            
        key, processed_result = result
        
        kha_nang_dap_ung_tham_chieu_step[key]["kha_nang_dap_ung"] = processed_result.get('kha_nang_dap_ung', "")
        kha_nang_dap_ung_tham_chieu_step[key]["tai_lieu_tham_chieu"] = {
            "file": processed_result['tai_lieu_tham_chieu']['file'],
            "section": processed_result['tai_lieu_tham_chieu'].get('section', ''),
            "table_or_figure": processed_result['tai_lieu_tham_chieu'].get('table_or_figure', ''),
            "page": processed_result['tai_lieu_tham_chieu'].get('page', 0),
            "evidence": processed_result['tai_lieu_tham_chieu'].get('evidence', '')
        }
        kha_nang_dap_ung_tham_chieu_step[key].pop("relevant_context", None)
        
        print(f"‚úÖ Completed: {key}")

# H√†m t·∫°o thread
def create_thread():
    thread = clientOpenAI.beta.threads.create()
    return thread.id

# === UPDATE ASSISTANT ===
def update_assistant(assistant_id):
    assistant = clientOpenAI.beta.assistants.update(
        assistant_id=assistant_id,
        instructions=SYSTEM_PROMPT,
        model="gpt-4o-mini",
        tools=[{"type": "function", "function": FUNCTION_SCHEMA}]
    )
    return assistant.id

# === EVALUATE TECHNICAL REQUIREMENT ===
async def evaluate_technical_requirement_async(user_prompt: str, assistant_id: str) -> Dict[str, Any]:
    """
    Phi√™n b·∫£n async c·ªßa evaluate_technical_requirement
    """
    def _sync_evaluate():
        # 1. T·∫°o thread ri√™ng cho m·ªói l·∫ßn g·ªçi
        thread = clientOpenAI.beta.threads.create()
        thread_id = thread.id

        # 2. G·ª≠i message v√†o thread
        clientOpenAI.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content=user_prompt
        )

        # 3. T·∫°o run
        run = clientOpenAI.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistant_id,
            tool_choice={"type": "function", "function": {"name": "danh_gia_ky_thuat"}}
        )

        # 4. Ch·ªù assistant x·ª≠ l√Ω (t·ªëi ƒëa 20s)
        for _ in range(20):
            run = clientOpenAI.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
            if run.status not in ["queued", "in_progress"]:
                break
            time.sleep(1)

        # 5. L·∫•y arguments tr·ª±c ti·∫øp
        if run.status == "requires_action":
            call = run.required_action.submit_tool_outputs.tool_calls[0]
            print(f"üëâ Assistant ƒë√£ g·ªçi tool: {call.function.name}")
            print("üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:")
            print(call.function.arguments)
            return call.function.arguments

        elif run.status == "completed":
            messages = clientOpenAI.beta.threads.messages.list(thread_id=thread_id)
            for msg in messages.data:
                print(f"[{msg.role}] {msg.content[0].text.value}")
            return None

        else:
            print(f"Run status: {run.status}")
            return None
    
    # Ch·∫°y function sync trong thread pool
    return await asyncio.to_thread(_sync_evaluate)


In [42]:
# === C√ÅCH S·ª¨ D·ª§NG ===

# Thay th·∫ø ƒëo·∫°n code g·ªëc c·ªßa b·∫°n b·∫±ng:
await process_requirements_async(
    context_queries, 
    kha_nang_dap_ung_tham_chieu_step,
    max_concurrent=10  # S·ªë requests ƒë·ªìng th·ªùi (t√πy ch·ªânh)
)

üöÄ Processing: 59F0F
üöÄ Processing: 41A92
üöÄ Processing: B180C
üöÄ Processing: 42BC5
üöÄ Processing: 7C60F
üöÄ Processing: 2AF36
üöÄ Processing: 3AA33
üöÄ Processing: 19BE0
üöÄ Processing: A2C97
üöÄ Processing: 9636B


  thread = clientOpenAI.beta.threads.create()
  clientOpenAI.beta.threads.messages.create(
  run = clientOpenAI.beta.threads.runs.create(
  run = clientOpenAI.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)


üëâ Assistant ƒë√£ g·ªçi tool: danh_gia_ky_thuat
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"yeu_cau_ky_thuat":"ƒêi·ªán √°p ƒë·∫ßu ra danh ƒë·ªãnh -48VDC, c·ª±c d∆∞∆°ng ƒë·∫•u ƒë·∫•t","kha_nang_dap_ung":"\"\"","tai_lieu_tham_chieu":{"file":"M831A Controller User Manual.pdf","section":"","table_or_figure":"","page":0,"evidence":""}}
üöÄ Processing: E804B
üëâ Assistant ƒë√£ g·ªçi tool: danh_gia_ky_thuat
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"yeu_cau_ky_thuat":"S·ª≠ d·ª•ng ƒë∆∞·ª£c c√°c ƒëi·ªán √°p LÔºãNÔºãPE/220VAC","kha_nang_dap_ung":"","tai_lieu_tham_chieu":{"file":"M831A Controller User Manual.pdf","section":"","table_or_figure":"","page":0,"evidence":""}}
üöÄ Processing: 0CB16
üëâ Assistant ƒë√£ g·ªçi tool: danh_gia_ky_thuat
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"yeu_cau_ky_thuat":"D·∫£i ƒëi·ªán √°p ƒë·∫ßu ra: t·ª´ -43.2VDC t·ªõi -57.6VDC","kha_nang_dap_ung":"","tai_lieu_tham_chieu":{"file":"M831A Controller User Manual.pdf","section":"","tab

In [None]:
kha_nang_dap_ung_tham_chieu_step

In [44]:
from typing import Dict, Any, Tuple
# === ASYNC VERSION OF ADAPT_OR_NOT ===
async def adapt_or_not_async(kha_nang_dap_ung_tham_chieu_step: Dict, 
                           adapt_or_not_step: Dict, 
                           all_requirements: Dict,
                           context_queries: Dict,
                           max_concurrent: int = 5) -> Tuple[Dict, Dict]:
    """
    Phi√™n b·∫£n async c·ªßa h√†m adapt_or_not
    """
    assistant_id = "asst_SIWbRtRbvCxXS9dgqvtj9U8O"
    print(f"Assistant ID: {assistant_id}")
    
    # T·∫°o semaphore ƒë·ªÉ gi·ªõi h·∫°n s·ªë requests ƒë·ªìng th·ªùi
    semaphore = asyncio.Semaphore(max_concurrent)
    
    async def process_requirement(key: str):
        async with semaphore:
            try:
                print(f"üöÄ ƒêang x·ª≠ l√Ω requirement: {key}")
                
                dap_ung_ky_thuat = ""
                tai_lieu_tham_chieu = ""
                
                # Thu th·∫≠p th√¥ng tin t·ª´ t·∫•t c·∫£ items trong requirement
                for item in all_requirements[key]:
                    if item not in kha_nang_dap_ung_tham_chieu_step:
                        continue
                        
                    yeu_cau_ky_thuat = context_queries[item].get('yeu_cau_ky_thuat_chi_tiet', "")
                    kha_nang_dap_ung = kha_nang_dap_ung_tham_chieu_step[item].get('kha_nang_dap_ung', "")
                    dap_ung_ky_thuat += f"{yeu_cau_ky_thuat} || {kha_nang_dap_ung}\n"
            
                    tai_lieu = kha_nang_dap_ung_tham_chieu_step[item].get('tai_lieu_tham_chieu', {})
                    file = tai_lieu.get("file", "")
                    page = tai_lieu.get("page", "")
                    table_or_figure = tai_lieu.get("table_or_figure", "")
                    evidence = tai_lieu.get("evidence", "")
            
                    tai_lieu_text = f"{file}, trang: {page}"
                    if table_or_figure:
                        tai_lieu_text += f", trong b·∫£ng(figure): {table_or_figure}"
                    tai_lieu_text += f", evidence: {evidence}\n\n"
                    tai_lieu_tham_chieu += tai_lieu_text
                
                # Ch·ªâ x·ª≠ l√Ω n·∫øu c√≥ d·ªØ li·ªáu
                if dap_ung_ky_thuat and tai_lieu_tham_chieu:
                    print(f"üìû G·ªçi API cho key: {key}")
                    result = await Evaluator_adaptability_async(dap_ung_ky_thuat, assistant_id)
                    result = parse_output_text(result)  # result ƒë√£ l√† dict
                    
                    output_text = result['ƒë√°p ·ª©ng k·ªπ thu·∫≠t']
                    
                    print(f"‚úÖ Ho√†n th√†nh key: {key}")
                    return key, output_text, tai_lieu_tham_chieu
                else:
                    print(f"‚ö†Ô∏è Kh√¥ng c√≥ d·ªØ li·ªáu cho key: {key}")
                    return key, None, None
                    
            except Exception as e:
                print(f"‚ùå L·ªói x·ª≠ l√Ω key {key}: {str(e)}")
                return key, None, None
    
    # T·∫°o tasks cho t·∫•t c·∫£ requirements
    tasks = [process_requirement(key) for key in all_requirements]
    
    print(f"üèÉ‚Äç‚ôÇÔ∏è B·∫Øt ƒë·∫ßu x·ª≠ l√Ω {len(tasks)} requirements v·ªõi {max_concurrent} requests ƒë·ªìng th·ªùi...")
    results = await asyncio.gather(*tasks, return_exceptions=True)
    
    # X·ª≠ l√Ω k·∫øt qu·∫£
    for result in results:
        if isinstance(result, Exception):
            print(f"‚ùå Task failed: {result}")
            continue
            
        key, output_text, tai_lieu_tham_chieu = result
        
        if output_text is not None and tai_lieu_tham_chieu is not None:
            if key not in adapt_or_not_step:
                adapt_or_not_step[key] = []
            
            adapt_or_not_step[key].append(output_text)
            adapt_or_not_step[key].append(tai_lieu_tham_chieu)
    
    print("üéâ Ho√†n th√†nh t·∫•t c·∫£ requirements!")
    return kha_nang_dap_ung_tham_chieu_step, adapt_or_not_step



def parse_output_text(output_text: str) -> dict:
    DEFAULT_JSON = {"ƒë√°p ·ª©ng k·ªπ thu·∫≠t": "0"}
    # B1: T√¨m ph·∫ßn JSON ƒë·∫ßu ti√™n trong chu·ªói
    match = re.search(r"\{.*\}", output_text, re.DOTALL)
    if not match:
        return DEFAULT_JSON.copy()

    json_str = match.group(0).strip()

    # B2: Parse JSON
    try:
        data = json.loads(json_str)
    except json.JSONDecodeError:
        return DEFAULT_JSON.copy()

    # B3: N·∫øu kh√¥ng c√≥ key th√¨ tr·∫£ m·∫∑c ƒë·ªãnh
    if "ƒë√°p ·ª©ng k·ªπ thu·∫≠t" not in data:
        return DEFAULT_JSON.copy()

    return data



# H√†m t·∫°o thread
def create_thread():
    thread = clientOpenAI.beta.threads.create()
    return thread.id

# === ASYNC VERSION OF EVALUATOR_ADAPTABILITY ===
async def Evaluator_adaptability_async(user_prompt: str, assistant_id: str = "asst_SIWbRtRbvCxXS9dgqvtj9U8O") -> str:
    """
    Phi√™n b·∫£n async c·ªßa Evaluator_adaptability
    """
    def _sync_evaluate():
        # 1. T·∫°o thread ri√™ng cho m·ªói l·∫ßn g·ªçi
        thread = clientOpenAI.beta.threads.create()
        thread_id = thread.id

        # 2. G·ª≠i message v√†o thread
        clientOpenAI.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content=user_prompt
        )

        # 3. T·∫°o run
        run = clientOpenAI.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=assistant_id,
            tool_choice={"type": "function", "function": {"name": "evaluator"}}
        )

        # 4. Ch·ªù assistant x·ª≠ l√Ω (t·ªëi ƒëa 20s)
        for _ in range(20):
            run = clientOpenAI.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)
            if run.status not in ["queued", "in_progress"]:
                break
            time.sleep(1)

        # 5. L·∫•y arguments tr·ª±c ti·∫øp
        if run.status == "requires_action":
            call = run.required_action.submit_tool_outputs.tool_calls[0]
            print(f"üëâ Assistant ƒë√£ g·ªçi tool: {call.function.name}")
            print("üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:")
            print(call.function.arguments)
            return call.function.arguments

        elif run.status == "completed":
            messages = clientOpenAI.beta.threads.messages.list(thread_id=thread_id)
            for msg in messages.data:
                print(f"hello:.........[{msg.role}] {msg.content[0].text.value}")
            return None

        else:
            print(f"Run status: {run.status}")
            return None
    
    # Ch·∫°y function sync trong thread pool
    return await asyncio.to_thread(_sync_evaluate)

In [45]:
adapt_or_not_step = {}
adapt_or_not_final = {}

In [46]:
kha_nang_dap_ung_tham_chieu_step, adapt_or_not_step = await adapt_or_not_async(
    kha_nang_dap_ung_tham_chieu_step,
    adapt_or_not_step,
    all_requirements,  # C·∫ßn pass th√™m bi·∫øn n√†y
    context_queries,   # C·∫ßn pass th√™m bi·∫øn n√†y
    max_concurrent=10   # S·ªë requests ƒë·ªìng th·ªùi
)

Assistant ID: asst_SIWbRtRbvCxXS9dgqvtj9U8O
üèÉ‚Äç‚ôÇÔ∏è B·∫Øt ƒë·∫ßu x·ª≠ l√Ω 9 requirements v·ªõi 10 requests ƒë·ªìng th·ªùi...
üöÄ ƒêang x·ª≠ l√Ω requirement: ƒê·∫ßu v√†o AC
üìû G·ªçi API cho key: ƒê·∫ßu v√†o AC
üöÄ ƒêang x·ª≠ l√Ω requirement: ƒê·∫ßu ra DC
üìû G·ªçi API cho key: ƒê·∫ßu ra DC
üöÄ ƒêang x·ª≠ l√Ω requirement: Y√™u c·∫ßu v·ªõi module ch·ªânh l∆∞u (Rectifier)
üìû G·ªçi API cho key: Y√™u c·∫ßu v·ªõi module ch·ªânh l∆∞u (Rectifier)
üöÄ ƒêang x·ª≠ l√Ω requirement: T√≠nh nƒÉng c·ªßa thi·∫øt b·ªã ngu·ªìn
üìû G·ªçi API cho key: T√≠nh nƒÉng c·ªßa thi·∫øt b·ªã ngu·ªìn
üöÄ ƒêang x·ª≠ l√Ω requirement: Kh·ªëi ƒëi·ªÅu khi·ªÉn v√† hi·ªÉn th·ªã
üìû G·ªçi API cho key: Kh·ªëi ƒëi·ªÅu khi·ªÉn v√† hi·ªÉn th·ªã
üöÄ ƒêang x·ª≠ l√Ω requirement: ƒêi·ªÅu ki·ªán l√†m vi·ªác
üìû G·ªçi API cho key: ƒêi·ªÅu ki·ªán l√†m vi·ªác
üöÄ ƒêang x·ª≠ l√Ω requirement: H·ªá th·ªëng l√†m m√°t
üìû G·ªçi API cho key: H·ªá th·ªëng l√†m m√°t
üöÄ ƒêang x·ª≠ l√Ω requirement: ƒêi·ªÅu ki·ªán b·∫£o h√†n

  thread = clientOpenAI.beta.threads.create()
  clientOpenAI.beta.threads.messages.create(
  run = clientOpenAI.beta.threads.runs.create(
  run = clientOpenAI.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)


üëâ Assistant ƒë√£ g·ªçi tool: evaluator
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"ƒë√°p ·ª©ng k·ªπ thu·∫≠t":"1/2"}
‚úÖ Ho√†n th√†nh key: ƒêi·ªÅu ki·ªán l√†m vi·ªác
üëâ Assistant ƒë√£ g·ªçi tool: evaluator
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"ƒë√°p ·ª©ng k·ªπ thu·∫≠t":"0/5"}
‚úÖ Ho√†n th√†nh key: ƒê·∫ßu ra DC
üëâ Assistant ƒë√£ g·ªçi tool: evaluator
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"ƒë√°p ·ª©ng k·ªπ thu·∫≠t":"2/12"}
‚úÖ Ho√†n th√†nh key: Acquy k√®m theo
üëâ Assistant ƒë√£ g·ªçi tool: evaluator
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"ƒë√°p ·ª©ng k·ªπ thu·∫≠t":"0"}
‚úÖ Ho√†n th√†nh key: Y√™u c·∫ßu v·ªõi module ch·ªânh l∆∞u (Rectifier)
üëâ Assistant ƒë√£ g·ªçi tool: evaluator
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"ƒë√°p ·ª©ng k·ªπ thu·∫≠t":"0"}
‚úÖ Ho√†n th√†nh key: H·ªá th·ªëng l√†m m√°t
üëâ Assistant ƒë√£ g·ªçi tool: evaluator
üß† D·ªØ li·ªáu JSON assistant mu·ªën tr·∫£ v·ªÅ:
{"ƒë√°p ·ª©ng k·ªπ thu·∫≠t":"1"}

In [None]:
kha_nang_dap_ung_tham_chieu_step

In [None]:
adapt_or_not_step

In [132]:
def merge_dicts(kha_nang_dap_ung_tham_chieu_step, context_queries):
    for k, v in kha_nang_dap_ung_tham_chieu_step.items():
        if k in context_queries and isinstance(v, dict) and isinstance(context_queries[k], dict):
            # N·∫øu c·∫£ 2 c√πng l√† dict th√¨ merge ƒë·ªá quy
            merge_dicts(v, context_queries[k])
        else:
            # N·∫øu kh√¥ng ph·∫£i dict ho·∫∑c key ch∆∞a t·ªìn t·∫°i trong B th√¨ g√°n tr·ª±c ti·∫øp
            context_queries[k] = v
    return context_queries

In [133]:
context_queries= merge_dicts(kha_nang_dap_ung_tham_chieu_step, context_queries)

In [None]:
context_queries

In [145]:
for key in adapt_or_not_step:
    product_key['B·ªô chuy·ªÉn ƒë·ªïi ngu·ªìn 220VAC/ 48VDC (k√®m theo 02 d√†n acquy 200Ah)'][key].extend(adapt_or_not_step[key]) 

In [None]:
product_key

In [None]:
s·ª≠a prompt ƒë·ªÉ llm ch·ªâ b√≥c ra ƒë∆∞·ª£c nh·ªØng component ƒë∆∞·ª£c ch·ªâ ƒë·ªãnh(ph·ª• thu·ªôc website)