In [1]:
import camelot
import json
import os

In [2]:
def get_continued_tables(tables, threshold):

    continued_tables = {}
    previous_table = False
    group_counter = 0

    # typical height of a pdf is 842 points and bottom margins are anywhere between 56 and 85 points
    # therefore, accounting for margins, 792
    page_height = 792

    # iterate over the tables
    for i, table in enumerate(tables):

        # if a previous table exists (remember, we start with this as false)
        # and the previous table was on the previous page
        # and the number of columns of both tables is the same
        if previous_table and table.page == previous_table.page + 1 and len(table.cols) == len(previous_table.cols):

            # get the bottom coordinate of the previous table
            # note that for pdfs the origin (0, 0) typically starts from the bottom-left corner of the page,
            # with the y-coordinate increasing as you move upwards
            # this is why for {x0, y0, x1, y1} we need the y0 as the bottom
            previous_table_bottom = previous_table._bbox[1]

            # get the top coordinate of the current table
            # for {x0, y0, x1, y1} we need the y1 as the top
            current_table_top = table._bbox[3]

            # if the previous table ends in the last 15% of the page and the current table starts in the first 15% of the page
            if previous_table_bottom < (threshold / 100) * page_height and current_table_top > (1 - threshold / 100) * page_height:

                # if we don't have started this group of tables
                if (continued_tables.get(group_counter) is None):

                    # start by adding the first table
                    continued_tables[group_counter] = [previous_table]

                # add any of the sunsequent tables to the group
                continued_tables[group_counter].append(table)

            # if this is not a continuation of the previous table
            else:

                # increment the group number
                group_counter += 1;

        # if this is not a continuation of the previous table
        else:

            # increment the group number
            group_counter += 1;

        # the current table becomes the previous table for the next iteration
        previous_table = table

    # transform the dictionary into an array of arrays
    continued_tables = [value for value in continued_tables.values()]

    # return the combined tables
    return continued_tables

In [16]:
def table_to_json(table_data, table_info):
    """Convert table data to JSON format"""
    if not table_data:
        return {}
    
    # Create JSON structure
    json_data = {
        "metadata": {
            "source_file": table_info["source_file"],
            "page": table_info["page"],
            "table_order": table_info["order"],
            "total_rows": len(table_data),
            "total_columns": len(table_data[0]) if table_data else 0
        },
        "headers": [],
        "data": []
    }
    
    # Add headers (first row)
    if len(table_data) > 0:
        headers = [str(cell).strip() for cell in table_data[0]]
        
        # Replace first 3 headers with fixed names
        if len(headers) >= 1:
            headers[0] = "STT"
        if len(headers) >= 2:
            headers[1] = "hang_hoa"
        if len(headers) >= 3:
            headers[2] = "yeu_cau_ky_thuat"
            
        json_data["headers"] = headers
        
        # Add data rows (skip header)
        for i, row in enumerate(table_data[1:], 1):
            row_dict = {}
            for j, cell in enumerate(row):
                # Use header as key, fallback to column index if header is empty
                key = json_data["headers"][j] if j < len(json_data["headers"]) and json_data["headers"][j] else f"column_{j}"
                row_dict[key] = str(cell).strip()
            
            json_data["data"].append({
                "row_index": i,
                "values": row_dict
            })
    
    return json_data

In [17]:
def get_biggest_table(pdf_path, threshold):
    tables = camelot.read_pdf(pdf_path, flavor = 'lattice', pages = 'all')
    continued_tables = get_continued_tables(tables, threshold)

    # get the name of the PDF file we are processing (without the extension)
    pdf_file_name = os.path.splitext(os.path.basename(pdf_path))[0]

    processed = []
    all_table_jsons = []

    # iterate over found tables
    for i, table in enumerate(tables):

        # if table was already processed as part of a group
        if table in processed: continue

        # check if the current table is a continued table
        is_continued = any(table in sublist for sublist in continued_tables)

        # collect all table data (current table + continued tables if any)
        all_table_data = list(table.data)

        # if the current table is a continued table, append all subsequent continued tables data
        if is_continued:

            # get the index of the group in "continued_tables" associated with the current table
            group_index = next(index for index, sublist in enumerate(continued_tables) if table in sublist)

            # iterate over the tables in said group and append their data
            for continued_table in continued_tables[group_index]:

                # skip the current table as it's already added
                if continued_table == table or continued_table in processed: continue

                # append the data of the continued table (skip header for subsequent tables)
                all_table_data.extend(continued_table.data[1:] if len(continued_table.data) > 1 else [])

                # keep track of processed tables
                processed.append(continued_table)

        # convert to JSON
        table_info = {
            "source_file": pdf_file_name,
            "page": table.parsing_report['page'],
            "order": table.parsing_report['order']
        }
        
        json_data = table_to_json(all_table_data, table_info)
        all_table_jsons.append(json_data)
        
        # mark current table as processed
        processed.append(table)

    # find the table with the most rows
    if all_table_jsons:
        largest_table = max(all_table_jsons, key=lambda x: x.get('metadata', {}).get('total_rows', 0))
        
        # return the JSON of the largest table
        print(json.dumps(largest_table, ensure_ascii=False, indent=2))
        return largest_table
    else:
        print("No tables found in the PDF.")
        return None

In [None]:
hello = get_biggest_table("D:/study/LammaIndex/documents/Chuong_V_Yeu_cau_ky_thuat.pdf",15)

In [41]:
data = hello["data"]

In [None]:
data

In [None]:
import uuid
def clean_text(text):
    """Làm sạch text, loại bỏ ký tự xuống dòng thừa"""
    return re.sub(r'\n+', '', text.strip())

def split_requirements(text):
    """Tách các yêu cầu dựa trên dấu gạch đầu dòng"""
    requirements = []
    lines = text.split('\n')
    for line in lines:
        line = line.strip()
        if line.startswith('- '):
            requirements.append(line[2:].strip())
        elif line and not any(line.startswith(prefix) for prefix in ['- ']):
            if requirements:
                requirements[-1] += ' ' + line
            else:
                requirements.append(line)
    return requirements

def generate_random_key():
    """Tạo key random 5 ký tự từ UUID"""
    return str(uuid.uuid4()).replace('-', '')[:5].upper()

def convert_to_new_format(data):
    result = []
    current_product = None
    current_category = None
    
    for item in data:
        values = item['values']
        stt_raw  = values['STT']
        hang_hoa = clean_text(values['hang_hoa'])
        yeu_cau = values['yeu_cau_ky_thuat']


        stt = stt_raw.strip()

        roman_pattern = r'^(VII|VIII|IX|X|XI|XII|I{1,3}|IV|V|VI)\s+(.+)'
        roman_match = re.match(roman_pattern, stt)
        # Nếu STT là số La Mã (I, II, III...) thì đây là tên sản phẩm
        hang_hoa_roman_match = re.match(roman_pattern, hang_hoa)
        if roman_match and not hang_hoa and not yeu_cau:
            if current_product:
                result.append(current_product)
            
            roman_num = roman_match.group(1)  # Số La Mã
            product_name = roman_match.group(2)  # Tên sản phẩm
            
            current_product = {
                "ten_san_pham": product_name,
                "cac_muc": []
            }
            current_category = None
        elif hang_hoa_roman_match and not stt_raw and not yeu_cau:
            if current_product:
                result.append(current_product)
            
            roman_num = hang_hoa_roman_match.group(1)  # Số La Mã
            product_name = hang_hoa_roman_match.group(2)  # Tên sản phẩm
            
            current_product = {
                "ten_san_pham": product_name,
                "cac_muc": []
            }
            current_category = None        
        
        elif stt in ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII']:
            if current_product:
                result.append(current_product)
            
            current_product = {
                "ten_san_pham": hang_hoa,
                "cac_muc": []
            }
            current_category = None
            
        # Nếu STT là số (1, 2, 3...) thì đây là danh mục
        elif stt.isdigit():
            current_category = {
                "ten_hang_hoa": hang_hoa,
                "thong_so_ky_thuat": {}
            }
            
            # Xử lý yêu cầu kỹ thuật cho danh mục
            if yeu_cau.strip():
                requirements = split_requirements(yeu_cau)
                for req in requirements:
                    key = generate_random_key()  # Tạo key random
                    current_category["thong_so_ky_thuat"][key] = clean_text(req)
            if current_product:
                current_product["cac_muc"].append(current_category)
                
        # Nếu STT trống thì đây là thông số kỹ thuật chi tiết
        elif stt == '' and current_category and hang_hoa:
            # Tạo key random cho thông số kỹ thuật
            key = generate_random_key()
            
            # Làm sạch tên hàng hóa và yêu cầu kỹ thuật
            clean_hang_hoa = clean_text(hang_hoa)
            clean_yeu_cau = clean_text(yeu_cau)
            
            current_category["thong_so_ky_thuat"][key] = [clean_hang_hoa, clean_yeu_cau]
        elif stt == '' and current_category and not hang_hoa:
            if yeu_cau.strip():
                requirements = split_requirements(yeu_cau)
                
                # Lấy key cuối cùng trong thong_so_ky_thuat (nếu có)
                existing_keys = list(current_category["thong_so_ky_thuat"].keys())
                last_key = existing_keys[-1] if existing_keys else None
                
                for req in requirements:
                    clean_req = clean_text(req)
                    
                    # Kiểm tra chữ cái đầu có viết hoa HOẶC có gạch đầu dòng không
                    has_dash = req.strip().startswith('- ')
                    has_uppercase = clean_req and clean_req[0].isupper()
                    
                    if has_uppercase or has_dash:
                        # Chữ đầu viết hoa HOẶC có gạch đầu dòng -> tạo key mới
                        key = generate_random_key()
                        current_category["thong_so_ky_thuat"][key] = clean_req
                        last_key = key
                    else:
                        # Chữ đầu không viết hoa VÀ không có gạch đầu dòng -> nối vào key trước đó
                        if last_key and last_key in current_category["thong_so_ky_thuat"]:
                            current_category["thong_so_ky_thuat"][last_key] += " " + clean_req
                        else:
                            # Nếu không có key trước đó thì vẫn tạo key mới
                            key = generate_random_key()
                            current_category["thong_so_ky_thuat"][key] = clean_req
                            last_key = key
    
    # Thêm sản phẩm cuối cùng
    if current_product:
        result.append(current_product)
    
    return result

# Chuyển đổi dữ liệu
converted_data = convert_to_new_format(data)

In [None]:
converted_data

In [None]:
context_prompts = []
for item in converted_data:
    ten_san_pham = item['ten_san_pham']
    for muc in item['cac_muc']:
        ten_hang_hoa = muc['ten_hang_hoa']
        thong_so_ky_thuat = muc['thong_so_ky_thuat']
        for key, value in thong_so_ky_thuat.items():
            if isinstance(value, list):
                value_str = ' '.join(value)
            else:
                value_str = value
            query = {
                key: f"{ten_san_pham} {ten_hang_hoa} {value_str}"
            }
            context_prompts.append(query)



In [66]:
print(query[12])

KeyError: 12

In [None]:
def create_query():
    pass

In [None]:
def search_rag(url: str, api_key: str, query: str, collection_name: str = "thong_tin_san_pham", file_name: str = "NetSure_732_User_Manual"):
    # Cấu hình client Qdrant
    client = QdrantClient(
        url=url,
        api_key=api_key,
    )
    aclient = AsyncQdrantClient(
        url=url,
        api_key=api_key,
    )
    # Khởi tạo Vector Store
    vector_store = QdrantVectorStore(
        collection_name=collection_name,
        client=client,
        aclient=aclient,
    )
    filters = MetadataFilters(
        filters=[
            MetadataFilter(key="file_name", operator=FilterOperator.EQ, value=file_name),
            MetadataFilter(key="type", operator=FilterOperator.EQ, value="chunk_document"),
        ],
        condition=FilterCondition.AND,
    )

    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

    retriever = index.as_retriever(similarity_top_k=5, verbose=True, filters=filters)

    query = query

    # --- Thay đổi từ đây ---
    query_engine = RetrieverQueryEngine.from_args(
        retriever=retriever
    )

    # 2. Thực hiện truy vấn qua Query Engine
    print("Bắt đầu truy vấn với Query Engine...")
    response = query_engine.retrieve(query)
    text_content = ""
    for node in response:
        text_content += node.get_content() + "\n"
    return text_content

In [None]:
results = []

for query in queries:
    for key, value in query.items():
        vector = search_rag(value)
        results.append({
            "key": key,
            "value": value,
            "text_content": text_content  
        })

In [None]:
def create_query(requirement: str, text_content: str):
    template = (
        "Based on the following text, create a comprehensive summary for the entire document.\n"
        "Document:\n---\n{requirement}\n {text_content}---\nSummary:"
    )
    prompt_template = PromptTemplate(template)
    response = Settings.llm.predict(prompt_template, requirement=requirement, text_content=text_content)
    return response.strip()

In [None]:
result1 = []
for item in results:
    value = item["value"]
    vector = item["vector"]
    answer = create_query(value, text_content)