# Chonking for Better DocQnA

## Queries

|Query|Expectation|Implementation Detail|
|---|---|---|
| What information present in an Manufacturer's Manual is actually irrelevant for writing an SOP? | <li> The irrelevant sections can be dropped off </li> <li>If SOP contains ALL of the information in the Manufacturer's Manual then what's the point of rewriting?</li>|Filter Table of Contents & drop irrelevant pages|
|How is the procedure in the Manufacturer's Manual different from what's in an SOP? | <li>SOPs have pre-defined tasks that need to be carried out - user specific experiments. </li><li>These tasks require combining the various sub-procedures specified by the manufacturer.</li>|<ol><li>Identify sub-processes</li><li>Store subprocess summaries in metadata</li><li>Prompt LLM with experiment & Subprocess Options</li></ol>

In [151]:
import fitz
import re
import pandas as pd
import os
import shutil

In [86]:
doc = fitz.open("data/Akta_Pure_User_Manual.PDF")

In [87]:
pages = []
for page in doc.pages(start = 1, stop = 10):
    pages.append(page)

## Extract Table of Contents

In [88]:
contents_text = pages[0].get_text()
pattern = r'(\d+(\.\d+)*)\s+(.*?)\s+(\d+)'
matches = re.findall(pattern, contents_text)
table_of_contents = []
for match in matches:
    number = match[0]
    heading = re.sub('\.{2,}', '', match[2])
    page_number = match[3]
    table_of_contents.append({"Index": number, "Heading": heading})

table_of_contents = pd.DataFrame(table_of_contents)
table_of_contents.head()

Unnamed: 0,Index,Heading
0,1.0,Introduction
1,1.1,Important user information
2,1.2,ÄKTA pure overview
3,1.3,ÄKTA pure user documentation
4,2.0,The ÄKTA pure instrument


Easy Alternative!

In [89]:
table_of_contents = doc.get_toc()
table_of_contents = pd.DataFrame(table_of_contents, columns = ['Level', 'Title', 'Page Number'])
table_of_contents.head()

Unnamed: 0,Level,Title,Page Number
0,1,Coverpage,1
1,1,Table of Contents,2
2,1,1 Introduction,6
3,2,1.1 Important user information,7
4,2,1.2 ÄKTA pure overview,9


## Extract Images

In [60]:
IMG_DIR = 'images'
os.makedirs(IMG_DIR, exist_ok=True)

In [61]:
for page_index in range(len(doc)):
	page = doc[page_index]
	image_list = page.get_images()

	for image_index, img in enumerate(image_list, start=1):
		xref = img[0]
		pix = fitz.Pixmap(doc, xref)

		if pix.n - pix.alpha > 3:
			pix = fitz.Pixmap(fitz.csRGB, pix)

		pix.save(f"{IMG_DIR}/page_{page_index}-image_{image_index}.png")
		pix = None
print(f"Found {len(os.listdir(IMG_DIR))} images. Saved all under {IMG_DIR}/")

Found 732 images. Saved all under images/


## Extract Tables

In [116]:
from pprint import pprint

page = doc[9]
print(page.get_text()[0:200])
print('\n','-'*50)

tabs = page.find_tables()
print(f"{len(tabs.tables)} tables found on {page}\n", '-'*50)
pd.DataFrame(tabs[0].extract()[1:], columns = tabs[0].extract()[0])

Main functions
Module
Create and edit methods using one or a combination of:
Method Editor
•
Predefined methods with built-in application support
•
Drag-and-drop function to build methods with relevan

 --------------------------------------------------
1 tables found on page 9 of data/Akta_Pure_User_Manual.PDF
 --------------------------------------------------


Unnamed: 0,Module,Main functions
0,Method Editor,Create and edit methods using one or a combina...
1,System Control,"Start, monitor and control runs. The current f..."
2,Evaluation,"Open results, evaluate runs and create reports..."
3,,


In [153]:
TABLE_DIR = 'tables'
os.makedirs(TABLE_DIR, exist_ok=True)
os.makedirs(os.path.join(TABLE_DIR, '.temp'), exist_ok=True)

In [154]:
def extract_and_save_tables(doc, output_directory = os.path.join(TABLE_DIR, '.temp')): 
    count = 0
    for page_num in range(doc.page_count):
        page = doc[page_num]
        tabs = page.find_tables()
        if len(tabs.tables) > 0:
            for idx, tab in enumerate(tabs):
                table_data = tab.extract()
                header, *rows = table_data
    
                # Convert table data to DataFrame
                df = pd.DataFrame(rows, columns=header)
    
                # Save DataFrame as CSV
                table_filename = f"table_page_{page_num + 1}_idx_{idx + 1}.csv"
                table_path = os.path.join(output_directory, table_filename)
                if df.shape[0]>0:
                    df.to_csv(table_path, index=False)
                    count+=1
    print(f"Saved {count} tables")

In [158]:
def merge_and_save_tables(output_directory):
    input_directory = os.path.join(output_directory, '.temp')
    table_files = sorted(os.listdir(input_directory))
    queue = []

    for table_file in table_files:
        table_path = os.path.join(input_directory, table_file)
        df = pd.read_csv(table_path)

        try: 
            assert (not queue or all(df.columns == queue[0].columns))
            queue.append(df)
        except:
            merged_table = pd.concat(queue, ignore_index=True)
            merged_filename = f"merged_{table_file}"
            merged_table_path = os.path.join(output_directory, merged_filename)
            merged_table.to_csv(merged_table_path, index=False)

            queue = [df]

    if queue:
        merged_table = pd.concat(queue, ignore_index=True)
        merged_filename = f"merged_last_{table_file}"
        merged_table_path = os.path.join(output_directory, merged_filename)
        merged_table.to_csv(merged_table_path, index=False)

    shutil.rmtree(input_directory)
    print(f"{len(os.listdir(output_directory))} tables in repository after merging")

In [156]:
extract_and_save_tables(doc)

Saved 402 tables


In [159]:
merge_and_save_tables(TABLE_DIR)

180 tables in repository after merging


## What Next?