In [2]:
# !pip install boto3

In [7]:
import boto3
import time
import json
import os
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
# NOTE(review): assumes the .env file sits next to this notebook — confirm.
load_dotenv()
# os.getenv returns None when AWS_REGION is not set.
AWS_REGION = os.getenv("AWS_REGION")

# Configure boto3's default session with the credentials and region read above.
# Each os.getenv(...) yields None when the variable is missing.
# NOTE(review): presumably boto3 then falls back to its own credential lookup — confirm.
boto3.setup_default_session(
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=AWS_REGION
)

In [None]:
from textractClient import start_job, is_job_complete, get_job_results, save_results_to_file

# Input document for the Textract job.
# NOTE(review): presumably `bucket` is an S3 bucket and `document` the object key — confirm against textractClient.
bucket = 'catalogs-for-extraction'
document_name = 'SampleCylindersCatalog'
document = f'{document_name}.pdf'

# Uncomment to submit a new job. The hard-coded id below reuses a previous run instead.
# job_id = start_job(bucket, document, region=AWS_REGION)
job_id = "9c2c2e2e5a7c1c28c1f09b85da90c8413d7f9ecd4a8d37bc7ee9315a5cc357b7"

In [10]:
# Ask the textractClient helper for the job's state; it prints the status (see cell output).
# NOTE(review): the exact value stored in `status` depends on the helper — confirm before branching on it.
status = is_job_complete(job_id, region=AWS_REGION)

Job status: SUCCEEDED


In [11]:
# Download the job's results. Per the cell output the helper fetches them in
# successive result pages (up to 1000 blocks each here) until none remain.
results_pages = get_job_results(job_id, region=AWS_REGION)

Retrieved 1000 blocks on this page.
Fetching next page of results...
Retrieved 1000 blocks on this page.
Fetching next page of results...
Retrieved 1000 blocks on this page.
Fetching next page of results...
Retrieved 1000 blocks on this page.
Fetching next page of results...
Retrieved 1000 blocks on this page.
Fetching next page of results...
Retrieved 674 blocks on this page.


In [12]:
# Save the fetched responses to disk; the parsing cell below reads this same
# file name via TEXTRACT_JSON_PATH, so the two must stay in sync.
save_results_to_file(results_pages, f'textract_output_{document_name}.json')

Saved results to textract_output_SampleCylindersCatalog.json


In [None]:
import json, os, csv
from collections import defaultdict
from typing import List, Dict, Any, Tuple

# Path of the saved Textract output; depends on `document_name` from an earlier cell
# and must match the file name used when the results were saved.
TEXTRACT_JSON_PATH = f"textract_output_{document_name}.json"  # your saved file (list of responses with Blocks)

def load_textract_pages(path: str) -> List[Dict[str, Any]]:
    """Read a saved Textract JSON file and return it as a list of responses.

    Args:
        path: location of a UTF-8 encoded JSON file.

    Returns:
        The parsed list when the file holds a JSON array, or a one-element
        list wrapping the object when the file holds a single response dict
        that has a "Blocks" key.

    Raises:
        ValueError: the top-level JSON value is neither a list nor a dict
            containing "Blocks".
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    # Already a list of responses: hand it back untouched.
    if isinstance(payload, list):
        return payload

    # A lone response object is wrapped so callers always iterate a list.
    if isinstance(payload, dict) and "Blocks" in payload:
        return [payload]

    raise ValueError("Unexpected JSON structure – expected a list of responses or a single response with Blocks.")

def build_block_maps(blocks: List[Dict[str, Any]]):
    """Index blocks by their "Id" and group them by "BlockType".

    Args:
        blocks: block dicts to index.

    Returns:
        (id_map, type_map) where id_map maps each truthy "Id" to its block
        (a later duplicate Id replaces an earlier one) and type_map is a
        defaultdict(list) keyed by "BlockType" ("" when the key is absent),
        holding the blocks in input order.
    """
    # Blocks without a usable Id are left out of the id lookup only.
    by_id = {blk.get("Id"): blk for blk in blocks if blk.get("Id")}

    by_type = defaultdict(list)
    for blk in blocks:
        by_type[blk.get("BlockType", "")].append(blk)

    return by_id, by_type

def get_text_for_block(block: Dict[str,Any], id_map: Dict[str,Any]) -> str:
    """Join the text of a block's CHILD blocks into one space-separated string.

    Only relationships of type "CHILD" are followed. A WORD child contributes
    its "Text" ("" if absent); a SELECTION_ELEMENT child contributes "[X]"
    when its "SelectionStatus" is "SELECTED" and nothing otherwise. Child ids
    missing from ``id_map`` and children of any other type are ignored.

    Args:
        block: the parent block (e.g. a LINE or CELL).
        id_map: lookup from block id to block dict.

    Returns:
        The collected pieces joined by single spaces, stripped at both ends.
    """
    pieces = []
    child_relationships = (
        rel for rel in block.get("Relationships", []) if rel.get("Type") == "CHILD"
    )
    for rel in child_relationships:
        for child_id in rel.get("Ids", []):
            child = id_map.get(child_id)
            if not child:
                continue
            kind = child.get("BlockType")
            if kind == "WORD":
                pieces.append(child.get("Text",""))
            elif kind == "SELECTION_ELEMENT" and child.get("SelectionStatus") == "SELECTED":
                pieces.append("[X]")
    return " ".join(pieces).strip()

# def extract_tables_from_blocks(blocks: List[Dict[str,Any]]) -> List[Dict[str, Any]]:
#     """Return normalized tables with headers/rows/page/table_index."""
#     id_map, type_map = build_block_maps(blocks)

#     tables_out = []
#     for t_index, tblock in enumerate(type_map.get("TABLE", []), start=1):
#         page = tblock.get("Page", None)

#         # Collect all CELL blocks under this TABLE
#         cell_blocks = []
#         for rel in tblock.get("Relationships", []):
#             if rel.get("Type") == "CHILD":
#                 for cid in rel.get("Ids", []):
#                     child = id_map.get(cid)
#                     if child and child.get("BlockType") == "CELL":
#                         cell_blocks.append(child)

#         # Index cells by (row, col)
#         grid = defaultdict(dict)
#         max_row, max_col = 0, 0
#         for cell in cell_blocks:
#             r = cell.get("RowIndex", 0)
#             c = cell.get("ColumnIndex", 0)
#             txt = get_text_for_block(cell, id_map)
#             grid[r][c] = txt
#             max_row = max(max_row, r)
#             max_col = max(max_col, c)

#         # Build a 2D list
#         table_2d = []
#         for r in range(1, max_row + 1):
#             row = []
#             for c in range(1, max_col + 1):
#                 row.append(grid.get(r, {}).get(c, ""))
#             table_2d.append(row)

#         # Heuristic: use first non-empty row as header
#         header_idx = None
#         for i, row in enumerate(table_2d[:5]):  # scan first 5 rows
#             nonempty = sum(1 for x in row if x.strip())
#             if nonempty >= max(2, max_col // 2):  # half+ populated → header candidate
#                 header_idx = i
#                 break

#         if header_idx is None:
#             # fallback: create generic headers
#             headers = [f"col_{i}" for i in range(1, max_col+1)]
#             body = table_2d
#         else:
#             headers = [h.strip() or f"col_{i}" for i, h in enumerate(table_2d[header_idx], start=1)]
#             body = [r for j, r in enumerate(table_2d) if j > header_idx]

#         tables_out.append({
#             "page": page,
#             "table_index": t_index,
#             "headers": headers,
#             "rows": body
#         })

#     return tables_out
import re
from collections import defaultdict

def _collect_cells_for_table(tblock, id_map):
    """Return list of CELLs, and MERGED_CELLs (if any) for this table."""
    cells, merged = [], []
    for rel in tblock.get("Relationships", []):
        if rel.get("Type") != "CHILD":
            continue
        for cid in rel.get("Ids", []):
            b = id_map.get(cid)
            if not b: 
                continue
            bt = b.get("BlockType")
            if bt == "CELL":
                cells.append(b)
            elif bt == "MERGED_CELL":
                merged.append(b)
    return cells, merged

def extract_tables_from_blocks_unmerged(blocks, replicate_data=True, header_scan_rows=5):
    """
    Build span-aware grids from TABLE blocks and unmerge merged cells.

    Args:
        blocks: flat list of block dicts (all block types mixed).
        replicate_data: when True, the text of a spanning cell is copied into
            every still-empty grid position covered by the span; when False
            only the span's top-left position receives the text.
        header_scan_rows: how many top rows to consider when guessing the
            header band and building composite headers.

    Returns:
        One dict per TABLE block, in encounter order:
        {page, table_index, headers, rows, spans}.
        - table_index is 1-based and counts TABLE blocks across the whole
          ``blocks`` list (it does not restart per page).
        - headers has one entry per column; stacked header rows are joined
          as "Parent / Child", empty columns become "col_<n>".
        - rows are the grid rows below the header band.
        - spans holds (row_span, col_span) per grid position.
        A TABLE without any usable CELL children yields empty headers, rows
        and spans rather than raising.
    """
    # Build maps
    id_map, type_map = build_block_maps(blocks)

    out = []
    for t_index, tblock in enumerate(type_map.get("TABLE", []), start=1):
        page = tblock.get("Page", None)

        cells, merged_cells = _collect_cells_for_table(tblock, id_map)

        # Table size considering spans. Indices default to 1 here because the
        # placement code below also treats a missing index as 1 (1-based).
        max_row = max((c.get("RowIndex", 1) + c.get("RowSpan", 1) - 1 for c in cells), default=0)
        max_col = max((c.get("ColumnIndex", 1) + c.get("ColumnSpan", 1) - 1 for c in cells), default=0)

        if max_row < 1 or max_col < 1:
            # No cell could be resolved for this table: there is no grid to
            # fill and no row 0 to mark as header, so emit an empty table.
            out.append({
                "page": page,
                "table_index": t_index,
                "headers": [],
                "rows": [],
                "spans": [],
            })
            continue

        # Init grid + span holders
        grid = [["" for _ in range(max_col)] for _ in range(max_row)]
        spans = [[(1, 1) for _ in range(max_col)] for _ in range(max_row)]

        def place_text(r0, c0, rs, cs, txt, replicate=True):
            """Write txt at (r0, c0), record its span, optionally fill the span."""
            # Never overwrite text that an earlier cell already placed.
            if not grid[r0][c0]:
                grid[r0][c0] = txt
            spans[r0][c0] = (rs, cs)
            if not replicate:
                return
            for rr in range(r0, r0 + rs):
                for cc in range(c0, c0 + cs):
                    if not grid[rr][cc]:
                        grid[rr][cc] = txt

        # First pass: put CELL text and spans (block indices are 1-based)
        for c in cells:
            place_text(
                c.get("RowIndex", 1) - 1,
                c.get("ColumnIndex", 1) - 1,
                c.get("RowSpan", 1),
                c.get("ColumnSpan", 1),
                get_text_for_block(c, id_map),
                replicate=replicate_data,
            )

        # If MERGED_CELL is present, ensure its text is propagated as well
        for m in merged_cells:
            # MERGED_CELL -> CHILD -> CELL ids
            child_ids = []
            for rel in m.get("Relationships", []):
                if rel.get("Type") == "CHILD":
                    child_ids.extend(rel.get("Ids", []))
            child_cells = [
                id_map[i] for i in child_ids
                if id_map.get(i) and id_map[i].get("BlockType") == "CELL"
            ]
            if not child_cells:
                continue
            # Merged area = bounding box of all child cells (0-based)
            r_indices = []
            c_indices = []
            for cc in child_cells:
                ri = cc.get("RowIndex", 1) - 1
                ci = cc.get("ColumnIndex", 1) - 1
                r_indices.extend(range(ri, ri + cc.get("RowSpan", 1)))
                c_indices.extend(range(ci, ci + cc.get("ColumnSpan", 1)))
            r0, c0 = min(r_indices), min(c_indices)
            rs = max(r_indices) - r0 + 1
            cs = max(c_indices) - c0 + 1
            # Prefer text attached to the merged block, else the first child's
            txt = get_text_for_block(m, id_map) or get_text_for_block(child_cells[0], id_map)
            place_text(r0, c0, rs, cs, txt, replicate=replicate_data)

        # ---- Header detection & composition (multi-row headers) ----
        # Density-based header band guess: walk the scanned rows and stop at
        # the first row whose non-empty count falls to half (or less) of the
        # previous row's. That row is still counted as part of the band; if
        # no such drop occurs, every scanned row is treated as header.
        densities = [sum(1 for x in row if x.strip()) for row in grid[:min(header_scan_rows, max_row)]]
        header_end = 0
        for i in range(len(densities)):
            header_end = i
            if i > 0 and densities[i] <= max(1, densities[i-1]//2):
                break
        header_rows = range(0, header_end + 1)

        # Compose headers: parent spans replicated above child columns become "Parent / Child"
        headers = []
        for c in range(max_col):
            parts = []
            for r in header_rows:
                t = grid[r][c].strip()
                # Skip blanks and case-insensitive repeats of the previous part
                if t and (not parts or t.lower() != parts[-1].lower()):
                    parts.append(t)
            headers.append(" / ".join(parts) if parts else f"col_{c+1}")

        # Body rows: everything below the header band
        body_rows = grid[header_end + 1:]

        out.append({
            "page": page,
            "table_index": t_index,
            "headers": headers,
            "rows": body_rows,
            "spans": spans,  # keep for downstream logic if needed
        })
    return out



def extract_forms_kv(blocks: List[Dict[str,Any]]) -> List[Dict[str,str]]:
    """Pair up KEY and VALUE text from KEY_VALUE_SET blocks.

    For every KEY_VALUE_SET block whose "EntityTypes" contains "KEY", the key
    text is read from the block's children and the value text from the
    KEY_VALUE_SET block(s) reached through its "VALUE" relationships. When
    several value blocks are linked, the last one visited wins.

    Args:
        blocks: flat list of block dicts.

    Returns:
        A list of {"page", "key", "value"} dicts, omitting pairs where both
        key and value text are empty.
    """
    id_map, type_map = build_block_maps(blocks)

    def linked_value_text(key_block):
        # Follow VALUE relationships only; later matches replace earlier ones.
        found = ""
        for rel in key_block.get("Relationships", []):
            if rel.get("Type") != "VALUE":
                continue
            for value_id in rel.get("Ids", []):
                candidate = id_map.get(value_id)
                if candidate and candidate.get("BlockType") == "KEY_VALUE_SET":
                    found = get_text_for_block(candidate, id_map)
        return found

    pairs = []
    for kv_block in type_map.get("KEY_VALUE_SET", []):
        if "KEY" not in kv_block.get("EntityTypes", []):
            continue
        key_text = get_text_for_block(kv_block, id_map)
        value_text = linked_value_text(kv_block)
        if not (key_text or value_text):
            continue
        pairs.append({
            "page": kv_block.get("Page"),
            "key": key_text,
            "value": value_text
        })
    return pairs

def write_tables_csv(tables: List[Dict[str,Any]], out_dir="tables_out"):
    """Write one CSV per table plus a consolidated CSV of all tables.

    Files written inside ``out_dir`` (created if missing):
      - tables_page_<page>_table_<table_index>.csv for every table, holding
        its headers followed by its rows;
      - tables_consolidated.csv with columns "page", "table_index" and the
        union of all (stripped) header names. Columns appear in the order
        the headers are first seen, so the layout is the same on every run.
        Cells for headers a table does not have are left blank.

    Args:
        tables: dicts with "page", "table_index", "headers" and "rows" keys.
        out_dir: target directory.
    """
    os.makedirs(out_dir, exist_ok=True)

    # per-table CSVs
    for t in tables:
        page = t["page"]
        idx = t["table_index"]
        headers = t["headers"]
        rows = t["rows"]
        path = os.path.join(out_dir, f"tables_page_{page}_table_{idx}.csv")
        with open(path, "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(headers)
            w.writerows(rows)

    # consolidated CSV (best-effort union of headers)
    norm_tables = [
        ([h.strip() for h in t["headers"]], t["rows"], t["page"], t["table_index"])
        for t in tables
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would give a column order that can change between interpreter runs.
    all_headers = list(dict.fromkeys(h for headers, _, _, _ in norm_tables for h in headers))

    with open(os.path.join(out_dir, "tables_consolidated.csv"), "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(["page","table_index"] + all_headers)
        for headers, rows, page, tindex in norm_tables:
            for r in rows:
                # If a table repeats a header name, the last column with that name wins.
                rowmap = dict(zip(headers, r))
                w.writerow([page, tindex] + [rowmap.get(h, "") for h in all_headers])

def write_forms_csv(forms: List[Dict[str,str]], path="forms_kv_pairs.csv"):
    """Write key/value pairs to a CSV with columns page, key, value.

    Nothing is written (and no file is created) when ``forms`` is empty.

    Args:
        forms: dicts with "page", "key" and "value" entries.
        path: destination CSV file.
    """
    if forms:
        with open(path, "w", newline="", encoding="utf-8") as sink:
            writer = csv.writer(sink)
            writer.writerow(["page","key","value"])
            writer.writerows([pair["page"], pair["key"], pair["value"]] for pair in forms)

def build_llm_jsonl(tables: List[Dict[str,Any]], path="tables_for_llm.jsonl", max_rows_per_chunk=50):
    """
    Write tables as JSON Lines, splitting each table into row chunks.

    Every output line is one JSON object:
      { "page": n, "table_index": i, "row_range": [first, last],
        "headers": [...], "rows": [[...],...] }
    where row_range gives the 0-based, inclusive positions of the chunk's
    rows within the table. A table with no rows produces no lines.
    Non-ASCII characters are written as-is (UTF-8).

    Args:
        tables: dicts with "headers", "rows", "page" and "table_index" keys.
        path: destination .jsonl file (overwritten).
        max_rows_per_chunk: maximum number of rows per output line.
    """
    with open(path, "w", encoding="utf-8") as sink:
        for table in tables:
            headers, rows = table["headers"], table["rows"]
            page, table_no = table["page"], table["table_index"]

            for first in range(0, len(rows), max_rows_per_chunk):
                piece = rows[first:first + max_rows_per_chunk]
                record = dict(
                    page=page,
                    table_index=table_no,
                    row_range=[first, first + len(piece) - 1],
                    headers=headers,
                    rows=piece,
                )
                sink.write(json.dumps(record, ensure_ascii=False) + "\n")

# --- Run the pipeline ---
pages = load_textract_pages(TEXTRACT_JSON_PATH)

# Each saved response is one page of API results, not one PDF page,
# so merge the Blocks of all responses into a single flat list.
all_blocks = [blk for response in pages for blk in response.get("Blocks", [])]

# Parse tables and form key/value pairs out of the combined blocks
tables = extract_tables_from_blocks_unmerged(all_blocks, replicate_data=True, header_scan_rows=5)
forms = extract_forms_kv(all_blocks)

# Write the CSV / JSONL outputs using each writer's default location
write_tables_csv(tables)
write_forms_csv(forms)
build_llm_jsonl(tables)

# Cell result: (number of tables, number of key/value pairs)
len(tables), len(forms)


(11, 97)

In [27]:
import pandas as pd
# Preview one of the per-table CSVs produced by write_tables_csv().
pd.read_csv('tables_out/tables_page_3_table_1.csv')
# The consolidated CSV is written inside the same output directory:
# pd.read_csv('tables_out/tables_consolidated.csv')
# pd.read_csv('forms_kv_pairs.csv')

Unnamed: 0,Material Grade/ Cylinder,Pressure Rating,Internal Volume,P,Ordering,col_6,"Dimensions, in. (mm)",col_8,Weight
0,Specification,psig (bar),cm³ ± 5 %,in.,Number,A,B,T,lb (kg)
1,,,,,Single-Ended,,,,
2,,,150,,304L-05SF4-150,,4.88 (124),,1.1 (0.50)
3,304L SS/ DOT-4B 500,500 (34.4),300,1/4,304L-05SF4-300,2.00 (50.8),8.62 (219),0.093 (2.4),1.8 (0.82)
4,,,500,,304L-05SF4-500,,13.6 (345),,2.7 (1.2)
5,,,,,Double-Ended,,,,
6,,,40,1/8,304L-HDF2-40,1.25 (31.8),3.88 (98.6),0.070 (1.8),0.31 (0.14)
7,,,50,,304L-HDF4-50,,3.75 (95.2),,0.38 (0.17)
8,,,75,,304L-HDF4-75,1.50 (38.1),4.94 (125),,0.62 (0.28)
9,304L SS/ DOT-3E 1800,1800 (124),150,1/4,304L-HDF4-150,,5.25 (133),0.093,0.94 (0.43)


In [23]:
# Read the unified product table JSON file
# NOTE(review): this file is not produced by any cell visible in this notebook — confirm where it comes from.
df = pd.read_json('unified_product_table.json')
# df.to_csv('unified_product_table.csv', index=False)
# Flatten the per-row `specs` objects into their own columns...
specs_df = pd.json_normalize(df.specs)
# ...and display them side by side with the original columns (aligned on the row index).
pd.concat([df, specs_df], axis=1)

Unnamed: 0,ordering_number,section,page,table_index,specs,description_en,material_grade_cylinder,pressure_rating,internal_volume,p,...,features,miniature_sample_cylinders,cleaning_and_testing,quick_connects,warranty_information,warning,oxygen_service_hazards,august,Unnamed: 20,volume_tolerance
0,Number,,3,2,"{'material_grade_cylinder': 'Specification', '...",Product Number with specifications such as mat...,Specification,psig (bar),cm³ ± 5 %,in.,...,,,,,,,,,,
1,Single-Ended,,3,2,"{'t': 'nan', 'a': 'nan', 'b': 'nan', 'minimum_...",Product Single-Ended with specifications such ...,,,,,...,,,,,,,,,,
2,304L-05SF4-150,,3,2,"{'internal_volume': '150', 'dimensions_in_mm':...",Product 304L-05SF4-150 with specifications suc...,,,150,,...,,,,,,,,,,
3,304L-05SF4-300,,3,2,{'material_grade_cylinder': '304L SS/ DOT-4B 5...,Product 304L-05SF4-300 with specifications suc...,304L SS/ DOT-4B 500,500 (34.4),300,1/4,...,,,,,,,,,,
4,304L-05SF4-500,,3,2,"{'internal_volume': '500', 'dimensions_in_mm':...",Product 304L-05SF4-500 with specifications suc...,,,500,,...,,,,,,,,,,
5,Double-Ended,,3,2,"{'t': 'nan', 'a': 'nan', 'b': 'nan', 'minimum_...",Product Double-Ended with specifications such ...,,,,,...,,,,,,,,,,
6,304L-HDF2-40,,3,2,"{'internal_volume': '40', 'p': '1/8', 'col_6':...",Product 304L-HDF2-40 with specifications such ...,,,40,1/8,...,,,,,,,,,,
7,304L-HDF4-50,,3,2,"{'internal_volume': '50', 'dimensions_in_mm': ...",Product 304L-HDF4-50 with specifications such ...,,,50,,...,,,,,,,,,,
8,304L-HDF4-75,,3,2,"{'internal_volume': '75', 'col_6': '1.50 (38.1...",Product 304L-HDF4-75 with specifications such ...,,,75,,...,,,,,,,,,,
9,304L-HDF4-150,,3,2,{'material_grade_cylinder': '304L SS/ DOT-3E 1...,Product 304L-HDF4-150 with specifications such...,304L SS/ DOT-3E 1800,1800 (124),150,1/4,...,,,,,,,,,,
