In [12]:
import re
import textwrap
import pandas as pd
from pathlib import Path

try:
    import pdfplumber
except ImportError:
    import sys, subprocess
    print("Installing pdfplumber … (run manually if this fails)")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pdfplumber"])
    import pdfplumber

try:
    from fpdf import FPDF, FPDFException
except ImportError:
    import sys, subprocess
    print("Installing fpdf2 … (run manually if this fails)")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fpdf2"])
    from fpdf import FPDF, FPDFException


def extract_pages(pdf_path: Path):
    with pdfplumber.open(str(pdf_path)) as pdf:
        return [
            (i + 1, page.extract_text())
            for i, page in enumerate(pdf.pages)
            if page.extract_text()
        ]


def parse_material_sections(pages):
    data = []
    current_material, buffer, last_pg = None, [], 0

    material_patterns = [
        re.compile(r'^(\d+(?:\.\d+)*)\s+[A-Z][A-Za-z0-9 ,/\-()]+$'),
        re.compile(r'^(?:Material:?)\s+[A-Z][A-Za-z0-9 ,/\-()]+$', re.I),
        re.compile(r'^(?:Specification for)\s+[A-Z][A-Za-z0-9 ,/\-()]+$', re.I),
        re.compile(r'^[A-Z0-9][A-Z0-9 \-/]{10,}$')
    ]

    print("\nDetected material section headers:")

    for pg_num, text in pages:
        for ln in text.split('\n'):
            ln = ln.strip()
            if not ln:
                continue
            header_match = next((ln for pat in material_patterns if pat.match(ln)), None)
            if header_match:
                if current_material and buffer:
                    data.append((current_material, buffer, last_pg))
                current_material, buffer, last_pg = header_match, [], pg_num
                print(f"  → {current_material}  (page {last_pg})")
            else:
                buffer.append((ln, pg_num))
    if current_material and buffer:
        data.append((current_material, buffer, last_pg))
    return data


def extract_fields(material, buffer, start_pg):
    tests, defs, misc = [], [], []
    code_regex = re.compile(r"\b(?:IS|ASTM|BS|EN|DIN)\s*[- ]?\d+(?:[:\-]\d+)?", re.I)
    page_ref = lambda t, p: f"{t} (Page {p})"

    for line, pg in buffer:
        target = page_ref(line, pg)
        if code_regex.search(line) or re.search(r"\b(test|strength|absorption|method|procedure|requirement|classification)\b", line, re.I):
            tests.append(target)
        elif re.search(r"\b(type|grade|class|definition|composition|description|material|specification)\b", line, re.I):
            defs.append(target)
        else:
            misc.append(target)

    return {
        "Material Name": material,
        "Test Name/Reference Code/Standard as per the given document (with reference page number)": tests or ["No Information Available"],
        "Specific Material Type/Material Definition": defs or ["No Information Available"],
        "Any other relevant information": misc or ["No Information Available"]
    }


def clean_text(txt):
    if isinstance(txt, list):
        txt = '\n'.join(txt)
    txt = re.sub(r"(\S{120})", r"\1 ", txt)
    return txt.encode("latin-1", "replace").decode("latin-1")


def export_to_pdf(df: pd.DataFrame, filename="extracted_report.pdf"):
    class PDF(FPDF):
        def header(self):
            # Add a border box
            self.set_line_width(0.5)
            self.rect(10, 10, 190, 20)  # x, y, width, height
            self.set_font("Arial", 'B', 14)
            self.set_y(15)
            self.cell(0, 10, "Extracted Material Specification Table", ln=True, align='C')
            self.ln(4)

        def footer(self):
            pass


    pdf = PDF()
    pdf.set_auto_page_break(True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=10)

    col_widths = [10, 35, 65, 40, 40]
    headers = list(df.columns)


    pdf.set_fill_color(0, 102, 204)
    pdf.set_font("Arial", 'B', 10)
    for i, header in enumerate(headers):
        pdf.multi_cell(col_widths[i], 10, clean_text(header), border=1, align='C', fill=True, ln=3, max_line_height=pdf.font_size)
    pdf.ln()

    pdf.set_font("Arial", '', 9)
    fill = False
    pdf.set_fill_color(255, 255, 153)
    for idx, row in df.iterrows():

        pdf.ln(2)
        pdf.set_draw_color(180, 180, 180)
        pdf.line(10, pdf.get_y(), 200, pdf.get_y())
        pdf.ln(1)
        pdf.set_font("Arial", 'B', 11)
        pdf.set_fill_color(224, 224, 224)
        pdf.cell(0, 8, clean_text(f"Material Section {idx + 1}: {row['Material Name']}"), ln=True, fill=True)
        pdf.set_font("Arial", '', 9)
        fill = not fill

        row_data = [
            str(idx + 1),
            row['Material Name'],
            '\n'.join(row['Test Name/Reference Code/Standard as per the given document (with reference page number)']),
            '\n'.join(row['Specific Material Type/Material Definition']),
            '\n'.join(row['Any other relevant information'])
        ]
        for i, item in enumerate(row_data):
            if i == 1:
                pdf.set_font("Arial", 'B', 9)
            else:
                pdf.set_font("Arial", '', 9)
            pdf.multi_cell(col_widths[i], 6, clean_text(item), border=1, align='L', ln=3, max_line_height=pdf.font_size, fill=fill)
        pdf.ln()

    try:
        pdf.output(filename)
        print(f"PDF saved to {filename}")
    except FPDFException as e:
        print(f"FPDF error: {e}. Try shorter lines or different font size.")


def main():
    pdf_path = Path(input("Enter path to technical specification PDF: ").strip())
    if not pdf_path.exists():
        print("File not found.")
        return

    pages = extract_pages(pdf_path)
    if not pages:
        print("No text extracted – scanned PDF? Try OCR first.")
        return

    sections = parse_material_sections(pages)
    if not sections:
        print("No headers found – fallback to fixed‑size blocks.")
        block_sz, counter = 30, 1
        for pg, txt in pages:
            lines = [ln.strip() for ln in txt.split('\n') if ln.strip()]
            for i in range(0, len(lines), block_sz):
                blk = lines[i:i+block_sz]
                label = f"Material Block {counter} (Page {pg})"
                sections.append((label, [(l, pg) for l in blk], pg))
                counter += 1

    rows = [extract_fields(m, buf, pg) for m, buf, pg in sections]
    df = pd.DataFrame(rows)
    df.insert(0, "Sl. No.", range(1, len(df)+1))

    csv_name = "extracted_table.csv"
    excel_name = "extracted_table.xlsx"
    df.to_csv(csv_name, index=False)
    df.to_excel(excel_name, index=False)
    print(f"Excel saved to {excel_name}")
    print(f"CSV saved to {csv_name}")

    export_to_pdf(df)


if __name__ == "__main__":
    main()


Enter path to technical specification PDF: /content/Prescriptive Specifications_CPWD.pdf

Detected material section headers:
  → 4.0 CONCRETE WORK  (page 1)
  → 4.1.1 Coarse Aggregate  (page 1)
  → 4.1.1.3 Size and Grading  (page 1)
  → CPWD SPECIFICATIONS 2019  (page 1)
  → 4.1.2 Chemical Admixtures  (page 3)
  → CPWD SPECIFICATIONS 2019  (page 3)
  → 4.1.2.5 Some admixture may be in the form of powder, particle or high concentration liquids which may  (page 4)
  → 4.1.2.7 Certain admixtures may contain significant amounts of finely divided insoluble materials or  (page 4)
  → 4.1.2.9 No admixtures shall be accepted for use in concrete unless these are tested in accordance with  (page 4)
  → 4.2.1 Grades of Cement Concrete  (page 4)
  → 4.2.1.1 The characteristic strength is defined as the strength of material below which not more than 5  (page 5)
  → CPWD SPECIFICATIONS 2019 92  (page 5)
  → 4.2.2 Workability of Concrete  (page 6)
  → 4.2.2.1 The concrete mix proportion chosen should

  self.set_font("Arial", 'B', 14)
  self.cell(0, 10, "Extracted Material Specification Table", ln=True, align='C')
  pdf.set_font("Arial", size=10)
  pdf.set_font("Arial", 'B', 10)
  pdf.multi_cell(col_widths[i], 10, clean_text(header), border=1, align='C', fill=True, ln=3, max_line_height=pdf.font_size)
  pdf.set_font("Arial", '', 9)
  pdf.set_font("Arial", 'B', 11)
  pdf.cell(0, 8, clean_text(f"Material Section {idx + 1}: {row['Material Name']}"), ln=True, fill=True)
  pdf.set_font("Arial", '', 9)
  pdf.set_font("Arial", '', 9)
  pdf.multi_cell(col_widths[i], 6, clean_text(item), border=1, align='L', ln=3, max_line_height=pdf.font_size, fill=fill)
  pdf.set_font("Arial", 'B', 9)  # Bold for Material Name column
  self.set_font("Arial", 'B', 14)
  self.cell(0, 10, "Extracted Material Specification Table", ln=True, align='C')
  pdf.set_font("Arial", 'B', 11)
  pdf.cell(0, 8, clean_text(f"Material Section {idx + 1}: {row['Material Name']}"), ln=True, fill=True)
  pdf.set_font("Arial", '

PDF saved to extracted_report.pdf


  pdf.set_font("Arial", '', 9)
  pdf.multi_cell(col_widths[i], 6, clean_text(item), border=1, align='L', ln=3, max_line_height=pdf.font_size, fill=fill)
  pdf.set_font("Arial", 'B', 11)
  pdf.cell(0, 8, clean_text(f"Material Section {idx + 1}: {row['Material Name']}"), ln=True, fill=True)
  pdf.set_font("Arial", '', 9)
  pdf.set_font("Arial", 'B', 9)  # Bold for Material Name column
  self.set_font("Arial", 'B', 14)
  self.cell(0, 10, "Extracted Material Specification Table", ln=True, align='C')
