In [1]:
import json
from PyPDF2 import PdfWriter, PdfReader
import os

In [2]:
# Load the report definition that drives the PDF build.
# JSON files are UTF-8 by specification; without an explicit encoding open()
# falls back to the platform locale, which can mis-decode non-ASCII text.
with open('report.json', encoding='utf-8') as f:
    report = json.load(f)

report

{'type': 'Section',
 'bookmark_name': 'BWL Report',
 'base_directory': 'C:\\Users\\guest2\\Documents\\Level.III\\BWL\\58313',
 'variables': [{'template_text': 'SDG Batch',
   'is_constant': True,
   'constant_value': 'SDG Batch',
   'id': 'fcdf1b48-c998-4697-b3b7-57465104658a'},
  {'template_text': 'Month Day,  Year',
   'is_constant': True,
   'constant_value': 'day month year',
   'id': 'cfab2ca8-c944-488f-8e1d-6ee2d8acbad6'},
  {'template_text': 'Attention name',
   'is_constant': True,
   'constant_value': 'name',
   'id': '520d808a-de37-41d9-b64b-4c383b78bf3f'},
  {'template_text': 'Max Page Number',
   'is_constant': True,
   'constant_value': 'page num',
   'id': '94f13371-a82b-4efd-8bef-3362d172db8b'},
  {'template_text': 'Project Name',
   'is_constant': True,
   'constant_value': 'proj name',
   'id': '00574615-be35-4c0a-a59a-8c4141643a0a'},
  {'template_text': 'Client Name',
   'is_constant': True,
   'constant_value': 'client name',
   'id': '969d887c-53e8-4254-ba67-5b6541e

In [3]:
!pip install python-docx-replace



In [4]:
import os
from docx import Document
from python_docx_replace import docx_replace, docx_get_keys
from docx2pdf import convert
from PyPDF2 import PdfReader
from buildpdf.table_entries import TableEntry, TableEntryData


def get_variables_in_docx(docx_path):
    """Return the keys that ``docx_get_keys`` reports for a DOCX file.

    The document is opened with ``Document(docx_path)``; failures while
    opening it propagate to the caller. If ``docx_get_keys`` itself raises,
    the error is not re-raised: a single-element list holding the message
    ``"Error: <exception>"`` is returned instead.
    """
    loaded_document = Document(docx_path)

    try:
        return docx_get_keys(loaded_document)
    except Exception as err:
        # Best-effort: report the failure in-band rather than raising.
        return [f"Error: {err}"]


def replace_text_in_docx(docx_path, replacements):
    """Apply ``replacements`` to a DOCX file and save the result as a copy.

    Args:
        docx_path: Path (string) of the source DOCX file. It is not modified.
        replacements: Mapping passed to ``docx_replace`` as keyword arguments.

    Returns:
        Path of the saved copy: the source path with ``_modified`` inserted
        before the file extension (``report.docx`` -> ``report_modified.docx``).
    """
    document = Document(docx_path)

    docx_replace(document, **replacements)

    # Split off the real extension instead of str.replace(".docx", ...):
    # replace() rewrites every ".docx" in the path (directory names included)
    # and returns the path unchanged for e.g. ".DOCX", which would make the
    # save below overwrite the source template.
    root, extension = os.path.splitext(docx_path)
    modified_docx_path = f"{root}_modified{extension}"
    document.save(modified_docx_path)
    return modified_docx_path


def convert_docx_to_pdf(docx_path):
    """Convert a DOCX file to PDF with ``docx2pdf.convert``.

    Args:
        docx_path: Path (string) of the DOCX file to convert.

    Returns:
        Path of the PDF: the input path with its extension swapped for
        ``.pdf``.
    """
    # Swap only the final extension. str.replace(".docx", ".pdf") would also
    # rewrite directory names containing ".docx" and would leave a ".DOCX"
    # path unchanged, making the output path equal to the input path.
    pdf_path = os.path.splitext(docx_path)[0] + ".pdf"

    convert(docx_path, pdf_path)
    return pdf_path


def update_table_of_contents(docx_path, table_entries, page_start_col, page_end_col):
    """Fill page numbers into the first table of a DOCX file and save a copy.

    Only ``doc.tables[0]`` is consulted. Each row is wrapped in a
    ``TableEntry``; when the entry's ``name`` is a key of ``table_entries``
    its page start is set to the mapped value and its page end is set to an
    empty string.

    Args:
        docx_path: Path (string) of the DOCX file. It is not modified.
        table_entries: Mapping of entry name -> value for the page-start cell.
        page_start_col: Forwarded to ``TableEntry`` (presumably the column
            index of the page-start cell; confirm against ``TableEntry``).
        page_end_col: Forwarded to ``TableEntry`` (presumably the column index
            of the page-end cell; confirm against ``TableEntry``).

    Returns:
        Path of the saved copy: the source path with ``_updated_toc`` inserted
        before the file extension.

    Raises:
        IndexError: If the document contains no tables.
    """
    doc = Document(docx_path)
    table = doc.tables[0]
    for row_index in range(len(table.rows)):
        entry = TableEntry(table, row_index, page_start_col, page_end_col)
        if entry.name in table_entries:
            entry.set_page_start(table_entries[entry.name])
            entry.set_page_end("")

    # Derive the output name from the real extension. str.replace(".docx", ...)
    # touches every occurrence in the path and is a no-op for ".DOCX", which
    # would overwrite the input file.
    root, extension = os.path.splitext(docx_path)
    new_docx_path = f"{root}_updated_toc{extension}"
    doc.save(new_docx_path)
    return new_docx_path


def convert_docx_template_to_pdf(
    docx_path,
    replacements=None,
    table_entries=None,
    page_start_col=None,
    page_end_col=None,
):
    """Render a DOCX template to PDF and return a reader plus its page count.

    Steps, each optional one skipped when its argument is falsy:
      1. ``replace_text_in_docx`` applies ``replacements``.
      2. ``update_table_of_contents`` applies ``table_entries``.
      3. ``convert_docx_to_pdf`` converts the resulting DOCX.

    Every file generated along the way, including the PDF, is deleted before
    returning, whether or not a step raised.

    Args:
        docx_path: Path of the DOCX template. It is never deleted.
        replacements: Optional mapping forwarded to ``replace_text_in_docx``.
        table_entries: Optional mapping forwarded to
            ``update_table_of_contents``.
        page_start_col: Forwarded to ``update_table_of_contents``.
        page_end_col: Forwarded to ``update_table_of_contents``.

    Returns:
        Tuple ``(pdf_reader, page_count)``.
    """
    generated_files = []
    try:
        working_docx_path = docx_path

        if replacements:
            working_docx_path = replace_text_in_docx(working_docx_path, replacements)
            generated_files.append(working_docx_path)

        if table_entries:
            working_docx_path = update_table_of_contents(
                working_docx_path, table_entries, page_start_col, page_end_col
            )
            # Previously this copy was never registered and was left on disk.
            generated_files.append(working_docx_path)

        pdf_path = convert_docx_to_pdf(working_docx_path)
        generated_files.append(pdf_path)

        # NOTE(review): the reader is returned after its file is deleted, so
        # this relies on PdfReader having loaded the file contents at
        # construction time; confirm for the installed PyPDF2 version.
        pdf_reader = PdfReader(pdf_path)
        return (pdf_reader, len(pdf_reader.pages))
    finally:
        for file_path in generated_files:
            # Never remove the caller's template, even if a helper handed the
            # input path back unchanged.
            if file_path != docx_path and os.path.exists(file_path):
                os.remove(file_path)


In [5]:
from PyPDF2 import PdfWriter, PdfReader
import os
from io import StringIO
from typing import Optional, Any


def get_pdf_and_page_count(file_path):
    """Open the PDF at ``file_path`` and return ``(reader, number_of_pages)``."""
    reader = PdfReader(file_path)
    page_total = len(reader.pages)
    return reader, page_total


def vars_to_mapping(variables):
    """Build a ``template_text -> constant_value`` dict from variable records.

    Each item of ``variables`` must provide both keys. When two items share a
    ``template_text``, the later one wins.
    """
    mapping = {}
    for variable in variables:
        mapping[variable['template_text']] = variable['constant_value']
    return mapping


def section_has_files(section):
    """Return True if ``section`` or any nested section contributes content.

    A section counts as having files when at least one child is:
      * a DOCX template whose ``exists`` flag is truthy,
      * a ``FileType`` with a non-empty ``files`` list, or
      * a ``Section`` for which this function returns True.

    Args:
        section: Dict with a ``children`` list of child dicts, each carrying
            a ``type`` key.

    Returns:
        bool
    """
    for child in section["children"]:
        child_type = child["type"]
        # Report children are tagged "DocxTemplate" (the spelling pass one
        # matches on). The old check only accepted "docxTemplate", so template
        # children were never recognised; both spellings are accepted so
        # existing callers keep working.
        if child_type in ("DocxTemplate", "docxTemplate"):
            if child["exists"]:
                return True
        elif child_type == "FileType":
            if len(child["files"]) > 0:
                return True
        elif child_type == "Section":
            if section_has_files(child):
                return True
    return False

from pydantic import BaseModel

class BookmarkItem(BaseModel):
    """A planned PDF outline (bookmark) entry.

    Instances are collected while the report is walked and are turned into
    real outline entries later via ``writer.add_outline_item``.
    """
    # Text passed as the outline item's title.
    title: str
    # Page value passed to ``add_outline_item``; pass one counts pages from 0.
    page: int
    # Enclosing bookmark, or None for a top-level entry.
    parent: Optional["BookmarkItem"] = None
    # Value returned by ``writer.add_outline_item`` once the entry has been
    # added; None until then.
    outline_element: Optional[Any] = None


def generate_pdf_pass_one(report: dict):
    """Walk the report tree once and plan the combined PDF without writing it.

    The walk is depth-first in ``children`` order and keeps a running page
    counter that starts at 0.

    Returns:
        Tuple ``(writer_data, bookmark_data)``:

        * ``writer_data``: flat list of dicts in document order. Entry types
          are ``"docxTemplate"`` (only for templates whose ``exists`` is
          truthy), ``"FileType"``, ``"FileData"`` (one per file, holding the
          opened reader under ``"pdf"``) and ``"Section"``. Every entry has
          ``"id"`` and ``"page_start"``.
        * ``bookmark_data``: list of ``BookmarkItem`` in creation order; a
          parent bookmark is always appended before its children.
    """
    planned_items = []
    planned_bookmarks = []
    next_page = 0

    def walk_section(section, base_directory="./", root_bookmark=None):
        # Only the counter is rebound; the two lists are mutated in place.
        nonlocal next_page

        base_directory = os.path.normpath(
            os.path.join(base_directory, section["base_directory"])
        )

        # A section gets a bookmark only when it is named and has content.
        if section.get("bookmark_name") and section_has_files(section):
            root_bookmark = BookmarkItem(
                title=section["bookmark_name"],
                page=next_page,
                parent=root_bookmark,
            )
            planned_bookmarks.append(root_bookmark)

        for child in section["children"]:
            child_type = child["type"]

            if child_type == "DocxTemplate":
                if child["bookmark_name"] and child["exists"]:
                    template_bookmark = BookmarkItem(
                        title=child["bookmark_name"],
                        page=next_page,
                        parent=root_bookmark,
                    )
                    planned_bookmarks.append(template_bookmark)

                if child["exists"]:
                    template_path = os.path.normpath(
                        os.path.join(base_directory, child["docx_path"])
                    )
                    # Converted here only to learn the page count; the reader
                    # is discarded and no replacements are applied.
                    _, page_count = convert_docx_template_to_pdf(template_path)
                    planned_items.append(
                        {
                            "type": "docxTemplate",
                            "id": child["id"],
                            "path": template_path,
                            "replacements": vars_to_mapping(section["variables"]),
                            "num_pages": page_count,
                            "table_entries": child.get("table_entries"),
                            "page_start": next_page,
                            "page_start_col": child.get("page_start_col"),
                            "page_end_col": child.get("page_end_col"),
                        }
                    )
                    next_page += page_count

            elif child_type == "FileType":
                if child["bookmark_name"] and child["files"]:
                    group_bookmark = BookmarkItem(
                        title=child["bookmark_name"],
                        page=next_page,
                        parent=root_bookmark,
                    )
                    planned_bookmarks.append(group_bookmark)

                source_dir = os.path.normpath(
                    os.path.join(base_directory, child["directory_source"])
                )
                # The group entry is recorded even when it has no files.
                planned_items.append(
                    {
                        "type": "FileType",
                        "id": child["id"],
                        "directory_source": source_dir,
                        "page_start": next_page,
                    }
                )

                for file_entry in child["files"]:
                    pdf_path = os.path.normpath(
                        os.path.join(source_dir, file_entry["file_path"])
                    )
                    reader, page_count = get_pdf_and_page_count(pdf_path)
                    planned_items.append(
                        {
                            "type": "FileData",
                            "id": file_entry["id"],
                            "path": pdf_path,
                            "num_pages": page_count,
                            "pdf": reader,
                            "page_start": next_page,
                        }
                    )
                    next_page += page_count

            elif child_type == "Section":
                planned_items.append(
                    {
                        "type": "Section",
                        "id": child["id"],
                        "page_start": next_page,
                    }
                )
                walk_section(child, base_directory, root_bookmark)

    walk_section(report)
    return planned_items, planned_bookmarks


In [6]:
# Pass one: plan the document (entries with page offsets) and its bookmarks.
writer_data, bookmark_data = generate_pdf_pass_one(report)

  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Inspect the planned entries produced by pass one.
writer_data

[{'type': 'docxTemplate',
  'id': 'cf7b8fc9-f2c7-4938-99cb-5dd8c2bb79ff',
  'path': 'C:\\Users\\guest2\\Documents\\level3-docx-template.docx',
  'replacements': {'SDG Batch': 'SDG Batch',
   'Month Day,  Year': 'day month year',
   'Attention name': 'name',
   'Max Page Number': 'page num',
   'Project Name': 'proj name',
   'Client Name': 'client name'},
  'num_pages': 6,
  'table_entries': [['2. SDG Case Narrative',
    'cf7b8fc9-f2c7-4938-99cb-5dd8c2bb79ff'],
   ['3. Analytical Summary Report', '349b1f84-0cb2-4b2a-a1ba-c80ddbeda2b8'],
   ['4. ICP/MS Metals Data', 'a4a2930e-e0c4-43ea-bdf2-5970cca636dd'],
   ['5. Mercury Data', '94e1df98-b17d-41a5-8ad2-e775de28dc1e'],
   ['6. Ion Chromatography Data', None],
   ['7. Hardness Data', None],
   ['8. Bicarbonate & Carbonate Data', None],
   ['9. Total Suspended Solids Data', None],
   ['10. Total Dissolved Solids Data', None],
   ['11. Shipping / Receiving Documents', None]],
  'page_start': 0,
  'page_start_col': 3,
  'page_end_col': 4},

In [8]:
def compose_pdf(writer_data: list) -> PdfWriter:
    """Assemble the final PDF from the flat plan produced by pass one.

    ``"docxTemplate"`` entries are rendered again, this time with their
    replacements and table-of-contents page numbers, and appended;
    ``"FileData"`` entries have their stored reader appended. Other entry
    types add no pages. Imported outlines are dropped in both cases.

    Args:
        writer_data: List of entry dicts, each with ``"type"``, ``"id"`` and
            ``"page_start"``. ``"docxTemplate"`` entries also provide
            ``"path"``, ``"replacements"``, ``"page_start_col"``,
            ``"page_end_col"`` and optionally ``"table_entries"`` (a list of
            ``(entry_name, entry_id)`` pairs, or None).

    Returns:
        The populated ``PdfWriter``.
    """
    writer = PdfWriter()

    # Every entry in writer_data is indexed here, whatever its type. The
    # stored page_start counts from 0; +1 gives the number written into the
    # table of contents.
    id_to_page_start = {data['id']: data['page_start'] + 1 for data in writer_data}

    # writer_data is already flat, so a single linear pass is enough.
    for data in writer_data:
        if data["type"] == "docxTemplate":
            # table_entries may be None or missing when the template has no
            # table of contents; iterating None raised TypeError before.
            # Ids with no known page (including None) map to None.
            table_entries = {
                entry_name: id_to_page_start.get(entry_id)
                for entry_name, entry_id in (data.get('table_entries') or [])
            }
            pdf, _ = convert_docx_template_to_pdf(
                data['path'],
                replacements=data['replacements'],
                table_entries=table_entries,
                page_start_col=data['page_start_col'],
                page_end_col=data['page_end_col'],
            )
            writer.append(pdf, import_outline=False)
        elif data["type"] == "FileData":
            writer.append(data["pdf"], import_outline=False)

    return writer

In [9]:
def add_bookmarks(writer: PdfWriter, bookmarks: list):
    """Add each bookmark to ``writer`` as an outline item and return ``writer``.

    For every bookmark, ``writer.add_outline_item(title, page, parent)`` is
    called and the result is stored on ``bookmark.outline_element``. The
    parent argument is the parent's ``outline_element``, or None for a
    bookmark without a parent. A parent therefore has to come earlier in
    ``bookmarks`` than its children; otherwise its ``outline_element`` has not
    been assigned yet when the child is added.
    """
    for item in bookmarks:
        parent_element = item.parent.outline_element if item.parent else None
        item.outline_element = writer.add_outline_item(
            item.title, item.page, parent_element
        )
    return writer

In [10]:
# Re-display the planned entries right before composing the PDF.
writer_data

[{'type': 'docxTemplate',
  'id': 'cf7b8fc9-f2c7-4938-99cb-5dd8c2bb79ff',
  'path': 'C:\\Users\\guest2\\Documents\\level3-docx-template.docx',
  'replacements': {'SDG Batch': 'SDG Batch',
   'Month Day,  Year': 'day month year',
   'Attention name': 'name',
   'Max Page Number': 'page num',
   'Project Name': 'proj name',
   'Client Name': 'client name'},
  'num_pages': 6,
  'table_entries': [['2. SDG Case Narrative',
    'cf7b8fc9-f2c7-4938-99cb-5dd8c2bb79ff'],
   ['3. Analytical Summary Report', '349b1f84-0cb2-4b2a-a1ba-c80ddbeda2b8'],
   ['4. ICP/MS Metals Data', 'a4a2930e-e0c4-43ea-bdf2-5970cca636dd'],
   ['5. Mercury Data', '94e1df98-b17d-41a5-8ad2-e775de28dc1e'],
   ['6. Ion Chromatography Data', None],
   ['7. Hardness Data', None],
   ['8. Bicarbonate & Carbonate Data', None],
   ['9. Total Suspended Solids Data', None],
   ['10. Total Dissolved Solids Data', None],
   ['11. Shipping / Receiving Documents', None]],
  'page_start': 0,
  'page_start_col': 3,
  'page_end_col': 4},

In [11]:
# Pass two: render and append every planned entry, then attach the bookmarks.
writer = compose_pdf(writer_data)
writer = add_bookmarks(writer, bookmark_data)
# Write the combined document; the call's return value is shown as the cell output.
writer.write("output.pdf")

  0%|          | 0/1 [00:00<?, ?it/s]

(True, <_io.FileIO [closed]>)