#### Basic navigation

Features:
 - Formatted text
 - Formatted tables including cell bg colour and horizontal merging
 - Targeted image retrieval

In [1]:
import docx_converter as dc
from docx import Document
import os
import pypandoc
import os
import shutil
import json
import re
from collections import Counter

In [13]:
import importlib

# Re-execute the already-imported docx_converter module and rebind `dc` to the
# module object that importlib.reload returns.
dc = importlib.reload(dc)

#### Testing

In [14]:
# Source document, the output directory derived from its base name, and the
# Lua script that is handed to the HTML conversion step.
doc_path = "data/grovia_Carbon-PRO_Template.docx"
output_path = f"app/{os.path.splitext(os.path.basename(doc_path))[0]}"
lua_script = "scripts/pandoc/docx_cleanup.lua"

# Alt texts passed to the media extraction step.
allowed_alt_texts = ["timeline"]

# Create the output root first, then one sub-folder per dc.FOLDERS entry used
# by this notebook.
os.makedirs(output_path, exist_ok=True)
for folder_key in ("media", "data", "content"):
    os.makedirs(f"{output_path}/{dc.FOLDERS[folder_key]}", exist_ok=True)

# index.html is overwritten on every run; the js/ and css/ trees are copied
# only when the destination does not exist yet.
shutil.copyfile("scripts/index.html", f"{output_path}/index.html")
for asset_dir in ("js", "css"):
    if not os.path.exists(f"{output_path}/{asset_dir}"):
        shutil.copytree(f"scripts/{asset_dir}", f"{output_path}/{asset_dir}")

## Extract text and table styles
DEFAULT_STYLES = dc.extract_styles(doc_path)
## Save to JSON (explicit UTF-8, matching the other files written by this cell)
with open(f"{output_path}/{dc.FOLDERS['data']}/styles.json", "w", encoding="utf-8") as f:
    json.dump(DEFAULT_STYLES, f, indent=2)

## Extract table formatting that differs from the default styles
tables = dc.extract_table_format(doc_path, DEFAULT_STYLES)
## Save to JSON. The f-string is double-quoted so the inner ['data'] lookup
## also parses on Python versions before 3.12, where reusing the enclosing
## quote character inside an f-string is a SyntaxError.
with open(f"{output_path}/{dc.FOLDERS['data']}/tables.json", "w", encoding="utf-8") as f:
    json.dump(tables, f, indent=2)

## Extract the media whose alt text is in allowed_alt_texts into the media folder
alt_text_map = dc.extract_docx_media(doc_path, output_path, dc.FOLDERS['media'], allowed_alt_texts)
print("\n📝 Alt Text to Image Mapping:", alt_text_map)

## Convert the mapping keys (e.g. 'image2') to a list of image integers
keep_images = [int(name.replace('image', '').replace('.png', '')) for name in alt_text_map]
## One record per image; alt_text starts out empty
images_dict = {image: {'path': path, 'alt_text': ''} for image, path in alt_text_map.items()}
print(images_dict)

## Generate HTML from the docx file
html = dc.convert_docx_to_html(doc_path, lua_script, keep_images)
print("HTML with unwanted images removed has been generated.")

## Replace images with placeholders; alt_text_map is rebound to the second
## value returned by the helper
html_images, alt_text_map = dc.replace_images_with_placeholders(html, images_dict)

## Save to JSON. The f-string is double-quoted so the inner ['data'] lookup
## also parses on Python versions before 3.12, and the encoding is explicit
## to match the other files written by this cell.
with open(f"{output_path}/{dc.FOLDERS['data']}/media.json", "w", encoding="utf-8") as f:
    json.dump(alt_text_map, f, indent=2)

# Non-greedy match of each <table>...</table> element; DOTALL lets '.' cross
# line breaks so multi-line tables are matched as one unit.
table_pattern = re.compile(r'<table.*?</table>', re.DOTALL)

# Single-element list shared by every replacement call, starting at zero
# (presumably advanced in place by dc.table_replacer - confirm in the module).
counter = [0]


def _replace_table(match):
    """Forward one table match, together with the shared counter, to dc.table_replacer."""
    return dc.table_replacer(match, counter)


html_tables_id = table_pattern.sub(_replace_table, html_images)

# Generate navigation data
nav_data = dc.generate_navigation_data(html_tables_id)

# Save navigation data to JSON file
with open(f"{output_path}/{dc.FOLDERS['data']}/navigation.json", "w", encoding="utf-8") as f:
    json.dump(nav_data, f, indent=4)

# Embed the navigation placeholder as the first element in the HTML.
# content_html is always bound here: when the placeholder is already present
# the HTML is used unchanged, instead of leaving the name undefined (or stale
# from an earlier notebook run) for the calls below.
content_html = html_tables_id
if '<div data-navigation></div>' not in content_html:
    content_html = '<div data-navigation></div>\n' + content_html

# Embed sub navigation data placeholder in the HTML. Place at the line above the first h2 tag of each section
content_html = dc.insert_sub_navigation(content_html, nav_data)


# Save the modified content.html
with open(f"{output_path}/{dc.FOLDERS['content']}/content.html", "w", encoding="utf-8") as f:
    f.write(content_html)

print("Navigation JSON file and content placeholder updated successfully!")
print(f"Conversion complete! HTML file saved as {output_path}.")


✅ Moving image: image2.png ➝ app/grovia_Carbon-PRO_Template\assets\image2.png
✅ Moving image: image5.png ➝ app/grovia_Carbon-PRO_Template\assets\image5.png
✅ Moving image: image22.png ➝ app/grovia_Carbon-PRO_Template\assets\image22.png

📝 Alt Text to Image Mapping: {'image2': 'assets/image2.png', 'image5': 'assets/image5.png', 'image22': 'assets/image22.png'}
{'image2': {'path': 'assets/image2.png', 'alt_text': ''}, 'image5': {'path': 'assets/image5.png', 'alt_text': ''}, 'image22': {'path': 'assets/image22.png', 'alt_text': ''}}
HTML with unwanted images removed has been generated.
Navigation JSON file and content placeholder updated successfully!
Conversion complete! HTML file saved as app/grovia_Carbon-PRO_Template.
