#### Basic navigation

Features:
 - Formatted text
 - Formatted tables including cell bg colour and horizontal merging
 - Targeted image retrieval

In [15]:
import docx_converter as dc
from docx import Document
import os
import pypandoc
import os
import shutil
import json
import re
from collections import Counter

In [74]:
import importlib

# Re-execute the docx_converter module so that edits made to its source are
# picked up without restarting the kernel; importlib.reload returns the
# reloaded module object, which is rebound to the `dc` alias here.
dc = importlib.reload(dc)

#### Testing

In [75]:
# --- Configuration ---
# docx_path:            source Word document to convert.
# compatible_docx_path: destination handed to dc.check_compatibility.
# output_path:          site folder, named after the source file (extension dropped).
# lua_script:           pandoc Lua filter handed to dc.convert_docx_to_html.
# allowed_alt_texts:    alt-text list handed to dc.extract_docx_media.
docx_path = "data/grovia_Carbon-PRO_Template.docx"
compatible_docx_path = "app/test/output.docx"
output_path = f"app/{os.path.splitext(os.path.basename(docx_path))[0]}"
lua_script = "scripts/pandoc/docx_cleanup.lua"
allowed_alt_texts = ["timeline"]

# --- Output folders ---
os.makedirs(output_path, exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['media']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['data']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['content']}", exist_ok=True)

# The compatible copy lives outside output_path. Whether check_compatibility
# creates a missing destination folder is not visible from this notebook, so
# create it up front; this is a no-op when the folder already exists.
compatible_dir = os.path.dirname(compatible_docx_path)
if compatible_dir:
    os.makedirs(compatible_dir, exist_ok=True)

dc.check_compatibility(docx_path, compatible_docx_path)

# --- Static site assets ---
# index.html is refreshed on every run; the js/css trees are copied only when
# they are not already present in the output folder.
shutil.copyfile("scripts/index.html", f"{output_path}/index.html")
if not os.path.exists(f"{output_path}/js"):
    shutil.copytree("scripts/js", f"{output_path}/js")
if not os.path.exists(f"{output_path}/css"):
    shutil.copytree("scripts/css", f"{output_path}/css")


## Styles ##

# Extract the default text and table styles and save them as JSON.
DEFAULT_STYLES = dc.extract_styles(compatible_docx_path)
with open(f"{output_path}/{dc.FOLDERS['data']}/styles.json", "w", encoding="utf-8") as f:
    json.dump(DEFAULT_STYLES, f, indent=2)

# Extract table formatting that differs from the default styles and save it as JSON.
tables = dc.extract_table_format(compatible_docx_path, DEFAULT_STYLES)
# The f-string is double-quoted on purpose: reusing the enclosing quote
# character inside the braces is a SyntaxError on Python versions before 3.12.
with open(f"{output_path}/{dc.FOLDERS['data']}/tables.json", "w", encoding="utf-8") as f:
    json.dump(tables, f, indent=2)

## End Styles ##
## Images ##

# Pull the media out of the document into the output folder. The helper is
# given the allowed alt-text list; the returned mapping is image name ->
# relative path (see the printed output below the cell).
alt_text_map = dc.extract_docx_media(compatible_docx_path, output_path, dc.FOLDERS['media'], allowed_alt_texts)
print("\n📝 Alt Text to Image Mapping:", alt_text_map)

# Integer ids of the retained images, obtained by stripping the 'image' prefix
# (and a '.png' suffix, should a key carry one) from each mapping key.
keep_images = [int(name.replace('image', '').replace('.png', '')) for name in alt_text_map.keys()]
# Per-image record seeded with the path and an empty alt text.
images_dict = {image: {'path': path, 'alt_text': ''} for image, path in alt_text_map.items()}
print("images_dict: ", images_dict)

# Generate HTML from the docx file, passing the ids of the images to keep.
initial_html = dc.convert_docx_to_html(compatible_docx_path, lua_script, keep_images)
print("HTML with unwanted images removed has been generated.")

initial_html_clean = dc.remove_empty_paragraphs(initial_html)

# Remove empty <figure> tags, reporting the tag count before and after as a sanity check.
num_figures_before = initial_html_clean.count('<figure>')
html_captions_removed = dc.remove_empty_figures(initial_html_clean)
num_figures_after = html_captions_removed.count('<figure>')
print(f"Number of <figure> tags before cleaning: {num_figures_before}")
print(f"Number of <figure> tags after cleaning: {num_figures_after}")

# Replace images with placeholders.
# NOTE: this rebinds alt_text_map from the name -> path mapping above to the
# second value returned by the helper; the name is kept because later cells
# in the notebook inspect alt_text_map.
html_images, alt_text_map = dc.replace_images_with_placeholders(html_captions_removed, images_dict)

# Save the media metadata as JSON. The f-string is double-quoted on purpose:
# reusing the enclosing quote character inside the braces is a SyntaxError on
# Python versions before 3.12.
with open(f"{output_path}/{dc.FOLDERS['data']}/media.json", "w", encoding="utf-8") as f:
    json.dump(alt_text_map, f, indent=2)


## End Images ##
## Tables ##


# Non-greedy match of one whole <table>...</table> element; DOTALL lets '.'
# cross line breaks so multi-line tables are captured.
table_pattern = re.compile(r'<table.*?</table>', flags=re.DOTALL)

# Single-element list shared with dc.table_replacer across all matches,
# starting from zero for this run.
counter = [0]


def _replace_table(match):
    """Delegate one table match to dc.table_replacer with the shared counter."""
    return dc.table_replacer(match, counter)


html_tables_id = table_pattern.sub(_replace_table, html_images)

## End Tables ##
## Navigation ##

# Generate navigation data from the table-tagged HTML and save it as JSON.
nav_data = dc.generate_navigation_data(html_tables_id)
with open(f"{output_path}/{dc.FOLDERS['data']}/navigation.json", "w", encoding="utf-8") as f:
    json.dump(nav_data, f, indent=4)

# Embed the navigation placeholder as the first element in the HTML.
# content_html is bound on both branches: when it was assigned only if the
# placeholder was missing, HTML that already contained the placeholder made
# the insert_sub_navigation call below fail with NameError (or silently pick
# up a stale content_html left over in the kernel).
if '<div data-navigation></div>' not in html_tables_id:
    content_html = '<div data-navigation></div>\n' + html_tables_id
else:
    content_html = html_tables_id

# Embed sub navigation data placeholder in the HTML. Place at the line above the first h2 tag of each section
content_html = dc.insert_sub_navigation(content_html, nav_data)

## End Navigation ##

# Save the modified content.html
with open(f"{output_path}/{dc.FOLDERS['content']}/content.html", "w", encoding="utf-8") as f:
    f.write(content_html)

print("Navigation JSON file and content placeholder updated successfully!")
print(f"Conversion complete! HTML file saved as {output_path}.")


Compatible document saved as: app/test/output.docx
📂 Creating media folder: app/grovia_Carbon-PRO_Template\assets
✅ Moving image: image2.png ➝ app/grovia_Carbon-PRO_Template\assets\image2.png
✅ Moving image: image5.png ➝ app/grovia_Carbon-PRO_Template\assets\image5.png
✅ Moving image: image13.png ➝ app/grovia_Carbon-PRO_Template\assets\image13.png
✅ Moving image: image14.png ➝ app/grovia_Carbon-PRO_Template\assets\image14.png
✅ Moving image: image15.png ➝ app/grovia_Carbon-PRO_Template\assets\image15.png
✅ Moving image: image23.png ➝ app/grovia_Carbon-PRO_Template\assets\image23.png

📝 Alt Text to Image Mapping: {'image2': 'assets/image2.png', 'image5': 'assets/image5.png', 'image13': 'assets/image13.png', 'image14': 'assets/image14.png', 'image15': 'assets/image15.png', 'image23': 'assets/image23.png'}
images_dict:  {'image2': {'path': 'assets/image2.png', 'alt_text': ''}, 'image5': {'path': 'assets/image5.png', 'alt_text': ''}, 'image13': {'path': 'assets/image13.png', 'alt_text': ''

In [49]:
initial_html

'<h1 id="overview">Overview</h1>\r\n<p>This section provides an overview of the property and carbon\r\nassessment.</p>\r\n<h2 id="preview">Preview</h2>\r\n<table>\r\n<caption><p>Table 1. Summary of Assessment</p></caption>\r\n<colgroup>\r\n<col style="width: 50%" />\r\n<col style="width: 50%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th>Item assessed</th>\r\n<th>Assessment</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\n<tr>\r\n<td>Planting Area</td>\r\n<td><strong>Moderate risk.</strong> Lots of risk.</td>\r\n</tr>\r\n<tr>\r\n<td>Carbon returns</td>\r\n<td><strong>High risk.</strong> Only 4.1 ACCUs/ha.</td>\r\n</tr>\r\n<tr>\r\n<td>Topography</td>\r\n<td><strong>Low risk.</strong> Little to no areas with slope greater\r\nthan 15 degrees.</td>\r\n</tr>\r\n</tbody>\r\n</table>\r\n<p>Located in the Dandenong ranges amongst wet sclerophyll forests and\r\ncarbon rich soils. Your dream carbon project awaits…</p>\r\n<figure>\r\n<img src="./media/image2.png" style="width:6.03278in;height:2.85398in"\r\n

In [51]:
initial_html_clean

'<h1 id="overview">Overview</h1>\r\n<p>This section provides an overview of the property and carbon\r\nassessment.</p>\r\n<h2 id="preview">Preview</h2>\r\n<table>\r\n<caption><p>Table 1. Summary of Assessment</p></caption>\r\n<colgroup>\r\n<col style="width: 50%" />\r\n<col style="width: 50%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th>Item assessed</th>\r\n<th>Assessment</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\n<tr>\r\n<td>Planting Area</td>\r\n<td><strong>Moderate risk.</strong> Lots of risk.</td>\r\n</tr>\r\n<tr>\r\n<td>Carbon returns</td>\r\n<td><strong>High risk.</strong> Only 4.1 ACCUs/ha.</td>\r\n</tr>\r\n<tr>\r\n<td>Topography</td>\r\n<td><strong>Low risk.</strong> Little to no areas with slope greater\r\nthan 15 degrees.</td>\r\n</tr>\r\n</tbody>\r\n</table>\r\n<p>Located in the Dandenong ranges amongst wet sclerophyll forests and\r\ncarbon rich soils. Your dream carbon project awaits…</p>\r\n<figure>\r\n<img src="./media/image2.png" style="width:6.03278in;height:2.85398in"\r\n

In [52]:
html_captions_removed

'<h1 id="overview">Overview</h1>\r\n<p>This section provides an overview of the property and carbon\r\nassessment.</p>\r\n<h2 id="preview">Preview</h2>\r\n<table>\r\n<caption><p>Table 1. Summary of Assessment</p></caption>\r\n<colgroup>\r\n<col style="width: 50%" />\r\n<col style="width: 50%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th>Item assessed</th>\r\n<th>Assessment</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\n<tr>\r\n<td>Planting Area</td>\r\n<td><strong>Moderate risk.</strong> Lots of risk.</td>\r\n</tr>\r\n<tr>\r\n<td>Carbon returns</td>\r\n<td><strong>High risk.</strong> Only 4.1 ACCUs/ha.</td>\r\n</tr>\r\n<tr>\r\n<td>Topography</td>\r\n<td><strong>Low risk.</strong> Little to no areas with slope greater\r\nthan 15 degrees.</td>\r\n</tr>\r\n</tbody>\r\n</table>\r\n<p>Located in the Dandenong ranges amongst wet sclerophyll forests and\r\ncarbon rich soils. Your dream carbon project awaits…</p>\r\n<figure>\r\n<img src="./media/image2.png" style="width:6.03278in;height:2.85398in"\r\n

In [55]:
keep_images

[2, 5, 13, 14, 15, 23]

In [62]:
alt_text_map

{'image2': {'path': 'assets/image2.png',
  'alt_text': 'location',
  'figcaption': '<p>Figure 1. Location within the region and state. This will\r\nuse a different satellite colour scheme (faded) but for some reason I\r\ncouldn’t get it to work</p>'},
 'image5': {'path': 'assets/image5.png',
  'alt_text': 'accu_breakdown',
  'figcaption': '<p>Figure 5. Accumulation curve of gross ACCUs generated\r\neach year over the 25-year permanence period. The first recommended\r\nreporting period of the permanence period is illustrated by a yellow\r\nstar</p>'},
 'image13': {'path': 'assets/image13.png',
  'alt_text': 'precipitation',
  'figcaption': '<p>Figure 12. Precipitation</p>'},
 'image14': {'path': 'assets/image14.png',
  'alt_text': 'temperature',
  'figcaption': '<p>Figure 13. Temperature</p>'},
 'image15': {'path': 'assets/image15.png',
  'alt_text': 'frost',
  'figcaption': '<p>Figure 14. Frost</p>'},
 'image23': {'path': 'assets/image23.png',
  'alt_text': 'timeline',
  'figcaption': 

In [70]:
# Scratch check: take the first key of alt_text_map, strip the 'image' prefix,
# convert the remainder to int and subtract one (e.g. 'image2' -> 1).
int(re.sub('image', '', list(alt_text_map.keys())[0])) - 1

1

In [60]:
images_dict

{'image2': {'path': 'assets/image2.png',
  'alt_text': 'location',
  'figcaption': '<p>Figure 1. Location within the region and state. This will\r\nuse a different satellite colour scheme (faded) but for some reason I\r\ncouldn’t get it to work</p>'},
 'image5': {'path': 'assets/image5.png',
  'alt_text': 'accu_breakdown',
  'figcaption': '<p>Figure 5. Accumulation curve of gross ACCUs generated\r\neach year over the 25-year permanence period. The first recommended\r\nreporting period of the permanence period is illustrated by a yellow\r\nstar</p>'},
 'image13': {'path': 'assets/image13.png',
  'alt_text': 'precipitation',
  'figcaption': '<p>Figure 12. Precipitation</p>'},
 'image14': {'path': 'assets/image14.png',
  'alt_text': 'temperature',
  'figcaption': '<p>Figure 13. Temperature</p>'},
 'image15': {'path': 'assets/image15.png',
  'alt_text': 'frost',
  'figcaption': '<p>Figure 14. Frost</p>'},
 'image23': {'path': 'assets/image23.png',
  'alt_text': 'timeline',
  'figcaption': 

In [61]:
html_images

'<h1 id="overview">Overview</h1>\r\n<p>This section provides an overview of the property and carbon\r\nassessment.</p>\r\n<h2 id="preview">Preview</h2>\r\n<table>\r\n<caption><p>Table 1. Summary of Assessment</p></caption>\r\n<colgroup>\r\n<col style="width: 50%" />\r\n<col style="width: 50%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th>Item assessed</th>\r\n<th>Assessment</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\n<tr>\r\n<td>Planting Area</td>\r\n<td><strong>Moderate risk.</strong> Lots of risk.</td>\r\n</tr>\r\n<tr>\r\n<td>Carbon returns</td>\r\n<td><strong>High risk.</strong> Only 4.1 ACCUs/ha.</td>\r\n</tr>\r\n<tr>\r\n<td>Topography</td>\r\n<td><strong>Low risk.</strong> Little to no areas with slope greater\r\nthan 15 degrees.</td>\r\n</tr>\r\n</tbody>\r\n</table>\r\n<p>Located in the Dandenong ranges amongst wet sclerophyll forests and\r\ncarbon rich soils. Your dream carbon project awaits…</p>\r\n<div data-image="assets/image2.png" data-caption="&lt;p&gt;Figure 1. Location within t