In [1]:
import docx_converter as dc
from docx import Document
from bs4 import BeautifulSoup
import os
import os
import shutil
import json
import re
from bs4 import BeautifulSoup
from pathlib import Path
import json
import html_converter as hc

In [27]:
import importlib

# Re-execute the docx_converter module so that edits made to it on disk are
# picked up without restarting the kernel; `dc` is rebound to the reloaded module.
dc = importlib.reload(dc)

{'textAlign': 'left', 'fontSize': '1rem', 'fontWeight': 'normal', 'color': '#000000', 'backgroundColor': '#FFFFFF', 'padding': '8px', 'verticalAlign': 'middle'}


In [2]:
# Inputs: the report to convert and the Pandoc Lua filter used during conversion.
docx_path = "data/PagesCreekPRO+_FINAL.docx"
lua_script = "scripts/pandoc/pandoc_docx_cleanup.lua"

# Everything generated for this report is written below app/<document stem>/.
output_path = f"app/{Path(docx_path).stem}"
html_path = f"{output_path}/content/content.html"

# The JSON export is named after the same stem and stored in the data folder.
json_path = f"data/{Path(output_path).name}.json"

In [3]:
def parse_docx_to_html(docx_path, lua_script, output_path):
    """
    Convert a DOCX file to HTML and output tables and images for use in a web application.

    Steps, in order:
      1. Create ``output_path`` with its media/data/content sub-folders and copy the
         static ``index.html``, ``js`` and ``css`` assets from ``scripts/``.
      2. Write ``styles.json`` (the hard-coded default styles) and ``tables.json``
         (the result of ``dc.extract_table_format``) into the data folder.
      3. Extract media, convert the document to HTML, and attach caption and
         renumbering information to every image whose alt text starts with "keep-".
      4. Post-process images, captions, figure references and tables through the
         ``dc`` helpers and write ``<output_path>/<content folder>/content.html``.

    Args:
        docx_path (str): Path to the source DOCX file.
        lua_script (str): Path to the Lua filter handed to ``dc.convert_docx_to_html``.
        output_path (str): Directory that receives the generated web-app files.

    Returns:
        None
    """
    # The working copy produced by dc.check_compatibility is named after the source document.
    compatible_docx_path = f"app/{os.path.splitext(os.path.basename(docx_path))[0]}.docx"

    os.makedirs(output_path, exist_ok=True)
    allowed_alt_texts = ["timeline"]

    os.makedirs(f"{output_path}/{dc.FOLDERS['media']}", exist_ok=True)
    os.makedirs(f"{output_path}/{dc.FOLDERS['data']}", exist_ok=True)
    os.makedirs(f"{output_path}/{dc.FOLDERS['content']}", exist_ok=True)

    dc.check_compatibility(docx_path, compatible_docx_path)

    ## copy index.html to output_path
    shutil.copyfile("scripts/index.html", f"{output_path}/index.html")
    # js/ and css/ are copied only once; existing folders are left untouched.
    if not os.path.exists(f"{output_path}/js"):
        shutil.copytree("scripts/js", f"{output_path}/js")
    if not os.path.exists(f"{output_path}/css"):
        shutil.copytree("scripts/css", f"{output_path}/css")


    ## Styles

    ## Extract text and table styles
    # DEFAULT_STYLES = dc.extract_styles(compatible_docx_path)
    DEFAULT_STYLES = {
      "headings": {
        "h1": {
          "fontFamily": "Chillax Semibold",
          "fontSize": "1.67rem",
          "color": "#1D426F"
        },
        "h2": {
          "fontFamily": "Chillax Semibold",
          "fontSize": "1.17rem",
          "color": "#1D426F"
        },
        "h3": {
          "fontFamily": "Chillax Medium",
          "fontSize": "1.00rem",
          "color": "#1D426F"
        }
      },
      "body": {
        "p": {
          "fontSize": "0.92rem"
        }
      },
      "lists": {},
      "captions": {
        "caption": {
          "fontSize": "0.83rem",
          "fontStyle": "italic"
        }
      },
      "table": {
        "border": "1px solid black",
        "borderCollapse": "collapse",
        "marginBottom": "12px"
      },
      "th": {
        "fontSize": "0.92rem",
        "textAlign": "left",
        "padding": "5px",
        "verticalAlign": "middle"
      },
      "td1": {
        "fontSize": "0.92rem",
        "textAlign": "left",
        "padding": "5px",
        "verticalAlign": "middle"
      },
      "td": {
        "fontSize": "0.92rem",
        "textAlign": "left",
        "padding": "5px",
        "verticalAlign": "middle"
      }
    }

    ## save to json
    with open(f"{output_path}/{dc.FOLDERS['data']}/styles.json", "w", encoding="utf-8") as f:
        json.dump(DEFAULT_STYLES, f, indent=2)

    ## Extract table data and formatting that differs from the default styles
    tables = dc.extract_table_format(compatible_docx_path, DEFAULT_STYLES)
    ## save to json
    # Double-quoted f-string: reusing the outer quote character inside the
    # replacement field is a SyntaxError on Python versions before 3.12.
    with open(f"{output_path}/{dc.FOLDERS['data']}/tables.json", "w", encoding="utf-8") as f:
        json.dump(tables, f, indent=2)

    ## End Styles ##
    ## Images ##

    alt_text_map = dc.extract_docx_media(compatible_docx_path, output_path, dc.FOLDERS['media'], allowed_alt_texts)
    print("\n📝 Alt Text to Image Mapping:", alt_text_map)

    # NOTE(review): these replacements assume the mapped paths contain "assets/"
    # with a forward slash - confirm on platforms where the separator is a backslash.
    keep_images = [value.replace("assets", "media") for _, value in alt_text_map.items()]
    print(f"keep_images: {keep_images}")
    images_dict = {image: {'path_doc': path.replace('assets/', './media/'), 'path': path, 'alt_text': ''} for image, path in alt_text_map.items()}
    print("images_dict: ", images_dict)

    ## START NEW VERSION ##
    image_map = dc.parse_images_with_links_and_captions(compatible_docx_path)
    # Filter images with alt text starting with "keep-"
    keep_image_map = [image for image in image_map if image["alt_text"].startswith("keep-")]
    ## Update figure numbers
    keep_image_map_nums = dc.update_figure_numbers(keep_image_map)
    ## Retrieve image types
    keep_image_map_types = dc.identify_image_type(keep_image_map_nums)
    ## END NEW VERSION ##

    initial_html = dc.convert_docx_to_html(compatible_docx_path, lua_script, keep_images)
    print("HTML with unwanted images removed has been generated.")

    initial_html_clean = dc.remove_empty_paragraphs(initial_html)

    missing_figures = dc.check_for_missing_figures(compatible_docx_path, initial_html_clean)
    if missing_figures:
        for _r_id, caption in missing_figures.items():
            print(f"⚠️ WARNING: Missing figure: {caption}")

    # Identify figure captions and their corresponding images
    doc_img_src = [images_dict[img]['path_doc'] for img in images_dict]
    # figure_captions = dc.get_figure_captions(initial_html_clean, doc_img_src)

    figure_captions = dc.retrieve_all_figure_captions(
        docx_path=compatible_docx_path,
        html_content=initial_html_clean,
        doc_img_src=doc_img_src
    )

    print(f"Figure captions: {figure_captions}")
    # Add the extension-less file name; it is the key used to look up captions below.
    keep_image_map_types = [
        {**item, 'image_name': os.path.splitext(os.path.basename(item.get('image_file', '')))[0]}
        for item in keep_image_map_types
    ]

    # Merge caption data into each kept image; images without a caption entry are
    # passed through unchanged.
    # NOTE(review): the old figure number is used as a regex and replaces the first
    # match anywhere in the caption. If a caption entry carries a placeholder
    # number such as 0, the first "0" in the text is what gets replaced - confirm
    # against dc.retrieve_all_figure_captions.
    keep_image_map_types_figure_captions = [
        {
            **item,
            **(
                {
                    'figure_caption': dc.clean_figure_caption(fc.figure_caption),
                    'figure_number': fc.figure_number,
                    'figure_number_new': fc.figure_number_new,
                    'figure_caption_new': re.sub(
                        str(fc.figure_number),
                        str(fc.figure_number_new),
                        dc.clean_figure_caption(fc.figure_caption),
                        count=1
                    )
                } if (fc := figure_captions.get(item["image_name"])) else {}
            )
        }
        for item in keep_image_map_types
    ]

    # Remove empty <figure> tags
    # number of <figure> tags before cleaning (only attribute-less "<figure>" is counted)
    num_figures_before = initial_html_clean.count('<figure>')
    html_captions_removed = dc.remove_empty_figures(initial_html_clean)
    # number of <figure> tags after cleaning
    num_figures_after = html_captions_removed.count('<figure>')
    print(f"Number of <figure> tags before cleaning: {num_figures_before}")
    print(f"Number of <figure> tags after cleaning: {num_figures_after}")

    ## Replace img tags with div placeholders
    html_images_replaced = dc.replace_images_with_divs(html_captions_removed, keep_image_map_types_figure_captions)

    ## Remove captions from unwanted figures that weren't part of a <figure> tag
    html_more_captions_removed = dc.remove_captions_from_unwanted_figures(html_images_replaced)

    ## Update in-text figure references
    html_figure_references_updated = dc.update_in_text_figure_references(html_more_captions_removed, keep_image_map_types_figure_captions)

    ## End Images ##
    ## Tables ##


    # Regular expression to match tables
    table_pattern = re.compile(r'<table.*?</table>', re.DOTALL)

    # Reset the counter to zero before replacing tables
    # (a one-element list, presumably so dc.table_replacer can update it between matches)
    counter = [0]
    html_tables_id = table_pattern.sub(lambda match: dc.table_replacer(match, counter), html_figure_references_updated)

    ## End Tables ##
    ## Navigation ##

    # # Generate navigation data
    # nav_data = dc.generate_navigation_data(html_tables_id)

    # # Save navigation data to JSON file
    # with open(f"{output_path}/{dc.FOLDERS['data']}/navigation.json", "w", encoding="utf-8") as f:
    #     json.dump(nav_data, f, indent=4)

    # # Embed navigation data placeholder as the first element in the HTML
    # if '<div data-navigation></div>' not in html_tables_id:
    #     content_html = '<div data-navigation></div>\n' + html_tables_id

    # # Embed sub navigation data placeholder in the HTML. Place at the line above the first h2 tag of each section
    # content_html = dc.insert_sub_navigation(content_html, nav_data)

    ## End Navigation ##

    # Save the modified content.html
    with open(f"{output_path}/{dc.FOLDERS['content']}/content.html", "w", encoding="utf-8") as f:
        f.write(html_tables_id)

    # The navigation step above is disabled, so no navigation success message is printed.
    print(f"Conversion complete! HTML file saved as {output_path}.")

    return None

# Run the DOCX -> HTML conversion with the paths set in the configuration cell.
parse_docx_to_html(docx_path=docx_path, lua_script=lua_script, output_path=output_path)

def parse_html_to_json(html_path: str, json_path: str) -> None:
    """
    Convert a generated HTML document into a JSON file.

    The HTML is parsed with BeautifulSoup, converted with ``hc.html_convert``,
    passed through ``hc.clean_nested_json`` and written with a 2-space indent.

    Args:
        html_path: Path of the HTML file to read (decoded as UTF-8).
        json_path: Path of the JSON file to write (created or overwritten).
    """
    # Read and parse the HTML document.
    with open(html_path, encoding="utf-8") as html_file:
        document = BeautifulSoup(html_file.read(), "html.parser")

    # Convert first, then clean the converted structure before saving.
    payload = hc.clean_nested_json(hc.html_convert(document))

    with open(json_path, "w") as json_file:
        json.dump(payload, json_file, indent=2)

# Export the HTML written by the conversion step as JSON.
parse_html_to_json(html_path=html_path, json_path=json_path)

Compatible document saved as: app/PagesCreekPRO+_FINAL.docx
Processing table 1...
Found a drawing element
Processing table 2...
Processing table 3...
Processing table 4...
Processing table 5...
Processing table 6...
📂 Creating media folder: app/PagesCreekPRO+_FINAL\assets
✅ Moving image: image2.png ➝ app/PagesCreekPRO+_FINAL\assets\image2.png
✅ Moving image: image7.png ➝ app/PagesCreekPRO+_FINAL\assets\image7.png
✅ Moving image: image15.png ➝ app/PagesCreekPRO+_FINAL\assets\image15.png
✅ Moving image: image16.png ➝ app/PagesCreekPRO+_FINAL\assets\image16.png
✅ Moving image: image17.png ➝ app/PagesCreekPRO+_FINAL\assets\image17.png
✅ Moving image: image18.png ➝ app/PagesCreekPRO+_FINAL\assets\image18.png
✅ Moving image: image19.png ➝ app/PagesCreekPRO+_FINAL\assets\image19.png
✅ Moving image: image20.png ➝ app/PagesCreekPRO+_FINAL\assets\image20.png
✅ Moving image: image22.png ➝ app/PagesCreekPRO+_FINAL\assets\image22.png
✅ Moving image: image28.png ➝ app/PagesCreekPRO+_FINAL\assets\ima

#### START TESTING

In [None]:
# Test inputs: the template report and the same Pandoc Lua filter as the main run.
docx_path = "data/grovia_Carbon-PRO_Template.docx"
lua_script = "scripts/pandoc/pandoc_docx_cleanup.lua"

# Generated files for the test document go below app/<document stem>/.
output_path = f"app/{Path(docx_path).stem}"


In [None]:
# Step-by-step copy of the parse_docx_to_html setup, kept at cell level so the
# intermediate variables can be inspected while testing.
# The working copy handed to dc.check_compatibility is named after the source document.
compatible_docx_path = f"app/{os.path.splitext(os.path.basename(docx_path))[0]}.docx"

os.makedirs(output_path, exist_ok=True)
allowed_alt_texts = ["timeline"]

# Create the media, data and content sub-folders named in dc.FOLDERS.
os.makedirs(f"{output_path}/{dc.FOLDERS['media']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['data']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['content']}", exist_ok=True)

dc.check_compatibility(docx_path, compatible_docx_path)

## copy index.html to output_path
shutil.copyfile("scripts/index.html", f"{output_path}/index.html")
# js/ and css/ are copied only when missing; existing folders are left untouched.
if not os.path.exists(f"{output_path}/js"):
    shutil.copytree("scripts/js", f"{output_path}/js")
if not os.path.exists(f"{output_path}/css"):
    shutil.copytree("scripts/css", f"{output_path}/css")


## Styles

## Extract text and table styles
# DEFAULT_STYLES = dc.extract_styles(compatible_docx_path)
# Hard-coded default styles; saved as styles.json and passed to
# dc.extract_table_format as the baseline. Each key appears once per dict:
# a repeated key in a dict literal is silently overwritten by the last value.
DEFAULT_STYLES = {
  "headings": {
    "h1": {
      "fontFamily": "Chillax Semibold",
      "fontSize": "1.67rem",
      "color": "#1D426F"
    },
    "h2": {
      "fontFamily": "Chillax Semibold",
      "fontSize": "1.17rem",
      "color": "#1D426F"
    },
    "h3": {
      "fontFamily": "Chillax Medium",
      "fontSize": "1.00rem",
      "color": "#1D426F"
    }
  },
  "body": {
    "p": {
      "fontSize": "0.92rem"
    }
  },
  "lists": {},
  "captions": {
    "caption": {
      "fontSize": "0.83rem",
      "fontStyle": "italic"
    }
  },
  "table": {
    "border": "1px solid black",
    "borderCollapse": "collapse",
    "marginBottom": "12px"
  },
  "th": {
    "fontSize": "0.92rem",
    "textAlign": "left",
    "padding": "5px",
    "verticalAlign": "middle"
  },
  "td1": {
    "fontSize": "0.92rem",
    "textAlign": "left",
    "padding": "5px",
    "verticalAlign": "middle"
  },
  "td": {
    "fontSize": "0.92rem",
    "textAlign": "left",
    "padding": "5px",
    "verticalAlign": "middle"
  }
}

## save to json
with open(f"{output_path}/{dc.FOLDERS['data']}/styles.json", "w", encoding="utf-8") as f:
    json.dump(DEFAULT_STYLES, f, indent=2)

## Extract table data and formatting that differs from the default styles
tables = dc.extract_table_format(compatible_docx_path, DEFAULT_STYLES)
## save to json
# Double-quoted f-string: reusing the outer quote character inside the
# replacement field is a SyntaxError on Python versions before 3.12.
with open(f"{output_path}/{dc.FOLDERS['data']}/tables.json", "w", encoding="utf-8") as f:
    json.dump(tables, f, indent=2)

## End Styles ##
## Images ##

# Extract the document's media into the output folder; the returned mapping is
# iterated below as {image key: image path}.
alt_text_map = dc.extract_docx_media(compatible_docx_path, output_path, dc.FOLDERS['media'], allowed_alt_texts)
print("\n📝 Alt Text to Image Mapping:", alt_text_map)

# NOTE(review): the replacements below assume the mapped paths contain "assets/"
# with a forward slash - confirm on platforms where the separator is a backslash.
keep_images = [value.replace("assets", "media") for _, value in alt_text_map.items()]
print(f"keep_images: {keep_images}")
images_dict = {image: {'path_doc': path.replace('assets/', './media/'), 'path': path, 'alt_text': ''} for image, path in alt_text_map.items()}
print("images_dict: ", images_dict)

## START NEW VERSION ##
image_map = dc.parse_images_with_links_and_captions(compatible_docx_path)
# Filter images with alt text starting with "keep-"
keep_image_map = [image for image in image_map if image["alt_text"].startswith("keep-")]
## Update figure numbers
keep_image_map_nums = dc.update_figure_numbers(keep_image_map)
## Retrieve image types
keep_image_map_types = dc.identify_image_type(keep_image_map_nums)
## END NEW VERSION ##

# Convert the compatible DOCX to HTML, passing the Lua filter and the list of images to keep.
initial_html = dc.convert_docx_to_html(compatible_docx_path, lua_script, keep_images)
print("HTML with unwanted images removed has been generated.")

initial_html_clean = dc.remove_empty_paragraphs(initial_html)

# Identify figure captions and their corresponding images
doc_img_src = [images_dict[img]['path_doc'] for img in images_dict]

## Gets figure captions from the HTML content, or directly from the DOCX when they are not in the HTML, and matches them with the document images
figure_captions = dc.retrieve_all_figure_captions(
    docx_path = compatible_docx_path, 
    html_content = initial_html_clean, 
    doc_img_src = doc_img_src
    )

print(f"Figure captions: {figure_captions}")
# Add the extension-less file name; it is the key used to look up captions below.
# Re-running this cell reprocesses the already-extended list (the name is rebound).
keep_image_map_types = [
    {**item, 'image_name': os.path.splitext(os.path.basename(item.get('image_file', '')))[0]}
    for item in keep_image_map_types
]

# Merge caption data into each kept image; images without a caption entry are
# passed through unchanged.
# NOTE(review): the old figure number is used as a regex and replaces the first
# match anywhere in the caption. If a caption entry carries a placeholder number
# such as 0, the first "0" in the text is what gets replaced - confirm against
# dc.retrieve_all_figure_captions.
keep_image_map_types_figure_captions = [
    {
        **item,
        **(
            {
                'figure_caption': dc.clean_figure_caption(fc.figure_caption),
                'figure_number': fc.figure_number,
                'figure_number_new': fc.figure_number_new,
                'figure_caption_new': re.sub(
                    str(fc.figure_number),
                    str(fc.figure_number_new),
                    dc.clean_figure_caption(fc.figure_caption),
                    count=1
                )
            } if (fc := figure_captions.get(item["image_name"])) else {}
        )
    }
    for item in keep_image_map_types
]

# Remove empty <figure> tags and report how many were dropped.
# Count before cleaning (only the attribute-less substring "<figure>" is counted).
num_figures_before = initial_html_clean.count('<figure>')
html_captions_removed = dc.remove_empty_figures(initial_html_clean)
# Count after cleaning, using the same substring.
num_figures_after = html_captions_removed.count('<figure>')
print(f"Number of <figure> tags before cleaning: {num_figures_before}")
print(f"Number of <figure> tags after cleaning: {num_figures_after}")

## Replace img tags with div placeholders
html_images_replaced = dc.replace_images_with_divs(html_captions_removed, keep_image_map_types_figure_captions)

def remove_captions_from_unwanted_figures(html_content: str) -> str:
    """
    Remove orphaned figure-caption paragraphs from an HTML string.

    A <p> element is dropped when it has a direct-child <em> whose stripped text
    starts with "Figure <number>" (case-insensitive, optional trailing dot).
    Each removal is reported with a print.

    Args:
        html_content: The HTML to clean.

    Returns:
        The HTML re-serialised by BeautifulSoup with the matching paragraphs removed.
    """
    # Pattern to match the start of a figure caption like "Figure 8."
    figure_caption_pattern = re.compile(r"^Figure\s+\d+\.?", re.IGNORECASE)

    soup = BeautifulSoup(html_content, "html.parser")

    # Loop over all <p> tags (find_all returns a list, so removing tags is safe here)
    for p in soup.find_all("p"):
        # First <em> that is a direct child of the paragraph, wherever it sits in it.
        first_em = p.find("em", recursive=False)
        if first_em:
            em_text = first_em.get_text(strip=True)
            if figure_caption_pattern.match(em_text):
                print(f"Removing orphaned figure caption: {em_text}")
                p.decompose()

    return str(soup)


def update_in_text_figure_references(html_content: str, image_mapping: list) -> str:
    """
    Update in-text figure references to match the new figure numbers.

    Every "Figure N" found in text inside <p>, <li>, <span> or <td> elements that
    are not nested in a <div> is rewritten to "Figure M", where N -> M is taken
    from the ``figure_number`` / ``figure_number_new`` entries of ``image_mapping``.
    References whose number is not in the mapping are left unchanged. Each change
    is reported with a print.

    Args:
        html_content: The HTML to update.
        image_mapping: List of dicts; entries missing either number are ignored.

    Returns:
        The HTML re-serialised by BeautifulSoup with the references renumbered.
    """
    from bs4 import NavigableString

    soup = BeautifulSoup(html_content, "html.parser")

    # Build a map from original figure number → new number
    figure_number_map = {
        str(item["figure_number"]): str(item["figure_number_new"])
        for item in image_mapping
        if item.get("figure_number") is not None and item.get("figure_number_new") is not None
    }

    # Regex to find "Figure N"
    figure_ref_pattern = re.compile(r"\bFigure\s+(\d+)\b")

    # Tags to inspect (excluding <div>)
    search_tags = ["p", "li", "span", "td"]

    def renumber(match):
        # Keep the reference untouched when its number has no new value.
        new_number = figure_number_map.get(match.group(1))
        return match.group(0) if new_number is None else f"Figure {new_number}"

    # Collect every eligible text node exactly once BEFORE changing anything.
    # Text inside nested matching tags (e.g. <li><p>..</p></li>) is reachable from
    # each ancestor; rewriting it once per ancestor could renumber an already
    # renumbered reference, and replacing nodes while walking `.descendants`
    # disturbs that walk.
    text_nodes = []
    seen_ids = set()
    for tag in soup.find_all(search_tags):
        if tag.find_parent("div"):
            continue  # skip if this tag is inside any <div>
        for descendant in tag.descendants:
            if isinstance(descendant, NavigableString) and id(descendant) not in seen_ids:
                seen_ids.add(id(descendant))
                text_nodes.append(descendant)

    for node in text_nodes:
        original_text = str(node)
        new_text = figure_ref_pattern.sub(renumber, original_text)
        if new_text != original_text:
            print(f"Updating in-text reference: '{original_text.strip()}' → '{new_text.strip()}'")
            node.replace_with(new_text)

    return str(soup)

## Remove captions from unwanted figures that weren't part of a <figure> tag
## (uses the function defined in this cell rather than the dc version)
html_more_captions_removed = remove_captions_from_unwanted_figures(html_images_replaced)

## Update in-text figure references
html_figure_references_updated = update_in_text_figure_references(html_more_captions_removed, keep_image_map_types_figure_captions)


## End Images ##
## Tables ##


# Regular expression to match tables (non-greedy, spanning newlines via DOTALL)
table_pattern = re.compile(r'<table.*?</table>', re.DOTALL)

# Reset the counter to zero before replacing tables
# (a one-element list, presumably so dc.table_replacer can update it between matches)
counter = [0]
html_tables_id = table_pattern.sub(lambda match: dc.table_replacer(match, counter), html_figure_references_updated)


## End Tables ##
## Navigation ##

# # Generate navigation data
# nav_data = dc.generate_navigation_data(html_tables_id)

# # Save navigation data to JSON file
# with open(f"{output_path}/{dc.FOLDERS['data']}/navigation.json", "w", encoding="utf-8") as f:
#     json.dump(nav_data, f, indent=4)

# # Embed navigation data placeholder as the first element in the HTML
# if '<div data-navigation></div>' not in html_tables_id:
#     content_html = '<div data-navigation></div>\n' + html_tables_id

# # Embed sub navigation data placeholder in the HTML. Place at the line above the first h2 tag of each section
# content_html = dc.insert_sub_navigation(content_html, nav_data)

# End Navigation ##

# Save the modified content.html into the content folder as UTF-8.
with open(f"{output_path}/{dc.FOLDERS['content']}/content.html", "w", encoding="utf-8") as f:
    f.write(html_tables_id)

# The navigation step is commented out above, so its success message stays disabled.
# print("Navigation JSON file and content placeholder updated successfully!")
print(f"Conversion complete! HTML file saved as {output_path}.")

Compatible document saved as: app/PagesCreekPRO+_FINAL.docx
Processing table 1...
Found a drawing element
Processing table 2...
Processing table 3...
Processing table 4...
Processing table 5...
Processing table 6...
📂 Creating media folder: app/PagesCreekPRO+_FINAL\assets
✅ Moving image: image2.png ➝ app/PagesCreekPRO+_FINAL\assets\image2.png
✅ Moving image: image7.png ➝ app/PagesCreekPRO+_FINAL\assets\image7.png
✅ Moving image: image15.png ➝ app/PagesCreekPRO+_FINAL\assets\image15.png
✅ Moving image: image16.png ➝ app/PagesCreekPRO+_FINAL\assets\image16.png
✅ Moving image: image17.png ➝ app/PagesCreekPRO+_FINAL\assets\image17.png
✅ Moving image: image18.png ➝ app/PagesCreekPRO+_FINAL\assets\image18.png
✅ Moving image: image19.png ➝ app/PagesCreekPRO+_FINAL\assets\image19.png
✅ Moving image: image20.png ➝ app/PagesCreekPRO+_FINAL\assets\image20.png
✅ Moving image: image22.png ➝ app/PagesCreekPRO+_FINAL\assets\image22.png
✅ Moving image: image28.png ➝ app/PagesCreekPRO+_FINAL\assets\ima

#### END TESTING

#### FIXING missing figures

In [11]:
html_images_replaced

'<h2 id="overview">Overview</h2>\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an Environmental Planting (EP).</p>\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\n<table>\n<caption><p>Table 1. Summary of project details</p></caption>\n<colgroup>\n<col style="width: 40%"/>\n<col style="width: 59%"/>\n</colgroup>\n<thead>\n<tr>\n<th style="text-align: right;">Project Name:</th>\n<th>Pages Creek</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td style="text-align: right;"><strong>Location:</strong></td>\n<td>(Figure 1)</td>\n</tr>\

In [12]:
html_more_captions_removed

'<h2 id="overview">Overview</h2>\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an Environmental Planting (EP).</p>\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\n<table>\n<caption><p>Table 1. Summary of project details</p></caption>\n<colgroup>\n<col style="width: 40%"/>\n<col style="width: 59%"/>\n</colgroup>\n<thead>\n<tr>\n<th style="text-align: right;">Project Name:</th>\n<th>Pages Creek</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td style="text-align: right;"><strong>Location:</strong></td>\n<td>(Figure 1)</td>\n</tr>\

In [9]:
# Captions found in the HTML only (the earlier lookup; the pipeline cells above
# call dc.retrieve_all_figure_captions instead). Rebinds `figure_captions`.
figure_captions = dc.get_figure_captions(initial_html_clean, doc_img_src)
figure_captions

{'image7': Image(figure_number=5, figure_number_new=1, figure_caption='Figure 5. Accumulation curve of gross ACCUs generated\r\neach year over the 25-year permanence period. The first recommended\r\nreporting period of the permanence period is illustrated by a yellow\r\nstar.', alt_text='keep-chart-accu_curve')}

In [None]:
# Re-run of the caption merge using whatever `figure_captions` currently holds.
print(f"Figure captions: {figure_captions}")
# Add the extension-less file name; it is the key used to look up captions below.
keep_image_map_types = [
    {**item, 'image_name': os.path.splitext(os.path.basename(item.get('image_file', '')))[0]}
    for item in keep_image_map_types
]

# Merge caption data into each kept image; images without a caption entry are
# passed through unchanged.
# NOTE(review): the old figure number is used as a regex and replaces the first
# match anywhere in the caption - confirm this is safe for placeholder numbers.
keep_image_map_types_figure_captions = [
    {
        **item,
        **(
            {
                'figure_caption': dc.clean_figure_caption(fc.figure_caption),
                'figure_number': fc.figure_number,
                'figure_number_new': fc.figure_number_new,
                'figure_caption_new': re.sub(
                    str(fc.figure_number),
                    str(fc.figure_number_new),
                    dc.clean_figure_caption(fc.figure_caption),
                    count=1
                )
            } if (fc := figure_captions.get(item["image_name"])) else {}
        )
    }
    for item in keep_image_map_types
]

<div data-navigation=""></div>
<h1 id="overview">Overview</h1>
<p>The following is a property-specific assessment of select variables

that are important to consider when assessing the feasibility of

developing an Environmental Planting (EP).</p>
<p>You will be able to view fundamental information regarding the

potential opportunity and obstacles to project development. </p>
<p>With a PRO and PRO<sup>+</sup> subscription you will also have access

to more elaborate information and interpretation of additional factors

that are required for critical review in the decision-making process of

your land assessment and project development.</p>
<div data-parent="overview" data-sub-navigation=""></div>
<h2 id="summary-of-assessment">Summary of Assessment</h2>
<div class="table" id="table_0"></div><br/>
<h3 id="summary-of-key-risks-affecting-a-carbon-projects-feasibility">Summary

of Key Risks Affecting a Carbon Projects Feasibility</h3>
<div class="table" data-caption="&lt;p&gt;Table 1. Sum

#### END FIXING missing figures

In [None]:
from zipfile import ZipFile
import xml.etree.ElementTree as ET

# First draft of the missing-figure check: list figures that are captioned in
# the DOCX but whose caption text is not found in any <figcaption> of the HTML.
html_content = initial_html_clean

# Namespaces used in Word XML
ns = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
}

figures = []

with ZipFile(docx_path) as docx:
    # Step 1: Extract only the document.xml
    with docx.open('word/document.xml') as file:
        tree = ET.parse(file)
        root = tree.getroot()

    # Step 2: Extract image and caption info
    paragraphs = root.findall('.//w:p', ns)

    for i, p in enumerate(paragraphs):
        # Only the first image of a paragraph is recorded.
        img_tag = p.find('.//a:blip', ns)
        if img_tag is not None:
            r_id = img_tag.attrib.get(f'{{{ns["r"]}}}embed')
            # Look ahead for the caption (next paragraph)
            caption_text = ''
            if i + 1 < len(paragraphs):
                next_p = paragraphs[i + 1]
                caption_text = ''.join([t.text or '' for t in next_p.findall('.//w:t', ns)])
            figures.append({'rId': r_id, 'caption': caption_text})

# Step 3: Verify which figures have captions starting with "Figure {d} or Figure {letters}"
figures_verified = []
for fig in figures:
    caption = fig['caption']
    if caption.startswith("Figure"):
        match = re.search(r"Figure [A-Za-z0-9]+", caption)
        if match:
            # Keep everything after the matched label. Splitting on the label text
            # would cut the caption short if the same label appeared again later.
            caption_after = caption[match.end():].strip()
            figures_verified.append({'rId': fig['rId'], 'caption': caption, 'html_caption': caption_after})

# Step 4: Check for missing figures in the html
missing_figures = {}
soup = BeautifulSoup(html_content, 'html.parser')
figure_tags = soup.find_all('figcaption')

# Identify missing captions that are in the docx but not in the html
for fig in figures_verified:
    found = False
    for fig_tag in figure_tags:
        # Line breaks in the HTML caption are flattened before the substring test.
        if fig['html_caption'] in fig_tag.text.replace("\r\n", " "):
            found = True
            break
    if not found:
        missing_figures[fig['rId']] = fig['caption'].rstrip() # Remove trailing whitespace



In [23]:
img_tag.attrib

{'{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed': 'rId37'}

In [79]:
import xml.etree.ElementTree as ET
from zipfile import ZipFile
import base64

# Second draft of the missing-figure check: also records each image's archive
# path and alt text, and reports DOCX figures whose caption text is not found
# in any <figcaption> of `html_content`.
ns = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
}
embed_attr = f'{{{ns["r"]}}}embed'

figures = []

with ZipFile(docx_path) as docx:
    # Load main document
    with docx.open('word/document.xml') as file:
        tree = ET.parse(file)
        root = tree.getroot()

    # Load image relationships (only targets below media/ are kept)
    rels = {}
    with docx.open('word/_rels/document.xml.rels') as rels_file:
        rels_tree = ET.parse(rels_file)
        for rel in rels_tree.findall('Relationship', {'': 'http://schemas.openxmlformats.org/package/2006/relationships'}):
            r_id = rel.attrib['Id']
            target = rel.attrib['Target']
            if target.startswith('media/'):
                rels[r_id] = f'word/{target}'

    # Candidate caption per image id: the text of the paragraph that follows the
    # first paragraph containing that image. Built in one pass over the
    # paragraphs instead of rescanning the whole document for every image.
    paragraphs = root.findall('.//w:p', ns)
    caption_by_r_id = {}
    for i, p in enumerate(paragraphs):
        for p_blip in p.findall('.//a:blip', ns):
            p_r_id = p_blip.attrib.get(embed_attr)
            if p_r_id is None or p_r_id in caption_by_r_id:
                continue
            if i + 1 < len(paragraphs):
                next_p = paragraphs[i + 1]
                caption_by_r_id[p_r_id] = ''.join(t.text or '' for t in next_p.findall('.//w:t', ns))
            else:
                caption_by_r_id[p_r_id] = ''

    # Search all <wp:inline> and <wp:anchor> blocks (image containers)
    image_blocks = root.findall('.//wp:inline', ns) + root.findall('.//wp:anchor', ns)

    for block in image_blocks:
        blip = block.find('.//a:blip', ns)
        docPr = block.find('wp:docPr', ns)

        if blip is not None:
            r_id = blip.attrib.get(embed_attr)
            alt_text = ''
            if docPr is not None:
                alt_text = docPr.attrib.get('descr', '') or docPr.attrib.get('title', '')

            # Find image path and base64 encode it
            # (img_data is only needed if the 'image_base64' key below is re-enabled)
            img_path = rels.get(r_id)
            img_data = None
            if img_path:
                with docx.open(img_path) as img_file:
                    img_bytes = img_file.read()
                    img_data = base64.b64encode(img_bytes).decode('utf-8')

            figures.append({
                'rId': r_id,
                'caption': caption_by_r_id.get(r_id, ''),
                'img_path': img_path,
                'alt_text': alt_text,
                # 'image_base64': img_data  # Optional: Remove this if you only want the file path or bytes
            })

    # Step 3: Verify which figures have captions starting with "Figure {d} or Figure {letters}"
    figures_verified_cleaned = []
    for fig in figures:
        caption = fig['caption']
        if caption.startswith("Figure"):
            match = re.search(r"Figure [A-Za-z0-9]+", caption)
            if match:
                # Keep everything after the matched label. Splitting on the label
                # text would cut the caption short if the label appeared again later.
                caption_after = caption[match.end():].strip()
                figures_verified_cleaned.append({'rId': fig['rId'], 'img_path': fig['img_path'], 'alt_text': fig['alt_text'], 'caption': caption, 'html_caption': caption_after})

    # Step 4: Check for missing figures in the html
    missing_figures = {}
    soup = BeautifulSoup(html_content, 'html.parser')
    figure_tags = soup.find_all('figcaption')

    # Identify missing captions that are in the docx but not in the html
    for fig in figures_verified_cleaned:
        found = False
        for fig_tag in figure_tags:
            # Line breaks in the HTML caption are flattened before the substring test.
            if fig['html_caption'] in fig_tag.text.replace("\r\n", " "):
                found = True
                break
        if not found:
            # img_path is None when the relationship target is not below media/;
            # report an empty path in that case instead of failing on None.
            archive_path = fig['img_path'] or ''
            missing_figures[fig['rId']] = {
                "caption": fig['caption'].rstrip(), # Remove trailing whitespace
                "img_path": archive_path.partition('/')[2],  # Path without the leading "word/" segment
                "alt_text": fig['alt_text']  # Include alt text for reference
            }


    print(missing_figures)


{'rId16': {'caption': 'Figure 6. Slope across the property.', 'img_path': 'media/image8.png', 'alt_text': ''}, 'rId17': {'caption': 'Figure 7. Infrastructure that occurs within and around the Pages Creek property.', 'img_path': 'media/image9.png', 'alt_text': ''}, 'rId18': {'caption': 'Figure 8. State-based mapped tenure of the Pages Creek property.', 'img_path': 'media/image10.png', 'alt_text': ''}, 'rId19': {'caption': 'Figure 9. State-based zoning of the Pages Creek property.', 'img_path': 'media/image11.png', 'alt_text': ''}, 'rId20': {'caption': 'Figure 10. Mapping of Indigenous Land Use Agreements in and native title determinations in and around the Pages Creek property.', 'img_path': 'media/image12.png', 'alt_text': ''}, 'rId21': {'caption': 'Figure 11. Mining tenements and exploration licenses in and around the Pages Creek property.', 'img_path': 'media/image13.png', 'alt_text': ''}, 'rId22': {'caption': 'Figure 12. Existing Clean Energy Regulator carbon projects in and around 

In [76]:
figures

[{'rId': 'rId11',
  'caption': 'Figure 1. Location of the Pages Creek property in relation to surrounding localities. The property is identified by the white polygon.',
  'img_path': 'word/media/image3.png',
  'alt_text': ''},
 {'rId': 'rId12',
  'caption': 'Figure 2. a) Planting / non-woody areas of Pages Creek property; b) exclusion areas of Pages Creek as defined by grovia’s tree detection model.',
  'img_path': 'word/media/image4.png',
  'alt_text': ''},
 {'rId': 'rId13',
  'caption': 'Figure 3. Spatial distribution of carbon yield across the property.',
  'img_path': 'word/media/image5.png',
  'alt_text': ''},
 {'rId': 'rId14',
  'caption': 'Figure 4. CEA stratification and modelling points of Pages Creek property.',
  'img_path': 'word/media/image6.png',
  'alt_text': ''},
 {'rId': 'rId15',
  'caption': 'Figure 5. Accumulation curve of gross ACCUs generated each year over the 25-year permanence period. The first recommended reporting period of the permanence period is illustrated

In [68]:
figures

[{'rId': 'rId9',
  'caption': '',
  'img_path': 'word/media/image1.png',
  'alt_text': ''},
 {'rId': 'rId10',
  'caption': 'Project Area:',
  'img_path': 'word/media/image2.png',
  'alt_text': ''},
 {'rId': 'rId11',
  'caption': 'Figure 1. Location of the Pages Creek property in relation to surrounding localities. The property is identified by the white polygon.',
  'img_path': 'word/media/image3.png',
  'alt_text': ''},
 {'rId': 'rId12',
  'caption': 'Figure 2. a) Planting / non-woody areas of Pages Creek property; b) exclusion areas of Pages Creek as defined by grovia’s tree detection model.',
  'img_path': 'word/media/image4.png',
  'alt_text': ''},
 {'rId': 'rId13',
  'caption': 'Figure 3. Spatial distribution of carbon yield across the property.',
  'img_path': 'word/media/image5.png',
  'alt_text': ''},
 {'rId': 'rId14',
  'caption': 'Figure 4. CEA stratification and modelling points of Pages Creek property.',
  'img_path': 'word/media/image6.png',
  'alt_text': ''},
 {'rId': 'rI

In [38]:
figure_captions

{'image7': Image(figure_number=5, figure_number_new=1, figure_caption='Figure 5. Accumulation curve of gross ACCUs generated\r\neach year over the 25-year permanence period. The first recommended\r\nreporting period of the permanence period is illustrated by a yellow\r\nstar.', alt_text='keep-chart-accu_curve')}

In [None]:
from collections import namedtuple


def retrieve_all_figure_captions(docx_path, html_content, doc_img_src, figure_captions=None):
    """
    Retrieve all figure captions from the HTML content and match them with the document images.

    Images that already have an entry in ``figure_captions`` keep that entry
    unchanged. When the number of known captions differs from the number of
    document images, ``check_for_missing_figures`` is asked for the figures it
    can find in the document and those are used to fill the gaps. Images with
    no caption from either source are reported with a warning and left out of
    the result.

    Args:
        docx_path (str): Path to the DOCX file.
        html_content (str): The HTML content as a string.
        doc_img_src (list): List of image source paths from the document.
        figure_captions (dict, optional): Captions already known, keyed by image
            name (file name without directory or extension). When omitted, the
            notebook-level ``figure_captions`` variable is used, which is what
            this function read before the parameter existed.

    Returns:
        dict: A dictionary mapping image names to their figure captions and alt texts.
        When the caption count already matches the image count this is a
        shallow copy of ``figure_captions``.

    Raises:
        NameError: If ``figure_captions`` is not passed and no notebook-level
            variable of that name exists.
    """
    Image = namedtuple('Image', ["figure_number", "figure_number_new", "figure_caption", "alt_text"])

    if figure_captions is None:
        # Backward compatibility: older call sites rely on the notebook global.
        try:
            figure_captions = globals()["figure_captions"]
        except KeyError:
            raise NameError("name 'figure_captions' is not defined") from None

    # Counts agree, so nothing needs sourcing. The captions are returned as-is;
    # previously this path fell through and returned an empty dict.
    if len(figure_captions) == len(doc_img_src):
        return dict(figure_captions)

    print("⚠️ WARNING: Number of figure captions does not match number of images in the document.")
    print("Sourcing missing figures from document...")
    missing_figures = check_for_missing_figures(docx_path, html_content)
    # Map image name -> key of the matching entry in missing_figures.
    manual_sourcing_imgs = {
        os.path.splitext(os.path.basename(figure['img_path']))[0]: r_id
        for r_id, figure in missing_figures.items()
    }

    figure_captions_revised = {}
    figure_counter = 1
    for image_path in doc_img_src:
        image_name = os.path.splitext(os.path.basename(image_path))[0]
        if image_name in figure_captions:
            # NOTE(review): the existing entry keeps its own figure_number_new even
            # though the counter advances - confirm whether renumbering is wanted.
            figure_captions_revised[image_name] = figure_captions[image_name]
            figure_counter += 1
        elif image_name in manual_sourcing_imgs:
            sourced = missing_figures[manual_sourcing_imgs[image_name]]
            # The original figure number is unknown here, so it is recorded as 0
            # (it could be parsed from the caption text if ever needed).
            figure_captions_revised[image_name] = Image(0, figure_counter, sourced['caption'], sourced['alt_text'])
            figure_counter += 1
        else:
            print(f"⚠️ WARNING: Figure {image_name} caption not found.")
            print("If it is an icon, then it doesn't matter, otherwise please check the document.")

    return figure_captions_revised

Sourcing missing figures from document...
If it is an icon, then it doesn't matter, otherwise please check the document.
If it is an icon, then it doesn't matter, otherwise please check the document.
If it is an icon, then it doesn't matter, otherwise please check the document.
If it is an icon, then it doesn't matter, otherwise please check the document.
{'image8': 'rId16', 'image9': 'rId17', 'image10': 'rId18', 'image11': 'rId19', 'image12': 'rId20', 'image13': 'rId21', 'image14': 'rId22', 'image15': 'rId23', 'image16': 'rId24', 'image17': 'rId25', 'image19': 'rId27', 'image21': 'rId29', 'image23': 'rId31', 'image24': 'rId32', 'image25': 'rId33', 'image26': 'rId34', 'image27': 'rId35', 'image28': 'rId36'}


In [96]:
# Display `figure_captions_revised`.
# NOTE(review): in the cells visible here this name is only assigned as a local
# inside retrieve_all_figure_captions, so the notebook-level variable must be set
# by another cell - confirm this still works after Restart Kernel -> Run All.
figure_captions_revised

{'image7': Image(figure_number=5, figure_number_new=1, figure_caption='Figure 5. Accumulation curve of gross ACCUs generated\r\neach year over the 25-year permanence period. The first recommended\r\nreporting period of the permanence period is illustrated by a yellow\r\nstar.', alt_text='keep-chart-accu_curve'),
 'image16': Image(figure_number=0, figure_number_new=2, figure_caption='Figure 14. Interpolated climate data from the Bureau of Meteorology and other providers at a 5km resolution. The figure shows the mean minimum and mean maximum monthly temperatures from data between 1956 to 2024.', alt_text='keep-chart-temperature'),
 'image17': Image(figure_number=0, figure_number_new=3, figure_caption='Figure 15. Median number of days with temperatures below 0°C since 1956.', alt_text='keep-chart-temperature'),
 'image19': Image(figure_number=0, figure_number_new=4, figure_caption='Figure 16. The Standardised Precipitation Evapotranspiration Index data for the past 20 years, grouped by 6 

New version