#### Basic navigation

Features:
 - Formatted text
 - Formatted tables including cell bg colour and horizontal merging
 - Targetted image retrieval

In [3]:
import docx_converter as dc
from docx import Document
from bs4 import BeautifulSoup
import os
import pypandoc
import os
import shutil
import json
import re
from collections import Counter

In [73]:
import importlib

dc = importlib.reload(dc)

#### Find icons with links

In [74]:
docx_path = "data/grovia_Carbon-LITE_Template.docx"
os.path.splitext(os.path.basename(docx_path))[0]
compatible_docx_path = f"app/{os.path.splitext(os.path.basename(docx_path))[0]}.docx"
output_path = f"app/{os.path.splitext(os.path.basename(docx_path))[0]}"
lua_script = "scripts/pandoc/docx_cleanup.lua"

os.makedirs(output_path, exist_ok=True)
allowed_alt_texts = ["timeline"]

os.makedirs(f"{output_path}/{dc.FOLDERS['media']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['data']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['content']}", exist_ok=True)

dc.check_compatibility(docx_path, compatible_docx_path)

## copy index.html to output_path
shutil.copyfile("scripts/index.html", f"{output_path}/index.html")
if not os.path.exists(f"{output_path}/js"):
    shutil.copytree("scripts/js", f"{output_path}/js")
if not os.path.exists(f"{output_path}/css"):
    shutil.copytree("scripts/css", f"{output_path}/css")


## Styles

## Extract text and table styles
DEFAULT_STYLES = dc.extract_styles(compatible_docx_path)
## save to json
with open(f"{output_path}/{dc.FOLDERS['data']}/styles.json", 'w') as f:
    json.dump(DEFAULT_STYLES, f, indent=2)

## Extract table data and formating that differs from the default styles
tables = dc.extract_table_format(compatible_docx_path, DEFAULT_STYLES)
## save to json
with open(f'{output_path}/{dc.FOLDERS['data']}/tables.json', 'w') as f:
    json.dump(tables, f, indent=2)

## End Styles ##
## Images ##

alt_text_map = dc.extract_docx_media(compatible_docx_path, output_path, dc.FOLDERS['media'], allowed_alt_texts)
print("\nüìù Alt Text to Image Mapping:", alt_text_map)

keep_images = [value.replace("assets", "media") for _, value in alt_text_map.items()]
print(f"keep_images: {keep_images}")
images_dict = {image: {'path_doc': path.replace('assets/', './media/'), 'path': path, 'alt_text': ''} for image, path in alt_text_map.items()}
print("images_dict: ", images_dict)

## START NEW VERSION ##
image_map = dc.parse_images_with_links_and_captions(compatible_docx_path)
# Filter images with alt text starting with "keep-"
keep_image_map = [image for image in image_map if image["alt_text"].startswith("keep-")]
## Update figure numbers
keep_image_map_nums = dc.update_figure_numbers(keep_image_map)
## Retrieve image types
keep_image_map_types = dc.identify_image_type(keep_image_map_nums)
## END NEW VERSION ##

initial_html = dc.convert_docx_to_html(compatible_docx_path, lua_script, keep_images)
print("HTML with unwanted images removed has been generated.")

initial_html_clean = dc.remove_empty_paragraphs(initial_html)

missing_figures = dc.check_for_missing_figures(compatible_docx_path, initial_html_clean)
if missing_figures:
    for missing_figure in missing_figures:
        print(f"‚ö†Ô∏è WARNING: Missing figure: {missing_figure}")

# Remove empty <figure> tags
# number of <figure> tags before cleaning
num_figures_before = initial_html_clean.count('<figure>')
# Remove empty <figure> tags
html_captions_removed = dc.remove_empty_figures(initial_html_clean)
# number of <figure> tags after cleaning
num_figures_after = html_captions_removed.count('<figure>')
print(f"Number of <figure> tags before cleaning: {num_figures_before}")
print(f"Number of <figure> tags after cleaning: {num_figures_after}")

## Replace img tags with div placeholders
html_images_replaced = dc.replace_images_with_divs(html_captions_removed, keep_image_map_types)


## End Images ##
## Tables ##


# Regular expression to match tables
table_pattern = re.compile(r'<table.*?</table>', re.DOTALL)

# Reset the counter to zero before replacing tables
counter = [0]
html_tables_id = table_pattern.sub(lambda match: dc.table_replacer(match, counter), html_images_replaced)

## End Tables ##
## Navigation ##

# Generate navigation data
nav_data = dc.generate_navigation_data(html_tables_id)

# Save navigation data to JSON file
with open(f"{output_path}/{dc.FOLDERS['data']}/navigation.json", "w", encoding="utf-8") as f:
    json.dump(nav_data, f, indent=4)

# Embed navigation data placeholder as the first element in the HTML
if '<div data-navigation></div>' not in html_tables_id:
    content_html = '<div data-navigation></div>\n' + html_tables_id

# Embed sub navigation data placeholder in the HTML. Place at the line above the first h2 tag of each section
content_html = dc.insert_sub_navigation(content_html, nav_data)

## End Navigation ##

# Save the modified content.html
with open(f"{output_path}/{dc.FOLDERS['content']}/content.html", "w", encoding="utf-8") as f:
    f.write(content_html)

print("Navigation JSON file and content placeholder updated successfully!")
print(f"Conversion complete! HTML file saved as {output_path}.")

Compatible document saved as: app/grovia_Carbon-LITE_Template.docx
Processing table 1...
Found a drawing element
Processing table 2...
Processing table 3...
Processing table 4...
Processing table 5...
Processing table 6...
üìÇ Creating media folder: app/grovia_Carbon-LITE_Template\assets
‚úÖ Moving image: image2.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image2.png
‚úÖ Moving image: image3.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image3.png
‚ùå ERROR: Image file not found: app/grovia_Carbon-LITE_Template\docx_extracted\word/media\image2.png
‚ùå ERROR: Image file not found: app/grovia_Carbon-LITE_Template\docx_extracted\word/media\image2.png
‚úÖ Moving image: image6.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image6.png
‚úÖ Moving image: image7.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image7.png
‚úÖ Moving image: image8.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image8.png
‚úÖ Moving image: image9.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image9.png
‚úÖ Movi

Convert to json

In [75]:
from bs4 import Tag, NavigableString
import re
import html
from pathlib import Path


INLINE_TAGS = {
    "b": "bold",
    "strong": "bold",
    "i": "italic",
    "em": "italic",
    "u": "underline",
    "sup": "superscript",
    "sub": "subscript"
}


def tokenize_text(text):
    """
    Split text into normal text and variable placeholders like {varName}
    Returns a list of dicts like {"type": "text", "text": "..."} or {"type": "var", "name": "..."}
    """
    tokens = []
    pattern = re.compile(r'({\w+})')  # match {...}

    parts = pattern.split(text)
    for part in parts:
        if part.startswith("{") and part.endswith("}"):
            tokens.append({ "type": "var", "name": part[1:-1] })  # strip curly braces
        elif part.strip():
            tokens.append({ "type": "text", "text": part })
    return tokens


def clean_text(text):
    # Decode HTML entities like &quot;, &rsquo;, etc.
    text = html.unescape(text)

    # Normalize curly/smart quotes to plain quotes
    replacements = {
        '‚Äò': "'", '‚Äô': "'",
        '‚Äú': '"', '‚Äù': '"',
        '‚Äû': '"', '¬´': '"', '¬ª': '"'
    }
    for orig, repl in replacements.items():
        text = text.replace(orig, repl)

    # Replace non-breaking space and odd encodings
    text = text.replace('\u00A0', ' ').replace('\u00C2', '')

    # Collapse multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)

    return text


def extract_text_with_links(tag):
    """Extract text and inline tags like <a>, <b>, etc., preserving clean formatting."""
    result = []

    def handle_node(node):
        if isinstance(node, NavigableString):
            text = clean_text(str(node))
            return tokenize_text(text)  # returns a list of {type: text|var}

        elif isinstance(node, Tag):
            if node.name == "a":
                return {
                    "type": "link",
                    "href": node.get("href"),
                    "text": clean_text(node.get_text()),
                    "class": node.get("class")
                }
            elif node.name in INLINE_TAGS:
                return {
                    "type": INLINE_TAGS[node.name],
                    "content": extract_text_with_links(node)
                }
            else:
                # Fallback for other tags
                return {"type": "text", "text": clean_text(node.get_text())}
        return None

    for child in tag.contents:
        item = handle_node(child)
        if item:
            if isinstance(item, list):
                result.extend(item)  # multiple tokens (text + var)
            else:
                result.append(item)


    return result



def parse_html_to_json(soup):
    stack = []
    root = []

    def create_section(tag):
        return {
            "title": tag.get_text(strip=True),
            "level": int(tag.name[1]),
            "content": [],
            "children": []
        }

    # Step 1: Identify the footnotes section so we can exclude its children
    footnotes_section = soup.find("section", class_="footnotes")
    footnote_ids = set()
    if footnotes_section:
        footnote_ids = {id_tag.get("id") for id_tag in footnotes_section.find_all(True) if id_tag.get("id")}

    # Step 2: Get all top-level elements (not inside footnotes)
    elements = []
    for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'div', 'ul', 'ol']):
        parent = tag.find_parent("section", class_="footnotes")
        if not parent and tag.get("id") not in footnote_ids:
            elements.append(tag)



    # Step 3: Parse normal content
    for elem in elements:
        if elem.name in ['h1', 'h2', 'h3']:
            section = create_section(elem)
            while stack and stack[-1]['level'] >= section['level']:
                stack.pop()
            if stack:
                stack[-1]['children'].append(section)
            else:
                root.append(section)
            stack.append(section)

        elif elem.name == 'p':
            if stack:
                stack[-1]['content'].append({
                    "type": "paragraph",
                    "content": extract_text_with_links(elem)
                })

        elif elem.name == 'div':
            div_type = elem.get("class")[0] if elem.get("class") else "other"
            div_content = {
                "type": div_type,
                **{key: value for key, value in elem.attrs.items() if key != "class"}, # exclude class as it is already used as div_type
                "text": elem.get_text(strip=True)
            }
            # Format for react-native-webview
            div_content = {key.replace("data-", ""): value for key, value in div_content.items()}
            for key in list(div_content.keys()):
                if "-" in key:
                    parts = key.split("-")
                    new_key = parts[0] + "".join(part.capitalize() for part in parts[1:])
                    div_content[new_key] = div_content.pop(key)
                    
            if stack:
                stack[-1]['content'].append(div_content)

        elif elem.name in ['ul', 'ol']:
            items = []
            for li in elem.find_all("li", recursive=False):
                p = li.find("p")
                if p:
                    items.append({
                        "content": extract_text_with_links(p)
                    })
                else:
                    items.append({
                        "content": extract_text_with_links(li)
                    })
            if stack:
                stack[-1]['content'].append({
                    "type": "list",
                    "ordered": elem.name == "ol",
                    "items": items
                })

    # Step 4: Parse footnotes separately
    footnotes = []
    if footnotes_section:
        for i, li in enumerate(footnotes_section.find_all("li"), start=1):
            fn_id = li.get("id", f"fn{i}")
            p = li.find("p")
            if p:
                backlink = p.find("a", class_="footnote-back")
                if backlink:
                    backlink.extract()
                footnotes.append({
                    "id": fn_id,
                    "number": i,
                    "content": extract_text_with_links(p)
                })

    return {
        "content": root,
        "footnotes": footnotes
    }

# Parse html

html_path = "app/grovia_Carbon-LITE_Template/content/content.html"
json_path = "data/grovia_Carbon-LITE_Template.json"

# Load the HTML content
html_content = Path(html_path).read_text(encoding="utf-8")

soup = BeautifulSoup(html_content, "html.parser")
json_data = parse_html_to_json(soup)


# Save the updated structure
with open(json_path, "w") as f:
    json.dump(json_data, f, indent=2)


Debugging json conversion

In [51]:
# This is your raw HTML string
html = '<div class="icon" data-link="What%20is%20an%20Environmental%20Planting%20(EP)?" data-src="materials:help"></div>'

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Get the div element
elem = soup.find('div')

# Now process it as a tag
if elem.name == 'div':
    print(elem)  # Just for debug
    div_type = elem.get("class")[0] if elem.get("class") else "other"

    # Extract all attributes
    div_content = {
        "type": div_type,
        **{key: value for key, value in elem.attrs.items() if key != "class"}, # exclude class as it is already used as div_type
        "text": elem.get_text(strip=True)
    }

    print(div_content)

<div class="icon" data-link="What%20is%20an%20Environmental%20Planting%20(EP)?" data-src="materials:help"></div>
{'type': 'icon', 'data-link': 'What%20is%20an%20Environmental%20Planting%20(EP)?', 'data-src': 'materials:help', 'text': ''}


#### Testing code

In [89]:
from docx import Document
from zipfile import ZipFile
import os
import shutil
import xml.etree.ElementTree as ET

# XML namespaces used in DOCX
ns = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'
}

# Load the DOCX file and unzip it
with ZipFile(docx_path) as docx_zip:
    document_xml = ET.fromstring(docx_zip.read('word/document.xml'))
    rels_xml = ET.fromstring(docx_zip.read('word/_rels/document.xml.rels'))

# Namespaces
ns = {
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
}

rels_ns = '{http://schemas.openxmlformats.org/package/2006/relationships}'

# Step 3: Build rels lookup: rId ‚Üí target
rels_lookup = {}
for rel in rels_xml.findall(f"{rels_ns}Relationship"):
    rels_lookup[rel.attrib['Id']] = rel.attrib['Target']

# Step 4: Find image blocks with both <a:blip> and <a:hlinkClick>
for drawing in document_xml.findall('.//a:graphicData/pic:pic', ns):
    blip = drawing.find('.//a:blip', ns)
    hlink = drawing.find('.//a:hlinkClick', ns)

    if blip is not None:
        img_rid = blip.attrib.get(f"{{{ns['r']}}}embed")
        img_target = rels_lookup.get(img_rid)

        if hlink is not None:
            link_rid = hlink.attrib.get(f"{{{ns['r']}}}id")
            link_target = rels_lookup.get(link_rid)

            print("üîó Found hyperlinked image:")
            print(f"    üì∑ Image rId: {img_rid}")
            print(f"    üñºÔ∏è  Image path: {img_target}")
            print(f"    üåê Link URL:  {link_target}\n")

üîó Found hyperlinked image:
    üì∑ Image rId: rId10
    üñºÔ∏è  Image path: media/image2.png
    üåê Link URL:  link_to_ep_article

üîó Found hyperlinked image:
    üì∑ Image rId: rId10
    üñºÔ∏è  Image path: media/image2.png
    üåê Link URL:  link_to_estimated_carbon_yield_article



In [152]:
import zipfile

def extract_docx(doc_path, output_path):
    """Extracts the DOCX contents into a specified folder."""
    extracted_folder = os.path.join(output_path, "docx_extracted")
    os.makedirs(extracted_folder, exist_ok=True)
    
    with zipfile.ZipFile(doc_path, "r") as docx_zip:
        docx_zip.extractall(extracted_folder)
    
    return extracted_folder

def parse_relationships(rels_path):
    """Parses the relationships file to map image IDs to filenames."""
    image_map = {}
    
    if os.path.exists(rels_path):
        tree = ET.parse(rels_path)
        root = tree.getroot()
        
        for rel in root.findall(".//{http://schemas.openxmlformats.org/package/2006/relationships}Relationship"):
            rid = rel.attrib.get("Id", "")
            target = rel.attrib.get("Target", "")
            
            if "media/" in target:
                image_map[rid] = target.split("/")[-1]
    
    return image_map

import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile
from collections import defaultdict

def parse_images_with_links(docx_path):
    """
    Parses a .docx file to extract each image instance along with:
    - the image file path
    - the associated hyperlink (if any)
    - any 'link text' (from image metadata: name or description)
    """
    image_info = []

    with ZipFile(docx_path) as docx_zip:
        # Load the document and relationships
        document_xml = ET.fromstring(docx_zip.read("word/document.xml"))
        rels_xml = ET.fromstring(docx_zip.read("word/_rels/document.xml.rels"))

    # Define namespaces
    NS = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    }
    RELS_NS = '{http://schemas.openxmlformats.org/package/2006/relationships}'

    # Build a lookup: rId ‚Üí target path (image or URL)
    rels_lookup = {
        rel.attrib['Id']: rel.attrib['Target']
        for rel in rels_xml.findall(f"{RELS_NS}Relationship")
    }

    # Scan all <w:drawing> blocks (where images and links are defined)
    for drawing in document_xml.findall('.//w:drawing', NS):
        # Get optional metadata
        docpr = drawing.find('.//wp:docPr', NS)
        descr = docpr.attrib.get("descr") if docpr is not None else ""
        name = docpr.attrib.get("name") if docpr is not None else ""

        # Get image ID from <a:blip r:embed="...">
        blip = drawing.find('.//a:blip', NS)
        img_rid = blip.attrib.get(f"{{{NS['r']}}}embed") if blip is not None else None
        img_target = rels_lookup.get(img_rid) if img_rid else None

        # Get hyperlink ID from <a:hlinkClick r:id="...">
        hlink = drawing.find('.//a:hlinkClick', NS)
        link_rid = hlink.attrib.get(f"{{{NS['r']}}}id") if hlink is not None else None
        link_target = rels_lookup.get(link_rid) if link_rid else None

        # Append result
        image_info.append({
            "image_id": img_rid,
            "image_file": img_target,
            "link_id": link_rid,
            "link_url": link_target,
            # "link_text": name or "",  # Use alt text or name if available
            "alt_text": descr or ""
        })

    return image_info


def extract_alt_texts(doc_xml_path, image_map, allowed_alt_texts, extracted_folder, output_path, image_folder):
    """Extracts images based on allowed alt texts and renames them."""
    alt_text_map = {}

    # Create the output folder for images
    media_folder = os.path.join(output_path, image_folder)
    print(f"üìÇ Creating media folder: {media_folder}")
    os.makedirs(image_folder, exist_ok=True)
    
    if os.path.exists(doc_xml_path):
        tree = ET.parse(doc_xml_path)
        root = tree.getroot()

        ns = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", 
            "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
            "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
            "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
        }

        for drawing in root.findall(".//w:drawing", ns):
            doc_pr = drawing.find(".//a:blip", ns)
            descr_tag = drawing.find(".//wp:docPr", ns)
            
            if doc_pr is not None and descr_tag is not None:
                alt_text = descr_tag.attrib.get("descr", "").strip()
                rid = doc_pr.attrib.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed", "")
                
                if rid in image_map and (alt_text in allowed_alt_texts or alt_text.startswith(ALT_TEXT_KEEP_PREFIX)):
                    old_name = image_map[rid]
                    alt_text_map[os.path.splitext(old_name)[0]] = f"{image_folder}/{old_name}"

                    old_path = os.path.join(extracted_folder, "word/media", old_name)
                    new_path = os.path.join(media_folder, old_name)

                    if os.path.exists(old_path):
                        print(f"‚úÖ Moving image: {old_name} ‚ûù {new_path}")
                        shutil.move(old_path, new_path)
                    else:
                        print(f"‚ùå ERROR: Image file not found: {old_path}")
    
    return alt_text_map

ALT_TEXT_KEEP_PREFIX = "keep-"

# Extract DOCX contents
extracted_folder = extract_docx(docx_path, output_path)

# Paths to XML files inside the extracted DOCX
rels_path = os.path.join(extracted_folder, "word/_rels/document.xml.rels")
doc_xml_path = os.path.join(extracted_folder, "word/document.xml")

# Parse relationships to map image IDs to filenames
image_map = parse_relationships(rels_path)
# image_map = parse_images_with_links(rels_path)

alt_text_map = extract_alt_texts(doc_xml_path, image_map, allowed_alt_texts, extracted_folder, output_path, dc.FOLDERS['media'])

alt_text_map

üìÇ Creating media folder: app/figure_numbers\assets
‚úÖ Moving image: image2.png ‚ûù app/figure_numbers\assets\image2.png
‚úÖ Moving image: image3.png ‚ûù app/figure_numbers\assets\image3.png
‚ùå ERROR: Image file not found: app/figure_numbers\docx_extracted\word/media\image2.png
‚úÖ Moving image: image6.png ‚ûù app/figure_numbers\assets\image6.png
‚úÖ Moving image: image7.png ‚ûù app/figure_numbers\assets\image7.png
‚úÖ Moving image: image8.png ‚ûù app/figure_numbers\assets\image8.png
‚úÖ Moving image: image13.png ‚ûù app/figure_numbers\assets\image13.png


{'image2': 'assets/image2.png',
 'image3': 'assets/image3.png',
 'image6': 'assets/image6.png',
 'image7': 'assets/image7.png',
 'image8': 'assets/image8.png',
 'image13': 'assets/image13.png'}

#### In-use version

In [None]:
import os
import xml.etree.ElementTree as ET
from zipfile import ZipFile
import re

def parse_images_with_links_and_captions(docx_path):
    """
    Extracts each image instance from a .docx file with:
    - image file path
    - hyperlink (if any)
    - image metadata (descr / name)
    - associated figure caption and number
    - inferred numbering scheme
    """
    image_info = []
    
    with ZipFile(docx_path) as docx_zip:
        document_xml = ET.fromstring(docx_zip.read("word/document.xml"))
        rels_xml = ET.fromstring(docx_zip.read("word/_rels/document.xml.rels"))

    NS = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
        'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
        'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    }
    RELS_NS = '{http://schemas.openxmlformats.org/package/2006/relationships}'

    # Map rId ‚Üí Target (image or link)
    rels_lookup = {
        rel.attrib['Id']: rel.attrib['Target']
        for rel in rels_xml.findall(f"{RELS_NS}Relationship")
    }

    # Get all paragraphs (for matching captions)
    paragraphs = document_xml.findall('.//w:p', NS)

    for i, paragraph in enumerate(paragraphs):
        drawing = paragraph.find('.//w:drawing', NS)
        if drawing is not None:
            # Metadata
            docpr = drawing.find('.//wp:docPr', NS)
            descr = docpr.attrib.get("descr") if docpr is not None else ""
            name = docpr.attrib.get("name") if docpr is not None else ""

            # Image
            blip = drawing.find('.//a:blip', NS)
            img_rid = blip.attrib.get(f"{{{NS['r']}}}embed") if blip is not None else None
            img_target = rels_lookup.get(img_rid)

            # Hyperlink
            hlink = drawing.find('.//a:hlinkClick', NS)
            link_rid = hlink.attrib.get(f"{{{NS['r']}}}id") if hlink is not None else None
            link_target = rels_lookup.get(link_rid)

            # Try to get caption from the NEXT paragraph
            caption_para = paragraphs[i + 1] if i + 1 < len(paragraphs) else None
            fig_label, fig_number, caption_text, numbering_type, fig_caption = None, None, "", "", ""

            if caption_para is not None:
                # Look for caption text and figure numbering
                texts = caption_para.findall('.//w:t', NS)
                fld = caption_para.find('.//w:fldSimple', NS)
                instr = fld.attrib.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}instr') if fld is not None else ""

                if 'SEQ Figure' in instr:
                    match = re.search(r'SEQ Figure(\\\* [A-Z]+)?', instr)
                    if match:
                        if '\\* ARABIC' in instr:
                            numbering_type = "numeric"
                        elif '\\* ALPHABETIC' in instr:
                            numbering_type = "alphabetic"
                        elif '\\* ROMAN' in instr:
                            numbering_type = "roman"
                        else:
                            numbering_type = "unknown"

                    # Get actual number from <w:t> inside fldSimple
                    fig_number_elem = fld.find('.//w:t', NS)
                    fig_number = fig_number_elem.text if fig_number_elem is not None else None

                    # Get caption text from rest of paragraph
                    caption_text = ''.join(t.text for t in texts).strip()

                    # Try to isolate "Figure X" as label
                    fig_label = re.match(r'Figure\s+\S+', caption_text)
                    fig_label = fig_label.group(0) if fig_label else f"Figure {fig_number}"

                    # Extract prefix before "Figure"
                    # Extract everything before "Figure X" (with flexible matching)
                    if fig_number:
                        # Updated pattern to match "Figure" followed by an optional letter and then capture the caption
                        pattern = r'^(Figure(?: [A-Z])?)(\s*)(.*)'
                        match = re.match(pattern, caption_text)
                        if match:
                            fig_prefix, space, fig_suffix = match.groups()
                            fig_label = fig_prefix + space
                            fig_caption = fig_suffix[len(fig_number):] ## remove the number from caption text

            image_info.append({
                "image_id": img_rid,
                "image_file": img_target,
                "link_id": link_rid,
                "link_url": link_target,
                "alt_text": descr or name or "",
                "figure_number": fig_number,
                "figure_label": fig_label,
                "figure_caption": fig_caption,
                "caption_text": caption_text,
                "numbering_type": numbering_type
            })

    return image_info


# Create new figure captions
def update_figure_numbers(image_map):
    """
    Update figure numbers in the image map based on new postion.
    For example if Figures 1-3 are removed, then Figure 4 becomes Figure 1.
    """
    figure_counter = {}
    for image in keep_image_map:
        if image["figure_number"] is not None:
            # Extract the figure number from the caption text
            figure_prefix = image["figure_label"]
            if figure_prefix not in figure_counter:
                figure_counter[figure_prefix] = 1
            else:
                figure_counter[figure_prefix] += 1
            # Update the figure number and caption text
            image["figure_number_new"] = str(figure_counter[figure_prefix])
            image["caption_text_new"] = ''.join([image["figure_label"], image["figure_number_new"], image["figure_caption"]])
        else:
            image["figure_number_new"] = None
            
    return keep_image_map


In [225]:
image_map = parse_images_with_links_and_captions(docx_path)
# Filter images with alt text starting with "keep-"
keep_image_map = [image for image in image_map if image["alt_text"].startswith("keep-")]

keep_image_map_new = update_figure_numbers(keep_image_map)
keep_image_map_new

[{'image_id': 'rId10',
  'image_file': 'media/image2.png',
  'link_id': 'rId9',
  'link_url': 'link_to_ep_article',
  'alt_text': 'keep-icon-help',
  'figure_number': None,
  'figure_label': None,
  'figure_caption': '',
  'caption_text': '',
  'numbering_type': '',
  'figure_number_new': None},
 {'image_id': 'rId11',
  'image_file': 'media/image3.png',
  'link_id': None,
  'link_url': None,
  'alt_text': 'keep-icon-psychiatry',
  'figure_number': None,
  'figure_label': None,
  'figure_caption': '',
  'caption_text': '',
  'numbering_type': '',
  'figure_number_new': None},
 {'image_id': 'rId10',
  'image_file': 'media/image2.png',
  'link_id': 'rId13',
  'link_url': 'link_to_estimated_carbon_yield_article',
  'alt_text': 'keep-icon-help',
  'figure_number': None,
  'figure_label': None,
  'figure_caption': '',
  'caption_text': '',
  'numbering_type': '',
  'figure_number_new': None},
 {'image_id': 'rId17',
  'image_file': 'media/image6.png',
  'link_id': None,
  'link_url': None,
  

### Convert to html

In [47]:
docx_path = "data/grovia_Carbon-LITE_Template.docx"
os.path.splitext(os.path.basename(docx_path))[0]
compatible_docx_path = f"app/{os.path.splitext(os.path.basename(docx_path))[0]}.docx"
output_path = f"app/{os.path.splitext(os.path.basename(docx_path))[0]}"
lua_script = "scripts/pandoc/docx_cleanup.lua"

os.makedirs(output_path, exist_ok=True)
allowed_alt_texts = ["timeline"]

os.makedirs(f"{output_path}/{dc.FOLDERS['media']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['data']}", exist_ok=True)
os.makedirs(f"{output_path}/{dc.FOLDERS['content']}", exist_ok=True)

dc.check_compatibility(docx_path, compatible_docx_path)

## copy index.html to output_path
shutil.copyfile("scripts/index.html", f"{output_path}/index.html")
if not os.path.exists(f"{output_path}/js"):
    shutil.copytree("scripts/js", f"{output_path}/js")
if not os.path.exists(f"{output_path}/css"):
    shutil.copytree("scripts/css", f"{output_path}/css")


## Styles

## Extract text and table styles
DEFAULT_STYLES = dc.extract_styles(compatible_docx_path)
## save to json
with open(f"{output_path}/{dc.FOLDERS['data']}/styles.json", 'w') as f:
    json.dump(DEFAULT_STYLES, f, indent=2)

## Extract table data and formating that differs from the default styles
tables = dc.extract_table_format(compatible_docx_path, DEFAULT_STYLES)
## save to json
with open(f'{output_path}/{dc.FOLDERS['data']}/tables.json', 'w') as f:
    json.dump(tables, f, indent=2)

## End Styles ##
## Images ##

alt_text_map = dc.extract_docx_media(compatible_docx_path, output_path, dc.FOLDERS['media'], allowed_alt_texts)
print("\nüìù Alt Text to Image Mapping:", alt_text_map)
## convert to a list of image integers
# keep_images = [int(name.replace('image', '').replace('.png', '')) for name in alt_text_map.keys()]
keep_images = [value.replace("assets", "media") for _, value in alt_text_map.items()]
print(f"keep_images: {keep_images}")
images_dict = {image: {'path_doc': path.replace('assets/', './media/'), 'path': path, 'alt_text': ''} for image, path in alt_text_map.items()}
print("images_dict: ", images_dict)


## Generate HTML from the docx file
initial_html = dc.convert_docx_to_html(compatible_docx_path, lua_script, keep_images)
print("HTML with unwanted images removed has been generated.")

initial_html_clean = dc.remove_empty_paragraphs(initial_html)

missing_figures = dc.check_for_missing_figures(compatible_docx_path, initial_html_clean)
if missing_figures:
    for missing_figure in missing_figures:
        print(f"‚ö†Ô∏è WARNING: Missing figure: {missing_figure}")

# Identify figure captions and their corresponding images
doc_img_src = [images_dict[img]['path_doc'] for img in images_dict]
figure_captions = dc.get_figure_captions(initial_html_clean, doc_img_src)
print(f"Figure captions: {figure_captions}")
# Remove empty <figure> tags
# number of <figure> tags before cleaning
num_figures_before = initial_html_clean.count('<figure>')
# Remove empty <figure> tags
html_captions_removed = dc.remove_empty_figures(initial_html_clean)
# number of <figure> tags after cleaning
num_figures_after = html_captions_removed.count('<figure>')
print(f"Number of <figure> tags before cleaning: {num_figures_before}")
print(f"Number of <figure> tags after cleaning: {num_figures_after}")

image_mapping = dc.find_image_alt_text(html_captions_removed)
# remove keep from alt text
for key, value in image_mapping.items():
    image_mapping[key] = value.replace("keep-", "")

icons_src, charts_src, images_src = dc.identify_image_type(image_mapping)

## Replace icons with placeholders
html_icons = dc.replace_icons_with_placholders(html_captions_removed, icons_src)

## Replace charts with placeholders
print(charts_src)
html_charts = dc.replace_charts_with_placeholders(html_icons, charts_src, figure_captions)

# Replace images with placeholders
html_images = dc.replace_images_with_placeholders(html_charts, images_src, figure_captions)

## Save to json
with open(f'{output_path}/{dc.FOLDERS['data']}/media.json', 'w') as f:
    json.dump(alt_text_map, f, indent=2)

print(f"alt_text_map: {alt_text_map}")

## End Images ##
## Tables ##


# Regular expression to match tables
table_pattern = re.compile(r'<table.*?</table>', re.DOTALL)

# Reset the counter to zero before replacing tables
counter = [0]
html_tables_id = table_pattern.sub(lambda match: dc.table_replacer(match, counter), html_images)

## End Tables ##
## Navigation ##

# Generate navigation data
nav_data = dc.generate_navigation_data(html_tables_id)

# Save navigation data to JSON file
with open(f"{output_path}/{dc.FOLDERS['data']}/navigation.json", "w", encoding="utf-8") as f:
    json.dump(nav_data, f, indent=4)

# Embed navigation data placeholder as the first element in the HTML
if '<div data-navigation></div>' not in html_tables_id:
    content_html = '<div data-navigation></div>\n' + html_tables_id

# Embed sub navigation data placeholder in the HTML. Place at the line above the first h2 tag of each section
content_html = dc.insert_sub_navigation(content_html, nav_data)

## End Navigation ##

# Save the modified content.html
with open(f"{output_path}/{dc.FOLDERS['content']}/content.html", "w", encoding="utf-8") as f:
    f.write(content_html)

print("Navigation JSON file and content placeholder updated successfully!")
print(f"Conversion complete! HTML file saved as {output_path}.")


Compatible document saved as: app/grovia_Carbon-LITE_Template.docx
Processing table 1...
Found a drawing element
[]
Processing table 2...
Processing table 3...
Processing table 4...
Processing table 5...
Processing table 6...
üìÇ Creating media folder: app/grovia_Carbon-LITE_Template\assets
‚úÖ Moving image: image2.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image2.png
‚úÖ Moving image: image5.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image5.png
‚úÖ Moving image: image6.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image6.png
‚úÖ Moving image: image7.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image7.png
‚úÖ Moving image: image12.png ‚ûù app/grovia_Carbon-LITE_Template\assets\image12.png

üìù Alt Text to Image Mapping: {'image2': 'assets/image2.png', 'image5': 'assets/image5.png', 'image6': 'assets/image6.png', 'image7': 'assets/image7.png', 'image12': 'assets/image12.png'}
keep_images: ['media/image2.png', 'media/image5.png', 'media/image6.png', 'media/image7.png', 'medi

#### content.html to json custom conversion

Attempt to support inline html tags

In [None]:
from bs4 import Tag, NavigableString
import re
import html
from pathlib import Path


INLINE_TAGS = {
    "b": "bold",
    "strong": "bold",
    "i": "italic",
    "em": "italic",
    "u": "underline",
    "sup": "superscript",
    "sub": "subscript"
}


def tokenize_text(text):
    """
    Split text into normal text and variable placeholders like {varName}
    Returns a list of dicts like {"type": "text", "text": "..."} or {"type": "var", "name": "..."}
    """
    tokens = []
    pattern = re.compile(r'({\w+})')  # match {...}

    parts = pattern.split(text)
    for part in parts:
        if part.startswith("{") and part.endswith("}"):
            tokens.append({ "type": "var", "name": part[1:-1] })  # strip curly braces
        elif part.strip():
            tokens.append({ "type": "text", "text": part })
    return tokens


def clean_text(text):
    # Decode HTML entities like &quot;, &rsquo;, etc.
    text = html.unescape(text)

    # Normalize curly/smart quotes to plain quotes
    replacements = {
        '‚Äò': "'", '‚Äô': "'",
        '‚Äú': '"', '‚Äù': '"',
        '‚Äû': '"', '¬´': '"', '¬ª': '"'
    }
    for orig, repl in replacements.items():
        text = text.replace(orig, repl)

    # Replace non-breaking space and odd encodings
    text = text.replace('\u00A0', ' ').replace('\u00C2', '')

    # Collapse multiple spaces and newlines
    text = re.sub(r'\s+', ' ', text)

    return text


def extract_text_with_links(tag):
    """Extract text and inline tags like <a>, <b>, etc., preserving clean formatting."""
    result = []

    def handle_node(node):
        if isinstance(node, NavigableString):
            text = clean_text(str(node))
            return tokenize_text(text)  # returns a list of {type: text|var}

        elif isinstance(node, Tag):
            if node.name == "a":
                return {
                    "type": "link",
                    "href": node.get("href"),
                    "text": clean_text(node.get_text()),
                    "class": node.get("class")
                }
            elif node.name in INLINE_TAGS:
                return {
                    "type": INLINE_TAGS[node.name],
                    "content": extract_text_with_links(node)
                }
            else:
                # Fallback for other tags
                return {"type": "text", "text": clean_text(node.get_text())}
        return None

    for child in tag.contents:
        item = handle_node(child)
        if item:
            if isinstance(item, list):
                result.extend(item)  # multiple tokens (text + var)
            else:
                result.append(item)


    return result



def parse_html_to_json(soup):
    stack = []
    root = []

    def create_section(tag):
        return {
            "title": tag.get_text(strip=True),
            "level": int(tag.name[1]),
            "content": [],
            "children": []
        }

    # Step 1: Identify the footnotes section so we can exclude its children
    footnotes_section = soup.find("section", class_="footnotes")
    footnote_ids = set()
    if footnotes_section:
        footnote_ids = {id_tag.get("id") for id_tag in footnotes_section.find_all(True) if id_tag.get("id")}

    # Step 2: Get all top-level elements (not inside footnotes)
    elements = []
    for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'div', 'ul', 'ol']):
        parent = tag.find_parent("section", class_="footnotes")
        if not parent and tag.get("id") not in footnote_ids:
            elements.append(tag)



    # Step 3: Parse normal content
    for elem in elements:
        if elem.name in ['h1', 'h2', 'h3']:
            section = create_section(elem)
            while stack and stack[-1]['level'] >= section['level']:
                stack.pop()
            if stack:
                stack[-1]['children'].append(section)
            else:
                root.append(section)
            stack.append(section)

        elif elem.name == 'p':
            if stack:
                stack[-1]['content'].append({
                    "type": "paragraph",
                    "content": extract_text_with_links(elem)
                })

        elif elem.name == 'div':
            div_type = elem.get("class")[0] if elem.get("class") else "other"
            div_content = {
                "type": div_type,
                # **{key: elem.get(key) for key in ["class", "id", "data-caption", "data-src", "data-link"] if elem.get(key) is not None},
                **{key: elem.get(key) for key in elem if elem.get(key) is not None}, ## gets all the attributes
                "text": elem.get_text(strip=True)
            }
            if stack:
                stack[-1]['content'].append(div_content)

        elif elem.name in ['ul', 'ol']:
            items = []
            for li in elem.find_all("li", recursive=False):
                p = li.find("p")
                if p:
                    items.append({
                        "content": extract_text_with_links(p)
                    })
                else:
                    items.append({
                        "content": extract_text_with_links(li)
                    })
            if stack:
                stack[-1]['content'].append({
                    "type": "list",
                    "ordered": elem.name == "ol",
                    "items": items
                })

    # Step 4: Parse footnotes separately
    footnotes = []
    if footnotes_section:
        for i, li in enumerate(footnotes_section.find_all("li"), start=1):
            fn_id = li.get("id", f"fn{i}")
            p = li.find("p")
            if p:
                backlink = p.find("a", class_="footnote-back")
                if backlink:
                    backlink.extract()
                footnotes.append({
                    "id": fn_id,
                    "number": i,
                    "content": extract_text_with_links(p)
                })

    return {
        "content": root,
        "footnotes": footnotes
    }

# Parse html

html_path = "app/grovia_Carbon-LITE_Template/content/content.html"
json_path = "data/grovia_Carbon-LITE_Template.json"

# Load the HTML content
html_content = Path(html_path).read_text(encoding="utf-8")

soup = BeautifulSoup(html_content, "html.parser")
json_data = parse_html_to_json(soup)


# Save the updated structure
with open(json_path, "w") as f:
    json.dump(json_data, f, indent=2)


Doesn't support inline html tags

In [None]:
def parse_html_to_json(soup):
    stack = []
    root = []

    def create_section(tag):
        return {
            "title": tag.get_text(strip=True),
            "level": int(tag.name[1]),
            "content": [],
            "children": []
        }

    # Step 1: Identify the footnotes section so we can exclude its children
    footnotes_section = soup.find("section", class_="footnotes")
    footnote_ids = set()
    if footnotes_section:
        footnote_ids = {id_tag.get("id") for id_tag in footnotes_section.find_all(True) if id_tag.get("id")}

    # Step 2: Get all top-level elements (not inside footnotes)
    elements = []
    for tag in soup.find_all(['h1', 'h2', 'h3', 'p', 'div', 'ul', 'ol']):
        parent = tag.find_parent("section", class_="footnotes")
        if not parent and tag.get("id") not in footnote_ids:
            elements.append(tag)



    # Step 3: Parse normal content
    for elem in elements:
        if elem.name in ['h1', 'h2', 'h3']:
            section = create_section(elem)
            while stack and stack[-1]['level'] >= section['level']:
                stack.pop()
            if stack:
                stack[-1]['children'].append(section)
            else:
                root.append(section)
            stack.append(section)

        elif elem.name == 'p':
            if stack:
                stack[-1]['content'].append({
                    "type": "paragraph",
                    "content": extract_text_with_links(elem)
                })

        elif elem.name == 'div':
            div_type = elem.get("class")[0] if elem.get("class") else "other"
            div_content = {
                "type": div_type,
                **{key: elem.get(key) for key in ["id", "data-caption", "data-src"] if elem.get(key) is not None},
                "text": elem.get_text(strip=True)
            }
            if stack:
                stack[-1]['content'].append(div_content)

        elif elem.name in ['ul', 'ol']:
            items = []
            for li in elem.find_all("li", recursive=False):
                p = li.find("p")
                if p:
                    items.append({
                        "content": extract_text_with_links(p)
                    })
                else:
                    items.append({
                        "content": extract_text_with_links(li)
                    })
            if stack:
                stack[-1]['content'].append({
                    "type": "list",
                    "ordered": elem.name == "ol",
                    "items": items
                })

    # Step 4: Parse footnotes separately
    footnotes = []
    if footnotes_section:
        for i, li in enumerate(footnotes_section.find_all("li"), start=1):
            fn_id = li.get("id", f"fn{i}")
            p = li.find("p")
            if p:
                backlink = p.find("a", class_="footnote-back")
                if backlink:
                    backlink.extract()
                footnotes.append({
                    "id": fn_id,
                    "number": i,
                    "content": extract_text_with_links(p)
                })

    return {
        "content": root,
        "footnotes": footnotes
    }


# Parse html

html_path = "app/grovia_Carbon-LITE_Template/content/content.html"
json_path = "data/grovia_Carbon-LITE_Template.json"

# Load the HTML content
html_content = Path(html_path).read_text()

soup = BeautifulSoup(html_content, "html.parser")
json_data = parse_html_to_json(soup)


# Save the updated structure
with open(json_path, "w") as f:
    json.dump(json_data, f, indent=2)


In [35]:
text = "Table 4. Hello</a>"
text[:-4]

'Table 4. Hello'

In [22]:
from bs4 import BeautifulSoup, Tag
import json
from collections import defaultdict
from pathlib import Path

# Load the HTML content
html_path = "app/grovia_Carbon-LITE_Template/content/content.html"
html_content = Path(html_path).read_text()

## convert docx to json
json_path = "data/grovia_Carbon-LITE_Template.json"

# Parse with BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

# Fix the list duplication issue and ensure list items and footnotes contain correct nested structures
# Improve accuracy in comparing paragraph text to list item content
# Also ensure that <a> tags in footnotes and list items are correctly handled

from bs4 import NavigableString

def extract_inline_content(tag):
    """Extract inline content (text, footnotes, links) from a tag's children."""
    contents = []
    for child in tag.children:
        if isinstance(child, NavigableString):
            text = child.strip()
            if text:
                contents.append({"type": "text", "text": text})
        elif isinstance(child, Tag):
            if child.name == "a":
                if "footnote-ref" in child.get("class", []):
                    contents.append({
                        "type": "footnote-ref",
                        "ref": child.get("href").lstrip("#")
                    })
                else:
                    contents.append({
                        "type": "link",
                        "href": child.get("href"),
                        "text": child.get_text(strip=True)
                    })
            else:
                text = child.get_text(strip=True)
                if text:
                    contents.append({"type": "text", "text": text})
    return contents

def extract_list(tag):
    return {
        "type": "list",
        "ordered": tag.name == "ol",
        "list_type": "ordered" if tag.name == "ol" else "unordered",
        "items": [extract_inline_content(li) for li in tag.find_all("li", recursive=False)]
    }

def parse_html_final(soup):
    elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'div', 'ul', 'ol'])
    stack = []
    root = []
    skip_paragraphs = set()

    def create_section(tag):
        return {
            "title": tag.get_text(strip=True),
            "level": int(tag.name[1]),
            "content": [],
            "children": []
        }

    for i, elem in enumerate(elements):
        if elem.name in ['h1', 'h2', 'h3']:
            section = create_section(elem)
            while stack and stack[-1]['level'] >= section['level']:
                stack.pop()
            if stack:
                stack[-1]['children'].append(section)
            else:
                root.append(section)
            stack.append(section)

        elif elem.name == 'p':
            if elem in skip_paragraphs:
                continue
            if stack:
                content = extract_inline_content(elem)
                stack[-1]['content'].append({
                    "type": "paragraph",
                    "content": content
                })

        elif elem.name in ['ul', 'ol']:
            if stack:
                list_data = extract_list(elem)
                stack[-1]['content'].append(list_data)

                # Check if the next one or two paragraphs match the list items and should be skipped
                list_texts_flat = ["".join(part['text'] for part in item if part['type'] == 'text') for item in list_data["items"]]

                for j in range(1, 3):  # Check next two elements
                    if i + j >= len(elements):
                        break
                    next_elem = elements[i + j]
                    if next_elem.name == 'p':
                        next_text = next_elem.get_text(strip=True)
                        if any(next_text in list_item for list_item in list_texts_flat):
                            skip_paragraphs.add(next_elem)

        elif elem.name == 'div':
            if elem.get("data-navigation") is not None:
                continue
            div_type = "sub-navigation" if elem.get("data-sub-navigation") is not None else (
                elem.get("class")[0] if elem.get("class") else "other"
            )
            div_content = {
                "type": div_type,
                "id": elem.get("id"),
                "data-caption": elem.get("data-caption"),
                "data-parent": elem.get("data-parent"),
                "text": elem.get_text(strip=True)
            }
            if stack:
                stack[-1]['content'].append(div_content)

    # Extract footnotes
    footnotes = []
    footnote_section = soup.find("section", class_="footnotes")
    if footnote_section:
        for li in footnote_section.find_all("li"):
            ref_id = li.get("id")
            contents = extract_inline_content(li)
            footnotes.append({
                "id": ref_id,
                "content": contents
            })

    return {
        "content": root,
        "footnotes": footnotes
    }

# Run the final parser
soup = BeautifulSoup(html_content, "html.parser")
final_structure = parse_html_final(soup)


# Save the updated structure
with open(json_path, "w") as f:
    json.dump(final_structure, f, indent=2)




Old

In [None]:
from bs4 import BeautifulSoup, Tag
import json

def parse_html_structure(html_path: str) -> list:
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    content = []
    stack = []  # For nested heading levels

    def current_container(level: int):
        while stack and stack[-1]["level"] >= level:
            stack.pop()
        return stack[-1]["children"] if stack else content

    def add_to_container(block, level=None):
        container = current_container(level or 999)
        container.append(block)

    body = soup.body if soup.body else soup
    
    for el in body.children:

        if not isinstance(el, Tag):
            continue

        tag = el.name

        # Handle headings
        if tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            level = int(tag[1])
            node = {
                "heading": el.get_text(strip=True),
                "level": level,
                "children": []
            }
            add_to_container(node, level)
            stack.append(node)

        # Handle paragraphs
        elif tag == "p":
            add_to_container({ "type": "paragraph", "content": el.get_text(strip=True) })

        # Handle div-based custom content
        elif tag == "div":
            classes = el.get("class", [])
            data = el.attrs

            if "icon" in classes:
                add_to_container({
                    "type": "icon",
                    "icon": data.get("data-icon"),
                    "label": data.get("data-label")
                })
            elif "chart" in classes:
                add_to_container({ "type": "chart", "id": data.get("id") })
            elif "image" in classes:
                add_to_container({
                    "type": "image",
                    "src": data.get("data-src"),
                    "caption": data.get("data-caption")
                })
            elif "table" in classes:
                add_to_container({ "type": "table", "id": data.get("id") })
            else:
                add_to_container({ "type": "unknown-div", "class": classes, "attributes": data })

        # Handle real tables
        elif tag == "table":
            headers = []
            rows = []
            for tr in el.find_all("tr"):
                cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
                if tr.find("th"):
                    headers = cells
                else:
                    rows.append(cells)

            add_to_container({
                "type": "table",
                "headers": headers,
                "rows": rows
            })

        # Optionally handle <ul>, <ol>, etc.

    return content


In [12]:
out = parse_html_structure(html_path = "app/grovia_Carbon-LITE_Template/content/content.html")

In [13]:
out

[{'type': 'unknown-div', 'class': [], 'attributes': {'data-navigation': ''}},
 {'heading': 'Overview',
  'level': 1,
  'children': [{'type': 'paragraph',
    'content': 'The following is a property-specific assessment of select variables\n\nthat are important to consider when assessing the feasibility of\n\ndeveloping anEnvironmental\n\nPlanting (EP).'},
   {'type': 'paragraph',
    'content': 'You will be able to view fundamental information regarding the\n\npotential opportunity and obstacles to project development.'},
   {'type': 'paragraph',
    'content': 'With a PRO and PRO+subscription you will also have access\n\nto more elaborate information and interpretation of additional factors\n\nthat are required for critical review in the decision-making process of\n\nyour land assessment and project development.'},
   {'type': 'unknown-div',
    'class': [],
    'attributes': {'data-sub-navigation': '', 'data-parent': 'overview'}},
   {'heading': 'Summary of Assessment',
    'level': 2

#### docx to json conversion

In [None]:
import importlib

dc = importlib.reload(dc)

docx_path = "data/grovia_Carbon-LITE_Template.docx"

## convert docx to json
json_path = "data/grovia_Carbon-LITE_Template.json"


def simplify(ast_json):
    import json
    parsed = json.loads(ast_json)
    return [{'type': block['t'], 'content': block['c']} for block in parsed['blocks']]


## convert html to json
html_path = "app/grovia_Carbon-LITE_Template/content/content.html"

html_data = pypandoc.convert_file(html_path, 'json')
html_data_simplified = simplify(html_data)
with open(json_path, 'w') as f:
    json.dump(html_data_simplified, f, indent=4)


#### Custom attempt

In [None]:
from docx import Document
import json
import base64
import os

docx_path = "data/grovia_Carbon-LITE_Template.docx"

## convert docx to json
json_path = "data/grovia_Carbon-LITE_Template.json"

def extract_paragraph(paragraph):
    parts = []
    for run in paragraph.runs:
        text = run.text.strip()
        if not text:
            continue
        part = {"text": text}
        if run.bold:
            part["bold"] = True
        if run.italic:
            part["italic"] = True
        parts.append(part)
    return parts

def extract_list_type(paragraph):
    # Simple bullet detection (imperfect but works for many styles)
    if paragraph.style.name.startswith('List Bullet'):
        return 'ul'
    elif paragraph.style.name.startswith('List Number'):
        return 'ol'
    return None

def extract_images(document, base64_encode=True):
    images = []
    rels = document.part._rels
    for rel in rels:
        rel = rels[rel]
        if "image" in rel.target_ref:
            image_data = rel.target_part.blob
            image_name = os.path.basename(rel.target_ref)
            image_obj = {
                "name": image_name,
                "data": base64.b64encode(image_data).decode() if base64_encode else None
            }
            images.append(image_obj)
    return images

def docx_to_json(filepath, include_images=True):
    document = Document(filepath)
    pages = []
    current_page = None

    for para in document.paragraphs:
        style = para.style.name
        text = para.text.strip()

        if not text and not para.runs:
            continue

        # New page starts with Heading 1
        if style.startswith("Heading 1"):
            if current_page:
                pages.append(current_page)
            current_page = {
                "header": text,
                "content": []
            }
        elif current_page:
            # Process block content
            block = {
                "type": style,
                "text": extract_paragraph(para)
            }
            list_type = extract_list_type(para)
            if list_type:
                block["list_type"] = list_type
            current_page["content"].append(block)
        else:
            # Skip content before first H1
            continue

    # Append last page
    if current_page:
        pages.append(current_page)

    result = {"pages": pages}
    if include_images:
        result["images"] = extract_images(document)

    return result


# Example usage:
json_data = docx_to_json(docx_path)
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2)


TypeError: 'NoneType' object is not subscriptable

#### Original

In [6]:
img_matches = []
if img_matches:
    for match in img_matches:
        print(f"Image found: {match.group()}")

In [3]:
tables

{'table_0': {'headers': [{'text': 'Project Name:',
    'textAlign': 'right',
    'verticalAlign': 'middle'},
   {'text': 'Armidale', 'textAlign': 'left', 'verticalAlign': 'middle'}],
  'rows': [[{'text': 'Location:',
     'bold': True,
     'textAlign': 'right',
     'verticalAlign': 'middle'},
    {'text': 'Armidale, NSW', 'textAlign': 'left', 'verticalAlign': 'middle'}],
   [{'text': 'Project Type:',
     'bold': True,
     'textAlign': 'right',
     'verticalAlign': 'middle',
     'iconHtml': "<span class='material-symbols-outlined'>psychiatry</span>",
     'iconPosition': 'start'},
    {'text': 'Environmental Planting',
     'textAlign': 'left',
     'verticalAlign': 'middle'}],
   [{'text': 'Project Area:',
     'bold': True,
     'textAlign': 'right',
     'verticalAlign': 'middle'},
    {'text': '1804.0 ha', 'textAlign': 'left', 'verticalAlign': 'middle'}],
   [{'text': 'Planting Area:',
     'bold': True,
     'textAlign': 'right',
     'verticalAlign': 'middle'},
    {'text': 

In [5]:
initial_html_clean

'<h1 id="overview">Overview</h1>\r\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a\r\nhref="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\r\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\r\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\r\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\r\n<table>\r\n<colgroup>\r\n<col style="width: 21%" />\r\n<col style="width: 78%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th style="text-align: right;">Project Name:</th>\r\n<th>Armidale</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\

In [79]:
images_src

{'./media/image22.png': 'timeline'}

In [None]:
from collections import namedtuple

## figure number
def get_figure_captions(html_content, doc_img_src: list) -> namedtuple:
    soup = BeautifulSoup(html_content, 'html.parser')
    figure_tags = soup.find_all('figure')
    figure_number = 0
    figure_number_new = 0
    Image = namedtuple('Image', ["figure_number", "figure_number_new", "figure_caption", "alt_text"])
    result = {}

    for figure in figure_tags:
        figure_number += 1
        if not figure.find('img') or figure.find('img')['src'] not in doc_img_src:
            continue
        figure_number_new += 1
        figure_caption = figure.find('figcaption').text if figure.find('figcaption') else ""
        alt_text = figure.find('img')['alt'] if figure.find('img')['alt'] else ""
        current_img = os.path.splitext(os.path.basename(figure.find('img')['src']))[0]
        result[current_img] = Image(figure_number, figure_number_new, figure_caption, alt_text)

    return result


In [41]:
doc_img_src = [images_dict[img]['path_doc'] for img in images_dict]

In [42]:
doc_img_src

['./media/image2.png',
 './media/image11.png',
 './media/image12.png',
 './media/image13.png',
 './media/image14.png',
 './media/image15.png',
 './media/image16.png',
 './media/image18.png',
 './media/image22.png']

In [80]:
html_images

'<h1 id="overview">Overview</h1>\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a href="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\n<table>\n<colgroup>\n<col style="width: 21%"/>\n<col style="width: 78%"/>\n</colgroup>\n<thead>\n<tr>\n<th style="text-align: right;">Project Name:</th>\n<th>Armidale</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td style="text-align: right;"><

In [66]:
out

{'image11': Image(figure_number=8, figure_number_new=1, figure_caption='. Precipitation', alt_text='keep-chart-precipitation'),
 'image12': Image(figure_number=9, figure_number_new=2, figure_caption='. Temperature', alt_text='keep-chart-temperature'),
 'image13': Image(figure_number=10, figure_number_new=3, figure_caption='. Frost', alt_text='keep-chart-frost'),
 'image15': Image(figure_number=11, figure_number_new=4, figure_caption='. Standardised Precipitation Evapotranspiration Index\r\n(SPEI-6) over the last 20 years for the project site', alt_text='keep-chart-drought'),
 'image22': Image(figure_number=16, figure_number_new=5, figure_caption='. A timeline outlining the key stages of a carbon\r\nproject, from initial desktop assessment and site visits to CER project\r\nregistration and ongoing monitoring, reporting, and verification\r\npost-planting.', alt_text='keep-timeline')}

In [None]:
def find_image_alt_text(html_content):
    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # create a named tuple to store the image filename and alt text
    Image = collections.namedtuple('Image', ['path_doc', 'path_new', 'alt_text', 'figure_number_doc', 'figure_number_new', 'caption'])

    # Map image information to a list of named tuples
    image_mapping = {}

    for 

    # Find all image tags in the HTML
    for img in soup.find_all('img'):
        # Check if the image tag has a parent figure tag
        figure = img.find_parent('figure')

        # Get the image filename and alt text
        filename = img['src']
        alt_text = img.get('alt', '')
        caption = figure.find('figcaption').text if figure and figure.find('figcaption') else ''

        # Store the image information in the named tuple
        image_mapping[filename] = Image(filename, alt_text, figure.number if figure else None, caption)

    # Create a mapping of image filename to alt text from the HTML
    html_alt_map = {
        img['src']: img.get('alt', '')
        for img in soup.find_all('img') if 'src' in img.attrs
    }

    return html_alt_map

{'image2': 'assets/image2.png',
 'image11': 'assets/image11.png',
 'image12': 'assets/image12.png',
 'image13': 'assets/image13.png',
 'image14': 'assets/image14.png',
 'image15': 'assets/image15.png',
 'image16': 'assets/image16.png',
 'image18': 'assets/image18.png',
 'image22': 'assets/image22.png'}

In [6]:
image_mapping

{'./media/image2.png': 'icon-project_type',
 './media/image11.png': 'chart-precipitation',
 './media/image12.png': 'chart-temperature',
 './media/image13.png': 'chart-frost',
 './media/image14.png': 'icon-drought',
 './media/image15.png': 'chart-drought',
 './media/image16.png': 'icon-bushfire',
 './media/image18.png': 'icon-flood',
 './media/image22.png': 'timeline'}

In [None]:
# Store alt text and caption
alt_text_map[img_id]['alt_text'] = alt_text.removeprefix(ALT_TEXT_KEEP_PREFIX) if alt_text else ""
alt_text_map[img_id]['figcaption'] = figcaption_text

{'./media/image2.png': 'Project-Type',
 './media/image11.png': 'chart-precipitation',
 './media/image12.png': 'chart-temperature',
 './media/image13.png': 'chart-frost',
 './media/image14.png': 'icon-drought',
 './media/image15.png': 'chart-drought',
 './media/image16.png': 'icon-bushfire',
 './media/image18.png': 'icon-flood',
 './media/image22.png': 'timeline'}

In [6]:
images_dict

{'image2': {'path': 'assets/image2.png', 'alt_text': ''},
 'image11': {'path': 'assets/image11.png',
  'alt_text': 'chart-precipitation',
  'figcaption': '<p>Figure 1. Precipitation</p>'},
 'image12': {'path': 'assets/image12.png',
  'alt_text': 'chart-temperature',
  'figcaption': '<p>Figure 2. Temperature</p>'},
 'image13': {'path': 'assets/image13.png',
  'alt_text': 'chart-frost',
  'figcaption': '<p>Figure 3. Frost</p>'},
 'image14': {'path': 'assets/image14.png', 'alt_text': ''},
 'image15': {'path': 'assets/image15.png',
  'alt_text': 'chart-drought',
  'figcaption': '<p>Figure 4. Standardised Precipitation Evapotranspiration Index\r\n(SPEI-6) over the last 20 years for the project site</p>'},
 'image16': {'path': 'assets/image16.png', 'alt_text': ''},
 'image18': {'path': 'assets/image18.png', 'alt_text': ''},
 'image22': {'path': 'assets/image22.png',
  'alt_text': 'timeline',
  'figcaption': '<p>Figure 5. A timeline outlining the key stages of a carbon\r\nproject, from initia

In [5]:
html_captions_removed

'<h1 id="overview">Overview</h1>\r\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a\r\nhref="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\r\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\r\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\r\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\r\n<table>\r\n<colgroup>\r\n<col style="width: 21%" />\r\n<col style="width: 78%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th style="text-align: right;">Project Name:</th>\r\n<th>Armidale</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\

In [4]:
html_images

'<h1 id="overview">Overview</h1>\r\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a\r\nhref="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\r\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\r\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\r\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\r\n<table>\r\n<colgroup>\r\n<col style="width: 21%" />\r\n<col style="width: 78%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th style="text-align: right;">Project Name:</th>\r\n<th>Armidale</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\

In [None]:
# Convert DOCX to HTML
html_initial2 = pypandoc.convert_file(
    compatible_docx_path, 
    "html", 
    extra_args=[
        "--quiet",
        "--extract-media=."  # Extract media to the current directory
    ]
)

In [33]:
html_initial2

'<p><img src="./media/image1.png"\r\nstyle="width:8.26217in;height:11.68122in" /></p>\r\n<p>Carbon PRO Report</p>\r\n<p>Environmental Planting</p>\r\n<p>Armidale, NSW</p>\r\n<p>12 March 2025</p>\r\n<h1 id="overview">Overview</h1>\r\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a\r\nhref="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\r\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\r\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\r\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\r\n<table>\r\n<colgroup>\r\n<c

In [30]:
metadata_json = dc.generate_lua_lookup_table(keep_images)

In [31]:
# Convert DOCX to HTML
html_initial = pypandoc.convert_file(
    compatible_docx_path, 
    "html", 
    extra_args=[
        "--quiet",
        f"--lua-filter={lua_script}",  # Replace with your actual Lua filter file
        "--extract-media=.",  # Extract media to the current directory
        "--metadata", f"keep_images={metadata_json}"  # Pass as JSON
    ]
)

In [32]:
html_initial

'<h1 id="overview">Overview</h1>\r\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a\r\nhref="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\r\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\r\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\r\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\r\n<table>\r\n<colgroup>\r\n<col style="width: 21%" />\r\n<col style="width: 78%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th style="text-align: right;">Project Name:</th>\r\n<th>Armidale</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\

In [39]:
initial_html

'<h1 id="overview">Overview</h1>\r\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a\r\nhref="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\r\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\r\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\r\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\r\n<table>\r\n<colgroup>\r\n<col style="width: 21%" />\r\n<col style="width: 78%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th style="text-align: right;">Project Name:</th>\r\n<th>Armidale</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\

In [35]:
initial_html_clean

'<h1 id="overview">Overview</h1>\r\n<p>The following is a property-specific assessment of select variables\r\nthat are important to consider when assessing the feasibility of\r\ndeveloping an <a\r\nhref="What%20is%20an%20Environmental%20Planting%20(EP)">Environmental\r\nPlanting (EP).</a></p>\r\n<p>You will be able to view fundamental information regarding the\r\npotential opportunity and obstacles to project development.\xa0</p>\r\n<p>With a PRO and PRO<sup>+</sup> subscription you will also have access\r\nto more elaborate information and interpretation of additional factors\r\nthat are required for critical review in the decision-making process of\r\nyour land assessment and project development.</p>\r\n<h2 id="summary-of-assessment">Summary of Assessment</h2>\r\n<table>\r\n<colgroup>\r\n<col style="width: 21%" />\r\n<col style="width: 78%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th style="text-align: right;">Project Name:</th>\r\n<th>Armidale</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\

In [52]:
html_captions_removed

'<h1 id="overview">Overview</h1>\r\n<p>This section provides an overview of the property and carbon\r\nassessment.</p>\r\n<h2 id="preview">Preview</h2>\r\n<table>\r\n<caption><p>Table 1. Summary of Assessment</p></caption>\r\n<colgroup>\r\n<col style="width: 50%" />\r\n<col style="width: 50%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th>Item assessed</th>\r\n<th>Assessment</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\n<tr>\r\n<td>Planting Area</td>\r\n<td><strong>Moderate risk.</strong> Lots of risk.</td>\r\n</tr>\r\n<tr>\r\n<td>Carbon returns</td>\r\n<td><strong>High risk.</strong> Only 4.1 ACCUs/ha.</td>\r\n</tr>\r\n<tr>\r\n<td>Topography</td>\r\n<td><strong>Low risk.</strong> Little to no areas with slope greater\r\nthan 15 degrees.</td>\r\n</tr>\r\n</tbody>\r\n</table>\r\n<p>Located in the Dandenong ranges amongst wet sclerophyll forests and\r\ncarbon rich soils. Your dream carbon project awaits‚Ä¶</p>\r\n<figure>\r\n<img src="./media/image2.png" style="width:6.03278in;height:2.85398in"\r

In [38]:
keep_images

[2, 14, 16, 18, 22]

In [37]:
alt_text_map

{'image2': {'path': 'assets/image2.png', 'alt_text': ''},
 'image14': {'path': 'assets/image14.png',
  'alt_text': 'chart-frost',
  'figcaption': '<p>Figure 1. Frost</p>'},
 'image16': {'path': 'assets/image16.png',
  'alt_text': 'chart-drought',
  'figcaption': '<p>Figure 2. Standardised Precipitation Evapotranspiration Index\r\n(SPEI-6) over the last 20 years for the project site</p>'},
 'image18': {'path': 'assets/image18.png',
  'alt_text': 'A map of land with yellow and orange dots AI-generated content may be incorrect.',
  'figcaption': '<p>Figure 3. Bushfires in the region</p>'},
 'image22': {'path': 'assets/image22.png',
  'alt_text': 'A map of a land with green squares AI-generated content may be incorrect.',
  'figcaption': '<p>Figure 4. Major Vegetation Groups (MVG)</p>'}}

In [70]:
int(re.sub('image', '', list(alt_text_map.keys())[0])) - 1

1

In [60]:
images_dict

{'image2': {'path': 'assets/image2.png',
  'alt_text': 'location',
  'figcaption': '<p>Figure 1. Location within the region and state. This will\r\nuse a different satellite colour scheme (faded) but for some reason I\r\ncouldn‚Äôt get it to work</p>'},
 'image5': {'path': 'assets/image5.png',
  'alt_text': 'accu_breakdown',
  'figcaption': '<p>Figure 5. Accumulation curve of gross ACCUs generated\r\neach year over the 25-year permanence period. The first recommended\r\nreporting period of the permanence period is illustrated by a yellow\r\nstar</p>'},
 'image13': {'path': 'assets/image13.png',
  'alt_text': 'precipitation',
  'figcaption': '<p>Figure 12. Precipitation</p>'},
 'image14': {'path': 'assets/image14.png',
  'alt_text': 'temperature',
  'figcaption': '<p>Figure 13. Temperature</p>'},
 'image15': {'path': 'assets/image15.png',
  'alt_text': 'frost',
  'figcaption': '<p>Figure 14. Frost</p>'},
 'image23': {'path': 'assets/image23.png',
  'alt_text': 'timeline',
  'figcaption'

In [61]:
html_images

'<h1 id="overview">Overview</h1>\r\n<p>This section provides an overview of the property and carbon\r\nassessment.</p>\r\n<h2 id="preview">Preview</h2>\r\n<table>\r\n<caption><p>Table 1. Summary of Assessment</p></caption>\r\n<colgroup>\r\n<col style="width: 50%" />\r\n<col style="width: 50%" />\r\n</colgroup>\r\n<thead>\r\n<tr>\r\n<th>Item assessed</th>\r\n<th>Assessment</th>\r\n</tr>\r\n</thead>\r\n<tbody>\r\n<tr>\r\n<td>Planting Area</td>\r\n<td><strong>Moderate risk.</strong> Lots of risk.</td>\r\n</tr>\r\n<tr>\r\n<td>Carbon returns</td>\r\n<td><strong>High risk.</strong> Only 4.1 ACCUs/ha.</td>\r\n</tr>\r\n<tr>\r\n<td>Topography</td>\r\n<td><strong>Low risk.</strong> Little to no areas with slope greater\r\nthan 15 degrees.</td>\r\n</tr>\r\n</tbody>\r\n</table>\r\n<p>Located in the Dandenong ranges amongst wet sclerophyll forests and\r\ncarbon rich soils. Your dream carbon project awaits‚Ä¶</p>\r\n<div data-image="assets/image2.png" data-caption="&lt;p&gt;Figure 1. Location within