# Split Large Documents

We have a number of genAI applications in which we use LLMs to assess large documents. Some are many times the context window.

There's an interesting approach involving chunking which we're exploring separately, but for one particular extremely large document I'd like to try to do it programmatically. The doc has a table of contents which links to the headings, so I feel there's enough structure there to work with.

In [1]:
import docx
import os

The below functions were written by ChatGPT3.5turbo for docx. Not tested yet.

In [None]:
# Collect the text of every paragraph that uses a "Heading..." style
def extract_headings(doc):
    """Return the stripped text of each heading-styled paragraph in ``doc``.

    A paragraph counts as a heading when its style name starts with
    ``'Heading'``. The result keeps document order and may contain duplicates
    or empty strings.
    """
    return [
        paragraph.text.strip()
        for paragraph in doc.paragraphs
        if paragraph.style.name.startswith('Heading')
    ]
# Write one section out as its own Word file
def save_section(heading, content, output_dir):
    """Save the paragraphs in ``content`` as ``<heading>.docx`` in ``output_dir``.

    Each paragraph's text is copied into a fresh document and given the source
    paragraph's style. The file name is the heading reduced to alphanumerics,
    spaces and underscores, with trailing whitespace removed.
    """
    section_doc = docx.Document()
    for source_para in content:
        copied = section_doc.add_paragraph(source_para.text)
        copied.style = source_para.style

    # Keep only characters that are safe in a file name
    allowed_extras = (' ', '_')
    kept_chars = [ch for ch in heading if ch.isalnum() or ch in allowed_extras]
    safe_heading = ''.join(kept_chars).rstrip()
    file_path = os.path.join(output_dir, f"{safe_heading}.docx")

    section_doc.save(file_path)
    print(f"Saved section '{heading}' to '{file_path}'")
# Function to split document based on headings
def split_doc_by_headings(doc, headings, output_dir):
    """Split ``doc`` into sections at each heading and save every section.

    Args:
        doc: Object exposing ``paragraphs``; each paragraph needs a ``text``
            attribute.
        headings: Iterable of heading strings. A paragraph starts a new
            section when its stripped text equals one of them.
        output_dir: Directory for the section files; created if missing.

    Paragraphs before the first matching heading are not saved. Each section
    starts with its heading paragraph and is written via ``save_section``.
    """
    # Empty headings are ignored: otherwise every blank paragraph would start
    # a section with an empty (falsy) name whose content is never written.
    heading_texts = {heading for heading in headings if heading}

    os.makedirs(output_dir, exist_ok=True)

    current_section = None
    section_content = []
    for para in doc.paragraphs:
        stripped = para.text.strip()
        if stripped in heading_texts:
            # Flush the section that just ended before starting the next one
            if current_section is not None:
                save_section(current_section, section_content, output_dir)
            current_section = stripped
            section_content = []
        section_content.append(para)

    # Save the last section
    if current_section is not None:
        save_section(current_section, section_content, output_dir)
# Main function
def split_document_by_toc(input_doc_path, output_dir):
    """Open the Word file at ``input_doc_path`` and split it by its headings.

    The headings are gathered with ``extract_headings`` and the per-section
    files are written to ``output_dir`` by ``split_doc_by_headings``.
    """
    document = docx.Document(input_doc_path)
    split_doc_by_headings(document, extract_headings(document), output_dir)

# Usage
# NOTE(review): this path ends in .odt, yet the split_document_by_toc defined
# in this cell opens it with docx.Document -- confirm python-docx can read it,
# or point this at a .docx export of the document.
input_doc_path = "/Users/joe/Documents/git/defra/docs/SFI 2024 actions MASTER v1.1b clean for pdf.odt"
output_dir = "output/folder"
split_document_by_toc(input_doc_path, output_dir)


The below functions were written by ChatGPT3.5turbo for odf, then had to be heavily modified to make them actually work...

In [3]:
from odf.opendocument import load
from odf.text import P, H
from odf.opendocument import OpenDocumentText
import os

# Collect the text of every heading element in the document
def extract_headings(doc):
    """Return the stripped text content of each ``H`` element found in ``doc``."""
    return [heading.textContent.strip() for heading in doc.getElementsByType(H)]

# Function to split document based on headings
def split_doc_by_headings(doc, headings, output_dir):
    """Split ``doc`` into sections at each heading and save every section.

    Args:
        doc: Loaded ODF document.
        headings: Iterable of heading strings. A paragraph starts a new
            section when its stripped text content equals one of them.
        output_dir: Directory for the section files; created if missing.

    Paragraphs before the first matching heading are not saved. Each section
    starts with its heading paragraph and is written via ``save_section``.
    """
    # Empty headings are ignored: otherwise every blank paragraph would start
    # a section with an empty (falsy) name whose content is never written.
    heading_texts = {heading for heading in headings if heading}

    os.makedirs(output_dir, exist_ok=True)

    current_section = None
    section_content = []
    # NOTE(review): only P elements are scanned here, while extract_headings
    # reads H elements -- confirm the heading text also appears in a P element,
    # otherwise no section will ever start.
    for element in doc.getElementsByType(P):
        stripped = element.textContent.strip()
        if stripped in heading_texts:
            # Flush the section that just ended before starting the next one
            if current_section is not None:
                save_section(current_section, section_content, output_dir)
            current_section = stripped
            section_content = []
        section_content.append(element)

    # Save the last section
    if current_section is not None:
        save_section(current_section, section_content, output_dir)

# Write one section out as its own ODT file
def save_section(heading, content, output_dir):
    """Save the elements in ``content`` as ``<heading>.odt`` in ``output_dir``.

    Only the text content of each element is copied, one new paragraph per
    element. The file name is the heading reduced to alphanumerics, spaces
    and underscores, with trailing whitespace removed.
    """
    section_doc = OpenDocumentText()
    for source_element in content:
        section_doc.text.addElement(P(text=source_element.textContent))

    # Keep only characters that are safe in a file name
    allowed_extras = (' ', '_')
    kept_chars = [ch for ch in heading if ch.isalnum() or ch in allowed_extras]
    safe_heading = ''.join(kept_chars).rstrip()
    file_path = os.path.join(output_dir, f"{safe_heading}.odt")

    section_doc.save(file_path)
    print(f"Saved section '{heading}' to '{file_path}'")

# Main function
def split_document_by_toc(input_doc_path, output_dir):
    """Load the ODT file at ``input_doc_path`` and split it by its headings.

    The headings are gathered with ``extract_headings`` and the per-section
    files are written to ``output_dir`` by ``split_doc_by_headings``.
    """
    document = load(input_doc_path)
    split_doc_by_headings(document, extract_headings(document), output_dir)


In [None]:

# Usage
# Run the ODF-based splitter on the source document; section files are
# written under output/expanded_sections.
input_doc_path = "/Users/joe/Documents/git/defra/docs/SFI 2024 actions MASTER v1.1b clean for pdf.odt"
output_dir = "output/expanded_sections"
split_document_by_toc(input_doc_path, output_dir)


It did not work. Let's unpick it and see if the problem's with the document.

In [11]:
# Load the ODT once so the following cells can inspect it interactively.
doc = load(input_doc_path)

In [28]:
from odf import text

def get_toc_elements(doc):
    """Return the paragraph elements of ``doc`` styled ``TOC1`` or ``TOC2``.

    Elements are returned in the order ``getElementsByType`` yields them.
    """
    toc_styles = ('TOC1', 'TOC2')
    return [
        candidate
        for candidate in doc.getElementsByType(text.P)
        if candidate.getAttribute('stylename') in toc_styles
    ]



# Collect the TOC1/TOC2-styled paragraphs from the loaded document.
paragraphs = get_toc_elements(doc)

From the above it's clear the doc doesn't use any of the 'H' predefined styles for headings. But we do have a very nice TOC and those bits have to be in the predefined TOC1 and TOC2 styles.

Inspection of the document shows the actions are all TOC2 styles and they're grouped in TOC1 categories that differentiate them based on their purpose. This is something Ian was trying to capture so let's run with that:

- identify all TOC1 headings and the TOC2 ones below each
- for each TOC2 use the link to identify the corresponding section?
- split and save in TOC1/TOC2.odt

As suspected, the links in the TOC point to unique anchors. I'm going to need to know the anchor and the following one. There's a corner case when the TOC2 element is the last in its TOC1, but let's see how we go.

Used the prompt:

`please write code to extract a section of the document without reference to its own style, but using the links associated with a TOC2 element and the subsequent TOC2 element` 

In [43]:
from odf.opendocument import load
from odf.text import P, A, H, TocMark
from odf.opendocument import OpenDocumentText
import os

# Function to extract TOC entries and the targets of their links
def extract_toc2_sections(doc):
    """Return ``(entry_text, href)`` pairs for the linked TOC entries in ``doc``.

    Despite the name, paragraphs styled either ``TOC1`` or ``TOC2`` are
    considered. An entry is included only if it contains at least one ``A``
    (hyperlink) element; the first such link supplies the href.

    Returns:
        list of ``(stripped entry text, href)`` tuples in document order.
    """
    toc_styles = ('TOC1', 'TOC2')
    toc2_elements = []
    # Use the P imported alongside this function rather than relying on the
    # separate `from odf import text` import made in another cell.
    for toc_entry in doc.getElementsByType(P):
        if toc_entry.getAttribute('stylename') not in toc_styles:
            continue
        links = toc_entry.getElementsByType(A)
        if not links:
            continue
        # odfpy takes the bare attribute name without a namespace prefix, as
        # with 'stylename' above, so the link target is 'href', not
        # 'xlink:href'.
        href = links[0].getAttribute('href')
        toc2_elements.append((toc_entry.textContent.strip(), href))
    return toc2_elements

# Function to extract the section content between two TOC2 links
def extract_section_by_toc2_links(doc, start_link, end_link):
    """Collect the paragraphs from the ``start_link`` target to the ``end_link`` target.

    Args:
        doc: Loaded ODF document.
        start_link: Link target of the first paragraph to collect.
        end_link: Link target of the last paragraph to collect.

    Returns:
        The paragraph elements from the one whose id matches ``start_link`` up
        to and including the one whose id matches ``end_link``. The list is
        empty if the end paragraph is met before the start paragraph.

    NOTE(review): this assumes the link target is stored as the paragraph's
    own id. If the document anchors its TOC links on bookmark elements
    instead, nothing will match -- confirm against the document XML.
    """
    def anchor_of(link):
        # Presumably hrefs look like '#anchor' while ids carry no '#'; drop
        # the prefix so the two can be compared.
        if link and link.startswith('#'):
            return link[1:]
        return link

    start_id = anchor_of(start_link)
    end_id = anchor_of(end_link)

    in_section = False
    section_content = []
    for para in doc.getElementsByType(P):
        # odfpy takes the bare attribute name ('id'), not 'text:id'
        para_id = para.getAttribute('id')

        if para_id == start_id:
            in_section = True  # Start collecting at the start anchor

        if in_section:
            section_content.append(para)

        if para_id == end_id:
            break  # The end paragraph itself has already been collected

    return section_content

# Write the collected paragraphs out as a new ODT file
def save_section(content, output_filename):
    """Save the text of each element in ``content`` to ``output_filename``.

    Every element becomes one plain paragraph in a new text document; only
    its text content is carried over.
    """
    section_doc = OpenDocumentText()
    add_to_body = section_doc.text.addElement
    for source_para in content:
        add_to_body(P(text=source_para.textContent))

    section_doc.save(output_filename)
    print(f"Saved section to '{output_filename}'")

# Main function to extract and save the section between two TOC2 elements
def extract_and_save_section_by_toc2(input_doc_path, toc2_start, toc2_end, output_filename):
    """Extract the content between two TOC entries and save it as a new ODT file.

    Args:
        input_doc_path: Path of the ODT document to read.
        toc2_start: Title of the TOC entry where the section starts.
        toc2_end: Title of the TOC entry where the section ends.
        output_filename: Path of the ODT file to write.

    Raises:
        ValueError: If either TOC entry cannot be found. The end entry must
            not appear before the start entry.
    """
    # Load the ODT document
    doc = load(input_doc_path)

    # The loop below needs (title, link) pairs; get_toc_elements returns bare
    # elements, which cannot be unpacked that way.
    toc2_elements = extract_toc2_sections(doc)

    # A TOC entry's text carries the page number after the title (e.g.
    # "SFI actions for soil health5"), so match on the title as a prefix.
    start_link = end_link = None
    for toc2_title, link in toc2_elements:
        if start_link is None and toc2_title.startswith(toc2_start):
            start_link = link
        if start_link is not None and toc2_title.startswith(toc2_end):
            end_link = link
            break

    # Check if both start and end links were found
    if not start_link or not end_link:
        raise ValueError(f"Could not find start or end TOC2 elements: {toc2_start}, {toc2_end}")

    # Extract section between these TOC2 links
    section_content = extract_section_by_toc2_links(doc, start_link, end_link)

    # Save the extracted section as a new ODT file
    save_section(section_content, output_filename)


In [None]:

# Usage example
# Extract everything between the two named TOC entries into output_section.odt.
input_doc_path = "/Users/joe/Documents/git/defra/docs/SFI 2024 actions MASTER v1.1b clean for pdf.odt"
toc2_start = "CSAM1: Assess soil, produce a soil management plan and test soil organic matter"  # Title of the starting TOC2 section
toc2_end = "CNUM3: Legume fallow"  # Title of the ending TOC2 section
output_filename = "output_section.odt"

extract_and_save_section_by_toc2(input_doc_path, toc2_start, toc2_end, output_filename)


In [None]:
# Build the (entry text, link target) pairs and display them as the cell output.
toc2_elements = extract_toc2_sections(doc)
toc2_elements

In [45]:
# Gather every paragraph styled TOC1 or TOC2 so the entries can be inspected
# directly, outside of extract_toc2_sections.
toc_entries = []
for element in doc.getElementsByType(text.P):
    style = element.getAttribute('stylename')
    if style=='TOC1' or style=='TOC2':
        #print(f"{style}, {paragraph}")
        toc_entries.append(element)

# Display the collected elements as the cell output
toc_entries

[<odf.element.Element at 0x123adcfe0>,
 <odf.element.Element at 0x123add3a0>,
 <odf.element.Element at 0x123add6d0>,
 <odf.element.Element at 0x123adda30>,
 <odf.element.Element at 0x123addd90>,
 <odf.element.Element at 0x123ade0f0>,
 <odf.element.Element at 0x123ade450>,
 <odf.element.Element at 0x123ade7b0>,
 <odf.element.Element at 0x123adeb10>,
 <odf.element.Element at 0x123adee70>,
 <odf.element.Element at 0x123adf350>,
 <odf.element.Element at 0x123adf6b0>,
 <odf.element.Element at 0x123adfa10>,
 <odf.element.Element at 0x123adfd70>,
 <odf.element.Element at 0x123afc110>,
 <odf.element.Element at 0x123afc470>,
 <odf.element.Element at 0x123afc7d0>,
 <odf.element.Element at 0x123afcb30>,
 <odf.element.Element at 0x123afce90>,
 <odf.element.Element at 0x123afd1f0>,
 <odf.element.Element at 0x123afd550>,
 <odf.element.Element at 0x123afd8b0>,
 <odf.element.Element at 0x123afdc10>,
 <odf.element.Element at 0x123afdf70>,
 <odf.element.Element at 0x123afe2d0>,
 <odf.element.Element at 

In [52]:
# Show the first hyperlink element found inside the first TOC entry.
print(toc_entries[0].getElementsByType(A)[0])

SFI actions for soil health5


In [None]:

toc2_elements = []
# Pair each TOC entry's text with the target of its first hyperlink;
# entries that contain no hyperlink are skipped.
for toc_entry in toc_entries:
    link_element = toc_entry.getElementsByType(A)
    if link_element:
        # odfpy takes the bare attribute name without a namespace prefix, as
        # with 'stylename' elsewhere in this notebook, so use 'href' rather
        # than 'xlink:href'.
        link = link_element[0].getAttribute("href")
        toc2_elements.append((toc_entry.textContent.strip(), link))

This all doesn't seem to work. I suspect the bit of code that handles these `TocMark` elements doesn't work on the `P` elements the revised version is returning - none of them have hrefs. The documentation is pretty difficult to work with, but given that I've identified the text of the TOC2 headings I'm going to try using text scanning instead.

In [None]:
from odf.opendocument import load, OpenDocumentText
from odf.text import P

def extract_section_by_strings(input_doc_path, start_string, end_string, output_doc_path):
    """Copy the paragraphs between two marker strings into a new ODT file.

    Collection starts at the first paragraph whose text contains
    ``start_string`` and stops after the first collected paragraph whose text
    contains ``end_string``; both marker paragraphs are included. Only the
    text content of each paragraph is written to ``output_doc_path``.
    """
    source_doc = load(input_doc_path)

    # Walk the paragraphs, switching collection on at the start marker and
    # stopping once the end marker has been collected
    collected = []
    capturing = False
    for paragraph in source_doc.getElementsByType(P):
        body_text = paragraph.textContent
        capturing = capturing or start_string in body_text
        if not capturing:
            continue
        collected.append(paragraph)
        if end_string in body_text:
            break

    # Rebuild the collected paragraphs as plain text in a fresh document
    extracted_doc = OpenDocumentText()
    for paragraph in collected:
        extracted_doc.text.addElement(P(text=paragraph.textContent))

    extracted_doc.save(output_doc_path)
    print(f"Section saved as '{output_doc_path}'")

# Usage example
# Extract the paragraphs from the CSAM1 title through the CSAM2 title.
input_doc_path = "/Users/joe/Documents/git/defra/docs/SFI 2024 actions MASTER v1.1b clean for pdf.odt"
start_string = "CSAM1: Assess soil, produce a soil management plan and test soil organic matter"  # The string where the section starts
end_string = "CSAM2: Multi-species winter cover crop"  # The string where the section ends
output_doc_path = "extracted_section.odt"  # The path to save the new document

extract_section_by_strings(input_doc_path, start_string, end_string, output_doc_path)


We're not getting anywhere with odfpy. It's not been maintained for ages and chatgpt seems to be using quite old docs which I'm guessing are deprecated. `odfdo` seems to be newer, let's try again with that

In [54]:
from odfdo import Document, Paragraph

def extract_section_by_toc2(input_doc_path, toc2_start_title, toc2_end_title, output_doc_path):
    """Attempt to extract the section between two TOC entries using odfdo.

    The intent is to find the link attached to each TOC title, collect the
    paragraphs between the two link targets, and save them to
    ``output_doc_path``.

    NOTE(review): as written this fails on its first ``body.get_elements()``
    call with ``TypeError: Element.get_elements() missing 1 required
    positional argument: 'xpath_query'``. Other odfdo calls below are
    unverified -- see the inline notes.

    Raises:
        ValueError: If a link cannot be found for either title.
    """
    # Load the ODT document
    doc = Document(input_doc_path)

    # Get the content of the document
    body = doc.body

    # Variables to track extraction status
    in_section = False
    section_content = []

    # Function to find bookmarks associated with TOC2 titles
    def find_toc2_bookmark(toc_title):
        # NOTE(review): get_elements() requires an XPath query argument; this
        # bare call is the source of the TypeError.
        for element in body.get_elements():
            if isinstance(element, Paragraph):
                # NOTE(review): confirm whether `text` is a method or a plain
                # attribute/property in the installed odfdo version.
                if toc_title in element.text():
                    # Return the bookmark ID
                    # NOTE(review): get_links() presumably returns link
                    # elements rather than bookmark id strings -- verify.
                    bookmarks = element.get_links()
                    if bookmarks:
                        return bookmarks[0]  # Assume first link is correct
        return None

    # Find the start and end bookmarks based on the TOC2 titles
    start_bookmark = find_toc2_bookmark(toc2_start_title)
    end_bookmark = find_toc2_bookmark(toc2_end_title)

    if not start_bookmark or not end_bookmark:
        raise ValueError(f"Could not find TOC2 elements for: {toc2_start_title} and {toc2_end_title}")

    # Extract paragraphs between the start and end bookmarks
    # NOTE(review): same missing-argument problem as above.
    for element in body.get_elements():
        if isinstance(element, Paragraph):
            # Check if the current element contains the start bookmark
            # NOTE(review): this tests whether the value returned by
            # get_links() occurs inside the paragraph text; if that value is
            # not a string the test cannot work as intended -- verify.
            if start_bookmark in element.text():
                in_section = True

            if in_section:
                section_content.append(element)

            # Check if the current element contains the end bookmark
            if end_bookmark in element.text():
                break

    # Create a new document for the extracted content
    new_doc = Document()

    # Add the extracted content to the new document
    new_body = new_doc.body
    for para in section_content:
        new_body.append(para)

    # Save the new document
    new_doc.save(output_doc_path)
    print(f"Extracted section saved as '{output_doc_path}'")

# Usage example
# Running this cell produced the TypeError recorded in the cell output.
input_doc_path = "/Users/joe/Documents/git/defra/docs/SFI 2024 actions MASTER v1.1b clean for pdf.odt"
toc2_start_title = "CSAM1: Assess soil, produce a soil management plan and test soil organic matter"  # Title of the TOC2 section to start extraction
toc2_end_title = "CSAM2: Multi-species winter cover crop"  # Title of the TOC2 section to end extraction
output_doc_path = "extracted_section.odt"  # Path to save the new document

extract_section_by_toc2(input_doc_path, toc2_start_title, toc2_end_title, output_doc_path)


TypeError: Element.get_elements() missing 1 required positional argument: 'xpath_query'

I've converted the document to docx, which I suspect is where it started (hence not having all the right style labels). Let's see if we can do this with the docx library instead.

In [57]:
from docx import Document

def extract_toc_entries(docx_path):
    """Return the text of every paragraph in the DOCX whose style name starts with ``'TOC'``.

    Entries are returned unstripped, in the order the document's
    ``paragraphs`` collection yields them.
    """
    document = Document(docx_path)
    return [
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.style.name.startswith('TOC')
    ]

# Usage example
# Read the TOC-styled paragraphs from the DOCX conversion of the document.
docx_path = "/Users/joe/Documents/git/defra/docs/SFI 2024 actions MASTER v1.1b clean for pdf.docx"
toc_entries = extract_toc_entries(docx_path)

# Print the extracted TOC entries, numbered from 1
for i, entry in enumerate(toc_entries, 1):
    print(f"{i}. {entry}")


1. photographs or other documentation 


In [58]:
# Print the style name of every paragraph to see which styles the DOCX uses.
doc = Document(docx_path)
for para in doc.paragraphs:
    print(para.style.name)

LO-Normal
LO-Normal
LO-Normal
LO-Normal
LO-Normal
Heading 1
Heading 2
Heading 3
LO-Normal
Heading 3
LO-Normal
Heading 3
LO-Normal
bullet
bullet
Heading 3
LO-Normal
bullet
bullet
bullet
LO-Normal
Heading 4
Heading 4
LO-Normal
LO-Normal
Heading 4
LO-Normal
Heading 4
LO-Normal
Heading 3
LO-Normal
bullet
bullet
LO-Normal
bullet
bullet
LO-Normal
LO-Normal
LO-Normal
LO-Normal
Heading 3
LO-Normal
bullet
bullet
LO-Normal
bullet
bullet
LO-Normal
LO-Normal
LO-Normal
bullet
bullet
bullet
Heading 3
LO-Normal
bullet
bullet
LO-Normal
Heading 3
LO-Normal
bullet
bullet
LO-Normal
Heading 3
LO-Normal
LO-Normal
LO-Normal
LO-Normal
bullet
bullet
bullet
bullet
Heading 3
LO-Normal
LO-Normal
LO-Normal
LO-Normal
Heading 2
Heading 3
LO-Normal
Heading 3
LO-Normal
Heading 3
LO-Normal
List Paragraph
List Paragraph
LO-Normal
List Paragraph
List Paragraph
List Paragraph
Heading 3
LO-Normal
List Paragraph
List Paragraph
List Paragraph
Heading 4
LO-Normal
Heading 4
LO-Normal
LO-Normal
Heading 4
LO-Normal
Heading 4
LO

In [59]:
from docx import Document
import os

def split_doc_by_heading2(docx_path, output_dir, heading_style='Heading 2'):
    """Split a DOCX file into one document per ``heading_style`` section.

    Args:
        docx_path: Path of the DOCX document to read.
        output_dir: Directory for the section files; created if missing.
        heading_style: Style name that marks the start of a section.
            Defaults to ``'Heading 2'``.

    Each section runs from a heading paragraph up to, but not including, the
    next heading paragraph, and is saved as plain paragraph text. Paragraphs
    before the first heading are not saved. Sections whose cleaned headings
    are identical are written to the same file name.
    """
    # Load the DOCX document
    doc = Document(docx_path)

    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Function to save the current section to a new document
    def save_section(heading, content):
        new_doc = Document()
        for para in content:
            new_doc.add_paragraph(para.text)

        # Keep only letters, digits and spaces so the heading is a valid
        # file name; fall back to a fixed name if nothing is left.
        clean_heading = ''.join(c for c in heading if c.isalnum() or c == ' ').strip()
        file_name = f"{clean_heading or 'untitled'}.docx"
        output_path = os.path.join(output_dir, file_name)

        new_doc.save(output_path)
        print(f"Section '{heading}' saved as {output_path}")

    # None means "no heading seen yet". An empty heading text is still a real
    # section, so its content must not be dropped.
    current_heading = None
    current_section = []

    for para in doc.paragraphs:
        if para.style.name == heading_style:
            # A new heading closes the section collected so far
            if current_heading is not None and current_section:
                save_section(current_heading, current_section)
            current_section = []
            current_heading = para.text

        # Content before the first heading belongs to no section
        if current_heading is not None:
            current_section.append(para)

    # Save the last section
    if current_heading is not None and current_section:
        save_section(current_heading, current_section)

# Usage example
# Split the DOCX conversion of the document at its 'Heading 2' paragraphs.
input_docx_path = "/Users/joe/Documents/git/defra/docs/SFI 2024 actions MASTER v1.1b clean for pdf.docx"
output_directory = "output/expanded_sections"  # Directory to save the split sections

split_doc_by_heading2(input_docx_path, output_directory)


Section 'CSAM1: Assess soil, produce a soil management plan and test soil organic matter' saved as output/expanded_sections/CSAM1 Assess soil produce a soil management plan and test soil organic matter.docx
Section 'CSAM2: Multi-species winter cover crop' saved as output/expanded_sections/CSAM2 Multispecies winter cover crop.docx
Section 'CSAM3: Herbal leys' saved as output/expanded_sections/CSAM3 Herbal leys.docx
Section 'SOH1: No-till farming' saved as output/expanded_sections/SOH1 Notill farming.docx
Section 'SOH2: Multi-species spring-sown cover crop ' saved as output/expanded_sections/SOH2 Multispecies springsown cover crop.docx
Section 'SOH3: Multi-species summer-sown cover crop' saved as output/expanded_sections/SOH3 Multispecies summersown cover crop.docx
Section 'SOH4: Winter cover following maize crops ' saved as output/expanded_sections/SOH4 Winter cover following maize crops.docx
Section 'AGF1: Maintain very low density in-field agroforestry on less sensitive land' saved as

Got there in the end!

We'd now like to convert all those docx files to .txt files, they'll be easier to read back in

In [77]:
from docx import Document
from pathlib import Path

def create_new_file_path(file_path, new_extension, subdir='txt'):
    """Return the path for a converted copy of ``file_path`` in a sibling subdirectory.

    Args:
        file_path: Path of the original file.
        new_extension: Suffix for the new file, including the dot
            (for example ``'.txt'``).
        subdir: Name of the subdirectory, created next to the original file,
            that receives the converted file. Defaults to ``'txt'``.

    Returns:
        ``<original dir>/<subdir>/<original stem><new_extension>`` as a
        ``Path``. The subdirectory is created as a side effect.
    """
    original_path = Path(file_path)

    # parents=True so the call also works when the original directory itself
    # has not been created yet
    new_dir = original_path.parent / subdir
    new_dir.mkdir(parents=True, exist_ok=True)

    # Same file name, different extension
    return new_dir / original_path.with_suffix(new_extension).name

def save_docx_as_txt(docx_dir):
    """Write a UTF-8 ``.txt`` copy of every ``.docx`` file found under ``docx_dir``.

    The directory is searched recursively. Each output path comes from
    ``create_new_file_path`` and each paragraph's text is written on its own
    line.
    """
    root = Path(docx_dir).resolve()
    # Build the full list up front, before any output is written
    docx_files = [candidate for candidate in root.glob('**/*.docx') if candidate.is_file()]

    for docx_path in docx_files:
        doc = Document(docx_path)
        txt_path = create_new_file_path(docx_path, '.txt')

        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.writelines(para.text + '\n' for para in doc.paragraphs)

        print(f"Document saved as '{txt_path}'")

# Usage example
# Convert every section DOCX under this directory to a text file.
docx_dir = "/Users/joe/Documents/git/defra/ffc-rps-scratchpad/src/python/output/expanded_sections"

save_docx_as_txt(docx_dir)


Document saved as '/Users/joe/Documents/git/defra/ffc-rps-scratchpad/src/python/output/expanded_sections/txt/WBD7 Remove livestock from grassland during the autumn and winter SDAs.txt'
Document saved as '/Users/joe/Documents/git/defra/ffc-rps-scratchpad/src/python/output/expanded_sections/txt/AHW1 Bumblebird mix.txt'
Document saved as '/Users/joe/Documents/git/defra/ffc-rps-scratchpad/src/python/output/expanded_sections/txt/UPL3 Limited livestock grazing on moorland.txt'
Document saved as '/Users/joe/Documents/git/defra/ffc-rps-scratchpad/src/python/output/expanded_sections/txt/CIPM2 Flowerrich grass margins blocks or infield strips.txt'
Document saved as '/Users/joe/Documents/git/defra/ffc-rps-scratchpad/src/python/output/expanded_sections/txt/UPL10 Shepherding livestock on moorland remove stock for at least 8 months.txt'
Document saved as '/Users/joe/Documents/git/defra/ffc-rps-scratchpad/src/python/output/expanded_sections/txt/OFM2 Organic land management  unimproved permanent grass