In [10]:
from pathlib import Path
from vectoria_lib.common.paths import TEST_DIR
from docx import Document
import docx
from docx.shared import Pt

In [11]:
# Load the sample Word file from the test data directory; the cells below
# pass this `document` object to each extraction helper.
document = Document(TEST_DIR / "data/docx/2_from_word.docx")

## Header / Footer

In [8]:
# Extract headers and footers
def extract_headers_footers(doc):
    """Print the header and footer paragraph text of every section in ``doc``.

    For each section the label ``"Header: "`` is written without a trailing
    newline, followed by one line per header paragraph; the same is then done
    for the footer under the label ``"Footer: "``.

    Args:
        doc: Object exposing ``sections``; each section must provide
            ``header`` and ``footer`` objects with a ``paragraphs`` iterable
            whose items have a ``text`` attribute.

    Returns:
        None. All output goes to stdout.
    """
    for sec in doc.sections:
        # Both parts are looked up before anything is printed for the section.
        labelled_parts = (("Header: ", sec.header), ("Footer: ", sec.footer))
        for label, part in labelled_parts:
            print(label, end="")
            for para in part.paragraphs:
                print(para.text)

In [9]:
# Announce the step, then print the header/footer text of each section
# of the loaded document.
print("Extracting Headers and Footers...")
extract_headers_footers(document)

Extracting Headers and Footers...
Header: header

Footer: footer



## Content

In [None]:
# A helper function to determine the level of a heading based on style
def get_heading_level(paragraph):
    """Return the heading level encoded in a paragraph's style name.

    A style name such as ``"Heading 2"`` yields ``2``. Anything else yields
    ``None``, including heading-like names with no trailing number (a bare
    ``"Heading"``, or ``"Heading 1 Char"``), so one unusual style cannot
    abort a whole extraction pass.

    Args:
        paragraph: Object with a ``style`` attribute whose ``name`` is the
            style's display name. A missing style or name is treated as
            "not a heading".

    Returns:
        int | None: The heading level, or ``None`` when the paragraph is not
        a numbered heading.
    """
    style_name = getattr(paragraph.style, "name", None)
    if not style_name or not style_name.startswith("Heading"):
        return None  # Not a heading

    # The level is the last whitespace-separated token of the style name.
    try:
        return int(style_name.split()[-1])
    except ValueError:
        # Starts with "Heading" but does not end in a number.
        return None

# Extract document structure: paragraphs, tables, and their heading context
def extract_content_with_tables(doc):
    """Walk the document body in order and collect headings, paragraphs and tables.

    Args:
        doc: A python-docx document; ``doc.element.body`` is iterated to visit
            the body's child XML elements in document order.

    Returns:
        list[tuple]: One entry per recognised body element, in document order.
        Entries have two shapes:

        * ``("Heading N", text)`` or ``("Paragraph", text)`` for paragraphs;
        * ``("Table", rows, heading_text)`` for tables, where ``rows`` is a
          list of lists of cell text and ``heading_text`` is the text of the
          most recent heading seen before the table (``None`` if there was
          none).

        Body children that are neither paragraphs nor tables are skipped.
    """
    current_heading = None  # Text of the most recent heading encountered
    structure = []  # Collected entries, in document order

    for element in doc.element.body:
        # Tags are in Clark notation ("{namespace}localname"). Anchoring on the
        # closing brace matches the whole local name, so an element whose name
        # merely ends in "p" or "tbl" is not mistaken for a paragraph or table.
        # Non-string tags (e.g. XML comments) match neither branch.
        tag = element.tag if isinstance(element.tag, str) else ""

        if tag.endswith('}p'):  # It's a paragraph
            paragraph = docx.text.paragraph.Paragraph(element, doc)
            heading_level = get_heading_level(paragraph)
            if heading_level:
                current_heading = paragraph.text  # Update current heading
                structure.append((f"Heading {heading_level}", paragraph.text))
            else:
                structure.append(("Paragraph", paragraph.text))

        elif tag.endswith('}tbl'):  # It's a table
            table = docx.table.Table(element, doc)
            table_data = [[cell.text for cell in row.cells] for row in table.rows]
            structure.append(("Table", table_data, current_heading))

    return structure

# Print the document structure, including where tables are located
def print_document_structure(structure):
    """Pretty-print a list of structure entries to stdout.

    Headings are printed flush left as ``"<kind>: <text>"``. Paragraphs are
    indented three spaces. A table prints the heading it sits under, followed
    by one six-space-indented line per row. Entries of any other kind are
    ignored.

    Args:
        structure: Iterable of tuples whose first item is the entry kind
            (``"Heading ..."``, ``"Paragraph"`` or ``"Table"``), whose second
            item is the content, and, for tables, whose third item is the
            heading text.

    Returns:
        None.
    """
    for entry in structure:
        kind, content = entry[0], entry[1]

        if kind == "Table":
            heading_text = entry[2]
            print(f"   Table under Heading: {heading_text}")
            for table_row in content:
                print(f"      {table_row}")
            continue

        if kind == "Paragraph":
            print(f"   Paragraph: {content}")
            continue

        if kind.startswith("Heading"):
            print(f"{kind}: {content}")



In [32]:
# Build the ordered list of heading / paragraph / table entries for the
# document body and keep it in `structure` for the cells that follow.
print("Extracting document structure with tables and headings...")
structure = extract_content_with_tables(document)


Extracting document structure with tables and headings...


In [35]:
# Show the raw list of entries as returned by extract_content_with_tables
structure

[('Paragraph', 'Test document 2'),
 ('Heading 1', 'First chapter'),
 ('Paragraph',
  'Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum'),
 ('Heading 1', 'Second Chapter'),
 ('Paragraph',
  'Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem i

In [33]:
# Render the same entries as an indented outline
print_document_structure(structure)

   Paragraph: Test document 2
Heading 1: First chapter
   Paragraph: Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum
Heading 1: Second Chapter
   Paragraph: Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum Lorem ipsum
   Table under Heading: Sec

## Tables

In [28]:
# Inspect the first table object in the document.
# NOTE(review): the recorded output below is a `docx.table._Rows` repr, which
# does not look like the result of this expression; the cell was presumably
# edited after its last run. Re-run to refresh the output.
document.tables[0]

<docx.table._Rows at 0x7ac99dccf4f0>

In [15]:
# Extract tables
def extract_tables(doc):
    """Print every table in ``doc`` as tab-separated text, one line per row.

    Each cell's text is followed by a tab character (so every non-empty row
    ends with a trailing tab), and each row ends with a newline.

    Args:
        doc: Object exposing ``tables``; each table must provide ``rows``,
            each row ``cells``, and each cell a ``text`` attribute.

    Returns:
        None. All output goes to stdout.
    """
    for tbl in doc.tables:
        for tbl_row in tbl.rows:
            # Assemble the full line first, then emit it with one print call.
            line = "".join(f"{cell.text}\t" for cell in tbl_row.cells)
            print(line)

In [16]:
# Dump every table in the document as tab-separated rows
print("\nExtracting Tables...")
extract_tables(document)


Extracting Tables...
Tab 11	Tab 12	
Tab 21	Tab 22	


## Images

In [18]:
# Extract images
def extract_images(doc, output_dir="."):
    """Save every image embedded in ``doc`` to ``output_dir``.

    Images are selected by relationship *type*, not by looking for the word
    "image" in the target path. This way an embedded picture stored under any
    part name is found, and a hyperlink whose URL happens to contain "image"
    is not mistaken for one. External relationships (linked, not embedded)
    are skipped because they have no target part to read bytes from.

    Args:
        doc: A python-docx document; ``doc.part.rels`` is read as a mapping
            of relationship id to relationship object.
        output_dir: Directory to write the image files into. It is created
            if it does not exist. Defaults to the current working directory.

    Returns:
        None. One ``"Saved image: <path>"`` line is printed per file written.
        An existing file with the same name is overwritten.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for rel in doc.part.rels.values():
        # Relationship-type URIs for pictures end in ".../relationships/image".
        if not rel.reltype.endswith("/image"):
            continue
        if rel.is_external:
            continue  # Linked image: there is no embedded blob to save

        # target_ref is a "/"-separated part reference; keep only the file name.
        image_path = target_dir / rel.target_ref.split("/")[-1]
        image_path.write_bytes(rel.target_part.blob)
        print(f"Saved image: {image_path}")

In [19]:
# Save the document's images to disk; with no directory given the files are
# written relative to the current working directory.
print("\nExtracting Images...")
extract_images(document)


Extracting Images...
Saved image: image1.jpg
