In [42]:
import os
from bs4 import BeautifulSoup
from docx import Document

def concatenate_html_plaintext_to_docx(html_dir, output_docx_file):
    """
    Concatenate the text content of the .html files in a directory into one .docx file.

    Files are processed in sorted filename order. Each file is parsed with
    BeautifulSoup's 'html.parser' and reduced to the text returned by
    ``get_text(separator='\\n', strip=True)``; that text is added to the
    document as a single paragraph. A page break is inserted *between*
    consecutive files (never after the last one, so the document does not end
    with an empty page).

    A file that cannot be read, parsed, or added is reported on stdout and
    skipped; processing continues with the next file.

    Args:
        html_dir (str): Path to the directory containing .html files.
        output_docx_file (str): Path to save the concatenated .docx file.
    """

    docx_document = Document()  # Create a new DOCX document

    # Sorted so the output order is deterministic regardless of listdir order.
    html_files = sorted(f for f in os.listdir(html_dir) if f.endswith('.html'))

    # True once a file's paragraph has been added and no break has followed it
    # yet. Deferring the break until the next file is actually about to be
    # written avoids both a trailing blank page and doubled breaks after a
    # failed file.
    needs_page_break = False

    for html_file in html_files:
        html_filepath = os.path.join(html_dir, html_file)
        print(f"Processing: {html_file}")

        try:
            with open(html_filepath, 'r', encoding='utf-8') as f:
                html_content = f.read()

            soup = BeautifulSoup(html_content, 'html.parser')

            # Text nodes joined by newlines, each stripped of surrounding whitespace.
            plaintext_content = soup.get_text(separator='\n', strip=True)

            if needs_page_break:
                docx_document.add_page_break()  # Separate this file from the previous one
                needs_page_break = False

            # Add the plaintext content as a paragraph
            docx_document.add_paragraph(plaintext_content)
            needs_page_break = True

        except Exception as e:
            # Best-effort: report the failure and keep going with the remaining files.
            print(f"Error processing {html_file}: {e}")

    docx_document.save(output_docx_file)
    print(f"\nConcatenated DOCX file created (plaintext content): {output_docx_file}")
    
if __name__ == "__main__":
    # Script entry point: combine the saved HTML pages into one DOCX file.
    html_directory = "/home/aricept094/pipet/rayon_docs_content"  # Replace with the directory where your HTML files are saved
    docx_output_file = "/home/aricept094/pipet/combined_rayon_docs.docx" # Name for the output .docx file

    # Call the plaintext variant defined in this file; the previous call
    # targeted `concatenate_html_raw_to_docx`, which is not defined here.
    concatenate_html_plaintext_to_docx(html_directory, docx_output_file)

Processing: all.html
Processing: array.rs.html
Processing: binary_heap.rs.html
Processing: blocks.rs.html
Processing: btree_map.rs.html
Processing: btree_set.rs.html
Processing: chain.rs.html
Processing: chunk_by.rs.html
Processing: chunks.rs.html
Processing: cloned.rs.html
Processing: copied.rs.html
Processing: empty.rs.html
Processing: enum.Either.html
Processing: enum.Yield.html
Processing: enumerate.rs.html
Processing: extend.rs.html
Processing: filter.rs.html
Processing: filter_map.rs.html
Processing: flat_map.rs.html
Processing: flat_map_iter.rs.html
Processing: flatten.rs.html
Processing: flatten_iter.rs.html
Processing: fn.bridge.html
Processing: fn.bridge_producer_consumer.html
Processing: fn.bridge_unindexed.html
Processing: fn.broadcast.html
Processing: fn.current_num_threads.html
Processing: fn.current_thread_index.html
Processing: fn.empty.html
Processing: fn.in_place_scope.html
Processing: fn.in_place_scope_fifo.html
Processing: fn.join.html
Processing: fn.join_context.ht