## TextHomogenizer


In [2]:
# Importing useful dependencies
import io
import os
import boto3
import chardet
import docx
import fitz
import zipfile
from bs4 import BeautifulSoup
from odf.opendocument import load
from odf.text import P
from odf.teletype import extractText as odf_text
from striprtf.striprtf import rtf_to_text
import re
from markdown_it import MarkdownIt

In [3]:
# Setup S3 client for MinIO (MinIO implements the Amazon S3 API).
# Connection settings are read from the environment so that real credentials
# never have to be written into the notebook. The fallbacks are the values
# that used to be hard-coded here (local MinIO with its default account), so
# an environment without these variables behaves exactly as before.
#   MINIO_ENDPOINT_URL - MinIO API endpoint
#   MINIO_ACCESS_KEY   - user name
#   MINIO_SECRET_KEY   - password
s3 = boto3.client(
    "s3",
    endpoint_url=os.environ.get("MINIO_ENDPOINT_URL", "http://127.0.0.1:9000"),
    aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
)

In [4]:
# Define all functions to extract text from other formats


# Helper: normalize paragraph spacing
def _norm(s: str) -> str:
    """Normalize newlines and ensure paragraphs are separated by one blank line."""
    if not s:
        return ""
    s = s.replace("\r\n", "\n").replace("\r", "\n").strip()
    # Collapse 3 or more newlines into 2 (blank line between paragraphs)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s

# DOCX
def extract_from_docx(body: bytes) -> str:
    """Return the text of a DOCX file given as raw bytes.

    Each paragraph's text is stripped; paragraphs that end up empty are
    dropped. The remaining paragraphs are joined with a blank line and the
    result is passed through ``_norm``.
    """
    document = docx.Document(io.BytesIO(body))
    kept = []
    for paragraph in document.paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            kept.append(stripped)
    joined = "\n\n".join(kept)
    return _norm(joined)

# PDF (block-level extraction for better paragraph preservation)

def extract_from_pdf(body: bytes) -> str:
    """Extract text from a PDF (raw bytes) by text blocks.

    For every page the "blocks" extraction is read, the blocks are ordered
    top-to-bottom then left-to-right, and each non-empty text block becomes
    one paragraph. Pages are joined with a blank line and the result is
    passed through ``_norm``.

    Each block is a tuple
    ``(x0, y0, x1, y1, text, block_no, block_type)``; only ``block_type == 0``
    (text) is kept so that image placeholder blocks do not leak into the
    output.
    """
    pages = []
    # The context manager closes the document (and frees its native
    # resources) even if extraction of a page raises.
    with fitz.open(stream=io.BytesIO(body), filetype="pdf") as doc:
        for page in doc:
            blocks = page.get_text("blocks")
            blocks.sort(key=lambda b: (b[1], b[0]))  # Sort by y, then x
            txts = [
                b[4].strip()
                for b in blocks
                if b[6] == 0 and b[4].strip()  # text blocks with content only
            ]
            pages.append("\n\n".join(txts))
    return _norm("\n\n".join(pages))

# EPUB or ZIP (HTML)
def extract_from_zip(epub_bytes: bytes) -> str:
    """Collect the text of every HTML/XHTML member of a ZIP or EPUB archive.

    Member names are visited in sorted order; only names ending (case
    insensitively) in .xhtml, .html or .htm are read. Each document is parsed
    with BeautifulSoup's "html.parser", its text is taken with newlines
    between elements and stripped, and the per-document texts are joined with
    a blank line before going through ``_norm``.
    """
    markup_suffixes = (".xhtml", ".html", ".htm")
    texts = []
    with zipfile.ZipFile(io.BytesIO(epub_bytes)) as archive:
        members = [
            member
            for member in sorted(archive.namelist())
            if member.lower().endswith(markup_suffixes)
        ]
        for member in members:
            markup = archive.read(member)
            document = BeautifulSoup(markup, "html.parser")
            texts.append(document.get_text(separator="\n").strip())
    return _norm("\n\n".join(texts))

# HTML
def extract_from_html(html_bytes: bytes) -> str:
    """Return the visible text of an HTML document given as raw bytes.

    The markup is parsed with BeautifulSoup's "html.parser"; text nodes are
    joined with a newline so element boundaries stay visible, then the result
    is cleaned up by ``_norm``.
    """
    parsed = BeautifulSoup(html_bytes, "html.parser")
    raw_text = parsed.get_text(separator="\n")
    return _norm(raw_text)

# ODT
def extract_from_odt(odt_bytes: bytes) -> str:
    """Return the text of an ODT document given as raw bytes.

    Every paragraph element (``odf.text.P``) is converted to text and
    stripped; empty results are skipped. The paragraphs are joined with a
    blank line and normalized with ``_norm``.
    """
    document = load(io.BytesIO(odt_bytes))
    chunks = []
    for element in document.getElementsByType(P):
        content = odf_text(element).strip()
        if content:
            chunks.append(content)
    return _norm("\n\n".join(chunks))

# RTF
def extract_from_rtf(rtf_bytes: bytes) -> str:
    """Extract text from an RTF document given as raw bytes.

    The bytes are decoded as UTF-8 (undecodable bytes are dropped) and handed
    to striprtf's ``rtf_to_text``. If that raises, a rough regex fallback is
    used instead:
      * the ``\\par`` control word becomes a paragraph break,
      * brace groups that start with a control word are removed,
      * remaining control words and braces are replaced by spaces.

    The result is normalized with ``_norm``.
    """
    source = rtf_bytes.decode("utf-8", errors="ignore")
    try:
        text = rtf_to_text(source)
    except Exception:
        # Deliberately broad: any parser failure should degrade to the
        # best-effort regex cleanup below rather than abort the conversion.
        #
        # Only the exact control word \par is a paragraph break. The negative
        # lookahead keeps \pard (paragraph-format reset) and longer words such
        # as \partightenfactor from being split into a break plus stray text;
        # they are removed as ordinary control words further down.
        s = re.sub(r"\\par(?![a-zA-Z])", "\n\n", source)
        s = re.sub(r"{\\[^}]*}", " ", s)
        s = re.sub(r"\\[a-zA-Z]+-?\d*", " ", s)
        s = re.sub(r"[{}]", " ", s)
        text = s
    return _norm(text)

# Markdown
def extract_from_md(md_bytes: bytes) -> str:
    """
    Strip every image construct from a Markdown document and tidy spacing.

    The bytes are decoded as UTF-8 (invalid sequences become U+FFFD). The
    following are then removed, in this order:
      - Obsidian embeds: ![[file.png]]
      - HTML images: <img ...>
      - Inline images: ![alt](url "title")
      - Reference images: ![alt][id]
      - Single-line reference definitions: [id]: <url> ("title")
    Finally, if a reference definition pointing at a ``data:image`` URI is
    still present, everything from that definition to the end of the text is
    cut off (covers huge data URIs that run to EOF).

    Other Markdown markup is left as it is; the result goes through ``_norm``.
    """
    # Removal passes; the order matters and mirrors the list in the docstring.
    removal_passes = (
        # 0) Obsidian-style embeds: ![[image.png]]
        r"!\[\[[^\]]+\]\]",
        # 1) HTML <img ...> (covers data:image in HTML as well)
        r"(?is)<img\b[^>]*>",
        # 2) Inline images: ![alt](url "title")
        r"!\[[^\]]*\]\(\s*<?[^)\s>]+[^)]*?>?\s*(?:\"[^\"]*\"|'[^']*'|\([^)]+\))?\s*\)",
        # 3) Reference-style images used in text: ![alt][id]
        r"!\[[^\]]*\]\[[^\]]*\]",
        # 4) Single-line reference definitions, including <...> urls.
        #    Example: [img1]: <https://...> "title"
        r"(?im)^\s*\[[^\]]+\]:\s*(?:<[^>\n]+>|[^\s]+)(?:\s+(?:\"[^\"]*\"|'[^']*'|\([^)]+\)))?\s*$",
    )

    text = md_bytes.decode("utf-8", errors="replace")
    for pattern in removal_passes:
        text = re.sub(pattern, "", text)

    # 5) Hard cut: drop everything from the first remaining 'data:image'
    #    reference definition to EOF.
    cut = re.search(
        r"(?is)^\s*\[[^\]]+\]:\s*<?\s*data:image[^>\s]*.*$",
        text,
        flags=re.MULTILINE,
    )
    if cut is not None:
        text = text[:cut.start()]

    return _norm(text)

In [5]:
# 1) Lookup table: lower-case file extension (with the dot) -> extractor.
#    Every extractor takes the raw file bytes and returns plain text.
EXT_MAP = dict(
    (
        (".docx", extract_from_docx),
        (".pdf", extract_from_pdf),
        (".epub", extract_from_zip),  # epub is a ZIP of XHTML/HTML
        (".zip", extract_from_zip),
        (".html", extract_from_html),
        (".odt", extract_from_odt),
        (".rtf", extract_from_rtf),
        (".md", extract_from_md),
    )
)

def convert_and_replace(s3, bucket, key, body, dest_bucket=None, keep_original=False):
    """
    Convert one supported document to a UTF-8 .txt object and upload it.

    s3: client object providing put_object / delete_object.
    bucket, key: location of the source object; the extension of ``key``
        (case-insensitive) selects the extractor from EXT_MAP.
    body: raw bytes of the source object.
    dest_bucket: bucket to write the .txt to; if None (or otherwise falsy),
        it is written back to ``bucket``.
    keep_original: if True, the source object is never deleted.

    Returns False without doing anything when the extension has no extractor,
    True after a successful upload. The source object is deleted only when
    ``dest_bucket`` is None and ``keep_original`` is False.
    """
    stem, suffix = os.path.splitext(key)
    extract = EXT_MAP.get(suffix.lower())
    if extract is None:
        # Unsupported extension: let the caller decide what to do.
        return False

    print(f"Converting {key} to txt")
    text = extract(body)
    txt_key = stem + ".txt"

    s3.put_object(
        Bucket=dest_bucket or bucket,
        Key=txt_key,
        Body=text.encode("utf-8"),
        ContentType="text/plain",
    )
    print(f"Successfully converted {key} to {txt_key}.")

    # The source is removed only for in-place conversions (no explicit
    # destination bucket) and only if the caller did not ask to keep it.
    in_place = dest_bucket is None
    if in_place and not keep_original:
        s3.delete_object(Bucket=bucket, Key=key)

    return True

def _reencode_txt_as_utf8(bucket, key, body):
    """Re-upload the .txt object ``key`` as UTF-8 unless it already is UTF-8/ASCII.

    The encoding is guessed with chardet from ``body`` (the object's raw
    bytes). Raises ValueError when chardet cannot name an encoding; decoding
    and upload errors propagate to the caller.
    """
    current_encoding = chardet.detect(body)["encoding"]
    if current_encoding is None:
        # chardet gives None when it has no guess; fail with a clear message
        # instead of an opaque TypeError from bytes.decode(None).
        raise ValueError("could not detect the text encoding")

    # Nothing to do for UTF-8, or ASCII (a subset of UTF-8). Compared
    # case-insensitively so the spelling of the reported name does not matter.
    if current_encoding.lower() in ("utf-8", "ascii"):
        return

    print(current_encoding)
    print(f"Converting {key} from {current_encoding} to UTF-8")

    # Bytes that cannot be decoded with the detected encoding are dropped.
    content = body.decode(current_encoding, errors="ignore")

    # Overwrite the object in place with its UTF-8 version.
    s3.put_object(
        Bucket=bucket,
        Key=key,
        Body=content.encode("utf-8"),
        ContentType="text/plain",
    )
    print(f"Successfully converted {key} to UTF-8.")


def convert_texts_to_txt(bucket, prefix=""):
    """Homogenize every object under ``prefix`` in ``bucket`` to UTF-8 .txt.

    Uses the module-level ``s3`` client. For each listed object:
      * zero-byte objects are printed and skipped;
      * ``.txt`` objects (any letter case) are re-encoded to UTF-8 in place
        when chardet reports an encoding other than UTF-8/ASCII;
      * everything else is handed to ``convert_and_replace``; objects with an
        unsupported extension are reported and skipped.

    A failure while processing one object is printed and the loop continues
    with the next object, for .txt files and converted formats alike.
    """
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            # Zero-byte objects (e.g. the folder placeholder itself) have
            # nothing to convert.
            if obj["Size"] == 0:
                print(obj)
                continue

            key = obj["Key"]
            resp = s3.get_object(Bucket=bucket, Key=key)
            body = resp["Body"].read()  # Read the file content

            try:
                if key.lower().endswith(".txt"):
                    _reencode_txt_as_utf8(bucket, key, body)
                elif not convert_and_replace(s3, bucket, key, body):  # or dest_bucket="trusted_zone"
                    print(f"Skip (unsupported ext): {key}")
            except Exception as e:
                # Per-object boundary: one corrupt or unreadable file must not
                # abort the rest of the batch.
                print(f"Failed to process {key}: {e}")
                

In [6]:
# Run the homogenization over everything stored under "texts/" in the
# "formatted-zone" bucket.
convert_texts_to_txt(bucket="formatted-zone", prefix="texts/")

Windows-1252
Converting texts/text_1759415298698.txt from Windows-1252 to UTF-8
Successfully converted texts/text_1759415298698.txt to UTF-8.
ISO-8859-1
Converting texts/text_1759415298945.txt from ISO-8859-1 to UTF-8
Successfully converted texts/text_1759415298945.txt to UTF-8.
Windows-1252
Converting texts/text_1759415301555.txt from Windows-1252 to UTF-8
Successfully converted texts/text_1759415301555.txt to UTF-8.
Windows-1252
Converting texts/text_1759415302798.txt from Windows-1252 to UTF-8
Successfully converted texts/text_1759415302798.txt to UTF-8.
ISO-8859-1
Converting texts/text_1759415303906.txt from ISO-8859-1 to UTF-8
Successfully converted texts/text_1759415303906.txt to UTF-8.
Windows-1252
Converting texts/text_1759415304197.txt from Windows-1252 to UTF-8
Successfully converted texts/text_1759415304197.txt to UTF-8.
Windows-1252
Converting texts/text_1759415305437.txt from Windows-1252 to UTF-8
Successfully converted texts/text_1759415305437.txt to UTF-8.
ISO-8859-1
Con