In [1]:
%pip install pytesseract pdf2image opencv-python pillow tqdm


Defaulting to user installation because normal site-packages is not writeable
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (19 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl (39.0 MB)
   ---------------------------------------- 0.0/39.0 MB ? eta -:--:--
    --------------------------------------- 0.5/39.0 MB 5.3 MB/s eta 0:00:08
   -- ------------------------------------- 2.1/39.0 MB 7.2 MB/s eta 0:00:06
   ---- ----------------------------------- 3.9/39.0 MB 8.1 MB/s eta 0:00:05
   ----- ---------------------------------- 5.0/39.0 MB 8.3 MB/s eta 0:00:05
   ------ --------------------------------- 6.6/39.0 MB 7.4 MB/s eta 0:00:05
   ---


[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

import cv2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageOps, ImageFilter

# ================= USER CONFIG =================
PDF_PATH = r"./x.pdf"   # আপনার PDF ফাইল/পাথ
OUTPUT_TXT = "extracted_bangla_text.txt"
LANG = "ben+eng"   # বাংলা + ইংরেজি OCR
TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Windows হলে দিন; mac/Linux এ ফাঁকা রাখুন
POPPLER_PATH = r""  # Windows এ Poppler ইনস্টল করলে এর bin ফোল্ডারের path দিন, যেমন r"C:\poppler-24.02.0\Library\bin"
DPI = 300
PSM = 6  # page segmentation mode
OEM = 1  # LSTM OCR Engine
# ===============================================

# Windows: point pytesseract to tesseract.exe
if os.name == "nt" and Path(TESSERACT_EXE).exists():
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

def pdf_to_images(pdf_path: str, dpi: int = 300):
    kwargs = {}
    if os.name == "nt" and POPPLER_PATH:
        kwargs["poppler_path"] = POPPLER_PATH
    images = convert_from_path(pdf_path, dpi=dpi, **kwargs)
    return images

def preprocess(img: Image.Image) -> Image.Image:
    # (1) Auto-orient
    img = ImageOps.exif_transpose(img)
    # (2) Grayscale
    img = img.convert("L")
    # (3) Light de-noise & sharpen
    img = img.filter(ImageFilter.MedianFilter(size=3))
    img = img.filter(ImageFilter.UnsharpMask(radius=2, percent=150, threshold=3))
    # (4) Binarize using Otsu (via OpenCV) for better OCR
    cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_GRAY2BGR)
    gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
    _, th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return Image.fromarray(th)

import numpy as np

def ocr_image(pil_img: Image.Image, lang="ben+eng", psm=6, oem=1) -> str:
    config = f"--oem {oem} --psm {psm}"
    text = pytesseract.image_to_string(pil_img, lang=lang, config=config)
    return text

def main():
    pdf = Path(PDF_PATH)
    if not pdf.exists():
        print(f"PDF not found: {pdf.resolve()}")
        return

    images = pdf_to_images(str(pdf), dpi=DPI)
    print(f"Total pages: {len(images)}")

    out_lines = []
    for idx, img in enumerate(tqdm(images, desc="OCR pages")):
        pimg = preprocess(img)
        page_text = ocr_image(pimg, lang=LANG, psm=PSM, oem=OEM)
        out_lines.append(f"\n\n----- Page {idx+1} -----\n{page_text.strip()}\n")

    out = Path(OUTPUT_TXT)
    out.write_text("".join(out_lines), encoding="utf-8")
    print(f"\n✅ Done. Saved OCR text to: {out.resolve()}")

if __name__ == "__main__":
    main()


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [2]:
%pip install python-docx

Defaulting to user installation because normal site-packages is not writeable
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from docx import Document
import re

# -------- Digit Conversion --------
bangla_map = str.maketrans("0123456789", "০১২৩৪৫৬৭৮৯")

def to_bangla_digits(text):
    return text.translate(bangla_map)

# Function to check if text has Bangla letters
def has_bangla(text):
    # Unicode range for Bangla: 0980–09FF
    return re.search(r'[\u0980-\u09FF]', text)

# -------- Load DOCX --------
doc = Document("./xy.docx")  # replace with your file

# -------- Process paragraphs --------
for para in doc.paragraphs:
    if not has_bangla(para.text):
        para.text = to_bangla_digits(para.text)

# -------- Process tables (like TOC) --------
for table in doc.tables:
    for row in table.rows:
        for cell in row.cells:
            if not has_bangla(cell.text):
                cell.text = to_bangla_digits(cell.text)

# -------- Save new file --------
doc.save("your_document_bangla_fast.docx")
print("✅ Numbers converted to Bangla (Bangla text skipped)")

✅ Numbers converted to Bangla (Bangla text skipped)


In [7]:
# y.py
# pip install python-docx
import argparse, os, sys, glob
from typing import Iterable, Union, Optional
from docx import Document
from docx.table import _Cell, Table

# 0-9 -> ০-৯
DIGIT_MAP = str.maketrans("0123456789", "০১২৩৪৫৬৭৮৯")

def translate_runs_in_paragraph(paragraph):
    for run in paragraph.runs:
        t = run.text
        if t and any(ch.isdigit() for ch in t):
            run.text = t.translate(DIGIT_MAP)

def iter_paragraphs_in_obj(obj: Union[Document, _Cell, Table]) -> Iterable:
    """Yield all paragraphs in Document/_Cell/Table (recursively handles tables)."""
    if hasattr(obj, "paragraphs"):
        for p in obj.paragraphs:
            yield p
    if hasattr(obj, "tables"):
        for tbl in obj.tables:
            for row in tbl.rows:
                for cell in row.cells:
                    yield from iter_paragraphs_in_obj(cell)

def convert_docx_digits(src_path: str, dst_path: str) -> None:
    doc = Document(src_path)

    # Body + tables
    for p in iter_paragraphs_in_obj(doc):
        translate_runs_in_paragraph(p)

    # Headers & footers (per section)
    for section in doc.sections:
        for p in iter_paragraphs_in_obj(section.header):
            translate_runs_in_paragraph(p)
        for p in iter_paragraphs_in_obj(section.footer):
            translate_runs_in_paragraph(p)

    # Save
    outdir = os.path.dirname(dst_path)
    if outdir:
        os.makedirs(outdir, exist_ok=True)
    doc.save(dst_path)

def resolve_from_base(path: str, base: str) -> str:
    """Resolve `path` relative to script folder `base` if not absolute."""
    return path if os.path.isabs(path) else os.path.abspath(os.path.join(base, path))

def valid_docx(path: str) -> bool:
    return os.path.isfile(path) and path.lower().endswith(".docx")

def run_single(BASE: str, infile: str, outfile: Optional[str]):
    src = resolve_from_base(infile, BASE)
    if not valid_docx(src):
        print(f"[ERROR] Input file not found or not a .docx: {src}", file=sys.stderr)
        sys.exit(1)
    dst = resolve_from_base(outfile, BASE) if outfile else os.path.splitext(src)[0] + "_BN.docx"
    try:
        convert_docx_digits(src, dst)
        print(f"[OK] Saved: {dst}")
    except Exception as e:
        print(f"[ERROR] Failed to process '{src}': {e}", file=sys.stderr)
        sys.exit(2)

def run_batch(BASE: str, folder: str):
    indir = resolve_from_base(folder, BASE)
    if not os.path.isdir(indir):
        print(f"[ERROR] Folder not found: {indir}", file=sys.stderr)
        sys.exit(1)
    files = glob.glob(os.path.join(indir, "*.docx"))
    if not files:
        print(f"[INFO] No .docx files found in {indir}")
        return
    ok = fail = 0
    for src in files:
        if src.lower().endswith("_bn.docx"):
            continue
        dst = os.path.splitext(src)[0] + "_BN.docx"
        try:
            convert_docx_digits(src, dst)
            print(f"[OK] {os.path.basename(dst)}")
            ok += 1
        except Exception as e:
            print(f"[ERROR] {os.path.basename(src)} -> {e}", file=sys.stderr)
            fail += 1
    print(f"\nDone. Success: {ok}, Failed: {fail}")

def get_base_dir() -> str:
    # Works when run as a script or from notebooks/REPL where __file__ is missing
    try:
        return os.path.dirname(os.path.abspath(__file__))  # type: ignore[name-defined]
    except NameError:
        return os.path.abspath(os.getcwd())

def main():
    BASE = get_base_dir()

    ap = argparse.ArgumentParser(
        description="Convert English digits to Bangla digits (০-৯) in .docx files."
    )
    g = ap.add_mutually_exclusive_group()
    g.add_argument("--in", dest="infile", help="Input .docx file (relative to script or absolute)")
    g.add_argument("--dir", dest="indir", help="Process all .docx files in this folder")
    ap.add_argument("--out", dest="outfile", help="Output .docx (single-file mode only)")

    # If running in a notebook, argparse will try to parse Jupyter args; ignore them safely
    if hasattr(sys, "argv"):
        args = ap.parse_args(args=None if sys.argv[0].endswith(".py") else [])
    else:
        args = ap.parse_args([])

    if args.infile:
        run_single(BASE, args.infile, args.outfile)
        return

    if args.indir:
        run_batch(BASE, args.indir)
        return

    # Default: look for 'xy.docx' next to this script (or CWD in notebooks), write 'xy_BN.docx'
    default_in = "./xy.docx"
    default_out = "xy_BN.docx"
    print(f"[INFO] No arguments provided. Using default: {default_in} -> {default_out}")
    run_single(BASE, default_in, default_out)

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--in INFILE | --dir INDIR] [--out OUTFILE]
ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\efte2\AppData\Roaming\jupyter\runtime\kernel-v39ffdd2df66efb18995b7db1dae8664b9e24ee352.json


SystemExit: 2