In [57]:
import pandas as pd
import re
import csv
import pytesseract
from tempfile import TemporaryDirectory
from PIL import Image
from pathlib import Path 
from pdf2image import convert_from_path


# Dictionary PDF to OCR; a relative path, so it resolves against the working directory.
pdf = Path("bini_dict.pdf")

In [None]:
def read_pdf2img(pdf_path, first_page=19, thread_count=4):
    """
    Converts the PDF to images and writes the OCR output to a single text file.

    Pages from ``first_page`` onward are rendered to PNG files inside a
    temporary directory, passed through Tesseract one page at a time, and
    written in page order to ``output_folder/output_file.txt`` with a blank
    line after each page.

    Args:
        pdf_path: Location of the PDF (str or Path).
        first_page: 1-based number of the first page to convert.
        thread_count: Number of parallel rendering workers used by pdf2image.

    Returns:
        None. The result is the text file written to disk.
    """
    output_folder = Path("output_folder")
    output_folder.mkdir(exist_ok=True)

    with TemporaryDirectory() as tmp:
        # Iterate the list returned by convert_from_path instead of globbing
        # the folder and sorting by filename. With thread_count > 1, pdf2image
        # gives each worker its own generated filename prefix, so filename
        # order interleaves the workers' page ranges (this is what scattered
        # the pages). The returned list is already in page order.
        # paths_only=True returns file paths rather than opened images, so only
        # one page image is open at a time below.
        page_paths = convert_from_path(
            pdf_path=Path(pdf_path),
            output_folder=tmp,
            fmt='png',
            single_file=False,
            first_page=first_page,
            thread_count=thread_count,
            paths_only=True,
        )

        # Explicit UTF-8 so the file does not depend on the platform's default
        # encoding and matches how main() reads it back.
        with open(output_folder/"output_file.txt", "w", encoding="utf-8") as f:
            for page_path in page_paths:
                # Close each image as soon as it has been OCR'd.
                with Image.open(page_path) as page_image:
                    # After training the model, pass lang="eng+bini"
                    # (pytesseract takes a '+'-joined string, not a list).
                    text = pytesseract.image_to_string(page_image)
                f.write(text + "\n\n")  # Add newlines between pages


# read_pdf2img(pdf)

In [None]:
# OCR text file that main() reads (same location read_pdf2img writes to).
INPUT_TXT = "output_folder/output_file.txt"
# CSV file that main() writes the extracted pairs to.
OUTPUT_CSV = "bini_pairs_2.csv"

def normalize_block(block: str) -> str:
    """Collapse a multi-line block into one line.

    Each line is stripped of surrounding whitespace. When the text
    accumulated so far ends with '-', that hyphen is dropped and the next
    line is glued straight onto it (undoing a hyphenated line break);
    otherwise lines are joined with a single space.
    """
    merged = []
    for raw_line in block.splitlines():
        piece = raw_line.strip()
        if merged and merged[-1].endswith("-"):
            # Previous piece was split mid-word: drop its '-' and continue the word.
            merged[-1] = merged[-1][:-1] + piece
            continue
        merged.append(piece)
    return " ".join(merged)


def extract_pairs(text: str):
    """Split OCR text into (headword, gloss) tuples, one per blank-line-separated block.

    For every non-empty block (normalized with ``normalize_block``):
      * blocks starting with "BINI DICTIONARY" (case-insensitive) are skipped;
      * the first whitespace-delimited token becomes the headword;
      * a leading ``[...]`` bracket and leading separator characters are
        stripped from the remainder, which becomes the gloss.

    Returns:
        A list of ``(head, rest)`` string tuples in the order encountered.
    """
    pairs = []
    # Blank lines (which may contain stray whitespace) delimit the blocks.
    for chunk in re.split(r"\n\s*\n", text):
        chunk = chunk.strip()
        if not chunk:
            continue

        entry = normalize_block(chunk)

        # Header blocks carry no dictionary entry.
        if entry.upper().startswith("BINI DICTIONARY"):
            continue

        # Headword = first run of non-space characters; everything after is the remainder.
        parsed = re.match(r"^\s*(?P<head>\S+)\s*(?P<rest>.*)$", entry)
        if parsed is None:
            continue
        headword = parsed.group("head").strip()
        gloss = parsed.group("rest").strip()

        # Drop a pronunciation bracket sitting directly after the headword.
        gloss = re.sub(r"^\[[^\]]+\]\s*", "", gloss)
        # Drop leading separators (| : ; , . and dashes) ahead of the gloss.
        gloss = re.sub(r"^[\s\|\:\;\,\.\-–—]+", "", gloss)

        pairs.append((headword, gloss))
    return pairs


def main():
    """Read the OCR text file, extract the pairs, and save them as a CSV.

    Reads ``INPUT_TXT`` as UTF-8 (undecodable bytes are ignored), runs
    ``extract_pairs`` on its contents, writes a ``bini,english`` header plus
    one row per pair to ``OUTPUT_CSV``, and prints a one-line summary.
    """
    with open(INPUT_TXT, mode="r", encoding="utf-8", errors="ignore") as source:
        pairs = extract_pairs(source.read())

    # newline="" lets the csv module control line endings itself.
    with open(OUTPUT_CSV, mode="w", newline="", encoding="utf-8") as sink:
        writer = csv.writer(sink)
        writer.writerows([["bini", "english"], *pairs])

    print(f"Done. Wrote {len(pairs)} rows to {OUTPUT_CSV}")


# Tested, but still needs some tuning. After the CSV is created, some cleaning should be done via pandas to maintain the integrity of the Bini words;
# likely we'll map the special characters to the mistaken characters.

# main()

In [None]:
# Load the CSV of extracted pairs for inspection and later pandas clean-up.
# The previous Path("") pointed at the current directory, which read_csv
# cannot open as a file. OUTPUT_CSV is assumed to be the intended input;
# adjust if a different file was meant.
data = Path(OUTPUT_CSV)
df = pd.read_csv(data)
# Cap the sample size so frames with fewer than 10 rows do not raise ValueError.
df.sample(min(10, len(df)))