In [2]:
import fitz  # PyMuPDF
import re
import pandas as pd

# 1. Extract Text from PDF
def extract_text_from_pdf(pdf_path, start_page=98, end_page=928):
    doc = fitz.open(pdf_path)
    text = ""
    for page_num in range(start_page, end_page):
        text += doc.load_page(page_num).get_text()
    doc.close()
    return text

# 2. Split Entries
def split_entries(text):
    return [e.strip().replace('\n', ' ') 
            for e in re.split(r"\n(?=\w+\s+\([^)]+\))", text.strip()) 
            if e.strip()]

# 3. Parse Each Entry — only word & meaning
def parse_lexicon_entry(entry_text):
    entry_text = entry_text.strip().replace('\n', ' ')
    word = meaning = ""

    # Word (English) and pronunciation
    match_wp = re.match(r"^(\w+)\s+\([^)]+\)", entry_text)
    if match_wp:
        word = match_wp.group(1)

    # Meaning — capture everything until note/example or end
    pos_meaning_match = re.search(
        r"\)\s*[a-z.]+\s+(.*?)(?=\s*Short form\.|\s*E\.g\.|\s*Eg\.|$)", 
        entry_text, 
        re.IGNORECASE
    )
    if pos_meaning_match:
        meaning = pos_meaning_match.group(1).strip()

    return {
        "word": word,
        "meaning": meaning
    }

# 4. Process Entire PDF
def process_pdf_to_dataframe(pdf_path):
    raw_text = extract_text_from_pdf(pdf_path, start_page=98, end_page=929)
    entries = split_entries(raw_text)
    parsed_entries = [parse_lexicon_entry(entry) for entry in entries]
    return pd.DataFrame(parsed_entries)

# Example Usage
if __name__ == "__main__":
    pdf_path = "document.pdf"
    df = process_pdf_to_dataframe(pdf_path)


In [4]:
df

Unnamed: 0,word,meaning
0,,
1,à,we. (
2,á,a noun-forming prefix attached to a phrasal ve...
3,a,"we ; us. (Indicating advice, urgency or appeal)."
4,aa,Short form for ‘awa’ Or : ‘agba’ : (Compliment...
...,...,...
3386,yoolo,"1. clean ; neatly. (Qualifies the verb, gwẹ̀ :..."
3387,yòòlò,1. to be deep and circular.
3388,yóóló,clearly ; indisputably ; undeniably. (Qualifie...
3389,yọ,1. to save ; to salvage ; to rescue ; to free.
