# Importing **Dependencies** and Setting Directory

In [15]:
import fitz  # PyMuPDF
import re
import os
import sys
sys.path.append(os.path.abspath("../"))

# Data Extraction and Cleaning

In [16]:
# ---------------- Extraction function definition and saving as .txt ---------------------
def extract_pdf_text(pdf_path):
    """Extract the text of every page of a PDF using the PyMuPDF library.

    Args:
        pdf_path (str): Path of the PDF file to read.

    Returns:
        str: The text of all pages, concatenated in page order
        (an empty string if the document has no pages).
    """
    # Opening the document as a context manager guarantees it is closed
    # (releasing the underlying file handle) even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

pdf_path = "../data/raw/Olukumi-EnglishPUBLISHNEW.pdf"
text = extract_pdf_text(pdf_path)

# Save the extracted text to a .txt file.
# Create the output folder first so the cell also works on a fresh checkout.
os.makedirs("../text_files", exist_ok=True)
with open("../text_files/raw_text.txt", "w", encoding="utf-8") as f:
    f.write(text)

# len(text) counts characters, not words, so the message is labelled accordingly.
print(f"All text copied from PDF and saved to raw_text.txt, with {len(text)} characters")


All text copied from PDF and saved to raw_text.txt, with 65274 number of words


In [18]:
# -------------------- Clean text function definition and saving as .txt ------------------
def clean_text(text):
    """Strip boilerplate from extracted text with a fixed sequence of regex passes.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The text with URLs, "Page N" markers and "Month D, YYYY" dates
        removed, and blank (whitespace-only) lines dropped.
    """
    # Applied in order; each entry is (pattern, replacement).
    substitutions = (
        # http:// and https:// links, up to the next whitespace
        (r'https?://\S+', ''),
        # page markers such as "Page 1", "Page 2"
        (r'Page\s*\d+', ''),
        # dates written like "July 31, 2025"
        (r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}', ''),
        # a newline followed by blank/whitespace-only lines becomes one newline
        (r'\n\s*\n+', '\n'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Load the raw text from the file
with open("../text_files/raw_text.txt", "r", encoding="utf-8") as f:
    raw = f.read()

# Clean the text
cleaned = clean_text(raw)

# Save the cleaned text to a new file
with open("../text_files/cleaned_text.txt", "w", encoding="utf-8") as f:
    f.write(cleaned)

# len(cleaned) counts characters, not words, so the message is labelled accordingly.
print(f"Cleaned text saved to cleaned_text.txt, with {len(cleaned)} characters")


Cleaned text saved to cleaned_text.txt, with 64552 number of words


In [19]:
# Read the cleaned text back from disk to check what was saved
with open("../text_files/cleaned_text.txt", mode="r", encoding="utf-8") as f:
    content = f.read()

# Preview only the first 1000 characters rather than the whole file
print(content[0:1000])


See discussions, stats, and author profiles for this publication at: 
Olukumi Bilingual Dictionary
Book · July 2017
CITATIONS
0
READS
1,156
2 authors:
Bolanle Elizabeth Arokoyo
University of Ilorin
44 PUBLICATIONS   42 CITATIONS   
SEE PROFILE
Olamide Mabodu
Nottingham Trent University
5 PUBLICATIONS   0 CITATIONS   
SEE PROFILE
All content following this page was uploaded by Bolanle Elizabeth Arokoyo on 10 May 2020.
The user has requested enhancement of the downloaded file.
ababe 
1 
OLÙKÙMI-ENGLISH
ababe 
1 
A - a 
ababe   [ɑbɑbe] n. poison. 
ábe  ̣́   [ɑ́bɛ́] adv, prep. below. 
ábe  ̣́   [ɑ́bɛ́] n. bottom. 
àbéké   [ɑ̀béké] n. knife. 
abọrọkpọ  [ɑbͻrͻkpͻ] n. 
spinning wheel. 
abo  ̣́wo  ̣́   [ɑbͻ́wͻ́] n. armpit. 
àdá   [ɑ̀dɑ́] n. hatchet, sword. 
adan  [ɑdɑ̃] n. bat. 
adé   [ɑdé] n. crown. 
adele   [ɑdele] n. house 
lizard. 
ade  ̣́n   [ɑdɛ̃́] adj. fried. 
adidun   [ɑdidũ] adj. stingy. 
afán   [ɑfɑ̃́] n. gun. 
afo  ̣̀máyàsé  [ɑfͻ̀mɑ́jɑ̀sé] v. 
bluff. 
afọsẹ   [