In [8]:
# ----------------Importing Dependencies------------------
import re
import pandas as pd

In [16]:
# --------- Filtering word, translation, pronunciation and part of speech ----------------
def olukumi_data(file):
    """Extract dictionary entries from a UTF-8 text file.

    An entry is recognised when a line starts with a headword followed by a
    bracketed pronunciation, a part-of-speech tag and a meaning, i.e.
    ``word [IPA] pos. meaning``. The meaning may span several lines; it ends
    where the next ``word [`` header begins or at the end of the text.

    Parameters
    ----------
    file : path of the text file; it is opened for reading as UTF-8.

    Returns
    -------
    list of 4-tuples of str
        ``(local_word, pronunciation, pos, meaning)`` for every entry, in
        file order. Captured text is returned as-is (the meaning keeps its
        trailing whitespace/newlines).
    """
    # Verbose-mode pattern: layout whitespace and '#' comments are ignored by
    # the regex engine, so the pattern documents itself line by line.
    entry_pattern = r"""
        ^([^\[\n]+?)\s*     # Local word (anything before [ )
        \[([^\]]+)\]\s+     # IPA in brackets
        ([a-z,\s]+\.?)\s*      # POS like n. v. adj.
        (.*?)               # Meaning: anything, including newlines
        (?=^[^\[\n]+?\s*\[|\Z) # Until next word [IPA] or end
    """
    # MULTILINE lets '^' anchor at each line start; DOTALL lets the meaning
    # group run across line breaks.
    entry_re = re.compile(entry_pattern, re.MULTILINE | re.DOTALL | re.VERBOSE)

    # Read the whole file at once; entries can span multiple lines.
    with open(file, 'r', encoding='utf-8') as handle:
        contents = handle.read()

    return entry_re.findall(contents)

In [None]:
# Parse the cleaned dictionary text and tabulate the four captured fields
# in a DataFrame (one row per matched entry).
matches = olukumi_data("../text_files/cleaned_text.txt")
df = pd.DataFrame(
    data=matches,
    columns=[
        'local_word',
        'pronunciation',
        'pos',
        'english_meaning',
    ],
)

In [18]:
# Preview the first five parsed entries
df.head(n=5)

Unnamed: 0,local_word,pronunciation,pos,english_meaning
0,ababe,ɑbɑbe,n.,poison. \n
1,ábe ̣́,ɑ́bɛ́,"adv, prep.",below. \n
2,ábe ̣́,ɑ́bɛ́,n.,bottom. \n
3,àbéké,ɑ̀béké,n.,knife. \n
4,abọrọkpọ,ɑbͻrͻkpͻ,n.,spinning wheel. \n


In [19]:
# Clean up the extracted text columns.

# Headword: trim surrounding whitespace, then drop any trailing ':' characters.
df['local_word'] = df['local_word'].str.strip().str.rstrip(':')

# Meaning: delete newlines, collapse each whitespace run to one space, trim the
# ends and drop trailing full stops.
# The whitespace pattern is a raw string: in a plain literal '\s' is an invalid
# escape sequence, which triggers a warning on current Python versions and is
# scheduled to become an error.
# NOTE(review): newlines are deleted rather than replaced by a space, so a line
# break with no adjacent space joins the two neighbouring words -- confirm this
# is intended for the input text.
df['english_meaning'] = (
    df['english_meaning']
    .str.replace('\n', '', regex=False)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.rstrip('.')
)


  df['english_meaning'] = df['english_meaning'].str.replace('\n','',regex=False).str.replace('\s+',' ',regex=True).str.strip().str.rstrip('.')


In [20]:
# Keep only the headword and its English meaning for the export
kept_columns = ['local_word', 'english_meaning']
df = df[kept_columns]

In [21]:
# Inspect the first twenty cleaned word/meaning pairs
df.head(n=20)

Unnamed: 0,local_word,english_meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,"hatchet, sword"
7,adan,bat
8,adé,crown
9,adele,house lizard


In [None]:
# Write the word/meaning table in three formats; the row index is left out
# of every file.
csv_path = '../data/processed_csv/olukumi_local_english.csv'
xlsx_path = '../data/processed_csv/olukumi_local_english.xlsx'
parquet_path = '../data/processed_csv/olukumi_local_english.parquet'

df.to_csv(csv_path, index=False)
df.to_excel(xlsx_path, index=False)
df.to_parquet(parquet_path, index=False)