# Importing Dependencies and Loadiong data

In [20]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import nltk

# Conditionally download the 'words' corpus if not present
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')


#loading Data
df = pd.read_csv('../data/processed_csv/olukumi_local_english.csv')

[nltk_data] Error loading words: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


# Data Inspection, EDA and validation

In [2]:
# Previewing Data
print("HEAD:\n")
print(df.head())
print("\n")

print("SHAPE:\n")
print(df.shape)
print("\n")

print("INFO:\n")
print(df.info())
print("\n")

print("DESCRIBE:\n")
print(df.describe())
print("\n")


print("DATATYPES:\n")
print(df.dtypes)

HEAD:

    local_word english_meaning
0        ababe          poison
1     ábe  ̣́           below
2     ábe  ̣́          bottom
3     àbéké           knife
4  abọrọkpọ  spinning wheel


SHAPE:

(1668, 2)


INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   local_word       1668 non-null   object
 1   english_meaning  1662 non-null   object
dtypes: object(2)
memory usage: 26.2+ KB
None


DESCRIBE:

       local_word english_meaning
count        1668            1662
unique       1625            1646
top        kòkò          bellow
freq            3               2


DATATYPES:

local_word         object
english_meaning    object
dtype: object


In [3]:
# Checking for missing values
df.isna().sum()

local_word         0
english_meaning    6
dtype: int64

In [4]:
# Dropping missing values
df.dropna(inplace = True)

df.isna().sum()

local_word         0
english_meaning    0
dtype: int64

In [5]:
#Checking For Duplicates
duplicates = df[df.duplicated()]
print(duplicates)

Empty DataFrame
Columns: [local_word, english_meaning]
Index: []


In [6]:
df.columns

Index(['local_word', 'english_meaning'], dtype='object')

In [7]:
#Checking For Empty strings or white space
df['local_word'] = df['local_word'].str.strip()
df['english_meaning'] = df['english_meaning'].str.strip()

df.replace('',pd.NA,inplace=True)
print(df.isna().sum())

local_word         0
english_meaning    0
dtype: int64


In [8]:
# Confirming Datatypes of columns
print(df.dtypes)

local_word         object
english_meaning    object
dtype: object


In [9]:
# making sure datatypes are all strings
df['local_word'] = df['local_word'].astype('str')
df['english_meaning'] = df['english_meaning'].astype('str')

In [10]:
df.head(15)

Unnamed: 0,local_word,english_meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,"hatchet, sword"
7,adan,bat
8,adé,crown
9,adele,house lizard


Looking for strange characters or symbols

In [None]:
import re
from textblob import TextBlob

#Remove unwanted characters.
def clean_text(text):
    # Keep: letters (a-z, A-Z), digits (0-9), spaces, hyphens, and accented letters (Latin-1 and Latin Extended)
    return re.sub(r"[^a-zA-Z0-9\s\-\u00C0-\u024F]", "", text)

In [12]:
#Splitting multiple English meanings (separated by commas) into lists
df["english_meaning"] = df["english_meaning"].astype(str).str.split(",")

#Expanding the DataFrame so each English meaning gets its own row
df_expanded = df.explode("english_meaning")

#Removing leading and trailing spaces from each English meaning
df_expanded['english_meaning'] = df_expanded['english_meaning'].str.strip()

df_expanded.head(15)

Unnamed: 0,local_word,english_meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,hatchet
6,àdá,sword
7,adan,bat
8,adé,crown


# Checking for Language Consistency

In [None]:
import nltk
#  Download if 'words' corpus is not available
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [34]:
# importing words
from nltk.corpus import words

# Create a set of valid English words for fast lookup
english_vocab = set(words.words())
print(f"Total Words: {len(english_vocab)}")

Total Words: 235892


In [35]:
def is_misspelled(word):
    """check if a given word is not in the English word list

    Args:
        word (_str_): english words in the english meaning column

    Returns:
        (_str_): misspelled words
    """
    return str(word).lower() not in english_vocab

# Apply the misspelling check across the "English Meaning" column
# Count how many words are not recognized as valid English
misspelled_count = df_expanded["english_meaning"].apply(is_misspelled).sum()

# Display the total number of misspelled or unrecognized English words
print(f"\nNumber of badly spelled words: {misspelled_count}")


Number of badly spelled words: 228


In [37]:
# Identify and extract all misspelled words from the "English Meaning" column.
misspelled_words = df_expanded["english_meaning"][df_expanded["english_meaning"].apply(is_misspelled)]

# Convert to lowercase, remove any missing values (NaN), and eliminate duplicates.
unique_misspelled = misspelled_words.dropna().str.lower().unique()

# Convert the result to a list and print it for easier viewing.
print(list(unique_misspelled))


['spinning wheel', 'house lizard', 'bawdy', 'wing. akakara 2', 'horriplate', 'for storage', 'fish basket', 'horn pipe', "adam's apple", 'fan. àkwàn 3', 'in law', 'brother-in-law', 'old age', 'weaver bird', 'rag. áshíwín 4', 'bee wax', 'sole of foot', 'palm of hand', 'cauldron', 'wooden plank', 'mint leaf', 'unmoveable', 'old days', 'witch. ba ba mì lílá 5 b - b', 'great grandfather', 'as usual', 'etc', 'as if', 'according to', 'twinkle. búkú 6', 'lessen. cháchá 7 c - c', 'bedbug. dà 8 d - d', 'self-defence', 'pass. e - e', 'waist thread', 'half dried', 'vapour', "coconut's oil", 'mole rat', 'half boil', 'poetry. eregugu 2', 'country people', 'clod of earth', 'hood of serpent', 'earlobe', 'tooth ache', 'stable. e bíbì 3 ẹ - ẹ', 'body part', 'neighbour', 'fish trap of conical shape', 'centre', 'chicken egg', 'ear drum', 'embers', 'boil. ẹ́ ghó àyáká 4', 'eye ball', 'bride price', 'nook and cranny', 'palm wine', 'garden lizard', 'lover. e ne wíwe ̣́n 5', 'examiner. 

In [38]:
#Removing Rows where the English column contains Local words

import unicodedata

# Making sure all entries in the "English Meaning" column are treated as strings
df['english_meaning'] = df['english_meaning'].astype(str)

# Function to detect accented characters in a word
def contains_accented(word):
    # Decompose characters into base and accent using Unicode normalization
    normalized = unicodedata.normalize('NFD', word)
    # Return True if any decomposed character is a nonspacing mark (i.e., accent)
    return any(char for char in normalized if unicodedata.category(char) == 'Mn')

# Check each word in the column for accents and store result in a new column
df_expanded['has_accented'] = df_expanded['english_meaning'].apply(contains_accented)

df_expanded[df_expanded['has_accented']==True]

Unnamed: 0,local_word,english_meaning,has_accented
77,àkúpe ̣́,fan. àkwàn 3,True
117,àshásọ,rag. áshíwín 4,True
154,àze ̣́n,witch. ba ba mì lílá 5 B - b,True
191,bù,twinkle. búkú 6,True
194,bùwálẹ̀,lessen. cháchá 7 C - c,True
197,chinch,bedbug. dà 8 D - d,True
309,ézìnzìn,stable. e bíbì 3 Ẹ - ẹ,True
347,e ̣́ghíghó,boil. ẹ́ ghó àyáká 4,True
387,ẹnẹte ̣́dó,lover. e ne wíwe ̣́n 5,True
424,ẹnísọ́ ghò,examiner. e nísózúmézìn 6,True


In [39]:
# Removing all rows where an accented character was found
df_cleaned = df_expanded[~df_expanded['has_accented']].copy()

#Making sure all accented characters have been cleaned from the
df_cleaned[df_cleaned['has_accented']==True]

Unnamed: 0,local_word,english_meaning,has_accented


In [40]:
#Isolating erroneous rows after manual inspection
df_cleaned.iloc[[909, 1459, 1752, -1]]

Unnamed: 0,local_word,english_meaning,has_accented
765,ishara,ear of corn,False
1226,ọmobìnrẹn,daughter,False
1473,ùlà,brand,False
1667,zùyà,suffer. View publication stats,False


In [42]:
df_cleaned.loc[1459, 'english_meaning'] = df_cleaned.loc[1459, 'english_meaning'].split('.')[0].strip()

In [43]:
#Isolating erroneous rows after manual inspection
df_cleaned.iloc[[909, 1459, 1752, -1]]

Unnamed: 0,local_word,english_meaning,has_accented
765,ishara,ear of corn,False
1226,ọmobìnrẹn,daughter,False
1473,ùlà,brand,False
1667,zùyà,suffer. View publication stats,False


In [45]:
df_cleaned.loc[df_cleaned["local_word"] == "irégúgún", "english_meaning"] = "ghost"
df_cleaned.loc[df_cleaned["local_word"] == "ọkọ", "english_meaning"] = "husband"
df_cleaned.loc[df_cleaned["local_word"] == "ùkán", "english_meaning"] = "different"
df_cleaned.loc[df_cleaned["local_word"] == "zùyà", "english_meaning"] = "suffer"

In [46]:
#Isolating erroneous rows after manual inspection
df_cleaned.iloc[[909, 1459, 1752, -1]]

Unnamed: 0,local_word,english_meaning,has_accented
765,ishara,ear of corn,False
1226,ọmobìnrẹn,daughter,False
1473,ùlà,brand,False
1667,zùyà,suffer,False


In [47]:
df_cleaned = df_cleaned.drop_duplicates()

In [49]:
df_cleaned['english_meaning'] = df_cleaned['english_meaning'].str.lower()
df_cleaned.shape

(1980, 3)

In [50]:
# Drop the temporary helper column and reset row index
df_cleaned.drop(columns=['has_accented'], inplace=True)
df_cleaned.reset_index(drop=True, inplace=True)

In [51]:
# Display the first few cleaned records
df_cleaned.head()

Unnamed: 0,local_word,english_meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel


 Export Cleaned and Validated Data

In [52]:
df_cleaned.to_csv("final_olukumi_validated.csv", index=False)