In [652]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#loading Data
df = pd.read_csv('olukumi_local_english.csv')

Data Inspection

In [653]:
#Previewing Data
print("HEAD:\n")
print(df.head())
print("\n")

print("SHAPE:\n")
print(df.shape)
print("\n")

print("INFO:\n")
print(df.info())
print("\n")

print("DESCRIBE:\n")
print(df.describe())
print("\n")


print("DATATYPES:\n")
print(df.dtypes)

HEAD:

    Local Word English Meaning
0        ababe          poison
1     ábe  ̣́           below
2     ábe  ̣́          bottom
3     àbéké           knife
4  abọrọkpọ  spinning wheel


SHAPE:

(1668, 2)


INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Local Word       1668 non-null   object
 1   English Meaning  1662 non-null   object
dtypes: object(2)
memory usage: 26.2+ KB
None


DESCRIBE:

       Local Word English Meaning
count        1668            1662
unique       1625            1645
top           bí             fan
freq            3               2


DATATYPES:

Local Word         object
English Meaning    object
dtype: object


In [654]:
#Checking for missing values
df.isna().sum()

Local Word         0
English Meaning    6
dtype: int64

In [655]:
#Dropping missing values
df.dropna(inplace = True)

df.isna().sum()

Local Word         0
English Meaning    0
dtype: int64

In [656]:
#Checking For Duplicates
duplicates = df[df.duplicated()]
print(duplicates)

Empty DataFrame
Columns: [Local Word, English Meaning]
Index: []


In [657]:
#Checking For Empty strings or white space
df['Local Word'] = df['Local Word'].str.strip()
df['English Meaning'] = df['English Meaning'].str.strip()

df.replace('',pd.NA,inplace=True)
print(df.isna().sum())

Local Word         0
English Meaning    0
dtype: int64


In [658]:
#Confirming Datatypes
print(df.dtypes)

Local Word         object
English Meaning    object
dtype: object


In [659]:
#making sure datatypes are all strings
df['Local Word'] = df['Local Word'].astype('str')
df['English Meaning'] = df['English Meaning'].astype('str')

In [660]:
df.head(15)

Unnamed: 0,Local Word,English Meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,"hatchet, sword"
7,adan,bat
8,adé,crown
9,adele,house lizard


Looking for strange characters or symbols

In [661]:
import re
from textblob import TextBlob

#Remove unwanted characters.
def clean_text(text):
    # Keep: letters (a-z, A-Z), digits (0-9), spaces, hyphens, and accented letters (Latin-1 and Latin Extended)
    return re.sub(r"[^a-zA-Z0-9\s\-\u00C0-\u024F]", "", text)

df.head(15)

Unnamed: 0,Local Word,English Meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,"hatchet, sword"
7,adan,bat
8,adé,crown
9,adele,house lizard


In [662]:
#Splitting multiple English meanings (separated by commas) into lists
df["English Meaning"] = df["English Meaning"].astype(str).str.split(",")

#Expanding the DataFrame so each English meaning gets its own row
df_expanded = df.explode("English Meaning")

#Removing leading and trailing spaces from each English meaning
df_expanded['English Meaning'] = df_expanded['English Meaning'].str.strip()

df_expanded.head(15)

Unnamed: 0,Local Word,English Meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,hatchet
6,àdá,sword
7,adan,bat
8,adé,crown


Check for Language Consistency

In [663]:
# Import the Natural Language Toolkit and the English word list
import nltk
from nltk.corpus import words

# Download the word list
nltk.download('words')

# Create a set of valid English words for fast lookup
english_vocab = set(words.words())

# Define a function to check if a given word is not in the English word list
def is_misspelled(word):
    return str(word).lower() not in english_vocab

# Apply the misspelling check across the "English Meaning" column
# Count how many words are not recognized as valid English
misspelled_count = df_expanded["English Meaning"].apply(is_misspelled).sum()

# Display the total number of misspelled or unrecognized English words
print("Number of badly spelled words:", misspelled_count)

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Emigb\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Number of badly spelled words: 181


In [664]:
# Identify and extract all misspelled words from the "English Meaning" column.
misspelled_words = df_expanded["English Meaning"][df_expanded["English Meaning"].apply(is_misspelled)]

# Convert to lowercase, remove any missing values (NaN), and eliminate duplicates.
unique_misspelled = misspelled_words.dropna().str.lower().unique()

# Convert the result to a list and print it for easier viewing.
print(list(unique_misspelled))


['spinning wheel', 'house lizard', 'bawdy', 'horriplate', 'for storage', 'fish basket', 'horn pipe', "adam's apple", 'in law', 'brother-in-law', 'old age', 'weaver bird', 'bee wax', 'sole of foot', 'palm of hand', 'cauldron', 'wooden plank', 'mint leaf', 'unmoveable', 'old days', 'great grandfather', 'as usual', 'etc', 'as if', 'according to', 'lessen. cháchá', 'self-defence', 'waist thread', 'half dried', 'vapour', "coconut's oil", 'mole rat', 'half boil', 'country people', 'clod of earth', 'hood of serpent', 'earlobe', 'tooth ache', 'body part', 'neighbour', 'fish trap of conical shape', 'centre', 'chicken egg', 'ear drum', 'embers', 'boil. ẹ́ ghó àyáká 4', 'eye ball', 'bride price', 'nook and cranny', 'palm wine', 'garden lizard', 'shop keeper', 'pick pocket', 'hunter', 'group leader', 'labourer', 'nape of the neck', 'backyard', 'day before yesterday', 'gills', 'proud', 'hunger strike', 'current of water', 'forced labour', 'distil', 'fart', 'gun powder', 'whimp', 'chicken pox

In [665]:
#Removing Rows where the English column contains Local words

import unicodedata

# Making sure all entries in the "English Meaning" column are treated as strings
df['English Meaning'] = df['English Meaning'].astype(str)

# Function to detect accented characters in a word
def contains_accented(word):
    # Decompose characters into base and accent using Unicode normalization
    normalized = unicodedata.normalize('NFD', word)
    # Return True if any decomposed character is a nonspacing mark (i.e., accent)
    return any(char for char in normalized if unicodedata.category(char) == 'Mn')

# Check each word in the column for accents and store result in a new column
df_expanded['has_accented'] = df_expanded['English Meaning'].apply(contains_accented)

df_expanded[df_expanded['has_accented']==True]

Unnamed: 0,Local Word,English Meaning,has_accented
194,bùwálẹ̀,lessen. cháchá,True
347,e ̣́ghíghó,boil. ẹ́ ghó àyáká 4,True
1017,núrún gede,ó ne ne kòkò,True
1232,ọmómàne ̣́bámàne ̣́ye,orphan. ọmoníle ̣̀ adj. aboriginal,True


In [666]:
# Removing all rows where an accented character was found
df_cleaned = df_expanded[~df_expanded['has_accented']].copy()

#Making sure all accented characters have been cleaned from the
df_cleaned[df_cleaned['has_accented']==True]

Unnamed: 0,Local Word,English Meaning,has_accented


In [667]:
#Isolating erroneous rows after manual inspection
df_cleaned.iloc[[909, 1459, 1752, -1]]

Unnamed: 0,Local Word,English Meaning,has_accented
751,irégúgún,ghost. ire runo kan 11,False
1204,ọkọ,husband. o kpan ekpome 28,False
1445,ùkán,(n). different(ce),False
1667,zùyà,suffer. View publication stats,False


In [668]:
df_cleaned.loc[1459, 'English Meaning'] = df_cleaned.loc[1459, 'English Meaning'].split('.')[0].strip()

In [669]:
#Isolating erroneous rows after manual inspection
df_cleaned.iloc[[909, 1459, 1752, -1]]

Unnamed: 0,Local Word,English Meaning,has_accented
751,irégúgún,ghost. ire runo kan 11,False
1204,ọkọ,husband. o kpan ekpome 28,False
1445,ùkán,(n). different(ce),False
1667,zùyà,suffer. View publication stats,False


In [670]:
df_cleaned.loc[df_cleaned["Local Word"] == "irégúgún", "English Meaning"] = "ghost"
df_cleaned.loc[df_cleaned["Local Word"] == "ọkọ", "English Meaning"] = "husband"
df_cleaned.loc[df_cleaned["Local Word"] == "ùkán", "English Meaning"] = "different"
df_cleaned.loc[df_cleaned["Local Word"] == "zùyà", "English Meaning"] = "suffer"

In [671]:
#Isolating erroneous rows after manual inspection
df_cleaned.iloc[[909, 1459, 1752, -1]]

Unnamed: 0,Local Word,English Meaning,has_accented
751,irégúgún,ghost,False
1204,ọkọ,husband,False
1445,ùkán,different,False
1667,zùyà,suffer,False


In [672]:
df_cleaned = df_cleaned.drop_duplicates()

In [673]:
df_cleaned['English Meaning'] = df_cleaned['English Meaning'].str.lower()
df_cleaned.shape

(2024, 3)

In [674]:
# Drop the temporary helper column and reset row index
df_cleaned.drop(columns=['has_accented'], inplace=True)
df_cleaned.reset_index(drop=True, inplace=True)

In [675]:
# Display the first few cleaned records
df_cleaned.head()

Unnamed: 0,Local Word,English Meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel


 Export Cleaned and Validated Data

In [677]:
df_cleaned.to_csv("olukumi_validated_copy.csv", index=False)