In [97]:
import pandas as pd
import warnings
#Silencing every warning for the rest of the session
warnings.filterwarnings(action='ignore')

#Reading the word-list CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='olukumi_local_english.csv')

In [98]:
#Previewing Data
print("HEAD:\n")
print(df.head())
print("\n")

print("INFO:\n")
#df.info() writes its report to stdout itself and returns None, so it is called
#directly; wrapping it in print() emitted a stray "None" line after the report.
df.info()
print("\n")

print("DESCRIBE:\n")
print(df.describe())


HEAD:

    Local Word English Meaning
0        ababe          poison
1     ábe  ̣́           below
2     ábe  ̣́          bottom
3     àbéké           knife
4  abọrọkpọ  spinning wheel


INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Local Word       1668 non-null   object
 1   English Meaning  1662 non-null   object
dtypes: object(2)
memory usage: 26.2+ KB
None


DESCRIBE:

       Local Word English Meaning
count        1668            1662
unique       1625            1645
top           bí             fan
freq            3               2


Basic Data Cleaning

In [99]:
#Counting null entries in each column
df.isnull().sum()

Local Word         0
English Meaning    6
dtype: int64

In [100]:
#Removing every row that has a null in any column (df itself is modified)
df.dropna(axis=0, how='any', inplace=True)

#Re-counting nulls per column after the drop
df.isnull().sum()

Local Word         0
English Meaning    0
dtype: int64

In [101]:
#Collecting rows that exactly repeat an earlier row (first occurrences are not flagged)
duplicates = df.loc[df.duplicated(keep='first')]
print(duplicates)

Empty DataFrame
Columns: [Local Word, English Meaning]
Index: []


In [102]:
#Trimming surrounding whitespace from both text columns
for column in ('Local Word', 'English Meaning'):
    df[column] = df[column].str.strip()

#Turning empty strings into missing values, then counting missing values per column
df.replace(to_replace='', value=pd.NA, inplace=True)
print(df.isna().sum())

Local Word         0
English Meaning    0
dtype: int64


In [103]:
#Confirming Datatypes
#Prints the dtype pandas currently holds for each column of df.
print(df.dtypes)

Local Word         object
English Meaning    object
dtype: object


In [104]:
#Casting both columns to str in a single astype call
df = df.astype({'Local Word': 'str', 'English Meaning': 'str'})

In [105]:
df.head(n=15)

Unnamed: 0,Local Word,English Meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,"hatchet, sword"
7,adan,bat
8,adé,crown
9,adele,house lizard


Looking for strange characters or symbols

In [106]:
#Number of distinct values per column, one count per line
print(df['Local Word'].nunique(), df['English Meaning'].nunique(), sep='\n')

1620
1645


In [107]:
#Displaying the column labels of df
df.columns

Index(['Local Word', 'English Meaning'], dtype='object')

In [108]:
import re
from textblob import TextBlob

#Remove unwanted characters.
def clean_text(text):
    """Return ``text`` with every character outside the allowed set removed.

    Allowed characters:
      * ASCII letters (a-z, A-Z) and digits (0-9)
      * whitespace and hyphens
      * precomposed accented Latin letters: U+00C0-U+024F, plus Latin Extended
        Additional U+1E00-U+1EFF, which holds dot-below vowels such as "ọ"
      * combining diacritical marks U+0300-U+036F (tone marks and dots written
        as separate code points after the base letter)

    The last two ranges were missing from the original pattern, so it deleted
    the dotted vowels and combining marks that appear in the 'Local Word'
    column (for example "abọrọkpọ" became "abrkp").

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with all disallowed characters deleted; the remaining
        characters keep their original order.
    """
    return re.sub(r"[^a-zA-Z0-9\s\-\u00C0-\u024F\u0300-\u036F\u1E00-\u1EFF]", "", text)

df.head(n=15)

Unnamed: 0,Local Word,English Meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,"hatchet, sword"
7,adan,bat
8,adé,crown
9,adele,house lizard


In [109]:
#Splitting multiple English meanings (separated by commas) into lists.
#The lists are held in a separate Series instead of being written back into df:
#overwriting df["English Meaning"] made this cell non-idempotent, because a second
#run would stringify the lists ("['hatchet', ' sword']") and split that text again.
meanings_split = df["English Meaning"].astype(str).str.split(",")

#Expanding into a new DataFrame so each English meaning gets its own row
df_expanded = df.assign(**{"English Meaning": meanings_split}).explode("English Meaning")

#Removing leading and trailing spaces from each English meaning
df_expanded['English Meaning'] = df_expanded['English Meaning'].str.strip()

#Dropping empty fragments left behind by stray commas (e.g. "a,, b" or "a,")
df_expanded = df_expanded[df_expanded['English Meaning'] != '']

df_expanded.head(15)

Unnamed: 0,Local Word,English Meaning
0,ababe,poison
1,ábe ̣́,below
2,ábe ̣́,bottom
3,àbéké,knife
4,abọrọkpọ,spinning wheel
5,abo ̣́wo ̣́,armpit
6,àdá,hatchet
6,àdá,sword
7,adan,bat
8,adé,crown


Export Cleaned and Validated Data

In [110]:
#Writing the expanded dictionary to disk without the index column
df_expanded.to_csv(path_or_buf="validated_dictionary.csv", index=False)