In [1]:
import pandas as pd
import re, unicodedata

In [2]:
# Load extracted dictionary file
df = pd.read_csv("ibani-extracted_text main.csv")

In [3]:
# Drop unnecessary columns
if "pos" in df.columns:
    df = df.drop(columns=["pos"])

In [4]:
# Drop missing rows
df = df.dropna(subset=["headword", "gloss"])

In [5]:
# Cleaning function
def clean_word(text):
    text = unicodedata.normalize("NFKC", str(text))   # normalize Unicode (accents, special forms)
    text = re.sub(r"[^a-zA-Z0-9À-ÿ\s\-]", " ", text) # keep only letters, numbers, spaces, hyphen
    text = re.sub(r"\s+", " ", text)                 # collapse multiple spaces into one
    return text.strip().lower()                      # remove extra spaces & lowercase

In [6]:
# Apply cleaning to columns
df["headword"] = df["headword"].apply(clean_word)  # Ibani word
df["gloss"] = df["gloss"].apply(clean_word)        # English gloss

In [7]:
# Simplify gloss: keep only first English word
df["gloss"] = df["gloss"].apply(lambda x: x.split()[0] if isinstance(x, str) else x)

print("Dataset size:", df.shape)
df.head()

Dataset size: (2889, 2)


Unnamed: 0,headword,gloss
1,a,i
2,á,she
3,áa,exclamation
4,aba,forked
5,aba,hammer-headed


In [8]:
# Convert into dictionary mapping
ibani_to_eng = dict(zip(df["headword"], df["gloss"]))

In [9]:
# Example lookups
print("bari →", ibani_to_eng.get("bari", "[Not found]"))
print("ama →", ibani_to_eng.get("ama", "[Not found]"))

bari → [Not found]
ama → male


In [10]:
from collections import defaultdict

eng_to_ibani = defaultdict(list)

for _, row in df.iterrows():
    eng_to_ibani[row["gloss"]].append(row["headword"])

# Example lookups
print("good →", eng_to_ibani.get("good", ["[Not found]"]))
print("land →", eng_to_ibani.get("land", ["[Not found]"]))


good → ['dede r á', 'ibiamíénye', 'ibi dúmá', 'ibi fúbárá', 'ibisuó', 'báámá']
land → ['k r -ár']


In [11]:
while True:
    word = input("Enter a word (type 'quit' to stop): ").strip().lower()
    if word == "quit":
        break
    if word in ibani_to_eng:
        print(word, "→", ibani_to_eng[word])        # Ibani → English
    elif word in eng_to_ibani:
        print(word, "→", eng_to_ibani[word])        # English → Ibani
    else:
        print("[Word not found in dictionary]")


Enter a word (type 'quit' to stop):  aba


aba → hammer-headed


Enter a word (type 'quit' to stop):  man


man → ['ówítúw']


Enter a word (type 'quit' to stop):  girl


girl → ['amak gháa', 'ama l gháa', 'r ab', 'r anab', 'rùb túw']


Enter a word (type 'quit' to stop):  r ab


r ab → girl


Enter a word (type 'quit' to stop):  rub tuw


[Word not found in dictionary]


KeyboardInterrupt: Interrupted by user