In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import unicodedata  # To normalize diacritics representations
import wordninja # Handles misconcatenated words
from collections import Counter


nltk.download("punkt_tab") # For tokenizing
nltk.download("brown") # English corpus to distinguish English instances from Itsekiri

from nltk.corpus import brown
from nltk.tokenize import word_tokenize # word tokenizer

[nltk_data] Downloading package punkt_tab to C:\Users\BUYPC
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package brown to C:\Users\BUYPC
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
# Dataset loading

url = r"https://docs.google.com/spreadsheets/d/1ASDz4mNSjGLqsMCZZigAsDgXtKATHIKZC2L3ytMRhYw/export?format=csv"
data = pd.read_csv(url)

data

Unnamed: 0,target,translation
0,Abaghari,blood.
1,Abalẹmaku,unbreakable pans of allsorts usually aluminum.
2,Aban,a “u” shaped metal used in floating timbers. A...
3,Abaọnẹje,gossipmonger
4,Abata,Incompatible. e.g. ‘Ma gba abata gua obiti’.
...,...,...
4079,Ẹkpetin-alawari,radio.
4080,Ukpali-ẹgho,video tape.
4081,Ukpali-owun,audio tape cassette.
4082,we nemi wa gba gbẹ ẹja ni adagba mi,you cannot take fish from my adagba.


In [3]:
# There are apparently six rows with missing values in the translation column

print(data.info())

# Inspecting rows with missing values
print(data[data["translation"].isna()==True])

# Row 1144 is missing the delimiter between the target and translation fields
# We can easily resolve with
malformed_row = data.iloc[1144]["target"]
data.loc[1144, "target"] = malformed_row[:len("Girigiri")]
data.loc[1144, "translation"] = malformed_row[len("Girigiri"):]

print("\n", data.iloc[1144])

# Just five other rows with missing values, we drop them
data = data.dropna()

print("\n", data.info())
# No more nulls

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4084 entries, 0 to 4083
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   target       4084 non-null   object
 1   translation  4078 non-null   object
dtypes: object(2)
memory usage: 63.9+ KB
None
                              target translation
1144  Girigiriforceful. Also grigri.         NaN
1256                           Idọlọ         NaN
3476        ‘A ka wuu ni wun origho’         NaN
3890                           Ọkẹnẹ         NaN
3891                           Okeji         NaN
3892                          Ọkẹẹta         NaN

 target                       Girigiri
translation    forceful. Also grigri.
Name: 1144, dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 4079 entries, 0 to 4083
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   target       4079 non-null   object
 1   translat

Observations on first inspection.

- Several English words mistakenly concatenated together within definitions
- Several translations have Itsekiri words and/or phrases embedded within them. Some are examples, others are antonyms
- This dataset strongly resembles a dictionary ("Also see", "Antonym")

I have no idea if the latter two observations are by design. I am frankly unsure what model besides a pre-trained LLM those inclusions would benefit.

Course of action:
- Diacritics normalization: The paper, "Don't Touch My Diacritics" [1] argues removing diacritics removes quite useful information and suggests we normalize them instead by making sure they are consistently encoded.
- Correcting misconcatenated words: Splitting words into subwords e.g "Ibegged" -> "I begged"
- Extracting Itsekiri instances from definitions (appending them as examples to test on possibly)
- Lowercasing
- Punctuation stripping (much more tentative frankly)

In [4]:
# Why we should do diacritic normalization

# Consider the following words in the "target column": "Abaọnẹje" and "Akanfọ"
word_3 = data.loc[3, "target"]
word_4 = data.loc[101, "target"]
print(word_3)
print(word_4)

# Now let us print the letter o with a dot below in both words
# ...It is the fourth character in "Abaọnẹje" and
# ...the sixth in "Akanfọ"
print("4th Character in Abaọnẹje: ", word_3[3])
print("6th Character in Akanfọ: ", word_4[5])

# What we observe is characters with diacritics are not encoded consistently
# The character encoding of "ọ" in "Abaọnẹje" is not the same as that in "Akanfọ"
# The first is a single character while the latter is composed of two glyphs...
# ... They are not equivalent.

print("ọ" == "ọ")

Abaọnẹje
Akanfọ
4th Character in Abaọnẹje:  ọ
6th Character in Akanfọ:  o
False


In [5]:
#Diacritics normalization ensures all letters with diacritics are encoded consistently
def normalize_diacritics(df, column):
    """
    Ensures all glyphs and diacritics are consistently encoded
    Args:
        df (pd.DataFrame): the dataset
        column (str): the name of the column to modify
    Return:
        modified DataFrame
    """

    def normalize(text):
        """Converts all characters to Normalized Form C encoding"""
        return unicodedata.normalize("NFC", text)

    df_copy = df.copy()
    df_copy[column+"_normalized"] = df_copy[column].apply(normalize)
    return df_copy




In [6]:
normal_data = normalize_diacritics(data, "target")

# Prints every cell with different representations of diacritics from our normalized ones
normal_data[normal_data["target"]!=normal_data["target_normalized"]]

Unnamed: 0,target,translation,target_normalized
101,Akanfọ,camphor.,Akanfọ
106,Akatọ,forehead.,Akatọ
108,Akidọ,monkey.,Akidọ
111,Akikọ,immature snail or a small snail.,Akikọ
942,Ẹwọ,once,Ẹwọ
...,...,...,...
4047,Uge-igbawọ,convocation ceremony.,Uge-igbawọ
4053,Ọgbọkunamọ,pilot.,Ọgbọkunamọ
4056,Uja-ọşẹ,boxing.,Uja-ọşẹ
4057,Ọja-ọşẹ,boxer.,Ọja-ọşẹ


In [7]:
# Lowercasing
def lowercase(df, cols):
    """Converts documents in given columns of dataframe to lowercase
    Args:
        df (pd.DataFrame): DataFrame to modify
        cols (list): list of columns to modify
    Returns:
        modified DataFrame"""
    df_copy = df.copy()
    return df_copy[cols].apply(lambda x: x.str.lower())


In [8]:
low_norm_data = lowercase(normal_data, normal_data.columns)
low_norm_data
low_norm_data.loc[1752, "translation"]

'ibegged but you did not agree. antonym kọkọ.'

In [9]:
# Tokenizing text

def tokenize(df: pd.DataFrame, cols: list|None = None) -> pd.DataFrame:
    """Tokenizes text"
    Args:
        df (pd.DataFrame): DataFrame to modify
        cols (list): list of columns to modify, modifies all if None
    Returns:
        modified DataFrame columns"""

    df_copy = df.copy()

    if cols==None:
        return df_copy.apply(lambda x: x.apply(word_tokenize))

    return df_copy[cols].apply(lambda x: x.apply(word_tokenize))


In [10]:
remove_dot = lambda x: re.sub(r"[.]", " ", x)
remove_dot("genuflect.Also see yiniudakun.")

'genuflect Also see yiniudakun '

In [11]:
low_norm_data["translation_tokens"] = tokenize(low_norm_data, ["translation"])
low_norm_data

Unnamed: 0,target,translation,target_normalized,translation_tokens
0,abaghari,blood.,abaghari,"[blood, .]"
1,abalẹmaku,unbreakable pans of allsorts usually aluminum.,abalẹmaku,"[unbreakable, pans, of, allsorts, usually, alu..."
2,aban,a “u” shaped metal used in floating timbers. a...,aban,"[a, “, u, ”, shaped, metal, used, in, floating..."
3,abaọnẹje,gossipmonger,abaọnẹje,[gossipmonger]
4,abata,incompatible. e.g. ‘ma gba abata gua obiti’.,abata,"[incompatible, ., e.g, ., ‘, ma, gba, abata, g..."
...,...,...,...,...
4079,ẹkpetin-alawari,radio.,ẹkpetin-alawari,"[radio, .]"
4080,ukpali-ẹgho,video tape.,ukpali-ẹgho,"[video, tape, .]"
4081,ukpali-owun,audio tape cassette.,ukpali-owun,"[audio, tape, cassette, .]"
4082,we nemi wa gba gbẹ ẹja ni adagba mi,you cannot take fish from my adagba.,we nemi wa gba gbẹ ẹja ni adagba mi,"[you, can, not, take, fish, from, my, adagba, .]"


In [12]:
def generate_wordlist(threshold: int) -> set:
    """Generates a set of the most common English words, set size is equal
    to threshold
    Args:
        threshold (int): Number of most common words to include
    Return:
        set: most common N words, n = threshold"""
    common_words = brown.words()
    word_freq = Counter(common_words)

    common_word_list = {word.lower() for word, _ in word_freq.most_common(threshold)}
    return common_word_list

wordlist = generate_wordlist(20000)

In [13]:
print(wordlist)




In [14]:
def build_Itsekiri_lexicon(df, col):
    df_copy = df.copy()
    df = tokenize(df_copy[col].to_frame())
    all_tokens = sum(df[col], [])
    return set([token for token in all_tokens if token not in string.punctuation])

def word_split(tokens: list, wordlist: set, itskr_lexicon: set) -> list:
    """Splits wrongly concatenated words into the correct individual subwords
       e.g "Ibegged" -> "I begged", "Isextended" -> "is extended"
    Args:
        tokens (list): list of strings
        wordlist (set): collection of English words
    Return:
        list: list of tokens with corrections"""

    def isValidSubword(lst: list) -> bool:
        """Checks if list elements are English words
        Args:
            lst (list | str): string or list of strings
        Returns:
            True if all words in list are in wordlist, else False
        """

        single_char = {"i", "a"}

        for word in lst:
            if word not in wordlist:
                return False

            # Handles cases of splits into meaningless single characters
            # ...present in wordlist
            if len(word) < 2 and word not in single_char:
                return False
        return True

    new_tokens = []

    for token in tokens:
        if token in itskr_lexicon:
            new_tokens.append(token) # Append tokens we know to be Itsekiri
        elif token not in wordlist:
            subwords = wordninja.split(token) # Splits token into possible subwords
            if isValidSubword(subwords):
                new_tokens.extend(subwords) # Add subwords to new token list if all are known English words
            else:
                new_tokens.append(token) # Otherwise append the original word to new token list
        else:
            new_tokens.append(token)

    return new_tokens




In [15]:
itskr_lexicon =  build_Itsekiri_lexicon(low_norm_data, "target_normalized")

In [16]:
low_norm_data["tokens_typo_correct"] = low_norm_data["translation_tokens"].apply(lambda x: word_split(x, wordlist, itskr_lexicon))
low_norm_data

Unnamed: 0,target,translation,target_normalized,translation_tokens,tokens_typo_correct
0,abaghari,blood.,abaghari,"[blood, .]","[blood, .]"
1,abalẹmaku,unbreakable pans of allsorts usually aluminum.,abalẹmaku,"[unbreakable, pans, of, allsorts, usually, alu...","[unbreakable, pans, of, all, sorts, usually, a..."
2,aban,a “u” shaped metal used in floating timbers. a...,aban,"[a, “, u, ”, shaped, metal, used, in, floating...","[a, u, shaped, metal, used, in, floating, timb..."
3,abaọnẹje,gossipmonger,abaọnẹje,[gossipmonger],[gossipmonger]
4,abata,incompatible. e.g. ‘ma gba abata gua obiti’.,abata,"[incompatible, ., e.g, ., ‘, ma, gba, abata, g...","[incompatible, ., e.g, ., ‘, ma, gba, abata, g..."
...,...,...,...,...,...
4079,ẹkpetin-alawari,radio.,ẹkpetin-alawari,"[radio, .]","[radio, .]"
4080,ukpali-ẹgho,video tape.,ukpali-ẹgho,"[video, tape, .]","[video, tape, .]"
4081,ukpali-owun,audio tape cassette.,ukpali-owun,"[audio, tape, cassette, .]","[audio, tape, cassette, .]"
4082,we nemi wa gba gbẹ ẹja ni adagba mi,you cannot take fish from my adagba.,we nemi wa gba gbẹ ẹja ni adagba mi,"[you, can, not, take, fish, from, my, adagba, .]","[you, can, not, take, fish, from, my, adagba, .]"


In [17]:
# Some instances of wronglyy concatenated words in the texts

def demo_outcome(row, old, new, operation):
    print(f"Before {operation}: \n", low_norm_data.loc[row, old])
    print(f"After {operation}: \n", low_norm_data.loc[row, new])
    print("\n")


lst = [1752, 1770, 1783, 1812, 1813, 1817, 1821, 1834, 1840, 1844, 1848, 1884]

for ind in lst:
    demo_outcome(ind, "translation_tokens", "tokens_typo_correct", "word splitting")

Before word splitting: 
 ['ibegged', 'but', 'you', 'did', 'not', 'agree', '.', 'antonym', 'kọkọ', '.']
After word splitting: 
 ['i', 'begged', 'but', 'you', 'did', 'not', 'agree', '.', 'antonym', 'kọkọ', '.']


Before word splitting: 
 ['though', 'dying', ',', '‘', 'ọkuọnkuọn', '’', 'is', 'stillgnawing', '.']
After word splitting: 
 ['though', 'dying', ',', '‘', 'ọkuọnkuọn', '’', 'is', 'still', 'gnawing', '.']


Before word splitting: 
 ['iwant', 'to', 'burn', 'my', 'farm', '.']
After word splitting: 
 ['i', 'want', 'to', 'burn', 'my', 'farm', '.']


Before word splitting: 
 ['iwill', 'not', 'do', 'it', 'again', '.']
After word splitting: 
 ['i', 'will', 'not', 'do', 'it', 'again', '.']


Before word splitting: 
 ['iwant', 'to', 'pluck', 'the', 'mango', '.']
After word splitting: 
 ['i', 'want', 'to', 'pluck', 'the', 'mango', '.']


Before word splitting: 
 ['ithink', 'he', 'has', 'veil', 'on', 'his', 'face', '.']
After word splitting: 
 ['i', 'think', 'he', 'has', 'veil', 'on', 'his',

In [18]:
low_norm_data[low_norm_data["translation_tokens"] != low_norm_data["tokens_typo_correct"]]

Unnamed: 0,target,translation,target_normalized,translation_tokens,tokens_typo_correct
1,abalẹmaku,unbreakable pans of allsorts usually aluminum.,abalẹmaku,"[unbreakable, pans, of, allsorts, usually, alu...","[unbreakable, pans, of, all, sorts, usually, a..."
2,aban,a “u” shaped metal used in floating timbers. a...,aban,"[a, “, u, ”, shaped, metal, used, in, floating...","[a, u, shaped, metal, used, in, floating, timb..."
26,abokundo,mixture of salt/fresh water.,abokundo,"[mixture, of, salt/fresh, water, .]","[mixture, of, salt, fresh, water, .]"
28,abokuroli,substandard e.g. as inclothes.,abokuroli,"[substandard, e.g, ., as, inclothes, .]","[substandard, e.g, ., as, in, clothes, .]"
82,aja-kaja,any or alltown/ communities.,aja-kaja,"[any, or, alltown/, communities, .]","[any, or, all, town, communities, .]"
...,...,...,...,...,...
4012,ọlọgua,flagbearer.,ọlọgua,"[flagbearer, .]","[flag, bearer, .]"
4022,ugbo,port – ship,ugbo,"[port, –, ship]","[port, ship]"
4029,abẹtẹ-ero,conference room/hall.,abẹtẹ-ero,"[conference, room/hall, .]","[conference, room, hall, .]"
4061,okpo,antenna/mast.,okpo,"[antenna/mast, .]","[antenna, mast, .]"


After writing the word split function, I came across ["byte-pair encoding"](https://en.wikipedia.org/wiki/Byte_pair_encoding), an encoding scheme that encodes individual words as collections of subtokens. Maybe it natively handles the wrong concatenation problem `word_split` tries to solve, I'm currently unsure.

Here I use heuristics based on observations from inspecting the dataset.
One such observation is many itsekiri instances follow the words/phrases:
 - "also see"
 - "also"
 - "e.g."
 - "antonym"
 The following function strips away text that follows occurences of these phrases.
 It also checks each word in our token list to see if it appears in the Itsekiri lexicon

 These heuristics are obviously imperfect, there is also a good question of whether we really ought to clean out Itsekiri instances. Seeing as we are training an RNN from scratch, it seems these Itsekiri occurences will be little more than noise for the model to overcome

In [19]:
def clean_itsekiri(tokens: list) -> list:
    """Args:
        tokens (list): a list of tokens
    Returns:
        list: list of tokens with Itsekiri words stripped out"""

    new_tokens = []

    prev_token = ""
    for token in tokens: # Iterate through list of tokens per document
        if token == "antonym":
            return new_tokens # Ignores all tokens including and following "antonym"

        if token == "originally":
            return new_tokens

        if token == "e.g":
            return new_tokens # Ignores all tokens including and following "e.g.""

        if token == "see" and prev_token == "also":
            return new_tokens[:-1]    # Ignores all tokens including and following "also see"

        if prev_token == "also" and token not in wordlist:
            return new_tokens[:-1] # Ignores all non-English tokens following also

        new_tokens.append(token)
        prev_token = token

    return new_tokens




In [20]:
# Applying calling clean_itsekiri on the "tokens_typo_correct" column

low_norm_data["clean_trans_tokens"] = low_norm_data["tokens_typo_correct"].apply(clean_itsekiri)

In [21]:
low_norm_data

Unnamed: 0,target,translation,target_normalized,translation_tokens,tokens_typo_correct,clean_trans_tokens
0,abaghari,blood.,abaghari,"[blood, .]","[blood, .]","[blood, .]"
1,abalẹmaku,unbreakable pans of allsorts usually aluminum.,abalẹmaku,"[unbreakable, pans, of, allsorts, usually, alu...","[unbreakable, pans, of, all, sorts, usually, a...","[unbreakable, pans, of, all, sorts, usually, a..."
2,aban,a “u” shaped metal used in floating timbers. a...,aban,"[a, “, u, ”, shaped, metal, used, in, floating...","[a, u, shaped, metal, used, in, floating, timb...","[a, u, shaped, metal, used, in, floating, timb..."
3,abaọnẹje,gossipmonger,abaọnẹje,[gossipmonger],[gossipmonger],[gossipmonger]
4,abata,incompatible. e.g. ‘ma gba abata gua obiti’.,abata,"[incompatible, ., e.g, ., ‘, ma, gba, abata, g...","[incompatible, ., e.g, ., ‘, ma, gba, abata, g...","[incompatible, .]"
...,...,...,...,...,...,...
4079,ẹkpetin-alawari,radio.,ẹkpetin-alawari,"[radio, .]","[radio, .]","[radio, .]"
4080,ukpali-ẹgho,video tape.,ukpali-ẹgho,"[video, tape, .]","[video, tape, .]","[video, tape, .]"
4081,ukpali-owun,audio tape cassette.,ukpali-owun,"[audio, tape, cassette, .]","[audio, tape, cassette, .]","[audio, tape, cassette, .]"
4082,we nemi wa gba gbẹ ẹja ni adagba mi,you cannot take fish from my adagba.,we nemi wa gba gbẹ ẹja ni adagba mi,"[you, can, not, take, fish, from, my, adagba, .]","[you, can, not, take, fish, from, my, adagba, .]","[you, can, not, take, fish, from, my, adagba, .]"


In [22]:
# Demonstrating the changes we have made

ex = [4, 531, 1020, 1443, 1490, 1755, 3588]

for row in ex:
    demo_outcome(row, "tokens_typo_correct", "clean_trans_tokens", "cleaning itsekiri")

Before cleaning itsekiri: 
 ['incompatible', '.', 'e.g', '.', '‘', 'ma', 'gba', 'abata', 'gua', 'obiti', '’', '.']
After cleaning itsekiri: 
 ['incompatible', '.']


Before cleaning itsekiri: 
 ['conversely', '.', 'also', 'see', 'kenịkp', '.']
After cleaning itsekiri: 
 ['conversely', '.']


Before cleaning itsekiri: 
 ['to', 'blow', 'into', 'fire', '.', 'e.g', '.', '‘', 'ma', 'fan', 'una', '’', '.']
After cleaning itsekiri: 
 ['to', 'blow', 'into', 'fire', '.']


Before cleaning itsekiri: 
 ['mosquito', '.', 'originally', 'ọlimọ-urẹn', '.']
After cleaning itsekiri: 
 ['mosquito', '.']


Before cleaning itsekiri: 
 ['molar', '.', 'also', '‘', 'irinkin-ẹ', 'ji', '’', '.']
After cleaning itsekiri: 
 ['molar', '.']


Before cleaning itsekiri: 
 ['guilty', '.', 'antonym', 'jẹre', '.']
After cleaning itsekiri: 
 ['guilty', '.']


Before cleaning itsekiri: 
 ['genuflect.also', 'see', 'yiniudakun', '.']
After cleaning itsekiri: 
 ['genuflect.also', 'see', 'yiniudakun', '.']




In [23]:
# Here we tackle punctuation

# There are several instances in the "target" column of fields like these:
# "Igirigi/i-gi-ri-gi/", "Şişi/şi-şi/,si"
# These are seemingly pronunciation guides, yet they introduce noise to those fields, we will strip them

# Quotation marks like "‘’" abound and bear seemingly little information content, we will strip them

def clean_itskr_punct(text: str) -> str:
    """Cleans punctuation from Itsekiri word fields"""

    text = re.sub(r"’|‘", "", text) # Clean quotation marks
    text = re.sub(r"\/.*", "", text) # Clean every character including and following a slash
    text = re.sub(r",|\.", " ", text) # replace with space
    return text

def clean_eng_tokens(tokens: list) -> str:
    "Cleans punctuation from tokens"
    single_chars = {"i", "u", "a"}
    new_tokens = []
    for token in tokens:
        if len(token) == 1 and token not in single_chars:
            continue
        token = re.sub("[.]", " ", token)
        new_tokens.append(token)
    text = " ".join(new_tokens)
    return text



In [24]:
# Demonstrate cleaning output

low_norm_data["new_target"] = low_norm_data["target"].apply(clean_itskr_punct)

ex = [463, 1248, 1754, 1759, 1767]

for row in ex:
    demo_outcome(row, "target", "new_target", "cleaning itsekiri punctuation")

Before cleaning itsekiri punctuation: 
 bujẹ/bu-jẹ/
After cleaning itsekiri punctuation: 
 bujẹ


Before cleaning itsekiri punctuation: 
 idibi/i-di-bi/
After cleaning itsekiri punctuation: 
 idibi


Before cleaning itsekiri punctuation: 
 ‘o jẹẹ ọwa’
After cleaning itsekiri punctuation: 
 o jẹẹ ọwa


Before cleaning itsekiri punctuation: 
 ‘odo we jẹn’
After cleaning itsekiri punctuation: 
 odo we jẹn


Before cleaning itsekiri punctuation: 
 jikijiki/ji-ki-ji-ki/four
After cleaning itsekiri punctuation: 
 jikijiki




In [25]:
# Clean and join English tokens
low_norm_data["clean_translation"] = low_norm_data['clean_trans_tokens'].apply(clean_eng_tokens)

# Attack all "also" again, those things have no utility
low_norm_data["clean_translation"] = low_norm_data["clean_translation"].apply(lambda x: re.sub(r"also.*", "", x))

In [26]:
# Compare first hundred fields
#for i in range(500, 600):
    #print(low_norm_data.loc[i, "translation"])
    #print(low_norm_data.loc[i, "clean_translation"])
    #print("\n")


In [27]:
# Drop other columns
clean_data = low_norm_data[["new_target", "clean_translation"]]

clean_data

Unnamed: 0,new_target,clean_translation
0,abaghari,blood
1,abalẹmaku,unbreakable pans of all sorts usually aluminum
2,aban,a u shaped metal used in floating timbers
3,abaọnẹje,gossipmonger
4,abata,incompatible
...,...,...
4079,ẹkpetin-alawari,radio
4080,ukpali-ẹgho,video tape
4081,ukpali-owun,audio tape cassette
4082,we nemi wa gba gbẹ ẹja ni adagba mi,you can not take fish from my adagba


In [34]:

# Write cleaned dataset to file with a proper file name and extension
clean_data.to_csv(r"itsekiri-translator\data\clean_itsekiri.csv", index=False)

See the cleaned dataset here: https://docs.google.com/spreadsheets/d/1hEairWjgkNpBdAE1kNnoTkGxcrplgTwXlDt8hembCRU/edit?usp=sharing

In [35]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load cleaned dataset
df = pd.read_csv("itsekiri-translator\data\clean_itsekiri.csv")

# Rename for clarity
df = df.rename(columns={"new_target": "itsekiri", "clean_translation": "english"})

# Drop any remaining rows with NaNs in key columns
df = df.dropna(subset=["english", "itsekiri"])

# Add start/end tokens to English translations
df["english"] = df["english"].apply(lambda x: "<start> " + x.strip() + " <end>")

# Train-test split
train_data, val_data = train_test_split(df, test_size=0.01, random_state=42)

# Tokenizer setup
def tokenize(lang):
    tokenizer = Tokenizer(filters='', lower=True, oov_token="<unk>")
    tokenizer.fit_on_texts(lang)
    tensor = tokenizer.texts_to_sequences(lang)
    tensor = pad_sequences(tensor, padding='post')
    return tensor, tokenizer

# Tokenize both sides
input_tensor, inp_lang_tokenizer = tokenize(train_data["itsekiri"].tolist())
target_tensor, targ_lang_tokenizer = tokenize(train_data["english"].tolist())

# Validation data
val_input_tensor = pad_sequences(inp_lang_tokenizer.texts_to_sequences(val_data["itsekiri"]), padding='post')
val_target_tensor = pad_sequences(targ_lang_tokenizer.texts_to_sequences(val_data["english"]), padding='post')

# Vocabulary sizes
input_vocab_size = len(inp_lang_tokenizer.word_index) + 1
target_vocab_size = len(targ_lang_tokenizer.word_index) + 1

# Parameters
embedding_dim = 256
units = 512
BATCH_SIZE = 64
BUFFER_SIZE = len(input_tensor)
steps_per_epoch = len(input_tensor) // BATCH_SIZE

# Create dataset
train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

val_dataset = tf.data.Dataset.from_tensor_slices((val_input_tensor, val_target_tensor)).batch(BATCH_SIZE)

# Encoder
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

# Decoder
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        x = self.fc(output)
        return x, state

# Instantiate encoder-decoder
encoder = Encoder(input_vocab_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(target_vocab_size, embedding_dim, units, BATCH_SIZE)

# Optimizer and loss
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.not_equal(real, 0)
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

# Training step
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden = decoder(dec_input, dec_hidden)
            loss += loss_function(targ[:, t], predictions[:, 0, :])
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

# Training loop
EPOCHS = 10

for epoch in range(EPOCHS):
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(train_dataset):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

    print(f'Epoch {epoch+1} Loss {total_loss / steps_per_epoch:.4f}')

 


  df = pd.read_csv("itsekiri-translator\data\clean_itsekiri.csv")
  df = pd.read_csv("itsekiri-translator\data\clean_itsekiri.csv")


KeyboardInterrupt: 

In [None]:
def evaluate_sentence(sentence):
    sentence = inp_lang_tokenizer.texts_to_sequences([sentence])
    sentence = pad_sequences(sentence, maxlen=input_tensor.shape[1], padding='post')

    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(tf.convert_to_tensor(sentence), hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang_tokenizer.word_index['<start>']], 0)

    for _ in range(20):  # Max output length
        predictions, dec_hidden = decoder(dec_input, dec_hidden)
        predicted_id = tf.argmax(predictions[0][0]).numpy()
        next_word = targ_lang_tokenizer.index_word.get(predicted_id, '')

        if next_word == '<end>':
            break

        result += next_word + ' '
        dec_input = tf.expand_dims([predicted_id], 0)

    return result.strip()

# Try a few
for i in range(5):
    test_input = val_data.iloc[i]["itsekiri"]
    expected = val_data.iloc[i]["english"]
    predicted = evaluate_sentence(test_input)
    print(f"\n🔹 Input: {test_input}")
    print(f"✅ Expected: {expected}")
    print(f"🤖 Predicted: {predicted}")