## 01 - Evaluation - ChrF
In this notebook, we evaluate the system's Ojibwe-to-English translations against a test set, using the ChrF score and a semantic-similarity score.

In [1]:
# ! pip install -U sentence-transformers

In [2]:
# !pip install nltk

In [3]:
import ojibwe_translation as oj_to_en
import pandas as pd
from sentence_transformers import SentenceTransformer
from nltk.translate.chrf_score import sentence_chrf
import re

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Initialization
# Build the translation environment step by step; each oj_to_en call below takes the
# environment and its return value is stored back into `core_env`.
# `core_env` is the object later cells pass to oj_to_en.oj2en_builder.
data_folder_path = '../app/data/full_dataset/'

# Initialize new core environment object
core_env = oj_to_en.initialize_environment(data_folder_path)
# Load FST parser
# NOTE(review): file_name is presumably resolved relative to data_folder_path - confirm
core_env = oj_to_en.load_foma(environment=core_env, 
                              file_name="foma_bin/ojibwe.att")
# Load templates from csv files
core_env = oj_to_en.load_templates(environment=core_env)

In [5]:
# Translation
# Translate one sample word and print every entry the builder returns, followed by
# that entry's "translation" field.
# The word is stored in `sample_word` rather than `input`, so the built-in `input()`
# is not shadowed for the rest of the kernel session.
sample_word = "gigii-noondawaag"
print(f"Input = {sample_word}")
result = oj_to_en.oj2en_builder(sample_word, environment=core_env)
for item in result:
    print(item)
    print(item["translation"])
    print(item["translation"])

Input = gigii-noondawaag
{'input': 'gigii-noondawaag', 'definition': 'hear h/', 'fst_output': 'PVTense/gii+noondaw+VTA+Ind+Pos+Neu+2SgSubj+3PlProxObj', 'sentence_structure': {'tense': 'past', 'verb': 'noondaw', 'verb_type': 'vta', 'order': 'ind', 'polarity': 'pos', 'mode': 'neu', 'subject': '2sgsubj', 'object': '3plproxobj'}, 'translation': 'you heard them (proximate)'}
you heard them (proximate)


In [6]:
# Read the evaluation test set from CSV and preview the first rows
test_set_file_path = "./data/ojibwe_translation_test_set.csv"
test_set = pd.read_csv(test_set_file_path)
print(f"Number of records = {len(test_set)}")
test_set.head()

Number of records = 214


Unnamed: 0,ojibwe_verb,verb_type,english_translation_raw,english_translation_standardized,lesson_number
0,nimbakade,"1s pf, vai",I am hungry,I am hungry,Lesson 1
1,gibakade,"2s pf, vai",you are hungry,you are hungry,Lesson 1
2,bakade,"3s, vai",s/he is hungry,he/she is hungry,Lesson 1
3,niwii-wiisin,"1s pf, pv tns, vai","I want to eat, I will eat",I want to eat,Lesson 1
4,giwii-wiisin,"2s pf, pv tns, vai","you want to eat, you will eat",you want to eat,Lesson 1


### Semantic similarity 

We will use SentenceTransformer with the LaBSE embedding model to compare the semantic similarity between two sentences (the system translation and the gold translation).

In [7]:
# Load the LaBSE sentence-embedding model; `model` is reused by every similarity computation below
model = SentenceTransformer("LaBSE")

In [8]:
# Toy example used by the next cells: a system output, the gold reference it should match,
# and a deliberately wrong output that should score lower against the gold reference
sys_translation = "I like the taste of something"
gold_translation  = "I am enjoying the taste of things"
incorrect_translation = "We liked the taste of it"

In [9]:
# Embed each toy sentence with the model
v_sys = model.encode(sys_translation)
v_gold = model.encode(gold_translation)
v_incorrect = model.encode(incorrect_translation)

# Compare each candidate embedding against the gold embedding; .item() extracts the single score for formatting
print(f"Sys vs Gold = {model.similarity(v_sys, v_gold).item():.2f}") # should be high
print(f"Incorrect vs Gold = {model.similarity(v_incorrect, v_gold).item():.2f}") # should be lower


Sys vs Gold = 0.83
Incorrect vs Gold = 0.63


#### Chrf score
We can also try ChrF score for the translations

In [10]:
# ChrF on the same toy sentences. The gold translation is always the first argument and the
# candidate the second; here the sentences are passed as whitespace-split token lists.
print(f"ChrF score: Gold vs Gold = {sentence_chrf(gold_translation.split(), gold_translation.split()):.2f}") # should be 1
print(f"ChrF score: Sys vs Gold = {sentence_chrf(gold_translation.split(), sys_translation.split()):.2f}") # should be high
print(f"ChrF score: Incorrect vs Gold = {sentence_chrf(gold_translation.split(), incorrect_translation.split()):.2f}") # should be lower


ChrF score: Gold vs Gold = 1.00
ChrF score: Sys vs Gold = 0.45
ChrF score: Incorrect vs Gold = 0.34


#### We will move forward using ChrF score, which is popular in Machine Translation

In [11]:
def sentence_cleaning(sentence:str) -> str:
    """Normalize a translation before it is scored.

    Steps, in order:
      1. lowercase the text;
      2. expand the abbreviations "s/he" -> "he/she" and "h/" -> "him/her/it";
      3. drop parenthesised annotations such as "(proximate)" or "(animate)",
         wherever they occur in the sentence;
      4. collapse runs of whitespace, then strip spaces and periods from both ends.

    Args:
        sentence: raw translation text.

    Returns:
        The cleaned sentence.
    """
    result = sentence.lower().replace("s/he", "he/she")
    # Expand only an "h/" that starts a word; a plain str.replace would also rewrite
    # the tail of words such as "with/" or "each/".
    result = re.sub(r"\bh/", "him/her/it", result)
    # \s* instead of a mandatory leading space, so an annotation at the very start of
    # the sentence (or glued to the previous word) is removed as well.
    result = re.sub(r"\s*\([^)]*\)", "", result)
    # Removing an annotation can leave doubled or leading whitespace behind.
    result = re.sub(r"\s+", " ", result)
    return result.strip(" .") # remove leading/trailing white spaces and periods

assert sentence_cleaning("a") == "a"
assert sentence_cleaning("a (bc)") == "a"
assert sentence_cleaning("he/she (proximate) wants to go home") == "he/she wants to go home"
assert sentence_cleaning("s/he is thirsty") == "he/she is thirsty"
assert sentence_cleaning("(proximate) he/she leaves") == "he/she leaves"
assert sentence_cleaning("hear h/") == "hear him/her/it"
assert sentence_cleaning("with/without h/") == "with/without him/her/it"

In [12]:
def eval_score_function(gold_translation:str, sys_translation:str) -> float:
    """ChrF score of a system translation measured against the gold translation.

    Both sentences are passed through `sentence_cleaning` first; the cleaned gold
    sentence is used as the reference and the cleaned system sentence as the hypothesis.
    """
    reference_text = sentence_cleaning(gold_translation)
    hypothesis_text = sentence_cleaning(sys_translation)
    ngram_orders = {"min_len": 1, "max_len": 3}  # from unigrams up to 3-grams
    return sentence_chrf(reference=reference_text, hypothesis=hypothesis_text, **ngram_orders)

# Sanity checks: casing must not affect the score, and a different verb form must lower it
assert eval_score_function("I go somewhere", "i go somewhere") == 1
assert eval_score_function("I go", "I went") < 1


In [13]:
def get_system_translations(ojibwe_word:str) -> list[str]:
    """Collect the non-empty translations the system produces for one Ojibwe word."""
    candidates = []
    for analysis in oj_to_en.oj2en_builder(ojibwe_word, environment=core_env):
        # Skip entries whose "translation" key is missing or holds an empty string
        if analysis.get("translation", "") == "":
            continue
        candidates.append(analysis["translation"])
    return candidates

get_system_translations("nimaajaa")

['I leave', 'I go off', 'I depart']

In [14]:
def select_best_translation(sys_translations:list[str], gold_translation:str, score_function:callable) -> tuple:
    """Pick the system translation that scores highest against the gold translation.

    Args:
        sys_translations: candidate translations produced by the system.
        gold_translation: reference translation the candidates are compared to.
        score_function: called as score_function(gold_translation, candidate) and
            expected to return a comparable score (BLEU/chrF/etc); higher is better.

    Returns:
        (best_translation, best_score). On ties the earliest candidate wins.
        When `sys_translations` is empty the result is ("", 0).
    """
    best_translation = ""
    # None means "nothing scored yet". Seeding with 0 would silently discard every
    # candidate whose score is 0 or negative and report an empty translation instead.
    best_score = None

    for translation in sys_translations:
        score = score_function(gold_translation, translation)
        if best_score is None or score > best_score:
            best_score = score
            best_translation = translation

    if best_score is None:
        # No candidates at all: keep the original ("", 0) result
        return ("", 0)
    return (best_translation, best_score)

# Demo: fetch the system translations of "maajaa" and keep the one closest to the gold translation
gold_translation = "he/she leaves"
sys_translations = get_system_translations("maajaa")
select_best_translation(sys_translations=sys_translations, gold_translation=gold_translation, score_function=eval_score_function)

('he/she (proximate) leaves', 1.0)

### Batch Processing

Now we can batch process the test set

In [15]:
# Score every test-set row with ChrF and remember the best-matching system translation.
# A row that raises is recorded as ("", 0) so both lists stay aligned with the test set.
best_translation_list = []
best_score_list = []

max_row = len(test_set) # use this for full dataset

for i, (_, data_row) in enumerate(test_set.iloc[:max_row].iterrows()):
    ojibwe_word = data_row["ojibwe_verb"]
    gold_translation = data_row["english_translation_standardized"]

    print(f"Processing item {i+1} / {max_row}, {(i+1)*100/max_row:.0f}% ")
    print(f"Ojibwe verb = {ojibwe_word}. Gold translation = {gold_translation}")
    try:
        sys_translations = get_system_translations(ojibwe_word)
        best_translation, best_score = select_best_translation(
            sys_translations=sys_translations,
            gold_translation=gold_translation,
            score_function=eval_score_function,
        )

        print(f"Best matched translation = {best_translation}. Score = {best_score:.2f}")

        best_translation_list.append(best_translation)
        best_score_list.append(best_score)
    except Exception as e:
        print(f"Error {e}")
        best_translation_list.append("")
        best_score_list.append(0)

    print("-"*10)
    

Processing item 1 / 214, 0% 
Ojibwe verb = nimbakade. Gold translation = I am hungry
Best matched translation = I am hungry. Score = 1.00
----------
Processing item 2 / 214, 1% 
Ojibwe verb = gibakade. Gold translation = you are hungry
Best matched translation = you are hungry. Score = 1.00
----------
Processing item 3 / 214, 1% 
Ojibwe verb = bakade. Gold translation = he/she is hungry
Best matched translation = he/she (proximate) is hungry. Score = 1.00
----------
Processing item 4 / 214, 2% 
Ojibwe verb = niwii-wiisin. Gold translation = I want to eat
Best matched translation = I want to eat. Score = 1.00
----------
Processing item 5 / 214, 2% 
Ojibwe verb = giwii-wiisin. Gold translation = you want to eat
Best matched translation = you want to eat. Score = 1.00
----------
Processing item 6 / 214, 3% 
Ojibwe verb = ningiiwe. Gold translation = I am going home
Best matched translation = I go home. Score = 0.40
----------
Processing item 7 / 214, 3% 
Ojibwe verb = gigiiwe. Gold transl

In [16]:
# Print the size of the test set and of both result lists, one per line, for comparison
for collection in (test_set, best_translation_list, best_score_list):
    print(len(collection))

214
214
214


In [17]:
# Attach the ChrF results to the test set as two new columns (this modifies `test_set` in place)
test_set["best_system_translation"] = best_translation_list
test_set["chrf_score"] = best_score_list
test_set.head()

Unnamed: 0,ojibwe_verb,verb_type,english_translation_raw,english_translation_standardized,lesson_number,best_system_translation,chrf_score
0,nimbakade,"1s pf, vai",I am hungry,I am hungry,Lesson 1,I am hungry,1.0
1,gibakade,"2s pf, vai",you are hungry,you are hungry,Lesson 1,you are hungry,1.0
2,bakade,"3s, vai",s/he is hungry,he/she is hungry,Lesson 1,he/she (proximate) is hungry,1.0
3,niwii-wiisin,"1s pf, pv tns, vai","I want to eat, I will eat",I want to eat,Lesson 1,I want to eat,1.0
4,giwii-wiisin,"2s pf, pv tns, vai","you want to eat, you will eat",you want to eat,Lesson 1,you want to eat,1.0


In [18]:
print("Score statistics")
# Select the rows with a non-zero score once and reuse the result.
# Keeping the query string outside the f-string also avoids nesting double quotes inside a
# double-quoted f-string, which is a SyntaxError on Python versions before 3.12.
# Rows whose processing raised an error were recorded with score 0 and are excluded here.
nonzero_chrf_rows = test_set.query("chrf_score > 0")
print(f"Number of non-zero scores = {len(nonzero_chrf_rows)} / {len(test_set)}")
print("Out of successful-processed translations, the statistics are:")
nonzero_chrf_rows["chrf_score"].describe()

Score statistics
Number of non-zero scores = 200 / 214
Out of successful-processed translations, the statistics are:


count    200.000000
mean       0.826347
std        0.232857
min        0.177901
25%        0.642991
50%        1.000000
75%        1.000000
max        1.000000
Name: chrf_score, dtype: float64

We achieved a mean ChrF score of `~0.83` over the 200 verbs (out of 214) that received a non-zero score.

### Semantic meaning scoring
We can also assess semantic meaning similarity between gold translation and system translations

In [19]:
def semantic_eval_score_function(gold_translation:str, sys_translation:str) -> float:
    """Semantic-similarity score of a system translation against the gold translation.

    Both sentences are passed through `sentence_cleaning`, embedded with the
    notebook-level `model`, and compared with `model.similarity`.

    Args:
        gold_translation: reference translation.
        sys_translation: translation produced by the system.

    Returns:
        The similarity between the two embeddings as a Python float.
    """
    gold_translation = sentence_cleaning(gold_translation)
    sys_translation = sentence_cleaning(sys_translation)
    
    gold_embeddings = model.encode(gold_translation)
    sys_embeddings = model.encode(sys_translation)
    
    return model.similarity(gold_embeddings, sys_embeddings).item()

# Sanity checks on the semantic scorer itself (previously these exercised eval_score_function).
# A small tolerance is used because the similarity is a floating-point computation.
assert abs(semantic_eval_score_function("I go somewhere", "i go somewhere") - 1) < 1e-4
assert semantic_eval_score_function("I go", "I went") < 1 - 1e-4


In [20]:
# Score every test-set row with the semantic-similarity function and remember the best match.
# A row that raises is recorded as ("", 0) so both lists stay aligned with the test set.
semantic_best_translation_list = []
semantic_best_score_list = []

max_row = len(test_set) # use this for full dataset

for i, (_, data_row) in enumerate(test_set.iloc[:max_row].iterrows()):
    ojibwe_word = data_row["ojibwe_verb"]
    gold_translation = data_row["english_translation_standardized"]

    print(f"Processing item {i+1} / {max_row}, {(i+1)*100/max_row:.0f}% ")
    print(f"Ojibwe verb = {ojibwe_word}. Gold translation = {gold_translation}")
    try:
        sys_translations = get_system_translations(ojibwe_word)
        best_translation, best_score = select_best_translation(
            sys_translations=sys_translations,
            gold_translation=gold_translation,
            score_function=semantic_eval_score_function,
        )

        print(f"Best matched translation = {best_translation}. Score = {best_score:.2f}")

        semantic_best_translation_list.append(best_translation)
        semantic_best_score_list.append(best_score)
    except Exception as e:
        print(f"Error {e}")
        semantic_best_translation_list.append("")
        semantic_best_score_list.append(0)

    print("-"*10)
    

Processing item 1 / 214, 0% 
Ojibwe verb = nimbakade. Gold translation = I am hungry
Best matched translation = I am hungry. Score = 1.00
----------
Processing item 2 / 214, 1% 
Ojibwe verb = gibakade. Gold translation = you are hungry
Best matched translation = you are hungry. Score = 1.00
----------
Processing item 3 / 214, 1% 
Ojibwe verb = bakade. Gold translation = he/she is hungry
Best matched translation = he/she (proximate) is hungry. Score = 1.00
----------
Processing item 4 / 214, 2% 
Ojibwe verb = niwii-wiisin. Gold translation = I want to eat
Best matched translation = I want to eat. Score = 1.00
----------
Processing item 5 / 214, 2% 
Ojibwe verb = giwii-wiisin. Gold translation = you want to eat
Best matched translation = you want to eat. Score = 1.00
----------
Processing item 6 / 214, 3% 
Ojibwe verb = ningiiwe. Gold translation = I am going home
Best matched translation = I go home. Score = 0.99
----------
Processing item 7 / 214, 3% 
Ojibwe verb = gigiiwe. Gold transl

In [21]:
# create output dataframe to write as csv
# First attach the semantic results to `test_set` (modified in place), then keep only the
# columns needed in the report: the verb, both gold translations, and the best match and
# score for each of the two metrics.
test_set["best_semantic_translation"] = semantic_best_translation_list
test_set["semantic_similarity_score"] = semantic_best_score_list
output_df = test_set[["ojibwe_verb", "english_translation_raw", "english_translation_standardized", 
                      "best_system_translation", "chrf_score",
                      "best_semantic_translation", "semantic_similarity_score"
                      ]]
output_df.head()

Unnamed: 0,ojibwe_verb,english_translation_raw,english_translation_standardized,best_system_translation,chrf_score,best_semantic_translation,semantic_similarity_score
0,nimbakade,I am hungry,I am hungry,I am hungry,1.0,I am hungry,1.0
1,gibakade,you are hungry,you are hungry,you are hungry,1.0,you are hungry,1.0
2,bakade,s/he is hungry,he/she is hungry,he/she (proximate) is hungry,1.0,he/she (proximate) is hungry,1.0
3,niwii-wiisin,"I want to eat, I will eat",I want to eat,I want to eat,1.0,I want to eat,1.0
4,giwii-wiisin,"you want to eat, you will eat",you want to eat,you want to eat,1.0,you want to eat,1.0


In [22]:
print("Semantic score statistics")
# Select the rows with a non-zero score once and reuse the result.
# Keeping the query string outside the f-string also avoids nesting double quotes inside a
# double-quoted f-string, which is a SyntaxError on Python versions before 3.12.
# Rows whose processing raised an error were recorded with score 0 and are excluded here.
nonzero_semantic_rows = output_df.query("semantic_similarity_score > 0")
print(f"Number of non-zero scores = {len(nonzero_semantic_rows)} / {len(output_df)}")
print("Out of successful-processed translations, the statistics are:")
nonzero_semantic_rows["semantic_similarity_score"].describe()

Semantic score statistics
Number of non-zero scores = 200 / 214
Out of successful-processed translations, the statistics are:


count    200.000000
mean       0.932813
std        0.107401
min        0.550725
25%        0.883415
50%        1.000000
75%        1.000000
max        1.000000
Name: semantic_similarity_score, dtype: float64

We achieved a mean score of `~0.93` for semantic meaning similarity

### Write to output file

In [23]:
# Save the per-verb evaluation results as CSV, without writing the DataFrame index
output_filename = "././data/ojibwe_translation_test_set_result.csv"
print(f"Writing to output file = {output_filename}")
output_df.to_csv(output_filename, index=False)
print("Completed")

Writing to output file = ././data/ojibwe_translation_test_set_result.csv
Completed
