In [3]:
# Calculate scores for misaligned texts
import re
import os
import json

def detect_language(text):
    """
    Reports which script the letters of the input text belong to.
    It is used to tell greek text apart from greeklish (latin script), in order to skip unnecessary conversions.

    Latin takes precedence: a single latin letter is enough for the text to be reported as "latin",
    no matter how many greek letters surround it.

    Args:
        text (str): The input text

    Returns:
        script (str): "latin" if the text has at least one basic latin letter (A-Z, a-z),
                      "greek" if it has greek letters but no latin ones,
                      "unknown" if it has neither
    """
    has_greek_letter = False

    for char in text:
        # Digits, punctuation and whitespace carry no script information
        if not char.isalpha():
            continue

        # One latin letter settles the answer, so there is no need to look further
        if 'A' <= char <= 'Z' or 'a' <= char <= 'z':
            return "latin"

        # "Greek and Coptic" block or "Greek Extended" block
        if '\u0370' <= char <= '\u03FF' or '\u1F00' <= char <= '\u1FFF':
            has_greek_letter = True

    return "greek" if has_greek_letter else "unknown"


    # script = "greek" if greek_count >= english_count else "latin"
    # return script


def compare_bare_words(word1, word2):
    """
    Compares two words after lowercasing them and dropping everything except the basic latin letters.

    Each word is reduced to its runs of a-z letters (joined by a single space), so punctuation,
    digits and non-latin characters never influence the result.

    Args:
        word1 (str): The first word
        word2 (str): The second word

    Returns:
        bool: True if both words reduce to the same bare form
    """
    bare_forms = []
    for candidate in (word1, word2):
        letter_runs = re.findall("[a-zA-Z]+", candidate.lower())
        bare_forms.append(" ".join(letter_runs))

    first_form, second_form = bare_forms
    return first_form == second_form

def calculate_scores_misaligned(original_text, predicted_text, gt_indices):
    """
    Scores a prediction whose number of tokens differs from the original text.

    Since the two texts cannot be compared position by position, every ground truth word is looked up
    anywhere in the prediction. Words are compared on their basic latin letters only, case-insensitively,
    and must match whole tokens: "c" is NOT considered found just because the prediction contains "click".

    Args:
        original_text (str): The input text; it is tokenised with split(" ")
        predicted_text (str): The text produced by the model
        gt_indices (list): Positions, within original_text.split(" "), of the words that should be kept in the latin script

    Returns:
        TP (int): Ground truth words found in the prediction
        FP (int): Latin words of the prediction that are not ground truth words
        FN (int): Ground truth words missing from the prediction
        FP_words (list): The false positive words (letters only, lowercase)
        FN_words (list): The false negative words (lowercase, punctuation kept)
    """
    TP = 0
    FP = 0
    FN = 0

    FP_words = []
    FN_words = []

    predicted_text = predicted_text.lower()

    # These values do not depend on the ground truth word, so they are computed only once
    original_words = original_text.split(" ")
    stripped_predicted_text = " ".join(re.findall("[a-zA-Z]+", predicted_text))
    # Surrounding spaces turn the substring test below into a whole-token test
    padded_predicted_text = f" {stripped_predicted_text} "

    original_english_words = [original_words[index].lower() for index in gt_indices]
    stripped_original_english_words = [" ".join(re.findall("[a-zA-Z]+", word)) for word in original_english_words]

    for word, stripped_word in zip(original_english_words, stripped_original_english_words):
        # A ground truth word without any latin letter has nothing to look for and keeps counting as found
        if not stripped_word or f" {stripped_word} " in padded_predicted_text:
            TP += 1
        # If the word is not in the predicted text, it is a false negative
        else:
            print("original text: ", original_text)
            print("predicted text: ", predicted_text)
            print("False negative: ", stripped_word)
            FN += 1
            FN_words.append(word)

    # Check for false positives
    expected_words = set(stripped_original_english_words)
    for word in predicted_text.split(" "):
        # only keep the letters (predicted_text is already lowercase)
        stripped_word = " ".join(re.findall("[a-zA-Z]+", word))
        if detect_language(stripped_word) == "latin" and stripped_word not in expected_words:
            print("original text: ", original_text)
            print("predicted text: ", predicted_text)
            FP += 1
            FP_words.append(stripped_word)
            print("False positive: ", word)

    return TP, FP, FN, FP_words, FN_words


def calculate_scores_aligned(original_text, predicted_text, gt_indices):
    """
    Scores a prediction that has exactly as many tokens as the original text.

    Both texts are tokenised with split(" "), so each ground truth word is compared with the
    predicted word found at the same position.

    Args:
        original_text (str): The input text
        predicted_text (str): The text produced by the model
        gt_indices (list): Positions of the words that should be kept in the latin script

    Returns:
        TP (int): Ground truth words that were kept by the prediction
        FP (int): Latin words of the prediction that are not ground truth words
        FN (int): Ground truth words that were changed by the prediction
        FP_words (list): The false positive words (letters only, lowercase)
        FN_words (list): The false negative words, exactly as written in the original text
    """
    source_tokens = original_text.split(" ")
    output_tokens = predicted_text.split(" ")

    TP, FN = 0, 0
    FN_words = []

    for position in gt_indices:
        expected, produced = source_tokens[position], output_tokens[position]
        if not compare_bare_words(expected, produced):
            print("original text: ", original_text)
            print("predicted text: ", predicted_text)
            print("False negative: ", expected, ' -> ', produced)
            FN += 1
            FN_words.append(expected)
            continue
        TP += 1

    # Bare (letters only, lowercase) form of every ground truth word
    reference_words = [
        " ".join(re.findall("[a-zA-Z]+", source_tokens[position].lower()))
        for position in gt_indices
    ]

    # Every latin word of the prediction that is not a ground truth word is a false positive
    FP_words = []
    for token in output_tokens:
        letters_only = " ".join(re.findall("[a-zA-Z]+", token))
        if detect_language(letters_only) != "latin":
            continue

        lowered = letters_only.lower()
        if lowered in reference_words:
            continue

        print("original text: ", original_text)
        print("predicted text: ", predicted_text)
        FP_words.append(lowered)
        print("False positive: ", token)

    FP = len(FP_words)

    return TP, FP, FN, FP_words, FN_words

In [9]:
# original_text = {
#     "text" : "prepei na kanoume adjust sta kainouria guidelines",
#     "gt_indices": [3, 6]
# }

# predicted_text = "πρεπει να κανουμε adjust στα kainourgia guidelines"

# TP, FP, FN, FP_words, FN_words = calculate_scores_misaligned(original_text["text"], predicted_text, original_text["gt_indices"])
# print("Recall: ", TP / (TP + FN))
# print("Precision: ", TP / (TP + FP))

In [4]:

def benchmark_llm(path="LLMs/LLM_data/llama-3.1-70b-versatile_0.0_data/"):
    """
    Scores every annotated sample stored under `path` and aggregates precision and recall.

    Every JSON file of the directory must contain a list of annotations with the keys
    "greeklish" (the input text), "greek" (the model output) and "gt_indices" (the positions of
    the words that should be kept in the latin script). Samples whose input and output have the
    same number of space-separated tokens are scored position by position, the rest are scored
    with the misaligned strategy.

    Micro averages are computed from the TP / FP / FN counts of ALL the samples.
    Macro averages are the mean of the per-sample scores; samples whose precision or recall is
    undefined (zero denominator) are left out of the macro averages only.

    Args:
        path (str): The directory with the JSON files (a trailing separator is optional)

    Returns:
        micro_average_precision (float), micro_average_recall (float),
        macro_average_precision (float), macro_average_recall (float):
            an average that cannot be computed (zero denominator) is reported as 0.0
    """
    llama_data = os.listdir(path)
    print(llama_data)

    precisions = []
    recalls = []

    TP_all = 0
    FP_all = 0
    FN_all = 0

    FP_words_all = []
    FN_words_all = []

    aligned = 0
    misaligned = 0

    for forum in llama_data:
        # Skip directory entries that are not JSON files (e.g. notebook checkpoints)
        if not forum.endswith(".json"):
            continue

        # The files contain greek text, so the encoding must not depend on the platform default
        with open(os.path.join(path, forum), "r", encoding="utf-8") as f:
            forum_data = json.load(f)

        for annotation in forum_data:
            original_text = annotation["greeklish"]
            gt_indices = annotation["gt_indices"]
            predicted_text = annotation["greek"]

            # The text is aligned when both sides have the same number of tokens
            if len(original_text.split(" ")) == len(predicted_text.split(" ")):
                aligned += 1
                TP, FP, FN, FP_words, FN_words = calculate_scores_aligned(original_text, predicted_text, gt_indices)
            else:
                misaligned += 1
                TP, FP, FN, FP_words, FN_words = calculate_scores_misaligned(original_text, predicted_text, gt_indices)

            # The global counts include every sample: dropping the samples without a per-sample
            # score would hide their misses / false alarms and inflate the micro averages
            TP_all += TP
            FP_all += FP
            FN_all += FN

            FP_words_all.extend(FP_words)
            FN_words_all.extend(FN_words)

            # The per-sample scores are undefined if a denominator is 0, so the sample
            # does not take part in the macro averages
            if TP + FN == 0 or TP + FP == 0:
                continue

            recalls.append(TP / (TP + FN))
            precisions.append(TP / (TP + FP))

    micro_average_precision = TP_all / (TP_all + FP_all) if TP_all + FP_all else 0.0
    micro_average_recall = TP_all / (TP_all + FN_all) if TP_all + FN_all else 0.0

    macro_average_precision = sum(precisions) / len(precisions) if precisions else 0.0
    macro_average_recall = sum(recalls) / len(recalls) if recalls else 0.0

    print("FP_words: ", FP_words_all)
    print("FN_words: ", FN_words_all)

    print("Aligned: ", aligned)
    print("Misaligned: ", misaligned)

    return micro_average_precision, micro_average_recall, macro_average_precision, macro_average_recall


In [8]:
    # Score the Llama 3.1 70B outputs
    (
        micro_average_precision,
        micro_average_recall,
        macro_average_precision,
        macro_average_recall,
    ) = benchmark_llm("LLMs/LLM_data/llama-3.1-70b-versatile_0.0_data/")

    # Macro averages: mean of the per-sample scores
    print(f"Average macro precision: {macro_average_precision}")
    print(f"Average macro recall: {macro_average_recall}")

    # Micro averages: computed from the accumulated counts
    print(f"Average micro precision: {micro_average_precision}")
    print(f"Average micro recall: {micro_average_recall}")

['Προγραμματισμός_sample.json', 'Hardware_sample.json', 'Gadgets_sample.json', 'Λειτουργικά Συστήματα_sample.json', 'Ειδήσεις_sample.json', 'Ψυχαγωγία_sample.json', 'Software_sample.json', 'Διαδίκτυο_sample.json']
original text:  ena poly aplo pou mou ir8e ayti ti stigmi einai na to kaneis olo ena buton kai mesa na kaneis ta animation pou pi8anon na 8es.... apla kai meta ka8orizeis to hot shmeio tou button
predicted text:  ένα πολύ απλό που μου ήρθε αυτή τη στιγμή είναι να το κάνεις όλο ένα button και μέσα να κάνεις τα animation που πιθανόν να θες.... απλά και μετά καθορίζεις το hot σημείο του button
False negative:  buton  ->  button
original text:  Mia xara ide einai... eida demos einai polu kalh fash !
predicted text:  Μια χαρά ιδεί είναι... είδα δемо είναι πολύ καλή φάση !
False negative:  demos  ->  δемо
original text:  basika 0lew me to pu mpenei se mia selida na petaei to pop-p, xwris kapoios na prepei na pathse to click me, alla apla 0a balw to scriptaki gia automatic fwd, opot

In [7]:
# Score the GPT-4o outputs
(
    micro_average_precision,
    micro_average_recall,
    macro_average_precision,
    macro_average_recall,
) = benchmark_llm("LLMs/LLM_data/gpt-4o_0.0_data/")

# Macro averages: mean of the per-sample scores
print(f"Average macro precision: {macro_average_precision}")
print(f"Average macro recall: {macro_average_recall}")

# Micro averages: computed from the accumulated counts
print(f"Average micro precision: {micro_average_precision}")
print(f"Average micro recall: {micro_average_recall}")

['Προγραμματισμός_sample.json', 'Hardware_sample.json', 'Gadgets_sample.json', 'Λειτουργικά Συστήματα_sample.json', 'Ειδήσεις_sample.json', 'Ψυχαγωγία_sample.json', 'Software_sample.json', 'Διαδίκτυο_sample.json']
original text:  tha ithela kana site na katebaso e books gia borland c
predicted text:  θα ηθελα κανα site να κατεβασω e books για borland c
False positive:  e
original text:  ena poly aplo pou mou ir8e ayti ti stigmi einai na to kaneis olo ena buton kai mesa na kaneis ta animation pou pi8anon na 8es.... apla kai meta ka8orizeis to hot shmeio tou button
predicted text:  ένα πολύ απλό που μου ήρθε αυτή τη στιγμή είναι να το κάνεις όλο ένα button και μέσα να κάνεις τα animation που πιθανόν να θες.... απλά και μετά καθορίζεις το hot σημείο του button
False negative:  buton  ->  button
original text:  Exo katevasei Siemens Mobility ToolKit (SMTK) apo Auto den kanei?
predicted text:  Έχω κατεβάσει Siemens Mobility ToolKit (SMTK) από Αυτό δεν κάνει?
False negative:  Auto  ->  Αυτό


In [5]:
# Score the GPT-4o-mini outputs
(
    micro_average_precision,
    micro_average_recall,
    macro_average_precision,
    macro_average_recall,
) = benchmark_llm("LLMs/LLM_data/gpt-4o-mini_0.0_data/")

# Macro averages: mean of the per-sample scores (two decimals)
print(f"Average macro precision: {macro_average_precision:.2f}")
print(f"Average macro recall: {macro_average_recall:.2f}")

# Micro averages: computed from the accumulated counts (two decimals)
print(f"Average micro precision: {micro_average_precision:.2f}")
print(f"Average micro recall: {micro_average_recall:.2f}")

['Προγραμματισμός_sample.json', 'Hardware_sample.json', 'Gadgets_sample.json', 'Λειτουργικά Συστήματα_sample.json', 'Ειδήσεις_sample.json', 'Ψυχαγωγία_sample.json', 'Software_sample.json', 'Διαδίκτυο_sample.json']
original text:  tha ithela kana site na katebaso e books gia borland c
predicted text:  θα ηθελα κανα site να κατεβασω e books για borland c
False positive:  e
original text:  ena poly aplo pou mou ir8e ayti ti stigmi einai na to kaneis olo ena buton kai mesa na kaneis ta animation pou pi8anon na 8es.... apla kai meta ka8orizeis to hot shmeio tou button
predicted text:  ένα πολύ απλό που μου ήρθε αυτή τη στιγμή είναι να το κάνεις όλο ένα μπουτόν και μέσα να κάνεις τα animation που πιθανόν να θες.... απλά και μετά καθορίζεις το hot σημείο του button
False negative:  buton  ->  μπουτόν
original text:  Exo katevasei Siemens Mobility ToolKit (SMTK) apo Auto den kanei?
predicted text:  Έχω κατεβάσει Siemens Mobility ToolKit (SMTK) από Αυτό δεν κάνει?
False negative:  Auto  ->  Αυτ

In [6]:
# Score the outputs stored in the allgreek2me data directory
(
    micro_average_precision,
    micro_average_recall,
    macro_average_precision,
    macro_average_recall,
) = benchmark_llm("LLMs/LLM_data/allgreek2me_data/")

# Macro averages: mean of the per-sample scores (two decimals)
print(f"Average macro precision: {macro_average_precision:.2f}")
print(f"Average macro recall: {macro_average_recall:.2f}")

# Micro averages: computed from the accumulated counts (two decimals)
print(f"Average micro precision: {micro_average_precision:.2f}")
print(f"Average micro recall: {micro_average_recall:.2f}")

['Προγραμματισμός_sample.json', 'Hardware_sample.json', 'Gadgets_sample.json', 'Λειτουργικά Συστήματα_sample.json', 'Ειδήσεις_sample.json', 'Ψυχαγωγία_sample.json', 'Software_sample.json', 'Διαδίκτυο_sample.json']
original text:  tha ithela kana site na katebaso e books gia borland c
predicted text:  tha ithela kana site na katebaso e books gia borland c
False positive:  tha
original text:  tha ithela kana site na katebaso e books gia borland c
predicted text:  tha ithela kana site na katebaso e books gia borland c
False positive:  ithela
original text:  tha ithela kana site na katebaso e books gia borland c
predicted text:  tha ithela kana site na katebaso e books gia borland c
False positive:  kana
original text:  tha ithela kana site na katebaso e books gia borland c
predicted text:  tha ithela kana site na katebaso e books gia borland c
False positive:  na
original text:  tha ithela kana site na katebaso e books gia borland c
predicted text:  tha ithela kana site na katebaso e book