In [1]:
# Calculate scores for misaligned texts
import re
import os
import json

def detect_language(text):
    """
    Decide whether a text is predominantly written in the Greek or the Latin script.

    Only alphabetic characters are considered. A character counts as Greek when it
    falls in U+0370-U+03FF or U+1F00-U+1FFF, and as Latin when it is an ASCII letter
    (A-Z / a-z). Alphabetic characters from any other script are ignored.

    Args:
        text (str): The input text

    Returns:
        script (str): "greek" or "latin" for the dominant script (ties resolve to
            "greek"), or "unknown" when the text has neither Greek nor ASCII letters
    """
    greek_total = 0
    latin_total = 0

    for ch in text:
        # Digits, punctuation, whitespace and symbols never influence the decision
        if not ch.isalpha():
            continue

        if '\u0370' <= ch <= '\u03FF' or '\u1F00' <= ch <= '\u1FFF':
            greek_total += 1
        elif 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
            latin_total += 1

    # Nothing to base a decision on
    if greek_total == 0 and latin_total == 0:
        return "unknown"

    if greek_total >= latin_total:
        return "greek"
    return "latin"

def calculate_scores_misaligned(original_text, predicted_text, gt_indices):
    """
    Counts true positives, false positives and false negatives for a prediction whose
    tokens are not guaranteed to line up one-to-one with the tokens of the original text.

    The ground-truth words are the space-separated tokens of `original_text` at the
    positions listed in `gt_indices`. All comparisons are done on lowercased text.

    Args:
        original_text (str): The original text; it is tokenized with split(" ")
        predicted_text (str): The predicted text
        gt_indices (list[int]): Positions (in the split original text) of the ground-truth words

    Returns:
        TP (int): Ground-truth words that occur as a substring of the predicted text
        FP (int): Predicted tokens flagged as latin that are not among the ground-truth words
        FN (int): Ground-truth words that do not occur in the predicted text
        FP_words (list[str]): The tokens counted in FP, in order of appearance
        FN_words (list[str]): The words counted in FN, in the order of `gt_indices`

    Raises:
        IndexError: If an index in `gt_indices` is outside the split original text
    """
    # Tokenize the original text once instead of once per ground-truth index
    original_tokens = original_text.split(" ")
    original_english_words = [original_tokens[index].lower() for index in gt_indices]

    predicted_text = predicted_text.lower()

    TP = 0
    FN_words = []
    for word in original_english_words:
        # NOTE(review): this is a substring test on the whole predicted text, so a short
        # ground-truth word also matches inside a longer predicted token - confirm this
        # leniency is intended
        if word in predicted_text:
            TP += 1
        else:
            # If the word is not in the predicted text, it is a false negative
            FN_words.append(word)
    FN = len(FN_words)

    # Check for false positives
    ground_truth_lookup = set(original_english_words)
    FP_words = []
    for word in predicted_text.split(" "):
        # Only the ASCII letters of the token are passed to the script detection, so a
        # token is flagged as latin as soon as it contains at least one ASCII letter
        stripped_word = " ".join(re.findall("[a-zA-Z]+", word))
        if detect_language(stripped_word) != "latin":
            continue
        # NOTE(review): the raw token (punctuation included) is compared, so a token that
        # differs from a ground-truth word only in punctuation counts as a false positive
        if word not in ground_truth_lookup:
            FP_words.append(word)
    FP = len(FP_words)

    return TP, FP, FN, FP_words, FN_words

In [9]:
# original_text = {
#     "text" : "prepei na kanoume adjust sta kainouria guidelines",
#     "gt_indices": [3, 6]
# }

# predicted_text = "πρεπει να κανουμε adjust στα kainourgia guidelines"

# TP, FP, FN, FP_words, FN_words = calculate_scores_misaligned(original_text["text"], predicted_text, original_text["gt_indices"])
# print("Recall: ", TP / (TP + FN))
# print("Precision: ", TP / (TP + FP))

In [10]:
# Evaluate LLama

Forum:  Προγραμματισμός_sample.json
Forum:  Hardware_sample.json
Forum:  Gadgets_sample.json
Forum:  Λειτουργικά Συστήματα_sample.json
Forum:  Ειδήσεις_sample.json
Forum:  Ψυχαγωγία_sample.json
Forum:  Software_sample.json
Forum:  Διαδίκτυο_sample.json


In [2]:

def benchmark_llm(path="LLMs/LLM_data/llama-3.1-70b-versatile_0.0_data/", annotations_path="forums_info/forums_sampled/"):
    """
    Scores the predictions of one LLM against the annotated samples and returns
    micro- and macro-averaged precision and recall.

    Every file in `path` is loaded as JSON and paired with the file of the same name
    in `annotations_path`. Entry i of the prediction file must have a "greeklish" field
    equal to the "text" field of annotation i; its "greek" field is the prediction that
    is scored against the annotation's "gt_indices".

    Args:
        path (str): Directory with the LLM output files
        annotations_path (str): Directory with the annotation files (same file names as in `path`)

    Returns:
        micro_average_precision (float): Precision over the pooled TP/FP counts
        micro_average_recall (float): Recall over the pooled TP/FN counts
        macro_average_precision (float): Mean of the per-sample precisions
        macro_average_recall (float): Mean of the per-sample recalls
        Each value is 0.0 when its denominator is zero (e.g. no sample could be scored).

    Raises:
        ValueError: If a prediction file is not aligned with its annotation file
    """
    def _ratio(numerator, denominator):
        # Avoid a ZeroDivisionError when nothing was scored
        return numerator / denominator if denominator else 0.0

    # Sorted for a reproducible processing order; sub-directories (for example notebook
    # checkpoint folders) are skipped because they cannot be opened as files
    llama_data = sorted(
        entry for entry in os.listdir(path)
        if os.path.isfile(os.path.join(path, entry))
    )
    print(llama_data)

    precisions = []
    recalls = []

    TP_all = 0
    FP_all = 0
    FN_all = 0

    FP_words_all = []
    FN_words_all = []

    for forum in llama_data:
        # Explicit UTF-8 so that the Greek text does not depend on the platform's default encoding
        with open(os.path.join(path, forum), "r", encoding="utf-8") as f:
            text_data = json.load(f)

        with open(os.path.join(annotations_path, forum), "r", encoding="utf-8") as f:
            annotations_data = json.load(f)

        for i, annotation in enumerate(annotations_data):
            original_text = annotation["text"]
            gt_indices = annotation["gt_indices"]

            # check if the annotation file is aligned with the LLM data;
            # a misalignment is an error, so it is raised instead of exiting with status 0
            if i >= len(text_data) or text_data[i]["greeklish"] != original_text:
                raise ValueError(f"text not found (misalignment): forum={forum!r}, sample index={i}")

            predicted_text = text_data[i]["greek"]

            TP, FP, FN, FP_words, FN_words = calculate_scores_misaligned(original_text, predicted_text, gt_indices)

            FP_words_all.extend(FP_words)
            FN_words_all.extend(FN_words)

            # skip if the denominator is 0
            # NOTE(review): skipped samples are also left out of the pooled (micro) counts,
            # although their words are still collected in the lists above - confirm intended
            if TP + FN == 0 or TP + FP == 0:
                continue

            TP_all += TP
            FP_all += FP
            FN_all += FN

            precisions.append(TP / (TP + FP))
            recalls.append(TP / (TP + FN))

    micro_average_precision = _ratio(TP_all, TP_all + FP_all)
    micro_average_recall = _ratio(TP_all, TP_all + FN_all)

    macro_average_precision = _ratio(sum(precisions), len(precisions))
    macro_average_recall = _ratio(sum(recalls), len(recalls))

    print("FP_words: ", FP_words_all)
    print("FN_words: ", FN_words_all)

    return micro_average_precision, micro_average_recall, macro_average_precision, macro_average_recall


In [4]:
    # Benchmark LLama: score the llama-3.1-70b outputs and report the averaged metrics
    # NOTE(review): this cell is uniformly indented; keep the indentation consistent or
    # dedent the whole cell if it is ever exported as a plain script

    (
        micro_average_precision,
        micro_average_recall,
        macro_average_precision,
        macro_average_recall,
    ) = benchmark_llm("LLMs/LLM_data/llama-3.1-70b-versatile_0.0_data/")

    # Macro averages: mean of the per-sample scores
    print(f"Average macro precision: {macro_average_precision:.2f}")
    print(f"Average macro recall: {macro_average_recall:.2f}")

    # Micro averages: computed from the pooled counts
    print(f"Average micro precision: {micro_average_precision:.2f}")
    print(f"Average micro recall: {micro_average_recall:.2f}")

['Hardware_sample.json']
FP_words:  ['out', 'mac', 'imac;', 'ghost', 'guard', 'nf3', 'πρίντσο(τρόμος)(yamaha', 'atapi', 'cdχειροποίητης', '230watt(οι', 'ξiaomi', 'pcάκι...', '"thermaltake']
FN_words:  ['ips:', 'ips:', 'prob', 'imac?', 'sata', 'gost', 'princo', 'n8x', 'princo(tromos)(yamaha', 'dhl', 'xiaomi', 'pcaki...', 'rakor', 'photos', 'modes', 'sp2']
Average macro precision: 0.84
Average macro recall: 0.83
Average micro precision: 0.87
Average micro recall: 0.87


In [58]:
# Benchmark GPT-4o: score the gpt-4o outputs and report the averaged metrics

(
    micro_average_precision,
    micro_average_recall,
    macro_average_precision,
    macro_average_recall,
) = benchmark_llm("LLMs/LLM_data/gpt-4o_0.0_data/")

# Macro averages: mean of the per-sample scores
print(f"Average macro precision: {macro_average_precision:.2f}")
print(f"Average macro recall: {macro_average_recall:.2f}")

# Micro averages: computed from the pooled counts
print(f"Average micro precision: {micro_average_precision:.2f}")
print(f"Average micro recall: {micro_average_recall:.2f}")

['Προγραμματισμός_sample.json']
FP_words:  ['e', 'tcp/ip.', 'flooders,', 'delphi/kylix.', 'java)', 'χολή....------------------edgstr5smdy=', 'scriptaki', 'thanks']
FN_words:  ['site', 'buton', 'demos', 'pop-p,', 'pop-up']
Average macro precision: 0.95
Average macro recall: 0.95
Average micro precision: 0.87
Average micro recall: 0.93
