#MODELING SCALED OFFENSIVENESS IN GREEK TEXTS THROUGH REGRESSION WITH BEST-WORST SCALING AND PRETRAINED MODELS

#National and Kapodistrian University of Athens

#Department of Informatics and Telecommunications

#Program of Postgraduate Studies: (M.Sc.) in Language Technology

#Master's Thesis


#Balas Antonis (lt12100021)


In [None]:
# Importing required libraries

import os
import json
import pandas as pd

from google.colab import drive

# Mounting Google Drive at /content/drive so the files stored there are reachable from this Colab session
drive.mount("/content/drive")
# Switching the working directory to the AIKIA regression folder; the relative
# directory listing performed later in this notebook is resolved against it
os.chdir('/content/drive/MyDrive/MODELS/Regression/AIKIA')

Mounted at /content/drive


In [None]:
# Collecting the best-epoch statistics of every model stored under the working directory
def collect_statistics(root='.'):
    """Gather best-epoch training/evaluation metrics for every model under *root*.

    The expected layout is
    ``<root>/<model category>/<model>/general_statistics.json``.
    Excel files, checkpoint folders, entries that are not directories and
    model folders without a statistics file are skipped.

    Args:
        root: Directory holding one sub-folder per model category.
            Defaults to the current working directory.

    Returns:
        A nested dictionary ``{category: {model: {'train': {...}, 'eval': {...}}}}``.
        Each inner dictionary has the keys 'epoch', 'r2', 'mse', 'rmse',
        'rse' and 'loss'. A category appears only if at least one of its
        models has a statistics file.

    Raises:
        KeyError: If a statistics file lacks one of the expected keys.
    """
    collected = {}

    # Sorting the listings makes the traversal order (and so the row order of
    # any table built from the result) independent of the file system
    for model_cat in sorted(os.listdir(root)):
        # Skipping Excel files and checkpoint folders
        if model_cat.endswith(('.xlsx', '.ipynb_checkpoints')):
            continue

        # Skipping any other plain file: os.listdir() on it would raise NotADirectoryError
        category_path = os.path.join(root, model_cat)
        if not os.path.isdir(category_path):
            continue

        # Looping through each model inside the current model category folder
        for model in sorted(os.listdir(category_path)):
            # Constructing the full path to the statistics file
            json_path = os.path.join(category_path, model, 'general_statistics.json')

            # Skipping this model if the JSON file does not exist
            if not os.path.isfile(json_path):
                continue

            # Loading the JSON file; the handle is closed before any processing
            with open(json_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            per_epoch = data['Entirely Statistics']
            summary = {}
            for split, section in (('train', 'Training'), ('eval', 'Evaluation')):
                best = data['Best'][section]
                # The per-epoch tables are keyed by the epoch number as a string
                epoch_key = str(best['Epoch'])
                summary[split] = {
                    'epoch': best['Epoch'],
                    'r2': per_epoch[section]['R_Squared'][epoch_key],
                    'mse': per_epoch[section]['MSE'][epoch_key],
                    'rmse': per_epoch[section]['RMSE'][epoch_key],
                    'rse': per_epoch[section]['RSE'][epoch_key],
                    'loss': best['Value'],
                }

            # Creating the category entry on first use, then registering the model
            collected.setdefault(model_cat, {})[model] = summary

    return collected


# Dictionary with the statistics of all models found in the current directory
statistics = collect_statistics()

In [None]:
# Flattening the nested statistics into one row per (model category, model) pair;
# every row carries the best-epoch training and evaluation metrics
records = [
    {
        "Model Category": category,
        "Model": name,
        # Mapping each output column to its (split, metric) source in the statistics
        **{
            column: scores[split][metric]
            for column, split, metric in (
                ("Train R2", "train", "r2"),
                ("Train RSE", "train", "rse"),
                ("Train MSE", "train", "mse"),
                ("Train RMSE", "train", "rmse"),
                ("Train Loss", "train", "loss"),
                ("Eval R2", "eval", "r2"),
                ("Eval RSE", "eval", "rse"),
                ("Eval MSE", "eval", "mse"),
                ("Eval RMSE", "eval", "rmse"),
                ("Eval Loss", "eval", "loss"),
            )
        },
    }
    for category, per_model in statistics.items()
    for name, scores in per_model.items()
]

# Turning the rows into a pandas DataFrame for further analysis or export
df = pd.DataFrame(records)

In [None]:
# Ranking the models by their evaluation results:
#  - "Eval R2" from highest to lowest (higher R² is better)
#  - "Eval Loss" from lowest to highest as the tie-breaker (lower loss is better)
# The rows are renumbered from 0 so the index reflects the ranking
sorted_by_eval = df.sort_values(
    ["Eval R2", "Eval Loss"],
    ascending=[False, True],
    ignore_index=True,
)

# Showing the ranked table, best-performing models first
sorted_by_eval

Unnamed: 0,Model Category,Model,Train R2,Train RSE,Train MSE,Train RMSE,Train Loss,Eval R2,Eval RSE,Eval MSE,Eval RMSE,Eval Loss
0,BERT,nlpaueb-bert-base-greek-uncased-v1,0.542213,0.154777,0.023949,0.154753,0.023949,0.542288,0.154732,0.023912,0.154635,0.023912
1,BERT,dimitriz-greek-media-bert-base-uncased,0.512573,0.15971,0.025499,0.159685,0.025499,0.501183,0.161531,0.026059,0.161429,0.026059
2,DeBERTaV2,microsoft-mdeberta-v3-base,0.54063,0.155045,0.024031,0.155021,0.024031,0.489454,0.163419,0.026672,0.163316,0.026672
3,DeBERTaV2,microsoft-deberta-v3-large,0.495758,0.162441,0.026379,0.162416,0.026379,0.402916,0.176727,0.031193,0.176616,0.031193
4,other,cvcio-comments-el-toxic,0.419942,0.174226,0.030345,0.174198,0.030345,0.37026,0.181495,0.032899,0.181382,0.032899
5,other,studio-ousia-mluke-base,0.349444,0.184509,0.034033,0.184481,0.034033,0.306306,0.190488,0.03624,0.190369,0.03624
6,other,autopilot-ai-EthicalEye,0.23026,0.2007,0.040268,0.200669,0.040268,0.297343,0.191715,0.036709,0.191595,0.036709
7,BERT,bert-base-multilingual-uncased,0.303523,0.19091,0.036435,0.19088,0.036435,0.28816,0.192964,0.037188,0.192843,0.037188
8,BERT,bert-base-multilingual-cased,0.283754,0.193601,0.03747,0.193571,0.03747,0.255886,0.197289,0.038874,0.197166,0.038874
9,DeBERTa,microsoft-deberta-large,0.22596,0.20126,0.040493,0.201229,0.040493,0.211841,0.203045,0.041175,0.202917,0.041175


In [None]:
# Ranking the models by their training results:
#  - "Train R2" from highest to lowest (higher R² is better)
#  - "Train Loss" from lowest to highest as the tie-breaker (lower loss is better)
# The rows are renumbered from 0 so the index reflects the ranking
sorted_by_train = df.sort_values(
    ["Train R2", "Train Loss"],
    ascending=[False, True],
    ignore_index=True,
)

# Showing the ranked table, best-performing models first
sorted_by_train

Unnamed: 0,Model Category,Model,Train R2,Train RSE,Train MSE,Train RMSE,Train Loss,Eval R2,Eval RSE,Eval MSE,Eval RMSE,Eval Loss
0,BERT,nlpaueb-bert-base-greek-uncased-v1,0.542213,0.154777,0.023949,0.154753,0.023949,0.542288,0.154732,0.023912,0.154635,0.023912
1,DeBERTaV2,microsoft-mdeberta-v3-base,0.54063,0.155045,0.024031,0.155021,0.024031,0.489454,0.163419,0.026672,0.163316,0.026672
2,BERT,dimitriz-greek-media-bert-base-uncased,0.512573,0.15971,0.025499,0.159685,0.025499,0.501183,0.161531,0.026059,0.161429,0.026059
3,DeBERTaV2,microsoft-deberta-v3-large,0.495758,0.162441,0.026379,0.162416,0.026379,0.402916,0.176727,0.031193,0.176616,0.031193
4,other,cvcio-comments-el-toxic,0.419942,0.174226,0.030345,0.174198,0.030345,0.37026,0.181495,0.032899,0.181382,0.032899
5,other,studio-ousia-mluke-base,0.349444,0.184509,0.034033,0.184481,0.034033,0.306306,0.190488,0.03624,0.190369,0.03624
6,BERT,bert-base-multilingual-uncased,0.303523,0.19091,0.036435,0.19088,0.036435,0.28816,0.192964,0.037188,0.192843,0.037188
7,BERT,bert-base-multilingual-cased,0.283754,0.193601,0.03747,0.193571,0.03747,0.255886,0.197289,0.038874,0.197166,0.038874
8,AlBERT,albert-base-v2,0.269078,0.195574,0.038237,0.195544,0.038237,0.198846,0.204711,0.041854,0.204583,0.041854
9,other,autopilot-ai-EthicalEye,0.23026,0.2007,0.040268,0.200669,0.040268,0.297343,0.191715,0.036709,0.191595,0.036709


In [None]:
# Saving both rankings into a single Excel workbook, one sheet per ranking
with pd.ExcelWriter("/content/drive/MyDrive/MODELS/Regression/Best_Models/sorted_by_metrics_AIKIA.xlsx") as writer:
    # Writing the training ranking first and the evaluation ranking second,
    # each to its own sheet and without the DataFrame index column
    for sheet_title, ranking in (
        ("Sorted by Train", sorted_by_train),
        ("Sorted by Eval", sorted_by_eval),
    ):
        ranking.to_excel(writer, sheet_name=sheet_title, index=False)