#MODELING SCALED OFFENSIVENESS IN GREEK TEXTS THROUGH REGRESSION WITH BEST-WORST SCALING AND PRETRAINED MODELS

#National and Kapodistrian University of Athens

#Department of Informatics and Telecommunications

#Program of Postgraduate Studies: (M.Sc.) in Language Technology

#Master's Thesis


#Balas Antonis (lt12100021)


In [None]:
# Installing and setting up libraries
!pip3 install emoji
!pip3 install sentencepiece
!pip3 install protobuf
!pip3 install torchtext
!pip3 install transformers
!pip3 install unidecode
!pip3 install ekphrasis -U

!huggingface-cli login --token   # Login to HuggingFace Hub

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `test` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `test`


In [None]:
#Importing all necessary Python libraries

import os # Operating system utilities
import re # Regular expressions
import emoji # For emoji handling
import torch # PyTorch for deep learning
import json  # JSON parsing
import pandas as pd  # DataFrame operations
import torch.nn as nn # Neural network modules
import numpy as np # Numerical operations
import seaborn as sns # Plotting
import matplotlib.pyplot as plt # Plotting library
from sklearn.model_selection import train_test_split # For splitting dataset
from math import sqrt  # Square root function
from torch.utils.data import DataLoader, TensorDataset # Data loading utilities
from bs4 import BeautifulSoup  # HTML parsing
from google.colab import drive # Google Drive integration
from transformers import get_linear_schedule_with_warmup  # Learning rate scheduler
from torch.utils.data import Dataset # Custom dataset handling
from tqdm import tqdm # Progress bar
from sklearn.metrics import mean_squared_error # MSE calculation
from sklearn.metrics import r2_score # R-squared calculation
from transformers import (BertConfig, BertForSequenceClassification, BertTokenizer, AlbertConfig, BertModel,
                          AlbertForSequenceClassification, AlbertTokenizer, RobertaConfig,
                          RobertaForSequenceClassification, RobertaTokenizer, DebertaConfig,
                          DebertaForSequenceClassification, DebertaTokenizer, DebertaV2Config,
                          DebertaV2ForSequenceClassification, DebertaV2Tokenizer, XLMRobertaXLConfig,
                          XLMRobertaXLForSequenceClassification, AutoTokenizer, AutoConfig,
                          AutoModelForSequenceClassification) # Model imports

In [None]:
# Mounting Google Drive
drive.mount('/content/drive') # Mount Google Drive so the notebook can read/write files under /content/drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Dictionary containing training and optimization hyperparameters
args = {"num_train_epochs": 20, # Maximum number of epochs to train the model
        'weight_decay': 0.01,  # Weight decay for the parameter group that is not bias / LayerNorm
        'learning_rate': 2e-5, # Learning rate passed to the AdamW optimizer
        'adam_epsilon': 1e-2,  # Epsilon passed to AdamW. NOTE(review): far larger than the usual 1e-8 - confirm this is intentional
        'warmup_steps': 0, # Number of warmup steps for the linear learning rate scheduler
        'data_split_ratio': .2,  # Fraction of the data held out for validation (20%)
        'max_seq_length': 280,  # Maximum input sequence length in tokens (inputs are padded/truncated to it)
        'batch_size': 10, # Number of examples in each batch
        'max_grad_norm': 1.0,   # Maximum norm for gradient clipping to avoid exploding gradients
        'patient': 5, # Early-stopping patience, forwarded to train_the_model as `patience`
        'delta': 1.0,  # `delta` of the Huber loss built in train_the_model (not an early-stopping threshold)
        "threads": 1, # Passed as `num_workers` to the DataLoader
        'output_specific_model_dir': "/content/drive/MyDrive/MODELS/2/Regression/Best_Models/"} # Root directory for saved checkpoints and per-epoch statistics

# Separate configuration dictionary; nothing in the visible cells reads it (TODO confirm where it is used)
config = {"hidden_size": 5}

In [None]:
# Converting emojis in the text to descriptive words (e.g., 😄 -> smiley face)

def emojis_into_text(sentence):
    """Replace every emoji in *sentence* with its English description.

    ``emoji.demojize`` turns each emoji into a ``:name_of_emoji:`` code. Each
    code is then rewritten as plain words (underscores and hyphens become
    spaces, the colons are dropped) and padded with one space on both sides.
    The padding keeps a description from being glued to the neighbouring word
    or to the next emoji when the original text had no whitespace around it.
    Callers that need tidy spacing are expected to collapse/strip whitespace
    afterwards (``preprocessing_greek`` does).

    Args:
        sentence: Text that may contain emoji characters.

    Returns:
        The text with emoji codes replaced by space-separated words.
    """
    demojized_sent = emoji.demojize(sentence)  # emoji -> ':emoji_name:'

    def _describe(match):
        words = match.group(1).replace('_', ' ').replace('-', ' ')
        return f' {words} '

    # One code per match: the name may not contain whitespace or ':' so two
    # adjacent codes (':a::b:') are handled separately instead of as one blob.
    return re.sub(r':([^\s:]+):', _describe, demojized_sent)

# Replacing multiple substrings in a string with a new one

def replaceMultiple(main, replacements, new):
    """Return *main* with each substring in *replacements* swapped for *new*.

    The substitutions are applied one after another, in the order given, so a
    later entry also sees text produced by an earlier substitution.
    """
    result = main
    for target in replacements:
        # str.replace is a no-op when the target is absent.
        result = result.replace(target, new)
    return result

# Normalizes Greek characters

# Lowercase Greek vowels carrying a tonos and/or dialytika -> the bare vowel.
_GREEK_ACCENT_TABLE = str.maketrans("άέήίΐϊόύΰϋώ", "αεηιιιουυυω")


def normalize(x):
    """Strip tonos/dialytika from lowercase Greek vowels in *x*.

    Only the eleven lowercase accented vowels listed in the translation table
    are affected; every other character (including uppercase accented
    letters) is returned unchanged. The mapping is applied in a single pass
    with ``str.translate``.

    Args:
        x: Input string.

    Returns:
        The string with the accented vowels replaced by their bare forms.
    """
    return x.translate(_GREEK_ACCENT_TABLE)

# Separating digits from words (e.g., 'text12' → 'text 12')

def sep_digits(x):
    """Put a space between every run of digits and the surrounding text.

    The string is split on digit runs (the runs themselves are kept) and the
    pieces are re-joined with single spaces, e.g. ``'a1b'`` -> ``'a 1 b'``.
    When the string starts or ends with digits the split yields an empty
    first/last piece, so the result carries a leading/trailing space
    (``'text12'`` -> ``'text 12 '``).
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    return " ".join(re.split(r'(\d+)', x))

# Separates punctuation from words by adding spaces around punctuation

def sep_punc(x):
    """Surround every punctuation character in *x* with single spaces.

    A character counts as punctuation when it appears in the literal set
    below; note that '#' is not in the set, so it is left attached to the
    text that follows it. All other characters are copied unchanged.
    """
    punc = '!"$%&\'()*+,-./:;<=>?@[\\]^_`{|}~؛،؟؛.»«”'
    return "".join(f' {ch} ' if ch in punc else ch for ch in x)

# Complete preprocessing pipeline for Greek-language social media text

def preprocessing_greek(text):
    """Clean one Greek social-media text for tokenization.

    Steps, in order: decode bytes / drop a UTF-8 BOM, strip HTML markup,
    lowercase, remove retweet prefixes, @mentions, the ``username``
    placeholder, URLs and the leftover ``url`` / ``html`` / ``http`` strings,
    verbalize emojis, replace ``&`` with ``και``, unify quote characters to
    ``"``, replace newlines with spaces, strip Greek accents, space out digits
    and punctuation, wrap hashtags in ``<hashtag> ... </hashtag>``, remove
    stand-alone ``rt`` tokens and collapse whitespace.

    Args:
        text: The raw text as ``str`` (or UTF-8 encoded ``bytes``).

    Returns:
        The cleaned, lowercased, whitespace-normalized string.
    """
    # Remove the UTF-8 BOM (bytes EF BB BF / U+FEFF) that marks a UTF-8 file.
    if isinstance(text, (bytes, bytearray)):
        try:
            text = text.decode('utf-8-sig').replace('\ufffd', '?')
        except UnicodeDecodeError:
            pass  # not valid UTF-8: hand the raw bytes to BeautifulSoup as before
    elif isinstance(text, str):
        text = text.lstrip('\ufeff')

    # HTML markup/entities that ended up in the text field are reduced to plain text
    text = BeautifulSoup(text, 'lxml').get_text()

    # Convert to lowercase
    text = str(text).lower()

    # Remove the retweet prefix ("rt @user: ")
    text = re.sub(r'rt @\w+: ', '', text)

    # Remove the @user tags and the 'username' placeholder
    text = re.sub(r'@[a-z0-9_]+', '', text)
    text = re.sub(r'username', '', text)

    # Remove the url links
    text = re.sub(r'http\S+', '', text)

    # Remove the leftover 'url', 'html' and 'http' strings
    for leftover in ('url', 'html', 'http'):
        text = re.sub(leftover, '', text)

    # Convert the emojis into their textual representation
    text = emojis_into_text(text)

    # Replace '&amp;' and '&' with 'και'
    text = re.sub(r'&amp;', 'και', text)
    text = re.sub(r'&', 'και', text)

    # Map typographic quotes/apostrophes to "'" and then every "'" to '"'
    text = re.sub(r"’", "'", text)
    text = re.sub("”", "'", text)
    text = re.sub("“", "'", text)
    text = re.sub("'", '"', text)

    # Replace real newlines and literal "\n" sequences with a space, so the
    # last word of one line is not glued to the first word of the next
    # (the extra spaces are collapsed below)
    text = text.replace("\n", ' ')
    text = text.replace("\\n", ' ')

    # Normalize characters
    text = normalize(text)

    # Space out digits and punctuation
    text = sep_digits(text)
    text = sep_punc(text)

    # Mark hashtags
    text = re.sub(r'#(\w+)', r'<hashtag> \1 </hashtag>', text)

    # Remove stand-alone retweet tokens only; the word boundaries keep the
    # letters "rt" inside ordinary words (e.g. "party", "sport") intact
    text = re.sub(r'\brt\b', ' ', text)

    # Collapse runs of spaces and trim both ends
    text = re.sub(' +', ' ', text)
    return text.strip()

# Applying preprocessing to an entire corpus and optionally saves it

def modify_corpus(data, save=False):
    """Preprocess the ``Text`` column of a corpus and drop duplicate texts.

    Rows are read positionally, so the function also works on a DataFrame
    whose index is not ``0..n-1`` (for example after filtering rows).

    Args:
        data: DataFrame with ``Text`` and ``BWS`` columns and, optionally, a
            ``Lemma`` column.
        save: When true, also write the result to
            ``MODIFIED_Corpus-offensive.xlsx`` in the working directory.

    Returns:
        A new DataFrame with columns ``Text`` (preprocessed), ``Lemma``
        (``None`` when the input has no such column) and ``BWS``, with rows
        whose preprocessed text repeats an earlier one removed.
    """
    texts = [preprocessing_greek(raw) for raw in data["Text"]]
    scores = list(data["BWS"])
    lemmas = list(data["Lemma"]) if "Lemma" in data else [None] * len(texts)

    edited = pd.DataFrame({"Text": texts, "Lemma": lemmas, "BWS": scores})
    edited = edited.drop_duplicates(subset=["Text"])

    if save:
        edited.to_excel("MODIFIED_Corpus-offensive.xlsx", index=False)

    return edited


In [None]:
# Dictionary mapping numeric IDs to BERT model variants, including multilingual and Greek-specific models

dict_BERT_model_names = {1: 'bert-base-uncased',
                         2: 'bert-large-uncased',
                         3: 'bert-base-multilingual-uncased',  # multilingual
                         4: 'bert-base-multilingual-cased',
                         5: 'dimitriz/greek-media-bert-base-uncased',  # Greek-specific
                         6: 'nlpaueb/bert-base-greek-uncased-v1'}  # Greek-specific

# Dictionary of AlBERT base model versions
dict_AlBERT_model_names = {1: 'albert-base-v1',
                           2: 'albert-base-v2',}

# Dictionary of RoBERTa model variants (base and large)
dict_RoBERTa_model_names = {1: 'roberta-base',
                            2: 'roberta-large'}

# Dictionary of DeBERTa model variants
dict_DeBERTa_model_names = {1: 'microsoft/deberta-base',
                            2: 'microsoft/deberta-large',}

# Dictionary of DeBERTaV3 models (includes a multilingual variant)
dict_DeBERTaV2_model_names = {1: 'microsoft/deberta-v3-large',
                              2: 'microsoft/mdeberta-v3-base'}  # multilingual

# Dictionary of XLM-RoBERTa multilingual models (currently only the base model)
dict_XLM_RoBERTa_model_names = {1: 'xlm-roberta-base',}  # multilingual


# Dictionary of miscellaneous multilingual models from various sources
dict_multilingual_model_names = {1: 'studio-ousia/mluke-base',  # multilingual
                                 2: 'cvcio/comments-el-toxic',  # Greek toxic comment model
                                 3: 'autopilot-ai/EthicalEye'}  # multilingual / ethical AI model


# Master dictionary that maps each model family to a 4-tuple:
# (configuration class, sequence-classification model class, tokenizer class, name dictionary)
MODEL_CLASSES = {'BERT': (BertConfig, BertForSequenceClassification, BertTokenizer, dict_BERT_model_names),
                 'AlBERT': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer, dict_AlBERT_model_names),
                 'RoBERTa': (
                     RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer, dict_RoBERTa_model_names),
                 'DeBERTa': (
                     DebertaConfig, DebertaForSequenceClassification, DebertaTokenizer, dict_DeBERTa_model_names),
                 'DeBERTaV3': (
                     DebertaV2Config, DebertaV2ForSequenceClassification, DebertaV2Tokenizer,
                     dict_DeBERTaV2_model_names),
                 # 'xlm-roberta-base' is an XLM-RoBERTa checkpoint, not XLM-RoBERTa-XL (a different
                 # architecture), so the Auto classes are used to resolve the matching model class
                 # from the checkpoint's own configuration.
                 'XLM_RoBERTa': (AutoConfig, AutoModelForSequenceClassification, AutoTokenizer,
                                 dict_XLM_RoBERTa_model_names),
                 'other': (
                     AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, dict_multilingual_model_names)}


In [None]:
# Apply full preprocessing pipeline on a sample Greek tweet/text
# This includes: lowercasing, emoji conversion, user and URL removal, punctuation spacing,
# unicode normalization, hashtag tagging, and spacing digits
# Expected result: a cleaned, token-ready version of the input suitable for transformer models

preprocessing_greek("EΤΣΙ ρε ανακατώστρα Πίτζη! #NomadsGR")

'eτσι ρε ανακατωστρα πιτζη ! <hashtag> nomadsgr </hashtag>'

In [None]:
# Defining a sample Greek sentence to be tokenized
sent = "EΤΣΙ ρε ανακατώστρα Πίτζη! #NomadsGR"

# Tokenize the sentence using the tokenizer's encode_plus method
# NOTE(review): `tokenizer` is not created in any cell visible above - it must be defined before running this cell
encoding = tokenizer.encode_plus(
    text=sent,
    text_pair=None,                # No second sentence is used
    add_special_tokens=True,       # Adds special tokens like [CLS] and [SEP] required by transformer models
    max_length=280,                # Set maximum sequence length to 280 tokens
    padding='max_length',          # Pad sequences shorter than max_length with padding tokens
    truncation=True,               # Truncate sequences longer than max_length
    return_token_type_ids=False,   # Skip token type IDs as they are not used in single-sentence tasks
    return_attention_mask=True,    # Generate attention mask to distinguish between real tokens and padding
    return_tensors='pt'            # Return results as PyTorch tensors
)


# Output the encoded representation to inspect input_ids and attention_mask
print(encoding)

{'input_ids': tensor([[  101,  1041, 29734, 29733, 18199,  1171, 29723,  1155, 16177, 14608,
         29726, 14608, 29734, 29739, 29733, 29734, 29732, 14608,  1170, 18199,
         29734, 29724, 24824,   999,  1001,  2053, 25666, 28745,  2099,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [None]:
# Print the length of the input_ids tensor
print(len(encoding['input_ids'][0]))

# Display the list of token IDs (includes [CLS], [SEP], and padding tokens)
encoding['input_ids'][0]

280


tensor([  101,  1041, 29734, 29733, 18199,  1171, 29723,  1155, 16177, 14608,
        29726, 14608, 29734, 29739, 29733, 29734, 29732, 14608,  1170, 18199,
        29734, 29724, 24824,   999,  1001,  2053, 25666, 28745,  2099,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [None]:
# Print the length of the attention_mask tensor
print(len(encoding['attention_mask'][0]))

# Display the attention mask tensor
# (1 for real tokens, 0 for padding positions)
encoding['attention_mask']

280


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Creating a PyTorch DataLoader from a dataframe
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle):
    """Wrap the ``Text`` / ``BWS`` columns of *df* in a ``DataLoader``.

    The columns are converted to NumPy arrays and handed to
    ``Data_Preparation``, which tokenizes each text on access. The number of
    worker processes is read from the module-level ``args["threads"]``.
    """
    dataset = Data_Preparation(
        text=df.Text.to_numpy(),
        BWS=df.BWS.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    loader_options = {
        "batch_size": batch_size,
        "pin_memory": False,
        "shuffle": shuffle,
        "num_workers": args["threads"],
    }
    return DataLoader(dataset, **loader_options)



# Splitting the dataframe into training and validation sets
def data_splitting(dataframe, text_column, label_column, split_ratio):
    """Split two columns of *dataframe* into train and validation parts.

    Uses ``train_test_split`` with ``random_state=42`` and
    ``test_size=split_ratio``, prints the shape of each part and returns
    ``(train_texts, validation_texts, train_labels, validation_labels)``.
    """
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        dataframe[text_column],
        dataframe[label_column],
        random_state=42,
        test_size=split_ratio,
    )

    # Print shapes for verification
    report = (
        ('Shape of x_train      : ', train_texts),
        ('Shape of y_train      : ', train_labels),
        ('Shape of x_validation : ', val_texts),
        ('Shape of y_validation : ', val_labels),
    )
    for caption, part in report:
        print(caption, part.shape)

    return train_texts, val_texts, train_labels, val_labels


# Customing Dataset class for text and label preparation
class Data_Preparation(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its score.

    Args:
        text: Indexable collection of texts.
        BWS: Indexable collection of scores aligned with *text*, or ``None``
            when no labels are available.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, text, BWS, tokenizer, max_len):
        self.text = text
        self.label = BWS
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask``, the original
        ``text`` and, when labels exist, a float ``label`` tensor."""
        raw_text = self.text[index]

        # Collapse every run of whitespace into one space before tokenizing
        cleaned = " ".join(str(raw_text).split())

        encoded = self.tokenizer.encode_plus(
            text=cleaned,
            text_pair=None,
            add_special_tokens=True,      # add the model's special tokens
            max_length=self.max_len,
            padding='max_length',         # pad up to max_length
            truncation=True,              # cut sequences longer than max_length
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',          # PyTorch tensors
        )

        # flatten() drops the leading batch dimension added by return_tensors='pt'
        item = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        if self.label is not None:
            item['label'] = torch.tensor(self.label[index], dtype=torch.float)
        item['text'] = raw_text
        return item

 # Combining function to split the data and return either DataFrames or DataLoaders
def SplitDataPreparation(dataframe, split_ratio, tokenizer, max_length, batch_size, split_data=True,
                         make_dataloaders=True):
    """Split *dataframe* and wrap the parts in DataLoaders (or return them as DataFrames).

    Args:
        dataframe: DataFrame with ``Text`` and ``BWS`` columns.
        split_ratio: Fraction of rows placed in the validation set.
        tokenizer: Tokenizer handed to the datasets.
        max_length: Padded/truncated sequence length.
        batch_size: Batch size of the DataLoaders.
        split_data: When false, no validation set is made.
        make_dataloaders: Only used when *split_data* is true; when false the
            two splits are returned as DataFrames instead of DataLoaders.

    Returns:
        ``(train, validation)``: two DataLoaders, or two DataFrames when
        ``make_dataloaders`` is false, or ``(train_dataloader, None)`` when
        ``split_data`` is false.
    """
    if not split_data:
        # No validation set: the whole dataframe becomes one shuffled training loader
        train_dataloader = create_data_loader(df=dataframe, tokenizer=tokenizer, max_len=max_length,
                                              batch_size=batch_size, shuffle=True)
        print(
            'The dataframe is not split into train and validation sets. The dataframe converted to train dataloader for training.')
        return train_dataloader, None

    # Split the dataset into train and validation sets
    train_texts, validation_texts, train_labels, validation_labels = data_splitting(dataframe, 'Text', 'BWS',
                                                                                    split_ratio=split_ratio)

    # Re-assemble each split as a two-column DataFrame
    train_df = pd.concat([train_texts, train_labels], axis=1)
    validation_df = pd.concat([validation_texts, validation_labels], axis=1)
    print(f'Dataset split into train and validation sets using {split_ratio} split ratio.')

    if not make_dataloaders:
        return train_df, validation_df

    # The training loader is reshuffled every epoch (as in the unsplit branch);
    # the validation loader keeps a fixed order.
    train_dataloader = create_data_loader(df=train_df, tokenizer=tokenizer, max_len=max_length,
                                          batch_size=batch_size, shuffle=True)
    val_dataloader = create_data_loader(df=validation_df, tokenizer=tokenizer, max_len=max_length,
                                        batch_size=batch_size, shuffle=False)
    print('The train and validation dataloaders are ready for training and evaluation.')
    return train_dataloader, val_dataloader


In [None]:
# A custom PyTorch module for regression using the [CLS] token from a transformer model

class RegressorLastHiddenState(nn.Module):
    """Regression wrapper that squashes one output value of a pretrained model into (0, 1).

    Args:
        pretrained_model: Module called with ``input_ids``, ``attention_mask``
            and ``return_dict=False``; it must expose ``config.hidden_size``.
        device: Stored on the instance as-is; this class does not move anything.
        freeze_bert: When true, gradients are disabled for every parameter of
            *pretrained_model*.
    """

    def __init__(self, pretrained_model, device="cpu", freeze_bert=False):
        super().__init__()

        self.pretrained_model = pretrained_model
        self.regressor = nn.Sigmoid()   # parameter-free squashing to (0, 1)
        self.n_input = pretrained_model.config.hidden_size
        self.regression = None          # placeholder, never read in this class
        self.device = device

        # Optionally keep the wrapped model's weights fixed
        if freeze_bert:
            for weight in self.pretrained_model.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return ``sigmoid(first_output[:, 0])`` for the batch.

        ``first_output`` is element 0 of the tuple returned by the wrapped
        model. NOTE(review): setup_pretrained_model wraps
        ``*ForSequenceClassification`` models, for which element 0 is
        presumably the logits tensor of shape (batch, num_labels) rather than
        the last hidden state - in that case this selects the first logit and
        the result has shape (batch,). With a bare encoder it would instead
        be the first token's hidden vector, shape (batch, hidden_size).
        Confirm which is intended.
        """
        model_outputs = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask,
                                              return_dict=False)
        first_output = model_outputs[0]
        selected = first_output[:, 0]   # index 0 along the second dimension
        return self.regressor(selected)


In [None]:
#Training & Evaluation Process

def setup_pretrained_model(classificationModel, model_name, args, train_dataloader, device="cpu"):
    """Build the regression model together with its optimizer and LR scheduler.

    Args:
        classificationModel: Class providing ``from_pretrained``.
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        args: Hyperparameter dict; reads ``num_train_epochs``, ``weight_decay``,
            ``learning_rate``, ``adam_epsilon`` and ``warmup_steps``.
        train_dataloader: Only its length (batches per epoch) is used.
        device: Device the wrapped model is moved to.

    Returns:
        ``(model, optimizer, scheduler, num_train_steps)``.
    """
    backbone = classificationModel.from_pretrained(
        model_name,
        output_attentions=False,    # do not return attention weights
        output_hidden_states=True)  # also return the per-layer hidden states

    # Wrap the pretrained model and move it to the requested device
    model = RegressorLastHiddenState(pretrained_model=backbone, device=device)
    model.to(device)

    # Total optimizer steps = batches per epoch * epochs
    num_train_steps = int(len(train_dataloader) * args["num_train_epochs"])

    # Parameters whose name contains one of these markers get no weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    with_decay, without_decay = [], []
    for param_name, param in model.named_parameters():
        exempt = any(marker in param_name for marker in no_decay)
        (without_decay if exempt else with_decay).append(param)

    optimizer_grouped_parameters = [
        {'params': with_decay, 'weight_decay': args['weight_decay']},
        {'params': without_decay, 'weight_decay': 0.0},
    ]

    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=args['learning_rate'],
                                  eps=args['adam_epsilon'],
                                  betas=(0.9, 0.999))

    # Linear decay of the learning rate after the warmup steps
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args['warmup_steps'],
                                                num_training_steps=num_train_steps)

    return model, optimizer, scheduler, num_train_steps



def splitData(tokenizer_name, model_name, dataframe, split_ratio, max_length, batch_size, split_data, make_dataloaders):
    """Load the tokenizer for *model_name* and prepare the train/validation data.

    *tokenizer_name* is a tokenizer class (it must provide ``from_pretrained``);
    every other argument is forwarded unchanged to ``SplitDataPreparation``,
    whose two results are returned as a pair.
    """
    tokenizer = tokenizer_name.from_pretrained(model_name)

    train_part, validation_part = SplitDataPreparation(
        dataframe=dataframe,
        split_ratio=split_ratio,
        tokenizer=tokenizer,
        max_length=max_length,
        batch_size=batch_size,
        split_data=split_data,
        make_dataloaders=make_dataloaders,
    )
    return train_part, validation_part


def _log_batch_predictions(stats, texts, predictions, targets, is_training):
    """Append one row per example (text, gold score, prediction, split flag) to *stats*."""
    predictions = predictions.detach().cpu().numpy()
    targets = targets.to('cpu').numpy()
    for text, prediction, target in zip(texts, predictions, targets):
        stats['Text'].append(text)
        stats['BWS'].append(float(target))
        stats['Prediction'].append(float(prediction))
        stats['Training'].append(is_training)


def train_the_model(model, train_dataloader, val_dataloader, optimizer, scheduler, args, patience, model_class, model_filename):
    """Train *model* with Huber loss, validate after every epoch and keep the best checkpoint.

    After each epoch the mean validation loss is compared with the best value
    so far. An improvement saves the model through ``save_model`` and resets
    the early-stopping counter; otherwise the counter grows, and training
    stops once *patience* consecutive epochs have passed without improvement.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        train_dataloader: Yields dicts with ``input_ids``, ``attention_mask``,
            ``label`` and ``text``.
        val_dataloader: Same format, used for validation. Must not be ``None``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        args: Hyperparameter dict; reads ``num_train_epochs``, ``delta``,
            ``max_grad_norm`` and ``output_specific_model_dir``.
        patience: Number of consecutive non-improving epochs tolerated.
        model_class: Model family name, used in the checkpoint path.
        model_filename: Model name, used in the checkpoint path ('/' -> '-').

    Returns:
        Dict mapping the 1-based epoch number (as a string) to that epoch's
        statistics: parallel lists ``Text``, ``BWS``, ``Prediction`` and
        ``Training`` (True for training examples, False for validation ones).

    Raises:
        ValueError: If *val_dataloader* is ``None``; checkpointing and early
            stopping both depend on the validation loss.
    """
    if val_dataloader is None:
        raise ValueError('train_the_model requires a validation dataloader.')

    # Run on whatever device the model already lives on instead of relying on a global
    device = next(model.parameters()).device

    # Huber loss is more robust to outliers; it has no parameters, so no device move is needed
    lossF = nn.HuberLoss(delta=args["delta"])
    checkpoint_dir = f"{args['output_specific_model_dir']}{model_class}/{re.sub('/', '-', model_filename)}"

    best_score = None              # Best (lowest) validation loss seen so far
    counter = 0                    # Consecutive epochs without improvement
    statistics_per_epoch = dict()  # Detailed epoch-wise predictions

    for epoch in range(args['num_train_epochs']):
        print(f'\n======== EPOCH {epoch + 1} / {args["num_train_epochs"]} ========\n')
        print('TRAINING MODEL...')

        training_loss = 0
        stats = {'Text': [],
                 'BWS': [],
                 'Prediction': [],
                 'Training': []}

        model.train()   # Setting model to training mode

        for step, batch in enumerate(tqdm(train_dataloader, desc='Training iteration')):

            # Progress report every 1000 batches
            if (step + 1) % 1000 == 0:
                print(f' Batch {step+1} of {len(train_dataloader)}')

            # Moving input data to device
            input_ids = batch['input_ids'].to(device)
            attention_masks = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Resetting gradients before the backward pass
            optimizer.zero_grad()

            # Forward pass and loss
            predictions = model(input_ids=input_ids, attention_mask=attention_masks)
            loss = lossF(predictions, labels)

            # Logging predictions (detached copies; the graph is untouched)
            _log_batch_predictions(stats, batch["text"], predictions, labels, True)

            # Backward pass, then clip the gradient norm to avoid exploding gradients
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args['max_grad_norm'])

            # Parameter update, learning-rate update, then release the gradients
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Accumulating total training loss
            training_loss += loss.item()

            # Clearing cache to save memory
            torch.cuda.empty_cache()

        # Average training loss for this epoch
        train_loss_of_epoch = training_loss / len(train_dataloader)
        print(f'Train Loss: {train_loss_of_epoch:.3f}')

        print('\nEVALUATING MODEL...')

        model.eval()  # Setting the model to evaluation mode
        eval_loss = 0

        # Gradients are not needed during evaluation
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc='Evaluation iteration'):
                ids_inputs = batch['input_ids'].to(device)
                att_masks = batch['attention_mask'].to(device)
                targets = batch['label'].to(device)

                outputs = model(input_ids=ids_inputs, attention_mask=att_masks)
                batch_loss = lossF(outputs, targets)

                # Saving predictions and corresponding true values for analysis
                _log_batch_predictions(stats, batch["text"], outputs, targets, False)

                # Accumulating total validation loss
                eval_loss += batch_loss.item()

                # Free up cached memory
                torch.cuda.empty_cache()

        # Average validation loss for the current epoch
        loss_of_epoch_val = eval_loss / len(val_dataloader)
        print(f'Validation Loss: {loss_of_epoch_val:.3f}')

        # Saving epoch statistics (predictions, labels, texts)
        statistics_per_epoch[f"{epoch+1}"] = stats

        if best_score is None or loss_of_epoch_val < best_score:
            # New best validation loss: checkpoint and restart the patience window
            best_score = loss_of_epoch_val
            counter = 0
            save_model(model, model_filename, checkpoint_dir)
        else:
            # No improvement: stop once the patience is used up
            counter += 1
            if counter >= patience:
                print('EARLY STOPPING!')
                break

    print('Training and evaluation process complete!')

    return statistics_per_epoch    # Epoch-wise detailed prediction statistics


def run_it(model_class, model_name, args, total_stat):
    """Fine-tune one pretrained model and write its per-epoch statistics to disk.

    The dataloaders are built from the module-level ``training_dataset`` and the
    model is set up on the module-level ``device``; both must be defined before
    this function is called.

    Args:
        model_class: Key into ``MODEL_CLASSES`` (e.g. ``"BERT"``).
        model_name: Name of the pretrained checkpoint. Any ``/`` is replaced by
            ``-`` when it is used as a directory name.
        args: Dict of run settings. This function reads ``data_split_ratio``,
            ``max_seq_length``, ``batch_size``, ``patient`` and
            ``output_specific_model_dir``; the whole dict is also forwarded to
            ``setup_pretrained_model`` and ``train_the_model``.
        total_stat: Accepted for interface compatibility; not used here.

    Returns:
        None. The statistics returned by ``train_the_model`` are written to
        ``<output_specific_model_dir><model_class>/<model_name>/statistics_per_epoch.json``.
    """
    # Preparing the training and validation dataloaders using the tokenizer for the given model
    train_dataloader, val_dataloader = splitData(
        MODEL_CLASSES[model_class][2], model_name, training_dataset,
        args['data_split_ratio'], args['max_seq_length'], args['batch_size'],
        True, True)

    # Loading the pretrained model together with its optimizer and scheduler
    # (the returned number of training steps is not needed here)
    model, optimizer, scheduler, _num_train_steps = setup_pretrained_model(
        MODEL_CLASSES[model_class][1], model_name, args, train_dataloader, device)

    # Training and evaluating the model, returning stats for each epoch
    statistics = train_the_model(model, train_dataloader, val_dataloader, optimizer, scheduler, args,
                                 args["patient"], model_class, model_name)

    # Preparing directory for saving statistics
    s_model_name = re.sub("/", "-", model_name)
    save_to = f"{args['output_specific_model_dir']}{model_class}/{s_model_name}"
    os.makedirs(save_to, exist_ok=True)

    # Saving per-epoch statistics to a JSON file
    file_path = f"{save_to}/statistics_per_epoch.json"
    with open(file_path, "w") as json_file:
        json.dump(statistics, json_file)
    print(f"Best statistics saved as {file_path}")

    # Cleaning up GPU memory: the references must be dropped BEFORE the cache is
    # emptied, otherwise the tensors are still in use and nothing is released
    del model, optimizer, scheduler
    torch.cuda.empty_cache()


In [None]:
# Function to save the model
def save_model(model, experiment_name, model_output_dir):
    """Persist ``model`` under ``model_output_dir`` in two formats.

    Two files are written, where ``<name>`` is ``experiment_name`` with every
    ``/`` replaced by ``-``:

    * ``<name>.pth`` - only the ``state_dict``
    * ``<name>.pt``  - the whole pickled model object

    These are the extensions ``load_saved_model`` dispatches on.

    Args:
        model: The model to save. If it exposes a ``module`` attribute
            (parallel/distributed wrapper), the wrapped module is saved.
        experiment_name: Base file name; may contain ``/``.
        model_output_dir: Target directory; created if it does not exist.
    """
    # Creating the output directory if it doesn't exist
    os.makedirs(model_output_dir, exist_ok=True)

    # Replacing slashes in experiment name to avoid directory issues
    experiment_name = re.sub(r'/', '-', experiment_name)

    # Defining full output path (without extension) for the model files
    output_model_file = os.path.join(model_output_dir, experiment_name)

    # Getting the actual model if it's wrapped in a parallel/distributed wrapper
    model_to_save = model.module if hasattr(model, 'module') else model

    # Save the model's state_dict (recommended way). It gets its own '.pth'
    # file so that the full-object save below does not overwrite it.
    torch.save(model_to_save.state_dict(), f"{output_model_file}.pth")

    # Also save the entire model object (less portable but easier to reload)
    torch.save(model_to_save, f"{output_model_file}.pt")

    print(f'Model saved to {model_output_dir} as {experiment_name}')


# Function to load a previously saved model
def load_saved_model(defined_model, experiment_name, model_output_dir):
    """Load a checkpoint from ``model_output_dir`` based on its file extension.

    Args:
        defined_model: An already constructed model. Only used for ``.pth``
            files, where the stored ``state_dict`` is loaded into it in place.
        experiment_name: File name of the checkpoint, including its extension.
        model_output_dir: Directory that contains the checkpoint.

    Returns:
        ``defined_model`` with the stored weights for a ``.pth`` file, the
        unpickled model object for a ``.pt`` file, or ``None`` when the
        extension is neither of the two.
    """
    # Building the full path to the saved model
    saved_model_path = os.path.join(model_output_dir, experiment_name)
    loaded_model = None

    if experiment_name.endswith('.pth'):
        # load_state_dict() updates the model in place and returns a report of
        # missing/unexpected keys, so the model itself is what must be returned
        defined_model.load_state_dict(torch.load(saved_model_path))
        loaded_model = defined_model
        print('Model loaded successfully')
    elif experiment_name.endswith('.pt'):
        # Loading the entire model object. This unpickles arbitrary Python
        # objects, so it should only be used on checkpoints created locally.
        # NOTE(review): recent PyTorch releases default to weights_only=True,
        # which rejects whole pickled modules unless weights_only=False is
        # passed - confirm against the installed version.
        loaded_model = torch.load(saved_model_path)
        print('Model loaded successfully')
    else:
        print('No such model found.')
    return loaded_model

In [None]:
# Summary of the training configuration: the loss, optimizer and betas are
# fixed labels, every other value is read from the `args` dictionary.
# One print call with a newline separator emits each entry on its own line.
print(
    'TRAINING PARAMETERS:\n',
    'Loss function used: HuberLoss',
    'Optimizer used: AdamW',
    f"Learning Rate: {args['learning_rate']:.5}",
    f"Adam Epsilon: {args['adam_epsilon']}",
    f'Betas: {0.9, 0.999}',
    f'Weight Decay: {args["weight_decay"]}',
    f'Batch Size: {args["batch_size"]}',
    f'Number of training epochs: {args["num_train_epochs"]}',
    f'Maximum Sequence Length: {args["max_seq_length"]}',
    f'Warm-up Steps: {args["warmup_steps"]}',
    f"Delta value for Huber Loss: {args['delta']}",
    sep='\n',
)

In [None]:
# Define the device to run the model on: GPU if available, otherwise CPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading the dataset from an Excel file
# data = pd.read_excel(
#     "/content/drive/MyDrive/MODELS/corpus/fiction.xlsx")

# Apply preprocessing and formatting to the dataset without saving it to a file
# training_dataset = modify_corpus(data, False)

In [None]:
# total_stat = dict()

# for model_class in MODEL_CLASSES:
#   if model_class in ["BERT"]:
#     total_stat[model_class] = dict()
#     for model_name in MODEL_CLASSES[model_class][3].values():
#       if model_name in ["nlpaueb/bert-base-greek-uncased-v1",
#                         'dimitriz/greek-media-bert-base-uncased']:
#         run_it(model_class, model_name, args, total_stat)


In [None]:
# Excel corpora used for fine-tuning; each key doubles as the name of the
# output sub-folder that receives the models trained on that corpus
corpuses = {
    'tweets': "/content/drive/MyDrive/MODELS/corpus/tweets.xlsx",
    'fiction': "/content/drive/MyDrive/MODELS/corpus/fiction.xlsx",
    'blogs': "/content/drive/MyDrive/MODELS/corpus/blogs.xlsx",
    'fiction+blogs': "/content/drive/MyDrive/MODELS/corpus/fiction+blogs.xlsx",
    'tweets+fiction': "/content/drive/MyDrive/MODELS/corpus/tweets+fiction.xlsx",
    'tweets+blogs': "/content/drive/MyDrive/MODELS/corpus/tweets+blogs.xlsx",
    'all': "/content/drive/MyDrive/MODELS/corpus/fiction+blogs+tweets.xlsx",
}

# Run on the GPU whenever CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One full training round per corpus
for folder_name, corpus_path in corpuses.items():
  # run_it() reads the module-level `training_dataset`, so it is rebuilt
  # from the current corpus before any model is trained
  data = pd.read_excel(corpus_path)
  training_dataset = modify_corpus(data, False)

  # Corpus-specific output directory; the trailing slash is needed because
  # run_it() appends the model class directly to this prefix
  args['output_specific_model_dir'] = os.path.join(
      "/content/drive/MyDrive/MODELS/Regression/Best_Models/", folder_name) + '/'

  # Per-model-class statistics container handed to run_it()
  total_stat = {}
  for model_class in MODEL_CLASSES:
    # Only the BERT family is fine-tuned in this experiment
    if model_class in ["BERT"]:
      total_stat[model_class] = {}
      # Every pretrained checkpoint registered for this model class
      for model_name in MODEL_CLASSES[model_class][3].values():
        # Full fine-tuning and evaluation pipeline for one checkpoint
        run_it(model_class, model_name, args, total_stat)
