In [3]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy 
import numpy as np
from scipy.special import softmax
# import tensorflow as tf

import pandas as pd
from transformers import pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
 
from tqdm import tqdm
 
from sklearn.model_selection import train_test_split
 
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import evaluate
import os


In [4]:
sentences = pd.read_excel('LLM_examples.xlsx')

In [6]:
# MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
# MODEL = "DTAI-KULeuven/robbertje-merged-dutch-sentiment"

MODEL = "DTAI-KULeuven/robbert-v2-dutch-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels = 2)



In [8]:
def tokenizer_data(df_train, df_test, model_name, label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}):

    # Load datasets
    dataset_train = Dataset.from_pandas(df_train)
    dataset_test = Dataset.from_pandas(df_test)
    tokenizer = AutoTokenizer.from_pretrained(model_name)#("google-bert/bert-base-cased")
 
    # Tokenize function
    def tokenize_function(examples):
        return tokenizer(examples["Sentence"], padding="max_length", truncation=True)
 
    # Map sentiment labels to integers

    #label_mapping = {'Negatief': 0, 'Neutraal': 1, 'Positief': 2}
 
    def map_labels(examples):
        examples['labels'] = [label_mapping[label] for label in examples['Sentiment']]
        return examples
 
    # Tokenize datasets
    small_train_dataset = dataset_train.map(tokenize_function, batched=True)
    small_eval_dataset = dataset_test.map(tokenize_function, batched=True)
 
    # Apply label mapping
    small_train_dataset = small_train_dataset.map(map_labels, batched=True)
    small_eval_dataset = small_eval_dataset.map(map_labels, batched=True)

    return small_train_dataset, small_eval_dataset

In [110]:
def fine_tune_llm(df_train, df_test, model_name, num_labels, label_mapping = {'Negatief': 0, 'Neutraal': 1, 'Positief': 2}, epochs=3):
 
    small_train_dataset, small_eval_dataset = (
        tokenizer_data(df_train=df_train, 
                       df_test=df_test, 
                       model_name = model_name,
                       label_mapping = label_mapping)
    )
 
    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, ignore_mismatched_sizes=True)

    # Load metric
    metric = evaluate.load("accuracy")
 
    # Compute metrics function
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)
 
    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        run_name="my_unique_run_name",
        report_to=[],  # Disable W&B logging
        logging_dir='./logs',  # Directory for storing logs
        logging_steps=10,  # Log every 10 steps
        eval_strategy="epoch",  # Evaluate at the end of each epoch
        save_strategy="epoch",  # Save the model at the end of each epoch
        per_device_train_batch_size=4,  # Adjust based on your setup
        per_device_eval_batch_size=4,  # Adjust based on your setup
        num_train_epochs=epochs,  # Number of training epochs
        load_best_model_at_end=True,  # Load the best model at the end of training
    )
 
    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )
 
    # Disable W&B logging
    os.environ["WANDB_DISABLED"] = "true"
 
    # Train the model
    trainer.train()

    return trainer, small_train_dataset, small_eval_dataset    
 

In [71]:
small_train_dataset, small_eval_dataset = (
    tokenizer_data(df_train=sentences, 
                    df_test=sentences, 
                    model_name = MODEL,
                    label_mapping = {'Negatief': 0, 'Neutraal': 1, 'Positief': 2})
)

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

In [111]:
trainer, small_train_dataset, small_eval_dataset = fine_tune_llm(sentences, sentences, model_name=MODEL,num_labels=3, label_mapping = {'Negatief': 0, 'Neutraal': 1, 'Positief': 2}, epochs=3)

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DTAI-KULeuven/robbert-v2-dutch-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/753 [00:00<?, ?it/s]

{'loss': 1.0512, 'grad_norm': 9.979782104492188, 'learning_rate': 4.9335989375830016e-05, 'epoch': 0.04}
{'loss': 0.809, 'grad_norm': 24.539335250854492, 'learning_rate': 4.867197875166003e-05, 'epoch': 0.08}
{'loss': 0.3882, 'grad_norm': 2.535130023956299, 'learning_rate': 4.8007968127490044e-05, 'epoch': 0.12}
{'loss': 0.4383, 'grad_norm': 1.3243513107299805, 'learning_rate': 4.734395750332006e-05, 'epoch': 0.16}
{'loss': 0.4687, 'grad_norm': 54.80116653442383, 'learning_rate': 4.6679946879150064e-05, 'epoch': 0.2}
{'loss': 0.4039, 'grad_norm': 0.14871855080127716, 'learning_rate': 4.601593625498008e-05, 'epoch': 0.24}
{'loss': 0.7554, 'grad_norm': 57.78717041015625, 'learning_rate': 4.535192563081009e-05, 'epoch': 0.28}
{'loss': 0.3846, 'grad_norm': 6.090554714202881, 'learning_rate': 4.4687915006640105e-05, 'epoch': 0.32}
{'loss': 0.2683, 'grad_norm': 5.018764019012451, 'learning_rate': 4.402390438247012e-05, 'epoch': 0.36}
{'loss': 0.2508, 'grad_norm': 0.12534798681735992, 'learni

  0%|          | 0/251 [00:00<?, ?it/s]

{'eval_loss': 0.06087331101298332, 'eval_accuracy': 0.9890219560878244, 'eval_runtime': 38.7775, 'eval_samples_per_second': 25.84, 'eval_steps_per_second': 6.473, 'epoch': 1.0}
{'loss': 0.0016, 'grad_norm': 0.017108196392655373, 'learning_rate': 3.2735723771580345e-05, 'epoch': 1.04}
{'loss': 0.1687, 'grad_norm': 0.01827259175479412, 'learning_rate': 3.207171314741036e-05, 'epoch': 1.08}
{'loss': 0.0079, 'grad_norm': 0.011581240221858025, 'learning_rate': 3.140770252324037e-05, 'epoch': 1.12}
{'loss': 0.0067, 'grad_norm': 0.014100435189902782, 'learning_rate': 3.0743691899070386e-05, 'epoch': 1.16}
{'loss': 0.0019, 'grad_norm': 6.783116340637207, 'learning_rate': 3.00796812749004e-05, 'epoch': 1.2}
{'loss': 0.1118, 'grad_norm': 0.01173590961843729, 'learning_rate': 2.9415670650730414e-05, 'epoch': 1.24}
{'loss': 0.0022, 'grad_norm': 0.01238594576716423, 'learning_rate': 2.8751660026560427e-05, 'epoch': 1.27}
{'loss': 0.188, 'grad_norm': 0.010920075699687004, 'learning_rate': 2.80876494

  0%|          | 0/251 [00:00<?, ?it/s]

{'eval_loss': 0.014515336602926254, 'eval_accuracy': 0.9970059880239521, 'eval_runtime': 38.8433, 'eval_samples_per_second': 25.796, 'eval_steps_per_second': 6.462, 'epoch': 2.0}
{'loss': 0.0007, 'grad_norm': 0.014316472224891186, 'learning_rate': 1.6135458167330678e-05, 'epoch': 2.03}
{'loss': 0.0005, 'grad_norm': 0.009110601618885994, 'learning_rate': 1.547144754316069e-05, 'epoch': 2.07}
{'loss': 0.0639, 'grad_norm': 0.010127268731594086, 'learning_rate': 1.4807436918990705e-05, 'epoch': 2.11}
{'loss': 0.0005, 'grad_norm': 0.010387727990746498, 'learning_rate': 1.4143426294820719e-05, 'epoch': 2.15}
{'loss': 0.0005, 'grad_norm': 0.007823833264410496, 'learning_rate': 1.3479415670650731e-05, 'epoch': 2.19}
{'loss': 0.09, 'grad_norm': 0.009702351875603199, 'learning_rate': 1.2815405046480745e-05, 'epoch': 2.23}
{'loss': 0.0034, 'grad_norm': 0.007953206077218056, 'learning_rate': 1.2151394422310758e-05, 'epoch': 2.27}
{'loss': 0.0005, 'grad_norm': 0.009092980995774269, 'learning_rate':

  0%|          | 0/251 [00:00<?, ?it/s]

{'eval_loss': 0.01121242344379425, 'eval_accuracy': 0.998003992015968, 'eval_runtime': 39.7749, 'eval_samples_per_second': 25.192, 'eval_steps_per_second': 6.311, 'epoch': 3.0}
{'train_runtime': 565.3605, 'train_samples_per_second': 5.317, 'train_steps_per_second': 1.332, 'train_loss': 0.14054683096809523, 'epoch': 3.0}


In [113]:
y_pred =  model_trained.predict(small_eval_dataset)

label_mapping = {'Negatief': 0, 'Neutraal': 1, 'Positief': 2}

mapped_labels = [label_mapping[label] for label in small_eval_dataset['Sentiment']]

  0%|          | 0/251 [00:00<?, ?it/s]

In [None]:
small_train_dataset, small_eval_dataset

In [121]:
from sklearn.metrics import f1_score, balanced_accuracy_score

# Calculate AUC
f1 = f1_score(mapped_labels, np.argmax(y_pred.predictions, axis=1), average='micro')

print(f1)
accuracy_ = balanced_accuracy_score(mapped_labels, np.argmax(y_pred.predictions, axis=1))
print(accuracy_)

0.9910179640718563
0.9910179640718564


In [None]:
# Tokenize datasets
small_train_dataset = dataset_train.map(tokenize_function, batched=True)
small_eval_dataset = dataset_test.map(tokenize_function, batched=True)

# Apply label mapping
small_train_dataset = small_train_dataset.map(map_labels, batched=True)
small_eval_dataset = small_eval_dataset.map(map_labels, batched=True)

In [24]:
def sentiment_labels(text, model):
    encoded_input = tokenizer(text, padding=True,truncation=True,max_length=512, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    return config.id2label[ranking[0]]


In [None]:
from sklearn.metrics import roc_auc_score, log_loss, matthews_corrcoef, cohen_kappa_score, mean_absolute_error, mean_squared_error, r2_score

# Assuming y_true and y_pred are already defined

# Classification metrics
roc_auc = roc_auc_score(y_true, y_pred)
log_loss_value = log_loss(y_true, y_pred)
mcc = matthews_corrcoef(y_true, y_pred)
cohen_kappa = cohen_kappa_score(y_true, y_pred)

# Regression metrics (assuming y_true and y_pred are continuous values)
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)

# Print metrics
print(f'ROC-AUC Score: {roc_auc:.2f}')
print(f'Log Loss: {log_loss_value:.2f}')
print(f'Matthews Correlation Coefficient: {mcc:.2f}')
print(f'Cohen\'s Kappa: {cohen_kappa:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'Mean Squared Error: {mse:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'R-squared: {r2:.2f}')


In [23]:
# sentences['prediction_KU'] = sentences['Sentence'].apply(sentiment_labels, model=model_trained.model)

In [None]:
sentences['prediction_KU'] = sentences['prediction_KU'].replace("Positive", "Positief")
sentences['prediction_KU'] = sentences['prediction_KU'].replace("Negative", "Negatief")