**BERT-model op basis van BERT van de Universiteit van Groningen (let op: de code hieronder laadt RobBERT, `pdelobelle/robbert-v2-dutch-base`). Contexthandling moet nog aangepast worden. Oversampling a.d.h.v. de mediaan. Dynamisch een threshold zoeken voor unknown.**

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 1. Load & preprocess data
# The project root sits two directory levels above the current working
# directory; the dataset is stored in its "Data" sub-folder.
script_dir = os.path.split(os.getcwd())[0]
project_root = os.path.split(script_dir)[0]
data_folder = os.path.join(project_root, "Data")

# Read the cleaned dataset from the Excel workbook.
file_path = os.path.join(data_folder, "Grote_data_cleaned.xlsx")
df = pd.read_excel(file_path)

In [4]:
# Build the theme <-> integer id lookup tables from the themes present in df.
# Ids follow the order in which themes first appear in the "theme" column.
unique_themes = list(df["theme"].unique())
id_to_theme = dict(enumerate(unique_themes))
theme_to_id = {theme: idx for idx, theme in id_to_theme.items()}

# Attach the integer id of every row's theme as a new column.
df["theme_id"] = df["theme"].map(theme_to_id)

In [5]:
# 5. Split data into train & test (80/20, stratified on the theme id)
all_texts = df["clean_text"].tolist()
all_labels = df["theme_id"].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df["theme_id"],
)

In [6]:
from collections import Counter

import pandas as pd
from imblearn.over_sampling import RandomOverSampler

# Wrap the training split in a DataFrame so the sampler receives a 2-D feature table.
train_df = pd.DataFrame({"clean_text": train_texts, "theme_id": train_labels})

# Per-class sample counts; their median is the target size for the small classes.
theme_counts = train_df["theme_id"].value_counts()
median_count = theme_counts.median()

# Only classes below the median get a target (the median, truncated to int);
# classes at or above the median are not listed and keep their size.
below_median = theme_counts[theme_counts < median_count]
sampling_strategy = {theme: int(median_count) for theme in below_median.index}

# Randomly oversample the listed classes up to their target count.
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_df[["clean_text"]], train_df["theme_id"])

# Back to plain Python lists for the dataset wrapper.
train_texts_resampled = X_resampled["clean_text"].tolist()
train_labels_resampled = y_resampled.tolist()

print("Class distribution after oversampling:", Counter(train_labels_resampled))


Class distribution after oversampling: Counter({15: 4896, 4: 1530, 9: 1463, 26: 1071, 12: 977, 6: 793, 21: 745, 20: 638, 0: 601, 16: 578, 28: 564, 14: 547, 8: 455, 1: 402, 17: 385, 11: 319, 22: 282, 23: 262, 10: 233, 2: 233, 25: 233, 24: 233, 7: 233, 5: 233, 31: 233, 19: 233, 13: 233, 3: 233, 35: 233, 18: 233, 36: 233, 32: 233, 33: 233, 34: 233, 30: 233, 27: 233, 29: 233})


In [7]:
# 7. Load the RobBERT tokenizer (the dataset class is defined right below)
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Hub id of the pretrained Dutch checkpoint; reused when the model is loaded.
model_name = "pdelobelle/robbert-v2-dutch-base"
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

class ThemeDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and attaches its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``.
        max_length: Length passed to the tokenizer for padding/truncation
            (default 512).
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of items, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs plus a ``labels`` tensor for item ``idx``.

        ``squeeze(0)`` removes a leading size-1 dimension from every value
        (presumably the batch axis added by ``return_tensors="pt"``).
        """
        tokenized = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        sample = {name: tensor.squeeze(0) for name, tensor in tokenized.items()}
        # The label goes through the same squeeze as the tokenizer outputs.
        sample["labels"] = torch.tensor(self.labels[idx]).squeeze(0)
        return sample

# Training uses the oversampled lists; the test split is wrapped as-is.
train_dataset = ThemeDataset(
    texts=train_texts_resampled,
    labels=train_labels_resampled,
    tokenizer=tokenizer,
)
test_dataset = ThemeDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer)

In [8]:
# 8. Load RobBERT for sequence classification
# The number of output labels equals the number of distinct theme ids in df.
num_labels = df["theme_id"].nunique()
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# 9. Training arguments (evaluate and checkpoint once per epoch so the best
#    epoch, judged by "f1", can be restored when training ends)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # A checkpoint IS written every epoch: load_best_model_at_end needs the
    # save strategy to match the evaluation strategy.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",  # log only once per epoch
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # this cell raise on CPU-only machines, so fall back to fp32 there.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# 10. Metrics reported at every evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    # zero_division=0: a class that is never predicted has undefined precision;
    # scoring it as 0 instead of 1 stops such classes from inflating the
    # weighted precision.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# 11. Train the model with early stopping
# Training halts once the monitored metric has not improved for 2 evaluations.
# NOTE(review): eval_dataset is the test split, so the same data drives early
# stopping / best-model selection and the reported scores; consider holding out
# a separate validation split for model selection.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[early_stopping],
)

trainer.train()

 12%|█▎        | 2617/20936 [09:40<1:05:28,  4.66it/s]

{'loss': 2.231, 'grad_norm': 35.04418182373047, 'learning_rate': 1.75019105846389e-05, 'epoch': 1.0}


                                                      
 12%|█▎        | 2617/20936 [10:13<1:05:28,  4.66it/s]

{'eval_loss': 1.595786452293396, 'eval_accuracy': 0.5943436960276338, 'eval_precision': 0.6330498067237723, 'eval_recall': 0.5943436960276338, 'eval_f1': 0.5749841548601781, 'eval_runtime': 32.2973, 'eval_samples_per_second': 143.417, 'eval_steps_per_second': 17.927, 'epoch': 1.0}


 25%|██▌       | 5234/20936 [19:20<56:19,  4.65it/s]   

{'loss': 1.3373, 'grad_norm': 20.191516876220703, 'learning_rate': 1.500286587695835e-05, 'epoch': 2.0}


                                                    
 25%|██▌       | 5234/20936 [19:53<56:19,  4.65it/s]

{'eval_loss': 1.3730727434158325, 'eval_accuracy': 0.6509067357512953, 'eval_precision': 0.6674996849920133, 'eval_recall': 0.6509067357512953, 'eval_f1': 0.6406699330395074, 'eval_runtime': 32.9195, 'eval_samples_per_second': 140.707, 'eval_steps_per_second': 17.588, 'epoch': 2.0}


 38%|███▊      | 7851/20936 [29:38<47:00,  4.64it/s]   

{'loss': 0.9076, 'grad_norm': 22.898374557495117, 'learning_rate': 1.2503821169277801e-05, 'epoch': 3.0}


                                                    
 38%|███▊      | 7851/20936 [30:10<47:00,  4.64it/s]

{'eval_loss': 1.2874277830123901, 'eval_accuracy': 0.6778929188255614, 'eval_precision': 0.6817912265277261, 'eval_recall': 0.6778929188255614, 'eval_f1': 0.67217350193918, 'eval_runtime': 32.4354, 'eval_samples_per_second': 142.807, 'eval_steps_per_second': 17.851, 'epoch': 3.0}


 50%|█████     | 10468/20936 [39:59<38:20,  4.55it/s]  

{'loss': 0.6299, 'grad_norm': 21.153905868530273, 'learning_rate': 1.000477646159725e-05, 'epoch': 4.0}


                                                     
 50%|█████     | 10468/20936 [40:32<38:20,  4.55it/s]

{'eval_loss': 1.34096097946167, 'eval_accuracy': 0.6988341968911918, 'eval_precision': 0.700056736619114, 'eval_recall': 0.6988341968911918, 'eval_f1': 0.6943506939711058, 'eval_runtime': 32.6837, 'eval_samples_per_second': 141.722, 'eval_steps_per_second': 17.715, 'epoch': 4.0}


 62%|██████▎   | 13085/20936 [50:18<28:19,  4.62it/s]   

{'loss': 0.4409, 'grad_norm': 4.453324317932129, 'learning_rate': 7.506687046236149e-06, 'epoch': 5.0}


                                                     
 62%|██████▎   | 13085/20936 [50:51<28:19,  4.62it/s]

{'eval_loss': 1.4313913583755493, 'eval_accuracy': 0.7018566493955095, 'eval_precision': 0.7019848561361614, 'eval_recall': 0.7018566493955095, 'eval_f1': 0.6980608437477588, 'eval_runtime': 32.382, 'eval_samples_per_second': 143.042, 'eval_steps_per_second': 17.88, 'epoch': 5.0}


 75%|███████▌  | 15702/20936 [1:00:37<18:32,  4.70it/s] 

{'loss': 0.3018, 'grad_norm': 45.43848419189453, 'learning_rate': 5.007642338555599e-06, 'epoch': 6.0}


                                                       
 75%|███████▌  | 15702/20936 [1:01:10<18:32,  4.70it/s]

{'eval_loss': 1.5126323699951172, 'eval_accuracy': 0.7040155440414507, 'eval_precision': 0.7089561859669035, 'eval_recall': 0.7040155440414507, 'eval_f1': 0.7018526690181262, 'eval_runtime': 32.9312, 'eval_samples_per_second': 140.657, 'eval_steps_per_second': 17.582, 'epoch': 6.0}


 88%|████████▊ | 18319/20936 [1:10:57<09:27,  4.61it/s]   

{'loss': 0.2163, 'grad_norm': 1.5928444862365723, 'learning_rate': 2.5085976308750482e-06, 'epoch': 7.0}


                                                       
 88%|████████▊ | 18319/20936 [1:11:30<09:27,  4.61it/s]

{'eval_loss': 1.5766900777816772, 'eval_accuracy': 0.7141623488773747, 'eval_precision': 0.714810284558547, 'eval_recall': 0.7141623488773747, 'eval_f1': 0.7118710292783692, 'eval_runtime': 32.5156, 'eval_samples_per_second': 142.455, 'eval_steps_per_second': 17.807, 'epoch': 7.0}


100%|██████████| 20936/20936 [1:21:19<00:00,  4.54it/s]  

{'loss': 0.1591, 'grad_norm': 1.389404535293579, 'learning_rate': 8.597630875047766e-09, 'epoch': 8.0}


                                                       
100%|██████████| 20936/20936 [1:21:52<00:00,  4.54it/s]

{'eval_loss': 1.6215602159500122, 'eval_accuracy': 0.7109240069084629, 'eval_precision': 0.7111552276242372, 'eval_recall': 0.7109240069084629, 'eval_f1': 0.7086257774210002, 'eval_runtime': 33.1053, 'eval_samples_per_second': 139.917, 'eval_steps_per_second': 17.49, 'epoch': 8.0}


100%|██████████| 20936/20936 [1:21:56<00:00,  4.26it/s]

{'train_runtime': 4916.7549, 'train_samples_per_second': 34.063, 'train_steps_per_second': 4.258, 'train_loss': 0.7780064242825419, 'epoch': 8.0}





TrainOutput(global_step=20936, training_loss=0.7780064242825419, metrics={'train_runtime': 4916.7549, 'train_samples_per_second': 34.063, 'train_steps_per_second': 4.258, 'total_flos': 4.40796872804352e+16, 'train_loss': 0.7780064242825419, 'epoch': 8.0})

Model opslaan naar de KU Leuven OneDrive

In [10]:
import json
import os
from datetime import datetime

# Timestamped run id (minute resolution) so each run gets its own folder.
run_id = datetime.now().strftime("%Y-%m-%d_%H-%M")

# Root folder for saved models. The default is the original location; set the
# ROBBERT_SAVE_ROOT environment variable to save somewhere else without
# editing the notebook (the default path only exists on one machine).
save_root = os.environ.get(
    "ROBBERT_SAVE_ROOT",
    "C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/RobBERT",
)
save_path = f"{save_root}/Run_{run_id}"

# Save the model weights/config and the tokenizer files side by side.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Save the id -> theme mapping next to the model. The ids depend on the order
# of themes in the loaded data, so without this file the predictions of the
# saved model cannot be translated back into theme names.
label_mapping_file = f"{save_path}/label_mapping.json"
with open(label_mapping_file, "w", encoding="utf-8") as f:
    json.dump(
        {int(idx): str(theme) for idx, theme in id_to_theme.items()},
        f,
        ensure_ascii=False,
        indent=2,
    )

# Add a description of the model and training details
description = """
Model: RobBERT
Training Details:
geen context meegegeven in zowel train als test
"""

# Save the description to a text file
description_file = f"{save_path}/model_description.txt"
with open(description_file, "w", encoding="utf-8") as f:
    f.write(description)