**Bert model op basis van Bert van Universiteit van Groningen. Context handling moet nog aangepast worden. Oversampling a.d.h.v de mediaan. dynamisch treshhold zoeken voor unknown.**

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ 1. Load & Preprocess Data
script_dir = os.getcwd() # Ga één map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")

# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data.xlsx")
df = pd.read_excel(file_path)

In [3]:
# Drop unnecessary columns
if "TXT_file_name" in df.columns:
    df = df.drop(columns=["TXT_file_name"])

# Handle missing values
df = df.dropna(subset=["question"])
df["context"].fillna("", inplace=True)

# Clean text
def clean_text(text):
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\b[a-z]\)\s+', ' ', text)
    text = re.sub(r'\b\d+\.\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["clean_text"] = (df["context"] + " " + df["question"]).apply(clean_text)

# ✅ Now: drop rare themes using original theme names
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= 2].index
df = df[df["theme"].isin(valid_themes)]

# ✅ Recompute label encoding AFTER filtering
unique_themes = list(df["theme"].unique())
theme_to_id = {theme: idx for idx, theme in enumerate(unique_themes)}
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}
df["theme_id"] = df["theme"].map(theme_to_id)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["context"].fillna("", inplace=True)


In [4]:
print("All theme_ids:", sorted(df["theme_id"].unique()))
print("num_labels:", df["theme_id"].nunique())

All theme_ids: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36)]
num_labels: 37


In [5]:
# ✅ 5. Split Data into Train & Test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["clean_text"].tolist(), df["theme_id"].tolist(), test_size=0.2, random_state=42, stratify=df["theme_id"]
)

In [6]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Create a small DataFrame from train lists
train_df = pd.DataFrame({
    "clean_text": train_texts,
    "theme_id": train_labels
})

# Compute class counts and use median as balancing target
theme_counts = train_df["theme_id"].value_counts()
median_count = theme_counts.median()

# Define strategy: only oversample underrepresented classes
sampling_strategy = {
    theme: int(median_count)
    for theme in theme_counts.index
    if theme_counts[theme] < median_count
}

# Apply RandomOverSampler
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_df[["clean_text"]], train_df["theme_id"])

# Extract oversampled train lists
train_texts_resampled = X_resampled["clean_text"].tolist()
train_labels_resampled = y_resampled.tolist()

from collections import Counter
print("Class distribution after oversampling:", Counter(train_labels_resampled))


Class distribution after oversampling: Counter({15: 5147, 4: 1763, 9: 1699, 26: 1216, 12: 1124, 6: 949, 21: 860, 0: 855, 20: 736, 16: 641, 28: 622, 14: 612, 8: 553, 1: 527, 17: 433, 23: 332, 22: 330, 11: 323, 31: 253, 27: 253, 25: 253, 5: 253, 24: 253, 2: 253, 19: 253, 29: 253, 18: 253, 13: 253, 7: 253, 3: 253, 32: 253, 10: 253, 30: 253, 33: 253, 34: 253, 35: 253, 36: 253})


In [7]:
# ✅ 7. Load BERT Tokenizer & Define Dataset Class
from transformers import RobertaTokenizer, RobertaForSequenceClassification

model_name = "pdelobelle/robbert-v2-dutch-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

class ThemeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"
        )
        encoding["labels"] = torch.tensor(self.labels[idx])
        return {key: val.squeeze(0) for key, val in encoding.items()}

train_dataset = ThemeDataset(train_texts_resampled, train_labels_resampled, tokenizer)
test_dataset = ThemeDataset(test_texts, test_labels, tokenizer)

In [8]:
# ✅ 8. Load BERT Model for Classification
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=df["theme_id"].nunique())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pdelobelle/robbert-v2-dutch-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# ✅ 9. Define Training Arguments (With Early Stopping)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",               # 👈 No auto-checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,  
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",  # 🔥 Log only once per epoch
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

# ✅ 10. Define Metrics for Evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=1)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ✅ 11. Train Model with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if no improvement for 2 epochs
)

trainer.train()

 12%|█▎        | 2942/23536 [10:14<1:03:20,  5.42it/s]

{'loss': 1.3705, 'grad_norm': 27.96900177001953, 'learning_rate': 1.7501699524133243e-05, 'epoch': 1.0}


                                                      
 12%|█▎        | 2942/23536 [10:44<1:03:20,  5.42it/s]

{'eval_loss': 0.6914447546005249, 'eval_accuracy': 0.8224706328154604, 'eval_precision': 0.8289999745184685, 'eval_recall': 0.8224706328154604, 'eval_f1': 0.8205365516062215, 'eval_runtime': 30.4013, 'eval_samples_per_second': 173.611, 'eval_steps_per_second': 21.71, 'epoch': 1.0}


 25%|██▌       | 5884/23536 [20:58<1:01:19,  4.80it/s] 

{'loss': 0.5069, 'grad_norm': 0.08746741712093353, 'learning_rate': 1.5003399048266487e-05, 'epoch': 2.0}


                                                      
 25%|██▌       | 5884/23536 [21:29<1:01:19,  4.80it/s]

{'eval_loss': 0.5102088451385498, 'eval_accuracy': 0.874384236453202, 'eval_precision': 0.8781140364403, 'eval_recall': 0.874384236453202, 'eval_f1': 0.8734061409265582, 'eval_runtime': 30.5324, 'eval_samples_per_second': 172.865, 'eval_steps_per_second': 21.616, 'epoch': 2.0}


 38%|███▊      | 8826/23536 [31:43<50:54,  4.82it/s]   

{'loss': 0.3108, 'grad_norm': 0.19137337803840637, 'learning_rate': 1.2504248810333108e-05, 'epoch': 3.0}


                                                    
 38%|███▊      | 8826/23536 [32:14<50:54,  4.82it/s]

{'eval_loss': 0.4784555733203888, 'eval_accuracy': 0.8929518757104964, 'eval_precision': 0.906892289157979, 'eval_recall': 0.8929518757104964, 'eval_f1': 0.8960894191142803, 'eval_runtime': 31.0175, 'eval_samples_per_second': 170.162, 'eval_steps_per_second': 21.278, 'epoch': 3.0}


 50%|█████     | 11768/23536 [42:29<40:56,  4.79it/s]  

{'loss': 0.2333, 'grad_norm': 0.09257301688194275, 'learning_rate': 1.0005948334466351e-05, 'epoch': 4.0}


                                                     
 50%|█████     | 11768/23536 [42:59<40:56,  4.79it/s]

{'eval_loss': 0.49022215604782104, 'eval_accuracy': 0.8961727927245169, 'eval_precision': 0.9005954733748793, 'eval_recall': 0.8961727927245169, 'eval_f1': 0.8960902796164975, 'eval_runtime': 30.5431, 'eval_samples_per_second': 172.805, 'eval_steps_per_second': 21.609, 'epoch': 4.0}


 62%|██████▎   | 14710/23536 [53:19<30:59,  4.75it/s]   

{'loss': 0.193, 'grad_norm': 1.207539677619934, 'learning_rate': 7.5067980965329715e-06, 'epoch': 5.0}


                                                     
 62%|██████▎   | 14710/23536 [53:50<30:59,  4.75it/s]

{'eval_loss': 0.4673379957675934, 'eval_accuracy': 0.9018567639257294, 'eval_precision': 0.9066195362086161, 'eval_recall': 0.9018567639257294, 'eval_f1': 0.9015201575955143, 'eval_runtime': 30.6541, 'eval_samples_per_second': 172.179, 'eval_steps_per_second': 21.531, 'epoch': 5.0}


 75%|███████▌  | 17652/23536 [1:04:04<20:27,  4.79it/s] 

{'loss': 0.1707, 'grad_norm': 0.005223019514232874, 'learning_rate': 5.008497620666214e-06, 'epoch': 6.0}


                                                       
 75%|███████▌  | 17652/23536 [1:04:35<20:27,  4.79it/s]

{'eval_loss': 0.4676166772842407, 'eval_accuracy': 0.9028040924592649, 'eval_precision': 0.9084605086690636, 'eval_recall': 0.9028040924592649, 'eval_f1': 0.9024738326754054, 'eval_runtime': 30.5466, 'eval_samples_per_second': 172.785, 'eval_steps_per_second': 21.606, 'epoch': 6.0}


 88%|████████▊ | 20594/23536 [1:14:54<10:13,  4.79it/s]   

{'loss': 0.1602, 'grad_norm': 0.0035886585246771574, 'learning_rate': 2.508497620666214e-06, 'epoch': 7.0}


                                                       
 88%|████████▊ | 20594/23536 [1:15:25<10:13,  4.79it/s]

{'eval_loss': 0.46937158703804016, 'eval_accuracy': 0.9056460780598712, 'eval_precision': 0.9109848507042728, 'eval_recall': 0.9056460780598712, 'eval_f1': 0.9052833515583832, 'eval_runtime': 30.4134, 'eval_samples_per_second': 173.542, 'eval_steps_per_second': 21.701, 'epoch': 7.0}


100%|██████████| 23536/23536 [1:25:45<00:00,  4.82it/s]  

{'loss': 0.1543, 'grad_norm': 0.013446301221847534, 'learning_rate': 9.347382732834808e-09, 'epoch': 8.0}


                                                       
100%|██████████| 23536/23536 [1:26:16<00:00,  4.82it/s]

{'eval_loss': 0.46714767813682556, 'eval_accuracy': 0.907540735126942, 'eval_precision': 0.919305073401988, 'eval_recall': 0.907540735126942, 'eval_f1': 0.9100891098476592, 'eval_runtime': 30.5636, 'eval_samples_per_second': 172.689, 'eval_steps_per_second': 21.594, 'epoch': 8.0}


100%|██████████| 23536/23536 [1:26:18<00:00,  4.55it/s]

{'train_runtime': 5178.0435, 'train_samples_per_second': 36.352, 'train_steps_per_second': 4.545, 'train_loss': 0.3874780225721368, 'epoch': 8.0}





TrainOutput(global_step=23536, training_loss=0.3874780225721368, metrics={'train_runtime': 5178.0435, 'train_samples_per_second': 36.352, 'train_steps_per_second': 4.545, 'total_flos': 4.954148373639168e+16, 'train_loss': 0.3874780225721368, 'epoch': 8.0})

SAVE MODEL NAAR KUL DRIVE 

In [10]:
from datetime import datetime

run_id = datetime.now().strftime("%Y-%m-%d_%H-%M")
save_path = f"C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/RobBERT/Run_{run_id}"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/RobBERT/Run_2025-04-02_20-44\\tokenizer_config.json',
 'C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/RobBERT/Run_2025-04-02_20-44\\special_tokens_map.json',
 'C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/RobBERT/Run_2025-04-02_20-44\\vocab.json',
 'C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/RobBERT/Run_2025-04-02_20-44\\merges.txt',
 'C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/RobBERT/Run_2025-04-02_20-44\\added_tokens.json')