**Bert model op basis van Bert van Universiteit van Groningen. Context handling moet nog aangepast worden. Oversampling a.d.h.v de mediaan. dynamisch treshhold zoeken voor unknown.**

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ 1. Load & Preprocess Data
script_dir = os.path.dirname(os.getcwd()) # Ga één map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")


# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data_cleaned.xlsx")
df = pd.read_excel(file_path)

In [3]:
# Drop unnecessary columns
if "TXT_file_name" in df.columns:
    df = df.drop(columns=["TXT_file_name"])

# Handle missing values
df = df.dropna(subset=["question"])
df["context"].fillna("", inplace=True)

# Clean text
def clean_text(text):
    text = re.sub(r'\n', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'\b[a-z]\)\s*', ' ', text)  # Remove patterns like 'a)', 'b)', etc., with optional spaces
    text = re.sub(r'\b\d+\.\b', '', text)  # Remove patterns like '1.', '2.', etc.
    text = re.sub(r'\b\d+\)\b', '', text)  # Remove patterns like '1)', '2)', etc.
    text = re.sub(r'\b[i]+[.)]\b', '', text, flags=re.IGNORECASE)  # Remove patterns like 'i.', 'ii.', 'i)', etc.
    text = re.sub(r'\b\d+[.)]\s*', '', text) # Remove numeric list markers like 1., 2. or 1) 2)
    text = re.sub(r'\b[ivxlcdm]+\s*[.)]\s*', '', text, flags=re.IGNORECASE)# Remove roman numerals like i. ii. iii. or i) ii) iii)
    text = re.sub(r'•', '', text)  # Remove bullet symbol
    text = re.sub(r'\[\d+\]', '', text)  # Remove patterns like '[1]', '[2]', etc.
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and trim

    return text
# df["clean_text"] = (df["context"] + " " + df["question"]).apply(clean_text)
df["clean_text"] = (df["question"]).apply(clean_text) 

# Group by 'clean_text' and count unique themes
duplicates_with_diff_themes = df.groupby("clean_text")["theme"].nunique().reset_index()

# Filter rows where the number of unique themes is greater than 1
duplicates_with_diff_themes = duplicates_with_diff_themes[duplicates_with_diff_themes["theme"] > 1]

# Merge back with the original dataframe to get all rows with these 'clean_text'
filtered_df = df[df["clean_text"].isin(duplicates_with_diff_themes["clean_text"])]
# Exclude rows with these 'clean_text' from the original dataframe
df = df[~df["clean_text"].isin(duplicates_with_diff_themes["clean_text"])]


# Thema-cluster mapping
theme_merge_map = {
    # Bestuur en Beleid
    "Lokale overheden en Binnenlands bestuur": "Bestuur en Beleid",
    "Vlaamse administratie": "Bestuur en Beleid",
    "Staatshervorming en Verhoudingen binnen de Belgische federale staat": "Bestuur en Beleid",
    "Vlaamse Regering": "Bestuur en Beleid",
    "Vlaams Parlement": "Bestuur en Beleid",

    # Mobiliteit en Infrastructuur
    "Mobiliteit en Verkeer": "Mobiliteit en Infrastructuur",
    "Openbare werken": "Mobiliteit en Infrastructuur",
    "Ruimtelijke ordening": "Mobiliteit en Infrastructuur",

    # Economie en Arbeid
    "Werk": "Economie en Arbeid",
    "Economie": "Economie en Arbeid",
    "Sociale economie": "Economie en Arbeid",
    "Internationaal ondernemen": "Economie en Arbeid",

    # Welzijn en Gezondheid
    "Welzijn en Gezin": "Welzijn en Gezondheid",
    "Gezondheid": "Welzijn en Gezondheid",
    "Armoedebeleid": "Welzijn en Gezondheid",

    # Cultuur en Communicatie
    "Cultuur": "Cultuur en Communicatie",
    "Media": "Cultuur en Communicatie",
    "Taalgebruik": "Cultuur en Communicatie",

    # Onderwijs en Samenleving
    "Onderwijs en Vorming": "Onderwijs en Samenleving",
    "Gelijke kansen": "Onderwijs en Samenleving",
    "Jeugdbeleid": "Onderwijs en Samenleving",
    "Integratie en Inburgering": "Onderwijs en Samenleving",

    # Milieu en Landbouw
    "Natuur en Milieu": "Milieu en Landbouw",
    "Landbouw": "Milieu en Landbouw",
    "Dierenwelzijn": "Milieu en Landbouw",

    # Internationaal Beleid
    "Buitenlands beleid": "Internationaal Beleid",
    "Europese instellingen": "Internationaal Beleid",
    "Ontwikkelingssamenwerking": "Internationaal Beleid",
    "Oekraïnecrisis": "Internationaal Beleid",

    # Overige (apart laten tenzij weinig samples)
    "Financiën": "Financiën",
    "Begroting": "Begroting",
    "Wetenschap en Innovatie": "Wetenschap en Innovatie",
    "Toerisme": "Toerisme",
    "Justitie en Handhaving": "Justitie en Handhaving",
    "Brussel en de Vlaamse Rand": "Brussel en de Vlaamse Rand",
    "Sport": "Sport",
    "Onroerend erfgoed": "Onroerend erfgoed",
    "Energie": "Energie",
    "Wonen": "Wonen",
}

# Nieuwe kolom aanmaken met samengevoegde thema's
df["theme"] = df["theme"].map(theme_merge_map).fillna("Onbekend")

# ✅ Now: drop rare themes using original theme names
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= 2].index
df = df[df["theme"].isin(valid_themes)]

# ✅ Recompute label encoding AFTER filtering
unique_themes = list(df["theme"].unique())
theme_to_id = {theme: idx for idx, theme in enumerate(unique_themes)}
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}
df["theme_id"] = df["theme"].map(theme_to_id)

#amount of rows 
print(f"Number of rows after filtering: {len(df)}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["context"].fillna("", inplace=True)


Number of rows after filtering: 92502


In [4]:
# Remove rows where the 'question' column has 9 or fewer words
df = df[df['clean_text'].apply(lambda x: len(str(x).split()) > 9)]

# Remove rows where the 'question' column contains the specific phrase (with flexible matching)
df = df[~df['clean_text'].str.contains(r'\bKan de minister een overzicht geven\b', flags=re.IGNORECASE, na=False)]

# Remove rows where 'clean_text' contains "https:"
df = df[~df['clean_text'].str.contains(r'https:', flags=re.IGNORECASE, na=False)]

In [5]:
print("All theme_ids:", sorted(df["theme_id"].unique()))
print("num_labels:", df["theme_id"].nunique())

All theme_ids: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17)]
num_labels: 18


In [6]:
print(len(df))

64571


In [7]:
df.to_excel("Brah.xlsx", index=False)


In [10]:
# ✅ 5. Split Data into Train & Test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["clean_text"].tolist(), df["theme_id"].tolist(), test_size=0.2, random_state=42, stratify=df["theme_id"]
)

In [11]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Create a small DataFrame from train lists
train_df = pd.DataFrame({
    "clean_text": train_texts,
    "theme_id": train_labels
})

# Compute class counts and use median as balancing target
theme_counts = train_df["theme_id"].value_counts()
median_count = theme_counts.median()

# Define strategy: only oversample underrepresented classes
sampling_strategy = {
    theme: int(median_count)
    for theme in theme_counts.index
    if theme_counts[theme] < median_count
}

# Apply RandomOverSampler
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_df[["clean_text"]], train_df["theme_id"])

# Extract oversampled train lists
train_texts_resampled = X_resampled["clean_text"].tolist()
train_labels_resampled = y_resampled.tolist()

from collections import Counter
print("Class distribution after oversampling:", Counter(train_labels_resampled))


Class distribution after oversampling: Counter({9: 13730, 2: 8024, 12: 6169, 10: 5313, 4: 4156, 1: 3601, 6: 3314, 8: 2467, 5: 1793, 13: 1793, 0: 1793, 11: 1793, 3: 1793, 7: 1793, 14: 1793, 15: 1793})


In [12]:
# ✅ 7. Load BERT Tokenizer & Define Dataset Class
model_name = "GroNLP/bert-base-dutch-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

class ThemeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"
        )
        encoding["labels"] = torch.tensor(self.labels[idx])
        return {key: val.squeeze(0) for key, val in encoding.items()}

train_dataset = ThemeDataset(train_texts_resampled, train_labels_resampled, tokenizer)
test_dataset = ThemeDataset(test_texts, test_labels, tokenizer)

In [13]:
# ✅ 8. Load BERT Model for Classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=df["theme_id"].nunique())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# ✅ 9. Define Training Arguments (With Early Stopping)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                 # 👈 Keep only the last checkpoint
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,  
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",  # 🔥 Log only once per epoch
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

# ✅ 10. Define Metrics for Evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=1)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ✅ 11. Train Model with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if no improvement for 2 epochs
)

trainer.train()

 12%|█▎        | 7640/61120 [26:05<2:51:08,  5.21it/s]

{'loss': 1.1798, 'grad_norm': 15.804769515991211, 'learning_rate': 1.750163612565445e-05, 'epoch': 1.0}


                                                      
 12%|█▎        | 7640/61120 [27:22<2:51:08,  5.21it/s]

{'eval_loss': 0.9586341381072998, 'eval_accuracy': 0.7127371273712737, 'eval_precision': 0.7288897047427446, 'eval_recall': 0.7127371273712737, 'eval_f1': 0.7141815177133344, 'eval_runtime': 76.9497, 'eval_samples_per_second': 167.837, 'eval_steps_per_second': 20.988, 'epoch': 1.0}


 25%|██▌       | 15280/61120 [53:36<2:28:24,  5.15it/s] 

{'loss': 0.5841, 'grad_norm': 80.64071655273438, 'learning_rate': 1.5002617801047121e-05, 'epoch': 2.0}


                                                       
 25%|██▌       | 15280/61120 [54:52<2:28:24,  5.15it/s]

{'eval_loss': 0.906536340713501, 'eval_accuracy': 0.7523809523809524, 'eval_precision': 0.7522041870757319, 'eval_recall': 0.7523809523809524, 'eval_f1': 0.7508831605748869, 'eval_runtime': 76.3132, 'eval_samples_per_second': 169.237, 'eval_steps_per_second': 21.163, 'epoch': 2.0}


 38%|███▊      | 22920/61120 [1:25:06<2:02:17,  5.21it/s]

{'loss': 0.3595, 'grad_norm': 73.30034637451172, 'learning_rate': 1.2503599476439792e-05, 'epoch': 3.0}


                                                         
 38%|███▊      | 22920/61120 [1:26:22<2:02:17,  5.21it/s]

{'eval_loss': 1.0884649753570557, 'eval_accuracy': 0.7670925280681378, 'eval_precision': 0.7704428236342079, 'eval_recall': 0.7670925280681378, 'eval_f1': 0.7665732677815865, 'eval_runtime': 76.2526, 'eval_samples_per_second': 169.371, 'eval_steps_per_second': 21.18, 'epoch': 3.0}


 50%|█████     | 30560/61120 [1:54:01<1:40:46,  5.05it/s]  

{'loss': 0.2236, 'grad_norm': 58.785804748535156, 'learning_rate': 1.0004908376963351e-05, 'epoch': 4.0}


                                                         
 50%|█████     | 30560/61120 [1:55:19<1:40:46,  5.05it/s]

{'eval_loss': 1.3289031982421875, 'eval_accuracy': 0.7711962833914053, 'eval_precision': 0.7753983831270107, 'eval_recall': 0.7711962833914053, 'eval_f1': 0.7704356423349179, 'eval_runtime': 78.39, 'eval_samples_per_second': 164.753, 'eval_steps_per_second': 20.602, 'epoch': 4.0}


 62%|██████▎   | 38200/61120 [2:21:42<1:12:07,  5.30it/s]  

{'loss': 0.1338, 'grad_norm': 0.0011212157551199198, 'learning_rate': 7.505562827225131e-06, 'epoch': 5.0}


                                                         
 62%|██████▎   | 38200/61120 [2:22:58<1:12:07,  5.30it/s]

{'eval_loss': 1.654502034187317, 'eval_accuracy': 0.7774680603948897, 'eval_precision': 0.7762357489916658, 'eval_recall': 0.7774680603948897, 'eval_f1': 0.7756522603690519, 'eval_runtime': 75.8463, 'eval_samples_per_second': 170.279, 'eval_steps_per_second': 21.293, 'epoch': 5.0}


 75%|███████▌  | 45840/61120 [2:50:45<56:32,  4.50it/s]    

{'loss': 0.0784, 'grad_norm': 0.3927507996559143, 'learning_rate': 5.006544502617801e-06, 'epoch': 6.0}


                                                       
 75%|███████▌  | 45840/61120 [2:52:34<56:32,  4.50it/s]

{'eval_loss': 1.8001006841659546, 'eval_accuracy': 0.7837398373983739, 'eval_precision': 0.7836500928631909, 'eval_recall': 0.7837398373983739, 'eval_f1': 0.7827071850407704, 'eval_runtime': 109.6732, 'eval_samples_per_second': 117.759, 'eval_steps_per_second': 14.726, 'epoch': 6.0}


 88%|████████▊ | 53480/61120 [3:19:04<24:22,  5.23it/s]    

{'loss': 0.043, 'grad_norm': 0.021843956783413887, 'learning_rate': 2.5078534031413615e-06, 'epoch': 7.0}


                                                       
 88%|████████▊ | 53480/61120 [3:20:22<24:22,  5.23it/s]

{'eval_loss': 1.8927680253982544, 'eval_accuracy': 0.783662408052652, 'eval_precision': 0.7826215428646138, 'eval_recall': 0.783662408052652, 'eval_f1': 0.7824796600461998, 'eval_runtime': 77.0726, 'eval_samples_per_second': 167.569, 'eval_steps_per_second': 20.954, 'epoch': 7.0}


100%|██████████| 61120/61120 [3:46:11<00:00,  5.34it/s]   

{'loss': 0.0199, 'grad_norm': 0.0019660000689327717, 'learning_rate': 8.507853403141361e-09, 'epoch': 8.0}


                                                       
100%|██████████| 61120/61120 [3:47:28<00:00,  5.34it/s]

{'eval_loss': 1.9375015497207642, 'eval_accuracy': 0.7841269841269841, 'eval_precision': 0.7829741275779198, 'eval_recall': 0.7841269841269841, 'eval_f1': 0.7828152723040626, 'eval_runtime': 77.4404, 'eval_samples_per_second': 166.773, 'eval_steps_per_second': 20.855, 'epoch': 8.0}


100%|██████████| 61120/61120 [3:47:30<00:00,  4.48it/s]

{'train_runtime': 13650.3679, 'train_samples_per_second': 35.819, 'train_steps_per_second': 4.478, 'train_loss': 0.3277642851724675, 'epoch': 8.0}





TrainOutput(global_step=61120, training_loss=0.3277642851724675, metrics={'train_runtime': 13650.3679, 'train_samples_per_second': 35.819, 'train_steps_per_second': 4.478, 'total_flos': 1.2866274277117133e+17, 'train_loss': 0.3277642851724675, 'epoch': 8.0})

SAVE MODEL TO KUL DRIVE

In [15]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
import json


# === Create timestamped save path in OneDrive ===
run_id = datetime.now().strftime("%Y-%m-%d_%H-%M")
base_path = "C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/GroNLP"
save_path = os.path.join(base_path, f"Run_{run_id}")
os.makedirs(save_path, exist_ok=True)

# === Save model using Trainer ===
trainer.save_model(save_path)

# === Get predictions ===
preds_output = trainer.predict(test_dataset)
logits = preds_output.predictions
predictions = np.argmax(logits, axis=1)

# === Save predictions to CSV ===
output_df = pd.DataFrame({
    "text": test_texts,  # make sure test_texts is defined
    "true_label": test_labels,  # make sure test_labels is defined
    "predicted_label": predictions,
    "logits": logits.tolist()
})

csv_path = os.path.join(save_path, "test_predictions.csv")
output_df.to_csv(csv_path, index=False)

# === Optional: Save a description file ===
description = """
Model: GroNLP BERT-based Dutch Cased
Training Details:
!!!
geen context meegegeven in zowel train als test
vragen langer dan 9 woorden wordden behouden
geen vragen met 'Kan de minister een overzicht geven' in train of test  
geen vragen met 'https:' in train of test
!!!

Dataset: Grote_data_cleaned.xlsx
Number of training samples: {train_samples}
Number of test samples: {test_samples}
Number of labels: {num_labels}
Label mapping: {label_mapping}


"""

desc_path = os.path.join(save_path, "model_description.txt")
with open(desc_path, "w", encoding="utf-8") as f:
    f.write(description)

# === Save label mappings ===
mappings_path = os.path.join(save_path, "label_mappings.json")
with open(mappings_path, "w", encoding="utf-8") as f:
    json.dump({
        "theme_to_id": theme_to_id,
        "id_to_theme": {str(k): v for k, v in id_to_theme.items()}  # keys must be str for JSON
    }, f, ensure_ascii=False, indent=4)


print(f"Everything saved in: {save_path}")


100%|██████████| 1615/1615 [01:16<00:00, 21.14it/s]


Everything saved in: C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/GroNLP\Run_2025-04-10_21-40
