**Bert model op basis van Bert van Universiteit van Groningen. Context handling moet nog aangepast worden. Oversampling a.d.h.v de mediaan. dynamisch treshhold zoeken voor unknown.**

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ 1. Load & Preprocess Data
script_dir = os.getcwd() # Ga één map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")

# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data_cleaned.xlsx")
df = pd.read_excel(file_path)

In [3]:
# Drop unnecessary columns
if "TXT_file_name" in df.columns:
    df = df.drop(columns=["TXT_file_name"])

# Handle missing values
df = df.dropna(subset=["question"])
df["context"].fillna("", inplace=True)

# Clean text
def clean_text(text):
    text = re.sub(r'\n', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'\b[a-z]\)\s+', ' ', text)  # Remove patterns like 'a)', 'b)', etc.
    text = re.sub(r'\b\d+\.\b', '', text)  # Remove patterns like '1.', '2.', etc.
    text = re.sub(r'\b\d+\)\b', '', text)  # Remove patterns like '1)', '2)', etc.
    text = re.sub(r'\b[i]+[.)]\b', '', text, flags=re.IGNORECASE)  # Remove patterns like 'i.', 'ii.', 'i)', etc.
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and trim
    return text

df["clean_text"] = (df["context"] + " " + df["question"]).apply(clean_text)
# Group by 'clean_text' and count unique themes
duplicates_with_diff_themes = df.groupby("clean_text")["theme"].nunique().reset_index()

# Filter rows where the number of unique themes is greater than 1
duplicates_with_diff_themes = duplicates_with_diff_themes[duplicates_with_diff_themes["theme"] > 1]

# Merge back with the original dataframe to get all rows with these 'clean_text'
filtered_df = df[df["clean_text"].isin(duplicates_with_diff_themes["clean_text"])]
# Exclude rows with these 'clean_text' from the original dataframe
df = df[~df["clean_text"].isin(duplicates_with_diff_themes["clean_text"])]


# ✅ Now: drop rare themes using original theme names
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= 2].index
df = df[df["theme"].isin(valid_themes)]

# ✅ Recompute label encoding AFTER filtering
unique_themes = list(df["theme"].unique())
theme_to_id = {theme: idx for idx, theme in enumerate(unique_themes)}
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}
df["theme_id"] = df["theme"].map(theme_to_id)


#amount of rows 
print(f"Number of rows after filtering: {len(df)}")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["context"].fillna("", inplace=True)


Number of rows after filtering: 24521


In [4]:
print("All theme_ids:", sorted(df["theme_id"].unique()))
print("num_labels:", df["theme_id"].nunique())

All theme_ids: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36)]
num_labels: 37


In [5]:
# ✅ 5. Split Data into Train & Test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["clean_text"].tolist(), df["theme_id"].tolist(), test_size=0.2, random_state=42, stratify=df["theme_id"]
)

In [6]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Create a small DataFrame from train lists
train_df = pd.DataFrame({
    "clean_text": train_texts,
    "theme_id": train_labels
})

# Compute class counts and use median as balancing target
theme_counts = train_df["theme_id"].value_counts()
median_count = theme_counts.median()

# Define strategy: only oversample underrepresented classes
sampling_strategy = {
    theme: int(median_count)
    for theme in theme_counts.index
    if theme_counts[theme] < median_count
}

# Apply RandomOverSampler
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_df[["clean_text"]], train_df["theme_id"])

# Extract oversampled train lists
train_texts_resampled = X_resampled["clean_text"].tolist()
train_labels_resampled = y_resampled.tolist()

from collections import Counter
print("Class distribution after oversampling:", Counter(train_labels_resampled))


Class distribution after oversampling: Counter({15: 5101, 4: 1665, 9: 1544, 26: 1112, 12: 998, 6: 846, 21: 801, 20: 713, 0: 654, 16: 634, 14: 590, 28: 578, 8: 479, 1: 453, 17: 398, 11: 338, 22: 298, 23: 282, 25: 241, 35: 241, 19: 241, 2: 241, 24: 241, 18: 241, 29: 241, 31: 241, 5: 241, 3: 241, 13: 241, 10: 241, 33: 241, 30: 241, 27: 241, 7: 241, 32: 241, 34: 241, 36: 241})


In [7]:
# ✅ 7. Load BERT Tokenizer & Define Dataset Class
model_name = "GroNLP/bert-base-dutch-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

class ThemeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"
        )
        encoding["labels"] = torch.tensor(self.labels[idx])
        return {key: val.squeeze(0) for key, val in encoding.items()}

train_dataset = ThemeDataset(train_texts_resampled, train_labels_resampled, tokenizer)
test_dataset = ThemeDataset(test_texts, test_labels, tokenizer)

In [8]:
# ✅ 8. Load BERT Model for Classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=df["theme_id"].nunique())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# ✅ 9. Define Training Arguments (With Early Stopping)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                 # 👈 Keep only the last checkpoint
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,  
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",  # 🔥 Log only once per epoch
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

# ✅ 10. Define Metrics for Evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=1)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ✅ 11. Train Model with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if no improvement for 2 epochs
)

trainer.train()

 12%|█▎        | 2758/22064 [10:11<1:07:53,  4.74it/s]

{'loss': 0.9446, 'grad_norm': 6.755377292633057, 'learning_rate': 1.750181290790428e-05, 'epoch': 1.0}


                                                      
 12%|█▎        | 2758/22064 [10:44<1:07:53,  4.74it/s]

{'eval_loss': 0.27810218930244446, 'eval_accuracy': 0.9355759429153925, 'eval_precision': 0.9365465396655162, 'eval_recall': 0.9355759429153925, 'eval_f1': 0.934984323181152, 'eval_runtime': 32.707, 'eval_samples_per_second': 149.968, 'eval_steps_per_second': 18.773, 'epoch': 1.0}


 25%|██▌       | 5516/22064 [20:51<58:17,  4.73it/s]   

{'loss': 0.1453, 'grad_norm': 33.9677848815918, 'learning_rate': 1.5003625815808558e-05, 'epoch': 2.0}


                                                    
 25%|██▌       | 5516/22064 [21:24<58:17,  4.73it/s]

{'eval_loss': 0.24781298637390137, 'eval_accuracy': 0.9510703363914373, 'eval_precision': 0.9523178068841733, 'eval_recall': 0.9510703363914373, 'eval_f1': 0.9510768302673384, 'eval_runtime': 32.6671, 'eval_samples_per_second': 150.151, 'eval_steps_per_second': 18.796, 'epoch': 2.0}


 38%|███▊      | 8274/22064 [31:27<48:32,  4.74it/s]   

{'loss': 0.0563, 'grad_norm': 0.0636298805475235, 'learning_rate': 1.2504532269760696e-05, 'epoch': 3.0}


                                                    
 38%|███▊      | 8274/22064 [31:59<48:32,  4.74it/s]

{'eval_loss': 0.21905556321144104, 'eval_accuracy': 0.963710499490316, 'eval_precision': 0.9640600453456435, 'eval_recall': 0.963710499490316, 'eval_f1': 0.9636695533177622, 'eval_runtime': 32.4755, 'eval_samples_per_second': 151.037, 'eval_steps_per_second': 18.907, 'epoch': 3.0}


 50%|█████     | 11032/22064 [42:08<39:09,  4.70it/s]  

{'loss': 0.0304, 'grad_norm': 0.0077250306494534016, 'learning_rate': 1.0004532269760696e-05, 'epoch': 4.0}


                                                     
 50%|█████     | 11032/22064 [42:41<39:09,  4.70it/s]

{'eval_loss': 0.21483458578586578, 'eval_accuracy': 0.9683995922528033, 'eval_precision': 0.9688331640793041, 'eval_recall': 0.9683995922528033, 'eval_f1': 0.9684059789052903, 'eval_runtime': 32.5419, 'eval_samples_per_second': 150.729, 'eval_steps_per_second': 18.868, 'epoch': 4.0}


 62%|██████▎   | 13790/22064 [52:46<29:05,  4.74it/s]   

{'loss': 0.0163, 'grad_norm': 0.002110428409650922, 'learning_rate': 7.506345177664975e-06, 'epoch': 5.0}


                                                     
 62%|██████▎   | 13790/22064 [53:18<29:05,  4.74it/s]

{'eval_loss': 0.23716183006763458, 'eval_accuracy': 0.9673802242609582, 'eval_precision': 0.9678454022742038, 'eval_recall': 0.9673802242609582, 'eval_f1': 0.9673868782152469, 'eval_runtime': 32.3187, 'eval_samples_per_second': 151.77, 'eval_steps_per_second': 18.998, 'epoch': 5.0}


 75%|███████▌  | 16548/22064 [1:03:22<19:07,  4.81it/s] 

{'loss': 0.0106, 'grad_norm': 0.0010589327430352569, 'learning_rate': 5.007251631617114e-06, 'epoch': 6.0}


                                                       
 75%|███████▌  | 16548/22064 [1:03:54<19:07,  4.81it/s]

{'eval_loss': 0.22260597348213196, 'eval_accuracy': 0.9698267074413863, 'eval_precision': 0.9699166628290318, 'eval_recall': 0.9698267074413863, 'eval_f1': 0.9697315964716244, 'eval_runtime': 31.7685, 'eval_samples_per_second': 154.398, 'eval_steps_per_second': 19.327, 'epoch': 6.0}


 88%|████████▊ | 19306/22064 [1:14:56<10:12,  4.50it/s]   

{'loss': 0.0065, 'grad_norm': 0.0014592966763302684, 'learning_rate': 2.5081580855692534e-06, 'epoch': 7.0}


                                                       
 88%|████████▊ | 19306/22064 [1:15:28<10:12,  4.50it/s]

{'eval_loss': 0.22862990200519562, 'eval_accuracy': 0.9708460754332314, 'eval_precision': 0.9708924073661862, 'eval_recall': 0.9708460754332314, 'eval_f1': 0.970749819313395, 'eval_runtime': 32.1121, 'eval_samples_per_second': 152.746, 'eval_steps_per_second': 19.121, 'epoch': 7.0}


100%|██████████| 22064/22064 [1:26:25<00:00,  4.43it/s]  

{'loss': 0.0044, 'grad_norm': 0.000874483201187104, 'learning_rate': 9.064539521392314e-09, 'epoch': 8.0}


                                                       
100%|██████████| 22064/22064 [1:27:00<00:00,  4.43it/s]

{'eval_loss': 0.22666610777378082, 'eval_accuracy': 0.9710499490316004, 'eval_precision': 0.9710695092719064, 'eval_recall': 0.9710499490316004, 'eval_f1': 0.970946994743531, 'eval_runtime': 35.0736, 'eval_samples_per_second': 139.849, 'eval_steps_per_second': 17.506, 'epoch': 8.0}


100%|██████████| 22064/22064 [1:27:01<00:00,  4.23it/s]

{'train_runtime': 5221.7661, 'train_samples_per_second': 33.802, 'train_steps_per_second': 4.225, 'train_loss': 0.15180027450315325, 'epoch': 8.0}





TrainOutput(global_step=22064, training_loss=0.15180027450315325, metrics={'train_runtime': 5221.7661, 'train_samples_per_second': 33.802, 'train_steps_per_second': 4.225, 'total_flos': 4.645474757431296e+16, 'train_loss': 0.15180027450315325, 'epoch': 8.0})

SAVE MODEL TO KUL DRIVE

In [10]:
from datetime import datetime

run_id = datetime.now().strftime("%Y-%m-%d_%H-%M")
save_path = f"C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/GroNLP/Run_{run_id}"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/GroNLP/Run_2025-04-03_14-30\\tokenizer_config.json',
 'C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/GroNLP/Run_2025-04-03_14-30\\special_tokens_map.json',
 'C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/GroNLP/Run_2025-04-03_14-30\\vocab.txt',
 'C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/GroNLP/Run_2025-04-03_14-30\\added_tokens.json')