**Bert model op basis van Bert van Universiteit van Groningen. Context handling moet nog aangepast worden. Oversampling a.d.h.v de mediaan. dynamisch treshhold zoeken voor unknown.**

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

In [2]:
# ✅ 1. Load & Preprocess Data
script_dir = os.path.dirname(os.getcwd()) # Ga één map omhoog om 'baseline' te verwijderen en ga naar 'Data'
project_root = os.path.dirname(script_dir)  # Dit verwijdert 'baseline' van het script_dir
data_folder = os.path.join(project_root, "Data")

# 1. Dataset inladen
file_path = os.path.join(data_folder, "Grote_data_cleaned.xlsx")
df = pd.read_excel(file_path)

In [3]:
import re

# Drop unnecessary columns
columns_to_drop = ["file_name", "context"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Handle missing values
df = df.dropna(subset=["question"])

# Clean text
def clean_text(text):
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\b[a-z]\)\s+', ' ', text)
    text = re.sub(r'\b\d+\.\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning to just the question
df["clean_text"] = df["question"].apply(clean_text)

# Group by 'clean_text' and count unique themes
duplicates_with_diff_themes = df.groupby("clean_text")["theme"].nunique().reset_index()

# Filter rows where the number of unique themes is greater than 1
duplicates_with_diff_themes = duplicates_with_diff_themes[duplicates_with_diff_themes["theme"] > 1]

# Exclude rows with duplicate clean_texts that map to multiple themes
df = df[~df["clean_text"].isin(duplicates_with_diff_themes["clean_text"])]

# ✅ Drop rare themes (appearing < 2 times)
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts >= 2].index
df = df[df["theme"].isin(valid_themes)]

# ✅ Recompute label encoding AFTER filtering
unique_themes = list(df["theme"].unique())
theme_to_id = {theme: idx for idx, theme in enumerate(unique_themes)}
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}
df["theme_id"] = df["theme"].map(theme_to_id)

# Amount of rows
print(f"Number of rows after filtering: {len(df)}")



Number of rows after filtering: 93306


In [4]:
print("All theme_ids:", sorted(df["theme_id"].unique()))
print("num_labels:", df["theme_id"].nunique())


All theme_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
num_labels: 38


In [5]:
# ✅ 5. Split Data into Train & Test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["clean_text"].tolist(), df["theme_id"].tolist(), test_size=0.2, random_state=42, stratify=df["theme_id"]
)

In [6]:
from collections import Counter
import random

# Step 1: Identify the top class
theme_counts = Counter(train_labels)
top_class = max(theme_counts, key=theme_counts.get)
top_class_count = theme_counts[top_class]
print(f"🔻 Top class {top_class} has {top_class_count} samples")

# Step 2: Get all indices for that class
top_class_indices = [i for i, label in enumerate(train_labels) if label == top_class]

# Step 3: Randomly remove half
random.seed(42)
drop_count = len(top_class_indices) // 2
drop_indices = set(random.sample(top_class_indices, drop_count))

# Step 4: Filter train_texts and train_labels
train_texts = [text for i, text in enumerate(train_texts) if i not in drop_indices]
train_labels = [label for i, label in enumerate(train_labels) if i not in drop_indices]

print(f"✅ Removed {drop_count} samples from top class {top_class}")


🔻 Top class 10 has 19377 samples
✅ Removed 9688 samples from top class 10


In [7]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Create a small DataFrame from train lists
train_df = pd.DataFrame({
    "clean_text": train_texts,
    "theme_id": train_labels
})

# Compute class counts and use median as balancing target
theme_counts = train_df["theme_id"].value_counts()
median_count = theme_counts.median()

# Define strategy: only oversample underrepresented classes
sampling_strategy = {
    theme: int(median_count)
    for theme in theme_counts.index
    if theme_counts[theme] < median_count and theme_counts[theme] > 50 
}

# Apply RandomOverSampler
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_df[["clean_text"]], train_df["theme_id"])

# Extract oversampled train lists
train_texts_resampled = X_resampled["clean_text"].tolist()
train_labels_resampled = y_resampled.tolist()

from collections import Counter
print("Class distribution after oversampling:", Counter(train_labels_resampled))


Class distribution after oversampling: Counter({10: 9689, 2: 6082, 15: 5548, 12: 5098, 5: 3923, 27: 3408, 4: 2770, 1: 2647, 14: 2547, 7: 2522, 11: 2218, 21: 2133, 17: 1634, 29: 1496, 23: 1315, 16: 1214, 3: 1131, 33: 1072, 13: 1037, 0: 1014, 19: 1014, 8: 1014, 28: 1014, 34: 1014, 30: 1014, 24: 1014, 22: 1014, 26: 1014, 32: 1014, 9: 1014, 6: 1014, 20: 1014, 31: 1014, 25: 1014, 18: 1014, 35: 1014, 36: 28, 37: 11})




In [8]:
# ✅ 7. Load BERT Tokenizer & Define Dataset Class
model_name = "bert-base-multilingual-cased"  # mBERT
tokenizer = BertTokenizer.from_pretrained(model_name)

class ThemeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"
        )
        encoding["labels"] = torch.tensor(self.labels[idx])
        return {key: val.squeeze(0) for key, val in encoding.items()}

train_dataset = ThemeDataset(train_texts_resampled, train_labels_resampled, tokenizer)
test_dataset = ThemeDataset(test_texts, test_labels, tokenizer)

In [9]:
# ✅ 8. Load BERT Model for Classification
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=df["theme_id"].nunique())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.utils.class_weight import compute_class_weight
import torch

# Assuming train_labels is a list of class indices
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels_resampled),
    y=train_labels_resampled
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

clipped_weights = torch.clamp(class_weights, min=0.5, max=10.0)#normalizing is ook een optie maar zitten hier met enorm skewed data dus clipping




In [11]:
from transformers import Trainer
from torch.nn import CrossEntropyLoss

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=clipped_weights.to(model.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [12]:
# ✅ 9. Define Training Arguments (With Early Stopping)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                 # 👈 Keep only the last checkpoint
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,  
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",  # 🔥 Log only once per epoch
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# ✅ 10. Define Metrics for Evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=1)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# ✅ 11. Train Model with Early Stopping
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if no improvement for 2 epochs
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.3203,1.910168,0.53049,0.588111,0.53049,0.548098
2,1.4014,1.725297,0.593934,0.635341,0.593934,0.605732


KeyboardInterrupt: 

In [17]:
trainer.save_model("output/mixed-bert-model")


In [18]:
# Get predictions from the model
preds_output = trainer.predict(test_dataset)

# Extract logits (raw model outputs)
logits = preds_output.predictions  # shape: (num_samples, num_classes)

# Get predicted class labels
predictions = np.argmax(logits, axis=1)

# Save everything in a DataFrame
output_df = pd.DataFrame({
    "text": test_texts,
    "true_label": test_labels,
    "predicted_label": predictions,
    "logits": logits.tolist()  # Save logits for softmax + threshold later
})

# Save to CSV
output_df.to_csv("output/test_predictions_mixed.csv", index=False)


In [23]:
# Final check before training
print("✔️ Final weights used in training:")
for i, weight in enumerate(class_weights):
    print(f"Class {i} → {weight.item():.4f}")


✔️ Final weights used in training:
Class 0 → 0.5527
Class 1 → 0.4432
Class 2 → 0.6322
Class 3 → 0.1752
Class 4 → 7.4268
Class 5 → 8.7315
Class 6 → 0.7557
Class 7 → 1.2795
Class 8 → 1.7654
Class 9 → 0.3045
Class 10 → 1.0191
Class 11 → 4.0132
Class 12 → 2.3842
Class 13 → 2.6054
Class 14 → 2.2280
Class 15 → 4.4870
Class 16 → 0.3594
Class 17 → 0.9049
Class 18 → 0.6335
Class 19 → 0.3942
Class 20 → 10.9514
Class 21 → 1.1684
Class 22 → 1.0112
Class 23 → 1.6525
Class 24 → 1.9759
Class 25 → 0.2987
Class 26 → 0.8755
Class 27 → 1.2571
Class 28 → 4.1957
Class 29 → 22.2804
Class 30 → 10.7689
Class 31 → 20.8430
Class 32 → 4.0637
Class 33 → 5.2531
Class 34 → 129.2263
Class 35 → 2.6159
Class 36 → 7.5132
Class 37 → 64.6132


In [24]:
# Final check before training
print("✔️ Final weights used in training:")
for i, weight in enumerate(clipped_weights):
    print(f"Class {i} → {weight.item():.4f}")

✔️ Final weights used in training:
Class 0 → 0.5527
Class 1 → 0.5000
Class 2 → 0.6322
Class 3 → 0.5000
Class 4 → 7.4268
Class 5 → 8.7315
Class 6 → 0.7557
Class 7 → 1.2795
Class 8 → 1.7654
Class 9 → 0.5000
Class 10 → 1.0191
Class 11 → 4.0132
Class 12 → 2.3842
Class 13 → 2.6054
Class 14 → 2.2280
Class 15 → 4.4870
Class 16 → 0.5000
Class 17 → 0.9049
Class 18 → 0.6335
Class 19 → 0.5000
Class 20 → 10.0000
Class 21 → 1.1684
Class 22 → 1.0112
Class 23 → 1.6525
Class 24 → 1.9759
Class 25 → 0.5000
Class 26 → 0.8755
Class 27 → 1.2571
Class 28 → 4.1957
Class 29 → 10.0000
Class 30 → 10.0000
Class 31 → 10.0000
Class 32 → 4.0637
Class 33 → 5.2531
Class 34 → 10.0000
Class 35 → 2.6159
Class 36 → 7.5132
Class 37 → 10.0000


SAVE MODEL NAAR KUL DRIVE 

In [16]:
from datetime import datetime

run_id = datetime.now().strftime("%Y-%m-%d_%H-%M")
#save_path = f"C:/Users/corne/OneDrive - KU Leuven/Thesis/Working Code/SAVED-Models/mBERT/Run_{run_id}"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

NameError: name 'save_path' is not defined