In [1]:

import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn as nn
import torch.nn.functional as F
import os


In [2]:

# ✅ 1. Load & Preprocess Data
script_dir = os.path.dirname(os.getcwd())
project_root = os.path.dirname(script_dir)
data_folder = os.path.join(project_root, "Data")

file_path = os.path.join(data_folder, "Grote_data.xlsx")
df = pd.read_excel(file_path)

# Drop unnecessary columns
if "TXT_file_name" in df.columns:
    df = df.drop(columns=["TXT_file_name"])

df = df.dropna(subset=["question"])
df["context"].fillna("", inplace=True)

# ✅ 2. Clean text
dutch_stopwords = {"de", "het", "een", "en", "van", "ik", "te", "dat", "die", "in", "je", "is",
                   "niet", "op", "aan", "met", "als", "voor", "zijn", "was", "heeft", "heb",
                   "om", "bij", "of", "geen", "dan", "toch", "maar", "wel", "meer", "doen",
                   "ook", "kan", "mijn", "zo", "dus", "zou", "kunnen"}

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\b[a-z]\)\s+', ' ', text)
    text = re.sub(r'\b\d+\.\b', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = " ".join([word for word in text.split() if word not in dutch_stopwords])
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["clean_text"] = (df["context"] + " " + df["question"]).apply(clean_text)

# ✅ 3. Encode theme labels
unique_themes = list(df["theme"].unique())
theme_to_id = {theme: idx for idx, theme in enumerate(unique_themes)}
id_to_theme = {idx: theme for theme, idx in theme_to_id.items()}
df["theme_id"] = df["theme"].map(theme_to_id)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["context"].fillna("", inplace=True)


In [3]:

# ✅ 4. Oversampling to fix class imbalance
theme_counts = df["theme_id"].value_counts()
median_count = theme_counts.median()

sampling_strategy = {theme: int(median_count) for theme in theme_counts.index if theme_counts[theme] < median_count}
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

X_resampled, y_resampled = ros.fit_resample(df[["clean_text"]], df["theme_id"])
df_resampled = pd.DataFrame({"clean_text": X_resampled["clean_text"], "theme_id": y_resampled})

# ✅ 5. Split Data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_resampled["clean_text"].tolist(),
    df_resampled["theme_id"].tolist(),
    test_size=0.2,
    stratify=df_resampled["theme_id"],
    random_state=42
)

# ✅ 6. Load tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
label_texts = [tokenizer.cls_token + " " + theme for theme in unique_themes]




In [4]:

class LabelFusionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, label_texts, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.label_texts = label_texts
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_text = self.texts[idx]
        label = self.labels[idx]
        label_input = " ".join([tokenizer.sep_token + " " + label_text for label_text in self.label_texts])
        full_input = input_text + " " + label_input
        encoding = self.tokenizer(
            full_input,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length
        )
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        encoding["label_id"] = torch.tensor(label)
        return encoding

train_dataset = LabelFusionDataset(train_texts, train_labels, tokenizer, unique_themes)
test_dataset = LabelFusionDataset(test_texts, test_labels, tokenizer, unique_themes)


In [5]:

class LabelFusionBERT(nn.Module):
    def __init__(self, model_name, label_texts, tokenizer):
        super(LabelFusionBERT, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.label_texts = label_texts
        self.label_embeddings = self.get_label_embeddings()

    def get_label_embeddings(self):
        with torch.no_grad():
            inputs = self.tokenizer(
                self.label_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=20
            )
            outputs = self.bert(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        return embeddings

    def forward(self, input_ids, attention_mask, label_id=None):
        device = input_ids.device
        label_embeddings = self.label_embeddings.to(device)  # move to same device as inputs
    
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = output.last_hidden_state[:, 0, :]  # shape: (batch_size, hidden_size)
    
        similarities = F.cosine_similarity(cls_embedding.unsqueeze(1), label_embeddings.unsqueeze(0), dim=-1)
    
        if label_id is not None:
            loss = F.cross_entropy(similarities, label_id)
            return {"loss": loss, "logits": similarities}
        return {"logits": similarities}


In [6]:
model = LabelFusionBERT(model_name, unique_themes, tokenizer)


In [7]:
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=1)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)




In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


In [10]:
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.4176,3.350542,0.736779,0.772856,0.736779,0.733119
2,3.3394,3.323322,0.818398,0.849661,0.818398,0.827357
3,3.3171,3.314909,0.846625,0.876541,0.846625,0.855769
4,3.3064,3.307475,0.87128,0.896039,0.87128,0.879005
5,3.3002,3.304561,0.881143,0.909917,0.881143,0.890619
6,3.296,3.301999,0.887944,0.917199,0.887944,0.897625
7,3.293,3.301331,0.890495,0.917087,0.890495,0.899262


KeyboardInterrupt: 

In [11]:
trainer.evaluate()


{'eval_loss': 3.30133056640625,
 'eval_accuracy': 0.8904948138071757,
 'eval_precision': 0.9170868244134099,
 'eval_recall': 0.8904948138071757,
 'eval_f1': 0.8992617803215309}