**BERT-model op basis van het BERT-model van de Universiteit van Groningen (GroNLP). Contextverwerking moet nog aangepast worden. Oversampling a.d.h.v. de mediaan. Dynamisch een threshold zoeken voor "Unknown".**

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ 1. Load & Preprocess Data
# The data folder is resolved relative to the parent of the current working
# directory (the original note says this strips a 'baseline' sub-folder).
script_dir = os.getcwd()
project_root = os.path.split(script_dir)[0]
data_folder = os.path.join(project_root, "Data")
file_path = os.path.join(data_folder, "Grote_data.xlsx")

# Load the dataset
df = pd.read_excel(file_path)

In [3]:
# Drop unnecessary columns
if "TXT_file_name" in df.columns:
    df = df.drop(columns=["TXT_file_name"])

# Handle missing values:
# - rows without a question cannot be classified, so they are removed;
# - a missing context becomes an empty string so that the string
#   concatenation that builds the model input does not produce NaN.
df = df.dropna(subset=["question"])
# Use a non-chained fill: `df["context"].fillna("", inplace=True)` operates on
# an intermediate object and is announced to stop working in pandas 3.0.
df = df.fillna({"context": ""})

# Clean text
def clean_text(text):
    """Normalise a raw text string before tokenisation.

    The substitutions are applied in order:
      1. newlines become spaces;
      2. lower-case list markers such as ``a) `` are replaced by a space;
      3. digits followed by a dot that is directly followed by a word
         character are removed;
      4. runs of whitespace collapse to a single space.
    Leading and trailing whitespace is stripped from the result.

    NOTE(review): pattern 3 requires a word character right after the dot,
    so ``"1. Text"`` is left untouched while ``"3.14"`` becomes ``"14"``.
    Confirm this is the intended behaviour.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'\n', ' '),
        (r'\b[a-z]\)\s+', ' '),
        (r'\b\d+\.\b', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Model input: context followed by the question, normalised by clean_text
df["clean_text"] = (df["context"] + " " + df["question"]).map(clean_text)

# ✅ Drop rare themes (fewer than 2 rows), using the original theme names
theme_counts = df["theme"].value_counts()
valid_themes = theme_counts[theme_counts.ge(2)].index
df = df[df["theme"].isin(valid_themes)]

# ✅ Recompute the label encoding AFTER filtering so ids are contiguous (0..n-1),
# numbered in order of first appearance in the filtered frame
unique_themes = list(df["theme"].unique())
theme_to_id = dict(zip(unique_themes, range(len(unique_themes))))
id_to_theme = dict(enumerate(unique_themes))
df["theme_id"] = df["theme"].map(theme_to_id)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["context"].fillna("", inplace=True)


In [4]:
# Sanity check on the label encoding: list every theme id in use and the
# number of distinct ids (used later as num_labels for the classifier head).
print("All theme_ids:", sorted(df["theme_id"].unique()))
print("num_labels:", df["theme_id"].nunique())


All theme_ids: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36)]
num_labels: 37


In [5]:
# ✅ 5. Split Data into Train & Test
# 80/20 split, stratified on the theme id, with a fixed seed for reproducibility.
all_texts = df["clean_text"].tolist()
all_labels = df["theme_id"].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

In [7]:
from collections import Counter

from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Wrap the training lists in a DataFrame so they can be passed to the sampler
train_df = pd.DataFrame({"clean_text": train_texts, "theme_id": train_labels})

# The median class size is the balancing target
theme_counts = train_df["theme_id"].value_counts()
median_count = theme_counts.median()

# Only classes below the median are oversampled (up to the truncated median);
# classes at or above the median are left untouched.
minority_counts = theme_counts[theme_counts < median_count]
sampling_strategy = {theme: int(median_count) for theme in minority_counts.index}

# Randomly duplicate rows of the under-represented classes
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_df[["clean_text"]], train_df["theme_id"])

# Back to plain lists for the Dataset class
train_texts_resampled = X_resampled["clean_text"].tolist()
train_labels_resampled = y_resampled.tolist()

print("Class distribution after oversampling:", Counter(train_labels_resampled))


Class distribution after oversampling: Counter({15: 5147, 4: 1763, 9: 1699, 26: 1216, 12: 1124, 6: 949, 21: 860, 0: 855, 20: 736, 16: 641, 28: 622, 14: 612, 8: 553, 1: 527, 17: 433, 23: 332, 22: 330, 11: 323, 31: 253, 27: 253, 25: 253, 5: 253, 24: 253, 2: 253, 19: 253, 29: 253, 18: 253, 13: 253, 7: 253, 3: 253, 32: 253, 10: 253, 30: 253, 33: 253, 34: 253, 35: 253, 36: 253})


In [8]:
# ✅ 7. Load BERT Tokenizer & Define Dataset Class
# Identifier of the pretrained checkpoint; the same name is used when the
# classification model is loaded in a later cell.
model_name = "GroNLP/bert-base-dutch-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

class ThemeDataset(Dataset):
    """Dataset that tokenises one text per item and attaches its label.

    Tokenisation happens lazily in ``__getitem__``; every item is padded or
    truncated to ``max_length`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        """Store the raw inputs.

        Args:
            texts: indexable collection of input strings.
            labels: indexable collection of labels, aligned with ``texts``.
            tokenizer: callable that accepts the keyword arguments used in
                ``__getitem__`` and returns a mapping of tensors.
            max_length: number of tokens every item is padded/truncated to.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenised text at ``idx`` plus a ``labels`` tensor.

        The tokenizer is asked for PyTorch tensors, which carry a leading
        dimension of size 1; that dimension is squeezed away so the result
        describes a single example.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        item = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        item["labels"] = torch.tensor(self.labels[idx]).squeeze(0)
        return item

# Training uses the oversampled lists; the test set keeps its original
# (non-resampled) texts and labels.
train_dataset = ThemeDataset(train_texts_resampled, train_labels_resampled, tokenizer)
test_dataset = ThemeDataset(test_texts, test_labels, tokenizer)

In [9]:
# ✅ 8. Load BERT Model for Classification
# The classification head gets one output per distinct theme id in the filtered data.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=df["theme_id"].nunique())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# ✅ 9. Define Training Arguments (With Early Stopping)
# Evaluation and checkpointing both happen once per epoch, which is required
# for load_best_model_at_end; the best checkpoint is the one with the highest
# "f1" value returned by compute_metrics.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",  # 🔥 Log only once per epoch
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

# ✅ 10. Define Metrics for Evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted label).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.

    NOTE(review): ``zero_division=1`` presumably scores an undefined
    precision/recall as 1.0, which can inflate the weighted averages for
    classes that are never predicted — confirm this is intended.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_hat, average="weighted", zero_division=1
    )
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# ✅ 11. Train Model with Early Stopping
# NOTE(review): eval_dataset is the held-out test set, so early stopping and
# best-checkpoint selection are driven by the same data that is later used
# for the reported evaluation. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if no improvement for 2 epochs
)

trainer.train()


 12%|█▎        | 2942/23536 [10:03<1:10:54,  4.84it/s]

{'loss': 1.2549, 'grad_norm': 27.439586639404297, 'learning_rate': 1.7501699524133243e-05, 'epoch': 1.0}


                                                      
 12%|█▎        | 2942/23536 [10:36<1:10:54,  4.84it/s]

{'eval_loss': 0.5774040818214417, 'eval_accuracy': 0.8469117089806745, 'eval_precision': 0.8534180514016733, 'eval_recall': 0.8469117089806745, 'eval_f1': 0.8448863153229106, 'eval_runtime': 32.6199, 'eval_samples_per_second': 161.803, 'eval_steps_per_second': 20.233, 'epoch': 1.0}


 25%|██▌       | 5884/23536 [20:44<53:34,  5.49it/s]   

{'loss': 0.3911, 'grad_norm': 0.01684476248919964, 'learning_rate': 1.5003399048266487e-05, 'epoch': 2.0}


                                                    
 25%|██▌       | 5884/23536 [21:17<53:34,  5.49it/s]

{'eval_loss': 0.4223193824291229, 'eval_accuracy': 0.892004547176961, 'eval_precision': 0.896252573460413, 'eval_recall': 0.892004547176961, 'eval_f1': 0.8914847785082272, 'eval_runtime': 33.3144, 'eval_samples_per_second': 158.43, 'eval_steps_per_second': 19.811, 'epoch': 2.0}


 38%|███▊      | 8826/23536 [31:34<50:46,  4.83it/s]   

{'loss': 0.239, 'grad_norm': 0.07002907246351242, 'learning_rate': 1.250509857239973e-05, 'epoch': 3.0}


                                                    
 38%|███▊      | 8826/23536 [32:07<50:46,  4.83it/s]

{'eval_loss': 0.4351058602333069, 'eval_accuracy': 0.9037514209928003, 'eval_precision': 0.9190272477342063, 'eval_recall': 0.9037514209928003, 'eval_f1': 0.9071099049278853, 'eval_runtime': 33.1616, 'eval_samples_per_second': 159.16, 'eval_steps_per_second': 19.903, 'epoch': 3.0}


 50%|█████     | 11768/23536 [42:24<40:51,  4.80it/s]  

{'loss': 0.1877, 'grad_norm': 0.024935776367783546, 'learning_rate': 1.0005948334466351e-05, 'epoch': 4.0}


                                                     
 50%|█████     | 11768/23536 [42:57<40:51,  4.80it/s]

{'eval_loss': 0.4210189878940582, 'eval_accuracy': 0.9105721864342554, 'eval_precision': 0.9239065923337575, 'eval_recall': 0.9105721864342554, 'eval_f1': 0.9135813285090802, 'eval_runtime': 32.9112, 'eval_samples_per_second': 160.371, 'eval_steps_per_second': 20.054, 'epoch': 4.0}


 62%|██████▎   | 14710/23536 [53:11<30:33,  4.81it/s]   

{'loss': 0.1662, 'grad_norm': 0.016496390104293823, 'learning_rate': 7.5067980965329715e-06, 'epoch': 5.0}


                                                     
 62%|██████▎   | 14710/23536 [53:44<30:33,  4.81it/s]

{'eval_loss': 0.40811023116111755, 'eval_accuracy': 0.911898446381205, 'eval_precision': 0.9168264792434633, 'eval_recall': 0.911898446381205, 'eval_f1': 0.9116081347360636, 'eval_runtime': 32.9026, 'eval_samples_per_second': 160.413, 'eval_steps_per_second': 20.059, 'epoch': 5.0}


 75%|███████▌  | 17652/23536 [1:04:00<17:57,  5.46it/s] 

{'loss': 0.1575, 'grad_norm': 0.002360285958275199, 'learning_rate': 5.0076478585995925e-06, 'epoch': 6.0}


                                                       
 75%|███████▌  | 17652/23536 [1:04:33<17:57,  5.46it/s]

{'eval_loss': 0.4208798408508301, 'eval_accuracy': 0.9100037893141342, 'eval_precision': 0.9149758010786558, 'eval_recall': 0.9100037893141342, 'eval_f1': 0.9096044899179712, 'eval_runtime': 32.9657, 'eval_samples_per_second': 160.106, 'eval_steps_per_second': 20.021, 'epoch': 6.0}


 75%|███████▌  | 17652/23536 [1:04:35<21:31,  4.55it/s]

{'train_runtime': 3875.4729, 'train_samples_per_second': 48.57, 'train_steps_per_second': 6.073, 'train_loss': 0.3993839425563056, 'epoch': 6.0}





TrainOutput(global_step=17652, training_loss=0.3993839425563056, metrics={'train_runtime': 3875.4729, 'train_samples_per_second': 48.57, 'train_steps_per_second': 6.073, 'total_flos': 3.715611280229376e+16, 'train_loss': 0.3993839425563056, 'epoch': 6.0})

In [None]:
# Save final model and tokenizer
model.save_pretrained("./ImprovedModelGroNLP")
tokenizer.save_pretrained("./ImprovedModelGroNLP")

# Save the mapping of theme IDs to themes (one "<id>\t<theme>" line per class).
# The target folder is not created anywhere else, so create it here; writing
# into a missing directory would raise FileNotFoundError.
os.makedirs("./final_model", exist_ok=True)
# Explicit UTF-8 so theme names with non-ASCII characters are written the same
# way on every platform.
with open("./final_model/theme_mapping.txt", "w", encoding="utf-8") as f:
    for theme_id, theme in id_to_theme.items():
        f.write(f"{theme_id}\t{theme}\n")



EVALUATION

In [None]:
# Run the trained model on the test set; the result exposes `.predictions`
# and `.label_ids`, which the following cells use.
predictions = trainer.predict(test_dataset)


In [None]:
from sklearn.metrics import classification_report

# Predicted class = arg-max over the per-class scores
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)

# Report every known class (ids 0 .. n-1), with the theme names as row labels
all_labels = sorted(id_to_theme)
all_names = list(map(id_to_theme.get, all_labels))

report = classification_report(y_true, y_pred, labels=all_labels, target_names=all_names)
print(report)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly. Without `labels`, the matrix only contains
# classes that occur in y_true or y_pred, so its size can differ from the
# number of theme names and the tick labels would be attached to wrong rows.
cm_labels = sorted(id_to_theme)
cm_names = [id_to_theme[label] for label in cm_labels]
cm = confusion_matrix(y_true, y_pred, labels=cm_labels)

plt.figure(figsize=(16, 12))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=cm_names, yticklabels=cm_names, cbar=True)
# Axis labels and title are set once (an earlier duplicate set of calls was
# immediately overridden and had no effect on the figure).
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.xticks(rotation=30, ha="right", fontsize=10)  # Rotate x-axis labels slightly
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()







Handle extremely short or vague questions by labeling them "Unknown"



In [None]:
# ✅ 13. Make Predictions (With Dynamic Confidence Threshold & Short Question Handling)
predictions = trainer.predict(test_dataset)
probabilities = F.softmax(torch.tensor(predictions.predictions), dim=1)

# ✅ Dynamic confidence threshold: the 1st percentile of the per-sample
# maximum class probability
max_probs, best_ids = torch.max(probabilities, dim=1)
confidence_values = max_probs.tolist()
dynamic_threshold = np.percentile(confidence_values, 1)
print(f"Dynamic Threshold: {dynamic_threshold}")

# ✅ Assign "Unknown" when the text has fewer than 4 whitespace-separated
# words or when the model's confidence is below the threshold; otherwise
# use the predicted theme name.
predicted_labels = []
for question_text, max_prob, pred_label in zip(test_texts, confidence_values, best_ids.tolist()):
    too_short = len(question_text.split()) < 4
    low_confidence = max_prob < dynamic_threshold
    if too_short or low_confidence:
        predicted_labels.append("Unknown")
    else:
        predicted_labels.append(id_to_theme[pred_label])


In [None]:
# Report how many predictions were replaced by "Unknown"
n_predictions = len(predicted_labels)
unknown_count = sum(label == "Unknown" for label in predicted_labels)
unknown_share = unknown_count / n_predictions * 100
print(f"Unknown predictions: {unknown_count} / {n_predictions} ({unknown_share:.2f}%)")


In [None]:
# ✅ 14. Save Predictions
# One row per test example: the input text, its true theme, the (possibly
# "Unknown") predicted theme, and two convenience flags.
true_themes = [id_to_theme[label] for label in test_labels]
output_df = pd.DataFrame({
    "Text": test_texts,
    "True_Theme": true_themes,
    "Predicted_Theme": predicted_labels,
}).assign(
    Correct=lambda frame: frame["True_Theme"] == frame["Predicted_Theme"],
    Was_Unknown=lambda frame: frame["Predicted_Theme"] == "Unknown",
)

output_df.to_excel("BertGroNLP-theme_classification.xlsx", index=False)
print("✅ Model Training Completed! Predictions saved.")

In [None]:
# ✅ 15. Visualize Distribution of Predicted Themes

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Wrap the predicted labels in their own DataFrame for counting/plotting.
# A dedicated name is used on purpose: reassigning `df` here would replace the
# main dataset, and a later cell still needs its "clean_text" and "theme"
# columns.
predicted_df = pd.DataFrame({"Predicted_Theme": predicted_labels})

# ✅ Count occurrences of each predicted theme
label_counts = predicted_df["Predicted_Theme"].value_counts()

# ✅ Create the bar plot
plt.figure(figsize=(12, 6))
sns.barplot(x=label_counts.index, y=label_counts.values)
plt.xticks(rotation=90)
plt.xlabel("Predicted Theme")
plt.ylabel("Count")
plt.title("Distribution of Assigned Labels in Model Predictions")
plt.show()


TO GET ATTENTION SCORES

In [None]:
import torch

# Prefer the GPU when one is available; otherwise run on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

print("Using device:", device)

In [None]:
def get_attention_and_prediction(clean_text):
    """Run the classifier on one text and return its attentions and prediction.

    Args:
        clean_text: the input string. It is truncated to 512 tokens, the same
            limit used by ThemeDataset, so long texts do not exceed the
            model's maximum sequence length.

    Returns:
        A tuple ``(attentions, predicted_class_id, predicted_class_name)``:
        ``attentions`` is the model's ``attentions`` output (one entry per
        layer), ``predicted_class_id`` is the arg-max of the logits, and
        ``predicted_class_name`` is its theme name from ``id_to_theme``
        ("Unknown" if the id is not in the mapping).
    """
    # Make sure dropout is disabled for inference.
    model.eval()

    # Tokenize input and move the tensors to the same device as the model
    tokenizer_inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, max_length=512)
    tokenizer_inputs = {key: val.to(device) for key, val in tokenizer_inputs.items()}

    # Request attentions for this call only. Setting
    # model.config.output_attentions would persist and change what later
    # calls (e.g. trainer.predict) return.
    with torch.no_grad():
        outputs = model(**tokenizer_inputs, output_attentions=True)

    attentions = outputs.attentions  # Attention scores per layer
    logits = outputs.logits  # Model prediction scores

    predicted_class_id = logits.argmax(dim=1).item()  # Get predicted class ID
    predicted_class_name = id_to_theme.get(predicted_class_id, "Unknown")  # Convert ID to actual class name

    return attentions, predicted_class_id, predicted_class_name


# Smoke test on a single Dutch example question: print the predicted theme
# and the number of attention layers returned by the model.
test_question = "Hoeveel subsidies zijn toegekend aan bedrijven?"
attention_scores, predicted_class_id, predicted_class_name = get_attention_and_prediction(test_question)

print(f"✅ Model predicted class: {predicted_class_name} (ID: {predicted_class_id})")
print(f"🔍 Total Attention Layers Extracted: {len(attention_scores)}")

In [None]:
import matplotlib.pyplot as plt

def visualize_attention_with_class(question):
    """Plot the attention heatmap of the last layer's first head for a question.

    The predicted class (from ``get_attention_and_prediction``) is shown in
    the figure title.

    Args:
        question: the input string to run through the model and visualise.
    """
    attentions, predicted_class_id, predicted_class_name = get_attention_and_prediction(question)

    num_layers = len(attentions)
    layer = num_layers - 1  # Last layer
    head = 0  # Choose the first attention head

    attention_matrix = attentions[layer][0, head].cpu().numpy()
    seq_len = attention_matrix.shape[-1]

    # Build the tick labels from the encoded input ids so the special tokens
    # the tokenizer adds are included; `tokenizer.tokenize` omits them, which
    # shifts every label relative to the attention matrix.
    tokens = tokenizer.convert_ids_to_tokens(tokenizer(question)["input_ids"])
    if len(tokens) > seq_len:
        # The model input was truncated: keep the leading tokens and the
        # closing special token so the labels match the matrix size.
        tokens = tokens[:seq_len - 1] + tokens[-1:]

    plt.figure(figsize=(10, 8))
    plt.imshow(attention_matrix, cmap="viridis", aspect="auto")
    plt.xticks(range(len(tokens)), tokens, rotation=90)
    plt.yticks(range(len(tokens)), tokens)
    plt.colorbar(label="Attention Score")
    plt.title(f"Predicted Class: {predicted_class_name} | Attention Heatmap (Layer {layer+1}, Head {head+1})")
    plt.show()

# Collect correctly classified questions from 100 randomly sampled rows
sampled_rows = df.sample(100, random_state=42)
correctly_classified_questions = []

for question, true_class in zip(sampled_rows["clean_text"], sampled_rows["theme"]):
    _, predicted_class_id, predicted_class_name = get_attention_and_prediction(question)

    # Keep only the rows where the predicted theme equals the actual theme
    if predicted_class_name == true_class:
        correctly_classified_questions.append((question, true_class, predicted_class_name))

# Print the first few correctly classified questions
print("✅ Correctly Classified Questions:")
for q, actual, predicted in correctly_classified_questions[:5]:
    print(f"🔍 Question: {q}")
    print(f"✅ Actual Class: {actual}")
    print(f"✅ Predicted Class: {predicted}\n")

# Visualise the first correct prediction, if there is one
if not correctly_classified_questions:
    print("❌ No correctly classified questions found in the sample!")
else:
    question, actual_class, predicted_class = correctly_classified_questions[0]

    print(f"✅ Correctly Classified Example:")
    print(f"🔍 Question: {question}")
    print(f"✅ Actual Class: {actual_class}")
    print(f"✅ Predicted Class: {predicted_class}")

    # Attention heatmap for the correctly classified question
    visualize_attention_with_class(question)

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import umap
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os

# ✅ 1. Load & Preprocess Data
script_dir = os.getcwd()
project_root = os.path.dirname(script_dir)

data_folder = os.path.join(project_root, "Data")
file_path = os.path.join(data_folder, "Grote_data.xlsx")
df = pd.read_excel(file_path)

# Apply the same row filtering as the training cells. Without it the split
# below cannot match the training split, and themes with a single row would
# make the stratified split fail.
df = df.dropna(subset=["question"])
df = df.fillna({"context": ""})  # avoid a literal "nan" prefix in the model input
theme_sizes = df["theme"].value_counts()
df = df[df["theme"].isin(theme_sizes[theme_sizes >= 2].index)].copy()

# Combine context and question for model input, using the same cleaning
# steps as clean_text in the training cells (keep the patterns in sync).
# The column is created before splitting so no column is assigned on a slice.
df["context_question"] = (
    (df["context"] + " " + df["question"])
    .str.replace(r'\n', ' ', regex=True)
    .str.replace(r'\b[a-z]\)\s+', ' ', regex=True)
    .str.replace(r'\b\d+\.\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Recreate train-test split to match training: stratify on integer ids
# numbered by first appearance, as the training label encoding does.
stratify_ids = pd.factorize(df["theme"])[0]
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=stratify_ids
)

# ✅ 2. Load fine-tuned model and tokenizer
# NOTE(review): the training log in this notebook reports 2942 steps per
# epoch, so a step count of 8823 does not look like a checkpoint of that run
# — verify the path before use.
model_path = "results/checkpoint-8823"  # Adjust as needed
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# ✅ 3. Setup device
# Move the model to the GPU when available and switch to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# ✅ 4. Function to get CLS embedding
def get_cls_embedding(text):
    """Return the first-token hidden state of the BERT encoder for one text.

    The text is padded/truncated to 512 tokens and passed through
    ``model.bert``; the last hidden state at position 0 is returned as a
    NumPy array on the CPU.

    Args:
        text: the input string.

    Returns:
        NumPy array holding the position-0 vector of the last hidden state.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        last_hidden = model.bert(**encoded).last_hidden_state
        first_token = last_hidden[:, 0, :]
    return first_token.cpu().squeeze().numpy()

# ✅ 5. Sample and embed from test set
# Cap the sample at the size of the test set: asking for more rows than are
# available makes DataFrame.sample raise.
sample_size = min(2000, len(df_test))
sample_df = df_test.sample(n=sample_size, random_state=42)

print("Generating fine-tuned embeddings (from test set)...")
embeddings = [get_cls_embedding(text) for text in tqdm(sample_df["context_question"])]

# ✅ 6. UMAP dimensionality reduction
# Project the embeddings to 2-D (cosine distance, fixed seed) for plotting.
print("Running UMAP...")
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
embedding_2d = reducer.fit_transform(embeddings)

# ✅ 7. Plot UMAP
# Encode theme names as integers so they can drive the colour map
le = LabelEncoder()
labels = le.fit_transform(sample_df["theme"])
n_classes = len(le.classes_)

umap_x, umap_y = embedding_2d[:, 0], embedding_2d[:, 1]

plt.figure(figsize=(12, 8))
scatter = plt.scatter(umap_x, umap_y, c=labels, cmap="tab20", s=10, alpha=0.8)
plt.title("UMAP of GroNLP BERT Embeddings (After Fine-tuning on Test Set)")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
# One colour-bar tick per encoded class; the limits are offset by half a step
plt.colorbar(scatter, ticks=range(n_classes), label="Theme")
plt.clim(-0.5, n_classes - 0.5)
plt.grid(True)
plt.tight_layout()
plt.show()
