**Een Identifier die gebruik maakt van active learning. 500 vragen zijn zelf gelabeled. Vervolgens fine tunen we een bert model hierop. We laten deze getunede bert dan zelf 1000 ongeziene vragen labelen. We kijken naar de 20% waar het model het minst zeker van is. Die kijken we handmatig na. De gecorrigeerde voegen we dan toe aan de training data en zo herhalen we dit proces tot het bert model naar behoren werkt.** 

In [1]:
import pandas as pd
import re
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import RandomOverSampler
import torch.nn.functional as F
import os 

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load labeled data
df = pd.read_excel("labeled_questions.xlsx")  
df.columns = ['question', 'label']  

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['question'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)


In [3]:
'''
Een random lange zin om te zien of we met 128 tokens per zin goed zitten. Deze zin bevat 44 dus 128 zou ok moeten zijn 
'''


tokenizer = BertTokenizer.from_pretrained('GroNLP/bert-base-dutch-cased')

text = "Zal de minister initiatieven nemen om ervoor te zorgen dat personen in een palliatief zorgtraject in de toekomst wel aanspraak kunnen maken op een tegemoetkoming van hulpmiddelen? Zo ja, welke en binnen welke termijn?"

tokens = tokenizer.tokenize(text)
print("Number of tokens:", len(tokens))
print("Tokens:", tokens)


Number of tokens: 44
Tokens: ['Zal', 'de', 'minister', 'initiatie', '##ven', 'nemen', 'om', 'ervoor', 'te', 'zorgen', 'dat', 'personen', 'in', 'een', 'pal', '##lia', '##tief', 'zorgt', '##ra', '##ject', 'in', 'de', 'toekomst', 'wel', 'aan', '##spraak', 'kunnen', 'maken', 'op', 'een', 'tegemoetkoming', 'van', 'hulpmiddel', '##en', '?', 'Zo', 'ja', ',', 'welke', 'en', 'binnen', 'welke', 'termijn', '?']


In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('GroNLP/bert-base-dutch-cased')  # or 'GroNLP/bert-base-dutch-cased'

def tokenize(texts):
    return tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt", max_length=128
    )

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)


In [5]:

class QuestionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = QuestionDataset(train_encodings, train_labels)
val_dataset = QuestionDataset(val_encodings, val_labels)


In [6]:
model = BertForSequenceClassification.from_pretrained('GroNLP/bert-base-dutch-cased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.312175
2,No log,0.328864
3,No log,0.428136


TrainOutput(global_step=300, training_loss=0.27606043497721355, metrics={'train_runtime': 40.3859, 'train_samples_per_second': 59.427, 'train_steps_per_second': 7.428, 'total_flos': 157866633216000.0, 'train_loss': 0.27606043497721355, 'epoch': 3.0})

In [7]:
import torch
import torch.nn.functional as F
import pandas as pd

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data
unlabeled_df = pd.read_excel("unlabeled_questions.xlsx")
questions = unlabeled_df['question'].tolist()
encodings = tokenize(questions)
encodings = {k: v.to(device) for k, v in encodings.items()}

# Run model on the data
with torch.no_grad():
    outputs = model(**encodings)
    probs = F.softmax(outputs.logits, dim=1)
    
    predicted_probs, predicted_labels = torch.max(probs, dim=1)
    uncertainty = 1 - predicted_probs  # lower confidence = more uncertain

# Add results to DataFrame
unlabeled_df['predicted_label'] = predicted_labels.cpu().numpy()
unlabeled_df['confidence'] = predicted_probs.cpu().numpy()
unlabeled_df['uncertainty'] = uncertainty.cpu().numpy()

# Sort by uncertainty and get top 20%
top_uncertain = unlabeled_df.sort_values(by='uncertainty', ascending=False).head(int(0.2 * len(unlabeled_df)))

# Save to Excel for manual labeling
top_uncertain.to_excel("to_label.xlsx", index=False)

