# Importeer de benodigde libraries

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

# Stap 1: Laad de dataset

In [None]:
# Load the raw WikiText-2 configuration of the "wikitext" dataset.
dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
)

# Stap 2: Laad de tokenizer en het model

In [None]:
# Checkpoint name shared by the tokenizer and the masked-LM model so the
# vocabulary and the weights come from the same source.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

# Stap 3: Preprocess de data

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of *examples* with the module-level tokenizer.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    maximum length of 128.

    Args:
        examples: Mapping with a ``"text"`` key (a batch of rows when used
            with ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encoding_options)

# Tokenize the dataset in batches and drop the raw "text" column.
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Data collator for masked language modeling, configured with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
)

# Stap 4: Stel de training parameters in

In [None]:
# Training configuration: one epoch, evaluation at the end of each epoch.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` and the Trainer's `tokenizer`
# argument to `processing_class` -- confirm against the installed version.
training_args = TrainingArguments(
    # Output location
    output_dir="./resultaat",
    overwrite_output_dir=True,
    # Schedule
    num_train_epochs=1,
    evaluation_strategy="epoch",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Build the Trainer: train on the "train" split, evaluate on "validation".
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

# Stap 5: Train het model

In [None]:
# Start training with the Trainer configured above. The call is left as a
# bare expression so the notebook cell displays its return value.
trainer.train()

# Stap 6: Evalueer het model

In [None]:
# Evaluate with the Trainer (its eval_dataset is the "validation" split)
# and print the returned results.
eval_results = trainer.evaluate()
print(f"Evaluatieresultaten: {eval_results}")

# Example: predict candidate tokens for masked text
def mask_and_predict(text):
    """Return the five highest-scoring fillers for the first mask token in *text*.

    Uses the module-level ``tokenizer`` and ``model``. Only the first mask
    token in the text is considered.

    Args:
        text: Input string containing at least one mask token.

    Returns:
        A list of five decoded token strings, highest score first.

    Raises:
        IndexError: If *text* contains no mask token.
    """
    # The Trainer may have moved the model to an accelerator; the inputs
    # must live on the same device or the forward pass fails.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    mask_positions = torch.where(
        inputs["input_ids"] == tokenizer.mask_token_id
    )[1]
    if mask_positions.numel() == 0:
        # Same exception type the tensor indexing would raise, but with a
        # message that explains the actual problem.
        raise IndexError(
            f"no {tokenizer.mask_token} token found in text: {text!r}"
        )

    # Disable dropout for inference, then restore the previous mode so the
    # function has no lasting effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    first_mask_logits = logits[0, mask_positions[0], :]
    top_5_tokens = torch.topk(first_mask_logits, 5).indices.tolist()
    return [tokenizer.decode([token_id]) for token_id in top_5_tokens]

# Test het model met een gemaskeerde zin

In [None]:
# Run the prediction helper on a sample sentence containing one [MASK]
# token and print the returned predictions.
sample_text = "Machine learning is a [MASK] field."
predicted_tokens = mask_and_predict(sample_text)
print(f"Top 5 voorspellingen voor gemaskeerd woord in '{sample_text}': {predicted_tokens}")