# Importeer de benodigde libraries

In [1]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


# Stap 1: Laad de dataset

In [2]:
# Load the full dataset under its own name so that `dataset` below always
# refers to the reduced splits and is never rebound to a different kind of object.
raw_dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Share of every split that is thrown away. With 0.997 discarded, roughly
# 0.3% of each split is kept, which keeps the demo run short.
DISCARD_FRACTION = 0.997
# Fixed seed so the same rows are selected on every run.
SPLIT_SEED = 42

# Reduced splits in the dictionary format used by the later cells.
# train_test_split() is used purely as a seeded random sampler here: its
# "train" side is the small part that is kept.
dataset = {
    split_name: raw_dataset[split_name].train_test_split(
        test_size=DISCARD_FRACTION, seed=SPLIT_SEED
    )["train"]
    for split_name in ("train", "validation")
}

# Per-split aliases, kept for code that refers to the subsets directly.
train_dataset = dataset["train"]
validation_dataset = dataset["validation"]

# Stap 2: Laad de tokenizer en het model

In [3]:
# Checkpoint name shared by the model and its tokenizer.
model_name = "distilbert-base-uncased"

# Masked-LM model initialised from the checkpoint.
model = DistilBertForMaskedLM.from_pretrained(model_name)

# Tokenizer loaded from the same checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Stap 3: Preprocess de data

In [4]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook's tokenizer.

    Every text is truncated to at most 128 tokens and padded up to exactly
    that length.

    Args:
        examples: Mapping with a ``"text"`` key holding the raw text(s).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize both splits (train first, then validation) and drop the raw "text"
# column from the result.
tokenized_datasets = {
    split_name: dataset[split_name].map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
    )
    for split_name in ("train", "validation")
}

# Per-split aliases for the tokenized data.
tokenized_train = tokenized_datasets["train"]
tokenized_validation = tokenized_datasets["validation"]

# Data collator for masked language modelling with a masking probability of 0.1.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.1,
)

# Stap 4: Stel de training parameters in

In [5]:
# Metric callback passed to the Trainer: accuracy over the labelled positions
def compute_accuracy(p):
    """Compute token-level accuracy over the positions that carry a label.

    Args:
        p: Evaluation output exposing ``predictions`` (per-token scores, or a
            tuple whose first element holds them) and ``label_ids``. Both are
            expected to be NumPy arrays with matching leading dimensions.

    Returns:
        dict: ``{"accuracy": value}``, where ``value`` is the fraction of
        labelled positions whose highest-scoring class equals the label. It is
        ``nan`` when no position carries a label, since accuracy is undefined
        in that case.
    """
    scores = p.predictions
    # Some models return several outputs; the scores come first in that case.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(axis=-1)
    labels = p.label_ids

    # A label of -100 marks a position that must not be scored. With a
    # masked-LM collator that is every token that was not selected for
    # masking, not only the padding.
    scored = labels != -100
    if not scored.any():
        # Nothing to score (e.g. a tiny evaluation set in which no token
        # happened to be masked): report "undefined" instead of failing.
        return {"accuracy": float("nan")}

    # Element-wise comparison; equivalent to an accuracy score over the
    # flattened labelled positions.
    accuracy = float((preds[scored] == labels[scored]).mean())
    return {"accuracy": accuracy}

# Mixed precision (fp16) is only usable on a CUDA device; enabling it
# unconditionally makes TrainingArguments fail on a CPU-only machine, so it is
# switched on only when a GPU is actually present.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./resultaat",
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version before changing.
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    learning_rate=3e-05,  # Reduced learning rate
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,
    num_train_epochs=5,  # Fewer epochs for quicker, stable testing
    weight_decay=0.01,
    fp16=use_fp16,  # Mixed precision only when a GPU is available
)

# Create a Trainer for the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # NOTE(review): recent transformers releases appear to deprecate
    # `tokenizer=` in favour of `processing_class=` -- confirm against the
    # installed version before changing.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_accuracy  # Add the accuracy metric
)

  trainer = Trainer(


# Stap 5: Train het model

In [6]:
# Run training with the Trainer configured above. This is the cell's last
# expression, so the value returned by train() is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.320791,0.756098
2,No log,2.428404,0.515152
3,No log,1.404012,0.705882
4,No log,1.7952,0.617647
5,No log,3.053416,0.534884


TrainOutput(global_step=140, training_loss=2.227349417550223, metrics={'train_runtime': 307.6103, 'train_samples_per_second': 1.788, 'train_steps_per_second': 0.455, 'total_flos': 18227158963200.0, 'train_loss': 2.227349417550223, 'epoch': 5.0})

# Stap 6: Evalueer het model

In [7]:
# Run one evaluation pass with the Trainer and print the returned metrics.
# NOTE(review): this result can differ from the last epoch's row in the
# training log although the weights are the same -- presumably because the
# masked-LM collator draws new masked positions on every pass; confirm.
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

# Example of predictions on masked text
def mask_and_predict(text, top_k=5):
    """Return the model's best candidate tokens for the first mask in ``text``.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input string; expected to contain the tokenizer's mask token.
            If it contains several, only the first one is predicted.
        top_k: Number of candidates to return (default 5).

    Returns:
        list[str]: The decoded candidate tokens, highest score first. An empty
        list when ``text`` contains no mask token.
    """
    inputs = tokenizer(text, return_tensors="pt")
    mask_positions = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # Nothing to predict; without this guard the indexing below would
        # fail with an IndexError.
        return []
    # A plain int can index the logits regardless of which device they are on.
    first_mask = int(mask_positions[0])

    # The Trainer may have moved the model to an accelerator; the inputs have
    # to be on the same device as the model.
    inputs = inputs.to(model.device)

    # Predict in eval mode (dropout off), then restore the previous mode so
    # this helper does not change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # Scores over the vocabulary at the first mask position of the only
    # sequence in the batch.
    mask_token_logits = logits[0, first_mask, :]
    top_token_ids = torch.topk(mask_token_logits, top_k).indices.tolist()
    predicted_tokens = [tokenizer.decode([token_id]) for token_id in top_token_ids]
    return predicted_tokens

Evaluation Results: {'eval_loss': 1.2021193504333496, 'eval_accuracy': 0.75, 'eval_runtime': 1.7118, 'eval_samples_per_second': 6.426, 'eval_steps_per_second': 1.753, 'epoch': 5.0}


# Test het model met een gemaskeerde zin

In [8]:
# Demo sentence containing a single [MASK] placeholder for the model to fill in.
sample_text = "Artificial intelligence is transforming the [MASK] industry."
# Ask the model for its candidate tokens for the masked position.
predicted_tokens = mask_and_predict(sample_text)
print(f"Top 5 predictions for masked word in '{sample_text}': {predicted_tokens}")

Top 5 predictions for masked word in 'Artificial intelligence is transforming the [MASK] industry.': ['gaming', 'electronics', 'automotive', 'aerospace', 'pharmaceutical']
