In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hugging Face checkpoint used as the starting point
model_name = "Babelscape/wikineural-multilingual-ner"

# Tag inventory plus the two lookup tables (index -> tag, tag -> index)
label_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

# Tokenizer that ships with the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model with a classification head sized for `label_list`.
# `ignore_mismatched_sizes=True` lets loading continue when the checkpoint's
# head has a different shape; those weights are then newly initialised and
# need training before the model is usable for prediction.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    ignore_mismatched_sizes=True,
    num_labels=len(label_list),
)

# Store the tag names on the model config
model.config.label2id = label2id
model.config.id2label = id2label





Some weights of BertForTokenClassification were not initialized from the model checkpoint at Babelscape/wikineural-multilingual-ner and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from datasets import load_from_disk

# Load the saved test split, then print its concrete type followed by the
# dataset summary (features and row count).
dataset = load_from_disk("Data_en/test_en")
for summary in (type(dataset), dataset):
    print(summary)



<class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})


In [7]:
pip install 'accelerate>=0.26.0'


Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate>=0.26.0
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.2.1-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.4/336.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-1.2.1
Note: you may need to restart the kernel to use updated packages.


In [10]:
!pip install accelerate>=0.26.0
!pip install transformers[torch]


Defaulting to user installation because normal site-packages is not writeable


In [6]:
from datasets import load_from_disk

# Load the validation split and report the outcome.
# On failure the message is printed and the exception is re-raised, so the
# notebook does not continue with `eval_dataset` left undefined.
try:
    eval_dataset = load_from_disk("Data_en/validation_en")
except Exception as e:
    print(f"Fehler beim Laden des Datasets: {e}")
    raise
else:
    print("Dataset erfolgreich geladen!")
    print(eval_dataset)


Dataset erfolgreich geladen!
Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'spans', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})


In [7]:
# Load the train / validation / test splits from disk, in that order
split_paths = ("Data_en/train_en", "Data_en/validation_en", "Data_en/test_en")
train_dataset, eval_dataset, test_dataset = map(load_from_disk, split_paths)

In [14]:
import os

# Show which entries exist in the validation split directory
validation_entries = os.listdir("Data_en/validation_en")
print(validation_entries)


['.ipynb_checkpoints', 'state.json', 'data-00000-of-00001.arrow', 'dataset_info.json']


In [17]:
# Preview the validation split: first the single row 0, then rows 0-4 as a batch
for selector in (0, slice(None, 5)):
    print(eval_dataset[selector])


{'tokens': ['Sioux', 'Falls', 'Arena', 'Sioux', 'Falls', 'South', 'Dakota'], 'ner_tags': [3, 4, 4, 0, 5, 6, 6, 6, 6, 0], 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en'], 'spans': ['ORG: Sioux Falls Arena', 'LOC: Sioux Falls , South Dakota'], 'input_ids': [101, 72663, 23118, 17951, 72663, 23118, 11056, 21435, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 4, 4, 0, 5, 6, 6, -100]}
{'tokens': [['Sioux', 'Falls', 'Arena', 'Sioux', 'Falls', 'South', 'Dakota'], ['George', 'Randolph', 'Hearst', 'Jr'], ['Conch', 'Key', 'Florida'], ['Fairground', 'by', 'Simply', 'Red', 'Bad', 'Romance', 'by', 'Lady', 'Gaga'], ['The', 'Game', 'Ice', 'Cube', 'Dr', 'Dre']], 'ner_tags': [[3, 4, 4, 0, 5, 6, 6, 6, 6, 0], [1, 2, 2, 2, 2, 2], [5, 6, 6, 6], [0, 0, 0, 3, 4, 0, 3, 4, 0, 0, 1, 2], [0, 0, 0, 1, 2, 0, 1, 2]], 'langs': [['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en'], ['en', 'en', 'en', 'en', 'en', 'en'

In [15]:
!pip install --upgrade datasets


Defaulting to user installation because normal site-packages is not writeable


In [6]:
# Each line should read <class 'datasets.arrow_dataset.Dataset'>
for split in (train_dataset, eval_dataset):
    print(type(split))


<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification

# Collator matching the task: `model` is a token-classification model, so each
# batch needs its per-token `labels` padded alongside the inputs. The seq2seq
# collator used previously is meant for encoder-decoder models.
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="./results",             # checkpoints are written here
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",       # evaluate at the end of every epoch
    save_strategy="epoch",             # save a checkpoint at the end of every epoch
    learning_rate=1e-5,                # learning rate
    per_device_train_batch_size=8,    # training batch size per device
    per_device_eval_batch_size=8,     # evaluation batch size per device
    num_train_epochs=3,                # number of epochs
    weight_decay=0.01,                 # weight decay
    save_total_limit=2,                # cap on the number of checkpoints kept
    load_best_model_at_end=True,       # reload the best checkpoint when training ends
    logging_dir="./logs",              # log directory
    logging_steps=10,                  # log every 10 steps
    push_to_hub=False                  # set to True to share the model on the Hub
)


# Build the trainer from the model, arguments, collator and the two splits
trainer = Trainer(
    model=model,
    args=training_args,          # training configuration defined above
    data_collator=data_collator, # batches and pads the examples
    # NOTE(review): running this cell emitted a warning at `Trainer(`; it may
    # concern the `tokenizer` argument (possibly superseded by
    # `processing_class`) - confirm against the installed version.
    tokenizer=tokenizer,         # tokenizer handed to the trainer
    train_dataset=train_dataset, # training split
    eval_dataset=eval_dataset    # validation split
)


  trainer = Trainer(


In [11]:
# Smoke test: run the training loop once on a tiny subset of each split.
SMOKE_ROWS = 10
small_train_dataset = train_dataset.select(range(min(SMOKE_ROWS, len(train_dataset))))
small_eval_dataset = eval_dataset.select(range(min(SMOKE_ROWS, len(eval_dataset))))

trainer.train_dataset = small_train_dataset
trainer.eval_dataset = small_eval_dataset
try:
    smoke_output = trainer.train()
finally:
    # Always point the trainer back at the full splits; otherwise a later
    # `trainer.train()` would silently train on only these few rows.
    trainer.train_dataset = train_dataset
    trainer.eval_dataset = eval_dataset

# NOTE(review): this run still trains the same `model` object the trainer
# holds - reload the model first if the full run must start from the
# untouched checkpoint.
smoke_output

Epoch,Training Loss,Validation Loss
1,1.6412,2.006855
2,1.6307,1.977502
3,1.757,1.965532


TrainOutput(global_step=6, training_loss=1.6600581407546997, metrics={'train_runtime': 12.7864, 'train_samples_per_second': 2.346, 'train_steps_per_second': 0.469, 'total_flos': 532824503112.0, 'train_loss': 1.6600581407546997, 'epoch': 3.0})

In [9]:
# Run training with the datasets currently attached to the trainer and
# display the returned summary.
train_output = trainer.train()
train_output


Epoch,Training Loss,Validation Loss
1,0.5866,0.590594
2,0.7384,0.572959
3,0.4378,0.579609


TrainOutput(global_step=7500, training_loss=0.5479087477684021, metrics={'train_runtime': 2817.8395, 'train_samples_per_second': 21.293, 'train_steps_per_second': 2.662, 'total_flos': 607444431225984.0, 'train_loss': 0.5479087477684021, 'epoch': 3.0})

In [10]:
# Write the model and the tokenizer into the same directory
save_dir = "./finetuned_wikineural"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./finetuned_wikineural/tokenizer_config.json',
 './finetuned_wikineural/special_tokens_map.json',
 './finetuned_wikineural/vocab.txt',
 './finetuned_wikineural/added_tokens.json',
 './finetuned_wikineural/tokenizer.json')