In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
from sklearn.metrics import classification_report
import torch

In [None]:
# Load the pre-trained model and tokenizer
model_name = "./finetuned_wikineural"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [4]:
# Load preprocessed French dataset
from datasets import load_from_disk
french_dataset = load_from_disk("Data/Data_fr/test_fr")

In [5]:
print(french_dataset[0])  # Print the first example
print(french_dataset.features)  # Print the dataset features


{'tokens': ['Upton', 'Park', 'exempt'], 'ner_tags': [0, 0, 3, 4, 0, 0, 0, 0, 0], 'langs': ['fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr'], 'spans': ['ORG: Upton Park'], 'input_ids': [101, 13656, 11183, 11239, 11419, 10451, 14971, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, -100, 0, 3, -100, -100, -100]}
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'spans': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-

In [11]:
# Visualize the dataset nicely with spaCy library for visualization:
!pip install spacy


Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.11-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0

In [26]:
!python -m spacy download "fr_core_news_sm"


Defaulting to user installation because normal site-packages is not writeable
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [27]:
print(doc.ents)


(exempt,)


In [28]:
nlp = spacy.load("fr_core_news_sm")  # For French text


In [None]:
html = f"<div>{doc.text}</div>"
with open("ner_visualization.html", "w", encoding="utf-8") as f:
    f.write(html)



In [29]:
import spacy
from spacy import displacy
from IPython.display import display, HTML
html = displacy.render(doc, style="ent", page=True)
with open("ner_visualization.html", "w", encoding="utf-8") as f:
    f.write(html)


# Load the language model (use "fr_core_news_sm" for French if available)
nlp = spacy.load("en_core_web_sm")

# Create a spaCy Doc object
tokens = french_dataset[0]['tokens']
doc = nlp(" ".join(tokens))

# Create entity spans
ents = []
for start, tag in enumerate(french_dataset[0]['ner_tags']):
    if tag != 0:  # Not 'O'
        end = start + 1
        while end < len(french_dataset[0]['ner_tags']) and french_dataset[0]['ner_tags'][end] == tag + 1:
            end += 1
        # Ensure the span doesn't exceed the document length
        end = min(end, len(doc))
        if start < len(doc):
            ents.append(spacy.tokens.Span(doc, start, end, label=french_dataset.features['ner_tags'].feature.names[tag]))

doc.ents = ents

# Visualize
colors = {"ORG": "#7aecec", "PER": "#feca74", "LOC": "#ff9561"}
options = {"ents": ["ORG", "PER", "LOC"], "colors": colors}

# Render the visualization
html = displacy.render(doc, style="ent", options=options, page=True)

# Display in Jupyter notebook
from IPython.display import display, HTML
display(HTML(html))



TypeError: write() argument must be str, not None

In [6]:
def prepare_dataset(examples):
    # The tokenization step is not needed as input_ids are already present
    return {
        'input_ids': examples['input_ids'],
        'attention_mask': examples['attention_mask'],
        'token_type_ids': examples['token_type_ids'],
        'labels': examples['labels']
    }

tokenized_french = french_dataset.map(prepare_dataset, batched=True)


In [9]:
# Create a data loader:
from torch.utils.data import DataLoader

def collate_fn(batch):
    return {
        'input_ids': torch.tensor([item['input_ids'] for item in batch]),
        'attention_mask': torch.tensor([item['attention_mask'] for item in batch]),
        'token_type_ids': torch.tensor([item['token_type_ids'] for item in batch]),
        'labels': torch.tensor([item['labels'] for item in batch])
    }

data_loader = DataLoader(tokenized_french, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [11]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    token_type_ids = [torch.tensor(item['token_type_ids']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    
    # Pad sequences to the maximum length in the batch
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Use -100 for padding in labels
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'labels': labels
    }

data_loader = DataLoader(tokenized_french, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [12]:
from tqdm import tqdm
from seqeval.metrics import classification_report
import torch

model.eval()
all_predictions = []
all_true_labels = []

label_map = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

with torch.no_grad():
    for batch in tqdm(data_loader, desc="Evaluating"):
        # Überprüfe, ob alle Keys vorhanden sind
        assert 'input_ids' in batch and 'attention_mask' in batch and 'labels' in batch, \
            "Batch fehlt erforderliche Schlüssel!"

        # Inputs und Labels in das richtige Device laden
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(model.device)
        
        # Vorhersagen generieren
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
        
        # Labels und Vorhersagen verarbeiten
        for pred, label, mask in zip(predictions, labels, batch['attention_mask']):
            pred = pred.cpu().numpy()
            label = label.cpu().numpy()
            mask = mask.cpu().numpy()

            # Filtern mit der attention_mask
            true_label = [label_map.get(l, 'O') for l, m in zip(label, mask) if m != 0]
            pred_label = [label_map.get(p, 'O') for p, m in zip(pred, mask) if m != 0]
            
            # Listenlänge angleichen (falls nötig)
            if len(true_label) != len(pred_label):
                min_len = min(len(true_label), len(pred_label))
                true_label = true_label[:min_len]
                pred_label = pred_label[:min_len]
            
            all_predictions.append(pred_label)
            all_true_labels.append(true_label)

# Klassifikationsbericht ausgeben
print(classification_report(all_true_labels, all_predictions))


Evaluating: 100%|██████████| 625/625 [01:10<00:00,  8.87it/s]


              precision    recall  f1-score   support

         LOC       0.17      0.19      0.18      4640
         ORG       0.13      0.14      0.14      5089
         PER       0.17      0.16      0.17      4714

   micro avg       0.16      0.17      0.16     14443
   macro avg       0.16      0.17      0.16     14443
weighted avg       0.16      0.17      0.16     14443

