In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from datasets import load_dataset
from sklearn.metrics import classification_report
import torch

In [5]:
# Load the pre-trained model and tokenizer
model_name = "Babelscape/wikineural-multilingual-ner"
# model_name = ""
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [6]:
# Load preprocessed French dataset
from datasets import load_from_disk
french_dataset = load_from_disk("Data/Data_fr/test_fr")

In [7]:
print(french_dataset[0])  # Print the first example
print(french_dataset.features)  # Print the dataset features


{'tokens': ['Upton', 'Park', 'exempt'], 'ner_tags': [0, 0, 3, 4, 0, 0, 0, 0, 0], 'langs': ['fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr', 'fr'], 'spans': ['ORG: Upton Park'], 'input_ids': [101, 13656, 11183, 11239, 11419, 10451, 14971, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, -100, 0, 3, -100, -100, -100]}
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'spans': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-

In [8]:
def prepare_dataset(examples):
    # The tokenization step is not needed as input_ids are already present
    return {
        'input_ids': examples['input_ids'],
        'attention_mask': examples['attention_mask'],
        'token_type_ids': examples['token_type_ids'],
        'labels': examples['labels']
    }

tokenized_french = french_dataset.map(prepare_dataset, batched=True)


In [9]:
# Create a data loader:
from torch.utils.data import DataLoader

def collate_fn(batch):
    return {
        'input_ids': torch.tensor([item['input_ids'] for item in batch]),
        'attention_mask': torch.tensor([item['attention_mask'] for item in batch]),
        'token_type_ids': torch.tensor([item['token_type_ids'] for item in batch]),
        'labels': torch.tensor([item['labels'] for item in batch])
    }

data_loader = DataLoader(tokenized_french, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [10]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    token_type_ids = [torch.tensor(item['token_type_ids']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    
    # Pad sequences to the maximum length in the batch
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Use -100 for padding in labels
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,
        'labels': labels
    }

data_loader = DataLoader(tokenized_french, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [13]:
from tqdm import tqdm
from seqeval.metrics import classification_report

model.eval()
all_predictions = []
all_true_labels = []

label_map = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

with torch.no_grad():
    for batch in tqdm(data_loader, desc="Evaluating"):
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(model.device)
        
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
        
        for pred, label, mask in zip(predictions, labels, batch['attention_mask']):
            pred = pred.cpu().numpy()
            label = label.cpu().numpy()
            mask = mask.cpu().numpy()
            
            true_label = [label_map.get(l.item(), 'O') for l, m in zip(label, mask) if m != 0]
            pred_label = [label_map.get(p.item(), 'O') for p, m in zip(pred, mask) if m != 0]
            
            # Ensure both lists have the same length
            min_len = min(len(true_label), len(pred_label))
            true_label = true_label[:min_len]
            pred_label = pred_label[:min_len]
            
            all_predictions.append(pred_label)
            all_true_labels.append(true_label)

# Print the classification report
print(classification_report(all_true_labels, all_predictions))


Evaluating: 100%|██████████| 625/625 [01:10<00:00,  8.89it/s]


              precision    recall  f1-score   support

         LOC       0.14      0.16      0.15      4640
         ORG       0.18      0.07      0.10      5089
         PER       0.12      0.13      0.13      4714

   micro avg       0.14      0.12      0.13     14443
   macro avg       0.15      0.12      0.13     14443
weighted avg       0.15      0.12      0.13     14443

