In [None]:
import os
from getpass import getpass

import numpy as np
import pandas as pd
import torch
from google.colab import drive
from huggingface_hub import HfFolder
from huggingface_hub import login
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification
# Mount Google Drive at /content/drive (Colab only); the test CSV and the
# result files used further down live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Never store the HuggingFace token in the notebook: take it from the HF_TOKEN
# environment variable, and fall back to an interactive prompt whose input is
# not echoed into the cell output.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("HuggingFace token: ")

# Authenticate once with the token obtained above.
# NOTE(review): login(token=...) is documented to store the token locally, which
# is why the former HfFolder.save_token() call was dropped as redundant -
# confirm against the installed huggingface_hub version.
login(token=HF_TOKEN)

# NER label vocabulary: tag string -> integer class id.
# NOTE(review): presumably mirrors the label ids the model was fine-tuned with;
# verify against model.config.id2label before trusting the decoded tags.
tag_to_id = {
    'B-geo': 0, 'O': 1, 'B-gpe': 2, 'B-per': 3, 'I-per': 4, 'B-tim': 5,
    'B-org': 6, 'I-org': 7, 'B-art': 8, 'I-art': 9, 'I-tim': 10,
    'B-eve': 11, 'I-eve': 12, 'I-geo': 13, 'I-gpe': 14, 'B-nat': 15, 'I-nat': 16
}

# Inverse lookup (class id -> tag string) used to decode model predictions.
id_to_tag = {v: k for k, v in tag_to_id.items()}

In [None]:
def _first_subtoken_tags(word_ids, token_label_ids, num_words, label_map, fallback_tag='O'):
    """Reduce per-token label ids to exactly one tag per input word.

    Args:
        word_ids: For each token position, the index of the word it came from,
            or None for special tokens (as returned by ``encoding.word_ids``).
        token_label_ids: Predicted class id for each token position.
        num_words: Number of words in the sentence; the result has this length.
        label_map: Mapping from class id to tag string.
        fallback_tag: Tag given to words that own no token (e.g. a word that
            tokenizes to nothing, or one cut off by truncation).

    Returns:
        A list of ``num_words`` tags; each word takes the prediction of its
        first sub-token.
    """
    tags = [fallback_tag] * num_words
    seen_words = set()
    for position, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx in seen_words:
            continue
        seen_words.add(word_idx)
        tags[word_idx] = label_map[token_label_ids[position]]
    return tags


def test_model(test_file, result_file, metrics_file):
    """Run the fine-tuned NER model over a test CSV and write predictions and metrics.

    The CSV must contain the columns 'Sentence #', 'Word' and 'Tag'. Rows are
    grouped into sentences by 'Sentence #', every sentence is tagged by the
    model, and the per-word prediction is stored in a new 'Pred_Tag' column.
    Class ids are decoded with the module-level ``id_to_tag`` mapping.

    Args:
        test_file: Path of the input CSV.
        result_file: Path the CSV with the added 'Pred_Tag' column is written to.
        metrics_file: Path of the text report (accuracy and macro
            precision/recall/F1).

    Raises:
        ValueError: If any row has no 'Sentence #' value; such rows would be
            dropped by the grouping and could never receive a prediction.
    """
    model_name = "Debk/Ben_NER_xlm-roberta-large"
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
    model = XLMRobertaForTokenClassification.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    df = pd.read_csv(test_file)
    if df['Sentence #'].isna().any():
        # Fail before the expensive inference loop rather than after it.
        raise ValueError("Rows with a missing 'Sentence #' value cannot be assigned to a sentence")

    # sort=False keeps sentences in file order. Predictions are additionally
    # keyed by row label, so they land on the right rows even when the sentence
    # ids would sort differently from the order in which they appear.
    sentence_groups = df.groupby('Sentence #', sort=False)

    row_labels = []
    all_predictions = []

    with torch.no_grad():
        for _, sentence_rows in tqdm(sentence_groups,
                                     total=sentence_groups.ngroups,
                                     desc="Predicting"):
            words = sentence_rows['Word'].tolist()
            # truncation=True caps the input at the tokenizer's configured
            # maximum length; words cut off this way keep the fallback tag.
            # NOTE(review): assumes the tokenizer's model_max_length matches
            # what the model accepts - confirm.
            encoding = tokenizer(words,
                                 is_split_into_words=True,
                                 return_tensors="pt",
                                 truncation=True)
            inputs = {k: v.to(device) for k, v in encoding.items()}

            outputs = model(**inputs)
            # One argmax and one device->host transfer per sentence instead of
            # one per token.
            token_label_ids = outputs.logits[0].argmax(dim=-1).tolist()

            sentence_tags = _first_subtoken_tags(
                encoding.word_ids(0), token_label_ids, len(words), id_to_tag
            )
            row_labels.extend(sentence_rows.index)
            all_predictions.extend(sentence_tags)

    # Assigning a Series aligns on the row labels, not on list position.
    df['Pred_Tag'] = pd.Series(all_predictions, index=row_labels, dtype=object)
    df.to_csv(result_file, index=False)

    # zero_division=0 scores labels that are never predicted (or never occur)
    # as 0 without emitting an undefined-metric warning.
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        df['Tag'], df['Pred_Tag'], average='macro', zero_division=0
    )
    accuracy = accuracy_score(df['Tag'], df['Pred_Tag'])

    with open(metrics_file, 'w', encoding='utf-8') as f:
        f.write(f"Model: {model_name}\n")
        f.write(f"Number of sentences: {sentence_groups.ngroups}\n")
        f.write(f"Accuracy: {accuracy:.4f}\n")
        f.write(f"Precision (Macro): {precision_macro:.4f}\n")
        f.write(f"Recall (Macro): {recall_macro:.4f}\n")
        f.write(f"F1 Score (Macro): {f1_macro:.4f}\n")

In [None]:
# Evaluate on the held-out test split; the predictions CSV and the metrics
# report are written next to the dataset on the mounted Drive.
test_model(
    test_file='/content/drive/MyDrive/NER_Dataset/b-ner-test.csv',
    result_file='/content/drive/MyDrive/NER_Dataset/Result-b-ner-test.csv',
    metrics_file='/content/drive/MyDrive/NER_Dataset/Result-b-ner-test.txt',
)


Predicting: 100%|██████████| 4429/4429 [01:14<00:00, 59.57it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
