In [None]:
!pip install transformers datasets
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117


In [None]:
import pandas as pd
from datasets import ClassLabel, Dataset, Features, Sequence, Value, load_dataset

# Location of the hand-labelled NER data in CoNLL format.
CONLL_PATH = '../Data/first_labeled_ner_data.conll'


def read_conll(path):
    """Parse a CoNLL-style file into parallel token and tag sequences.

    Sentences are separated by blank lines (``-DOCSTART-`` lines are treated
    as separators too). On every other line the first whitespace-separated
    column is taken as the token and the last column as its tag.

    Args:
        path: Path of the CoNLL file to read.

    Returns:
        A ``(sentences, tag_sequences)`` tuple of equally long lists; element
        ``i`` of each holds the tokens / tags of sentence ``i``.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    sentences, tag_sequences = [], []
    tokens, tags = [], []
    # utf-8-sig transparently drops a leading byte-order mark if one exists.
    with open(path, encoding='utf-8-sig') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            columns = raw_line.split()
            if not columns or columns[0] == '-DOCSTART-':
                # Sentence boundary: flush the sentence collected so far.
                if tokens:
                    sentences.append(tokens)
                    tag_sequences.append(tags)
                    tokens, tags = [], []
                continue
            if len(columns) < 2:
                raise ValueError(
                    f"{path}:{line_number}: expected '<token> <tag>', got {raw_line!r}"
                )
            tokens.append(columns[0])
            tags.append(columns[-1])
    # The last sentence is not necessarily followed by a blank line.
    if tokens:
        sentences.append(tokens)
        tag_sequences.append(tags)
    return sentences, tag_sequences


# Load the CoNLL format dataset.
# load_dataset() expects a Hub dataset name, a loading script or a data
# directory, not a raw .conll file, so the file is parsed by hand and wrapped
# in a Dataset with the usual `tokens` / `ner_tags` columns.
sentences, tag_sequences = read_conll(CONLL_PATH)

# Stable tag -> integer id mapping (sorted so ids are reproducible across runs).
label_list = sorted({tag for tags in tag_sequences for tag in tags})
label_to_id = {label: index for index, label in enumerate(label_list)}

dataset = Dataset.from_dict(
    {
        'tokens': sentences,
        'ner_tags': [[label_to_id[tag] for tag in tags] for tags in tag_sequences],
    },
    # ClassLabel keeps the tag names attached to the integer ids.
    features=Features({
        'tokens': Sequence(Value('string')),
        'ner_tags': Sequence(ClassLabel(names=label_list)),
    }),
)


In [None]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used by tokenize_and_align_labels below.
TOKENIZER_CHECKPOINT = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Uses the module-level ``tokenizer``. Each sequence of labels produced has
    exactly one entry per token of the corresponding tokenized sentence:

    * special and padding tokens (no word id) get ``-100``;
    * the first sub-token of every word gets that word's tag id;
    * remaining sub-tokens of the same word get ``-100``, so each word is
      counted once.

    Args:
        examples: A batch mapping with ``'tokens'`` (one list of words per
            sentence) and ``'ner_tags'`` (one list of per-word label ids per
            sentence).

    Returns:
        The tokenizer output for the batch with an added ``'labels'`` entry.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'], truncation=True, padding=True, is_split_into_words=True
    )
    labels = []
    for i, word_labels in enumerate(examples['ner_tags']):
        # word_ids maps every token position to the index of the word it came
        # from, or None for special / padding tokens.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == previous_word_id:
                # -100 marks positions to be ignored (same sentinel as before).
                label_ids.append(-100)
            else:
                label_ids.append(word_labels[word_id])
            previous_word_id = word_id
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Apply tokenization and label alignment to the dataset, one batch at a time.
tokenized_datasets = dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)


In [None]:
import inspect

from transformers import TrainingArguments

# The keyword selecting the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# Pick whichever name the installed version accepts so this cell runs on both.
_training_parameters = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_keyword = (
    "eval_strategy" if "eval_strategy" in _training_parameters else "evaluation_strategy"
)

# Hyper-parameters shared by every fine-tuning run in this notebook.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    **{_eval_strategy_keyword: "epoch"},  # evaluate at the end of every epoch
)


In [None]:
from transformers import Trainer, AutoModelForTokenClassification

# Fine-tune each model in a loop
models = ["xlm-roberta-base", "distilbert-base-multilingual-cased", "bert-base-multilingual-cased"]

for model_name in models:
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets,
        eval_dataset=validation_dataset,
    )
    trainer.train()
    trainer.save_model(f"./results/{model_name}")


In [None]:
# Evaluation
results = trainer.evaluate()
print(f"Evaluation results for {model_name}: {results}")


In [None]:
import pandas as pd

results_list = []

for model_name in models:
    # Load evaluation results and append to list
    eval_results = # Load the results for the model
    results_list.append({
        'Model': model_name,
        'Accuracy': eval_results['eval_accuracy'],
        'Precision': eval_results['eval_precision'],
        'Recall': eval_results['eval_recall'],
        'F1-Score': eval_results['eval_f1'],
    })

comparison_df = pd.DataFrame(results_list)
print(comparison_df)
