In [10]:
!pip install -U transformers datasets evaluate seqeval -q


In [11]:
# Attach Google Drive to the Colab filesystem so Drive-hosted files are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Locations on the mounted Drive ("MyDrive"). Copy the project folder and the
# CoNLL-labelled file there before running the rest of the notebook.
drive_root = "/content/drive/MyDrive"
project_path = drive_root + "/EthioMart-NER-Project"
data_path = drive_root + "/conll_labeled.txt"


Parse CoNLL Format to Dataset

In [13]:
from datasets import Dataset
import pandas as pd

def read_conll(file_path):
    """Read a CoNLL-style labelled file into a DataFrame with one row per sentence.

    Every non-blank line holds a token followed by its tag, separated by
    whitespace. The last whitespace-separated field is the tag; everything
    before it is joined with single spaces to form the token, so a phrase
    such as "Addis Ababa" stays one token. Blank lines separate sentences.
    Lines with fewer than two fields are skipped.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        pandas.DataFrame with two columns, ``tokens`` and ``ner_tags``; each
        cell is the list of tokens / tags of one sentence.
    """
    tokens = []
    tags = []

    sentence_tokens = []
    sentence_tags = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: close the sentence collected so far (if any).
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    tags.append(sentence_tags)
                    sentence_tokens, sentence_tags = [], []
                continue
            if len(parts) >= 2:
                sentence_tokens.append(" ".join(parts[:-1]))  # Handles phrases like "Addis Ababa"
                sentence_tags.append(parts[-1])

    # The file may end without a trailing blank line; without this flush the
    # last sentence would be silently dropped.
    if sentence_tokens:
        tokens.append(sentence_tokens)
        tags.append(sentence_tags)

    return pd.DataFrame({'tokens': tokens, 'ner_tags': tags})





# Load and convert to HuggingFace Dataset
df = read_conll(data_path)
# Fail fast with a clear message: an empty frame would otherwise only surface
# later as an empty label list and a model configured with zero labels.
if df.empty:
    raise ValueError(f"No labelled sentences were parsed from {data_path!r}")
dataset = Dataset.from_pandas(df)


In [17]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is loaded here and whose weights are fine-tuned later.
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tag vocabulary: every distinct tag seen in the data, sorted so that the
# tag -> id assignment is the same on every run.
label_list = sorted({tag for sentence_tags in df["ner_tags"] for tag in sentence_tags})
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))
num_labels = len(label_list)


In [18]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its word-level tags to sub-word tokens.

    Args:
        example: A dataset row with ``tokens`` (list of words) and ``ner_tags``
            (list of tag strings, one per word).

    Returns:
        The tokenizer output for the sentence with an added ``labels`` list
        holding one label id per sub-word token:

        * tokens without a word id (special tokens) are masked with -100;
        * the first sub-word of a word gets the id of the word's tag;
        * later sub-words of the same word get the ``I-`` counterpart of a
          ``B-`` tag, or the word's own tag id when that ``I-`` tag does not
          exist in ``label_to_id``; any other tag is repeated unchanged.
    """
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_tags = example["ner_tags"]

    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            labels.append(-100)
        else:
            tag = word_tags[word_idx]
            if word_idx == previous_word_idx and tag.startswith("B-"):
                # Continuation piece of a B- word: swap only the prefix. The
                # I- tag may never occur in a small dataset, so fall back to
                # the word's own tag id instead of raising KeyError.
                inside_tag = "I-" + tag[2:]
                labels.append(label_to_id.get(inside_tag, label_to_id[tag]))
            else:
                labels.append(label_to_id[tag])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the alignment to every sentence (one example at a time).
tokenized_dataset = dataset.map(tokenize_and_align_labels)


Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [15]:
from transformers import AutoTokenizer

# NOTE(review): this cell repeats the tokenizer / label-mapping setup that also
# appears earlier in the notebook (it does not define `num_labels`).
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sorted, de-duplicated tag vocabulary and its two lookup tables.
unique_tags = {tag for tags in df['ner_tags'] for tag in tags}
label_list = sorted(unique_tags)
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = {idx: label for idx, label in enumerate(label_list)}

def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence to a fixed length and align its tags to sub-word tokens.

    The sentence is truncated / padded to 128 tokens (``padding='max_length'``).

    Args:
        example: A dataset row with ``tokens`` (list of words) and ``ner_tags``
            (list of tag strings, one per word).

    Returns:
        The tokenizer output for the sentence with an added ``labels`` list
        holding one label id per token position:

        * positions without a word id (special and padding tokens) are
          masked with -100;
        * the first sub-word of a word gets the id of the word's tag;
        * later sub-words of the same word get the ``I-`` counterpart of a
          ``B-`` tag, or the word's own tag id when that ``I-`` tag does not
          exist in ``label_to_id``; any other tag is repeated unchanged.
    """
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128,
    )
    word_tags = example["ner_tags"]

    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids():
        if word_idx is None:
            labels.append(-100)
        else:
            tag = word_tags[word_idx]
            if word_idx == previous_word_idx and tag.startswith("B-"):
                # Convert B-XXX to I-XXX by swapping only the prefix. The I-
                # tag may never occur in a small dataset, so fall back to the
                # word's own tag id instead of raising KeyError.
                inside_tag = "I-" + tag[2:]
                labels.append(label_to_id.get(inside_tag, label_to_id[tag]))
            else:
                labels.append(label_to_id[tag])
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Apply the alignment to every sentence (one example at a time).
tokenized_dataset = dataset.map(tokenize_and_align_labels)


Map:   0%|          | 0/28 [00:00<?, ? examples/s]

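TODO: the cell above redefines `tokenize_and_align_labels` (adding fixed-length padding) and repeats the earlier tokenizer/label setup; it is to be consolidated with the earlier cells or removed.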


Model Setup and Training

In [19]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head sized to the
# tag vocabulary; the id <-> label maps are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run, gathered in one place for easy tweaking.
training_config = dict(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=10,
    save_steps=50,
    eval_strategy="no",  # use eval_strategy instead of evaluation_strategy
    report_to="none",
)
training_args = TrainingArguments(**training_config)


In [22]:
from transformers import Trainer, DataCollatorForTokenClassification

# Collator that batches the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # `tokenizer=` is deprecated in Trainer and triggers a warning on this
    # call; `processing_class=` is its replacement and keeps the notebook
    # working with the latest release pulled in by the unpinned install.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()


  trainer = Trainer(


Step,Training Loss
10,1.4182
20,0.8255


TrainOutput(global_step=20, training_loss=1.1218275070190429, metrics={'train_runtime': 710.8943, 'train_samples_per_second': 0.197, 'train_steps_per_second': 0.028, 'total_flos': 22792036929408.0, 'train_loss': 1.1218275070190429, 'epoch': 5.0})

In [23]:
# Write the fine-tuned model to Drive so it outlives the Colab session.
model_output_dir = "/content/drive/MyDrive/fine_tuned_xlm_roberta"
trainer.save_model(model_output_dir)


In [None]:
# Show which transformers release is installed (useful when reproducing the run).
import transformers

installed_version = transformers.__version__
print(installed_version)
