# Task 3: Fine Tune NER Model
**Objective**

Fine-tune a Named Entity Recognition (NER) model to extract key entities (e.g., products, prices, and location) from Amharic Telegram messages.

**Steps:**

1. Use Google Colab or any other environment with GPU support for faster training.
2. Install necessary libraries by running the following commands:
3. You will use the pre-trained XLM-Roberta or bert-tiny-amharic or afroxmlr model, which supports multilingual tasks, including Amharic.
4. Load the labeled dataset in CoNLL format from the previous task.
5. You can use Hugging Face's datasets library to load the data or manually parse the CoNLL format into a pandas DataFrame.
6. Tokenize the data and align the labels with tokens produced by the tokenizer
7. Set up training arguments, such as learning rate, number of epochs, batch size, and evaluation strategy.
8. Use Hugging Face's Trainer API to fine-tune the model.
9. Evaluate the fine-tuned model on the validation set to check performance.
10. After fine-tuning, save the model for future use.


In [3]:
pip install transformers datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-an

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from google.colab import drive
from sklearn.model_selection import train_test_split
from datasets import Dataset, Features, Sequence, ClassLabel, Value
from sklearn.metrics import classification_report


In [2]:
drive.mount('/content/drive')

file_path = '/content/drive/My Drive/tokens_labels.conll'

with open(file_path, 'r') as file:
    contents = file.readlines()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# extract tokens and labels from the dataset
def extract_tokens_labels(text):
  words = []
  labels = []
  # checks for English word
  def is_amharic(word):
      # Amharic characters are in the Unicode range: 1200-137F (hex)
      for char in word:
          if not (0x1200 <= ord(char) <= 0x137F):
              return False
      return True

  # split tokens and labels
  for con in content:
    con = con.strip().replace('[', '').replace(']', '').replace(',', '').replace("'", "").split(' ')
    if not(is_amharic(con[0])):
      pass
    else:
      words.append(con[0])
      labels.append(con[-1])

  return words, labels


In [47]:
# align tokens and labels
def align_token_label(text, tokenizer):
  # labels to id number
  label_to_id = {
      "O": 0,
      "B-LOC": 1,
      "I-LOC": 2,
      "B-PRODUCT": 3,
      "I-PRODUCT": 4,
      "B-PRICE": 5,
      "I-PRICE": 6
}
  tokens, labels = extract_tokens_labels(text)
  tokenized_inputs = tokenizer(tokens, truncation = True, padding = True, is_split_into_words = True)

  word_ids = tokenized_inputs.word_ids()
  aligned_labels = []

  previous_id = None
  for k,id in enumerate(word_ids):
    if id is None:
      aligned_labels.append(-100)

    elif id != previous_id:
      aligned_labels.append(label_to_id[labels[id]])

    else:
      aligned_labels.append(-100)

    previous_id = id
  tokenized_inputs['labels'] = aligned_labels
  # print(aligned_labels)
  return tokenized_inputs



In [52]:
# take a subset
content = contents[0:1000]

# split validation and train sets
train_data, validation_data = train_test_split(content, test_size=0.2, random_state=42)

# intializing model
tokenizer = AutoTokenizer.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")
model = AutoModelForTokenClassification.from_pretrained("masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0",gradient_checkpointing=True)



In [48]:
# Create dictionaries to hold the tokenized datasets
tokenized_datasets = {'train': [], 'validation': []}

batch_size = 4

# batch for faster computation
def batch_data(data, batch_size):
  for i in range(0, len(data), batch_size):
    yield data[i:i + batch_size]

# align_token_label for train dataset
for batch in batch_data(train_data,batch_size):
  tokenized_batch = [align_token_label(con, tokenizer) for con in batch]
  tokenized_datasets['train'].extend(tokenized_batch)

# align_token_label for validation dataset
for batch in batch_data(validation_data,batch_size):
    tokenized_batch = [align_token_label(con, tokenizer) for con in batch]
    tokenized_datasets['validation'].extend(tokenized_batch)

# Convert lists to Hugging Face Dataset objects
tokenized_datasets['train'] = Dataset.from_list(tokenized_datasets['train'])
tokenized_datasets['validation'] = Dataset.from_list(tokenized_datasets['validation'])

In [60]:
# fine tunning the model
training_args = TrainingArguments(
    output_dir = '/content/drive/My Drive/results',
    evaluation_strategy = 'epoch',
    learning_rate = 2e-5,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 4,
    num_train_epochs = 3,
    weight_decay = 0.01,
    fp16 = True # Enable mixed precision training
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['validation'],
)

# train the model
trainer.train()

# evaluate the model
trainer.evaluate()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,No log,1.1e-05
2,No log,8e-06
3,No log,8e-06


{'eval_loss': 8.087497917586006e-06,
 'eval_runtime': 8.4695,
 'eval_samples_per_second': 23.614,
 'eval_steps_per_second': 5.904,
 'epoch': 3.0}

In [61]:
model.save_pretrained("afroxlmr_fine_tuned_model")
tokenizer.save_pretrained("afroxlmr_fine_tuned_model")

('afroxlmr_fine_tuned_model/tokenizer_config.json',
 'afroxlmr_fine_tuned_model/special_tokens_map.json',
 'afroxlmr_fine_tuned_model/sentencepiece.bpe.model',
 'afroxlmr_fine_tuned_model/added_tokens.json',
 'afroxlmr_fine_tuned_model/tokenizer.json')

In [61]:
# for cleaning up memory
# import torch
# torch.cuda.empty_cache()