In [1]:
!pip install transformers datasets seqeval
!pip install torch # Ensure PyTorch is installed for training

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading di

In [3]:
from datasets import load_dataset
dataset = load_dataset('conll2003', split='train')

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
import pandas as pd

def load_conll_text(file_path):
    sentences, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence, label = [], []
        for line in f:
            if line.strip() == '':
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                # Handling cases where line.split() might have fewer than 4 elements
                parts = line.split()
                if len(parts) >= 4:  # Check if enough elements are present
                    token, _, _, tag = parts
                    sentence.append(token)
                    label.append(tag)
                else:
                    # Handle lines with fewer elements, e.g., print a warning or skip them
                    print(f"Warning: Skipping line with unexpected format: {line.strip()}")
    return pd.DataFrame({'tokens': sentences, 'labels': labels})

dataset = load_conll_text('labeled_data_conll.txt')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [8]:
from transformers import XLMRobertaTokenizerFast

tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)
    labels = []

    for i, label in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]



In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)




In [27]:
import pandas as pd
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split

# ... (Your existing functions: load_conll_text, tokenize_and_align_labels) ...

dataset = load_conll_text('labeled_data_conll.txt')

# Check if the dataset is empty
if dataset.empty:
    print("Error: The dataset is empty. Please check the input file and loading logic.")
else:
    # ... (Your existing code for training: train_test_split, tokenize, etc.) ...
    unique_labels = set(label for sublist in dataset['labels'] for label in sublist)
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for label, i in label2id.items()}

    # Split the dataset into train and eval sets
    train_dataset, eval_dataset = train_test_split(dataset, test_size=0.2, random_state=42)  # Adjust test_size as needed

    # Tokenize the datasets
    tokenized_train_dataset = train_dataset.apply(tokenize_and_align_labels, axis=1)
    tokenized_eval_dataset = eval_dataset.apply(tokenize_and_align_labels, axis=1)

    model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(unique_labels))

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Error: The dataset is empty. Please check the input file and loading logic.


In [24]:
unique_labels = set(label for sublist in dataset['labels'] for label in sublist)
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

In [38]:
import os
print(os.getcwd())
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaTokenizerFast
from datasets import Dataset, Features, Sequence, Value
from transformers import TrainingArguments
from transformers import XLMRobertaForTokenClassification, AutoModelForTokenClassification, AutoTokenizer, Trainer

/content


In [41]:
# Function to load CoNLL formatted data
def load_conll_text(file_path):
    sentences = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence = []
        label = []
        for line in f:
            if line.strip():  # Non-empty line
                token, label_item = line.split()
                sentence.append(token)
                label.append(label_item)
            else:  # Empty line indicates end of a sentence
                sentences.append(sentence)
                labels.append(label)
                sentence = []
                label = []
    return pd.DataFrame({'tokens': sentences, 'labels': labels})

file_path = r'labeled_data_conll.txt'
# Load your CoNLL file
df = load_conll_text(file_path)

In [42]:
# Explore the first few rows
df.head()

Unnamed: 0,tokens,labels
0,"[3, እስከ, 260, ሙቀት, መቆቆም, የሚችል, ዋጋ550ብር, አድራሻ, ...","[O, O, O, O, O, O, I-PRICE, O, O, O, O, O, O, ..."
1,"[ጊዜ, ቆጣቢ, ስላይስ, ማድረጊያ, ለእጅ, ሴፍቲ, ተመራጭ, ለድንች, ለ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[2, 1, 1, ዋጋ, 2, 500, ብር, ውስን, ፍሬ, ነው, ያለው, አድ...","[O, O, O, B-PRICE, I-PRICE, I-PRICE, I-PRICE, ..."
3,"[2, 1, 1, ዋጋ, 2, 500, ብር, ውስን, ፍሬ, ነው, ያለው, አድ...","[O, O, O, B-PRICE, I-PRICE, I-PRICE, I-PRICE, ..."
4,"[31, ዋጋ3000ብር, ውስን, ፍሬ, ነው, ያለው, አድራሻ, ቁ1, ስሪ,...","[O, I-PRICE, O, O, O, O, O, O, O, O, O, O, O, ..."


In [43]:
unique_labels = set(label for sublist in df['labels'] for label in sublist)
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

In [44]:
unique_labels

{'B-LOC', 'B-PRICE', 'B-PROD', 'I-PRICE', 'O'}

In [45]:
df['labels'] = df['labels'].apply(lambda x: [label2id[label] for label in x])

**Step 3:** Convert DataFrame to Hugging Face Dataset

In [46]:
# Convert DataFrame to Hugging Face Dataset
# Make sure 'labels' is a list of lists
# Define the features with the correct data types
features = Features({
    'tokens': Sequence(Value('string')),  # List of strings for tokens
    'labels': Sequence(Value('int32'))    # List of integers for labels
})

In [47]:
# Convert DataFrame to Hugging Face Dataset with specified features
dataset = Dataset.from_pandas(df[['tokens', 'labels']], features=features)

In [48]:
# Explore the datast
dataset

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 3009
})

**Step 4:** Tokenization and Label Alignment

In [49]:
# Initialize the Fast Tokenizer
# Use the fast tokenizer
# For XLM-Roberta
tokenizer = XLMRobertaTokenizerFast.from_pretrained(
    "xlm-roberta-base",
    clean_up_tokenization_spaces=True
    )


In [50]:
# Tokenization and alignment function
def tokenize_and_align_labels(examples):
  tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)  # Set max_length as needed
  labels = []

  for i in range(len(examples['tokens'])):
      label = examples['labels'][i]
      tokenized_label = [-100] * len(tokenized_inputs['input_ids'][i])  # Default label for all tokens

      # Aligning labels with tokens
      for j, token in enumerate(tokenized_inputs['input_ids'][i]):
          # Check if this token corresponds to the original word
          original_word_idx = tokenizer.decode(token).strip()
          if original_word_idx in examples['tokens'][i]:
              token_index = examples['tokens'][i].index(original_word_idx)
              tokenized_label[j] = label[token_index]  # Use the corresponding label

      labels.append(tokenized_label)

  tokenized_inputs['labels'] = labels
  return tokenized_inputs

In [51]:
# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3009 [00:00<?, ? examples/s]

In [52]:
tokenized_dataset

Dataset({
    features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 3009
})

In [53]:
# Split into train and validation datasets
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)  # 90% train, 10% validation

In [54]:
# Print the lengths of input_ids, attention_mask, and labels for verification
print(f"Number of samples: {len(tokenized_dataset)}")
print(f"Input IDs length: {[len(x) for x in tokenized_dataset['input_ids']]}")
print(f"Attention Mask length: {[len(x) for x in tokenized_dataset['attention_mask']]}")
print(f"Labels length: {[len(x) for x in tokenized_dataset['labels']]}")

Number of samples: 3009
Input IDs length: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 12

In [55]:
# Check the train and test split
train_test_split

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 2708
    })
    test: Dataset({
        features: ['tokens', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 301
    })
})

In [56]:
# Set up training arguments with adjustments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",     # Evaluates at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    num_train_epochs=3,
    weight_decay=0.01,               # Strength of weight decay
    max_grad_norm=1.0,  # Gradient clipping
    logging_dir='./logs',            # Directory for storing logs
    logging_strategy="steps",        # Log at regular intervals
    logging_steps=50,                # Log every 50 steps
    save_strategy="epoch",           # Save model at the end of each epoch
    report_to="none",                # Only show logs in the output (no TensorBoard)
)

**Step 6: Load and Fine-Tune the pre-trained model**

- Use Hugging Face Trainer API Fine-tune the model using the Trainer API.

- fine-tune each of the following pre-trained models:

- xlm-roberta-base

- DistilBERT

- mBERT

In [57]:
# Initialize each of the models
# For XLM-Roberta
model_xlmr = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=len(unique_labels)) # Ensure unique_labels is defined

# For DistilBERT
model_distilbert = AutoModelForTokenClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=len(unique_labels))

# For mBERT
model_distilbert = AutoModelForTokenClassification.from_pretrained('bert-base-multilingual-cased', num_labels=len(unique_labels))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
trainer_xlmr = Trainer(
    model=model_xlmr,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],  # Changed from validation to test based on split
)
trainer_distilbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],  # Changed from validation to test based on split
)
trainer_mbert = Trainer(
    model=model_distilbert,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],  # Changed from validation to test based on split
)

**Step 7:** Evaluate and Train each model

In [59]:
# Fine-tune XLM-Roberta
trainer_xlmr.train()
trainer_xlmr.evaluate()

# Fine-tune DistilBERT
trainer_distilbert.train()
trainer_distilbert.evaluate()

# Fine-tune mBERT
trainer_mbert.train()
trainer_mbert.evaluate()

Epoch,Training Loss,Validation Loss
1,0.0128,0.003677
2,0.0028,0.002223
3,0.0054,0.001242


IndexError: index out of range in self

**Step 7:** Save the trained model

In [60]:
# Save the model
model.save_pretrained("./fine_tuned_ner_model")
tokenizer.save_pretrained("./fine_tuned_ner_model")

('./fine_tuned_ner_model/tokenizer_config.json',
 './fine_tuned_ner_model/special_tokens_map.json',
 './fine_tuned_ner_model/sentencepiece.bpe.model',
 './fine_tuned_ner_model/added_tokens.json',
 './fine_tuned_ner_model/tokenizer.json')

**Step 8:** Evaluate the model

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
from seqeval.metrics import classification_report
import numpy as np

predictions, labels, _ = trainer.predict(tokenized_dataset['validation'])
preds = np.argmax(predictions, axis=2)

# Create a list of true labels and predicted labels
true_labels = [[label_list[l] for l in label] for label in labels]
pred_labels = [[label_list[p] for p in pred] for pred in preds]

print(classification_report(true_labels, pred_labels))

# Task 5: Model Interpretability
Objective: Ensure transparency by explaining how the NER model identifies entities in Amharic text.

**Steps:**
- Implement SHAP and LIME:

- Use libraries like SHAP and LIME to analyze model predictions.

In [None]:
!pip install shap lime

In [None]:
import shap
explainer = shap.Explainer(model, tokenized_train_dataset)
shap_values = explainer(tokenized_eval_dataset)
shap.plots.text(shap_values[0])

In [None]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer()
explanation = explainer.explain_instance("ምርት ዋጋ በአዲስ አበባ", model.predict_proba, num_features=6)
explanation.show_in_notebook()