In [1]:
# Install necessary libraries
!pip install transformers==4.41.2 peft==0.10.0 datasets seqeval accelerate


Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft==0.10.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metad

In [2]:
from google.colab import files
uploaded = files.upload()  # Upload conll_raw_sample.txt


Saving conll_raw_sample.txt to conll_raw_sample.txt


In [3]:
def read_conll(file_path):
    sentences = []
    tokens, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append((tokens, labels))
                    tokens, labels = [], []
            else:
                splits = line.split()
                if len(splits) >= 2:
                    tokens.append(splits[0])
                    labels.append(splits[1])
    if tokens:
        sentences.append((tokens, labels))
    return sentences

data = read_conll("conll_raw_sample.txt")


In [4]:
from datasets import Dataset

tokens = [x[0] for x in data]
ner_tags = [x[1] for x in data]

label_list = sorted(set(tag for seq in ner_tags for tag in seq))
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

tag_ids = [[label2id[tag] for tag in seq] for seq in ner_tags]

dataset = Dataset.from_dict({"tokens": tokens, "ner_tags": tag_ids})
dataset = dataset.train_test_split(test_size=0.2)


In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "FacebookAI/xlm-roberta-base"  # Or use your preferred model
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        padding='max_length',  # ✅ Add this
        is_split_into_words=True,
        max_length=128         # ✅ Optionally limit sequence length
    )

    labels = []
    word_ids = tokenized_inputs.word_ids()
    prev_word_id = None

    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        elif word_id != prev_word_id:
            labels.append(example["ner_tags"][word_id] if word_id < len(example["ner_tags"]) else -100)
        else:
            labels.append(example["ner_tags"][word_id] if word_id < len(example["ner_tags"]) else -100)
        prev_word_id = word_id

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [7]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./amharic-ner-results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)




In [9]:
from seqeval.metrics import classification_report
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[pred] for pred, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return classification_report(true_labels, true_predictions, output_dict=True)


In [10]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamuelwoyesso2016[0m ([33msamuelwoyesso2016-university-of-gondar[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Loc,Price,Product,Micro avg,Macro avg,Weighted avg
1,No log,1.920107,"{'precision': 0.026442307692307692, 'recall': 1.0, 'f1-score': 0.05152224824355972, 'support': 22}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}","{'precision': 0.026442307692307692, 'recall': 0.5, 'f1-score': 0.0502283105022831, 'support': 44}","{'precision': 0.008814102564102564, 'recall': 0.3333333333333333, 'f1-score': 0.01717408274785324, 'support': 44}","{'precision': 0.013221153846153848, 'recall': 0.5, 'f1-score': 0.025761124121779857, 'support': 44}"
2,No log,1.508311,"{'precision': 0.036016949152542374, 'recall': 0.7727272727272727, 'f1-score': 0.06882591093117409, 'support': 22}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}","{'precision': 0.036016949152542374, 'recall': 0.38636363636363635, 'f1-score': 0.06589147286821705, 'support': 44}","{'precision': 0.012005649717514125, 'recall': 0.25757575757575757, 'f1-score': 0.02294197031039136, 'support': 44}","{'precision': 0.018008474576271187, 'recall': 0.38636363636363635, 'f1-score': 0.03441295546558704, 'support': 44}"
3,1.693800,1.234572,"{'precision': 0.06862745098039216, 'recall': 0.3181818181818182, 'f1-score': 0.11290322580645162, 'support': 22}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}","{'precision': 0.06862745098039216, 'recall': 0.1590909090909091, 'f1-score': 0.09589041095890412, 'support': 44}","{'precision': 0.022875816993464054, 'recall': 0.10606060606060606, 'f1-score': 0.03763440860215054, 'support': 44}","{'precision': 0.03431372549019608, 'recall': 0.1590909090909091, 'f1-score': 0.05645161290322582, 'support': 44}"


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.026442307692307692, 'recall': 1.0, 'f1-score': 0.05152224824355972, 'support': 22}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}" of type <class 'dict'> for key "eval/PRICE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}" of type <class 'dict'> for key "eval/Product" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.026442307692307692, 'recall': 0.5, 'f1-score': 0.0502283105022831, 'suppo

TrainOutput(global_step=12, training_loss=1.6017772555351257, metrics={'train_runtime': 343.5795, 'train_samples_per_second': 0.279, 'train_steps_per_second': 0.035, 'total_flos': 6271462342656.0, 'train_loss': 1.6017772555351257, 'epoch': 3.0})

In [11]:
results = trainer.evaluate()
print("Validation Results:", results)

# Save model and tokenizer locally
model.save_pretrained("./amharic-ner-model")
tokenizer.save_pretrained("./amharic-ner-model")

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.06862745098039216, 'recall': 0.3181818181818182, 'f1-score': 0.11290322580645162, 'support': 22}" of type <class 'dict'> for key "eval/LOC" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}" of type <class 'dict'> for key "eval/PRICE" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}" of type <class 'dict'> for key "eval/Product" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.06862745098039216, 'recall': 0.1590909090909091, 'f1-score'

Validation Results: {'eval_loss': 1.234572172164917, 'eval_LOC': {'precision': 0.06862745098039216, 'recall': 0.3181818181818182, 'f1-score': 0.11290322580645162, 'support': 22}, 'eval_PRICE': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 6}, 'eval_Product': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}, 'eval_micro avg': {'precision': 0.06862745098039216, 'recall': 0.1590909090909091, 'f1-score': 0.09589041095890412, 'support': 44}, 'eval_macro avg': {'precision': 0.022875816993464054, 'recall': 0.10606060606060606, 'f1-score': 0.03763440860215054, 'support': 44}, 'eval_weighted avg': {'precision': 0.03431372549019608, 'recall': 0.1590909090909091, 'f1-score': 0.05645161290322582, 'support': 44}, 'eval_runtime': 2.7723, 'eval_samples_per_second': 2.886, 'eval_steps_per_second': 0.361, 'epoch': 3.0}


('./amharic-ner-model/tokenizer_config.json',
 './amharic-ner-model/special_tokens_map.json',
 './amharic-ner-model/sentencepiece.bpe.model',
 './amharic-ner-model/added_tokens.json',
 './amharic-ner-model/tokenizer.json')

In [13]:
!pip install shap lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=1ad08fa9f4733ee3aa858806c24db0a8becb10d3126377558a0c51dc73b187a6
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [32]:
# Install dependencies
# !pip install transformers==4.41.2 datasets==2.14.4 seqeval shap lime torch numpy pandas

# Import libraries
import shap
import lime
import lime.lime_text
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from datasets import Dataset
import pandas as pd
from seqeval.metrics import classification_report
import os
from google.colab import files

# Define paths
model_path = "./amharic-ner-model"
output_report = "./amharic_ner_interpretability_report.md"

# Load the fine-tuned model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
id2label = model.config.id2label
label2id = model.config.label2id

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Recreate test dataset
def read_conll(file_path):
    sentences = []
    tokens, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append((tokens, labels))
                    tokens, labels = [], []
            else:
                splits = line.split()
                if len(splits) >= 2:
                    tokens.append(splits[0])
                    labels.append(splits[1])
    if tokens:
        sentences.append((tokens, labels))
    return sentences

# Check for data file
if not os.path.exists("conll_raw_sample.txt"):
    print("Please upload conll_raw_sample.txt")
    uploaded = files.upload()
    if not os.path.exists("conll_raw_sample.txt"):
        raise FileNotFoundError("conll_raw_sample.txt not found.")

data = read_conll("conll_raw_sample.txt")
tokens = [x[0] for x in data]
ner_tags = [x[1] for x in data]
label_list = sorted(set(tag for seq in ner_tags for tag in seq))  # Fixed NameError
tag_ids = [[label2id[tag] for tag in seq] for seq in ner_tags]
dataset = Dataset.from_dict({"tokens": tokens, "ner_tags": tag_ids})
dataset = dataset.train_test_split(test_size=0.2)
test_dataset = dataset["test"]

# Tokenize test dataset
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        padding='max_length',
        is_split_into_words=True,
        max_length=128
    )
    labels = []
    word_ids = tokenized_inputs.word_ids()
    prev_word_id = None
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        elif word_id != prev_word_id:
            labels.append(example["ner_tags"][word_id] if word_id < len(example["ner_tags"]) else -100)
        else:
            labels.append(example["ner_tags"][word_id] if word_id < len(example["ner_tags"]) else -100)
        prev_word_id = word_id
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=False)

# Initialize NER pipeline
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Function to prepare input for SHAP
def predict_proba(texts):
    # Handle 2D input by flattening if necessary
    if isinstance(texts, np.ndarray) and texts.ndim > 1:
        texts = texts.flatten()
    inputs = tokenizer(texts.tolist() if isinstance(texts, np.ndarray) else texts,
                      return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs).logits
        probs = torch.softmax(outputs, dim=-1).cpu().numpy()
    return probs.reshape(-1, len(id2label))

# SHAP Explainer with corrected reference data
reference_data = np.array(["placeholder"]).reshape(1, -1)  # Fixed for IndexError
explainer_shap = shap.KernelExplainer(predict_proba, reference_data)

# Function for LIME explanation
def lime_predict_proba(texts):
    return predict_proba(texts)

# LIME Explainer
lime_explanation = lime.lime_text.LimeTextExplainer(class_names=list(label2id.keys()), bow=False)

# Analyze a sample from the test set
sample = test_dataset[0]
tokens = sample["tokens"]
true_labels = [id2label[l] for l in sample["ner_tags"] if l != -100]
text_input = " ".join(tokens)

# Get SHAP values
try:
    shap_values = explainer_shap.shap_values(text_input, nsamples=50)
except Exception as e:
    print(f"SHAP error: {e}. Reducing nsamples or skipping SHAP.")
    shap_values = "SHAP computation failed."

# Get LIME explanation
try:
    lime_explanation = lime_explainer.explain_instance(text_input, lime_predict_proba, num_features=10, labels=range(len(id2label)))
    lime_explanation_list = lime_explanation.as_list()
except Exception as e:
    print(f"LIME error: {e}. Skipping LIME.")
    lime_explanation_list = "LIME computation failed."

# Predict with pipeline
predictions = ner_pipeline(text_input)
pred_labels = [pred["entity"] for pred in predictions if "entity" in pred]

# Analyze difficult cases
difficult_cases = []
for idx, example in enumerate(test_dataset):
    tokens = example["tokens"]
    true_tags = [id2label[l] for l in example["ner_tags"] if l != -100]
    text = " ".join(tokens)
    preds = ner_pipeline(text)
    pred_tags = [pred["entity"] for pred in preds if "entity" in pred]
    min_len = min(len(true_tags), len(pred_tags))
    true_tags = true_tags[:min_len]
    pred_tags = pred_tags[:min_len]
    if min_len > 0:
        incorrect = [t != p for t, p in zip(true_tags, pred_tags)]
        if any(incorrect):
            difficult_cases.append({
                "index": idx,
                "tokens": tokens,
                "true_labels": true_tags,
                "pred_labels": pred_tags,
                "incorrect_positions": [i for i, x in enumerate(incorrect) if x]
            })

# Generate classification report
true_all = []
pred_all = []
for example in test_dataset:
    true_tags = [id2label[l] for l in example["ner_tags"] if l != -100]
    text = " ".join(example["tokens"])
    preds = ner_pipeline(text)
    pred_tags = [pred["entity"] for pred in preds if "entity" in pred]
    min_len = min(len(true_tags), len(pred_tags))
    true_all.append(true_tags[:min_len])
    pred_all.append(pred_tags[:min_len])
report_dict = classification_report(true_all, pred_all, output_dict=True)

# Generate Interpretability Report
report = f"""
# Amharic NER Model Interpretability Report

## 1. Model Overview
- **Model**: {model_path} (based on xlm-roberta-base)
- **Task**: Named Entity Recognition (NER) for Amharic text
- **Labels**: {list(id2label.values())}

## 2. SHAP Analysis
### Sample Text: {" ".join(sample["tokens"])}
### SHAP Insights:
- Top contributing tokens for each label:
{shap_values}

## 3. LIME Analysis
### LIME Explanation for Sample:
{lime_explanation_list}

## 4. Performance Metrics
### Classification Report:
{pd.DataFrame(report_dict).T.to_markdown()}

## 5. Difficult Cases Analysis
### Number of Difficult Cases: {len(difficult_cases)}
### Example Difficult Case:
{difficult_cases[0] if difficult_cases else "No difficult cases found."}

## 6. Recommendations for Improvement
- **Ambiguous Entities**: If 'LOC' performance is low, collect more diverse location data.
- **Overlapping Entities**: Enhance tokenization to handle multi-word entities.
- **Data Augmentation**: Use synthetic Amharic NER data to increase dataset size.
- **Hyperparameter Tuning**: Try learning rates (e.g., 5e-5) or more epochs (e.g., 5).
"""

# Save report
with open(output_report, "w", encoding="utf-8") as f:
    f.write(report)

# Download report
files.download(output_report)

print("Interpretability analysis completed. Report saved and downloaded as 'amharic_ner_interpretability_report.md'.")

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

SHAP error: Unknown instance type: <class 'str'>. Reducing nsamples or skipping SHAP.
LIME error: name 'lime_explainer' is not defined. Skipping LIME.


  _warn_prf(average, modifier, msg_start, len(result))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Interpretability analysis completed. Report saved and downloaded as 'amharic_ner_interpretability_report.md'.
