In [2]:
# EthioMart/notebooks/model_interpretability.ipynb

### --- Section 1: Setup and Configuration ---

In [3]:
import pandas as pd
from pathlib import Path
import logging
import sys
import torch
from collections import Counter

# Hugging Face imports
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Interpretability libraries
import shap
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Add the project root to sys.path to allow importing from src
project_root = Path.cwd().parent # Assuming notebook is in EthioMart/notebooks/
sys.path.insert(0, str(project_root))

from src.preprocessor import preprocess_amharic

print(f"Project root set to: {project_root}")

Project root set to: d:\@kaim\EthioMart


In [5]:
# --- Configuration ---
# Use the best-performing model from Task 4 (DistilBERT)
MODEL_PATH = Path(project_root / "models" / "distilbert_ner_fine_tuned") 
LABELS_PATH = MODEL_PATH # Labels are saved with the model in Hugging Face format
print(f"Using model from: {MODEL_PATH}")
# Example sentences for interpretation. Try to include some challenging ones.we can also use from the original `telegram_data.csv` dataset.

# Example sentences from `telegram_data.csv` not in the small 50-sentence sample
EXAMPLE_TEXTS = [
    "New balance master quality Made In VIETNAM Size: 5500 ETB Free Delivery INBOX: @Maraki2211 ስልክ: +251 913321831 አድራሻ አዲስ አበባ , ሜክሲኮ፡ ከ ኬኬር ህንጻ 50ሜ ወረድ ብሎ አይመን ህንፃ ግራውንድ ፍሎር ላይ፡ የሱቅ ቁ. 012 Maraki Brand ማራኪ ብራንድ",
    "Foldable High Capacity Travel Bags Lightweight Travel Carry Bag High Capacity Waterresistant multiple pockets Multifunctional Sport Travel Bags It is portable with multiple ways to carry , handheld , shoulderon , or put on luggage ዋጋ፦ 1550 ከነፃ ዲሊቨሪ ጋር ዕቃዉ እጅዎ ሲደርስከፈለጉበካሽአልያምበሞባይልባንኪንግመፈፀምይችላሉ በተጨማሪ በላይ የሚተመኑ ሲገዙ ስጦታ እንልክለዎታለን T.meLeyueqa ቻናላችንን ለጓደኛዎ ሸር ማድረግዎን አይርሱ ያሉበት ድረስ በነፃ እናደርሳለን 0933334444 @LeMazezz 0944109295 @Lemazez 0946242424 @LeMazez",
    "Skechers Gowalk Size 40 , 41 , 42 , 43 Price 2900 ETB አድራሻ ሜክሲኮ ኮሜርስ ጀርባ መዚድ ፕላዛ የመጀመሪያ ደረጃ እንደወጡ 101 የቢሮ ቁጥር ያገኙናል or call 0920238243 EthioBrand https :",
    "Reebok classic club volvet size 40 , 41 , 42 , 43 Price 2900 ETB አድራሻ ሜክሲኮ ኮሜርስ ጀርባ መዚድ ፕላዛ አንደኛ ደረጃ እንደወጡ ያገኙናል or call 0920238243",
    "ዉሀ ስርገትን ወደ ፍራሽ ዉስጥ እንዳይገባ እና አላስፈላጊ ሽታን እንዲሁ ም ድካምን የሚከላከል አንሶላ Mattress PROTECTOR POLYESTER MICROFIBERBed Size 200 cm 1.20 cm ነጭ ቬጅ ከለር ዋጋ 3400 ማሳሰቢያ የትራስ ልብስ የለዉም 0933334444 @LeMazezz 0946242424 @LeMazez"
]

Using model from: d:\@kaim\EthioMart\models\distilbert_ner_fine_tuned


### --- Section 2: Load Model and Setup Inference Pipeline ---

In [6]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

# Check if CUDA (GPU) is available and move model to GPU
device = 0 if torch.cuda.is_available() else -1 # 0 for GPU 0, -1 for CPU
logging.info(f"Using device: {'cuda' if device == 0 else 'cpu'}")

# Create a Hugging Face pipeline for NER
# This pipeline handles tokenization, model inference, and decoding predictions.
nlp_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple", # Aggregates subword tokens into words
    device=device # Use GPU if available
)

logging.info(f"Model and tokenizer loaded for inference pipeline from {MODEL_PATH}")

2025-06-26 06:50:21,857 - INFO - Using device: cpu
Device set to use cpu
2025-06-26 06:50:21,861 - INFO - Model and tokenizer loaded for inference pipeline from d:\@kaim\EthioMart\models\distilbert_ner_fine_tuned


### --- Section 3: Prepare Data for Interpretation ---

In [7]:
def get_ner_predictions(text):
    """
    Helper function to get NER predictions from the pipeline.
    Returns a list of dictionaries with 'word', 'entity_group', 'score', 'start', 'end'.
    """
    preprocessed_text = preprocess_amharic(text)
    if not preprocessed_text.strip():
        return []
    try:
        predictions = nlp_pipeline(preprocessed_text)
        return predictions
    except Exception as e:
        logging.error(f"Error during NER prediction for text '{preprocessed_text[:50]}...': {e}")
        return []

def format_predictions_for_display(predictions, text):
    """
    Formats the raw predictions into a more readable string.
    """
    formatted_output = []
    if not predictions:
        return f"No entities found for: '{text}'"

    for ent in predictions:
        word = ent['word']
        entity_type = ent['entity_group']
        score = ent['score']
        formatted_output.append(f"'{word}' ({entity_type} - {score:.2f})")
    return "; ".join(formatted_output)

In [8]:
# Test the pipeline with a sample text
sample_text_for_test = "New balance master quality Made In VIETNAM Size: 5500 ETB Free Delivery INBOX: @Maraki2211 ስልክ: +251 913321831"
print(f"Original Text: {sample_text_for_test}")
test_predictions = get_ner_predictions(sample_text_for_test)
print(f"Formatted Predictions: {format_predictions_for_display(test_predictions, sample_text_for_test)}")

Original Text: New balance master quality Made In VIETNAM Size: 5500 ETB Free Delivery INBOX: @Maraki2211 ስልክ: +251 913321831
Formatted Predictions: 'New balance master' (PRICE - 0.08); 'quality' (PRODUCT - 0.07); 'Made In' (PRICE - 0.07); 'VI' (PRODUCT - 0.08); '##ETNA' (PRICE - 0.08); '##M Size :' (PRODUCT - 0.08); '5500' (PRODUCT - 0.07); 'ETB' (PRICE - 0.08); 'Free' (PRODUCT - 0.07); 'Delivery INB' (PRICE - 0.08); '##OX :' (PRODUCT - 0.08); '@' (PRICE - 0.08); 'Mara' (PRICE - 0.07); '##ki 221' (PRODUCT - 0.07); '##1 ስልክ' (PRICE - 0.08); ':' (PRICE - 0.07); '+' (LOC - 0.07); '251' (LOC - 0.07); '913321831' (PRICE - 0.08)




### --- Section 4: SHAP Explanations ---

In [None]:
logging.info("Starting SHAP explanations...")

def f(x):
    """
    Prediction function for SHAP.
    Takes a list of texts, preprocesses them, tokenizes, and returns the logits.
    """
    with torch.no_grad():
        inputs = tokenizer(x, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        logits = model(**inputs).logits
    return logits.cpu().numpy()

# Let's pick a specific example to demonstrate SHAP
shap_example_text = EXAMPLE_TEXTS[2] # "Skechers Gowalk Size 40 , 41 , 42 , 43 Price 2900 ETB አድራሻ ሜክሲኮ ኮሜርስ ጀርባ መዚድ ፕላዛ የመጀመሪያ ደረጃ እንደወጡ 101 የቢሮ ቁጥር ያገኙናል or call 0920238243 EthioBrand https :"
print(f"\n--- SHAP Explanation for: '{shap_example_text}' ---")


# Preprocess the text for explanation
preprocessed_shap_text = preprocess_amharic(shap_example_text)
if preprocessed_shap_text.strip():
    # Get original words from the preprocessed text
    original_words = preprocessed_shap_text.split()
        
    # Find the ID for the 'B-LOC' label
    b_loc_id = model.config.label2id.get('B-LOC')
    
    if b_loc_id is not None:
        def predict_b_loc_prob(word_inputs):
            # word_inputs is a list of lists of words (perturbed sentences)
            probabilities = []
            for words_list in word_inputs:
                text_input = " ".join(words_list)
                inputs = tokenizer(text_input, return_tensors="pt", truncation=True, padding='max_length', max_length=tokenizer.model_max_length)
                inputs = {k: v.to(model.device) for k, v in inputs.items()}

                with torch.no_grad():
                    logits = model(**inputs).logits
                    probs = torch.softmax(logits, dim=-1) # Convert logits to probabilities

                # Find the probability for the target token and B-LOC
                # We need to find the subword index corresponding to the original word "ሜክሲኮ"
                # This requires re-tokenizing and mapping word_ids for each perturbed input.
                
                # A more robust way to find the index of "ሜክሲኮ" in the current perturbed list of words
                # If "ሜክሲኮ" is removed or partially masked, it won't be found.
                try:
                    # Find the position of 'ሜክሲኮ' in the current `words_list`
                    # This relies on 'ሜክሲኮ' being a single word and not split across words_list
                    target_word_idx_in_perturbed = words_list.index("ሜክሲኮ")
                    
                    # Re-tokenize the *perturbed* text to get token-to-word mapping
                    temp_inputs = tokenizer(text_input, return_tensors="pt", truncation=True, is_split_into_words=True)
                    temp_word_ids = temp_inputs.word_ids(batch_index=0)
                    
                    target_subword_idx_in_tokens = -1
                    for token_idx, word_id in enumerate(temp_word_ids):
                        if word_id == target_word_idx_in_perturbed:
                            target_subword_idx_in_tokens = token_idx
                            break
                            
                    if target_subword_idx_in_tokens != -1 and target_subword_idx_in_tokens < probs.shape[1]:
                        probabilities.append(probs[0, target_subword_idx_in_tokens, b_loc_id].item())
                    else:
                        probabilities.append(0.0) # If token not found in perturbed text, probability is 0
                except ValueError:
                    # 'ሜክሲኮ' not in the current perturbed `words_list`
                    probabilities.append(0.0)
            return np.array(probabilities)

        # Initialize the explainer
        # Corrected: Pass the preprocessed text string directly to shap.maskers.Text
        explainer = shap.Explainer(predict_b_loc_prob, shap.maskers.Text(preprocessed_shap_text, tokenizer=tokenizer))
        
        # Explain the `original_words` sequence.
        shap_values = explainer(original_words)
        
        logging.info("Generating SHAP plot...")
        # For text explanations, if the explainer output is a single value per segment (word), use `shap.plots.text`
        # In a Jupyter environment, this will render an interactive plot.
        # For command line output, it will print some representation.
        shap.plots.text(shap_values[0]) # Use shap_values[0] for the first example in the batch

        print("\nSHAP values for 'B-LOC' prediction for each word:")
        for word, val in zip(original_words, shap_values.values):
            print(f"'{word}': {val[0]:.4f}")

    else:
        print("B-LOC label not found in model's label mappings. Cannot generate SHAP explanation.")
else:
    print(f"Target text for SHAP explanation is empty after preprocessing: '{shap_example_text}'")

2025-06-26 06:50:34,932 - INFO - Starting SHAP explanations...



--- SHAP Explanation for: 'Skechers Gowalk Size 40 , 41 , 42 , 43 Price 2900 ETB አድራሻ ሜክሲኮ ኮሜርስ ጀርባ መዚድ ፕላዛ የመጀመሪያ ደረጃ እንደወጡ 101 የቢሮ ቁጥር ያገኙናል or call 0920238243 EthioBrand https :' ---


TypeError: Text.__init__() got multiple values for argument 'tokenizer'

### --- Section 5: LIME Explanations (Conceptual Approach) ---

In [10]:
print("\nStarting LIME explanations (conceptual approach for NER)...")

print("LIME for token-level NER is complex and often requires custom wrappers.")
print("A direct implementation for this task would involve significant adaptation to LIME's core functionalities.")
print("SHAP is generally more directly applicable and computationally feasible for transformer-based token classification models.")



Starting LIME explanations (conceptual approach for NER)...
LIME for token-level NER is complex and often requires custom wrappers.
A direct implementation for this task would involve significant adaptation to LIME's core functionalities.
SHAP is generally more directly applicable and computationally feasible for transformer-based token classification models.


### --- Section 6: Analysis and Reporting ---

In [11]:
print("\n--- Analysis of Interpretability Results ---")
print("\nSHAP (SHapley Additive exPlanations):")
print("- SHAP values aim to show how each word in the input contributes to the model's output prediction for a specific label (e.g., 'B-LOC').")
print("- Positive SHAP values indicate that the word pushes the prediction towards the target label.")
print("- Negative SHAP values indicate that the word pushes the prediction away from the target label.")
print("- In our example for 'ሜክሲኮ' (Mexico) as 'B-LOC', words like 'አድራሻ' (address) and 'ሜክሲኮ' itself are expected to have high positive SHAP values for the 'LOC' label.")
print("- The interpretation might be less clear if the model's overall performance is low, as it's explaining a less accurate prediction.")

print("\nLIME (Local Interpretable Model-agnostic Explanations):")
print("- LIME creates local surrogate models (simple, interpretable models) to explain individual predictions.")
print("- It perturbs the input (e.g., removing words) and observes how the prediction changes, then fits a local model.")
print("- For NER, LIME can be used to show which words contribute most to the prediction of a *specific entity type* for *a given instance*.")
print("- However, its standard implementation is more geared towards classification (e.g., sentiment, topic), where the output is a single class per input. Adapting it for token-level NER (multiple labels per sentence) is non-trivial and often requires custom wrappers around LIME's core, making SHAP a more straightforward choice for this kind of task.")

print("\nDifficult Cases and Areas for Improvement:")
print("- Given the very low F1-scores from Task 3 and 4, all cases are effectively 'difficult cases' as the model is struggling significantly.")
print("- Ambiguous text and overlapping entities are known challenges for NER models generally, and especially with limited data.")
print("- **Primary Area for Improvement:** Significantly increase the size and diversity of the labeled Amharic dataset. Rule-based labeling is a good start, but manual review and annotation of a much larger corpus would yield better results.")
print("- **Data Quality:** Re-verify the quality and consistency of rule-based labels. Errors in labeling will directly impact model performance.")
print("- **Model Architecture:** While DistilBERT is lightweight, a more powerful model like `afro-xlmr-large` (given more data) or other mBERT variants might perform better.")
print("- **Hyperparameter Tuning:** More extensive hyperparameter tuning (learning rate, batch size, epochs, optimizers) could potentially yield marginal improvements, but is unlikely to overcome data limitations.")
print("- **Pre-training:** Explore domain-specific pre-training or continued pre-training on a large corpus of general Amharic text before fine-tuning for NER.")
print('- **Complex Entity Boundaries:** Analyze cases where entity boundaries are fluid (e.g., "ዋጋ ስልክ አድራሻ" or "price contact") after tokenization and labeling to refine rules or model capabilities.')

print("\nConclusion for Interpretability:")
print("Model interpretability tools like SHAP are valuable for understanding how even poorly performing models make decisions, which can guide data collection, feature engineering, and model selection. With more high-quality labeled data, these tools would provide deeper and more actionable insights into a well-performing NER system.")




--- Analysis of Interpretability Results ---

SHAP (SHapley Additive exPlanations):
- SHAP values aim to show how each word in the input contributes to the model's output prediction for a specific label (e.g., 'B-LOC').
- Positive SHAP values indicate that the word pushes the prediction towards the target label.
- Negative SHAP values indicate that the word pushes the prediction away from the target label.
- In our example for 'ሜክሲኮ' (Mexico) as 'B-LOC', words like 'አድራሻ' (address) and 'ሜክሲኮ' itself are expected to have high positive SHAP values for the 'LOC' label.
- The interpretation might be less clear if the model's overall performance is low, as it's explaining a less accurate prediction.

LIME (Local Interpretable Model-agnostic Explanations):
- LIME creates local surrogate models (simple, interpretable models) to explain individual predictions.
- It perturbs the input (e.g., removing words) and observes how the prediction changes, then fits a local model.
- For NER, LIME can b