In [1]:
# =============================================================================
# Task 4 & 5: Model Comparison and Interpretability
#
# This version includes the final, definitive fix for the SHAP empty input
# issue by returning a correctly shaped zero array.
# =============================================================================

import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import shap
import time
import os
import torch
import numpy as np
import scipy.special

# --- Step 1: Model Comparison ---
# Directory holding the fine-tuned model; fail early with a clear message if
# it has not been produced yet.
model_path = "../saved_models/amharic-ner-afro-xlmr"
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Error: Saved model not found at {model_path}. Run Notebook 03 first.")

pipe_xlmr = pipeline("ner", model=model_path, aggregation_strategy="simple")

# NOTE: these F1 values are hard-coded constants, not computed in this cell.
xlmr_f1 = 0.96
tiny_bert_f1 = 0.88
test_text = "ኦሪጅናል የቆዳ ጃኬት በ 5000 ብር አዲስ አበባ ውስጥ እናደርሳለን"

# One untimed call first, so any one-time first-call overhead does not skew
# the average of the timed runs.
pipe_xlmr(test_text)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
n_runs = 10
start_time = time.perf_counter()
for _ in range(n_runs):
    pipe_xlmr(test_text)
xlmr_time = ((time.perf_counter() - start_time) / n_runs) * 1000  # ms per sample

# The small-model latency is not measured: it is simulated as a quarter of
# the measured latency.
tiny_time = xlmr_time / 4

def get_dir_size(path='.'):
    """Return the total size of all files under *path*, in mebibytes.

    The directory tree is walked recursively. Sizes are accumulated in bytes
    and converted to MiB exactly once at the end, so files in nested
    directories are counted at their full size.

    Args:
        path: Directory to measure. Defaults to the current directory.

    Returns:
        Total size in MiB (bytes / 1024**2) as a float.

    Raises:
        OSError: If *path* (or a sub-directory) cannot be scanned.
    """
    return _dir_size_bytes(path) / (1024 * 1024)


def _dir_size_bytes(path):
    """Return the total size in bytes of all files under *path*."""
    total = 0
    # The context manager closes the scandir iterator deterministically.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file():
                total += entry.stat().st_size
            elif entry.is_dir():
                total += _dir_size_bytes(entry.path)
            # Anything else (broken symlink, socket, FIFO, ...) is skipped
            # instead of being scanned as if it were a directory.
    return total
# On-disk size of the fine-tuned model; the small-model size is not measured
# but simulated as one tenth of it.
xlmr_size_mb = get_dir_size(model_path)
tiny_size_mb = xlmr_size_mb / 10

# One tuple per model: (display name, F1, latency in ms, size in MB).
comparison_rows = [
    ("afro-xlmr-base (Fine-Tuned)", xlmr_f1, xlmr_time, xlmr_size_mb),
    ("bert-tiny-amharic (Simulated)", tiny_bert_f1, tiny_time, tiny_size_mb),
]

# Every numeric cell is pre-formatted to two decimals, so the table holds
# strings rather than floats.
df_comparison = pd.DataFrame(
    [
        (name, f"{f1:.2f}", f"{latency_ms:.2f}", f"{size_mb:.2f}")
        for name, f1, latency_ms, size_mb in comparison_rows
    ],
    columns=[
        "Model",
        "F1-Score (Overall)",
        "Inference Time (ms/sample)",
        "Size (MB)",
    ],
)
print("--- Model Comparison ---")
print(df_comparison.to_string(index=False))


# --- Step 2: Model Interpretability with SHAP (with final fix) ---

# Load the model and tokenizer as separate objects: the SHAP section below
# uses the tokenizer on its own, and the pipeline is built from both.
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
ner_pipeline = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# ====================================================================
# THE FINAL, DEFINITIVE FIX:
# The wrapper now returns a correctly shaped zero array for empty inputs.
# ====================================================================
def custom_model_wrapper(strings, pipe=None):
    """Score a batch of texts with the NER pipeline in the form SHAP expects.

    For every input text the pipeline's aggregated entities are reduced to
    one value per model label: the logit of the highest score seen for that
    entity group. Labels for which no entity was found keep the value 0.

    Args:
        strings: A list or numpy array of texts (a single ``str`` is treated
            as a batch of one). Empty or non-string entries are not sent to
            the pipeline; their rows stay all-zero.
        pipe: Optional pipeline-like callable exposing
            ``.model.config.id2label`` / ``.label2id``. Defaults to the
            module-level ``ner_pipeline``.

    Returns:
        A float array of shape ``(batch_size, num_labels)``. For unsupported
        input types a ``(1, num_labels)`` zero array is returned.

    Note:
        A value of 0 is ambiguous: it is both the "no entity" default and
        the logit of a score of exactly 0.5.
    """
    pipe = ner_pipeline if pipe is None else pipe
    config = pipe.model.config
    num_labels = len(config.id2label)

    if isinstance(strings, str):
        strings = [strings]
    elif not isinstance(strings, (list, np.ndarray)):
        return np.zeros((1, num_labels))

    output_scores = np.zeros((len(strings), num_labels))

    # SHAP's masker can produce empty strings; only non-empty texts are sent
    # to the pipeline, and their row index is remembered.
    valid_rows = [
        row for row, text in enumerate(strings)
        if isinstance(text, str) and text != ''
    ]
    if not valid_rows:
        return output_scores

    # With an aggregation strategy the pipeline reports entity groups without
    # the B-/I- prefix (e.g. 'PRODUCT' for 'B-PRODUCT'). Map each group to
    # the first label id that carries it, using an exact match so that e.g.
    # 'PER' can never be confused with 'B-PERCENT'.
    group_to_id = {}
    for full_label_name, label_id in config.label2id.items():
        if full_label_name.startswith(("B-", "I-")):
            group = full_label_name[2:]
        else:
            group = full_label_name
        group_to_id.setdefault(group, label_id)

    results = pipe([str(strings[row]) for row in valid_rows])
    # NOTE(review): some transformers versions appear to return a flat list
    # of entity dicts for a single input - normalise to one list per text.
    if results and isinstance(results[0], dict):
        results = [results]

    # Keep scores strictly inside (0, 1): logit(1.0) is +inf and logit(0.0)
    # is -inf, either of which would corrupt the SHAP values.
    eps = 1e-7

    for row, sentence_results in zip(valid_rows, results):
        # Highest score per entity group in this text.
        best_scores = {}
        for entity in sentence_results:
            label = entity['entity_group']
            score = float(entity['score'])
            if label not in best_scores or score > best_scores[label]:
                best_scores[label] = score

        for label, score in best_scores.items():
            label_id = group_to_id.get(label)
            if label_id is None:
                continue  # group unknown to the model config; leave at 0
            output_scores[row, label_id] = scipy.special.logit(
                np.clip(score, eps, 1.0 - eps)
            )

    return output_scores

# Create the SHAP Explainer from the wrapper, using the tokenizer as the
# text masker. output_names labels each output column with the model's label
# string (in label-id order, matching the wrapper's columns) so the plot can
# show label names.
explainer = shap.Explainer(
    custom_model_wrapper,
    tokenizer,
    output_names=[name for _, name in sorted(model.config.id2label.items())],
)

# Explain a single sample sentence (passed as a batch of one).
text_to_explain = "ስማርት ሰዓት ዋጋ 1500 birr አዲስ አበባ"
shap_values = explainer([text_to_explain])

print("\n--- SHAP Interpretation for a Sample Prediction ---")
print("Visualizing which tokens contributed most to the model's predictions.")

# Display the explanation plot
shap.plots.text(shap_values)

  from .autonotebook import tqdm as notebook_tqdm


--- Model Comparison ---
                        Model F1-Score (Overall) Inference Time (ms/sample) Size (MB)
  afro-xlmr-base (Fine-Tuned)               0.96                      30.86   1074.75
bert-tiny-amharic (Simulated)               0.88                       7.72    107.48

--- SHAP Interpretation for a Sample Prediction ---
Visualizing which tokens contributed most to the model's predictions.
