# Token-Level Explanations: SHAP and LIME Analysis

**Project:** HEARTS Adaptation - Gender Bias Detection  
**Section:** 3.3 Token Level Explanations (adapted from original HEARTS paper)

This notebook implements:
1. **SHAP Analysis** - Generate token-level importance values using SHAP
2. **LIME Analysis** - Generate token-level importance values using LIME
3. **Similarity Metrics** - Calculate Cosine Similarity, Pearson Correlation, and Jensen-Shannon Divergence
4. **Explanation Confidence Scores** - Compare SHAP and LIME outputs to assess explanation confidence
5. **Visualizations** - Display token importance and explanation confidence

**Mathematical Framework:**
- SHAP vector: $\phi_i = (\phi_{i1}, \phi_{i2}, \ldots, \phi_{iN})$ for each text instance $i$ with $N$ tokens
- LIME vector: $\beta_i = (\beta_{i1}, \beta_{i2}, \ldots, \beta_{iN})$ for each text instance $i$ with $N$ tokens
- Similarity metrics: Cosine Similarity, Pearson Correlation, Jensen-Shannon Divergence


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
import os
import re
from pathlib import Path
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import shap
from lime.lime_text import LimeTextExplainer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from scipy.spatial.distance import jensenshannon
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Locate the project root: when the notebook is run from notebooks/, step up one level
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir

# Project sub-directories used throughout the notebook
data_dir = project_root / 'data'
models_dir = project_root / 'models'
results_dir = project_root / 'results'
explainability_dir = project_root / 'explainability'

# Every artefact written by this notebook lands in the explainability directory
os.makedirs(explainability_dir, exist_ok=True)

print("=" * 70, "TOKEN-LEVEL EXPLANATIONS: SHAP AND LIME ANALYSIS", "=" * 70, sep="\n")
print(f"\nProject root: {project_root}")
print(f"Data directory: {data_dir}")
print(f"Models directory: {models_dir}")
print(f"Results directory: {results_dir}")
print(f"Explainability directory: {explainability_dir}")

# Later cells build paths with os.path.join, so keep plain strings from here on
data_dir, models_dir, results_dir, explainability_dir = (
    str(path) for path in (data_dir, models_dir, results_dir, explainability_dir)
)

# Report which device torch can use
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nUsing device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")


## Load Test Data and Model

Load the test data and a trained model for explanation analysis. We'll use ALBERT-V2 as the primary model (best performer).


In [None]:
# Test split produced by the preprocessing notebook (data/splits/test.csv)
test_data_path = os.path.join(data_dir, 'splits', 'test.csv')

if not os.path.exists(test_data_path):
    print(f"⚠️  Test data not found at: {test_data_path}")
    print("   Please run 01_Data_Loading_Preprocessing.ipynb first")
    test_data = None
else:
    test_data = pd.read_csv(test_data_path)
    print(f"✅ Loaded test data: {len(test_data):,} samples")
    print(f"   Label distribution:")
    print(test_data['label'].value_counts().sort_index())

# Model to explain (default: ALBERT-V2); alternatives are
# 'distilbert_distilbert-base-uncased' or 'google-bert_bert-base-uncased'
model_name = 'albert_albert-base-v2'
model_dir = os.path.join(models_dir, 'job_descriptions', model_name)

if not os.path.exists(model_dir):
    print(f"\n⚠️  Model not found at: {model_dir}")
    print("   Please train a model first using 02_Model_Training.ipynb")
    pipe = None
else:
    print(f"\n✅ Model found at: {model_dir}")
    print(f"   Loading model: {model_name}")

    # Fine-tuned weights and the matching tokenizer live in the same directory
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # The pipeline takes a GPU ordinal, or -1 for CPU
    device_id = 0 if torch.cuda.is_available() else -1
    pipe = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device_id,
        return_all_scores=True,  # scores for every class, not just the top one
    )
    print(f"✅ Model loaded successfully!")


## Sample Test Instances

Sample a subset of test instances for explanation analysis. We'll sample both correct and incorrect predictions.


In [None]:
def sample_instances_for_explanation(test_data, model_pipeline, n_samples=50, seed=42):
    """
    Pick test instances to explain, drawn separately from the correctly and
    the incorrectly classified rows.

    Parameters:
    -----------
    test_data : pd.DataFrame
        Test dataset with 'text' and 'label' columns (not modified)
    model_pipeline : pipeline
        Callable invoked as ``model_pipeline(texts, return_all_scores=True)``;
        it must return, per text, a list of ``{'label', 'score'}`` dicts
    n_samples : int
        Maximum number of rows to draw from each group (correct / incorrect)
    seed : int
        Seed for NumPy's global RNG and for the pandas sampling calls

    Returns:
    --------
    sampled_data : pd.DataFrame or None
        Shuffled sample with 'predicted_label', 'predicted_probability' and
        'is_correct' columns added; None when either input is None
    """
    if test_data is None or model_pipeline is None:
        print("⚠️  Cannot sample - model or test data not loaded")
        return None

    np.random.seed(seed)

    def _label_to_int(label_str):
        # 'LABEL_<k>' -> k, a bare digit string -> that number, anything else -> 0
        if 'LABEL_' in label_str:
            return int(label_str.split('_')[-1])
        return int(label_str) if label_str.isdigit() else 0

    def _draw(frame):
        # Take at most n_samples rows; an empty group yields an empty frame
        k = min(n_samples, len(frame))
        return frame.sample(n=k, random_state=seed) if k > 0 else pd.DataFrame()

    # Score the full test set once
    print("Getting predictions for all test instances...")
    raw_outputs = model_pipeline(test_data['text'].to_list(), return_all_scores=True)

    # Keep only the highest-scoring class of each instance
    winners = [max(scores, key=lambda entry: entry['score']) for scores in raw_outputs]

    # Work on a copy so the caller's frame stays untouched
    scored = test_data.copy()
    scored['predicted_label'] = [_label_to_int(w['label']) for w in winners]
    scored['predicted_probability'] = [w['score'] for w in winners]
    scored['is_correct'] = scored['predicted_label'] == scored['label']

    hit_mask = scored['is_correct']
    hits = scored[hit_mask]
    misses = scored[~hit_mask]

    print(f"\nCorrect predictions: {len(hits):,}")
    print(f"Incorrect predictions: {len(misses):,}")

    sampled_hits = _draw(hits)
    sampled_misses = _draw(misses)

    # Combine both groups, then shuffle so they are interleaved
    combined = pd.concat([sampled_hits, sampled_misses], ignore_index=True)
    sampled_data = combined.sample(frac=1, random_state=seed).reset_index(drop=True)

    print(f"\n✅ Sampled {len(sampled_data)} instances for explanation analysis")
    print(f"   Correct predictions: {len(sampled_hits)}")
    print(f"   Incorrect predictions: {len(sampled_misses)}")

    return sampled_data

# Draw the instances to explain (lower n_samples if compute is limited)
if test_data is None or pipe is None:
    sampled_data = None
    print("\n⚠️  Cannot sample instances - missing model or test data")
else:
    sampled_data = sample_instances_for_explanation(test_data, pipe, n_samples=50, seed=42)

    # Persist the sample so the explained instances can be inspected later
    sampled_path = os.path.join(explainability_dir, 'sampled_instances.csv')
    sampled_data.to_csv(sampled_path, index=False)
    print(f"\n✅ Sampled data saved to: {sampled_path}")


## SHAP Analysis

Generate SHAP values for token-level importance. SHAP (SHapley Additive exPlanations) calculates the contribution of each token to the model's prediction.


In [None]:
def compute_shap_values(sampled_data, model_pipeline, class_index=1):
    """
    Produce one SHAP attribution per word token for every sampled instance.

    Parameters:
    -----------
    sampled_data : pd.DataFrame
        Instances to explain; reads the 'text', 'label', 'predicted_label',
        'is_correct' and 'predicted_probability' columns
    model_pipeline : pipeline
        HuggingFace text classification pipeline handed to shap.Explainer
    class_index : int
        Class index to explain (0 = Non-Biased, 1 = Biased)

    Returns:
    --------
    shap_results : pd.DataFrame or None
        One row per (instance, token) with the SHAP value in 'value_shap';
        None when either input is None. Instances that raise during
        explanation are reported and skipped.
    """
    if sampled_data is None or model_pipeline is None:
        print("⚠️  Cannot compute SHAP - model or data not loaded")
        return None

    n_instances = len(sampled_data)
    class_name = 'Biased' if class_index == 1 else 'Non-Biased'

    print("=" * 70)
    print("COMPUTING SHAP VALUES")
    print("=" * 70)
    print(f"\nAnalyzing {n_instances} instances...")
    print(f"Class index: {class_index} ({class_name})")

    # The same regex is given to the SHAP masker and used below to list the words.
    # NOTE(review): confirm how shap.maskers.Text interprets a string tokenizer
    # (token pattern vs. split pattern); the positional pairing of words and
    # values below assumes both tokenizations line up.
    word_pattern = r'\b\w+\b'
    text_masker = shap.maskers.Text(tokenizer=word_pattern)
    explainer = shap.Explainer(model_pipeline, text_masker)

    records = []

    for idx, row in sampled_data.iterrows():
        text_input = row['text']

        try:
            explanation = explainer([text_input])

            # Expected layout: [instance][token][class]; keep the requested class
            if hasattr(explanation, 'values'):
                class_values = explanation.values[0, :, class_index]
            else:
                class_values = explanation[:, :, class_index].values[0]

            words = re.findall(word_pattern, text_input)

            # Pair words and values positionally, cutting both to the shorter length
            n_pairs = min(len(words), len(class_values))
            for word, contribution in zip(words[:n_pairs], class_values[:n_pairs]):
                records.append({
                    'sentence_id': idx,
                    'token': word,
                    'value_shap': float(contribution),
                    'sentence': text_input,
                    'predicted_label': row['predicted_label'],
                    'actual_label': row['label'],
                    'is_correct': row['is_correct'],
                    'predicted_probability': row['predicted_probability']
                })

            # Progress line every tenth index value
            if (idx + 1) % 10 == 0:
                print(f"  Processed {idx + 1}/{n_instances} instances...")

        except Exception as e:
            # Best effort: report the failure and move on to the next instance
            print(f"  ⚠️  Error processing instance {idx}: {str(e)[:100]}")
            continue

    shap_results = pd.DataFrame(records)
    print(f"\n✅ SHAP analysis complete!")
    print(f"   Total token explanations: {len(shap_results):,}")

    return shap_results

# Run the SHAP analysis when both the sample and the model are available
if sampled_data is None or pipe is None:
    shap_results = None
    print("\n⚠️  Cannot compute SHAP - missing model or sampled data")
else:
    print("\n⚠️  Note: SHAP computation can be slow. This may take several minutes...")
    shap_results = compute_shap_values(sampled_data, pipe, class_index=1)

    if shap_results is not None:
        # Persist the token-level SHAP values
        shap_path = os.path.join(explainability_dir, 'shap_results.csv')
        shap_results.to_csv(shap_path, index=False)
        print(f"\n✅ SHAP results saved to: {shap_path}")


## LIME Analysis

Generate LIME values for token-level importance. LIME (Local Interpretable Model-agnostic Explanations) creates local explanations by perturbing the input.


In [None]:
def custom_tokenizer(text):
    """
    Split *text* into word tokens for LIME.

    Uses the same word-boundary regex as the SHAP cell, so punctuation and
    whitespace are dropped and every run of word characters becomes a token.
    """
    return [match.group(0) for match in re.finditer(r'\b\w+\b', text)]

def compute_lime_values(sampled_data, model_pipeline, class_index=1, num_samples=100):
    """
    Produce one LIME weight per word token for every sampled instance.

    Parameters:
    -----------
    sampled_data : pd.DataFrame
        Instances to explain; reads the 'text', 'label', 'predicted_label',
        'is_correct' and 'predicted_probability' columns
    model_pipeline : pipeline
        HuggingFace text classification pipeline, called with
        ``return_all_scores=True`` on the perturbed texts
    class_index : int
        Class index to explain (0 = Non-Biased, 1 = Biased)
    num_samples : int
        Number of perturbed samples LIME generates per instance

    Returns:
    --------
    lime_results : pd.DataFrame or None
        One row per (instance, token) with the LIME weight in 'value_lime'
        (0.0 for tokens absent from the explanation); None when either input
        is None. Instances that raise during explanation are reported and skipped.
    """
    if sampled_data is None or model_pipeline is None:
        print("⚠️  Cannot compute LIME - model or data not loaded")
        return None

    n_instances = len(sampled_data)
    class_name = 'Biased' if class_index == 1 else 'Non-Biased'

    print("=" * 70)
    print("COMPUTING LIME VALUES")
    print("=" * 70)
    print(f"\nAnalyzing {n_instances} instances...")
    print(f"Class index: {class_index} ({class_name})")
    print(f"LIME samples per instance: {num_samples}")

    def predict_proba(texts):
        """Return an array of per-class scores, one row per input text."""
        batch = [texts] if isinstance(texts, str) else texts
        outputs = model_pipeline(batch, return_all_scores=True)
        # Columns follow the order in which the pipeline lists the classes
        return np.array([[entry['score'] for entry in per_text] for per_text in outputs])

    # LIME splits text with the same word tokenizer used to list tokens below
    explainer = LimeTextExplainer(
        class_names=['Non-Biased', 'Biased'],
        split_expression=custom_tokenizer
    )

    records = []

    for idx, row in sampled_data.iterrows():
        text_input = row['text']
        words = custom_tokenizer(text_input)

        try:
            exp = explainer.explain_instance(
                text_input,
                predict_proba,
                num_features=len(words),
                num_samples=num_samples,
                labels=[class_index]
            )

            # (token, weight) pairs turned into a lookup table
            weight_by_word = dict(exp.as_list(label=class_index))

            # Emit a row for every token occurrence; unexplained tokens get 0.0
            for word in words:
                records.append({
                    'sentence_id': idx,
                    'token': word,
                    'value_lime': float(weight_by_word.get(word, 0.0)),
                    'sentence': text_input,
                    'predicted_label': row['predicted_label'],
                    'actual_label': row['label'],
                    'is_correct': row['is_correct'],
                    'predicted_probability': row['predicted_probability']
                })

            # Progress line every tenth index value
            if (idx + 1) % 10 == 0:
                print(f"  Processed {idx + 1}/{n_instances} instances...")

        except Exception as e:
            # Best effort: report the failure and move on to the next instance
            print(f"  ⚠️  Error processing instance {idx}: {str(e)[:100]}")
            continue

    lime_results = pd.DataFrame(records)
    print(f"\n✅ LIME analysis complete!")
    print(f"   Total token explanations: {len(lime_results):,}")

    return lime_results

# Run the LIME analysis when both the sample and the model are available
if sampled_data is None or pipe is None:
    lime_results = None
    print("\n⚠️  Cannot compute LIME - missing model or sampled data")
else:
    print("\n⚠️  Note: LIME computation can be slow. This may take several minutes...")
    lime_results = compute_lime_values(sampled_data, pipe, class_index=1, num_samples=100)

    if lime_results is not None:
        # Persist the token-level LIME values
        lime_path = os.path.join(explainability_dir, 'lime_results.csv')
        lime_results.to_csv(lime_path, index=False)
        print(f"\n✅ LIME results saved to: {lime_path}")


## Similarity Metrics

Calculate similarity metrics between SHAP and LIME vectors to assess explanation confidence:
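1. **Cosine Similarity**: Measures the cosine of the angle between the two vectors (1 means they point in the same direction)
2. **Pearson Correlation**: Measures linear correlation
3. **Jensen-Shannon Divergence**: Measures how far apart the two vectors are once each is converted to a probability distribution; the code reports its square root, so lower values mean closer agreement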


In [None]:
def compute_cosine_similarity(vector1, vector2):
    """
    Return the cosine similarity of two vectors (inputs are flattened first).

    CS(φᵢ, βᵢ) = (φᵢ · βᵢ) / (||φᵢ|| ||βᵢ||)

    If either vector has zero norm the similarity is undefined and 0.0 is
    returned instead.
    """
    a = np.asarray(vector1).ravel()
    b = np.asarray(vector2).ravel()

    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)

    # Guard against division by zero for all-zero vectors
    if norm_a == 0 or norm_b == 0:
        return 0.0

    return np.dot(a, b) / (norm_a * norm_b)

def compute_pearson_correlation(vector1, vector2):
    """
    Return the Pearson correlation coefficient of two vectors (flattened first).

    PC(φᵢ, βᵢ) = Cov(φᵢ, βᵢ) / (σ_φᵢ σ_βᵢ)

    Returns 0.0 when the vectors differ in length, hold fewer than two
    elements, or when the coefficient comes back as NaN.
    """
    a = np.asarray(vector1).ravel()
    b = np.asarray(vector2).ravel()

    # A correlation needs two equally long vectors with at least two points
    if len(a) < 2 or len(a) != len(b):
        return 0.0

    coefficient, _p_value = pearsonr(a, b)
    if np.isnan(coefficient):
        return 0.0
    return coefficient

def _to_probability_distribution(vector):
    """Shift *vector* to be non-negative and scale it so it sums to 1.

    A negative minimum is removed by adding its absolute value to every
    entry. If the shifted vector does not have a positive sum, a uniform
    distribution of the same length is returned.
    """
    lowest = np.min(vector)
    if lowest < 0:
        vector = vector + abs(lowest)

    total = np.sum(vector)
    if total > 0:
        return vector / total
    return np.ones_like(vector) / len(vector)


def compute_js_divergence(vector1, vector2):
    """
    Compute the square root of the Jensen-Shannon divergence (natural log)
    between two importance vectors.

    Each vector is flattened, shifted to be non-negative, and normalised to a
    probability distribution before the divergence is evaluated:

    JSD(P||Q) = 0.5 * KL(P||M) + 0.5 * KL(Q||M), where M = 0.5 * (P + Q)

    Parameters:
    -----------
    vector1, vector2 : array-like
        Importance vectors of equal length

    Returns:
    --------
    float
        sqrt(JSD); 0.0 for identical distributions, larger for more
        dissimilar ones. Two empty vectors yield 0.0.

    Raises:
    -------
    ValueError
        If the flattened vectors differ in length
    """
    vector1 = np.asarray(vector1, dtype=float).ravel()
    vector2 = np.asarray(vector2, dtype=float).ravel()

    # Reject mismatched inputs explicitly; otherwise a length-1 vector would be
    # silently broadcast against the longer one
    if vector1.shape != vector2.shape:
        raise ValueError(
            f"vectors must have the same length, got {vector1.size} and {vector2.size}"
        )

    # Nothing to compare (np.min would raise on an empty array)
    if vector1.size == 0:
        return 0.0

    prob1 = _to_probability_distribution(vector1)
    prob2 = _to_probability_distribution(vector2)
    mixture = 0.5 * (prob1 + prob2)

    # Avoid log(0), then renormalise so each still sums to 1
    epsilon = 1e-10
    prob1 = prob1 + epsilon
    prob2 = prob2 + epsilon
    mixture = mixture + epsilon

    prob1 = prob1 / np.sum(prob1)
    prob2 = prob2 / np.sum(prob2)
    mixture = mixture / np.sum(mixture)

    kl_pm = np.sum(prob1 * np.log(prob1 / mixture))
    kl_qm = np.sum(prob2 * np.log(prob2 / mixture))

    jsd = 0.5 * kl_pm + 0.5 * kl_qm

    # Rounding can leave a tiny negative value for near-identical inputs,
    # which would turn the square root into NaN; clamp at zero first
    return float(np.sqrt(max(jsd, 0.0)))  # Square root, as in the paper

# Compute similarity metrics
# Empty result frames (e.g. every instance failed) are treated like missing
# results, because they lack the columns selected below.
if (
    shap_results is not None and lime_results is not None
    and not shap_results.empty and not lime_results.empty
):
    print("=" * 70)
    print("COMPUTING SIMILARITY METRICS")
    print("=" * 70)

    # Merge SHAP and LIME results
    merge_cols = ['sentence_id', 'token', 'sentence', 'predicted_label', 'actual_label', 'is_correct']

    # Number each token by its position within its sentence and join on that
    # position as well. Joining on the token text alone would pair every
    # occurrence of a repeated word with every other occurrence (k occurrences
    # -> k*k rows), inflating the vectors and distorting all three metrics.
    shap_tokens = shap_results[merge_cols + ['value_shap']].assign(
        token_position=shap_results.groupby('sentence_id').cumcount()
    )
    lime_tokens = lime_results[merge_cols + ['value_lime']].assign(
        token_position=lime_results.groupby('sentence_id').cumcount()
    )
    merged_df = pd.merge(
        shap_tokens,
        lime_tokens,
        on=merge_cols + ['token_position'],
        how='inner'
    )

    print(f"\nMerged {len(merged_df):,} token-level explanations")

    # Compute similarity metrics per sentence (groups kept in first-seen order)
    sentence_similarities = []

    for sentence_id, sentence_data in merged_df.groupby('sentence_id', sort=False):
        shap_vector = sentence_data['value_shap'].values
        lime_vector = sentence_data['value_lime'].values

        # Compute metrics
        cosine_sim = compute_cosine_similarity(shap_vector, lime_vector)
        pearson_corr = compute_pearson_correlation(shap_vector, lime_vector)
        js_div = compute_js_divergence(shap_vector, lime_vector)

        # Sentence-level metadata is identical on every row of the group
        row = sentence_data.iloc[0]

        sentence_similarities.append({
            'sentence_id': sentence_id,
            'sentence': row['sentence'],
            'predicted_label': row['predicted_label'],
            'actual_label': row['actual_label'],
            'is_correct': row['is_correct'],
            'cosine_similarity': cosine_sim,
            'pearson_correlation': pearson_corr,
            'jensen_shannon_divergence': js_div,
            'num_tokens': len(sentence_data)
        })

    # Name the columns explicitly so they exist even when nothing was merged
    similarity_columns = [
        'sentence_id', 'sentence', 'predicted_label', 'actual_label', 'is_correct',
        'cosine_similarity', 'pearson_correlation', 'jensen_shannon_divergence',
        'num_tokens'
    ]
    similarity_df = pd.DataFrame(sentence_similarities, columns=similarity_columns)

    print(f"\n✅ Computed similarity metrics for {len(similarity_df)} sentences")
    print(f"\nSimilarity Statistics:")
    print(f"  Cosine Similarity: {similarity_df['cosine_similarity'].mean():.4f} ± {similarity_df['cosine_similarity'].std():.4f}")
    print(f"  Pearson Correlation: {similarity_df['pearson_correlation'].mean():.4f} ± {similarity_df['pearson_correlation'].std():.4f}")
    print(f"  Jensen-Shannon Divergence: {similarity_df['jensen_shannon_divergence'].mean():.4f} ± {similarity_df['jensen_shannon_divergence'].std():.4f}")

    # Save results
    similarity_path = os.path.join(explainability_dir, 'sentence_similarity_metrics.csv')
    similarity_df.to_csv(similarity_path, index=False)
    print(f"\n✅ Similarity metrics saved to: {similarity_path}")

else:
    similarity_df = None
    print("\n⚠️  Cannot compute similarity metrics - missing SHAP or LIME results")


## Visualization: Token Importance and Explanation Confidence

Visualize token-level importance and explanation confidence scores.


In [None]:
def visualize_token_importance(shap_results, lime_results, sentence_id, top_n=10):
    """
    Plot SHAP and LIME values side by side for the most important tokens of
    one sentence, save the figure as a PNG in the explainability directory,
    and print the sentence with its top tokens.

    Parameters:
    -----------
    shap_results : pd.DataFrame
        SHAP results (one row per token, with 'sentence_id', 'token',
        'value_shap', 'sentence', 'predicted_label', 'actual_label')
    lime_results : pd.DataFrame
        LIME results (one row per token, with 'sentence_id', 'token',
        'value_lime')
    sentence_id : int
        Sentence ID to visualize
    top_n : int
        Number of top tokens (ranked by absolute SHAP value) to display

    Returns:
    --------
    None. Prints a warning and returns early when results are missing or the
    sentence has no rows in either frame.
    """
    if shap_results is None or lime_results is None:
        print("⚠️  Cannot visualize - missing SHAP or LIME results")
        return

    # Get data for this sentence
    shap_sent = shap_results[shap_results['sentence_id'] == sentence_id].copy()
    lime_sent = lime_results[lime_results['sentence_id'] == sentence_id].copy()

    if len(shap_sent) == 0 or len(lime_sent) == 0:
        print(f"⚠️  No data found for sentence_id {sentence_id}")
        return

    # Join on the token and its position in the sentence. Joining on the token
    # text alone would cross-multiply every repeated word, so a word occurring
    # k times would appear k*k times and crowd out the ranking below.
    shap_tokens = shap_sent[['token', 'value_shap']].assign(
        token_position=np.arange(len(shap_sent))
    )
    lime_tokens = lime_sent[['token', 'value_lime']].assign(
        token_position=np.arange(len(lime_sent))
    )
    merged = pd.merge(shap_tokens, lime_tokens,
                      on=['token', 'token_position'], how='inner')

    # Get top tokens by absolute SHAP value
    merged['abs_shap'] = merged['value_shap'].abs()
    top_tokens = merged.nlargest(top_n, 'abs_shap')

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

    # Plot 1: SHAP values (red = negative contribution, blue = non-negative)
    colors_shap = ['red' if x < 0 else 'blue' for x in top_tokens['value_shap']]
    ax1.barh(range(len(top_tokens)), top_tokens['value_shap'], color=colors_shap, alpha=0.7)
    ax1.set_yticks(range(len(top_tokens)))
    ax1.set_yticklabels(top_tokens['token'], fontsize=10)
    ax1.set_xlabel('SHAP Value', fontsize=12)
    ax1.set_title(f'Top {top_n} Tokens by SHAP Importance\n(Sentence ID: {sentence_id})', fontsize=14)
    ax1.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
    ax1.grid(axis='x', alpha=0.3)

    # Plot 2: LIME values for the same tokens, in the same order
    colors_lime = ['red' if x < 0 else 'blue' for x in top_tokens['value_lime']]
    ax2.barh(range(len(top_tokens)), top_tokens['value_lime'], color=colors_lime, alpha=0.7)
    ax2.set_yticks(range(len(top_tokens)))
    ax2.set_yticklabels(top_tokens['token'], fontsize=10)
    ax2.set_xlabel('LIME Value', fontsize=12)
    ax2.set_title(f'Top {top_n} Tokens by LIME Importance\n(Sentence ID: {sentence_id})', fontsize=14)
    ax2.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
    ax2.grid(axis='x', alpha=0.3)

    plt.tight_layout()

    # Save figure
    fig_path = os.path.join(explainability_dir, f'token_importance_sentence_{sentence_id}.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight')
    print(f"✅ Saved visualization to: {fig_path}")

    plt.show()

    # Print sentence and metrics
    sentence_text = shap_sent.iloc[0]['sentence']
    print(f"\nSentence: {sentence_text[:200]}...")
    print(f"Predicted: {shap_sent.iloc[0]['predicted_label']}, Actual: {shap_sent.iloc[0]['actual_label']}")
    print(f"\nTop {top_n} Important Tokens:")
    print(top_tokens[['token', 'value_shap', 'value_lime']].to_string(index=False))

# Plot a couple of illustrative sentences
if shap_results is None or lime_results is None or similarity_df is None:
    print("\n⚠️  Cannot visualize - missing results")
else:
    print("\n" + "=" * 70)
    print("VISUALIZING TOKEN IMPORTANCE")
    print("=" * 70)

    if len(similarity_df) > 0:
        # Example 1: a correct prediction whose SHAP/LIME cosine similarity
        # lies above the median of all sentences
        correct_high_conf = similarity_df[
            (similarity_df['is_correct'] == True) &
            (similarity_df['cosine_similarity'] > similarity_df['cosine_similarity'].median())
        ]
        if len(correct_high_conf):
            example_id = correct_high_conf.iloc[0]['sentence_id']
            print(f"\nExample 1: High-confidence correct prediction (Sentence ID: {example_id})")
            visualize_token_importance(shap_results, lime_results, example_id, top_n=10)

        # Example 2: the first misclassified sentence
        incorrect = similarity_df[similarity_df['is_correct'] == False]
        if len(incorrect):
            example_id = incorrect.iloc[0]['sentence_id']
            print(f"\nExample 2: Incorrect prediction (Sentence ID: {example_id})")
            visualize_token_importance(shap_results, lime_results, example_id, top_n=10)


## Explanation Confidence Analysis

Analyze the relationship between explanation confidence (similarity between SHAP and LIME) and prediction correctness.


In [None]:
def analyze_explanation_confidence(similarity_df):
    """
    Compare SHAP/LIME agreement between correct and incorrect predictions.

    Prints mean ± std of each similarity metric per group, draws one
    histogram per metric (saved as a PNG in the explainability directory),
    and writes a grouped summary table to CSV.

    Parameters:
    -----------
    similarity_df : pd.DataFrame
        Per-sentence similarity metrics with an 'is_correct' column;
        nothing is done when it is None or empty
    """
    if similarity_df is None or len(similarity_df) == 0:
        print("⚠️  Cannot analyze - missing similarity data")
        return

    metrics = ['cosine_similarity', 'pearson_correlation', 'jensen_shannon_divergence']
    metric_names = ['Cosine Similarity', 'Pearson Correlation', 'Jensen-Shannon Divergence']

    print("=" * 70)
    print("EXPLANATION CONFIDENCE ANALYSIS")
    print("=" * 70)

    # Split the sentences by prediction correctness
    correct = similarity_df[similarity_df['is_correct'] == True]
    incorrect = similarity_df[similarity_df['is_correct'] == False]

    # Same report for both groups: count, then mean ± std of every metric
    for heading, group in (("Correct Predictions", correct), ("Incorrect Predictions", incorrect)):
        print(f"\n{heading}: {len(group)}")
        for metric, name in zip(metrics, metric_names):
            print(f"  {name}: {group[metric].mean():.4f} ± {group[metric].std():.4f}")

    # One panel per metric with the two groups overlaid
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    for ax, metric, name in zip(axes, metrics, metric_names):
        ax.hist(correct[metric].values, alpha=0.6, label='Correct', bins=20, color='green')
        ax.hist(incorrect[metric].values, alpha=0.6, label='Incorrect', bins=20, color='red')
        ax.set_xlabel(name, fontsize=11)
        ax.set_ylabel('Frequency', fontsize=11)
        ax.set_title(f'{name} Distribution', fontsize=12)
        ax.legend()
        ax.grid(alpha=0.3)

    plt.tight_layout()

    # Write the figure to disk before displaying it
    fig_path = os.path.join(explainability_dir, 'explanation_confidence_analysis.png')
    plt.savefig(fig_path, dpi=300, bbox_inches='tight')
    print(f"\n✅ Saved confidence analysis to: {fig_path}")

    plt.show()

    # Mean and std of every metric, grouped by correctness
    aggregations = {metric: ['mean', 'std'] for metric in metrics}
    summary_stats = similarity_df.groupby('is_correct').agg(aggregations).round(4)

    print("\n" + "=" * 70)
    print("SUMMARY STATISTICS")
    print("=" * 70)
    print(summary_stats)

    # Persist the summary table
    summary_path = os.path.join(explainability_dir, 'explanation_confidence_summary.csv')
    summary_stats.to_csv(summary_path)
    print(f"\n✅ Summary statistics saved to: {summary_path}")

# Run the confidence analysis once similarity metrics exist
if similarity_df is None:
    print("\n⚠️  Cannot analyze confidence - missing similarity data")
else:
    analyze_explanation_confidence(similarity_df)


## Summary and Results

Generate a summary report of the token-level explanation analysis.


In [None]:
# Generate summary report
if similarity_df is not None and shap_results is not None and lime_results is not None:
    print("=" * 70)
    print("TOKEN-LEVEL EXPLANATION ANALYSIS SUMMARY")
    print("=" * 70)

    print(f"\nDataset: Gender Bias Detection in Job Descriptions")
    print(f"Model: {model_name}")
    print(f"Total instances analyzed: {len(similarity_df):,}")
    print(f"Total token explanations: {len(shap_results):,}")

    print(f"\nOverall Similarity Metrics:")
    print(f"  Cosine Similarity: {similarity_df['cosine_similarity'].mean():.4f} ± {similarity_df['cosine_similarity'].std():.4f}")
    print(f"  Pearson Correlation: {similarity_df['pearson_correlation'].mean():.4f} ± {similarity_df['pearson_correlation'].std():.4f}")
    print(f"  Jensen-Shannon Divergence: {similarity_df['jensen_shannon_divergence'].mean():.4f} ± {similarity_df['jensen_shannon_divergence'].std():.4f}")

    # Share of correct predictions among the explained instances.
    # These instances were drawn separately from the correct and incorrect
    # predictions (up to n_samples each), so this figure reflects the sampling
    # design, not the model's accuracy on the test set; the label says so.
    accuracy = similarity_df['is_correct'].mean() * 100
    print(f"\nPrediction Accuracy (explained sample, not the full test set): {accuracy:.2f}%")

    # Explanation confidence by prediction correctness
    correct_conf = similarity_df[similarity_df['is_correct'] == True]['cosine_similarity'].mean()
    incorrect_conf = similarity_df[similarity_df['is_correct'] == False]['cosine_similarity'].mean()

    print(f"\nExplanation Confidence (Cosine Similarity):")
    print(f"  Correct predictions: {correct_conf:.4f}")
    print(f"  Incorrect predictions: {incorrect_conf:.4f}")
    print(f"  Difference: {correct_conf - incorrect_conf:.4f}")

    print(f"\n✅ All results saved to: {explainability_dir}")
    print(f"   - SHAP results: shap_results.csv")
    print(f"   - LIME results: lime_results.csv")
    print(f"   - Similarity metrics: sentence_similarity_metrics.csv")
    print(f"   - Visualizations: token_importance_sentence_*.png")
    print(f"   - Confidence analysis: explanation_confidence_analysis.png")

else:
    print("\n⚠️  Cannot generate summary - missing analysis results")
    print("   Please run the SHAP and LIME analysis cells above first")
