--> To put content in a collapsible format, use the following snippet in a Markdown cell:

<details>
<summary style="cursor: pointer">
<b> double click the markdown to see the code instead of this </b>
</summary>

# 3.6 Neural Network Specific Methods
-- Gradient & attention-based importance
- Integrated Gradients
- Saliency Maps / Grad-CAM
- Attention Weights (NLP/Transformers)
- DeepLIFT
- DeepSHAP
- Layer-wise Relevance Propagation (LRP)

----------

## 3.6.1 Integrated Gradients

<details>
<summary style="cursor: pointer">
<h2> { Understanding Integrated Gradients } </h2>
</summary>
<h3> What is Integrated Gradients? </h3>
<p> Integrated Gradients is a technique to attribute a model's prediction to its input features by integrating gradients along a path from a baseline input to the actual input.</p>
<h3> Its role in Deep Learning Interpretability: </h3>
<ul>
    <li> Applicable to any differentiable model; it needs access to gradients, so it is not fully model-agnostic.</li>
    <li> Helps attribute importance scores to input features like image pixels or text tokens.</li>
</ul>
<h3> Resources: </h3>
<ol>
    <li><a href="https://arxiv.org/abs/1703.01365" target="_blank">Original Paper: Axiomatic Attribution for Deep Networks</a></li>
    <li><a href="https://captum.ai/api/integrated_gradients.html" target="_blank">Captum (PyTorch) API</a></li>
</ol>
</details>

##### Parameters:
- model: A trained TensorFlow/Keras model (the implementation below relies on `tf.GradientTape`)
- X: Features (DataFrame or numpy array) to explain
- baseline: Baseline input to start integrating from (default: zero baseline)
- target_label: Target output index/class for explanation (classification) or None (regression)
- m_steps: Number of interpolation steps between baseline and input
- batch_size: Batch size for gradient calculation
- scale_data: Whether to standardize features before applying IG
- show_plot: Whether to plot the integrated gradients per feature
- plot_size: Tuple indicating plot size (width, height)
- random_state: Random seed for reproducibility

##### Returns:
- DataFrame of feature attributions (importance scores)
- Displays feature attribution bar plot if show_plot=True

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

def integrated_gradients(model,
                          X,
                          baseline=None,
                          target_label=None,
                          m_steps=50,
                          batch_size=32,
                          scale_data=True,
                          show_plot=True,
                          plot_size=(12, 8),
                          random_state=42):
    """Rank tabular features by their mean absolute Integrated Gradients.

    For every sample the gradients of the selected model output are averaged
    along the straight path from the baseline to the sample and multiplied by
    ``(sample - baseline)``. The per-sample attributions are then reduced to a
    single score per feature with a mean of absolute values.

    Args:
        model: Callable TensorFlow/Keras model taking a ``(n, n_features)``
            float32 tensor.
        X: 2-D features (DataFrame or array-like) to explain.
        baseline: Reference input. ``None`` means an all-zero baseline. A
            DataFrame is treated as raw feature values and is standardised
            with the scaler fitted on ``X`` when ``scale_data`` is true. Any
            other array-like is used as given (already in model-input space).
            A single row is broadcast to every sample.
        target_label: Output column to explain; ``None`` selects column 0.
        m_steps: Number of intervals on the integration path (>= 1).
        batch_size: Number of samples whose paths are differentiated at once.
        scale_data: Standardise ``X`` with ``StandardScaler`` first.
        show_plot: Draw a horizontal bar chart of the scores.
        plot_size: Figure size ``(width, height)``.
        random_state: Seed applied to the global TensorFlow and NumPy RNGs.

    Returns:
        DataFrame with columns ``Feature`` and ``Attribution`` sorted by
        descending attribution.

    Raises:
        ValueError: If ``m_steps`` or ``batch_size`` is < 1, or ``X`` is empty.
    """
    if m_steps < 1:
        raise ValueError("m_steps must be at least 1.")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    if isinstance(X, pd.DataFrame):
        feature_names = X.columns.tolist()
        X_processed = X
    else:
        X_processed = np.asarray(X)
        feature_names = [f"Feature_{i}" for i in range(X_processed.shape[1])]

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    scaler = None
    if scale_data:
        scaler = StandardScaler()
        X_processed = scaler.fit_transform(X_processed)

    inputs = tf.convert_to_tensor(X_processed, dtype=tf.float32)
    n_samples, n_features = inputs.shape[0], inputs.shape[1]
    if n_samples == 0:
        raise ValueError("X must contain at least one sample.")

    # Remember this before building the tensor: the plot label needs it.
    baseline_provided = baseline is not None
    if baseline_provided:
        if isinstance(baseline, pd.DataFrame):
            # Only standardise when a scaler exists; previously this raised
            # NameError with scale_data=False.
            baseline_values = (scaler.transform(baseline)
                               if scaler is not None
                               else baseline.to_numpy())
        else:
            baseline_values = baseline
        baseline_values = np.asarray(baseline_values, dtype=np.float32)
        # Lets a single reference row (1-D or (1, n_features)) serve all rows.
        baseline_values = np.broadcast_to(baseline_values,
                                          (n_samples, n_features))
        baseline_tensor = tf.convert_to_tensor(
            np.ascontiguousarray(baseline_values), dtype=tf.float32)
    else:
        baseline_tensor = tf.zeros_like(inputs)

    target_index = 0 if target_label is None else target_label

    # Shape (m_steps + 1, 1, 1) so it broadcasts over (batch, features).
    alphas = tf.linspace(0.0, 1.0, m_steps + 1)[:, tf.newaxis, tf.newaxis]

    def compute_gradients(points):
        with tf.GradientTape() as tape:
            tape.watch(points)
            predictions = model(points)
            if predictions.shape.rank == 1:
                outputs = predictions
            else:
                outputs = predictions[:, target_index]
        return tape.gradient(outputs, points)

    integrated_grads_list = []

    for start in range(0, n_samples, batch_size):
        input_batch = inputs[start:start + batch_size]
        baseline_batch = baseline_tensor[start:start + batch_size]
        delta = input_batch - baseline_batch

        # (m_steps + 1, batch, features): one path per sample.
        path = baseline_batch[tf.newaxis] + alphas * delta[tf.newaxis]
        grads = compute_gradients(tf.reshape(path, [-1, n_features]))
        # -1 keeps the real batch length, so a short final batch works.
        grads = tf.reshape(grads, [m_steps + 1, -1, n_features])

        # Trapezoidal rule over the path.
        avg_grads = tf.reduce_mean((grads[:-1] + grads[1:]) / 2.0, axis=0)
        integrated_grads_list.append((delta * avg_grads).numpy())

    integrated_grads_all = np.vstack(integrated_grads_list)
    feature_importance = np.mean(np.abs(integrated_grads_all), axis=0)

    attributions_df = pd.DataFrame({
        'Feature': feature_names,
        'Attribution': feature_importance
    }).sort_values('Attribution', ascending=False)

    if show_plot:
        plt.figure(figsize=plot_size)
        # Ascending order puts the largest bar at the top of a barh chart.
        attributions_df_sorted = attributions_df.sort_values('Attribution', ascending=True)

        colors = plt.cm.plasma(np.linspace(0.2, 1, len(attributions_df_sorted)))
        bars = plt.barh(attributions_df_sorted['Feature'],
                        attributions_df_sorted['Attribution'],
                        color=colors,
                        alpha=0.9)

        plt.xlabel('Attribution Magnitude', fontsize=12)
        plt.title('Integrated Gradients Feature Importance', fontsize=14, pad=20)
        plt.grid(axis='x', linestyle='--', alpha=0.3)

        for bar in bars:
            width = bar.get_width()
            plt.text(width + 0.0005,
                     bar.get_y() + bar.get_height() / 2,
                     f'{width:.4f}',
                     va='center',
                     fontsize=9)

        method_text = (
            f"Method: Integrated Gradients\n"
            f"Steps: {m_steps}\n"
            f"Baseline: {'Provided' if baseline_provided else 'Zero'}"
        )
        plt.annotate(method_text,
                     xy=(0.98, 0.02),
                     xycoords='axes fraction',
                     ha='right',
                     va='bottom',
                     fontsize=9,
                     bbox=dict(boxstyle='round', alpha=0.1))

        plt.tight_layout()
        plt.show()

    return attributions_df

----------

## 3.6.2 Saliency Maps / Grad-CAM

<details>
<summary style="cursor: pointer">
<h2> { Understanding Saliency Maps & Grad-CAM } </h2>
</summary>
<h3> What are Saliency Maps and Grad-CAM? </h3>
<p> Saliency Maps use gradients to highlight input features (e.g., image pixels) that influence the output. Grad-CAM improves this by using gradients flowing into convolutional layers to produce heatmaps.</p>
<h3> Their role in CNN Interpretability: </h3>
<ul>
    <li> Highlight important regions in an image for a specific class.</li>
    <li> Grad-CAM is class-discriminative and visually intuitive.</li>
</ul>
<h3> Resources: </h3>
<ol>
    <li><a href="https://arxiv.org/abs/1610.02391" target="_blank">Grad-CAM Paper</a></li>
    <li><a href="https://keras.io/examples/vision/grad_cam/" target="_blank">Keras Grad-CAM Example</a></li>
</ol>
</details>

##### Parameters:
- model: A trained deep learning model (TensorFlow/Keras)
- X: Input features (DataFrame, numpy array, or Tensor) for which saliency is computed
- target_layer_name: Name of the convolutional layer to compute Grad-CAM (for CNNs)
- target_label: Index of the output/class to explain (classification), or None (regression)
- method: 'saliency' for vanilla gradients, 'gradcam' for Grad-CAM
- scale_data: Whether to standardize inputs
- show_plot: Whether to visualize feature saliency
- plot_size: Tuple indicating plot size (width, height)
- random_state: Random seed for reproducibility

##### Returns:
- DataFrame of saliency scores (feature importance)
- Displays a saliency heatmap/bar plot if show_plot=True

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

def saliency_feature_importance(model,
                                 X,
                                 target_layer_name=None,
                                 target_label=None,
                                 method='saliency',
                                 scale_data=True,
                                 show_plot=True,
                                 plot_size=(12, 8),
                                 random_state=42):
    """Score inputs with vanilla gradient saliency or Grad-CAM.

    ``method='saliency'`` returns the mean absolute gradient of the selected
    output with respect to each input element. ``method='gradcam'`` builds a
    Grad-CAM heatmap per sample for ``target_layer_name``, rectifies and
    max-normalises it, and averages the flattened heatmaps over samples.

    Args:
        model: Trained TensorFlow/Keras model.
        X: Inputs (DataFrame, array or tensor); the first axis is the sample
            axis.
        target_layer_name: Layer whose output is used for Grad-CAM. Required
            when ``method='gradcam'``.
        target_label: Output column to explain; ``None`` selects column 0.
        method: ``'saliency'`` or ``'gradcam'``.
        scale_data: Standardise every input element across samples. Inputs
            with more than two dimensions are flattened per sample for the
            scaler and restored afterwards. Set to ``False`` if the model
            expects raw inputs.
        show_plot: Draw a horizontal bar chart of the scores.
        plot_size: Figure size ``(width, height)``.
        random_state: Seed applied to the global TensorFlow and NumPy RNGs.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance`` sorted by
        descending importance. For Grad-CAM the rows are named
        ``Activation_<i>`` (flattened heatmap positions).

    Raises:
        ValueError: For an unknown ``method`` or a missing
            ``target_layer_name`` with Grad-CAM.
    """
    # Validate before any expensive work.
    if method not in ('saliency', 'gradcam'):
        raise ValueError("Method must be either 'saliency' or 'gradcam'.")
    if method == 'gradcam' and target_layer_name is None:
        raise ValueError("For Grad-CAM, 'target_layer_name' must be specified.")

    if isinstance(X, pd.DataFrame):
        feature_names = X.columns.tolist()
        X_processed = X.to_numpy()
    else:
        X_processed = np.asarray(X)
        feature_names = [f"Feature_{i}" for i in range(X_processed.shape[1])]

    tf.random.set_seed(random_state)
    np.random.seed(random_state)

    if scale_data:
        # StandardScaler only accepts 2-D data, so flatten each sample first;
        # without this, image inputs for Grad-CAM could not be scaled.
        original_shape = X_processed.shape
        flat = X_processed.reshape(original_shape[0], -1)
        X_processed = StandardScaler().fit_transform(flat).reshape(original_shape)

    inputs = tf.convert_to_tensor(X_processed, dtype=tf.float32)
    target_index = 0 if target_label is None else target_label

    if method == 'saliency':
        with tf.GradientTape() as tape:
            tape.watch(inputs)
            preds = model(inputs)
            score = preds[:, target_index]
        grads = tape.gradient(score, inputs)
        saliency_scores = tf.reduce_mean(tf.abs(grads), axis=0).numpy().reshape(-1)
        if len(feature_names) != saliency_scores.size:
            # N-D inputs: one generic name per flattened input element.
            feature_names = [f"Feature_{i}" for i in range(saliency_scores.size)]
    else:
        # Build the two-output model once, outside any traced function.
        grad_model = tf.keras.models.Model(
            inputs=model.inputs,
            outputs=[model.get_layer(target_layer_name).output, model.output]
        )

        heatmaps = []
        for i in range(inputs.shape[0]):
            sample = inputs[i:i + 1]
            with tf.GradientTape() as tape:
                conv_outputs, predictions = grad_model(sample)
                score = predictions[:, target_index]
            grads = tape.gradient(score, conv_outputs)

            # Pool over every axis between batch and channels, so 1-D and
            # 2-D convolutional layers are both handled.
            pool_axes = list(range(1, conv_outputs.shape.rank - 1))
            pooled_grads = tf.reduce_mean(grads, axis=pool_axes)

            heatmap = tf.reduce_sum(conv_outputs[0] * pooled_grads[0], axis=-1).numpy()
            # Rectify first, then normalise by the rectified peak; an
            # all-non-positive map stays zero instead of becoming NaN.
            heatmap = np.maximum(heatmap, 0)
            peak = heatmap.max()
            if peak > 0:
                heatmap = heatmap / peak
            heatmaps.append(heatmap.flatten())

        saliency_scores = np.mean(np.array(heatmaps), axis=0)
        feature_names = [f"Activation_{i}" for i in range(len(saliency_scores))]

    saliency_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': saliency_scores
    }).sort_values('Importance', ascending=False)

    if show_plot:
        plt.figure(figsize=plot_size)
        # Ascending order puts the largest bar at the top of a barh chart.
        saliency_df_sorted = saliency_df.sort_values('Importance', ascending=True)

        colors = plt.cm.inferno(np.linspace(0.2, 1, len(saliency_df_sorted)))
        bars = plt.barh(saliency_df_sorted['Feature'],
                        saliency_df_sorted['Importance'],
                        color=colors,
                        alpha=0.9)

        plt.xlabel('Saliency Magnitude', fontsize=12)
        plt.title(f'{method.upper()} Feature Importance', fontsize=14, pad=20)
        plt.grid(axis='x', linestyle='--', alpha=0.3)

        for bar in bars:
            width = bar.get_width()
            plt.text(width + 0.0005,
                     bar.get_y() + bar.get_height() / 2,
                     f'{width:.4f}',
                     va='center',
                     fontsize=9)

        method_text = (
            f"Method: {method.upper()}\n"
            f"Target Layer: {target_layer_name if target_layer_name else 'Input Layer'}"
        )
        plt.annotate(method_text,
                     xy=(0.98, 0.02),
                     xycoords='axes fraction',
                     ha='right',
                     va='bottom',
                     fontsize=9,
                     bbox=dict(boxstyle='round', alpha=0.1))

        plt.tight_layout()
        plt.show()

    return saliency_df

----------

## 3.6.3 Attention Weights (NLP/Transformers)

<details>
<summary style="cursor: pointer">
<h2> { Understanding Attention Weights in Transformers } </h2>
</summary>
<h3> What are Attention Weights? </h3>
<p> Attention weights determine how much each input token contributes to the representation of other tokens. In Transformers, they are central to how context is understood.</p>
<h3> Their role in Interpretability: </h3>
<ul>
    <li> Show which parts of the input the model is focusing on at each layer/head.</li>
    <li> Can be visualized as heatmaps to understand token-to-token influence.</li>
</ul>
<h3> Resources: </h3>
<ol>
    <li><a href="https://jalammar.github.io/illustrated-transformer/" target="_blank">The Illustrated Transformer (By Jay Alammar)</a></li>
    <li><a href="https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions" target="_blank">Hugging Face Outputs</a></li>
</ol>
</details>

##### Parameters:
- model: A trained Hugging Face Transformer model running on PyTorch that can return attentions (`output_attentions=True`)
- tokenizer: Tokenizer used for the model to process raw text
- text_inputs: List of text inputs (or a single string) to analyze; the whole list is encoded, but only the first text is reported
- layer: Transformer layer index to extract attention from (default last layer)
- head: Attention head index to extract attention from (optional, defaults to aggregating over heads)
- aggregation: How to aggregate over heads when `head` is None ('mean' or 'max'); ignored when `head` is given
- target_label: Reserved for a specific class index; currently unused by the implementation
- show_plot: Whether to plot the attention importance
- plot_size: Tuple indicating plot size (width, height)
- random_state: Random seed for reproducibility

##### Returns:
- DataFrame of token-level attention scores (feature importance)
- Displays a token-level attention bar plot if show_plot=True

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModel

def attention_feature_importance(model,
                                  tokenizer,
                                  text_inputs,
                                  layer=-1,
                                  head=None,
                                  aggregation='mean',
                                  target_label=None,
                                  show_plot=True,
                                  plot_size=(14, 6),
                                  random_state=42):
    """Score the tokens of the first input text by the attention they receive.

    The texts are tokenised as one padded batch and run through ``model`` with
    ``output_attentions=True``. For the chosen layer, the attention matrix of
    the first text is reduced over heads (or one head is selected), restricted
    to non-padding positions, and averaged over query positions. Each token's
    score is therefore the mean attention it receives from the real tokens.

    Args:
        model: PyTorch model that accepts the tokenizer output as keyword
            arguments and returns an object with an ``attentions`` sequence.
        tokenizer: Callable tokenizer that also provides
            ``convert_ids_to_tokens``.
        text_inputs: A string or a list of strings. Only the first text is
            reported.
        layer: Index into ``outputs.attentions``.
        head: Head index to use; ``None`` aggregates over all heads.
        aggregation: ``'mean'`` or ``'max'`` over heads, used when ``head``
            is ``None``.
        target_label: Accepted for interface symmetry; not used.
        show_plot: Draw a horizontal bar chart of the scores.
        plot_size: Figure size ``(width, height)``.
        random_state: Seed applied to the global NumPy and PyTorch RNGs.

    Returns:
        DataFrame with columns ``Token`` and ``Importance`` in token order,
        without padding tokens.

    Raises:
        ValueError: If ``aggregation`` is unsupported (with ``head=None``) or
            the model returns no attention weights.
    """
    if isinstance(text_inputs, str):
        text_inputs = [text_inputs]

    np.random.seed(random_state)
    torch.manual_seed(random_state)

    model.eval()

    encoded = tokenizer(text_inputs,
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        return_attention_mask=True)

    # Keep inputs on the same device as the model when it exposes one.
    device = getattr(model, 'device', None)
    if device is not None:
        encoded = {key: value.to(device) for key, value in encoded.items()}

    with torch.no_grad():
        outputs = model(**encoded, output_attentions=True)

    attentions = outputs.attentions  # per layer: (batch, heads, tokens, tokens)
    if attentions is None:
        raise ValueError("The model did not return attention weights.")
    selected_attention = attentions[layer]

    if head is not None:
        attention_values = selected_attention[:, head, :, :]
    elif aggregation == 'mean':
        attention_values = selected_attention.mean(dim=1)
    elif aggregation == 'max':
        attention_values = selected_attention.max(dim=1).values
    else:
        raise ValueError("Unsupported aggregation method.")

    input_ids = encoded['input_ids'][0]
    attention_mask = encoded.get('attention_mask')
    if attention_mask is None:
        real_positions = torch.ones_like(input_ids, dtype=torch.bool)
    else:
        real_positions = attention_mask[0].bool()

    # Drop padding on both axes: padded queries would dilute the average,
    # and padded keys are not tokens of the text.
    first_text_attention = attention_values[0][real_positions][:, real_positions]
    attention_scores = first_text_attention.mean(dim=0).cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[real_positions].tolist())

    attention_df = pd.DataFrame({
        'Token': tokens,
        'Importance': attention_scores
    })

    if show_plot:
        plt.figure(figsize=plot_size)
        colors = plt.cm.plasma(np.linspace(0.3, 1, len(attention_df)))

        # Plot by position: string categories would merge repeated tokens
        # onto a single bar.
        positions = np.arange(len(attention_df))
        bars = plt.barh(positions,
                        attention_df['Importance'],
                        color=colors,
                        alpha=0.9)
        plt.yticks(positions, attention_df['Token'])
        plt.gca().invert_yaxis()  # first token at the top

        plt.xlabel('Attention Weight', fontsize=12)
        plt.title('Transformer Attention-Based Feature Importance', fontsize=14, pad=20)
        plt.grid(axis='x', linestyle='--', alpha=0.3)

        for bar in bars:
            width = bar.get_width()
            plt.text(width + 0.0005,
                     bar.get_y() + bar.get_height() / 2,
                     f'{width:.4f}',
                     va='center',
                     fontsize=8)

        if head is not None:
            # Aggregation is not applied when a single head is selected.
            head_text = f"Head: {head}"
        else:
            head_text = f"Aggregation: {aggregation.upper()}"
        method_text = (
            f"Method: Attention Weights\n"
            f"Layer: {layer}\n"
            f"{head_text}"
        )
        plt.annotate(method_text,
                     xy=(0.98, 0.02),
                     xycoords='axes fraction',
                     ha='right',
                     va='bottom',
                     fontsize=9,
                     bbox=dict(boxstyle='round', alpha=0.1))

        plt.tight_layout()
        plt.show()

    return attention_df

----------

## 3.6.4 DeepLIFT

<details>
<summary style="cursor: pointer">
<h2> { Understanding DeepLIFT } </h2>
</summary>
<h3> What is DeepLIFT? </h3>
<p> DeepLIFT assigns contribution scores by comparing each neuron's activation to a reference activation and tracking differences back to input features.</p>
<h3> Its role in Neural Network Interpretability: </h3>
<ul>
    <li> Tracks both positive and negative contributions of features.</li>
    <li> More accurate than simple gradient methods in certain cases (e.g., ReLU saturation).</li>
</ul>
<h3> Resources: </h3>
<ol>
    <li><a href="https://arxiv.org/abs/1704.02685" target="_blank">DeepLIFT Paper</a></li>
    <li><a href="https://github.com/kundajelab/deeplift" target="_blank">DeepLIFT GitHub</a></li>
</ol>
</details>

##### Parameters:
- model: A trained deep learning model (TensorFlow/Keras or PyTorch)
- inputs: Input data for which to compute feature importance (numpy array or tensor)
- baseline: Baseline input for DeepLIFT comparisons (e.g., zeros or dataset mean; if None, uses zeros)
- target_label: Optional, specific output neuron/class index for multi-output models
- framework: Either 'tensorflow' or 'pytorch' depending on model type
- show_plot: Whether to plot feature importances
- plot_size: Tuple indicating plot size (width, height)
- feature_names: List of feature names (optional; otherwise indices will be used)
- random_state: Random seed for reproducibility

##### Returns:
- DataFrame of feature-level importance scores
- Displays a bar plot of feature importances if show_plot=True

In [4]:
!pip install captum



In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import torch

from captum.attr import DeepLift as DL_Pytorch
import shap  # For TensorFlow DeepLIFT implementation

def deep_lift_feature_importance(model,
                                  inputs,
                                  baseline=None,
                                  target_label=None,
                                  framework='tensorflow',
                                  show_plot=True,
                                  plot_size=(12, 6),
                                  feature_names=None,
                                  random_state=42):
    """Compute signed mean DeepLIFT attributions per feature.

    With ``framework='pytorch'`` the attributions come from Captum's
    ``DeepLift``; any other value uses ``shap.DeepExplainer`` with the
    baseline as background data. Attributions are averaged over samples (and
    over any axes before the last one) without taking absolute values, so
    positive and negative contributions can cancel.

    Args:
        model: Trained model matching ``framework``.
        inputs: Samples to explain (array, tensor or DataFrame).
        baseline: Reference input(s); ``None`` means zeros. A single reference
            row is accepted on the TensorFlow path.
        target_label: Output index to explain. On the SHAP path ``None``
            selects output 0; on the PyTorch path it is passed to Captum
            unchanged.
        framework: ``'pytorch'`` for Captum, anything else for SHAP.
        show_plot: Draw a horizontal bar chart of the scores.
        plot_size: Figure size ``(width, height)``.
        feature_names: Names for the last input axis. Defaults to the
            DataFrame columns of ``inputs`` or ``Feature_<i>``.
        random_state: Seed applied to NumPy and the selected framework.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance`` sorted by
        descending importance.
    """
    np.random.seed(random_state)
    if framework == 'pytorch':
        torch.manual_seed(random_state)
    else:
        tf.random.set_seed(random_state)

    if isinstance(inputs, pd.DataFrame):
        if feature_names is None:
            feature_names = [str(col) for col in inputs.columns]
        inputs = inputs.to_numpy()
    if isinstance(baseline, pd.DataFrame):
        baseline = baseline.to_numpy()

    if framework == 'pytorch':
        model.eval()
        if isinstance(inputs, torch.Tensor):
            inputs_tensor = inputs
        else:
            inputs_tensor = torch.tensor(np.asarray(inputs), dtype=torch.float32)

        # Match the baseline's dtype and device to the inputs.
        if baseline is None:
            baseline_tensor = torch.zeros_like(inputs_tensor)
        elif isinstance(baseline, torch.Tensor):
            baseline_tensor = baseline.to(dtype=inputs_tensor.dtype,
                                          device=inputs_tensor.device)
        else:
            baseline_tensor = torch.tensor(np.asarray(baseline),
                                           dtype=inputs_tensor.dtype,
                                           device=inputs_tensor.device)

        attributions = DL_Pytorch(model).attribute(inputs_tensor,
                                                   baselines=baseline_tensor,
                                                   target=target_label)
        attribution_values = attributions.detach().cpu().numpy()
    else:  # TensorFlow via SHAP's DeepExplainer
        inputs_array = np.asarray(inputs)
        if baseline is None:
            # One zero row gives the same expectation as a full zero matrix
            # and is cheaper as background data.
            background = np.zeros((1,) + inputs_array.shape[1:],
                                  dtype=inputs_array.dtype)
        else:
            background = np.asarray(baseline)
            if background.ndim == inputs_array.ndim - 1:
                background = background[np.newaxis]

        explainer = shap.DeepExplainer(model, background)
        shap_values = explainer.shap_values(inputs_array)

        target_index = 0 if target_label is None else target_label
        if isinstance(shap_values, list):
            attribution_values = np.asarray(shap_values[target_index])
        else:
            attribution_values = np.asarray(shap_values)
            # Some SHAP versions return one array with the model outputs on
            # a trailing axis instead of a list; pick the output there.
            if attribution_values.ndim == inputs_array.ndim + 1:
                attribution_values = attribution_values[..., target_index]

    # Collapse the sample axis and any leading axes, keeping the last axis.
    importance_scores = attribution_values.reshape(
        -1, attribution_values.shape[-1]).mean(axis=0)

    if feature_names is None:
        feature_names = [f'Feature_{i}' for i in range(importance_scores.shape[-1])]

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance_scores
    }).sort_values('Importance', ascending=False)

    if show_plot:
        plt.figure(figsize=plot_size)
        # Ascending order puts the largest bar at the top of a barh chart.
        plot_df = importance_df.sort_values('Importance', ascending=True)
        colors = plt.cm.inferno(np.linspace(0.3, 1, len(plot_df)))

        bars = plt.barh(plot_df['Feature'],
                        plot_df['Importance'],
                        color=colors,
                        alpha=0.9)

        plt.xlabel('Attribution Score', fontsize=12)
        plt.title('DeepLIFT Feature Importance', fontsize=14, pad=20)
        plt.grid(axis='x', linestyle='--', alpha=0.3)

        for bar in bars:
            width = bar.get_width()
            # Scores are signed: label negative bars on their left end.
            is_negative = width < 0
            plt.text(width - 0.0005 if is_negative else width + 0.0005,
                     bar.get_y() + bar.get_height() / 2,
                     f'{width:.4f}',
                     va='center',
                     ha='right' if is_negative else 'left',
                     fontsize=8)

        method_text = (
            f"Method: DeepLIFT\n"
            f"Framework: {framework.capitalize()}"
        )
        plt.annotate(method_text,
                     xy=(0.98, 0.02),
                     xycoords='axes fraction',
                     ha='right',
                     va='bottom',
                     fontsize=9,
                     bbox=dict(boxstyle='round', alpha=0.1))

        plt.tight_layout()
        plt.show()

    return importance_df

----------

## 3.6.5 DeepSHAP

<details>
<summary style="cursor: pointer">
<h2> { Understanding DeepSHAP } </h2>
</summary>
<h3> What is DeepSHAP? </h3>
<p> DeepSHAP is a SHAP variant for deep learning models. It combines DeepLIFT and SHAP theory to produce SHAP values for neural networks.</p>
<h3> Its role in Deep Learning Feature Attribution: </h3>
<ul>
    <li> Captures nonlinear interactions and baseline references.</li>
    <li> Suitable for DNNs, works via backward recursive rules.</li>
</ul>
<h3> Resources: </h3>
<ol>
    <li><a href="https://shap.readthedocs.io/en/latest/generated/shap.DeepExplainer.html" target="_blank">SHAP DeepExplainer Docs</a></li>
    <li><a href="https://github.com/slundberg/shap" target="_blank">SHAP GitHub</a></li>
</ol>
</details>

##### Parameters:
- model: A trained deep learning model (TensorFlow/Keras)
- inputs: Input data to compute feature importance (numpy array or tensor)
- background_data: Background dataset for baseline/reference distribution (e.g., a random sample of training data)
- target_label: Optional, specific output neuron/class index for multi-output models
- show_plot: Whether to plot feature importances
- plot_size: Tuple indicating plot size (width, height)
- feature_names: List of feature names (optional; otherwise uses indices)
- random_state: Random seed for reproducibility

##### Returns:
- DataFrame of feature-level importance scores
- Displays a bar plot of feature importances if show_plot=True

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import shap

def deepshap_feature_importance(model,
                                 inputs,
                                 background_data,
                                 target_label=None,
                                 show_plot=True,
                                 plot_size=(12, 6),
                                 feature_names=None,
                                 random_state=42):
    """Rank features by mean absolute DeepSHAP value.

    SHAP values are computed with ``shap.DeepExplainer`` for one model
    output, converted to absolute values and averaged over samples (and over
    any axes before the last one).

    Args:
        model: Trained TensorFlow/Keras model.
        inputs: Samples to explain (array-like or DataFrame).
        background_data: Reference samples passed to ``DeepExplainer``.
        target_label: Output index to explain; ``None`` selects output 0.
        show_plot: Draw a horizontal bar chart of the scores.
        plot_size: Figure size ``(width, height)``.
        feature_names: Names for the last input axis. Defaults to the
            DataFrame columns of ``inputs`` or ``Feature_<i>``.
        random_state: Seed applied to the global NumPy and TensorFlow RNGs.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance`` sorted by
        descending importance.
    """
    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    if feature_names is None and isinstance(inputs, pd.DataFrame):
        # Keep the column names that the array conversion would drop.
        feature_names = [str(col) for col in inputs.columns]

    inputs = np.asarray(inputs)
    background_data = np.asarray(background_data)

    explainer = shap.DeepExplainer(model, background_data)
    shap_values = explainer.shap_values(inputs)

    target_index = 0 if target_label is None else target_label
    if isinstance(shap_values, list):
        shap_values = np.asarray(shap_values[target_index])
    else:
        shap_values = np.asarray(shap_values)
        # Some SHAP versions return one array with the model outputs on a
        # trailing axis instead of a list; pick the output there.
        if shap_values.ndim == inputs.ndim + 1:
            shap_values = shap_values[..., target_index]

    # Collapse the sample axis and any leading axes, keeping the last axis.
    importance_scores = np.abs(shap_values).reshape(
        -1, shap_values.shape[-1]).mean(axis=0)

    if feature_names is None:
        feature_names = [f'Feature_{i}' for i in range(importance_scores.shape[-1])]

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance_scores
    }).sort_values('Importance', ascending=False)

    if show_plot:
        plt.figure(figsize=plot_size)
        # Ascending order puts the largest bar at the top of a barh chart.
        plot_df = importance_df.sort_values('Importance', ascending=True)
        colors = plt.cm.plasma(np.linspace(0.3, 1, len(plot_df)))

        bars = plt.barh(plot_df['Feature'],
                        plot_df['Importance'],
                        color=colors,
                        alpha=0.9)

        plt.xlabel('Mean |SHAP Value|', fontsize=12)
        plt.title('DeepSHAP Feature Importance', fontsize=14, pad=20)
        plt.grid(axis='x', linestyle='--', alpha=0.3)

        for bar in bars:
            width = bar.get_width()
            plt.text(width + 0.0005,
                     bar.get_y() + bar.get_height() / 2,
                     f'{width:.4f}',
                     va='center',
                     fontsize=8)

        method_text = (
            f"Method: DeepSHAP\n"
            f"Model: Deep Neural Network"
        )
        plt.annotate(method_text,
                     xy=(0.98, 0.02),
                     xycoords='axes fraction',
                     ha='right',
                     va='bottom',
                     fontsize=9,
                     bbox=dict(boxstyle='round', alpha=0.1))

        plt.tight_layout()
        plt.show()

    return importance_df

----------

## 3.6.6 Layer-wise Relevance Propagation (LRP)

<details>
<summary style="cursor: pointer">
<h2> { Understanding Layer-wise Relevance Propagation (LRP) } </h2>
</summary>
<h3> What is LRP? </h3>
<p> LRP is a technique to explain predictions by propagating relevance scores backward from the output to the input features.</p>
<h3> Its role in Model Interpretability: </h3>
<ul>
    <li> Particularly useful for visualizing decisions in image classification.</li>
    <li> It redistributes the prediction score layer by layer in a conservative way.</li>
</ul>
<h3> Resources: </h3>
<ol>
    <li><a href="https://www.sciencedirect.com/science/article/pii/S0031320320303703" target="_blank">Comprehensive LRP Review</a></li>
    <li><a href="http://heatmapping.org/tutorial/" target="_blank">LRP Tutorial Site</a></li>
</ol>
</details>

##### Parameters:
- model: A trained TensorFlow/Keras neural network supported by the `innvestigate` package (the implementation below does not handle PyTorch models)
- inputs: Input data to compute feature importance (numpy array or tensor)
- target_label: Optional, specific output neuron/class index for relevance focusing
- show_plot: Whether to plot feature relevance scores
- plot_size: Tuple indicating plot size (width, height)
- feature_names: List of feature names (optional; otherwise uses indices)
- random_state: Random seed for reproducibility

##### Returns:
- DataFrame of feature-level relevance scores
- Displays a bar plot of feature relevance if show_plot=True

In [2]:
!pip install innvestigate

Collecting innvestigate
  Downloading innvestigate-2.1.2-py3-none-any.whl (66 kB)
     ---------------------------------------- 66.8/66.8 kB ? eta 0:00:00
Installing collected packages: innvestigate
Successfully installed innvestigate-2.1.2




In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

try:
    import innvestigate
except ImportError:
    raise ImportError("Please install 'innvestigate' package: pip install innvestigate")

def lrp_feature_importance(model,
                            inputs,
                            target_label=None,
                            show_plot=True,
                            plot_size=(12, 6),
                            feature_names=None,
                            random_state=42):
    """Rank features by mean absolute LRP relevance (iNNvestigate ``lrp.z``).

    Relevance is computed per sample, converted to absolute values and
    averaged over samples (and over any axes before the last one).

    Args:
        model: Trained Keras model supported by iNNvestigate.
            NOTE(review): iNNvestigate analyzers are usually applied to a
            model without a final softmax; confirm for the model in use.
        inputs: Samples to explain (array-like or DataFrame).
        target_label: Output neuron to explain. ``None`` lets the analyzer
            use its default selection.
        show_plot: Draw a horizontal bar chart of the scores.
        plot_size: Figure size ``(width, height)``.
        feature_names: Names for the last input axis. Defaults to the
            DataFrame columns of ``inputs`` or ``Feature_<i>``.
        random_state: Seed applied to the global NumPy and TensorFlow RNGs.

    Returns:
        DataFrame with columns ``Feature`` and ``Relevance`` sorted by
        descending relevance.
    """
    np.random.seed(random_state)
    tf.random.set_seed(random_state)

    if feature_names is None and isinstance(inputs, pd.DataFrame):
        # Keep the column names that the array conversion would drop.
        feature_names = [str(col) for col in inputs.columns]

    inputs = np.asarray(inputs)

    if target_label is not None:
        # iNNvestigate only accepts `neuron_selection` when the analyzer was
        # created in "index" mode.
        analyzer = innvestigate.create_analyzer("lrp.z", model,
                                                neuron_selection_mode="index")
        analysis = analyzer.analyze(inputs, neuron_selection=target_label)
    else:
        analyzer = innvestigate.create_analyzer("lrp.z", model)
        analysis = analyzer.analyze(inputs)

    analysis = np.asarray(analysis)
    # Collapse the sample axis and any leading axes, keeping the last axis.
    relevance_scores = np.abs(analysis).reshape(-1, analysis.shape[-1]).mean(axis=0)

    if feature_names is None:
        feature_names = [f'Feature_{i}' for i in range(relevance_scores.shape[-1])]

    relevance_df = pd.DataFrame({
        'Feature': feature_names,
        'Relevance': relevance_scores
    }).sort_values('Relevance', ascending=False)

    if show_plot:
        plt.figure(figsize=plot_size)
        # Ascending order puts the largest bar at the top of a barh chart.
        plot_df = relevance_df.sort_values('Relevance', ascending=True)
        colors = plt.cm.magma(np.linspace(0.3, 1, len(plot_df)))

        bars = plt.barh(plot_df['Feature'],
                        plot_df['Relevance'],
                        color=colors,
                        alpha=0.9)

        plt.xlabel('Mean Relevance Score', fontsize=12)
        plt.title('Layer-wise Relevance Propagation (LRP) Feature Importance', fontsize=14, pad=20)
        plt.grid(axis='x', linestyle='--', alpha=0.3)

        for bar in bars:
            width = bar.get_width()
            plt.text(width + 0.0005,
                     bar.get_y() + bar.get_height() / 2,
                     f'{width:.4f}',
                     va='center',
                     fontsize=8)

        method_text = (
            f"Method: LRP\n"
            f"Model: Deep Neural Network"
        )
        plt.annotate(method_text,
                     xy=(0.98, 0.02),
                     xycoords='axes fraction',
                     ha='right',
                     va='bottom',
                     fontsize=9,
                     bbox=dict(boxstyle='round', alpha=0.1))

        plt.tight_layout()
        plt.show()

    return relevance_df

----------