In [6]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from torch.optim import AdamW               # <-- use PyTorch's AdamW

# Read both tab-separated splits into DataFrames
train_data, test_data = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in ("train.tsv", "test.tsv")
)

# Show the leading rows of the training split (rendered as the cell's output)
print("Train sample:")
train_data.head()

Train sample:


Unnamed: 0,sentence1,sentence2,gold_label
0,A man is riding a red motorcycle with a small ...,A man rides his motorcyle with his won.,neutral
1,A man is riding a motorcycle with a small chil...,A man rides his motorcyle with his won.,entailment
2,A man is riding a red motorcycle with a small ...,A man rides his car with his son.,contradiction
3,A man is riding a blue truck with a small chil...,A man rides his motorcyle with his won.,contradiction
4,A man is riding a red motorcycle with a small ...,A man rides his motorcycle with a child.,entailment


In [7]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see on this machine.
# NOTE(review): the rest of the notebook runs on PyTorch, which has its own CUDA check.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [None]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


In [8]:
# Tokenizer.
# Use DistilBertTokenizerFast: it is the tokenizer class imported at the top of the
# notebook (plain `DistilBertTokenizer` is never imported, so referencing it raises
# NameError on a fresh kernel).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the sentence pairs; each split is padded to its own longest sequence
train_encodings = tokenizer(list(train_data['sentence1']), list(train_data['sentence2']), truncation=True, padding=True)
test_encodings = tokenizer(list(test_data['sentence1']), list(test_data['sentence2']), truncation=True, padding=True)

# Convert the token ids into tensors.
# NOTE(review): only 'input_ids' is kept; the encodings' attention mask is not carried
# into the datasets below, so the loaders yield (input_ids, label) pairs only.
train_inputs = torch.tensor(train_encodings['input_ids'])
test_inputs = torch.tensor(test_encodings['input_ids'])

# Encode the string labels as integer class ids. The encoder is fitted on the training
# labels only and then reused for the test labels so both splits share one id mapping.
# LabelEncoder numbers classes in sorted order, so for the labels shown in the data
# preview this is presumably contradiction=0, entailment=1, neutral=2 - verify with
# `label_encoder.classes_`.
label_encoder = LabelEncoder()

train_labels = torch.tensor(label_encoder.fit_transform(train_data['gold_label']))
test_labels = torch.tensor(label_encoder.transform(test_data['gold_label']))

# Create TensorDatasets of (input_ids, label)
train_dataset = TensorDataset(train_inputs, train_labels)
test_dataset = TensorDataset(test_inputs, test_labels)

# DataLoaders for batching; only the training split is shuffled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [9]:
# Token ids of the first encoded training pair (padding included)
print(train_encodings['input_ids'][0])
# list() over a string splits it into characters, so this prints the first
# `sentence1` entry one character at a time rather than as a sentence.
print(list(train_data['sentence1'][0]))

[101, 1037, 2158, 2003, 5559, 1037, 2417, 9055, 2007, 1037, 2235, 2775, 3564, 1999, 2392, 1997, 2032, 1012, 102, 1037, 2158, 12271, 2010, 5013, 5666, 2571, 2007, 2010, 2180, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['A', ' ', 'm', 'a', 'n', ' ', 'i', 's', ' ', 'r', 'i', 'd', 'i', 'n', 'g', ' ', 'a', ' ', 'r', 'e', 'd', ' ', 'm', 'o', 't', 'o', 'r', 'c', 'y', 'c', 'l', 'e', ' ', 'w', 'i', 't', 'h', ' ', 'a', ' ', 's', 'm', 'a', 'l', 'l', ' ', 'c', 'h', 'i', 'l', 'd', ' ', 's', 'i', 't', 't', 'i', 'n', 'g', ' ', 'i', 'n', ' ', 'f', 'r', 'o', 'n', 't', ' ', 'o', 'f', ' ', 'h', 'i', 'm', '.']


In [10]:
# Sanity check: the number of encoded sequences should equal the number of training rows
print(len(train_encodings['input_ids']))
print(len(list(train_data['sentence1'])))

8330
8330


In [12]:

# Load model: pretrained DistilBERT with a sequence-classification head sized for 3 labels
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
# AdamW over all model parameters with a fixed learning rate of 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)

# Use CUDA when PyTorch reports it as available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device; as the cell's last expression this also displays the module tree
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
#  Fine-tune the model
# Runs `epochs` full passes over `train_loader` and prints the mean batch loss per epoch.
epochs = 15
for epoch in range(epochs):
    model.train()  # training mode for every epoch
    total_train_loss = 0  # sum of per-batch losses for this epoch
    for batch in train_loader:
        # Move the batch tensors to the training device; each batch is (input_ids, labels)
        batch = [item.to(device) for item in batch]
        input_ids, labels = batch

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss alongside its other outputs.
        # NOTE(review): only input_ids are passed, so no attention mask is supplied here,
        # whereas the tokenizer-based helpers later in the notebook call the model with
        # the tokenizer's full output. Confirm that this difference is intended.
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backpropagate and apply the optimizer update
        loss.backward() 
        optimizer.step()

    # Mean loss over the batches of this epoch
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}")

Epoch 1, Training Loss: 0.8818532376051407
Epoch 2, Training Loss: 0.7591576356004616
Epoch 3, Training Loss: 0.6570768331440305
Epoch 4, Training Loss: 0.555697098860585
Epoch 5, Training Loss: 0.4585243695470971
Epoch 6, Training Loss: 0.3686400988351933


In [None]:
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# Evaluation helper: run the model over a loader and score its predictions
def evaluate_model(model, tokenizer, test_loader, device):
    """Compute accuracy and per-class F1 of `model` over `test_loader`.

    Args:
        model: Classifier called as `model(input_ids=...)`; its output must expose `.logits`.
        tokenizer: Accepted for interface compatibility; not used by this function.
        test_loader: Iterable of `(input_ids, labels)` tensor batches.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple `(accuracy, f1)`: `accuracy` comes from `accuracy_score`, and `f1` is the
        `f1_score(..., average=None)` array for label ids 0, 1 and 2, in that order.

    NOTE(review): which class name each id stands for depends on how the labels in
    `test_loader` were encoded. If they came from a LabelEncoder fitted on the strings
    "contradiction" / "entailment" / "neutral", the ids follow that sorted order - confirm.
    """
    model.eval()  # evaluation mode
    predicted, expected = [], []

    with torch.no_grad():  # no gradients are needed for scoring
        for batch in tqdm(test_loader):
            input_ids, labels = (tensor.to(device) for tensor in batch)
            logits = model(input_ids=input_ids).logits
            batch_preds = logits.argmax(dim=1)  # id of the highest-scoring class per row

            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(labels.cpu().numpy())

    # Overall accuracy, then one F1 value per label id
    accuracy = accuracy_score(expected, predicted)
    f1 = f1_score(expected, predicted, average=None, labels=[0, 1, 2])

    return accuracy, f1

In [None]:
accuracy, f1 = evaluate_model(model, tokenizer, test_loader, device)

# Output the results.
# `f1` is ordered by label id (0, 1, 2). The labels were encoded with LabelEncoder,
# which numbers classes in sorted order, so the ids map to
# contradiction, entailment, neutral - the same order used by the LIME class names
# and `label_mapping` elsewhere in this notebook.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"F1 Scores per Label (Contradiction, Entailment, Neutral): {f1}")

In [None]:
# Re-print the accuracy computed by the evaluation cell (requires `accuracy` to exist already)
print(f"Accuracy: {accuracy * 100:.2f}%")
## Step 6: Apply XAI Methods

SHAP

In [None]:
import shap

# Load the tokenizer
# tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# SHAP wrapper: explain the classifier's logits with a tokenizer-based text masker
def shap_explainer(texts, model, tokenizer):
    """Compute SHAP explanations of `model`'s logits for `texts`.

    Args:
        texts: Input strings handed to the SHAP explainer.
        model: Classifier exposing `.device` and returning an object with `.logits`.
        tokenizer: Tokenizer used both to encode the texts for the model and as the
            second argument of `shap.Explainer`.

    Returns:
        Whatever the `shap.Explainer` instance returns when called on `texts`.
    """
    def predict_logits(batch_texts):
        # Encode the (possibly perturbed) strings and move every tensor to the model's device
        encoded = tokenizer(list(batch_texts), return_tensors="pt", padding=True, truncation=True)
        on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}
        with torch.no_grad():
            output = model(**on_device)
        # Hand the raw logits back as a NumPy array
        return output.logits.cpu().numpy()

    # Build the explainer around the prediction function and apply it right away
    return shap.Explainer(predict_logits, tokenizer)(texts)

# Example sentences for explanation.
# NOTE(review): these are single sentences, while the model above was fine-tuned on
# (sentence1, sentence2) pairs - confirm that explaining single sentences is intended.
texts = ["The movie was great.", "I did not enjoy the movie."]

# Run the SHAP explainer
shap_values = shap_explainer(texts, model, tokenizer)

# Visualize the SHAP values
shap.initjs()
shap.text_plot(shap_values)


LIME

In [None]:
from lime.lime_text import LimeTextExplainer
import numpy as np

# LIME
def lime_explainer(sentence1, sentence2, model, tokenizer, device):
    """Explain the model's prediction for a sentence pair with LIME and render it.

    `sentence1` is held fixed; `sentence2` is the text LIME perturbs and explains.

    Args:
        sentence1: First sentence of the pair, paired unchanged with every perturbed text.
        sentence2: Second sentence of the pair; the instance passed to LIME.
        model: Classifier returning an object with `.logits`.
        tokenizer: Tokenizer used to encode the sentence pairs.
        device: Device the encoded inputs are moved to.

    Returns:
        None. The explanation is displayed via `show_in_notebook()`.
    """
    def predict_fn(texts):
        # Pair the fixed sentence1 with each perturbed variant of sentence2
        fixed_side = [sentence1 for _ in texts]
        encoded = tokenizer(fixed_side, texts, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            logits = model(**encoded).logits
        # LIME works on class probabilities supplied as a NumPy array
        return logits.softmax(dim=-1).cpu().numpy()

    # Three class names, listed in label-id order
    lime = LimeTextExplainer(class_names=["contradiction", "entailment", "neutral"])

    # Explain sentence2, reporting at most 6 features
    result = lime.explain_instance(sentence2, predict_fn, num_features=6)
    result.show_in_notebook()

# Example usage: explain one hand-written sentence pair
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect if GPU is available
sentence1 = "The movie was good."
sentence2 = "I did not enjoy the movie."

# Relies on `model` and `tokenizer` created in earlier cells; the explanation is rendered inline
lime_explainer(sentence1, sentence2, model, tokenizer, device)


In [None]:

def predict_label(text1, text2, model, tokenizer, device):
    """Return the predicted class id for the pair (`text1`, `text2`).

    Args:
        text1: First text passed to the tokenizer.
        text2: Second text passed to the tokenizer.
        model: Classifier called with the encoded inputs; must return an object with `.logits`.
        tokenizer: Callable that encodes the pair and returns an object supporting `.to(device)`.
        device: Device the encoded inputs are moved to.

    Returns:
        int: Index of the largest logit along dimension 1.
    """
    # Encode the pair as a single example and move it to the target device
    encoded = tokenizer(text1, text2, return_tensors="pt", truncation=True, padding=True).to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(**encoded).logits

    return logits.argmax(dim=1).item()



In [None]:



def replace_with_antonym(sentence):
    """Replace known words in `sentence` with their antonyms.

    Every replacement happens in a single pass over the original sentence and only
    whole words are matched. This matters for two reasons:

    * The dictionary is bidirectional ("good" -> "bad" and "bad" -> "good"). Applying
      the entries one after another would turn "good" into "bad" and then straight
      back into "good", leaving the sentence unchanged.
    * Plain substring replacement would corrupt unrelated words that merely contain a
      key (e.g. "son" inside "person", "old" inside "cold").

    Matching is case-sensitive: only the lowercase forms listed below are replaced.

    Args:
        sentence: Input text.

    Returns:
        str: The text with each listed whole word swapped for its antonym; text without
        any listed word (including the empty string) is returned unchanged.
    """
    import re  # local import keeps this cell self-contained

    # A simplified dictionary of antonyms for demonstration purposes
    antonyms_dict = {
        "good": "bad",
        "great": "poor",
        "enjoyed": "disappointed",
        "bad": "good",
        "poor": "great",
        "disappointed": "enjoyed",
        "like": "dislike",
        "love": "hate",
        "happy": "sad",
        "fast": "slow",
        "strong": "weak",
        "fun": "boring",
        "easy": "hard",
        "son": "daughter",
        "brother": "sister",
        "friend": "foe",
        "car": "bike",
        "dog": "cat",
        "old": "young",
        "hot": "cold",
    }

    # One alternation over all keys, anchored on word boundaries
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in antonyms_dict) + r")\b"
    )

    # Each match is looked up once, so a freshly inserted antonym is never re-replaced
    return pattern.sub(lambda match: antonyms_dict[match.group(0)], sentence)

def counterfactual_explanation(sentence1, sentence2, model, tokenizer, device, label_mapping, actual_label):
    """Look for an antonym substitution that changes the predicted label.

    The antonym-substituted `sentence1` is tried first (with the original `sentence2`).
    Only if the prediction does not change is the antonym-substituted `sentence2` tried
    (with the original `sentence1`). Progress is reported with `print`.

    Args:
        sentence1: First sentence of the pair.
        sentence2: Second sentence of the pair.
        model, tokenizer, device: Forwarded to `predict_label`.
        label_mapping: Mapping from label id to a printable label name.
        actual_label: Label id that modified predictions are compared against.

    Returns:
        Tuple `(sentence1, sentence2, original_label, new_label)` containing the pair
        that was scored. When no flip is found, the original sentences are returned and
        both labels equal `actual_label`.
    """
    # Reference label that the modified predictions are compared against
    original_prediction = actual_label
    print(f"Actual label: {label_mapping[original_prediction]}")

    # First attempt: antonym substitution in sentence1 only
    modified_sentence1 = replace_with_antonym(sentence1)
    modified_prediction1 = predict_label(modified_sentence1, sentence2, model, tokenizer, device)

    if modified_prediction1 == original_prediction:
        # Second attempt: antonym substitution in sentence2 only
        modified_sentence2 = replace_with_antonym(sentence2)
        modified_prediction2 = predict_label(sentence1, modified_sentence2, model, tokenizer, device)

        if modified_prediction2 == original_prediction:
            print("No counterfactual explanation found.")
            return sentence1, sentence2, original_prediction, original_prediction

        print(f"Prediction flipped: {label_mapping[modified_prediction2]} for sentence2: {modified_sentence2}")
        return sentence1, modified_sentence2, original_prediction, modified_prediction2

    print(f"Prediction flipped: {label_mapping[modified_prediction1]} for sentence1: {modified_sentence1}")
    return modified_sentence1, sentence2, original_prediction, modified_prediction1



In [None]:


# Define the label mapping (label id -> printable name); later cells reuse it
label_mapping = {0: "contradiction", 1: "entailment", 2: "neutral"}

# Original sentences (a hand-written pair; the commented lines use a dataset row instead)
sentence1 = "The movie was good."
sentence2 = "The movie is bad"
#sentence1 = train_data['sentence1'][70]
#sentence2 = train_data['sentence2'][70]

# "Actual label" here is the model's own prediction for the unmodified pair, not a gold label
actual_label = predict_label(sentence1, sentence2, model, tokenizer, device)

# Generate counterfactual explanation: tries antonym swaps in sentence1, then in sentence2
modified_sentence1, modified_sentence2, original_pred_label, modified_pred_label = counterfactual_explanation(
    sentence1, sentence2, model, tokenizer, device, label_mapping, actual_label
)

# Summarize the original pair, the returned pair and both labels
print(f"Original Sentences: Sentence1: {sentence1}, Sentence2: {sentence2}")
print(f"Modified Sentences: Sentence1: {modified_sentence1}, Sentence2: {modified_sentence2}")
print(f"Original label: {label_mapping[original_pred_label]}")
print(f"Modified label: {label_mapping[modified_pred_label]}")


In [None]:


# Original sentences (second hand-written pair; reuses `label_mapping` from the previous cell)
sentence1 = "The movie was good."
sentence2 = "I didn't like the movie"


# "Actual label" here is the model's own prediction for the unmodified pair, not a gold label
actual_label = predict_label(sentence1, sentence2, model, tokenizer, device)

# Generate counterfactual explanation: tries antonym swaps in sentence1, then in sentence2
modified_sentence1, modified_sentence2, original_pred_label, modified_pred_label = counterfactual_explanation(
    sentence1, sentence2, model, tokenizer, device, label_mapping, actual_label
)

# Summarize the original pair, the returned pair and both labels
print(f"Original Sentences: Sentence1: {sentence1}, Sentence2: {sentence2}")
print(f"Modified Sentences: Sentence1: {modified_sentence1}, Sentence2: {modified_sentence2}")
print(f"Original label: {label_mapping[original_pred_label]}")
print(f"Modified label: {label_mapping[modified_pred_label]}")


In [None]:
import torch

def permutation_importance(model, tokenizer, data, labels, n_repeats=5, device=None, batch_size=32):
    """Estimate how much accuracy depends on the texts in `data` being aligned with `labels`.

    The model predicts a class for every text once. The labels are then shuffled
    `n_repeats` times, and the drop from the baseline accuracy to the accuracy against
    the shuffled labels is averaged.

    NOTE(review): `data` is tokenized as single texts. If the model was fine-tuned on
    sentence pairs, feeding it one sentence at a time differs from its training input -
    confirm this is the intended experiment.

    Args:
        model: Classifier returning an object with `.logits`.
        tokenizer: Tokenizer called on a list of texts with `return_tensors="pt"`.
        data: Sequence of input texts.
        labels: Integer class ids, one per text (tensor or array-like).
        n_repeats: Number of label shuffles to average over.
        device: Device to run on. Defaults to CUDA when available and CPU otherwise, so
            the function also works on CPU-only machines when `device` is omitted.
        batch_size: Number of texts per forward pass, so large datasets are not pushed
            through the model in a single batch.

    Returns:
        float: Mean of (baseline accuracy - accuracy against shuffled labels);
        NaN when `n_repeats` is 0.

    Raises:
        ValueError: If `data` and `labels` differ in length or `batch_size` < 1.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(data)
    labels_np = torch.as_tensor(labels).cpu().numpy()
    if len(texts) != len(labels_np):
        raise ValueError(
            f"data and labels must have the same length, got {len(texts)} and {len(labels_np)}"
        )

    model.to(device)
    model.eval()  # keep predictions deterministic, as evaluate_model does

    # Baseline predictions, computed batch by batch
    pred_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                return_tensors="pt", padding=True, truncation=True,
            ).to(device)
            logits = model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            ).logits
            pred_chunks.append(logits.argmax(dim=1).cpu())

    baseline_preds = torch.cat(pred_chunks).numpy()
    baseline_accuracy = (baseline_preds == labels_np).mean()

    importances = []
    for _ in range(n_repeats):
        # Shuffling breaks the alignment between predictions and labels
        shuffled_labels = labels_np[torch.randperm(len(labels_np)).numpy()]
        accuracy_with_shuffled = (baseline_preds == shuffled_labels).mean()

        # Importance is the drop in accuracy caused by the shuffle
        importances.append(baseline_accuracy - accuracy_with_shuffled)

    if not importances:
        return float('nan')
    return float(sum(importances) / len(importances))


# Run the label-shuffling importance estimate on the `sentence1` column of the test split
# (no `device` argument is passed, so the function's default is used)
mean_importances = permutation_importance(model, tokenizer, list(test_data["sentence1"]), test_labels, n_repeats=5)
print("Permutation Importance (Mean) for sentence1:", mean_importances)


In [None]:
# Same estimate for the `sentence2` column; this overwrites `mean_importances` from the previous cell
mean_importances = permutation_importance(model, tokenizer, list(test_data["sentence2"]), test_labels, n_repeats=5)
print("Permutation Importance (Mean) for sentence2:", mean_importances)


In [None]:


def permutation_importance_for_words(model, tokenizer, sentence, label, device=None, n_repeats=5):
    """Rank the tokens of `sentence` by the effect of masking each one on the prediction.

    The sentence is classified once as a baseline. Then every token between the first
    and last position (which hold the special tokens for a single sentence) is replaced
    in turn by the tokenizer's mask token and the sentence is classified again. A
    token's importance is the baseline "accuracy" (1.0 if the prediction equals
    `label`, else 0.0) minus the accuracy after masking that token.

    Args:
        model: Classifier returning an object with `.logits`.
        tokenizer: Tokenizer providing `mask_token`, `convert_tokens_to_ids` and `decode`.
        sentence: Text to analyse.
        label: Class id the predictions are compared against.
        device: Device to run on. Defaults to CUDA when available and CPU otherwise.
        n_repeats: Unused - masking is deterministic, so there is nothing to repeat.
            Kept so existing calls that pass it keep working.

    Returns:
        list[tuple[str, float]]: `(decoded_token, importance)` pairs sorted by
        importance, highest first.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    model.eval()  # keep predictions deterministic, as evaluate_model does

    # Step 1: Tokenize the sentence once; the ids are cloned for every masked variant
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Look the mask id up once instead of on every loop iteration
    mask_token_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    def _accuracy(ids):
        # Fraction of predictions equal to `label` (0.0 or 1.0 for a single sentence)
        with torch.no_grad():
            logits = model(input_ids=ids, attention_mask=attention_mask).logits
        return (logits.argmax(dim=1).cpu().numpy() == label).mean()

    baseline_accuracy = _accuracy(input_ids)

    # Step 2: Mask each inner token in turn, skipping the first and last positions
    word_importances = []
    for word_idx in range(1, len(input_ids[0]) - 1):
        modified_input_ids = input_ids.clone()
        modified_input_ids[0][word_idx] = mask_token_id

        # Step 3: Importance is the drop in accuracy caused by masking this token
        importance = baseline_accuracy - _accuracy(modified_input_ids)
        word_importances.append((tokenizer.decode(input_ids[0][word_idx]), importance))

    # Step 4: Most important tokens first
    word_importances.sort(key=lambda x: x[1], reverse=True)
    return word_importances




In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Detect if GPU is available
sentence1 = "The movie was great."
label1 = 2  # label id 2 is "neutral" in the `label_mapping` defined earlier in the notebook

# Score every token of the sentence by masking it and comparing the prediction with `label1`
word_importances = permutation_importance_for_words(model, tokenizer, sentence1, label1, device, n_repeats=5)

# Output the word importance (already sorted, highest first)
for word, importance in word_importances:
    print(f"Word: '{word}', Importance: {importance}")

In [None]:
# Same analysis for a second sentence; reuses `device` from the previous cell
sentence2 = "The movie was good."
label2 = 2  # label id 2 is "neutral" in the `label_mapping` defined earlier in the notebook

word_importances2 = permutation_importance_for_words(model, tokenizer, sentence2, label2, device, n_repeats=5)

# Output the word importance (already sorted, highest first)
for word, importance in word_importances2:
    print(f"Word: '{word}', Importance: {importance}")

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn3

# Create a figure
# This cell draws a hand-made illustrative diagram: all positions, radii and the
# annotated token lists below are hard-coded literals, not values computed by the
# explainer cells above.
fig, ax = plt.subplots(figsize=(12, 12))

# Draw 4 overlapping circles to represent each XAI method
# These positions and radii are approximations of intersections
ax.set_facecolor('black')

# Draw circles representing SHAP, LIME, Counterfactual, Permutation Importance
# (centre coordinates and radius are in data units; the axes are limited to [0, 1] below)
circle_shap = plt.Circle((0.2, 0.8), 0.2, color='blue', alpha=0.5, label="SHAP")
circle_lime = plt.Circle((0.4, 0.8), 0.2, color='orange', alpha=0.5, label="LIME")
circle_counterfactual = plt.Circle((0.6, 0.6), 0.2, color='green', alpha=0.5, label="Counterfactual")
circle_permutation = plt.Circle((0.4, 0.6), 0.2, color='red', alpha=0.5, label="Permutation")

# Add the circles to the plot
ax.add_artist(circle_shap)
ax.add_artist(circle_lime)
ax.add_artist(circle_counterfactual)
ax.add_artist(circle_permutation)

# Annotate with key insights or example features for each method
# (each text uses the same colour as its circle)
ax.text(0.2, 0.85, 'SHAP\nToken Contribution\n["good", "movie", "enjoyed"]', horizontalalignment='center', verticalalignment='center', fontsize=12, color='blue')
ax.text(0.4, 0.85, 'LIME\nToken Perturbation\n["not", "enjoy", "movie"]', horizontalalignment='center', verticalalignment='center', fontsize=12, color='orange')
ax.text(0.6, 0.75, 'Counterfactual\nPrediction Flip\n["didn\'t", "enjoy", "the", "movie"]', horizontalalignment='center', verticalalignment='center', fontsize=12, color='green')
ax.text(0.4, 0.55, 'Permutation\nPerformance Impact\n["great", "performance", "interesting"]', horizontalalignment='center', verticalalignment='center', fontsize=12, color='red')

# Set limits and remove axes for clarity
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xticks([])
ax.set_yticks([])

# Title and display
plt.title("Comparison of XAI Methods for DistilBERT on Sentences NLI Data")
plt.show()
