# Advanced Explainable AI (XAI) Analysis - 2025 Edition

## Objective
Beyond basic feature importance (SHAP/LIME), this notebook implements cutting-edge XAI techniques to answer deeper questions:
1.  **Counterfactuals (DiCE)**: "How do I *change* the outcome?"
2.  **Anchors (Alibi)**: "What are the *sufficient conditions* for this prediction?"
3.  **Fairness (Fairlearn)**: "Is the model *biased* against specific groups?"
4.  **Concept Bottlenecks**: "Which high-level *concepts* (e.g., Service, Taste) drive the sentiment?"

In [None]:
# Install the required libraries if they are missing
# !pip install dice-ml alibi fairlearn scikit-learn pandas numpy matplotlib seaborn sentence-transformers

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

## 1. Setup & Re-Training

In [None]:
# Load data from the backup created in the previous step; fall back to the
# original training file when the backup is not on disk.
try:
    df = pd.read_csv('train_data_with_embeddings_BACKUP.csv')
    print("Loaded data from backup.")
except FileNotFoundError:
    print("Backup file not found. Loading original 'train_data.csv'...")
    df = pd.read_csv('train_data.csv')

# --- DATA RECOVERY & VALIDATION ---
# Check if 'embedding' or 'Sentiment' are missing and regenerate if needed

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast


def _parse_embedding(text):
    """Convert a CSV-serialised embedding string into a 1-D float array.

    Two textual layouts are accepted:
      * "[0.1, 0.2, ...]"  - a Python list literal (comma separated);
      * "[0.1 0.2 ...]"    - whitespace separated, possibly spanning several
        lines, which is what a NumPy array turns into when a DataFrame holding
        arrays is written with ``to_csv``. ``ast.literal_eval`` raises on this
        form, so it is tokenised by hand instead.

    Raises:
        ValueError: if a token cannot be interpreted as a float.
    """
    try:
        return np.asarray(ast.literal_eval(text), dtype=float)
    except (ValueError, SyntaxError):
        tokens = text.strip().strip('[]').replace(',', ' ').split()
        return np.array(tokens, dtype=float)


# 1. Ensure Embeddings exist
if 'embedding' not in df.columns:
    print("'embedding' column missing. Generating embeddings now (this may take a minute)...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # list(...) splits the 2-D encode() result into one vector per row
    df['embedding'] = list(model.encode(df['review'].tolist(), show_progress_bar=True))
elif isinstance(df['embedding'].iloc[0], str):
    # Loaded from CSV: every embedding comes back as text and must be parsed
    print("Parsing embedding strings to lists...")
    df['embedding'] = df['embedding'].apply(_parse_embedding)

# 2. Ensure Sentiment exists
if 'Sentiment' not in df.columns:
    print("'Sentiment' column missing. Deriving sentiment from embeddings...")
    # Reuse the encoder created above if there is one; otherwise load it now
    if 'model' not in locals():
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Reference sentences whose embeddings act as positive / negative poles
    pos_anchor = model.encode(["Positive restaurant experience, delicious food, great service, loved it"])
    neg_anchor = model.encode(["Negative restaurant experience, bad food, terrible service, not recommended"])

    def get_sentiment(emb):
        """Return 1 if `emb` is more cosine-similar to the positive anchor
        than to the negative anchor, otherwise 0."""
        # cosine_similarity expects 2-D input, so reshape to a single row
        emb = np.array(emb).reshape(1, -1)
        pos_sim = cosine_similarity(emb, pos_anchor)[0][0]
        neg_sim = cosine_similarity(emb, neg_anchor)[0][0]
        return 1 if pos_sim > neg_sim else 0

    df['Sentiment'] = df['embedding'].apply(get_sentiment)
    print("Sentiment derived successfully.")

print("Data Validation Complete.")
# ----------------------------------

# Preprocessing: integer-encode the categorical columns unless the loaded
# file already carries the encoded versions.
if 'Gender_Code' not in df.columns:
    le_gender = LabelEncoder()
    df['Gender_Code'] = le_gender.fit_transform(df['gender'])

if 'Meal_Code' not in df.columns:
    le_meal = LabelEncoder()
    df['Meal_Code'] = le_meal.fit_transform(df['meal_category'])

feature_names = ['age', 'Gender_Code', 'Meal_Code']
target_name = 'Sentiment'

X = df[feature_names]
y = df[target_name]

# Fixed random_state keeps the split and the forest the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print(f"Model Accuracy: {clf.score(X_test, y_test):.2f}")

## 2. Counterfactual Explanations (DiCE)
Generates "what-if" examples: alternative feature values that would flip the model's prediction.

In [None]:
import dice_ml
from dice_ml.utils import helpers  # helper functions

# DiCE requires its own data object built from a frame that holds the
# features together with the outcome column.
# Only 'age' is declared continuous; 'Gender_Code' and 'Meal_Code' are left
# out of continuous_features so that DiCE handles the integer codes as
# discrete categories (NOTE(review): confirm against the installed DiCE version).
df_dice = df[feature_names + [target_name]].copy()

d = dice_ml.Data(dataframe=df_dice,
                 continuous_features=['age'],
                 outcome_name=target_name)

# Using 'sklearn' backend for RandomForest
m = dice_ml.Model(model=clf, backend="sklearn")

# Initialize DiCE
exp = dice_ml.Dice(d, m, method="random")

# Query: Show me how to change a Negative prediction (0) to Positive (1).
# The instance must be selected by the MODEL'S prediction, not by the true
# label: desired_class="opposite" flips whatever the model currently predicts,
# so a misclassified row (true 0, predicted 1) would yield counterfactuals
# pointing in the wrong direction.
test_predictions = clf.predict(X_test)
negative_instances = X_test[test_predictions == 0]
if len(negative_instances) > 0:
    # Double brackets keep the query as a one-row DataFrame
    query_instance = negative_instances.iloc[[0]]

    print("Query Instance (Negative Prediction):")
    print(query_instance)

    # Generate Counterfactuals
    # Generating 3 examples with 'opposite' outcome
    dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=3, desired_class="opposite")

    # Visualize
    dice_exp.visualize_as_dataframe(show_only_changes=True)
else:
    print("No negative instances found in test set to explain.")

## 3. Anchor Explanations (Alibi)
Finds high-precision rules (sufficient conditions) for a prediction.

In [None]:
from alibi.explainers import AnchorTabular


def _category_names(code_col, label_col):
    """Return display names for an integer-coded column, ordered by code.

    The list has one entry per code from 0 to the column maximum. When the
    original text column `label_col` is present in `df`, each code is named
    after the first label observed with it; otherwise the code itself (as a
    string) is used, so the list can never be shorter than the code range.
    """
    n_codes = int(df[code_col].max()) + 1
    names = [str(code) for code in range(n_codes)]
    if label_col in df.columns:
        for code, label in df.groupby(code_col)[label_col].first().items():
            names[int(code)] = str(label)
    return names


# Initialize Anchor explainer
predict_fn = lambda x: clf.predict(x)

# Age is continuous (roughly), Gender/Meal are categorical integer codes.
# Alibi must be told which column indices are categorical and how to name
# each code. The map belongs in the AnchorTabular constructor; handing it to
# fit() has no effect there, which leaves the coded columns treated as
# numeric features.
categorical_names = {
    feature_names.index('Gender_Code'): _category_names('Gender_Code', 'gender'),
    feature_names.index('Meal_Code'): _category_names('Meal_Code', 'meal_category'),
}
explainer = AnchorTabular(predict_fn, feature_names, categorical_names=categorical_names)
explainer.fit(X_train.values)

# Explain an instance
idx = 0
explanation = explainer.explain(X_test.iloc[idx].values, threshold=0.95)

print(f"Anchor Explanation for Instance {idx}:")
print(f"Prediction: {predict_fn([X_test.iloc[idx].values])[0]}")
print(f"Anchor Rule: {explanation.anchor}")
print(f"Precision: {explanation.precision:.2f}")
print(f"Coverage: {explanation.coverage:.2f}")

## 4. Fairness & Bias Auditing (Fairlearn)
Checks if the model treats demographic groups equally.

In [None]:
from fairlearn.metrics import MetricFrame, selection_rate, count
# accuracy_score is used in the metrics dict below and was never imported.
from sklearn.metrics import accuracy_score
# The previous `from fairlearn.plotting import plot_box_plots` line was dropped:
# the name was never used in this cell, and a failing import would stop the
# cell before any metric is computed.

# Define sensitive feature (Gender)
sensitive_feature = X_test['Gender_Code']

# Metrics evaluated separately for every value of the sensitive feature
metrics = {
    'accuracy': accuracy_score,
    'selection_rate': selection_rate, # How often model predicts 'Positive'
    'count': count
}

metric_frame = MetricFrame(
    metrics=metrics,
    y_true=y_test,
    y_pred=clf.predict(X_test),
    sensitive_features=sensitive_feature
)

print("Fairness Metrics by Group (0/1 Gender):")
print(metric_frame.by_group)

# Visualizing Selection Rate (Demographic Parity)
metric_frame.by_group['selection_rate'].plot(kind='bar', title="Selection Rate by Gender")
plt.ylabel("Fraction Predicted Positive")
plt.show()

## 5. Concept Bottleneck Model (Post-hoc Concept Discovery)
This section clusters the review embeddings to discover high-level "concepts", then fits a linear model to measure each concept's association with Sentiment.

In [None]:
# Step 1 - Embedding matrix
# Embeddings read back from a CSV arrive as text; the setup cell is expected
# to have converted them to numeric sequences already, so the column is
# stacked directly into a 2-D array here.
import ast

X_emb = np.stack(df['embedding'].values)

# Step 2 - Group the embeddings into `n_concepts` clusters ("concepts")
n_concepts = 5
kmeans = KMeans(n_clusters=n_concepts, random_state=42)
df['Concept_Cluster'] = concept_labels = kmeans.fit_predict(X_emb)

# Step 3 - Show a few reviews from every cluster to help name the concept
print("--- Discovered Concepts ---")
for i in range(n_concepts):
    in_cluster = df['Concept_Cluster'] == i
    print(f"\nConcept {i} Sample Reviews:")
    print(df.loc[in_cluster, 'review'].head(3).values)

# Step 4 - Concept importance: logistic regression from the one-hot encoded
# cluster membership to the Sentiment label
from sklearn.linear_model import LogisticRegression

concepts_oh = pd.get_dummies(df['Concept_Cluster'], prefix='Concept')
concept_model = LogisticRegression().fit(concepts_oh, df['Sentiment'])

print("\n--- Concept Importance (Impact on Sentiment) ---")
coefs = concept_model.coef_[0]
for i, coef in enumerate(coefs):
    # The sign of the coefficient gives the direction of the association
    if coef > 0:
        impact = "Positive"
    else:
        impact = "Negative"
    print(f"Concept {i}: Weight {coef:.2f} ({impact})")

# Bar chart with one bar per concept cluster
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(range(n_concepts), coefs)
ax.set_title("Impact of Discovered Concepts on Sentiment")
ax.set_xlabel("Concept Cluster")
ax.set_ylabel("Regression Coefficient")
plt.show()