<a href="https://colab.research.google.com/github/2303a52252-collab/Exp-AI/blob/main/LabAss_2(2252)_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================
# SHAP Assignment - Retail & E-commerce
# Dataset: Online Shoppers Purchasing Intention (UCI)
# ==============================

# Install if missing:
# pip install pandas numpy scikit-learn shap matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# -------------------------
# Load Dataset
# -------------------------
df = pd.read_csv("/content/online_shoppers_intention.csv")
print("Shape:", df.shape)
print(df.head())

# -------------------------
# Preprocessing
# -------------------------
target = "Revenue"
df[target] = df[target].astype(int)  # Convert to int
df = df.drop_duplicates()            # Drop duplicates

print("Missing values:\n", df.isnull().sum())

# Identify categorical & numeric columns
categorical_cols = df.select_dtypes(include=["object", "bool"]).columns.tolist()
categorical_cols = [c for c in categorical_cols if c != target]
numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != target]

print("Numeric Features:", numeric_cols)
print("Categorical Features:", categorical_cols)

# -------------------------
# Train/Test Split
# -------------------------
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------
# Preprocessing Pipeline
# -------------------------
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# -------------------------
# Model Pipeline
# -------------------------
clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=200, random_state=42))
])

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# -------------------------
# Evaluation
# -------------------------
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_proba)

print("\nEvaluation Metrics:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"ROC-AUC:   {roc:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# -------------------------
# SHAP Analysis
# -------------------------
# Extract trained RF and transformed data
rf_model = clf.named_steps["model"]
X_train_transformed = clf.named_steps["preprocessor"].transform(X_train)
X_test_transformed  = clf.named_steps["preprocessor"].transform(X_test)

# Get feature names from ColumnTransformer
ohe = clf.named_steps["preprocessor"].named_transformers_["cat"]
ohe_features = ohe.get_feature_names_out(categorical_cols)
all_features = numeric_cols + list(ohe_features)

# Convert transformed arrays to DataFrames
X_train_df = pd.DataFrame(
    X_train_transformed.toarray() if hasattr(X_train_transformed, "toarray") else X_train_transformed,
    columns=all_features
)
X_test_df = pd.DataFrame(
    X_test_transformed.toarray() if hasattr(X_test_transformed, "toarray") else X_test_transformed,
    columns=all_features
)

# -------------------------
# SHAP Analysis (fixed)
# -------------------------
# Build explainer with background = train set
explainer = shap.Explainer(rf_model, X_train_df)

# Get SHAP values (disable additivity check to avoid floating errors)
shap_values = explainer(X_test_df, check_additivity=False)

# -------------------------
# Global explanation
# -------------------------
shap.summary_plot(shap_values, X_test_df, plot_type="bar")
shap.summary_plot(shap_values, X_test_df)



FileNotFoundError: [Errno 2] No such file or directory: '/content/online_shoppers_intention.csv'