In [None]:
!pip install shap

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline   
import shap
import os

In [None]:
# 1. Load Dataset
# The CSV location is machine-specific, so it can be overridden through the
# BANK_MARKETING_CSV environment variable; the original path remains the default.
DATA_PATH = os.environ.get(
    "BANK_MARKETING_CSV",
    r"C:\Users\FARHAT\Downloads\bank+marketing\bank-additional\bank-additional\bank-additional-full.csv",
)

# The file is semicolon-delimited.
df = pd.read_csv(DATA_PATH, sep=";")
print("Initial shape:", df.shape)
print("Columns in dataset:", df.columns.tolist())

# Only peek at the target when it exists: a missing 'y' column should surface
# through the explicit ValueError check in the encoding step, not as a bare
# KeyError raised by this diagnostic print.
if "y" in df.columns:
    print("Target column unique values:", df['y'].unique())

In [None]:
# 2. Encode target
if "y" not in df.columns:
    raise ValueError(f"Target column 'y' not found! Available columns: {df.columns.tolist()}")

# Clean and map target values to 1 (yes) / 0 (no).
# '1' and '0' are accepted too, so re-running this cell on an already-encoded
# frame is a no-op. Without them, the integer labels would stringify to
# '1'/'0', map to NaN, and every row would be dropped.
target_map = {'yes': 1, 'no': 0, '1': 1, '0': 0}
encoded_target = df['y'].astype(str).str.strip().str.lower().map(target_map)

# Drop rows with unmapped targets. Working on a fresh copy keeps the integer
# cast below from writing into a view of the previous frame.
df = df.assign(y=encoded_target).dropna(subset=['y']).copy()
df['y'] = df['y'].astype(int)

print(" Encoded target distribution:\n", df['y'].value_counts())


In [None]:
# 3. One-hot encode categorical variables
# drop_first=True omits the first level of each encoded column, so a column
# with k categories yields k-1 indicator columns.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Split the encoded frame into the label vector (y) and the predictor matrix (X)
y = df["y"].astype(int)
X = df.drop(columns="y")

# Sanity-check the shapes and the class balance before modelling
print("Dataset after encoding:", X.shape, y.shape)
print("Target distribution:\n", y.value_counts())

In [None]:
# 4. Train-test split
# Fail fast with a clear message if no labelled rows survived the encoding step.
if y.empty:
    raise ValueError("Target y is empty after encoding. Check dataset!")

# 80/20 split, stratified on the label so both partitions keep the class ratio;
# the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)


In [None]:

# 5. Train Models
# Logistic regression behind a scaling step. with_mean=False means the scaler
# only rescales each feature to unit variance and does not centre it.
log_reg = Pipeline(steps=[
    ('scaler', StandardScaler(with_mean=False)),
    ('model', LogisticRegression(solver="lbfgs", max_iter=2000)),
]).fit(X_train, y_train)

# Random forest on the unscaled features; the seed is fixed for reproducibility.
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train, y_train)

# Hard class predictions on the held-out split, one vector per model
y_pred_lr = log_reg.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [None]:
# 6. Evaluation
# Print accuracy and the per-class precision/recall/F1 report for each model.
for model_name, predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    print(f"\n{model_name} Results:")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

# Create outputs folder
os.makedirs("outputs", exist_ok=True)

# Confusion Matrix (Random Forest)
# confusion_matrix puts true labels on rows and predicted labels on columns,
# so label the heatmap axes accordingly; unlabeled, the figure is ambiguous.
cm = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Random Forest Confusion Matrix")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
fig.savefig("outputs/confusion_matrix.png", bbox_inches="tight")
# Close this specific figure so it neither renders inline nor leaks into later plots.
plt.close(fig)

In [None]:

# ROC Curve (Random Forest)
# Score each test row with the predicted probability of the positive class
# (column 1 of predict_proba), then sweep thresholds to trace the curve.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob_rf)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f'RF AUC={roc_auc:.2f}')
# Dashed red diagonal for reference
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.legend()
roc_fig.savefig("outputs/roc_curve.png")
plt.close(roc_fig)


In [None]:
# 7. SHAP Explainability
# Make sure the output folder exists even if this cell is run on its own.
os.makedirs("outputs", exist_ok=True)

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

# SHAP's return format for classifiers differs between releases:
#   * older releases: a list with one (n_samples, n_features) array per class
#   * newer releases: a single (n_samples, n_features, n_classes) array
# Indexing the newer format with [1] would select the second *sample*, not the
# positive class, so normalise both formats to a 2-D positive-class matrix.
if isinstance(shap_values, list):
    shap_values_pos = np.asarray(shap_values[1])
else:
    shap_values_pos = np.asarray(shap_values)
    if shap_values_pos.ndim == 3:
        shap_values_pos = shap_values_pos[:, :, 1]

# expected_value may be a scalar or hold one entry per class; take the
# positive-class baseline when there is one per class.
expected_values = np.atleast_1d(explainer.expected_value)
expected_value_pos = expected_values[1] if expected_values.size > 1 else expected_values[0]

# Summary Plot
shap.summary_plot(shap_values_pos, X_test, show=False)
plt.savefig("outputs/shap_summary.png", bbox_inches="tight")
plt.close()

# Explain 5 predictions (fewer if the test set is smaller).
# A seeded generator keeps the chosen rows, and hence the output file names,
# stable across runs. Rows are sampled by position so the SHAP matrix and
# X_test are indexed consistently even if index labels are not unique.
rng = np.random.default_rng(42)
n_explain = min(5, len(X_test))
sample_positions = rng.choice(len(X_test), size=n_explain, replace=False)
sample_idx = X_test.index[sample_positions]  # original row labels, used in file names

for pos, i in zip(sample_positions, sample_idx):
    shap.force_plot(
        expected_value_pos,
        shap_values_pos[pos],
        X_test.iloc[pos],
        matplotlib=True,
        show=False
    )
    # A tight bounding box keeps the feature annotations from being clipped.
    plt.savefig(f"outputs/shap_explain_{i}.png", bbox_inches="tight")
    plt.close()

print("\n All outputs saved in 'outputs/' folder")