In [None]:
# STEP 1: Install Libraries
!pip install shap --quiet
!pip install --upgrade scikit-learn --quiet

# STEP 2: Import Libraries
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import LabelEncoder
from google.colab import files

# STEP 3: Load and Preprocess Data
df = pd.read_csv("/content/credit_risk_dataset.csv")

# Fill missing values in the two numeric columns with the column median.
# The filled column is assigned back to `df` on purpose: calling
# fillna(..., inplace=True) on the `df[col]` selection is chained assignment,
# which emits a FutureWarning on pandas 2.x and leaves `df` untouched once
# Copy-on-Write is the default behaviour (pandas 3.0).
df['person_emp_length'] = df['person_emp_length'].fillna(df['person_emp_length'].median())
df['loan_int_rate'] = df['loan_int_rate'].fillna(df['loan_int_rate'].median())

# Encode categorical variables
# Every listed column is replaced by the codes of a LabelEncoder that is
# fitted on that column alone.
categorical_cols = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]
encoded_categoricals = df[categorical_cols].apply(
    lambda column: LabelEncoder().fit_transform(column)
)
df[categorical_cols] = encoded_categoricals

# Define features and target
# `loan_status` is the label; every other column is used as a predictor.
y = df["loan_status"]
X = df.drop(columns=["loan_status"])

# STEP 4: Train-Test Split
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# STEP 5: Hyperparameter Tuning with RandomizedSearchCV
# The weak learner is passed as `estimator`: scikit-learn 1.2 renamed
# AdaBoostClassifier's `base_estimator` argument and 1.4 removed it, so with
# the upgraded scikit-learn installed in STEP 1 the old spelling raises a
# TypeError. The tree's own parameters are addressed as `estimator__<name>`.
param_dist = {
    "n_estimators": [50, 100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1, 0.3, 1.0],
    "estimator__max_depth": [1, 2, 3, 4],
}

base_est = DecisionTreeClassifier()
ada = AdaBoostClassifier(estimator=base_est)

# Try 20 of the 80 possible combinations, ranked by 3-fold cross-validated
# ROC AUC. The seed fixes which combinations are sampled, in line with the
# seeded train/test split.
search = RandomizedSearchCV(ada, param_distributions=param_dist, n_iter=20,
                            scoring="roc_auc", cv=3, verbose=1, n_jobs=-1,
                            random_state=42)
search.fit(X_train, y_train)

# Best configuration found by the search, used for everything that follows.
model = search.best_estimator_

# STEP 6: Evaluate Model
# Hard class predictions feed accuracy, F1 and the classification report;
# the probability of class 1 feeds ROC AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("\n✅ Evaluation on Test Data:")
for metric_label, metric_fn, metric_input in (
    ("Accuracy:", accuracy_score, y_pred),
    ("F1 Score:", f1_score, y_pred),
    ("ROC AUC:", roc_auc_score, y_prob),
    ("Classification Report:\n", classification_report, y_pred),
):
    # Each metric is computed right before it is printed, in listing order.
    print(metric_label, metric_fn(y_test, metric_input))

# STEP 7: Feature Importance using Permutation
# Importance of a feature = drop in test ROC AUC when its values are shuffled
# (10 shuffles per feature, fixed seed).
perm = permutation_importance(
    model,
    X_test,
    y_test,
    scoring='roc_auc',
    n_repeats=10,
    random_state=42,
)

# Feature positions ordered from the largest to the smallest mean importance.
sorted_idx = perm.importances_mean.argsort()[::-1]

# Keep the ten largest and reverse them, so the largest is handed to barh last.
top_ten_reversed = sorted_idx[:10][::-1]
top_feature_names = X.columns[top_ten_reversed]
top_feature_scores = perm.importances_mean[top_ten_reversed]

plt.figure(figsize=(10, 5))
plt.barh(top_feature_names, top_feature_scores)
plt.title("Top 10 Feature Importances (Permutation)")
plt.xlabel("Mean Decrease in AUC")
plt.show()

# STEP 8: Simulate Drift
# A dedicated, seeded generator keeps the simulated drift -- and therefore the
# PSI value and the retraining decision derived from it -- repeatable across
# runs, without touching NumPy's global random state.
drift_rng = np.random.default_rng(42)
X_drifted = X_test.copy()

# Re-draw loan_intent with the first category over-weighted (25%) and the
# remaining 75% shared equally among the others. For six categories this is
# [0.25, 0.15, 0.15, 0.15, 0.15, 0.15]; deriving the weights from the number
# of categories avoids a ValueError when the column has a different number
# of distinct values.
intent_values = X['loan_intent'].unique()
if len(intent_values) > 1:
    intent_probs = np.full(len(intent_values), 0.75 / (len(intent_values) - 1))
    intent_probs[0] = 0.25
else:
    # A single category can only be drawn with probability 1.
    intent_probs = np.ones(len(intent_values))
X_drifted['loan_intent'] = drift_rng.choice(intent_values, size=len(X_drifted), p=intent_probs)

# Shift interest rates upward: add Gaussian noise with mean 2.0 and sd 1.0.
X_drifted['loan_int_rate'] += drift_rng.normal(loc=2.0, scale=1.0, size=len(X_drifted))

# STEP 9: PSI Calculation
def calculate_psi(expected, actual, buckets=10):
    """Return the Population Stability Index (PSI) between two samples.

    Bucket boundaries are the quantiles of ``expected``; both samples are
    then counted into those buckets and compared.

    The first and last bucket are open-ended, so values of ``actual`` that
    fall below the minimum or above the maximum of ``expected`` are counted
    in the outermost buckets instead of being silently dropped. Dropping
    them would make the ``actual`` shares sum to less than 1 and distort the
    index exactly when the distribution has moved the most.

    Args:
        expected: 1-D array-like of reference values (e.g. baseline scores).
        actual: 1-D array-like of values to compare against the reference.
        buckets: Number of quantile buckets (default 10).

    Returns:
        The PSI as a float; 0 means identical bucket shares.

    Raises:
        ValueError: If ``buckets`` is smaller than 1 or either sample is empty.
    """
    if buckets < 1:
        raise ValueError("buckets must be at least 1")

    expected = np.asarray(expected, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if expected.size == 0 or actual.size == 0:
        raise ValueError("expected and actual must both be non-empty")

    # Only the interior quantile edges are needed: everything left of the
    # first edge is bucket 0, everything right of the last edge is the final
    # bucket.
    interior_edges = np.percentile(expected, np.linspace(0, 100, buckets + 1))[1:-1]

    def _bucket_shares(values):
        # side="right" puts a value equal to an edge into the bucket on the
        # right of that edge, i.e. buckets are half-open [edge_i, edge_i+1).
        positions = np.searchsorted(interior_edges, values, side="right")
        return np.bincount(positions, minlength=buckets) / values.size

    expected_percents = _bucket_shares(expected)
    actual_percents = _bucket_shares(actual)

    # Small constant keeps the log ratio finite when a bucket is empty.
    eps = 1e-8
    psi = np.sum(
        (expected_percents - actual_percents)
        * np.log((expected_percents + eps) / (actual_percents + eps))
    )
    return psi

# Class-1 probabilities on the untouched and on the drifted test features,
# compared through PSI to quantify how far the score distribution moved.
score_before, score_drifted = (
    model.predict_proba(feature_frame)[:, 1]
    for feature_frame in (X_test, X_drifted)
)
psi_score = calculate_psi(score_before, score_drifted)
print(f"\n📊 PSI (Prediction Drift): {psi_score:.4f}")

# STEP 10: Auto-Retrain if PSI > 0.2
if psi_score > 0.2:
    print("🚨 Drift detected. Retraining model on drifted data...")
    # Refit on one half of the drifted rows and report metrics on the other
    # half. Scoring on rows whose labels were used for fitting would measure
    # memorised labels rather than how well the retrained model generalises.
    X_refit, X_holdout, y_refit, y_holdout = train_test_split(
        X_drifted, y_test, stratify=y_test, test_size=0.5, random_state=42
    )
    model.fit(X_refit, y_refit)

    holdout_pred = model.predict(X_holdout)
    holdout_prob = model.predict_proba(X_holdout)[:, 1]
    print("🔁 Retrained Model Evaluation:")
    print("Accuracy:", accuracy_score(y_holdout, holdout_pred))
    print("F1 Score:", f1_score(y_holdout, holdout_pred))
    print("ROC AUC:", roc_auc_score(y_holdout, holdout_prob))

    # Refresh the full-length test predictions so the export in STEP 11 still
    # has one row per test sample. NOTE: rows whose drifted copy landed in the
    # refit half are in-sample for the retrained model.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
else:
    print("✅ No significant drift. Model remains valid.")

# STEP 11: Save and Download Results
# One row per test sample: the test features followed by the true label,
# the predicted class and the predicted class-1 probability.
output_csv = "adaboost_predictions.csv"
results_df = X_test.assign(
    actual=y_test.values,
    predicted=y_pred,
    predicted_prob=y_prob,
)
results_df.to_csv(output_csv, index=False)

# Hand the written file to the browser through the Colab helper.
files.download(output_csv)