# In [1]:
#Model 1
# Catboost (Updated and Final)
# STEP 1: Install libraries (in Colab, run `!pip install catboost optuna shap` in a separate cell first)

# STEP 2: Import Libraries
import pandas as pd
import numpy as np
import optuna
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from google.colab import files

# STEP 3: Load Data
# NOTE(review): the path lives under /content/drive, so Google Drive presumably
# has to be mounted before this runs — confirm against the notebook setup.
df = pd.read_csv("/content/drive/MyDrive/FDA Final Pres/credit_risk_dataset.csv")

# Impute the two numeric columns with their medians. The result is assigned
# back to the column rather than using fillna(inplace=True) on a column
# selection: that chained in-place form emits a FutureWarning on pandas 2.x
# and leaves `df` unmodified when copy-on-write is active.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split in STEP 5, so test-set statistics leak into the imputation.
for column in ("person_emp_length", "loan_int_rate"):
    df[column] = df[column].fillna(df[column].median())

# Features are every column except the target column "loan_status".
X = df.drop("loan_status", axis=1)
y = df["loan_status"]

# Columns handed to CatBoost as categorical features.
cat_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

# STEP 4: Advanced Optuna Objective with Cross-Validation
def objective(trial):
    """Return the mean 3-fold cross-validated ROC AUC for one Optuna trial.

    Samples a CatBoost hyper-parameter set from ``trial``, fits one model per
    stratified fold of the module-level ``X`` / ``y`` (with early stopping on
    the held-out fold) and scores that fold with ROC AUC.

    After every fold except the last, the running mean AUC is reported to
    ``trial`` and the trial is abandoned if the study's pruner asks for it.
    Without these reports a pruner attached to the study has nothing to act on.

    Args:
        trial: The ``optuna`` trial object used to sample hyper-parameters.

    Returns:
        The mean ROC AUC over the folds (the study maximises this value).

    Raises:
        optuna.TrialPruned: If the pruner decides to stop this trial early.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 500, 1500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "rsm": trial.suggest_float("rsm", 0.6, 1.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 0.5, 5.0),
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "verbose": 0,
        "random_seed": 42,
        "cat_features": cat_features
    }
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    last_fold = cv.get_n_splits() - 1
    auc_scores = []
    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        model = CatBoostClassifier(**params)
        # NOTE(review): the same fold drives early stopping and is then scored,
        # so the per-fold AUC is slightly optimistic.
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=50)
        preds = model.predict_proba(X_valid)[:, 1]
        auc_scores.append(roc_auc_score(y_valid, preds))

        # Give the pruner an intermediate value. Skip the check after the final
        # fold: all the work is done, so pruning would only discard a result.
        if fold < last_fold:
            trial.report(float(np.mean(auc_scores)), step=fold)
            if trial.should_prune():
                raise optuna.TrialPruned()
    return np.mean(auc_scores)

# Run the hyper-parameter search, maximising the value returned by `objective`.
# The sampler is seeded with 42, matching the seed used for the CV splitter and
# CatBoost, so the sequence of suggested parameters is repeatable.
# The 600 s timeout can still end the search before 30 trials complete, so the
# number of finished trials depends on the machine.
# MedianPruner only has an effect when the objective reports intermediate values.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=30, timeout=600)
print("Best AUC:", study.best_value)
print("Best Parameters:", study.best_params)

# STEP 5: Train Final Model
# `train_test_split` is used below but is not among the names imported at the
# top of the script (only StratifiedKFold is), so import it here to avoid a
# NameError.
from sklearn.model_selection import train_test_split

# Start from the tuned hyper-parameters and re-add the fixed settings that the
# objective passed to CatBoost but that are not part of the search space.
best_params = study.best_params
best_params.update({
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "verbose": 100,  # print training progress every 100 iterations
    "random_seed": 42,
    "cat_features": cat_features
})

# Stratified 80/20 hold-out split.
# NOTE(review): the Optuna search cross-validated on all of X, including the
# rows that end up in X_test here, so the hold-out metrics are optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train, y_train)

# STEP 6: Evaluation
# Hard class predictions and positive-class probabilities on the hold-out set.
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:, 1]

print("\n✅ Final Evaluation:")
# Each entry is (label, metric function, predictions to score against y_test).
# ROC AUC is the only metric that takes probabilities instead of class labels.
for metric_label, metric_fn, metric_input in (
    ("Accuracy:", accuracy_score, y_pred),
    ("F1 Score:", f1_score, y_pred),
    ("ROC AUC:", roc_auc_score, y_prob),
    ("Classification Report:\n", classification_report, y_pred),
):
    print(metric_label, metric_fn(y_test, metric_input))

# STEP 7: SHAP
# Build an explainer for the fitted model, explain every hold-out row, and
# show the ten features with the largest impact as a beeswarm plot.
explainer = shap.Explainer(model=final_model)
shap_values = explainer(X_test)
shap.plots.beeswarm(
    shap_values,
    max_display=10,
)

# STEP 8: PSI
def calculate_psi(expected, actual, buckets=10, epsilon=1e-6):
    """Return the Population Stability Index of ``actual`` versus ``expected``.

    Bucket edges are the ``buckets`` equal-frequency percentiles of
    ``expected``. The first and last buckets are open-ended, so values of
    ``actual`` that fall below the minimum or above the maximum of
    ``expected`` are counted in the outer buckets instead of being dropped.
    Bucket shares are floored at ``epsilon`` before taking the logarithm, so a
    bucket that is empty on one side still contributes a (large) finite term
    rather than being ignored.

    Args:
        expected: 1-D array-like of reference scores.
        actual: 1-D array-like of scores to compare against the reference.
        buckets: Number of percentile buckets (must be >= 1).
        epsilon: Floor applied to every bucket share to avoid ``log(0)``.

    Returns:
        The PSI as a NumPy float: ``sum((e - a) * ln(e / a))`` over buckets.

    Raises:
        ValueError: If either input is empty or ``buckets`` is less than 1.
    """
    expected = np.asarray(expected, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if expected.size == 0 or actual.size == 0:
        raise ValueError("expected and actual must both be non-empty")
    if buckets < 1:
        raise ValueError("buckets must be at least 1")

    edges = np.percentile(expected, np.linspace(0, 100, buckets + 1))
    # Only the interior edges are needed: everything below the first one goes
    # to bucket 0 and everything at or above the last one to the final bucket.
    interior_edges = edges[1:-1]

    def _bucket_shares(values):
        # side="right" puts a value equal to an edge into the upper bucket,
        # i.e. buckets are half-open [low, high).
        bucket_index = np.searchsorted(interior_edges, values, side="right")
        counts = np.bincount(bucket_index, minlength=buckets)
        return counts / values.size

    expected_share = np.clip(_bucket_shares(expected), epsilon, None)
    actual_share = np.clip(_bucket_shares(actual), epsilon, None)
    return np.sum((expected_share - actual_share) * np.log(expected_share / actual_share))

# Simulate drift
# Draw from a seeded generator so the simulated drift, and therefore the PSI
# printed below, is the same on every run (42 matches the seed used for the
# model and the splits).
drift_rng = np.random.default_rng(42)
X_drifted = X_test.copy()

# Resample loan_intent with a skewed distribution: the first category returned
# by unique() gets weight 0.25, the other five 0.15 each.
# NOTE(review): the weight vector is hard-coded for exactly six categories;
# a different number of distinct loan_intent values raises a ValueError.
intent_categories = X['loan_intent'].unique()
X_drifted['loan_intent'] = drift_rng.choice(
    intent_categories,
    size=len(X_drifted),
    p=[0.25, 0.15, 0.15, 0.15, 0.15, 0.15],
)
# Shift interest rates upward by noise drawn from N(2.0, 1.0).
X_drifted['loan_int_rate'] += drift_rng.normal(loc=2.0, scale=1.0, size=len(X_drifted))

# Compare the model's positive-class scores before and after the drift.
expected_scores = final_model.predict_proba(X_test)[:, 1]
drifted_scores = final_model.predict_proba(X_drifted)[:, 1]
psi_score = calculate_psi(expected_scores, drifted_scores, buckets=10)
print(f"\n📊 Population Stability Index (PSI): {psi_score:.4f}")

# STEP 9: Save & Download
# Attach the true label, the predicted class and the predicted probability to
# a copy of the hold-out features, write it to CSV without the index, and
# hand the file to the Colab download helper.
results_df = X_test.assign(
    actual=y_test.values,
    predicted=y_pred,
    predicted_prob=y_prob,
)
output_csv = "zest_advanced_model_predictions.csv"
results_df.to_csv(output_csv, index=False)
files.download(output_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, RocCurveDisplay, PrecisionRecallDisplay

# CONFUSION MATRIX
# Heatmap of hold-out counts: rows are the actual class, columns the predicted.
class_labels = ["No Default", "Default"]
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title("Confusion Matrix")
cm_ax.set_xlabel("Predicted")
cm_ax.set_ylabel("Actual")
cm_fig.tight_layout()
plt.show()

# ROC CURVE
# Plot from the positive-class probabilities computed in STEP 6. That variable
# is named `y_prob`; `y_proba` is not defined anywhere in this script.
RocCurveDisplay.from_predictions(y_test, y_prob)
plt.title("ROC Curve")
plt.grid(True)
plt.show()

# PRECISION-RECALL CURVE
# Uses the positive-class probabilities from STEP 6 (`y_prob`); the name
# `y_proba` is not defined anywhere in this script.
PrecisionRecallDisplay.from_predictions(y_test, y_prob)
plt.title("Precision-Recall Curve")
plt.grid(True)
plt.show()

# CATBOOST FEATURE IMPORTANCE (plotting top 30)
# The model trained in this script is the CatBoost `final_model`; neither an
# `xgb` module nor an `xgb_model` is defined here, so the importances are read
# from the CatBoost model instead.
# NOTE(review): get_feature_importance() is called with its default importance
# type, which for this loss should be PredictionValuesChange rather than the
# "gain" measure XGBoost reports — confirm against the CatBoost docs.
feature_importance = pd.Series(
    final_model.get_feature_importance(),
    index=final_model.feature_names_,
)
top_importance = feature_importance.sort_values(ascending=False).head(30)

plt.figure(figsize=(10, 8))
# Reverse so the most important feature is drawn at the top of the bar chart.
top_importance.iloc[::-1].plot.barh(ax=plt.gca())
plt.title("CatBoost Feature Importance (PredictionValuesChange)")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()

# SHAP SUMMARY PLOT
# `shap_values` was computed on X_test in STEP 7, so the feature matrix passed
# alongside it must be X_test as well; X_train has a different number of rows.
shap.summary_plot(shap_values, X_test, plot_type="bar", max_display=30)

# Cell output: ModuleNotFoundError: No module named 'shap'