# In [None]:
#Main Model

# STEP 1: IMPORT DEPENDENCIES (install the required packages first)


import pandas as pd
import numpy as np
import optuna
import shap
import xgboost as xgb
import joblib
import warnings
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    roc_auc_score, classification_report, brier_score_loss,
    precision_score, f1_score
)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.calibration import CalibratedClassifierCV
# from river import linear_model, preprocessing as river_preprocessing, metrics as river_metrics

warnings.filterwarnings('ignore')

# STEP 2: LOAD DATA
df = pd.read_csv("credit_risk_dataset.csv")
categorical = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
# Integer-encode each categorical column. Values are stringified first, so a
# missing value becomes its own "nan" category instead of raising.
for col in categorical:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Rows without a target cannot be used for supervised learning.
df = df.dropna(subset=["loan_status"])
X = df.drop(columns=["loan_status"])
y = df["loan_status"]

numeric = X.select_dtypes(include=["int64", "float64"]).columns

# Split BEFORE fitting the scaler: fitting StandardScaler on the full dataset
# would leak the test rows' mean/variance into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify=y, random_state=42)
scaler = StandardScaler().fit(X_train[numeric])

# train_test_split may hand back slices of X; copy so the assignments below
# do not touch X or trigger chained-assignment warnings.
X_train, X_test = X_train.copy(), X_test.copy()
X_train[numeric] = scaler.transform(X_train[numeric])
X_test[numeric] = scaler.transform(X_test[numeric])
# Keep X standardised as well (with the train-fitted statistics) for any
# later code that still reads the full feature frame.
X[numeric] = scaler.transform(X[numeric])

# After standardisation 0 is the training mean, so fillna(0) acts as mean
# imputation for the scaled columns.
X_train = pd.DataFrame(X_train).fillna(0).astype(np.float64)
X_test = pd.DataFrame(X_test).fillna(0).astype(np.float64)

# STEP 3: PSI CALCULATION
def calculate_psi(expected, actual, buckets=10):
    """Compute the Population Stability Index (PSI) for every column.

    For each column of ``expected`` the reference range ``[min, max]`` is cut
    into ``buckets`` equal-width bins. The SAME bin edges are then applied to
    both frames, with the two outer bins left open-ended so that values of
    ``actual`` outside the reference range are still counted. (Scaling each
    frame by its own min/max would hide any pure shift or rescaling of the
    distribution.)

    Args:
        expected: Reference DataFrame (e.g. the training features).
        actual: DataFrame to compare; must contain every column of
            ``expected``.
        buckets: Number of equal-width bins built from the reference range.

    Returns:
        DataFrame with columns ``"Feature"`` and ``"PSI"``, sorted by PSI in
        descending order. PSI is NaN for a column that has no non-missing
        values in either frame.
    """
    eps = 1e-6  # keeps the log finite when a bin is empty on one side
    rows = []
    for col in expected.columns:
        # Missing values cannot be binned; compare the observed values only.
        ref = expected[col].dropna().to_numpy(dtype=float)
        cur = actual[col].dropna().to_numpy(dtype=float)
        if ref.size == 0 or cur.size == 0:
            rows.append((col, np.nan))
            continue

        lo, hi = ref.min(), ref.max()
        if hi > lo:
            edges = np.linspace(lo, hi, buckets + 1)
            # Open-ended outer bins catch out-of-range values of `actual`.
            edges[0], edges[-1] = -np.inf, np.inf
        else:
            # Constant reference column: bins are "below", "equal", "above".
            edges = np.array([-np.inf, lo, np.nextafter(lo, np.inf), np.inf])

        ref_share = np.histogram(ref, bins=edges)[0] / ref.size
        cur_share = np.histogram(cur, bins=edges)[0] / cur.size
        psi = np.sum((ref_share - cur_share) * np.log((ref_share + eps) / (cur_share + eps)))
        rows.append((col, psi))
    return pd.DataFrame(rows, columns=["Feature", "PSI"]).sort_values(by="PSI", ascending=False)

# Per-feature PSI between the train and test splits; show the head of the
# ranking returned by calculate_psi.
psi_df = calculate_psi(X_train, X_test)
top_drift = psi_df.head()
print("Top PSI drift features:\n", top_drift)

# STEP 4: Y-DRIFT
# Compare the mean of the target in each split and report the absolute gap.
train_rate = y_train.mean()
test_rate = y_test.mean()
rate_gap = abs(train_rate - test_rate)
print(f"Train default rate: {train_rate:.4f}, Test default rate: {test_rate:.4f}, Diff: {rate_gap:.4f}")

# STEP 5: ADVERSARIAL VALIDATION
# Label every row by the split it came from (0 = train, 1 = test) and check
# how well a classifier can tell the two apart. AUC near 0.5 means the splits
# look alike; a high AUC signals covariate drift.
X_adv = pd.concat([X_train, X_test])
y_adv = np.array([0]*len(X_train) + [1]*len(X_test))

# Score with out-of-fold predictions: evaluating on the same rows the model
# was fitted on gives an optimistic AUC and can raise a false drift alarm.
adv_oof = np.zeros(len(X_adv))
adv_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fit_idx, score_idx in adv_cv.split(X_adv, y_adv):
    fold_model = LogisticRegression(max_iter=1000)
    fold_model.fit(X_adv.iloc[fit_idx], y_adv[fit_idx])
    adv_oof[score_idx] = fold_model.predict_proba(X_adv.iloc[score_idx])[:, 1]
adv_auc = roc_auc_score(y_adv, adv_oof)

# Keep a model fitted on all rows under the original name for inspection.
adv_model = LogisticRegression(max_iter=1000)
adv_model.fit(X_adv, y_adv)
print(f"Adversarial AUC (train vs test): {adv_auc:.4f}")

# STEP 6: OPTUNA FOR XGBOOST
def objective(trial):
    """Optuna objective: mean 3-fold ROC AUC of an XGBoost classifier.

    Hyper-parameters are sampled from ``trial``; the classifier is fitted and
    scored on stratified folds of the module-level ``X_train`` / ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean ROC AUC over the three validation folds.
    """
    search_space = dict(
        objective="binary:logistic",
        max_depth=trial.suggest_int("max_depth", 3, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        gamma=trial.suggest_float("gamma", 0, 5),
        reg_alpha=trial.suggest_float("reg_alpha", 0, 5),
        reg_lambda=trial.suggest_float("reg_lambda", 0, 5),
        n_estimators=trial.suggest_int("n_estimators", 100, 300),
        use_label_encoder=False,
    )
    booster = xgb.XGBClassifier(**search_space)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    def fold_auc(fit_rows, holdout_rows):
        # Refit the same estimator on this fold's training rows, then score
        # its positive-class probabilities on the held-out rows.
        booster.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
        scores = booster.predict_proba(X_train.iloc[holdout_rows])[:, 1]
        return roc_auc_score(y_train.iloc[holdout_rows], scores)

    return np.mean([fold_auc(fit_rows, holdout_rows)
                    for fit_rows, holdout_rows in folds.split(X_train, y_train)])

# Run 30 Optuna trials, maximising the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best trial's sampled parameters plus the fixed flag the objective also set.
xgb_best_params = {**study.best_params, "use_label_encoder": False}

# Fit the winning configuration on the whole training split.
xgb_model = xgb.XGBClassifier(**xgb_best_params).fit(X_train, y_train)

# STEP 7: SHAP FEATURE SELECTION (TOP 30)
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_train)
# Mean absolute SHAP value along axis 0.
# NOTE(review): assumes shap_values.values is (n_samples, n_features) — confirm.
shap_mean = np.mean(np.abs(shap_values.values), axis=0)
# argsort is ascending, so the last 30 positions are the largest scores
# (all features, in ascending order, when fewer than 30 exist).
top_indices = np.argsort(shap_mean)[-30:]
top_features = X_train.columns[top_indices]
X_train, X_test = [frame[top_features].astype(np.float64) for frame in (X_train, X_test)]

# STEP 8: STACKING + CALIBRATION
# Base learners feeding the stack. `multi_class="ovr"` was dropped from the
# logistic regression: the argument is deprecated in recent scikit-learn
# releases and has no effect on a binary target. The random forest is seeded
# so repeated runs produce the same model.
base_models = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42)),
    ('xgb', xgb_model)
]
meta = RidgeClassifier()
# passthrough=True gives the meta-learner the original features alongside
# the base models' predictions.
stacked = StackingClassifier(estimators=base_models, final_estimator=meta, passthrough=True, n_jobs=-1)
# Sigmoid (Platt) calibration with 5-fold CV wraps the stack so that
# predict_proba is available on the final model.
calibrated = CalibratedClassifierCV(estimator=stacked, method='sigmoid', cv=5)
calibrated.fit(X_train.values, y_train.values)

# STEP 9: METRICS + EXPORT
X_eval = X_test.values
y_proba = calibrated.predict_proba(X_eval)[:, 1]
y_pred = calibrated.predict(X_eval)

# (label, metric function, predictions it is evaluated on), in print order.
# AUC and Brier score use probabilities; the rest use hard predictions.
metric_table = (
    ("Final AUC:", roc_auc_score, y_proba),
    ("Brier Score:", brier_score_loss, y_proba),
    ("Precision:", precision_score, y_pred),
    ("F1 Score:", f1_score, y_pred),
    ("Classification Report:\n", classification_report, y_pred),
)
for label, metric, scores in metric_table:
    print(label, metric(y_test, scores))

# Test features plus ground truth, hard prediction and probability.
submission = X_test.assign(actual=y_test.values, predicted=y_pred, probability=y_proba)
submission.to_csv(r"C:\Users\maazj\Downloads\final_submission_stacked.csv", index=False)

# STEP 10: VERSIONING & RETRAINING
# Any one of the three drift signals triggers a refit: feature PSI,
# adversarial AUC, or a shift in the target rate.
drift_detected = (
    psi_df["PSI"].max() > 0.2
    or adv_auc > 0.6
    or abs(train_rate - test_rate) > 0.05
)
if drift_detected:
    print("⚠ Drift detected — retraining model...")
    # NOTE(review): this refits on the test split only, discarding what was
    # learned from the training split — confirm that is intended.
    calibrated.fit(X_test.values, y_test.values)
    joblib.dump(calibrated, r"C:\Users\maazj\Downloads\model_v2_retrained.pkl")
else:
    # The original literal began with an invisible U+202A (left-to-right
    # embedding) character in front of "C:", which made the path invalid.
    # NOTE(review): the target has no file extension; if "Final" is a
    # directory, point this at a file inside it instead.
    joblib.dump(calibrated, r"C:\IBA\FDA\Final")

# STEP 11: RIVER ONLINE TRAINING
# River is optional; import it here so the step runs when the package is
# installed and is skipped with a message (instead of a NameError) otherwise.
try:
    from river import linear_model, preprocessing as river_preprocessing, metrics as river_metrics
except ImportError:
    river_pipeline = None
    river_auc = None
    print("River is not installed — skipping online training.")
else:
    print("Running River Online Logistic Regression...")
    river_pipeline = river_preprocessing.StandardScaler() | linear_model.LogisticRegression()
    river_auc = river_metrics.ROCAUC()

    # Test-then-train: score each row before the model learns from it.
    for xi, yi in zip(X_test.to_dict(orient='records'), y_test):
        label = bool(yi)
        # ROC AUC needs a ranking score, so use the class probabilities
        # rather than the hard label from predict_one.
        proba = river_pipeline.predict_proba_one(xi)
        # update() and learn_one() mutate in place; current River releases
        # return None from both, so their results must not be reassigned.
        river_auc.update(label, proba)
        river_pipeline.learn_one(xi, label)

    print("River Online AUC:", river_auc.get())

# STEP 12: OPTIONAL - PyCaret AutoML (Uncomment only if on Python 3.9–3.11)
# from pycaret.classification import *
# df_py = pd.DataFrame(X[top_features])
# df_py['loan_status'] = y.values
# setup(data=df_py, target='loan_status', silent=True, use_gpu=True)
# best_model = compare_models()