# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML Frameworks
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                             ConfusionMatrixDisplay, roc_auc_score, accuracy_score)
from xgboost import XGBClassifier

# Resampling for Imbalanced Data
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

# =============================================================================
# PHASE 5: PREDICTIVE MODELING & ARCHITECTURE
# =============================================================================

# Load the enriched loan dataset from disk.
df_1 = pd.read_csv('loan_data_enriched.csv')

# --- 1. FEATURE SELECTION & DIMENSIONALITY REDUCTION ---
# Columns excluded from modeling: high-cardinality identifiers, redundant
# fields, and features that introduce 'Data Leakage' (e.g., total_payment).
to_remove = [
    'id',
    'address_state',
    'application_type',
    'emp_title',
    'issue_date',
    'last_credit_pull_date',
    'last_payment_date',
    'next_payment_date',
    'loan_status',
    'member_id',
    'sub_grade',
    'emp_length',
    'annual_income',
    'int_rate',
    'dti',
    'grade',
    'installment',
    'month',
    'emp_length_num',
    'risk_score',
    'total_payment',
]

# Only columns actually present in the frame are dropped; listed names that
# are absent from the CSV are skipped without raising.
present_to_remove = [col for col in to_remove if col in df_1.columns]
df_ml = df_1.drop(columns=present_to_remove)

# --- 2. PREPROCESSING & FEATURE TRANSFORMATION PIPELINES ---
# Every feature group below is routed to its own treatment inside a single
# ColumnTransformer.

# A. Skewed numeric features: log1p first, robust scaling second.
col_to_log = [
    'loan_amount',
    'total_acc',
    'annual_income_capped',
    'payment_to_income_ratio',
    'loan_to_income_ratio',
]

# B. Numeric feature that is robust-scaled without a log step.
col_to_scale_only = ['payment_completion_ratio']

# C. Categorical features (one-hot encoded) and binary flags (passed through).
cat_features = [
    'home_ownership',
    'purpose',
    'term',
    'verification_status',
    'grade_numeric',
    'dti_category',
    'int_rate_category',
    'tenure_tier',
    'risk_segment',
]
binary_features = ['dti_rate_warning', 'int_rate_warning', 'is_60_months']

# np.log1p wrapped as a transformer so it can be used as a pipeline step.
log_transformer = FunctionTransformer(np.log1p, validate=True)

# RobustScaler is utilized to mitigate the influence of financial outliers.
log_then_scale = Pipeline([
    ('log', log_transformer),
    ('scaler', RobustScaler()),
])
one_hot = OneHotEncoder(drop='first', handle_unknown='ignore')

# Assemble the per-group transformers; names are kept stable because they
# become part of the fitted pipeline's parameter names.
preprocessor = ColumnTransformer(transformers=[
    ('log_transform', log_then_scale, col_to_log),
    ('std_scale', RobustScaler(), col_to_scale_only),
    ('categorical', one_hot, cat_features),
    ('binary', 'passthrough', binary_features),
])

# --- 3. PIPELINE FACTORY (RESAMPLING & CLASSIFICATION) ---
# Objective: Create a reproducible pipeline that integrates SMOTEENN 
# to address the 14% minority class (defaulted loans) within the CV loop.
def create_credit_pipeline(classifier):
    """Build a preprocessing -> SMOTEENN -> classifier pipeline.

    Each call receives its own unfitted copy of the module-level
    ``preprocessor``. Without the copy, every pipeline would hold the very
    same transformer object, so fitting one pipeline would re-fit (and
    silently replace) the preprocessing state seen by all the others.

    Args:
        classifier: Estimator instance used as the final 'classifier' step.

    Returns:
        An imblearn ``Pipeline`` with the steps 'preprocessing', 'smoteen'
        and 'classifier', in that order.
    """
    # Imported locally so this function brings its own dependency into scope.
    from sklearn.base import clone

    return Pipeline(steps=[
        ('preprocessing', clone(preprocessor)),
        ('smoteen', SMOTEENN(random_state=42)),  # Hybrid resampling to clean noise
        ('classifier', classifier)
    ])

# --- 4. DATA PARTITIONING (STRATIFIED SPLIT) ---
# Separate the 'defaulted' target column from the predictors.
y = df_ml['defaulted']
X = df_ml.drop(columns=['defaulted'])

# Hold out 20% for testing; stratify=y keeps the target proportion the same
# in the training and test sets.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# --- 5. MODEL BENCHMARKING: CHAMPION VS. CHALLENGERS ---
# One linear model (Logistic Regression) is benchmarked against two ensembles
# (Random Forest, XGBoost); each is wrapped in the same credit pipeline.
candidate_classifiers = {
    "Logistic_Regression": LogisticRegression(max_iter=1000, C=1.0),
    "Random_Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
}
models = {
    label: create_credit_pipeline(estimator)
    for label, estimator in candidate_classifiers.items()
}

# =============================================================================
# 6. CHAMPION MODEL EVALUATION: LOGISTIC REGRESSION (High Interpretability)
# =============================================================================
# Fit the Logistic Regression pipeline on the training partition.
lr_pipeline = models["Logistic_Regression"]
lr_pipeline.fit(X_train, y_train)

# Hard labels feed the report and confusion matrix; column 1 of
# predict_proba is the score passed to ROC-AUC.
y_pred_LR = lr_pipeline.predict(X_test)
y_proba_LR = lr_pipeline.predict_proba(X_test)[:, 1]

print("\n--- PERFORMANCE REPORT: LOGISTIC REGRESSION ---")
lr_report = classification_report(y_test, y_pred_LR)
print(lr_report)
lr_auc = roc_auc_score(y_test, y_proba_LR)
print(f"ROC-AUC Score: {lr_auc:.4f}")

# Visualizing Confusion Matrix for Capital Protection Assessment
lr_cm_display = ConfusionMatrixDisplay.from_predictions(y_test, y_pred_LR, cmap='Blues')
lr_cm_display.ax_.set_title('Confusion Matrix - Credit Risk (Logistic Regression)')
plt.show()

# =============================================================================
# 7. MODEL INTERPRETABILITY (FEATURE COEFFICIENTS)
# =============================================================================
# 'Opening the Black Box' to identify the top risk escalators and mitigators.
# Pull the fitted preprocessing step and the fitted classifier out of the
# Logistic Regression pipeline by their step names.
preprocessor_fitted, model_logic = (
    lr_pipeline.named_steps[step_name]
    for step_name in ('preprocessing', 'classifier')
)

def get_feature_names(column_transformer):
    """Collect output feature names from a fitted ColumnTransformer-like object.

    Walks ``column_transformer.transformers_`` in order. For a pipeline-like
    transformer (anything exposing ``named_steps``) only its final step is
    consulted. A transformer offering ``get_feature_names_out`` is asked for
    its names given the input columns; otherwise the input columns are reused
    unchanged (e.g. for the 'passthrough' string).

    Entries that produce no output columns are skipped so the result stays
    aligned with the transformed matrix:
      * any entry whose transformer is the string 'drop', whatever its name;
      * any entry whose column selection is an empty list/tuple (such a
        transformer may never have been fitted).

    Args:
        column_transformer: Object with a ``transformers_`` iterable of
            ``(name, transformer, columns)`` triples.

    Returns:
        list: Feature names in transformer order.
    """
    output_features = []
    for _name, transformer, columns in column_transformer.transformers_:
        # A dropped selection contributes nothing to the output, no matter
        # whether it is the implicit remainder or a user-named entry.
        if isinstance(transformer, str) and transformer == 'drop':
            continue
        # A single column given as a bare string must not be iterated
        # character by character.
        if isinstance(columns, str):
            columns = [columns]
        # Nothing selected -> nothing emitted; avoids querying a transformer
        # that was never fitted.
        if isinstance(columns, (list, tuple)) and not columns:
            continue
        if hasattr(transformer, 'named_steps'):
            # Pipeline-like: the last step determines the output names.
            transformer = list(transformer.named_steps.values())[-1]
        if hasattr(transformer, 'get_feature_names_out'):
            names = transformer.get_feature_names_out(columns)
        else:
            names = columns
        output_features.extend(names)
    return output_features

# Pair every transformed feature name with its fitted coefficient, then rank
# the rows by absolute coefficient size (largest first).
feature_names = get_feature_names(preprocessor_fitted)
feat_imp = (
    pd.DataFrame({'feature': feature_names, 'coefficient': model_logic.coef_[0]})
    .assign(abs_coef=lambda frame: frame['coefficient'].abs())
    .sort_values(by='abs_coef', ascending=False)
)

# Visualizing Risk Drivers
top_15 = feat_imp.head(15)
# Positive coefficients are drawn in crimson, all others in seagreen.
colors = top_15['coefficient'].gt(0).map({True: 'crimson', False: 'seagreen'}).tolist()
drivers_ax = plt.figure(figsize=(10, 6)).gca()
drivers_ax.barh(top_15['feature'], top_15['coefficient'], color=colors)
# Flip the axis so the largest-magnitude coefficient is at the top.
drivers_ax.invert_yaxis()
drivers_ax.set_title("Top 15 Credit Risk Drivers (Logistic Regression Coefficients)", fontweight='bold')
drivers_ax.set_xlabel("Coefficient Weight (Direction of Impact)")
plt.show()

# =============================================================================
# 8. STABILITY VALIDATION (CROSS-VALIDATION)
# =============================================================================
# Ensuring results are robust across different folds of the data.
# cross_validate returns one 'test_<metric>' array per entry in cv_metrics,
# holding one score per fold.
cv_metrics = ['recall', 'precision', 'f1', 'roc_auc']
cv_results = cross_validate(lr_pipeline, X_train, y_train, cv=5, scoring=cv_metrics, n_jobs=-1)

print("\n--- 5-FOLD CROSS-VALIDATION SUMMARY ---")
print(f"Mean Recall:    {cv_results['test_recall'].mean():.4f} (Stability: +/- {cv_results['test_recall'].std():.4f})")
print(f"Mean Precision: {cv_results['test_precision'].mean():.4f}")
# F1 is requested in cv_metrics, so report it instead of discarding it.
print(f"Mean F1:        {cv_results['test_f1'].mean():.4f}")
print(f"Mean ROC-AUC:   {cv_results['test_roc_auc'].mean():.4f}")

# =============================================================================
# 9. CHALLENGER MODEL EVALUATION: RANDOM FOREST
# =============================================================================
# Fit the Random Forest pipeline on the training partition and predict the
# held-out labels.
rf_pipeline = models["Random_Forest"]
rf_pipeline.fit(X_train, y_train)
y_pred_RF = rf_pipeline.predict(X_test)

print("\n--- PERFORMANCE REPORT: RANDOM FOREST ---")
print(classification_report(y_test, y_pred_RF))
# Column 1 of predict_proba is the score passed to ROC-AUC.
y_proba_RF = rf_pipeline.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, y_proba_RF)
print(f"ROC-AUC Score: {rf_auc:.4f}")

rf_cm_display = ConfusionMatrixDisplay.from_predictions(y_test, y_pred_RF, cmap='Greens')
rf_cm_display.ax_.set_title('Confusion Matrix - Credit Risk (Random Forest)')
plt.show()