In [None]:
import warnings

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report,
    precision_recall_curve, average_precision_score
)
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, cross_val_predict,
    cross_validate, GridSearchCV
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
# Tree estimators and tree plotting/export helpers live in sklearn.tree
# (importing plot_tree / export_text from sklearn.ensemble raises ImportError).
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree

warnings.filterwarnings('ignore')

# Dark colour theme applied globally so every figure in the notebook matches.
_DARK_THEME = {
    'figure.facecolor': '#0f0f1a',
    'axes.facecolor':   '#1a1a2e',
    'axes.edgecolor':   '#444466',
    'axes.labelcolor':  '#e0e0f0',
    'xtick.color':      '#aaaacc',
    'ytick.color':      '#aaaacc',
    'text.color':       '#e0e0f0',
    'grid.color':       '#2a2a4a',
    'grid.linestyle':   '--',
    'grid.linewidth':   0.5,
    'font.family':      'DejaVu Sans',
    'font.size':        11,
}
plt.rcParams.update(_DARK_THEME)

# Named accent colours reused by the plotting cells below.
PRIMARY, SECONDARY = '#7b5ea7', '#00c9a7'
DANGER, HIGHLIGHT = '#ff6b6b', '#ffd166'
PALETTE = [PRIMARY, SECONDARY, DANGER, HIGHLIGHT] + ['#4ecdc4', '#a8dadc']

## 1. Data Loading & Inspection

In [None]:
# Read the raw loan records, then report size and basic data-quality counts.
df = pd.read_csv('loan_data.csv')

null_counts = df.isnull().sum()      # missing values per column
n_duplicates = df.duplicated().sum() # fully duplicated rows

print(f'Shape: {df.shape}')
print(f'Null values:\n{null_counts}')
print(f'Duplicates: {n_duplicates}')

# Last expression of the cell: rich preview of the first rows.
df.head()

## 2. Preprocessing

In [None]:
# Drop incomplete rows. The explicit .copy() gives an independent frame, so the
# column assignments further down never write into a slice of another frame
# (the resulting SettingWithCopyWarning would be hidden by the global
# warnings filter).
df = df.dropna().copy()

# Trim extreme outliers. The filters are applied sequentially: each cap is the
# 99th percentile of the rows that survived the previous filters.
for col in ['person_age', 'person_income', 'person_emp_exp']:
    cap = df[col].quantile(0.99)
    df  = df[df[col] <= cap].copy()

# Encode the two binary categoricals as 0/1.
binary_maps = {
    'previous_loan_defaults_on_file': {'Yes': 1, 'No': 0},
    'person_gender':                  {'male': 1, 'female': 0},
}
for col, mapping in binary_maps.items():
    encoded = df[col].map(mapping)
    # Series.map yields NaN for any value missing from the mapping (including
    # when this cell is re-run on already-encoded data). Fail loudly instead of
    # passing NaNs on to the model.
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, col].astype(str).unique())
        raise ValueError(
            f'Unexpected values in {col!r}: {unexpected}; expected {list(mapping)}'
        )
    df[col] = encoded.astype(int)

print(f'Shape after cleaning: {df.shape}')
df.describe()

In [None]:
# Column groups consumed by the preprocessing ColumnTransformer.
ordinal_features = ['person_education']
# One category list per ordinal feature, lowest to highest level.
education_order  = [['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']]
nominal_features = ['person_home_ownership', 'loan_intent']
numeric_features = [
    'person_age', 'person_income', 'person_emp_exp',
    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
    'cb_person_cred_hist_length', 'credit_score',
    'previous_loan_defaults_on_file', 'person_gender',
]

# Target is loan_status; every other column is a candidate feature.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

# Stratified 80/20 hold-out split with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_class0 = (y == 0).sum()
n_class1 = (y == 1).sum()
print(f'Train: {X_train.shape}  |  Test: {X_test.shape}')
print(f'Class balance — 0: {n_class0}, 1: {n_class1}')

## 3. Pipeline Construction

In [None]:
# Per-group feature handling: numeric columns pass through untouched, education
# is ordinally encoded with the explicit category order, and the nominal
# columns are one-hot encoded with the first level dropped.
feature_transformers = [
    ('num',     'passthrough',                                    numeric_features),
    ('ordinal', OrdinalEncoder(categories=education_order),       ordinal_features),
    ('nominal', OneHotEncoder(drop='first', sparse_output=False), nominal_features),
]
# remainder='drop' discards any column not named in the groups above.
preprocessor = ColumnTransformer(transformers=feature_transformers, remainder='drop')

# Class-weighted decision tree with a fixed seed.
base_tree = DecisionTreeClassifier(random_state=42, class_weight='balanced')

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier',   base_tree),
])

pipeline

## 4. Hyperparameter Tuning (GridSearchCV)

In [None]:
# Search space for the tree step; the 'classifier__' prefix routes each setting
# to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__max_depth=[4, 6, 8, 10, 12, None],
    classifier__min_samples_leaf=[1, 5, 10, 20, 50],
    classifier__criterion=['gini', 'entropy'],
    classifier__min_samples_split=[2, 10, 20],
)

# Shuffled, seeded 5-fold stratified splitter (reused by later cells).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by ROC-AUC; refit=True retrains the best
# configuration on the full training set.
search_options = dict(cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1, refit=True)
grid_search = GridSearchCV(pipeline, param_grid, **search_options)
grid_search.fit(X_train, y_train)

best_cv_auc = grid_search.best_score_
print(f'Best ROC-AUC (CV): {best_cv_auc:.4f}')
print(f'Best params: {grid_search.best_params_}')

In [None]:
# Pipeline refit with the best parameters, and the tree step inside it.
best_model = grid_search.best_estimator_
best_tree  = best_model['classifier']

# Size of the unpruned tuned tree.
tuned_depth, tuned_leaves = best_tree.get_depth(), best_tree.get_n_leaves()
print(f'Tree depth:  {tuned_depth}')
print(f'Tree leaves: {tuned_leaves}')

## 5. Cost-Complexity Pruning (alpha sweep)

In [None]:
# Reuse the preprocessor fitted inside the tuned pipeline so the tree-only
# experiments below see exactly the same feature matrix.
preprocessor_fit = best_model.named_steps['preprocessor']
X_train_t = preprocessor_fit.transform(X_train)
X_test_t  = preprocessor_fit.transform(X_test)

# Tuned tree settings with the pipeline prefix stripped (ccp_alpha is swept here).
tuned_params = {
    k.replace('classifier__', ''): v
    for k, v in grid_search.best_params_.items()
    if k != 'classifier__ccp_alpha'
}

# Select alpha on a validation split carved out of the TRAINING data.
# Picking it by test-set ROC-AUC would leak the test set into model selection
# and make every test metric reported later optimistically biased.
X_fit_t, X_val_t, y_fit, y_val = train_test_split(
    X_train_t, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Compute the pruning path for the tuned tree configuration so the candidate
# alphas are the effective alphas of the tree that is actually being pruned.
path = DecisionTreeClassifier(
    class_weight='balanced', random_state=42, **tuned_params
).cost_complexity_pruning_path(X_fit_t, y_fit)

# The largest alpha collapses the tree to a single node, so it is dropped.
# Fall back to the full list if that would leave nothing to evaluate.
ccp_alphas = path.ccp_alphas[:-1]
if len(ccp_alphas) == 0:
    ccp_alphas = path.ccp_alphas

train_scores, val_scores = [], []
for alpha in ccp_alphas:
    dt = DecisionTreeClassifier(
        class_weight='balanced', random_state=42,
        ccp_alpha=alpha,
        **tuned_params
    )
    dt.fit(X_fit_t, y_fit)
    train_scores.append(roc_auc_score(y_fit, dt.predict_proba(X_fit_t)[:, 1]))
    val_scores.append(roc_auc_score(y_val, dt.predict_proba(X_val_t)[:, 1]))

best_alpha_idx = np.argmax(val_scores)
best_ccp_alpha = ccp_alphas[best_alpha_idx]

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(ccp_alphas, train_scores, color=SECONDARY, lw=2, label='Train ROC-AUC')
ax.plot(ccp_alphas, val_scores,   color=DANGER,    lw=2, label='Validation ROC-AUC')
ax.axvline(best_ccp_alpha, color=HIGHLIGHT, linestyle='--', lw=1.5,
           label=f'Optimal alpha = {best_ccp_alpha:.5f}')
ax.set_xlabel('ccp_alpha')
ax.set_ylabel('ROC-AUC')
ax.set_title('Cost-Complexity Pruning Path', fontsize=14, fontweight='bold', pad=12)
ax.legend(fontsize=10)
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

print(f'Optimal ccp_alpha: {best_ccp_alpha:.6f}')

In [None]:
# Tuned hyperparameters (pipeline prefix removed) plus the selected pruning strength.
pruned_params = {
    name.replace('classifier__', ''): value
    for name, value in grid_search.best_params_.items()
}
pruned_params.update(ccp_alpha=best_ccp_alpha)

# Fit the pruned tree on the full transformed training set.
final_tree = DecisionTreeClassifier(
    class_weight='balanced', random_state=42, **pruned_params
)
final_tree.fit(X_train_t, y_train)

pruned_depth, pruned_leaves = final_tree.get_depth(), final_tree.get_n_leaves()
print(f'Pruned tree depth:  {pruned_depth}')
print(f'Pruned tree leaves: {pruned_leaves}')

## 6. Cross-Validation on Final Model

In [None]:
# Preprocessing + pruned tree, evaluated end-to-end on the raw training frame.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_fit),
    ('classifier',   final_tree)
])

scoring_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# A single multi-metric cross_validate call fits each fold once and scores it
# with every metric, instead of refitting all folds once per metric.
# The splitter is seeded, so the fold assignments (and scores) are unchanged.
cv_results = cross_validate(final_pipeline, X_train, y_train,
                            cv=cv, scoring=scoring_names, n_jobs=-1)

# Per-fold score arrays keyed by metric name (results come back as 'test_<metric>').
cv_metrics = {metric: cv_results[f'test_{metric}'] for metric in scoring_names}
for metric, scores in cv_metrics.items():
    print(f'{metric:12s}: {scores.mean():.4f} (+/- {scores.std():.4f})')

In [None]:
# Bar chart of mean cross-validation score per metric, with std-dev error bars.
metric_labels = list(cv_metrics)
means  = [fold_scores.mean() for fold_scores in cv_metrics.values()]
stds   = [fold_scores.std()  for fold_scores in cv_metrics.values()]
colors = [PRIMARY, SECONDARY, DANGER, HIGHLIGHT, '#4ecdc4']

fig, ax = plt.subplots(figsize=(10, 5))
whisker_style = dict(ecolor='#ffffff', lw=1.5)
bars = ax.bar(metric_labels, means, yerr=stds, capsize=6,
              color=colors, edgecolor='none', error_kw=whisker_style)

# Print each mean just above its bar.
for rect, avg in zip(bars, means):
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, rect.get_height() + 0.01, f'{avg:.4f}',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_title('5-Fold Cross-Validation Metrics (Pruned Tree)', fontsize=14, fontweight='bold', pad=12)
ax.set_ylabel('Score')
ax.set_ylim(0, 1.12)  # headroom for the value labels
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

## 7. Threshold Optimization

In [None]:
# Test-set probabilities of the positive class; used only for the final
# evaluation cells, never for choosing the threshold.
y_proba = final_tree.predict_proba(X_test_t)[:, 1]

# Choose the decision threshold from out-of-fold predictions on the TRAINING
# data. Tuning it against y_test and then reporting metrics on the same test
# set would make the "optimal threshold" results optimistically biased.
# cross_val_predict fits clones, so the already-fitted final_tree is untouched.
oof_proba = cross_val_predict(
    final_tree, X_train_t, y_train, cv=cv, method='predict_proba', n_jobs=-1
)[:, 1]

thresholds    = np.linspace(0.01, 0.99, 200)
f1_scores     = [f1_score(y_train, (oof_proba >= t).astype(int)) for t in thresholds]
optimal_idx   = np.argmax(f1_scores)
optimal_thr   = thresholds[optimal_idx]

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(thresholds, f1_scores, color=PRIMARY, lw=2)
ax.axvline(optimal_thr, color=HIGHLIGHT, linestyle='--', lw=1.5,
           label=f'Optimal threshold = {optimal_thr:.3f}')
ax.scatter([optimal_thr], [f1_scores[optimal_idx]], color=HIGHLIGHT, s=80, zorder=5)
ax.set_xlabel('Decision Threshold')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score vs Decision Threshold (Out-of-Fold Training Predictions)',
             fontsize=14, fontweight='bold', pad=12)
ax.legend(fontsize=10)
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

print(f'Optimal threshold: {optimal_thr:.4f}')
print(f'Best out-of-fold F1 at threshold: {f1_scores[optimal_idx]:.4f}')

## 8. Evaluation on Test Set

In [None]:
# Hard predictions at the default 0.5 cut-off and at the tuned threshold.
y_pred_default, y_pred_optimal = (
    (y_proba >= cutoff).astype(int) for cutoff in (0.5, optimal_thr)
)

def get_metrics(y_true, y_pred, y_prob):
    """Return a dict of evaluation metrics keyed by display name.

    Accuracy, Precision, Recall and F1 are computed from the hard labels
    ``y_pred``; ROC-AUC and PR-AUC are computed from the scores ``y_prob``.
    """
    label_scorers = {
        'Accuracy':  accuracy_score,
        'Precision': precision_score,
        'Recall':    recall_score,
        'F1':        f1_score,
    }
    score_scorers = {
        'ROC-AUC': roc_auc_score,
        'PR-AUC':  average_precision_score,
    }
    report = {name: scorer(y_true, y_pred) for name, scorer in label_scorers.items()}
    report.update({name: scorer(y_true, y_prob) for name, scorer in score_scorers.items()})
    return report

metrics_default = get_metrics(y_test, y_pred_default, y_proba)
metrics_optimal = get_metrics(y_test, y_pred_optimal, y_proba)

# One column per threshold, one row per metric.
optimal_column = f'Threshold={optimal_thr:.3f}'
results_df = pd.DataFrame({
    'Threshold=0.50': metrics_default,
    optimal_column:   metrics_optimal,
})
print(results_df.to_string(float_format='{:.4f}'.format))

In [None]:
# Side-by-side confusion matrices for the default and the tuned threshold.
# NOTE(review): the tick labels assume loan_status == 1 means "Default" —
# confirm against the dataset's data dictionary.
class_labels = ['No Default', 'Default']
panels = [
    (y_pred_default, 'Confusion Matrix (Threshold=0.50)'),
    (y_pred_optimal, f'Confusion Matrix (Threshold={optimal_thr:.3f})'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (y_pred, title) in zip(axes, panels):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm, annot=True, fmt='d', ax=ax, cmap='magma',
        xticklabels=class_labels, yticklabels=class_labels,
        linewidths=1, linecolor='#0f0f1a',
        annot_kws={'size': 13, 'weight': 'bold'},
    )
    ax.set_title(title, fontsize=12, fontweight='bold', pad=10)
    ax.set_xlabel('Predicted Label', fontsize=10)
    ax.set_ylabel('True Label', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# ROC curve (left) and precision-recall curve (right) on the test set.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_ax, pr_ax = axes

# --- ROC panel ---
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc     = roc_auc_score(y_test, y_proba)
roc_ax.plot(fpr, tpr, color=PRIMARY, lw=2.5, label=f'ROC Curve (AUC = {roc_auc:.4f})')
# Diagonal reference line for a random classifier.
roc_ax.plot([0, 1], [0, 1], color='#666680', linestyle='--', lw=1.2, label='Random Classifier')
roc_ax.fill_between(fpr, tpr, alpha=0.1, color=PRIMARY)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve', fontsize=13, fontweight='bold')

# --- Precision-recall panel ---
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)
pr_ax.plot(recall_vals, precision_vals, color=SECONDARY, lw=2.5,
           label=f'PR Curve (AUC = {pr_auc:.4f})')
pr_ax.fill_between(recall_vals, precision_vals, alpha=0.1, color=SECONDARY)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve', fontsize=13, fontweight='bold')

# Shared finishing touches for both panels.
for panel in (roc_ax, pr_ax):
    panel.legend(fontsize=10)
    panel.yaxis.grid(True)
    panel.set_axisbelow(True)

plt.tight_layout()
plt.show()

In [None]:
# Per-class precision/recall/F1 for the predictions made at the tuned threshold.
report_text = classification_report(
    y_test, y_pred_optimal, target_names=['No Default', 'Default']
)
print('Classification Report (Optimal Threshold):')
print(report_text)

## 9. Feature Importance (Impurity-Based)

In [None]:
# Names of the one-hot columns produced by the fitted 'nominal' encoder.
ohe_feature_names = (
    preprocessor_fit
    .named_transformers_['nominal']
    .get_feature_names_out(nominal_features)
).tolist()

# Same order as the ColumnTransformer's transformer list: numeric passthrough,
# ordinal, then one-hot columns.
all_feature_names = numeric_features + ordinal_features + ohe_feature_names

importances = final_tree.feature_importances_
feat_df = (
    pd.DataFrame({'feature': all_feature_names, 'importance': importances})
    .sort_values('importance', ascending=True)
)

# Compute the median once rather than once per bar.
median_importance = feat_df['importance'].median()

fig, ax = plt.subplots(figsize=(10, max(6, len(feat_df) * 0.35)))
bar_colors = [PRIMARY if v >= median_importance else '#444466'
              for v in feat_df['importance']]
ax.barh(feat_df['feature'], feat_df['importance'], color=bar_colors, edgecolor='none')
ax.set_xlabel('Gini Importance')
# The fitted model is a single decision tree, not a random forest.
ax.set_title('Decision Tree Feature Importances', fontsize=14, fontweight='bold', pad=12)
ax.xaxis.grid(True)
ax.set_axisbelow(True)

legend_elements = [
    mpatches.Patch(facecolor=PRIMARY,  label='Above median importance'),
    mpatches.Patch(facecolor='#444466', label='Below median importance')
]
ax.legend(handles=legend_elements, fontsize=9)
plt.tight_layout()
plt.show()

In [None]:
# Ten highest-importance features, largest first.
top_features = feat_df.sort_values('importance', ascending=False).head(10)
top_features_text = top_features.to_string(index=False, float_format='{:.4f}'.format)

print('Top 10 most important features:')
print(top_features_text)

## 10. Tree Visualization (Top 4 Levels)

In [None]:
# Draw the top levels of the pruned decision tree on a wide, dark canvas.
fig, ax = plt.subplots(figsize=(26, 10), facecolor='#0f0f1a')
ax.set_facecolor('#0f0f1a')

plot_tree(
    final_tree,
    max_depth=4,                      # only the first levels, for readability
    feature_names=all_feature_names,
    class_names=['No Default', 'Default'],
    filled=True,
    rounded=True,
    impurity=True,
    proportion=False,
    fontsize=8,
    ax=ax
)

# The plotted model is a single decision tree, not a random forest.
ax.set_title('Decision Tree Structure (Max Depth = 4 for readability)',
             fontsize=14, fontweight='bold', pad=14, color='#e0e0f0')
plt.tight_layout()
plt.show()

## 11. Depth vs. Performance Analysis

In [None]:
# Diagnostic sweep: train/test ROC-AUC of class-weighted trees at depths 1..20.
max_depths   = range(1, 21)
depth_values = list(max_depths)
train_aucs, test_aucs = [], []

for depth in depth_values:
    depth_tree = DecisionTreeClassifier(
        max_depth=depth, class_weight='balanced', random_state=42
    ).fit(X_train_t, y_train)
    train_prob = depth_tree.predict_proba(X_train_t)[:, 1]
    test_prob  = depth_tree.predict_proba(X_test_t)[:, 1]
    train_aucs.append(roc_auc_score(y_train, train_prob))
    test_aucs.append(roc_auc_score(y_test, test_prob))

# Depth with the highest test ROC-AUC (marked on the plot).
best_depth_idx = np.argmax(test_aucs)
best_depth     = depth_values[best_depth_idx]

fig, ax = plt.subplots(figsize=(11, 5))
curve_style = dict(lw=2.5, marker='o', markersize=5)
ax.plot(depth_values, train_aucs, color=SECONDARY, label='Train ROC-AUC', **curve_style)
ax.plot(depth_values, test_aucs,  color=DANGER,    label='Test ROC-AUC',  **curve_style)
ax.axvline(best_depth, color=HIGHLIGHT, linestyle='--', lw=1.5,
           label=f'Best test depth = {best_depth}')
ax.set_xlabel('Max Depth')
ax.set_ylabel('ROC-AUC')
ax.set_title('Overfitting Analysis: Depth vs ROC-AUC', fontsize=14, fontweight='bold', pad=12)
ax.legend(fontsize=10)
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

## 12. Probability Distribution

In [None]:
# Overlaid histograms of predicted probability, one per true class,
# with both decision thresholds marked.
class_styles = [
    (0, SECONDARY, 'No Default (0)'),
    (1, DANGER,    'Default (1)'),
]

fig, ax = plt.subplots(figsize=(10, 5))
for status, color, label in class_styles:
    in_class = y_test.eq(status).to_numpy()  # boolean mask over the test rows
    ax.hist(y_proba[in_class], bins=60, alpha=0.7, color=color,
            label=label, edgecolor='none')

ax.axvline(0.5, color='#ffffff', lw=1.5, linestyle='--', label='Threshold = 0.50')
ax.axvline(optimal_thr, color=HIGHLIGHT, lw=1.5, linestyle='-.',
           label=f'Optimal Threshold = {optimal_thr:.3f}')

ax.set_title('Predicted Probability Distribution by True Class',
             fontsize=14, fontweight='bold', pad=12)
ax.set_xlabel('Predicted Probability of Default')
ax.set_ylabel('Count')
ax.legend(fontsize=10)
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

## 13. Final Summary

In [None]:
# Final comparison table: one row per metric, one column per threshold.
# Row labels rely on the metric dicts keeping their insertion order.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC', 'PR-AUC']
summary = pd.DataFrame(
    {
        'Threshold=0.50':               list(metrics_default.values()),
        f'Threshold={optimal_thr:.3f}': list(metrics_optimal.values()),
    },
    index=pd.Index(metric_names, name='Metric'),
)

heavy_rule = '=' * 55
light_rule = '-' * 55

print(heavy_rule)
print('          DECISION TREE — FINAL RESULTS')
print(heavy_rule)
print(f'  Best params:  {grid_search.best_params_}')
print(f'  Tree depth:   {final_tree.get_depth()}')
print(f'  Tree leaves:  {final_tree.get_n_leaves()}')
print(f'  ccp_alpha:    {best_ccp_alpha:.6f}')
print(light_rule)
print(summary.to_string(float_format='{:.4f}'.format))
print(heavy_rule)