In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score,
    cross_val_predict, cross_validate
)
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report,
    ConfusionMatrixDisplay, precision_recall_curve, average_precision_score
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight

# Named colours reused by every figure in the notebook.
PRIMARY   = '#7b5ea7'
SECONDARY = '#00c9a7'
DANGER    = '#ff6b6b'
HIGHLIGHT = '#ffd166'
PALETTE = [PRIMARY, SECONDARY, DANGER, HIGHLIGHT, '#4ecdc4', '#a8dadc']

# Dark theme, grouped by what each setting controls and applied globally.
_surface_style = {
    'figure.facecolor': '#0f0f1a',
    'axes.facecolor': '#1a1a2e',
    'axes.edgecolor': '#444466',
}
_text_style = {
    'axes.labelcolor': '#e0e0f0',
    'xtick.color': '#aaaacc',
    'ytick.color': '#aaaacc',
    'text.color': '#e0e0f0',
    'font.family': 'DejaVu Sans',
    'font.size': 11,
}
_grid_style = {
    'grid.color': '#2a2a4a',
    'grid.linestyle': '--',
    'grid.linewidth': 0.5,
}
plt.rcParams.update({**_surface_style, **_text_style, **_grid_style})

## 1. Data Loading & Initial Inspection

In [None]:
# Read the loan records, then report the frame's size and column dtypes
# before previewing the first rows.
df = pd.read_csv('loan_data.csv')
print(f'Shape: {df.shape}', f'\nDtypes:\n{df.dtypes}', sep='\n')
df.head()

In [None]:
# Basic data-quality checks: missing values per column and fully duplicated rows.
null_counts = df.isnull().sum()
duplicate_rows = df.duplicated().sum()

print('Null counts:\n', null_counts)
print('\nDuplicates:', duplicate_rows)
df.describe()

## 2. Exploratory Data Analysis

In [None]:
fig, ax = plt.subplots(figsize=(7, 4))

# value_counts() orders by frequency, not by class label. Pin the order to
# [0, 1] so the fixed tick labels below always sit under the matching bar,
# whichever class happens to be the majority.
counts = df['loan_status'].value_counts().reindex([0, 1], fill_value=0)
bars = ax.bar(['No Default (0)', 'Default (1)'], counts.values,
              color=[SECONDARY, DANGER], width=0.5, edgecolor='none')

# Scale the annotation offset with the data instead of using a fixed count,
# so the labels sit just above the bars for any dataset size.
tallest = max(counts.max(), 1)
label_offset = 0.01 * tallest
for bar, val in zip(bars, counts.values):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_offset,
            f'{val:,}\n({val/len(df)*100:.1f}%)',
            ha='center', va='bottom', fontsize=11, color='#e0e0f0', fontweight='bold')

# Headroom so the two-line annotation above the tallest bar stays inside the axes.
ax.set_ylim(0, tallest * 1.2)
ax.set_title('Target Class Distribution', fontsize=14, fontweight='bold', pad=12)
ax.set_ylabel('Count')
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

In [None]:
# Numeric columns explored here (and reused by the correlation heatmap).
numeric_cols = [
    'person_age', 'person_income', 'person_emp_exp',
    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
    'cb_person_cred_hist_length', 'credit_score',
]

fig, axes = plt.subplots(2, 4, figsize=(18, 8))
axes = axes.flatten()

# (class value, colour, legend label) for the two overlaid histograms per panel.
class_styles = [(0, SECONDARY, 'No Default'), (1, DANGER, 'Default')]

for panel, col in zip(axes, numeric_cols):
    for status, color, label in class_styles:
        values = df.loc[df['loan_status'] == status, col]
        panel.hist(values, bins=40, alpha=0.65,
                   color=color, label=label, edgecolor='none')
    panel.set_title(col, fontsize=10, fontweight='bold')
    panel.legend(fontsize=8)
    panel.yaxis.grid(True)
    panel.set_axisbelow(True)

fig.suptitle('Numeric Feature Distributions by Loan Status', fontsize=15, fontweight='bold', y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Categorical columns whose per-category mean of loan_status is plotted below.
cat_cols = [
    'person_gender', 'person_education', 'person_home_ownership',
    'loan_intent', 'previous_loan_defaults_on_file',
]

fig, axes = plt.subplots(1, len(cat_cols), figsize=(22, 5))

for panel, col in zip(axes, cat_cols):
    # Mean of the 0/1 target within each category, highest first.
    default_rate = (
        df.groupby(col)['loan_status']
        .mean()
        .sort_values(ascending=False)
    )
    bars = panel.bar(default_rate.index, default_rate.values,
                     color=PALETTE[:len(default_rate)], edgecolor='none')
    for bar, val in zip(bars, default_rate.values):
        x_mid = bar.get_x() + bar.get_width() / 2
        panel.text(x_mid, bar.get_height() + 0.003,
                   f'{val:.2%}', ha='center', va='bottom', fontsize=8, fontweight='bold')
    panel.set_title(col, fontsize=10, fontweight='bold')
    panel.set_ylabel('Default Rate')
    panel.tick_params(axis='x', rotation=30)
    panel.yaxis.grid(True)
    panel.set_axisbelow(True)

fig.suptitle('Default Rate by Categorical Feature', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric features and the target.
corr = df[[*numeric_cols, 'loan_status']].corr()

# Hide the upper triangle (diagonal included) so each pair appears only once.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool))
cmap = sns.diverging_palette(260, 10, as_cmap=True)

heatmap_options = {
    'mask': upper_triangle,
    'cmap': cmap,
    'annot': True,
    'fmt': '.2f',
    'linewidths': 0.5,
    'linecolor': '#0f0f1a',
    'annot_kws': {'size': 9},
    'vmin': -1,
    'vmax': 1,
    'cbar_kws': {'shrink': 0.8},
}

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold', pad=12)
plt.tight_layout()
plt.show()

## 3. Preprocessing

In [None]:
# Drop incomplete rows first so the quantile caps are computed on complete records.
df = df.dropna()

# 99th-percentile caps, all computed before any row is filtered out.
capped_columns = ['person_age', 'person_income', 'person_emp_exp']
caps = {col: df[col].quantile(0.99) for col in capped_columns}
age_cap, inc_cap, emp_cap = (caps[col] for col in capped_columns)

# Keep only rows that are at or below the cap in every capped column.
within_caps = np.logical_and.reduce(
    [(df[col] <= cap).to_numpy() for col, cap in caps.items()]
)
df = df[within_caps]

print(f'Shape after outlier removal: {df.shape}')

In [None]:
# Category order handed to the OrdinalEncoder for person_education.
education_order = [['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']]

ordinal_features  = ['person_education']
binary_features   = ['previous_loan_defaults_on_file', 'person_gender']
nominal_features  = ['person_home_ownership', 'loan_intent']

# String -> 0/1 encodings for the two binary columns.
binary_encodings = {
    'previous_loan_defaults_on_file': {'Yes': 1, 'No': 0},
    'person_gender':                  {'male': 1, 'female': 0},
}

# Encode into a copy rather than overwriting `df`: mapping the columns in place
# makes this cell non-idempotent (a second run maps the already-encoded 0/1
# values to NaN). Working on a copy keeps the cell safely re-runnable.
df_encoded = df.copy()
for col, mapping in binary_encodings.items():
    # Fail loudly on values the mapping does not cover; Series.map would
    # otherwise turn them into NaN without any error.
    unmapped = df.loc[~df[col].isin(list(mapping)), col].unique().tolist()
    if unmapped:
        raise ValueError(
            f'Column {col!r} contains values with no 0/1 encoding: {unmapped}'
        )
    df_encoded[col] = df[col].map(mapping)

X = df_encoded.drop(columns=['loan_status'])
y = df_encoded['loan_status']

print(f'X shape: {X.shape}, y shape: {y.shape}')
print(f'Class balance - 0: {(y==0).sum()}, 1: {(y==1).sum()}')

In [None]:
# Stratified 80/20 hold-out split; the fixed seed makes the partition repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f'Train: {X_train.shape}, Test: {X_test.shape}')
# 'Test ' is padded so both report lines align.
for split_name, labels in (('Train', y_train), ('Test ', y_test)):
    print(f'{split_name} class balance: {labels.value_counts().to_dict()}')

## 4. Pipeline Construction

In [None]:
# Columns routed through the scaler. The two 0/1-encoded binary columns are
# included here as well.
numeric_features = [
    'person_age', 'person_income', 'person_emp_exp',
    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
    'cb_person_cred_hist_length', 'credit_score',
    'previous_loan_defaults_on_file', 'person_gender',
]

# One transformer per feature family.
scaler = StandardScaler()
education_encoder = OrdinalEncoder(categories=education_order)
nominal_encoder = OneHotEncoder(drop='first', sparse_output=False)

# Columns not listed in any transformer are dropped.
preprocessor = ColumnTransformer(
    [
        ('num', scaler, numeric_features),
        ('ordinal', education_encoder, ordinal_features),
        ('nominal', nominal_encoder, nominal_features),
    ],
    remainder='drop',
)

classifier_params = {
    'solver': 'saga',
    'max_iter': 2000,
    'class_weight': 'balanced',
    'C': 1.0,
    'random_state': 42,
    'n_jobs': -1,
}

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(**classifier_params)),
])

pipeline

## 5. Training & Cross-Validation

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score every metric from a single pass over the folds. Calling
# cross_val_score once per metric refits the pipeline on the same folds five
# times over; cross_validate fits each fold once and evaluates all scorers on it.
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

# Same structure as before: metric name -> array of per-fold scores.
cv_metrics = {metric: cv_results[f'test_{metric}'] for metric in scoring}
for metric, scores in cv_metrics.items():
    print(f'{metric:12s}: {scores.mean():.4f} (+/- {scores.std():.4f})')

In [None]:
# Mean and spread of each cross-validated metric, in insertion order.
metric_labels = [name for name in cv_metrics]
means = [fold_scores.mean() for fold_scores in cv_metrics.values()]
stds = [fold_scores.std() for fold_scores in cv_metrics.values()]
colors = [PRIMARY, SECONDARY, DANGER, HIGHLIGHT, '#4ecdc4']

fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(
    metric_labels, means,
    yerr=stds, capsize=6,
    color=colors, edgecolor='none',
    error_kw={'ecolor': '#ffffff', 'lw': 1.5},
)

# Print the mean just above each bar.
for bar, mean in zip(bars, means):
    x_mid = bar.get_x() + bar.get_width() / 2
    ax.text(x_mid, bar.get_height() + 0.01,
            f'{mean:.4f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_ylim(0, 1.12)
ax.set_ylabel('Score')
ax.set_title('5-Fold Cross-Validation Metrics', fontsize=14, fontweight='bold', pad=12)
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

In [None]:
pipeline.fit(X_train, y_train)

# Warnings are silenced globally in the import cell, so a solver that stops at
# max_iter without converging would go unnoticed. Check the iteration count
# explicitly and say so if the cap was hit.
fitted_classifier = pipeline.named_steps['classifier']
iterations_used = int(fitted_classifier.n_iter_.max())
if iterations_used >= fitted_classifier.max_iter:
    print(f'WARNING: solver reached max_iter={fitted_classifier.max_iter} without converging; '
          'coefficients may be unreliable.')
print('Model trained.')

## 6. Threshold Optimization

In [None]:
# Tune the decision threshold on out-of-fold predictions from the TRAINING
# data (reusing the `cv` splitter from the cross-validation cell). Choosing the
# threshold that maximises F1 on the test set and then reporting metrics on
# that same test set leaks test information and inflates the reported scores.
oof_proba = cross_val_predict(
    pipeline, X_train, y_train, cv=cv, method='predict_proba', n_jobs=-1
)[:, 1]

thresholds  = np.linspace(0.01, 0.99, 200)
f1_scores   = [f1_score(y_train, (oof_proba >= t).astype(int)) for t in thresholds]
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]

# Test-set probabilities are computed here for the evaluation cells below; they
# play no part in selecting the threshold.
y_proba = pipeline.predict_proba(X_test)[:, 1]

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(thresholds, f1_scores, color=PRIMARY, lw=2)
ax.axvline(optimal_threshold, color=HIGHLIGHT, linestyle='--', lw=1.5,
           label=f'Optimal threshold = {optimal_threshold:.3f}')
ax.scatter([optimal_threshold], [f1_scores[optimal_idx]], color=HIGHLIGHT, s=80, zorder=5)
ax.set_xlabel('Decision Threshold')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score vs Decision Threshold (out-of-fold, training data)',
             fontsize=14, fontweight='bold', pad=12)
ax.legend(fontsize=10)
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

print(f'Optimal threshold: {optimal_threshold:.4f}')
print(f'Best out-of-fold F1 at threshold: {f1_scores[optimal_idx]:.4f}')

## 7. Evaluation on Test Set

In [None]:
# Hard 0/1 labels at the conventional 0.5 cut-off and at the tuned threshold.
y_pred_default, y_pred_optimal = (
    (y_proba >= cutoff).astype(int) for cutoff in (0.5, optimal_threshold)
)

def get_metrics(y_true, y_pred, y_prob):
    """Return the evaluation metrics for one set of predictions as a dict.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels; used for Accuracy, Precision, Recall and F1.
    y_prob : predicted scores for the positive class; used for ROC-AUC and PR-AUC.

    Returns
    -------
    dict mapping metric name to its value, in the order Accuracy, Precision,
    Recall, F1, ROC-AUC, PR-AUC.
    """
    # Metrics computed from hard labels.
    label_metrics = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1': f1_score,
    }
    # Metrics computed from the continuous scores.
    score_metrics = {
        'ROC-AUC': roc_auc_score,
        'PR-AUC': average_precision_score,
    }

    results = {name: metric(y_true, y_pred) for name, metric in label_metrics.items()}
    for name, metric in score_metrics.items():
        results[name] = metric(y_true, y_prob)
    return results

# Evaluate both label sets against the same test probabilities.
metrics_default, metrics_optimal = (
    get_metrics(y_test, labels, y_proba)
    for labels in (y_pred_default, y_pred_optimal)
)

# One column per threshold, one row per metric.
optimal_column = f'Threshold={optimal_threshold:.3f}'
results_df = pd.DataFrame({
    'Threshold=0.50': metrics_default,
    optimal_column: metrics_optimal,
})
print(results_df.to_string(float_format='{:.4f}'.format))

In [None]:
# One confusion matrix per threshold, drawn side by side.
class_names = ['No Default', 'Default']
panels = [
    ('Confusion Matrix (Threshold=0.50)', y_pred_default),
    (f'Confusion Matrix (Threshold={optimal_threshold:.3f})', y_pred_optimal),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (title, y_pred) in zip(axes, panels):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        cm, annot=True, fmt='d', ax=ax, cmap='magma',
        xticklabels=class_names,
        yticklabels=class_names,
        linewidths=1, linecolor='#0f0f1a',
        annot_kws={'size': 13, 'weight': 'bold'},
    )
    ax.set_title(title, fontsize=12, fontweight='bold', pad=10)
    ax.set_xlabel('Predicted Label', fontsize=10)
    ax.set_ylabel('True Label', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Curve data for both panels, computed from the test-set probabilities.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)
precision_vals, recall_vals, thresholds_pr = precision_recall_curve(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_ax, pr_ax = axes

# Left panel: ROC curve with the chance diagonal for reference.
roc_ax.plot(fpr, tpr, color=PRIMARY, lw=2.5, label=f'ROC Curve (AUC = {roc_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='#666680', linestyle='--', lw=1.2, label='Random Classifier')
roc_ax.fill_between(fpr, tpr, alpha=0.1, color=PRIMARY)
roc_ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')
roc_ax.set_title('ROC Curve', fontsize=13, fontweight='bold')

# Right panel: precision-recall curve.
pr_ax.plot(recall_vals, precision_vals, color=SECONDARY, lw=2.5,
           label=f'PR Curve (AUC = {pr_auc:.4f})')
pr_ax.fill_between(recall_vals, precision_vals, alpha=0.1, color=SECONDARY)
pr_ax.set(xlabel='Recall', ylabel='Precision')
pr_ax.set_title('Precision-Recall Curve', fontsize=13, fontweight='bold')

# Shared cosmetics.
for panel in (roc_ax, pr_ax):
    panel.legend(fontsize=10)
    panel.yaxis.grid(True)
    panel.set_axisbelow(True)

plt.tight_layout()
plt.show()

In [None]:
# Per-class precision / recall / F1 for the labels produced at the tuned threshold.
report_text = classification_report(
    y_test,
    y_pred_optimal,
    target_names=['No Default', 'Default'],
)
print('Classification Report (Optimal Threshold):')
print(report_text)

## 8. Feature Importance (Log-Odds)

In [None]:
# Read the output column names from the fitted preprocessor itself rather than
# re-assembling them by hand. A hand-built list silently mislabels coefficients
# if the transformer order or the column lists ever change; names taken from the
# ColumnTransformer are, by construction, in the order the classifier saw.
# get_feature_names_out() prefixes each name with '<transformer>__', which is
# stripped for display.
fitted_preprocessor = pipeline.named_steps['preprocessor']
all_feature_names = [
    name.split('__', 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# Binary problem: coef_ has a single row of log-odds coefficients.
coefs = pipeline.named_steps['classifier'].coef_[0]

# Sorted ascending by magnitude so the largest effects end up at the top of
# the horizontal bar chart.
coef_df = (
    pd.DataFrame({'feature': all_feature_names, 'coefficient': coefs})
    .assign(abs_coef=lambda d: d['coefficient'].abs())
    .sort_values('abs_coef', ascending=True)
)

fig, ax = plt.subplots(figsize=(10, max(6, len(coef_df) * 0.35)))
colors_bar = [DANGER if c > 0 else SECONDARY for c in coef_df['coefficient']]
ax.barh(coef_df['feature'], coef_df['coefficient'], color=colors_bar, edgecolor='none')
ax.axvline(0, color='#ffffff', lw=0.8, linestyle='--')
ax.set_xlabel('Coefficient (Log-Odds)')
ax.set_title('Logistic Regression Coefficients', fontsize=14, fontweight='bold', pad=12)
ax.xaxis.grid(True)
ax.set_axisbelow(True)

# Patch is imported in the notebook's import cell.
legend_elements = [Patch(facecolor=DANGER, label='Increases default probability'),
                   Patch(facecolor=SECONDARY, label='Decreases default probability')]
ax.legend(handles=legend_elements, fontsize=9, loc='lower right')

plt.tight_layout()
plt.show()

In [None]:
# Rank features by coefficient magnitude and list the strongest ones.
top_n = 10
ranked_by_magnitude = coef_df.sort_values('abs_coef', ascending=False)
top_coef = ranked_by_magnitude.iloc[:top_n]

top_table = top_coef[['feature', 'coefficient']].to_string(
    index=False, float_format='{:.4f}'.format
)
print(f'Top {top_n} most influential features (by |coefficient|):')
print(top_table)

## 9. Predicted Probability Distribution

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

# Overlay the predicted-probability histogram of each true class.
class_styles = [(0, SECONDARY, 'No Default (0)'), (1, DANGER, 'Default (1)')]
for status, color, label in class_styles:
    in_class = (y_test == status).to_numpy()
    ax.hist(y_proba[in_class], bins=60, alpha=0.7, color=color, label=label, edgecolor='none')

# Mark both decision thresholds for comparison.
ax.axvline(0.5, color='#ffffff', lw=1.5, linestyle='--', label='Threshold = 0.50')
ax.axvline(optimal_threshold, color=HIGHLIGHT, lw=1.5, linestyle='-.',
           label=f'Optimal Threshold = {optimal_threshold:.3f}')

ax.set(xlabel='Predicted Probability of Default', ylabel='Count')
ax.set_title('Predicted Probability Distribution by True Class', fontsize=14, fontweight='bold', pad=12)
ax.legend(fontsize=10)
ax.yaxis.grid(True)
ax.set_axisbelow(True)
plt.tight_layout()
plt.show()

## 10. Final Summary

In [None]:
# Build the table straight from the metric dicts so every row label comes from
# the dict key it belongs to. A separate hard-coded label list only lines up as
# long as its order matches the dicts' insertion order.
summary = (
    pd.DataFrame({
        'Threshold=0.50':                     metrics_default,
        f'Threshold={optimal_threshold:.3f}': metrics_optimal,
    })
    .rename(index={'F1': 'F1 Score'})
    .rename_axis('Metric')
)

rule = '=' * 52
print(rule)
# The dash in this banner was mis-encoded (UTF-8 bytes decoded as cp1252);
# restored to a real em dash.
print('         LOGISTIC REGRESSION — FINAL RESULTS')
print(rule)
print(summary.to_string(float_format='{:.4f}'.format))
print(rule)