In [13]:
# loan_prediction_analysis_improved.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (train_test_split, GridSearchCV, 
                                   cross_val_score, StratifiedKFold)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, 
                             GradientBoostingClassifier)
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                           classification_report, roc_auc_score, 
                           RocCurveDisplay, precision_score, 
                           recall_score, f1_score, precision_recall_curve)
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.utils.class_weight import compute_class_weight
import pickle
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# ====================== 1. Data Loading & Initial Inspection ======================
# Read the raw loan records and print a quick overview (first rows, schema,
# and per-column missing-value counts) before any cleaning is applied.
print("Loading and inspecting data...")
df = pd.read_csv('loan_data.csv')

print("\n=== First 5 Rows ===")
print(df.head())

print("\n=== Dataset Info ===")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\n=== Missing Values ===")
print(df.isnull().sum())

# ====================== 2. Data Cleaning ======================
print("\nCleaning data...")
# The loan identifier and the date/time columns are discarded up front.
irrelevant_columns = ['Loan_ID', 'effective_date', 'due_date', 'paid_off_time']
df = df.drop(columns=irrelevant_columns)

# Missing past_due_days entries become 0 so the column has no gaps for the
# correlation step; the column is excluded from the model features later.
df = df.fillna({'past_due_days': 0})

# ====================== 3. Feature Engineering ======================
print("\nEngineering features...")
# Target encoding: PAIDOFF is the positive class (1); both collection
# statuses are folded into the negative class (0).
target_map = {
    'PAIDOFF': 1,
    'COLLECTION': 0,
    'COLLECTION_PAIDOFF': 0
}

# Categorical encodings. education is mapped to an ordinal scale.
gender_map = {'male': 0, 'female': 1}
education_map = {
    'High_School_or_Below': 0,
    'College': 1,
    'Bachelor': 2,
    'Master_or_Above': 3
}

# Series.map() silently turns every value that is absent from the mapping
# (unexpected spelling, new category, or a missing value) into NaN, and that
# NaN would then flow into the correlation and modeling steps. Check each
# encoded column and fail immediately with the offending raw values instead.
for column, mapping in (
    ('loan_status', target_map),
    ('Gender', gender_map),
    ('education', education_map),
):
    encoded = df[column].map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = df.loc[unmapped, column].unique().tolist()
        raise ValueError(
            f"Column '{column}' contains values with no encoding: {bad_values}"
        )
    df[column] = encoded

# ====================== 4. Correlation Analysis ======================
print("\nAnalyzing correlations...")
correlation_matrix = df.corr()

print("\n=== Correlation with loan_status ===")
target_correlations = correlation_matrix['loan_status']
print(target_correlations.sort_values(ascending=False))

# Draw the full matrix as an annotated heatmap and write it to disk.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix')
heatmap_fig.tight_layout()
heatmap_fig.savefig('correlation_matrix.png')
# Close the figure so it is not rendered inline in notebook environments.
plt.close(heatmap_fig)

# ====================== 5. Data Preparation ======================
print("\nPreparing data for modeling...")
X = df.drop(['loan_status', 'past_due_days'], axis=1)  # Remove potential leaky feature
y = df['loan_status']

# Check class distribution
print("\nClass distribution:")
print(y.value_counts(normalize=True))

# Train-test split with stratification so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# The split hands back row subsets of X. Take explicit copies before writing
# scaled columns into them, so the assignments below neither trigger pandas'
# SettingWithCopyWarning nor depend on view-versus-copy behaviour.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numerical features; the scaler is fitted on the training part only
# and then applied unchanged to the test part.
scaler = StandardScaler()
num_cols = ['Principal', 'terms', 'age']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Compute class weights for imbalanced data. Each weight is paired with the
# label it belongs to instead of assuming labels 0 and 1 sit at positions 0
# and 1, and the values are converted to plain Python numbers for printing.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}
print("\nClass weights:", class_weight_dict)

# ====================== 6. Feature Selection ======================
print("\nSelecting best features...")
# k='all' keeps every column; the selector is used here only to obtain a
# univariate f_classif score per feature.
selector = SelectKBest(f_classif, k='all').fit(X_train, y_train)

# Tabulate the scores, highest first
feature_scores = (
    pd.DataFrame({'Feature': X.columns, 'Score': selector.scores_})
    .sort_values('Score', ascending=False)
)

print("\nFeature importance scores:")
print(feature_scores)

# NOTE: top_k is defined but not applied anywhere below; the "selected"
# frames are simply the full train/test feature frames.
top_k = 4
X_train_selected, X_test_selected = X_train, X_test

# ====================== 7. Model Training & Evaluation ======================
print("\nTraining and evaluating models...")
# Candidate classifiers, keyed by the display name used in logs and filenames.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, class_weight='balanced', random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# One seeded 5-fold stratified splitter serves every model; with a fixed
# integer seed it yields the same folds on each use.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    file_stem = name.lower().replace(" ", "_")

    # Fit on the training split, then predict labels and class-1
    # probabilities for the held-out split.
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    y_proba = model.predict_proba(X_test_selected)[:, 1]

    # Held-out metrics, stored together with the fitted estimator
    model_metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "confusion_matrix": confusion_matrix(y_test, y_pred),
        "classification_report": classification_report(y_test, y_pred),
        "model": model,
    }
    results[name] = model_metrics

    # Mean cross-validated ROC-AUC on the training split
    cv_scores = cross_val_score(
        model, X_train_selected, y_train, cv=cv, scoring='roc_auc'
    )
    model_metrics['cv_roc_auc'] = np.mean(cv_scores)

    # ROC curve on the held-out split
    roc_display = RocCurveDisplay.from_estimator(model, X_test_selected, y_test)
    roc_display.ax_.set_title(f'ROC Curve - {name}')
    roc_display.figure_.savefig(f'roc_curve_{file_stem}.png')
    plt.close(roc_display.figure_)

    # Precision-recall curve on the held-out split
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    pr_fig, pr_ax = plt.subplots()
    pr_ax.plot(recall, precision, marker='.')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'Precision-Recall Curve - {name}')
    pr_fig.savefig(f'pr_curve_{file_stem}.png')
    plt.close(pr_fig)

# Report every model's stored metrics, one section per model
for model_name in results:
    metrics = results[model_name]
    summary_lines = [
        f"\n=== {model_name} ===",
        f"Accuracy: {metrics['accuracy']:.4f}",
        f"ROC-AUC: {metrics['roc_auc']:.4f}",
        f"CV ROC-AUC: {metrics['cv_roc_auc']:.4f}",
        f"Precision: {metrics['precision']:.4f}",
        f"Recall: {metrics['recall']:.4f}",
        f"F1 Score: {metrics['f1']:.4f}",
        "\nConfusion Matrix:",
    ]
    print("\n".join(summary_lines))
    print(metrics['confusion_matrix'])
    print("\nClassification Report:")
    print(metrics['classification_report'])

# ====================== 8. Hyperparameter Tuning ======================
print("\nTuning Gradient Boosting hyperparameters...")
# Exhaustive grid: three candidate values for each of five hyperparameters.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    subsample=[0.8, 0.9, 1.0],
)

# Candidates are ranked by ROC-AUC over a seeded 3-fold stratified split.
tuning_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
gb = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=tuning_cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_selected, y_train)

# Keep the estimator fitted with the winning parameter combination
best_gb = grid_search.best_estimator_
print("\nBest Parameters:", grid_search.best_params_)

# Score the tuned estimator on the held-out split: hard labels feed the
# threshold-based metrics, class-1 probabilities feed ROC-AUC.
y_pred = best_gb.predict(X_test_selected)
y_proba = best_gb.predict_proba(X_test_selected)[:, 1]

tuned_report = [
    "\n=== Tuned Gradient Boosting ===",
    f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}",
    f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
    f"Precision: {precision_score(y_test, y_pred):.4f}",
    f"Recall: {recall_score(y_test, y_pred):.4f}",
    f"F1 Score: {f1_score(y_test, y_pred):.4f}",
]
print("\n".join(tuned_report))

# ====================== 9. Feature Importance Analysis ======================
print("\nAnalyzing feature importance...")
# One horizontal bar chart per tree-based model, bars ordered by importance.
tree_model_names = ("Random Forest", "Gradient Boosting", "Tuned Gradient Boosting")
for name in tree_model_names:
    # The tuned estimator lives in best_gb; the others are stored in results.
    if name == "Tuned Gradient Boosting":
        model = best_gb
    else:
        model = results[name]['model']

    if not hasattr(model, 'feature_importances_'):
        continue

    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]  # most important first
    bar_positions = range(len(indices))
    bar_labels = [X.columns[i] for i in indices]

    imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
    imp_ax.set_title(f"Feature Importance - {name}")
    imp_ax.barh(bar_positions, importances[indices], align='center')
    imp_ax.set_yticks(bar_positions)
    imp_ax.set_yticklabels(bar_labels)
    imp_ax.set_xlabel('Relative Importance')
    imp_fig.tight_layout()
    imp_fig.savefig(f'feature_importance_{name.lower().replace(" ", "_")}.png')
    plt.close(imp_fig)

# ====================== 10. Save Best Model ======================
print("\nSaving best model...")
best_model = best_gb  # the tuned Gradient Boosting estimator

# Bundle the estimator with the preprocessing objects stored alongside it.
# NOTE: the Gender encoding applied during feature engineering is not part
# of this bundle.
model_bundle = {
    'model': best_model,
    'scaler': scaler,
    'education_map': education_map,
    'features': list(X.columns),
    'num_cols': num_cols,
}
with open('best_loan_model.pkl', 'wb') as f:
    pickle.dump(model_bundle, f)

print("\n=== Model Training Complete ===")
print("Best model (Tuned Gradient Boosting) saved to best_loan_model.pkl")

# ====================== 11. Threshold Analysis ======================
# Imported here because this section is the only user of cross_val_predict.
from sklearn.model_selection import cross_val_predict

print("\nAnalyzing decision thresholds...")
# Test-set probabilities are still computed because the final evaluation
# consumes y_proba, but they are NOT used to choose the threshold: picking
# the cut-off on the same rows it is later scored on would make the reported
# metrics optimistic.
y_proba = best_gb.predict_proba(X_test_selected)[:, 1]

# Out-of-fold class-1 probabilities on the training split. cross_val_predict
# fits clones of best_gb, so the already-fitted estimator is left untouched.
threshold_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_proba = cross_val_predict(
    best_gb, X_train_selected, y_train,
    cv=threshold_cv, method='predict_proba'
)[:, 1]

# Calculate precision-recall curve from the out-of-fold predictions
precision, recall, thresholds = precision_recall_curve(y_train, oof_proba)

# Find threshold that maximizes F1 score (1e-8 guards against 0/0).
# precision and recall carry one trailing entry that has no matching
# threshold, so the search is restricted to the entries that do.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
optimal_idx = np.argmax(f1_scores[:-1])
optimal_threshold = thresholds[optimal_idx]

print(f"\nOptimal threshold: {optimal_threshold:.4f}")
print(f"Max F1 Score: {f1_scores[optimal_idx]:.4f}")

# Plot precision-recall vs threshold
plt.figure(figsize=(10, 6))
plt.plot(thresholds, precision[:-1], label='Precision')
plt.plot(thresholds, recall[:-1], label='Recall')
plt.plot(thresholds, f1_scores[:-1], label='F1 Score')
plt.axvline(x=optimal_threshold, color='r', linestyle='--', label='Optimal Threshold')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision, Recall, and F1 Score by Threshold')
plt.legend()
plt.tight_layout()
plt.savefig('optimal_threshold.png')
plt.close()

# ====================== 12. Final Model Evaluation with Optimal Threshold ======================
print("\nFinal evaluation with optimal threshold...")
# Label a row 1 when its class-1 probability reaches the chosen threshold.
y_pred_optimal = np.where(y_proba >= optimal_threshold, 1, 0)

final_report = [
    "\n=== Final Model with Optimal Threshold ===",
    f"Accuracy: {accuracy_score(y_test, y_pred_optimal):.4f}",
    f"Precision: {precision_score(y_test, y_pred_optimal):.4f}",
    f"Recall: {recall_score(y_test, y_pred_optimal):.4f}",
    f"F1 Score: {f1_score(y_test, y_pred_optimal):.4f}",
    "\nConfusion Matrix:",
]
print("\n".join(final_report))
print(confusion_matrix(y_test, y_pred_optimal))

Loading and inspecting data...

=== First 5 Rows ===
       Loan_ID loan_status  Principal  terms effective_date   due_date  \
0  xqd20166231     PAIDOFF       1000     30       9/8/2016  10/7/2016   
1  xqd20168902     PAIDOFF       1000     30       9/8/2016  10/7/2016   
2  xqd20160003     PAIDOFF       1000     30       9/8/2016  10/7/2016   
3  xqd20160004     PAIDOFF       1000     15       9/8/2016  9/22/2016   
4  xqd20160005     PAIDOFF       1000     30       9/9/2016  10/8/2016   

     paid_off_time  past_due_days  age             education  Gender  
0  9/14/2016 19:31            NaN   45  High_School_or_Below    male  
1   10/7/2016 9:00            NaN   50              Bachelor  female  
2  9/25/2016 16:58            NaN   33              Bachelor  female  
3  9/22/2016 20:00            NaN   27               College    male  
4  9/23/2016 21:36            NaN   28               College  female  

=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 