# Loan Default Prediction - Model Training

This notebook focuses on training and evaluating different machine learning models for loan default prediction, based on the insights gained from our exploratory data analysis.

## 1. Setup and Data Loading

In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings

# Silence every warning for the rest of the session. This also hides warnings
# emitted by the libraries imported above, so remove it when debugging.
warnings.filterwarnings('ignore')

# Apply the 'fivethirtyeight' matplotlib style to all figures in this notebook
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the cell output
%matplotlib inline

In [None]:
# Read the raw loan data; the path is relative to the notebook's working directory
data_path = os.path.join('..', 'data', 'Loan_Default.csv')
df = pd.read_csv(data_path)

# Report the size of what was loaded, then preview the first rows
print(f"Dataset loaded with shape: {df.shape}")
df.head()

## 2. Data Preprocessing

In [None]:
# The label to predict; every other column is treated as a candidate feature
target_column = 'Status'
feature_columns = [name for name in df.columns if name != target_column]

# Bar chart of class frequencies for the target column
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=target_column, data=df, ax=ax)
ax.set_title('Target Variable Distribution')
plt.show()

# The same information as raw counts and as percentages
target_counts = df[target_column].value_counts()
target_percent = df[target_column].value_counts(normalize=True) * 100
print("Target value counts:")
print(target_counts)
print(f"Target distribution (percentage): {target_percent}")

In [None]:
# Summarise missingness per column: absolute count and share of rows, largest first
print("Missing values per column:")
null_counts = df.isnull().sum()
missing_values = null_counts.sort_values(ascending=False)
missing_percent = (null_counts / len(df) * 100).sort_values(ascending=False)
missing_df = pd.concat(
    [missing_values, missing_percent],
    axis=1,
    keys=['Total', 'Percent'],
)

# Only show columns that actually contain missing entries
has_missing = missing_df['Total'] > 0
print(missing_df[has_missing])

# Nothing is imputed here; SimpleImputer handles it inside the preprocessing pipeline later

In [None]:
# Separate predictors from the label
X, y = df[feature_columns], df[target_column]

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for reproducibility
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [None]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical
numeric_frame = X.select_dtypes(include=['int64', 'float64'])
categorical_frame = X.select_dtypes(include=['object'])
numeric_features = numeric_frame.columns.tolist()
categorical_features = categorical_frame.columns.tolist()

print(f"Number of numeric features: {len(numeric_features)}")
print(f"Number of categorical features: {len(categorical_features)}")

In [None]:
# Numeric columns: fill gaps with the column median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on categories unseen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own sub-pipeline
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Fit on the training split only, then apply the fitted transformer to the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed training set shape: {X_train_processed.shape}")
print(f"Processed testing set shape: {X_test_processed.shape}")

In [None]:
# Persist the fitted preprocessor so the same transformation can be reloaded later
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)

preprocessor_path = os.path.join(models_dir, 'preprocessor.joblib')
joblib.dump(preprocessor, preprocessor_path)
print("Preprocessor saved successfully!")

## 3. Model Training and Evaluation

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report its test-set performance.

    Prints accuracy, precision, recall and F1 (plus ROC AUC when the model
    exposes ``predict_proba``), draws a confusion-matrix heatmap and, when
    probabilities are available, a ROC curve.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` and ``predict``; ``predict_proba`` is used if present.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every metric and plot.

    Returns
    -------
    tuple
        ``(model, metrics)`` where ``model`` is the same object after fitting and
        ``metrics`` is a dict with keys ``'accuracy'``, ``'precision'``,
        ``'recall'``, ``'f1'`` and ``'roc_auc'`` (``None`` when the model has no
        ``predict_proba``).
    """
    # Train the model
    model.fit(X_train, y_train)

    # Hard predictions, plus scores from column 1 of predict_proba when available
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    # Calculate metrics
    # NOTE(review): the metric calls rely on their default settings, which
    # presumably means a binary target here; confirm against the data.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

    # Print metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    # Compare against None explicitly: a truthiness check would hide an AUC of exactly 0.0
    if roc_auc is not None:
        print(f"ROC AUC: {roc_auc:.4f}")

    # Plot confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

    # Plot ROC curve if applicable (the diagonal is the chance-level reference line)
    if y_prob is not None:
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc='lower right')
        plt.show()

    # Return the trained model and metrics
    return model, {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    }

### 3.1 Logistic Regression

In [None]:
# Baseline logistic regression; max_iter raised to 1000 from the library default
print("Training Logistic Regression model...")
lr_model, lr_metrics = evaluate_model(
    LogisticRegression(max_iter=1000, random_state=42),
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
)

### 3.2 Random Forest

In [None]:
# Baseline random forest with default hyperparameters and a fixed seed
print("Training Random Forest model...")
rf_model, rf_metrics = evaluate_model(
    RandomForestClassifier(random_state=42),
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
)

### 3.3 Gradient Boosting

In [None]:
# Baseline gradient boosting with default hyperparameters and a fixed seed
print("Training Gradient Boosting model...")
gb_model, gb_metrics = evaluate_model(
    GradientBoostingClassifier(random_state=42),
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
)

### 3.4 XGBoost

In [None]:
# Baseline XGBoost with default hyperparameters and a fixed seed
print("Training XGBoost model...")
xgb_model, xgb_metrics = evaluate_model(
    XGBClassifier(random_state=42),
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
)

### 3.5 LightGBM

In [None]:
# Baseline LightGBM with default hyperparameters and a fixed seed
print("Training LightGBM model...")
lgbm_model, lgbm_metrics = evaluate_model(
    LGBMClassifier(random_state=42),
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
)

## 4. Model Comparison

In [None]:
# Gather the metric dicts returned by evaluate_model, keyed by display name
all_metrics = {}
all_metrics['Logistic Regression'] = lr_metrics
all_metrics['Random Forest'] = rf_metrics
all_metrics['Gradient Boosting'] = gb_metrics
all_metrics['XGBoost'] = xgb_metrics
all_metrics['LightGBM'] = lgbm_metrics

# Transpose so each row is a model and each column is a metric
metrics_df = pd.DataFrame(all_metrics).transpose()
metrics_df

In [None]:
# One bar chart per metric, laid out on a 2x3 grid (the sixth slot stays empty)
plt.figure(figsize=(12, 8))

# Accuracy, precision, recall, F1 score and ROC AUC, one panel each
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
for panel, metric in enumerate(metrics_to_plot, start=1):
    plt.subplot(2, 3, panel)
    metrics_df[metric].plot(kind='bar')
    plt.title(f'{metric.capitalize()}')
    # Shared 0-1 axis so panels are visually comparable
    plt.ylim(0, 1)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# The row label (model name) with the highest test-set F1 wins
f1_by_model = metrics_df['f1']
best_model_name = f1_by_model.idxmax()
print(f"Best model based on F1 score: {best_model_name}")

## 5. Hyperparameter Tuning for Best Model

In [None]:
# Select a fresh estimator and hyperparameter grid for the winning model.
# `model` and `param_grid` are consumed by the GridSearchCV cell that follows.
if best_model_name == 'Logistic Regression':
    # Same iteration budget as the baseline in section 3.1, so tuned candidates
    # are not handicapped by the lower library default.
    model = LogisticRegression(max_iter=1000, random_state=42)
    lr_shared = {
        'C': [0.01, 0.1, 1, 10, 100],
        'class_weight': [None, 'balanced']
    }
    # A list of grids instead of one cross product: each solver only supports
    # some penalties, and unsupported pairs make the fit fail and score NaN.
    param_grid = [
        {**lr_shared, 'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2', None]},
        {**lr_shared, 'solver': ['liblinear'], 'penalty': ['l1', 'l2']},
        {**lr_shared, 'solver': ['saga'], 'penalty': ['l1', 'l2', None]},
        # elasticnet is saga-only and needs an explicit l1_ratio to be fittable
        {**lr_shared, 'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5]},
    ]
elif best_model_name == 'Random Forest':
    model = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'class_weight': [None, 'balanced']
    }
elif best_model_name == 'Gradient Boosting':
    model = GradientBoostingClassifier(random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5],
        'subsample': [0.8, 0.9, 1.0]
    }
elif best_model_name == 'XGBoost':
    model = XGBClassifier(random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
elif best_model_name == 'LightGBM':
    model = LGBMClassifier(random_state=42)
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'num_leaves': [31, 63, 127],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
else:
    # Fail here rather than let the next cell run with an undefined or stale model/grid
    raise ValueError(f"No hyperparameter grid defined for model: {best_model_name!r}")

In [None]:
# Exhaustive search over param_grid, scored by F1 under 5-fold stratified CV.
# n_jobs=-1 uses all available cores; verbose=1 prints progress.
print(f"Running GridSearchCV for {best_model_name}...")
cv_splitter = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',
    cv=cv_splitter,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the preprocessed training data only
grid_search.fit(X_train_processed, y_train)

# Report the winning configuration and its mean cross-validated score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

In [None]:
# Take the estimator with the best CV score and measure it on the held-out test split.
# evaluate_model fits it again on the training data before scoring.
best_model = grid_search.best_estimator_
print(f"Evaluating best {best_model_name} model with tuned hyperparameters:")
tuned_model, tuned_metrics = evaluate_model(
    best_model,
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
)

## 6. Feature Importance Analysis

In [None]:
# Recover output column names from the fitted preprocessor
def get_feature_names(column_transformer):
    """Return the output feature names of a fitted column transformer.

    Walks ``column_transformer.transformers_``. Columns of the ``'num'`` entry
    are passed through by name; for the ``'cat'`` entry the names come from the
    ``'onehot'`` step's ``get_feature_names_out``. Entries with any other name
    are ignored.
    """
    collected = []

    for step_name, fitted, columns in column_transformer.transformers_:
        if step_name == 'num':
            # Numeric columns keep their original names
            collected += list(columns)
        elif step_name == 'cat':
            # One-hot encoding expands each column; ask the encoder for the names
            encoder = fitted.named_steps['onehot']
            collected += list(encoder.get_feature_names_out(columns))

    return collected

# Try to get real feature names; fall back to positional placeholders if that fails
try:
    feature_names = get_feature_names(preprocessor)
    print(f"Number of features after preprocessing: {len(feature_names)}")
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and report why the fallback was taken.
    feature_names = [f"feature_{i}" for i in range(X_train_processed.shape[1])]
    print(f"Could not get feature names ({type(exc).__name__}: {exc}), using generic feature names instead.")

In [None]:
# Pick whichever per-feature scores the tuned model exposes: feature_importances_
# if present, otherwise the first row of coef_. Models with neither are skipped.
if hasattr(best_model, 'feature_importances_'):
    raw_scores = best_model.feature_importances_
    bar_heights = raw_scores
    chart_title = 'Feature Importance'
elif hasattr(best_model, 'coef_'):
    raw_scores = best_model.coef_[0]
    # Rank and plot by magnitude; the printed list below keeps the sign
    bar_heights = np.abs(raw_scores)
    chart_title = 'Feature Importance (Absolute Coefficients)'
else:
    raw_scores = None

if raw_scores is not None:
    # Feature positions ordered from highest to lowest score
    indices = np.argsort(bar_heights)[::-1]
    top_n = min(20, len(raw_scores))
    top_labels = [feature_names[i] for i in indices][:20]

    # Plot the top 20 most important features
    plt.figure(figsize=(12, 8))
    plt.title(chart_title)
    plt.bar(range(top_n), bar_heights[indices][:20], align='center')
    plt.xticks(range(top_n), top_labels, rotation=90)
    plt.tight_layout()
    plt.show()

    # Print the top 20 most important features
    print("Top 20 most important features:")
    for rank, idx in enumerate(indices[:top_n], start=1):
        print(f"{rank}. {feature_names[idx]}: {raw_scores[idx]:.4f}")

## 7. Save the Final Model

In [None]:
# Make sure the output directory exists before writing into it
models_dir = os.path.join('..', 'models')
os.makedirs(models_dir, exist_ok=True)

# File named after the model, e.g. 'Random Forest' -> 'random_forest.joblib'
model_filename = best_model_name.lower().replace(' ', '_') + '.joblib'

# Write the model twice: under its own name and under the fixed name
# 'best_model.joblib' for easier reference
for filename in (model_filename, 'best_model.joblib'):
    joblib.dump(best_model, os.path.join(models_dir, filename))

print(f"Best model ({best_model_name}) saved successfully!")

## 8. Conclusion

In this notebook, we've:

1. Preprocessed the Loan Default dataset, handling missing values and encoding categorical features
2. Trained and evaluated several machine learning models
3. Performed hyperparameter tuning on the best model
4. Analyzed feature importance to understand which factors most influence loan default
5. Saved the best model for deployment

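The best model is the one with the highest F1 score in the Section 4 comparison (stored in `best_model_name`); its F1 score on the test set after tuning is reported in Section 5 (stored in `tuned_metrics['f1']`). Markdown cells do not interpolate Python variables, so refer to those cell outputs for the actual values.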

Next steps:
1. Deploy the model using the provided Streamlit application
2. Monitor the model's performance over time
3. Consider feature engineering or advanced techniques to further improve performance