A complete project for credit risk modeling, demonstrating hyperparameter tuning and conceptual deployment in the financial domain.

In [1]:
# To install: pip install scikit-learn numpy pandas joblib

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

## Business Problem Statement:

An online lender wants to automate its loan approval process. The goal is to build a predictive model that assesses the credit risk of new applicants. A low-risk applicant should be approved, while a high-risk applicant should be rejected to minimize financial losses for the lender. This is a classification problem.

In [None]:
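# # 2. Simulate a Realistic-Looking Dataset
# # We'll create a synthetic dataset that mimics loan application data.
# # NOTE: this cell is commented out and does not run; the notebook reads the
# # dataset from "credit_risk_modelling_data.csv" in a later cell instead.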

# print("Simulating a dataset for credit risk modeling...")
# np.random.seed(42)
# num_applicants = 5000
# data = {
#     'credit_score': np.random.normal(700, 50, num_applicants).astype(int),
#     'annual_income': np.random.lognormal(mean=11, sigma=0.8, size=num_applicants).astype(int),
#     'loan_amount': np.random.normal(25000, 10000, num_applicants).astype(int),
#     'loan_term_years': np.random.choice([2, 3, 4, 5], size=num_applicants),
#     # The target variable (0 = No Default, 1 = Default)
#     'risk_label': np.random.choice([0, 1], size=num_applicants, p=[0.9, 0.1])
# }
# df = pd.DataFrame(data)

In [3]:
# Read the loan-application data; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv(filepath_or_buffer="credit_risk_modelling_data.csv")

In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   credit_score     5000 non-null   int64
 1   annual_income    5000 non-null   int64
 2   loan_amount      5000 non-null   int64
 3   loan_term_years  5000 non-null   int64
 4   risk_label       5000 non-null   int64
dtypes: int64(5)
memory usage: 195.4 KB
None


#### Rules for assigning the risk label. These can be modified to reflect different lending policies.

In [7]:

# Policy rule: mark an application as high risk (risk_label = 1) when the
# credit score is below 650 or the requested loan amount is above 40000.
low_score_mask = df['credit_score'] < 650
large_loan_mask = df['loan_amount'] > 40000
df.loc[low_score_mask | large_loan_mask, 'risk_label'] = 1


In [8]:

# Separate the predictor columns (X) from the target column (y).
feature_columns = ['credit_score', 'annual_income', 'loan_amount', 'loan_term_years']
X = df[feature_columns]
y = df['risk_label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Dataset loaded. Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")


Dataset loaded. Training data shape: (4000, 4)
Test data shape: (1000, 4)


In [9]:

# 3. Define the Model and the Hyperparameter Grid
# Base estimator: a gradient boosting classifier with a fixed random seed.
model = GradientBoostingClassifier(random_state=42)

# Candidate values per hyperparameter. Grid search tries every combination,
# i.e. 3 x 3 x 3 = 27 candidates.
param_grid = dict(
    n_estimators=[10, 50, 100],        # number of boosting stages
    learning_rate=[0.01, 0.05, 0.1],   # step size shrinkage
    max_depth=[3, 5, 7],               # maximum depth of each individual estimator
)


In [10]:

# 4. Instantiate and Run Grid Search
print("\nStarting Grid Search for hyperparameter tuning...")

# Build the search and fit it in one expression; fit() returns the search
# object itself, so `grid_search` is the fitted GridSearchCV instance.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',   # candidates are ranked by accuracy
    cv=5,                 # 5-fold cross-validation
    n_jobs=-1,            # use all available CPU cores
    verbose=2,            # report progress while fitting
).fit(X_train, y_train)

print("\nGrid Search finished.")



Starting Grid Search for hyperparameter tuning...
Fitting 5 folds for each of 27 candidates, totalling 135 fits

Grid Search finished.


In [11]:

# 5. Get and Evaluate the Best Model
# Keep a handle on the estimator the search selected; later cells use it.
best_model = grid_search.best_estimator_

print("\nBest hyperparameters found:")
print(grid_search.best_params_)



Best hyperparameters found:
{'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50}


In [12]:

# Score the held-out test rows with the tuned model.
y_pred = best_model.predict(X_test)

# Overall accuracy plus a per-class precision / recall / F1 breakdown.
accuracy = accuracy_score(y_test, y_pred)
class_names = ['No Default', 'Default']
report = classification_report(y_test, y_pred, target_names=class_names)

print(f"\nAccuracy of the best model on the test set: {accuracy:.4f}")
print("\nClassification Report:")
print(report)



Accuracy of the best model on the test set: 0.9140

Classification Report:
              precision    recall  f1-score   support

  No Default       0.89      1.00      0.94       694
     Default       1.00      0.72      0.84       306

    accuracy                           0.91      1000
   macro avg       0.94      0.86      0.89      1000
weighted avg       0.92      0.91      0.91      1000



In [13]:

# 6. Conceptual Deployment
# The tuned model is persisted to disk so that a separate production
# application or API could load it and serve real-time predictions.
print("\n--- Conceptual Deployment ---")

# Serialize the optimized model with joblib.
model_filename = 'credit_risk_model.joblib'
print(f"Saving the best model to '{model_filename}'...")
joblib.dump(value=best_model, filename=model_filename)
print("Model successfully saved.")



--- Conceptual Deployment ---
Saving the best model to 'credit_risk_model.joblib'...
Model successfully saved.


In [16]:

# Conceptual API Usage:
# In a separate file (e.g., 'api_server.py'), you would set up a simple API
# that loads the model and takes a new applicant's data as input.
# Example of how you would load and use the model in a live application:

# Load the persisted model. joblib.load unpickles the file, so only load model
# files that come from a trusted source.
try:
    loaded_model = joblib.load('credit_risk_model.joblib')
except FileNotFoundError as err:
    # Fail here with an actionable message. Merely printing would leave
    # `loaded_model` undefined and make the prediction code in this cell die
    # with an unrelated NameError.
    raise FileNotFoundError(
        "Error: Model file not found. Please train and save the model first."
    ) from err
else:
    print("Model loaded successfully for real-time predictions.")

def assess_credit_risk(applicant_data, model=None):
    """Classify a single loan applicant as low or high risk.

    Args:
        applicant_data: Mapping of feature name to value, as it would arrive
            from an API request, e.g.
            {'credit_score': 720, 'annual_income': 85000,
             'loan_amount': 20000, 'loan_term_years': 4}.
            Key order does not matter. When the model exposes its training
            feature names, keys it was not trained on are ignored.
        model: Fitted classifier exposing ``predict``. Defaults to the
            notebook-level ``loaded_model``.

    Returns:
        "Low Risk: Approve Loan" when the model predicts class 0,
        otherwise "High Risk: Reject Loan".

    Raises:
        ValueError: If the model exposes ``feature_names_in_`` and one of
            those features is absent from ``applicant_data``.
    """
    if model is None:
        model = loaded_model

    # One-row frame: the model expects 2D, column-labelled input.
    df_new = pd.DataFrame([applicant_data])

    # A dict coming from a request has no guaranteed key order, while the
    # model expects the columns in the order it was trained on. When the
    # model recorded its training feature names, select the columns in
    # exactly that order.
    expected = getattr(model, 'feature_names_in_', None)
    if expected is not None:
        expected = list(expected)
        missing = [name for name in expected if name not in df_new.columns]
        if missing:
            raise ValueError(f"Applicant data is missing required fields: {missing}")
        df_new = df_new[expected]

    prediction = model.predict(df_new)

    # Class 0 means 'No Default'; any other class is treated as 'Default'.
    if prediction[0] == 0:
        return "Low Risk: Approve Loan"
    return "High Risk: Reject Loan"

# Try the function on one example applicant.
new_applicant = dict(
    credit_score=700,
    annual_income=75000,
    loan_amount=30000,
    loan_term_years=3,
)
print("\nAssessing a new applicant...")
print(f"Applicant Data: {new_applicant}")
# Run the assessment and show the resulting decision.
prediction = assess_credit_risk(new_applicant)
print(f"Prediction: {prediction}")


Model loaded successfully for real-time predictions.

Assessing a new applicant...
Applicant Data: {'credit_score': 700, 'annual_income': 75000, 'loan_amount': 30000, 'loan_term_years': 3}
Prediction: Low Risk: Approve Loan
