In [1]:
import pandas as pd

# Read the loan book from disk into a DataFrame
data = pd.read_csv('Loan_Data.csv')

# Preview the leading rows to sanity-check the columns
preview = data.head()
print(preview)

# Count the null entries in each column
null_counts = data.isna().sum()
print(null_counts)

   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  
customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income         

In [2]:
# Predictor columns fed to the models, followed by the target column
feature_columns = ['income', 'total_debt_outstanding', 'fico_score', 'years_employed']
X = data[feature_columns]
y = data['default']

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build the logistic regression classifier and fit it on the training split
log_model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows
y_pred = log_model.predict(X_test)

# Compute each metric once, then report them in order
log_accuracy = accuracy_score(y_test, y_pred)
log_confusion = confusion_matrix(y_test, y_pred)
log_report = classification_report(y_test, y_pred)

print("Accuracy:", log_accuracy)
print("Confusion Matrix:\n", log_confusion)
print("Classification Report:\n", log_report)

Accuracy: 0.986
Confusion Matrix:
 [[1641   11]
 [  17  331]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1652
           1       0.97      0.95      0.96       348

    accuracy                           0.99      2000
   macro avg       0.98      0.97      0.98      2000
weighted avg       0.99      0.99      0.99      2000



In [7]:
def expected_loss(loan_amount, pd, recovery_rate=0.10):
    """Return the expected loss on a loan.

    Computed as ``pd * loan_amount * (1 - recovery_rate)``.

    Parameters
    ----------
    loan_amount : number
        Amount of the loan.
    pd : number
        Probability of default. Inside this function the name refers to
        this argument, not to the pandas module alias.
    recovery_rate : number, optional
        Fraction of the loan amount recovered after a default.
        Defaults to 0.10.

    Returns
    -------
    number
        The probability of default times the unrecovered part of the loan.
    """
    # NOTE(review): expected_loss is defined in more than one cell of this
    # notebook; keep a single definition when consolidating.
    return pd * loan_amount * (1 - recovery_rate)

# Example usage:
loan_amount_example = 10000  # Example loan amount
# Label the borrower's values with the training columns so each number is
# matched to its feature by name rather than by implicit position
# (income, total_debt_outstanding, fico_score, years_employed).
borrower_example = pd.DataFrame([[78000, 5000, 700, 5]], columns=X_train.columns)
# predict_proba returns one row per borrower; index 1 is taken as the probability of default
pd_example = log_model.predict_proba(borrower_example)[0][1]
loss = expected_loss(loan_amount_example, pd_example)
print(f"Expected Loss for loan amount {loan_amount_example} with PD {pd_example}: {loss}")

Expected Loss for loan amount 10000 with PD 1.778546348615269e-08: 0.00016006917137537422




In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Build a 100-tree random forest with a fixed seed and fit it on the training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_rf_pred = rf_model.predict(X_test)

# Compute each metric once, then report them in order
rf_accuracy = accuracy_score(y_test, y_rf_pred)
rf_confusion = confusion_matrix(y_test, y_rf_pred)
rf_report = classification_report(y_test, y_rf_pred)

print("Random Forest Accuracy:", rf_accuracy)
print("Confusion Matrix:\n", rf_confusion)
print("Classification Report:\n", rf_report)

Random Forest Accuracy: 0.98
Confusion Matrix:
 [[1632   20]
 [  20  328]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1652
           1       0.94      0.94      0.94       348

    accuracy                           0.98      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.98      0.98      0.98      2000



In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Build a 100-stage gradient boosting classifier with a fixed seed and fit it
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_gb_pred = gb_model.predict(X_test)

# Compute each metric once, then report them in order
gb_accuracy = accuracy_score(y_test, y_gb_pred)
gb_confusion = confusion_matrix(y_test, y_gb_pred)
gb_report = classification_report(y_test, y_gb_pred)

print("Gradient Boosting Accuracy:", gb_accuracy)
print("Confusion Matrix:\n", gb_confusion)
print("Classification Report:\n", gb_report)

Gradient Boosting Accuracy: 0.985
Confusion Matrix:
 [[1640   12]
 [  18  330]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1652
           1       0.96      0.95      0.96       348

    accuracy                           0.98      2000
   macro avg       0.98      0.97      0.97      2000
weighted avg       0.98      0.98      0.98      2000



In [10]:
def expected_loss(loan_amount, pd, recovery_rate=0.10):
    """Return the expected loss on a loan.

    Computed as ``pd * loan_amount * (1 - recovery_rate)``.

    Parameters
    ----------
    loan_amount : number
        Amount of the loan.
    pd : number
        Probability of default. Inside this function the name refers to
        this argument, not to the pandas module alias.
    recovery_rate : number, optional
        Fraction of the loan amount recovered after a default.
        Defaults to 0.10 (an assumed recovery rate of 10%).

    Returns
    -------
    number
        The probability of default times the unrecovered part of the loan.
    """
    # NOTE(review): expected_loss is defined in more than one cell of this
    # notebook; keep a single definition when consolidating.
    return pd * loan_amount * (1 - recovery_rate)

# Example usage for Random Forest model:
loan_amount_example = 10000  # Example loan amount
# Build the example borrower once and label it with the training columns so
# both models receive the same row, matched to features by name rather than
# by implicit position (income, total_debt_outstanding, fico_score, years_employed).
borrower_example = pd.DataFrame([[78000, 5000, 700, 5]], columns=X_train.columns)
# predict_proba returns one row per borrower; index 1 is taken as the probability of default
pd_rf_example = rf_model.predict_proba(borrower_example)[0][1]
loss_rf = expected_loss(loan_amount_example, pd_rf_example)
print(f"Expected Loss for loan amount {loan_amount_example} with RF PD {pd_rf_example}: {loss_rf}")

# Example usage for Gradient Boosting model:
pd_gb_example = gb_model.predict_proba(borrower_example)[0][1]
loss_gb = expected_loss(loan_amount_example, pd_gb_example)
print(f"Expected Loss for loan amount {loan_amount_example} with GB PD {pd_gb_example}: {loss_gb}")

Expected Loss for loan amount 10000 with RF PD 0.0: 0.0
Expected Loss for loan amount 10000 with GB PD 0.0009902993119457208: 8.912693807511488


