In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [22]:
# Read the loan book from the CSV file that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer='Loan_Data.csv')

# Preview the first five rows (the default row count for head()).
data.head(n=5)



Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [23]:
# Count the null entries in every column and report them.
missing_values = data.isna().sum()
print("Missing values in each column:\n", missing_values)

# Flag each row that repeats an earlier row, then count the flagged rows.
duplicate_rows = data.duplicated(keep='first')
num_duplicates = duplicate_rows.sum()
print(f"Number of duplicate rows: {num_duplicates}")

Missing values in each column:
 customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64
Number of duplicate rows: 0


In [10]:
# customer ID is an identifier rather than a predictive feature, so we drop it.
# `data` is reassigned here, so on a second execution of this cell the column is
# already gone; errors='ignore' keeps the cell safe to re-run instead of raising
# KeyError.

data = data.drop(columns='customer_id', errors='ignore')

data.head()

Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,0,5221.545193,3915.471226,78039.38546,5,605,0
1,5,1958.928726,8228.75252,26648.43525,2,572,1
2,0,3363.009259,2027.83085,65866.71246,4,602,0
3,0,4766.648001,2501.730397,74356.88347,5,612,0
4,1,1345.827718,1768.826187,23448.32631,6,631,0


In [11]:
# Separate the label column from the predictor columns.
y = data['default']
X = data.drop(columns=['default'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

In [18]:
#Standardize the numerical features
# The scaled arrays get their own names instead of overwriting X_train / X_test.
# Overwriting made this cell non-idempotent: on a re-run the scaler would be
# refitted on already-scaled data, so it would no longer map raw loan values onto
# the scale the models were trained on.

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Create Models and compare

rf_model = RandomForestClassifier(random_state=30)
gbc_model = GradientBoostingClassifier(random_state=30)

#Train and Evaluate

#Random Forest Classifier
rf_model.fit(X_train_scaled, y_train)
# Column 1 of predict_proba is the probability of the positive class (default = 1).
rf_pred_prob = rf_model.predict_proba(X_test_scaled)[:, 1]
rf_roc_auc = roc_auc_score(y_test, rf_pred_prob)
print(f'Random Forest ROC-AUC Score: {rf_roc_auc:.4f}')

#GradientBoostingClassifier
gbc_model.fit(X_train_scaled, y_train)
gbc_pred_prob = gbc_model.predict_proba(X_test_scaled)[:, 1]
gbc_roc_auc = roc_auc_score(y_test, gbc_pred_prob)
print(f'Gradient Boosting ROC-AUC Score: {gbc_roc_auc:.4f}')

Random Forest ROC-AUC Score: 0.9997
Gradient Boosting ROC-AUC Score: 0.9997


In [24]:
#Both models perform very well on the test set, so we continue with the Gradient Boosting Classifier (GBC)

#Now we want to estimate the expected loss on a loan

def estimate_expected_loss(loan_properties, model, scaler, recovery_rate=0.10):
    """
    Estimate the expected loss on a loan.

    The expected loss is computed as PD * EAD * (1 - recovery_rate), where PD is
    the probability the model assigns to the positive class and EAD (exposure at
    default) is taken to be loan_properties['loan_amt_outstanding'].

    Parameters:
    loan_properties (dict): Maps feature name to value for a single loan. Must
        contain 'loan_amt_outstanding'. When the scaler exposes
        `feature_names_in_`, every name listed there must be present; key order
        does not matter and extra keys are ignored.
    model (object): Trained classifier exposing predict_proba.
    scaler (object): Fitted scaler exposing transform; it is applied to the
        loan features before they are passed to the model.
    recovery_rate (float): Fraction of the exposure recovered after a default,
        between 0 and 1 inclusive (default is 10%).

    Returns:
    float: The expected loss on the loan.

    Raises:
    ValueError: If recovery_rate is outside [0, 1], or if loan_properties lacks
        a feature the scaler was fitted with.
    KeyError: If loan_properties has no 'loan_amt_outstanding' entry.
    """
    # The chained comparison is False for NaN as well, so NaN is rejected too.
    if not 0.0 <= recovery_rate <= 1.0:
        raise ValueError(
            f"recovery_rate must be between 0 and 1, got {recovery_rate!r}"
        )

    # One-row frame; its column order initially follows the dict's key order.
    loan_df = pd.DataFrame([loan_properties])

    # Align the columns with the order the scaler was fitted on, so the result
    # does not depend on how the caller happened to order the dict keys.
    feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is not None:
        feature_names = list(feature_names)
        missing = [name for name in feature_names if name not in loan_df.columns]
        if missing:
            raise ValueError(
                f"loan_properties is missing required features: {missing}"
            )
        loan_df = loan_df[feature_names]

    # Standardize the numerical features
    loan_scaled = scaler.transform(loan_df)

    # Predict the probability of default (PD): column 1, first (only) row
    PD_default = model.predict_proba(loan_scaled)[:, 1][0]

    # Assuming Exposure at Default (EAD) is the loan amount outstanding
    ead = loan_properties['loan_amt_outstanding']

    # Calculate the expected loss
    expected_loss = PD_default * ead * (1 - recovery_rate)

    return expected_loss

# Example usage: score one hypothetical borrower with the gradient boosting model.
loan_properties = dict(
    credit_lines_outstanding=5,
    loan_amt_outstanding=15000,
    total_debt_outstanding=30000,
    income=60000,
    years_employed=10,
    fico_score=720,
)

expected_loss = estimate_expected_loss(
    loan_properties=loan_properties,
    model=gbc_model,
    scaler=scaler,
)
print(f'The expected loss on the loan is: ${expected_loss:.2f}')

The expected loss on the loan is: $713.52


