In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [3]:
# Load the data (assuming the CSV filename is 'loan_data.csv')
data = pd.read_csv("/Users/gadimg/Downloads/Task 3 and 4_Loan_Data.csv")

# Display basic information about the data
print(data.head())
print(data.info())

# Define features and target
X = data.drop(columns=['customer_id', 'default'])  # Drop customer_id and target column
y = data['default']  # Target column

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train_scaled, y_train)

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Model evaluation on test data
logistic_preds = logistic_model.predict_proba(X_test_scaled)[:, 1]  # Probabilities for class 1
rf_preds = rf_model.predict_proba(X_test_scaled)[:, 1]

# Calculate AUC for model comparison
logistic_auc = roc_auc_score(y_test, logistic_preds)
rf_auc = roc_auc_score(y_test, rf_preds)

print(f"Logistic Regression AUC: {logistic_auc:.3f}")
print(f"Random Forest AUC: {rf_auc:.3f}")

# Set recovery rate
recovery_rate = 0.1

def expected_loss(features, loan_amount, model=rf_model):
    """
    Calculates the expected loss for a given set of borrower features.

    Parameters:
    - features: A dictionary of borrower characteristics (must match training feature names).
    - loan_amount: The loan amount in currency units.
    - model: The trained model to use for predicting default probability (default is rf_model).

    Returns:
    - Expected loss on the loan.
    """
    # Convert features to DataFrame and scale
    features_df = pd.DataFrame([features])
    features_scaled = scaler.transform(features_df)

    # Predict probability of default (PD)
    pd_prob = model.predict_proba(features_scaled)[:, 1][0]

    # Calculate expected loss
    expected_loss_value = loan_amount * pd_prob * (1 - recovery_rate)
    return expected_loss_value

# Sample borrower characteristics (using provided column names)
sample_borrower = {
    'credit_lines': 3,
    'loan_amt_out': 3000,
    'total_debt_out': 5000,
    'income': 45000,
    'years_emplo': 4,
    'fico_score': 650
}

# Example loan amount
loan_amount = 10000

# Calculate expected loss
loss = expected_loss(sample_borrower, loan_amount, model=rf_model)
print(f"Expected Loss: ${loss:,.2f}")

   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                 

In [7]:
from sklearn.cluster import KMeans

def create_buckets_mse(data, num_buckets):
    """
    Quantizes FICO scores into buckets using MSE minimization (K-means clustering).

    Parameters:
    - data: DataFrame with columns 'fico_score' and 'default'
    - num_buckets: Number of buckets to create

    Returns:
    - DataFrame with FICO scores and their corresponding bucket
    """
    fico_scores = data[['fico_score']].values

    # Apply K-means clustering
    kmeans = KMeans(n_clusters=num_buckets, random_state=42)
    data['bucket'] = kmeans.fit_predict(fico_scores)

    # Map each bucket to the mean FICO score within that bucket
    bucket_centers = kmeans.cluster_centers_.flatten()
    data['bucket_center'] = data['bucket'].map(dict(enumerate(bucket_centers)))

    return data

# Example usage
num_buckets = 5  # Example: divide into 5 buckets
bucketed_data_mse = create_buckets_mse(data, num_buckets)
print(bucketed_data_mse.head())


   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  \
0             3915.471226  78039.38546               5         605        0   
1             8228.752520  26648.43525               2         572        1   
2             2027.830850  65866.71246               4         602        0   
3             2501.730397  74356.88347               5         612        0   
4             1768.826187  23448.32631               6         631        0   

   bucket  bucket_center  
0       0     582.786697  
1       0     582.786697  
2       0     582.786697  
3       2     629.90

In [8]:
def log_likelihood(data, boundaries):
    """
    Calculate the log-likelihood of defaults given bucket boundaries.

    Parameters:
    - data: DataFrame with columns 'fico_score' and 'default'
    - boundaries: List of bucket boundaries for FICO scores

    Returns:
    - Log-likelihood of the data given the bucket boundaries
    """
    log_likelihood_value = 0
    data = data.sort_values(by='fico_score')

    for i in range(len(boundaries) - 1):
        # Define the bucket range
        lower_bound, upper_bound = boundaries[i], boundaries[i + 1]

        # Filter data for this bucket
        bucket_data = data[(data['fico_score'] >= lower_bound) & (data['fico_score'] < upper_bound)]
        ni = len(bucket_data)
        ki = bucket_data['default'].sum()

        # Calculate probability of default in the bucket
        if ni > 0:
            pi = ki / ni
            if pi > 0 and pi < 1:  # To avoid log(0) or log(1)
                log_likelihood_value += ki * np.log(pi) + (ni - ki) * np.log(1 - pi)

    return log_likelihood_value


In [9]:
def optimize_buckets_log_likelihood(data, num_buckets):
    """
    Finds the optimal bucket boundaries for FICO scores to maximize log-likelihood.

    Parameters:
    - data: DataFrame with columns 'fico_score' and 'default'
    - num_buckets: Number of buckets to create

    Returns:
    - List of optimized bucket boundaries
    """
    # Initial equally spaced boundaries
    min_score, max_score = data['fico_score'].min(), data['fico_score'].max()
    boundaries = np.linspace(min_score, max_score, num_buckets + 1)

    # Iterative approach to optimize boundaries
    best_boundaries = boundaries
    best_log_likelihood = log_likelihood(data, best_boundaries)

    for _ in range(100):  # Limit iterations for simplicity
        new_boundaries = best_boundaries.copy()
        
        # Adjust boundaries randomly within a small range
        for i in range(1, len(new_boundaries) - 1):
            adjustment = np.random.uniform(-10, 10)
            new_boundaries[i] = np.clip(new_boundaries[i] + adjustment, min_score, max_score)
        
        # Calculate log-likelihood with new boundaries
        new_log_likelihood = log_likelihood(data, new_boundaries)
        
        # Update if log-likelihood improves
        if new_log_likelihood > best_log_likelihood:
            best_log_likelihood = new_log_likelihood
            best_boundaries = new_boundaries
    
    return best_boundaries


Optimized Boundaries: [408.         520.20410629 585.00954142 645.54499113 730.76873523
 850.        ]
