In [1]:
import pandas as pd
import numpy as np

# Read the loan book from disk; the CSV must provide the 'fico_score' and
# 'default' columns that are extracted just below.
df = pd.read_csv("loan_data.csv")

# Pull both columns out of the frame as plain arrays for the quantizers.
fico_scores, defaults = (df[column].values for column in ('fico_score', 'default'))

In [2]:
def quantize_mse(values, n_buckets):
    """
    Bucket ``values`` into ``n_buckets`` equal-width bins.

    Note: despite the name, no squared-error objective is optimised here;
    the edges are exactly those ``np.histogram`` produces for
    ``bins=n_buckets``.

    Parameters
    ----------
    values : array-like
        Scores to discretise.
    n_buckets : int
        Number of bins requested from ``np.histogram``.

    Returns
    -------
    bucket_indices : np.ndarray
        Bucket position of every value (0 = lowest-valued bucket). Interior
        edges act as inclusive upper bounds (``right=True``).
    ratings : np.ndarray
        ``n_buckets - bucket_indices``; the highest-valued bucket gets the
        numerically lowest rating.
    bin_edges : np.ndarray
        The ``n_buckets + 1`` edges returned by ``np.histogram``.
    """
    # Only the edges are of interest; the per-bin counts are discarded.
    bin_edges = np.histogram(values, bins=n_buckets)[1]

    # Drop the outermost edges so that anything at/below the minimum falls in
    # bucket 0 and anything at/above the maximum falls in the last bucket.
    interior_edges = bin_edges[1:-1]
    bucket_indices = np.digitize(values, interior_edges, right=True)

    # Flip the scale: a higher bucket index means a numerically lower rating.
    return bucket_indices, n_buckets - bucket_indices, bin_edges

In [3]:
# Bucket the FICO scores into five histogram-based bins via quantize_mse.
n_buckets = 5
bucket_idx, ratings, bin_edges = quantize_mse(fico_scores, n_buckets)

# Attach the bucket index and its rating to every loan, then preview.
df['bucket'], df['rating'] = bucket_idx, ratings
print(df.head())

   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  \
0             3915.471226  78039.38546               5         605        0   
1             8228.752520  26648.43525               2         572        1   
2             2027.830850  65866.71246               4         602        0   
3             2501.730397  74356.88347               5         612        0   
4             1768.826187  23448.32631               6         631        0   

   bucket  rating  
0       2       3  
1       1       4  
2       2       3  
3       2       3  
4       2       3  


In [4]:
def quantize_loglikelihood(values, defaults, n_buckets):
    """
    Likelihood-based discretization (dynamic programming).

    Splits the sorted distinct values into ``n_buckets`` contiguous groups
    so that the summed binomial log-likelihood

        sum_b  k_b * log(p_b) + (n_b - k_b) * log(1 - p_b)

    is maximal, where ``n_b`` / ``k_b`` are the number of records / defaults
    in bucket ``b`` and ``p_b = k_b / n_b`` (clipped to [1e-6, 1 - 1e-6] so
    the logarithms stay finite).

    The search runs over *distinct* values rather than individual records:
    records with the same value can never be separated by a threshold, and
    this keeps the cost at O(n_buckets * U^2) for U distinct values.

    Parameters
    ----------
    values : array-like, 1-D
        Score of every record (e.g. FICO score).
    defaults : array-like, 1-D, same length as ``values``
        Default indicator of every record (0/1).
    n_buckets : int
        Number of buckets to produce; must be >= 1 and no larger than the
        number of distinct values.

    Returns
    -------
    bucket_idx : np.ndarray of int
        Bucket of every record, 0 (lowest values) .. n_buckets - 1.
    ratings : np.ndarray of int
        ``n_buckets - bucket_idx``, i.e. 1 for the highest-valued bucket up
        to ``n_buckets`` for the lowest-valued one.
    bin_edges : list
        ``n_buckets + 1`` edges ``[min, cut_1, ..., cut_{n-1}, max]``. Each
        interior edge is the largest value belonging to the bucket below it,
        so ``np.digitize(x, bin_edges[1:-1], right=True)`` reproduces the
        bucket assignment for new values.

    Raises
    ------
    ValueError
        If the inputs are not 1-D arrays of equal length, if
        ``n_buckets < 1``, or if there are fewer distinct values than
        requested buckets.
    """
    values = np.asarray(values)
    defaults = np.asarray(defaults, dtype=float)
    if values.ndim != 1 or values.shape != defaults.shape:
        raise ValueError("values and defaults must be 1-D arrays of equal length")
    if n_buckets < 1:
        raise ValueError("n_buckets must be at least 1")

    # Collapse ties: np.unique returns the sorted distinct values plus, for
    # every record, the position of its value in that sorted array.
    distinct, inverse = np.unique(values, return_inverse=True)
    n_distinct = len(distinct)
    if n_distinct < n_buckets:
        raise ValueError(
            "need at least n_buckets distinct values "
            f"(got {n_distinct} for n_buckets={n_buckets})"
        )

    # Records / defaults per distinct value, as prefix sums with a leading 0
    # so that the totals of the slice [i:j) are simply cum[j] - cum[i].
    counts = np.bincount(inverse, minlength=n_distinct)
    default_counts = np.bincount(inverse, weights=defaults, minlength=n_distinct)
    cum_counts = np.concatenate(([0], np.cumsum(counts)))
    cum_defaults = np.concatenate(([0.0], np.cumsum(default_counts)))

    def segment_ll(starts, end):
        """Log-likelihood of the segments [s:end) for every s in ``starts``."""
        n = cum_counts[end] - cum_counts[starts]
        k = cum_defaults[end] - cum_defaults[starts]
        p = np.clip(k / n, 1e-6, 1 - 1e-6)
        return k * np.log(p) + (n - k) * np.log(1 - p)

    # dp[j, b]: best log-likelihood of covering the first j distinct values
    # with exactly b buckets; split[j, b]: where the last bucket starts.
    dp = np.full((n_distinct + 1, n_buckets + 1), -np.inf)
    split = np.zeros((n_distinct + 1, n_buckets + 1), dtype=int)
    dp[0, 0] = 0.0  # base case

    for b in range(1, n_buckets + 1):
        # b non-empty buckets need at least b distinct values.
        for j in range(b, n_distinct + 1):
            starts = np.arange(b - 1, j)
            candidates = dp[starts, b - 1] + segment_ll(starts, j)
            # argmax keeps the first maximum, i.e. the smallest start index.
            best = int(np.argmax(candidates))
            dp[j, b] = candidates[best]
            split[j, b] = starts[best]

    # Backtrack: collect the start index of every bucket, last bucket first.
    bucket_starts = []
    j = n_distinct
    for b in range(n_buckets, 0, -1):
        j = split[j, b]
        bucket_starts.append(j)
    bucket_starts.reverse()  # bucket_starts[0] is always 0

    # An interior edge is the last value *below* each later bucket's start,
    # because np.digitize(..., right=True) treats edges as inclusive upper
    # bounds. The first bucket's start (index 0) contributes no edge.
    interior_edges = [distinct[start - 1] for start in bucket_starts[1:]]
    bin_edges = [distinct[0]] + interior_edges + [distinct[-1]]

    # Assign ratings
    bucket_idx = np.digitize(values, bin_edges[1:-1], right=True)
    ratings = n_buckets - bucket_idx

    return bucket_idx, ratings, bin_edges

In [None]:
# Re-bucket the FICO scores, this time with the likelihood-based quantizer.
n_buckets = 5
bucket_idx, ratings, bin_edges = quantize_loglikelihood(fico_scores, defaults, n_buckets)

# Overwrites the 'bucket' / 'rating' columns on df with the new assignment.
df['bucket'], df['rating'] = bucket_idx, ratings

In [None]:
def map_fico_to_rating(fico_value, bin_edges, n_buckets):
    """
    Translate one FICO score into a rating using previously computed edges.

    Parameters
    ----------
    fico_value : number
        Score to rate.
    bin_edges : sequence
        Bucket edges; only the interior ones (``bin_edges[1:-1]``) are used,
        each acting as an inclusive upper bound.
    n_buckets : int
        Value the bucket index is subtracted from to obtain the rating.

    Returns
    -------
    ``n_buckets`` minus the bucket index of ``fico_value``.
    """
    # The outermost edges are ignored, so scores outside the fitted range
    # simply fall into the first or last bucket.
    interior_edges = bin_edges[1:-1]

    # digitize works on arrays: wrap the score, then unpack the lone result.
    (position,) = np.digitize([fico_value], interior_edges, right=True)
    return n_buckets - position

In [None]:
# Rate a single new applicant with the most recently computed bin edges.
new_fico = 720
rating = map_fico_to_rating(
    fico_value=new_fico,
    bin_edges=bin_edges,
    n_buckets=n_buckets,
)
print("Rating:", rating)