# Credit Scorecard Project – Data Transformation (Logistic)

## 1. Imports & Setup

In [1]:
# pip install optbinning

In [2]:
import optbinning
print(optbinning.__version__)


0.20.1


In [3]:
# pip install scorecardpy

In [187]:
# Import libraries
import contextlib
import io
import json
import math
from itertools import combinations

import joblib
import numpy as np
import pandas as pd
import scorecardpy as sc
from scipy.stats import chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from dateutil.relativedelta import relativedelta
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from optbinning import OptimalBinning

In [5]:
# Display settings
# Echo the value of every top-level expression in a cell, not just the last one.
# Cells further down that contain several bare expressions rely on this to show all their outputs.
InteractiveShell.ast_node_interactivity = "all"

# Remove pandas truncation limits for sequences, columns and rows
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Render floats with four decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings("ignore")

## 2. Load and Split Data

In [None]:
# Load the cleaned base tables for both the training and the test split
df_train = pd.read_csv('../data/interim/train_base_cleaned.csv')
df_test = pd.read_csv('../data/interim/test_base_cleaned.csv')

# One shape per line: (rows, columns)
print(df_train.shape, df_test.shape, sep='\n')

(1074238, 104)
(205520, 104)


In [169]:
# Separate the GB_FLAG target from the predictors for each split
X_train, y_train = df_train.drop(columns=['GB_FLAG']), df_train['GB_FLAG']
X_test, y_test = df_test.drop(columns=['GB_FLAG']), df_test['GB_FLAG']

# Sanity check: feature-matrix shape next to the number of target values
print(X_train.shape, y_train.shape[0])
print(X_test.shape, y_test.shape[0])

(1074238, 103) 1074238
(205520, 103) 205520


In [8]:
# Import the feature metadata file for reference
fmeta = pd.read_csv('../dictionaries/feature_meta.csv')

In [9]:
# Separate numeric and categorical variables by dtype
# NOTE(review): X_train has 103 columns but only 69 + 4 = 73 are selected here;
# columns of any other dtype (e.g. integer) land in neither list - confirm this is intended.
num_vars = X_train.select_dtypes(include=['float64']).columns
cat_vars = X_train.select_dtypes(include=['object']).columns

# Number of numeric variables, then number of categorical variables
print(len(num_vars), len(cat_vars), sep='\n')

69
4


## 3. WoE Binning (Numeric Variables)

### 3.1 Auto-binning

In [10]:
# Containers filled while binning, keyed by variable name
woe_models, binning_tables = {}, {}
# One summary record (dict) per binned variable
binning_summary = []
# Number of training rows
total_count = len(df_train)

def infer_monotonic_trend(woes):
    """
    Label the direction of a sequence of per-bin WoE values.

    The label is the reverse of the WoE direction: a non-decreasing WoE
    sequence is reported as "descending" and a non-increasing one as
    "ascending". Anything else is "non-monotonic". Sequences with fewer than
    two values, or with all values equal, are reported as "descending".

    Parameters:
    - woes: sliceable sequence of WoE values in bin order

    Returns:
    - str: "descending", "ascending" or "non-monotonic"
    """
    neighbours = list(zip(woes, woes[1:]))

    if all(left <= right for left, right in neighbours):
        return "descending"
    if all(left >= right for left, right in neighbours):
        return "ascending"
    return "non-monotonic"

# Fit an optimal binning for every numeric variable and record its diagnostics
for var in num_vars:
    optb = OptimalBinning(name=var, dtype="numerical", solver="cp")
    optb.fit(X_train[var], y_train)
    woe_models[var] = optb

    # Binning table restricted to populated rows
    built_table = optb.binning_table.build()
    table = built_table.loc[built_table['Count'] > 0]
    binning_tables[var] = table

    # Bin-level rows only: the last row of the table is the totals row
    bin_rows = table.iloc[:-1]
    woe_min = round(bin_rows['WoE'].min(), 4)
    woe_max = round(bin_rows['WoE'].max(), 4)
    bad_rate_min = round(bin_rows['Event rate'].min(), 4)
    bad_rate_max = round(bin_rows['Event rate'].max(), 4)

    binning_summary.append({
        "variable": var,
        "IV": round(optb.binning_table.iv, 6),
        "bins": table.shape[0] - 1,  # the totals row is not a bin
        # Bin sizes are expressed as a share of all training rows
        "min_bin_size": round(bin_rows['Count'].min() / total_count, 4),
        "max_bin_size": round(bin_rows['Count'].max() / total_count, 4),
        "min_woe": woe_min,
        "max_woe": woe_max,
        "woe_spread": woe_max - woe_min,
        "bad_rate_spread": bad_rate_max - bad_rate_min,
        "monotonic_trend": infer_monotonic_trend(bin_rows['WoE'].values),
    })

# Collect the per-variable records into a DataFrame for inspection
binning_summary_df = pd.DataFrame(binning_summary)


In [11]:
# Add additional columns to the binning_summary table 

# iv_group
def iv_group(iv):
    """
    Map an Information Value to a predictive-strength label.

    Upper bounds are exclusive: below 0.02 is 'not predictive', below 0.1 is
    'weak', below 0.3 is 'medium'; every other value is 'strong'.
    """
    bands = ((0.02, 'not predictive'), (0.1, 'weak'), (0.3, 'medium'))
    for upper_bound, label in bands:
        if iv < upper_bound:
            return label
    return 'strong'

binning_summary_df['iv_class'] = binning_summary_df['IV'].apply(iv_group)

# bin_num_group
def bins_num_group(bins):
    """
    Bucket a bin count into '< 4', '4-8', '9-15' or '> 15'.

    The two middle buckets are closed integer ranges; a value that matches
    neither them nor the '< 4' test falls through to '> 15'.
    """
    if bins <= 3:
        return '< 4'

    closed_ranges = ((4, 8, '4-8'), (9, 15, '9-15'))
    for lowest, highest, label in closed_ranges:
        if lowest <= bins <= highest:
            return label

    return '> 15'

binning_summary_df['bin_num_group'] = binning_summary_df['bins'].apply(bins_num_group)


# woe_spread_class
def woe_spread_group(w):
    """
    Classify a WoE spread (max WoE minus min WoE) by size.

    Upper bounds are exclusive: below 0.2 is 'minimal', below 0.5 is 'low',
    below 0.8 is 'medium'; every other value is 'high'.
    """
    if w < 0.2:
        return 'minimal'
    if w < 0.5:
        return 'low'
    if w < 0.8:
        return 'medium'
    return 'high'

binning_summary_df['woe_spread_class'] = binning_summary_df['woe_spread'].apply(woe_spread_group)    
binning_summary_df['type'] = 'numerical'

binning_summary_df.sort_values(by='IV', ascending=False)


Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type
1,int_rate,0.4732,15,0.05,0.1195,-1.1585,1.8489,3.0074,0.4037,ascending,strong,9-15,high,numerical
6,fico_range_low,0.1202,13,0.0557,0.1033,-0.3704,1.0066,1.377,0.1812,descending,medium,9-15,high,numerical
7,fico_range_high,0.1202,13,0.0557,0.1033,-0.3704,1.0066,1.377,0.1812,descending,medium,9-15,high,numerical
34,acc_open_past_24mths,0.0821,9,0.0697,0.1475,-0.5516,0.4403,0.9919,0.1633,ascending,weak,9-15,high,numerical
4,dti,0.0766,15,0.0503,0.0969,-0.5097,0.4075,0.9172,0.1506,ascending,weak,9-15,high,numerical
58,num_tl_op_past_12m,0.0599,6,0.0894,0.2431,-0.4484,0.3346,0.783,0.129,ascending,weak,4-8,medium,numerical
36,bc_open_to_buy,0.0537,14,0.05,0.1408,-0.2423,0.6573,0.8996,0.1262,descending,weak,9-15,high,numerical
35,avg_cur_bal,0.0506,14,0.0506,0.1792,-0.21,0.6272,0.8372,0.1173,non-monotonic,weak,9-15,high,numerical
63,tot_hi_cred_lim,0.043,16,0.0501,0.105,-0.203,0.5373,0.7403,0.1064,non-monotonic,weak,> 15,medium,numerical
65,total_bc_limit,0.0401,13,0.0505,0.1588,-0.1943,0.5878,0.7821,0.1104,descending,weak,9-15,medium,numerical


### 3.2 Autobinning Review

In [12]:
binning_summary_df['iv_class'].value_counts()

iv_class
not predictive    35
weak              31
medium             2
strong             1
Name: count, dtype: int64

#### 3.2.1 Strong IV (>= 0.3)

In [13]:
binning_summary_df[binning_summary_df['iv_class']=='strong']

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type
1,int_rate,0.4732,15,0.05,0.1195,-1.1585,1.8489,3.0074,0.4037,ascending,strong,9-15,high,numerical


Observation: 
* The high IV of interest rate indicates that LC may do risk-based pricing, i.e. assigning higher interest rate to riskier customers. 
* The interest rate assignment is likely based on the customer's risk score, i.e., the scorecard output. Therefore interest rate is not usable as a feature in the final model.
* However, when included in exploratory models, its performance (e.g., AUC, KS, IV) gives an upper bound — i.e., a sense of how well you could possibly do if you had access to their internal logic.

**Decision: keep for benchmarking**

In [14]:
# Running list of variables to retain; int_rate is the first entry
keep_vars = ['int_rate']

# Flag the retained variables and note why int_rate is kept
binning_summary_df.loc[binning_summary_df['variable'].isin(keep_vars), 'decision'] = 'keep'
binning_summary_df.loc[binning_summary_df['variable'].eq('int_rate'), 'reason'] = 'for benchmarking'

#### 3.2.2 Medium IV (0.1 - 0.3)

In [15]:
binning_summary_df[binning_summary_df['iv_class']=='medium']

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
6,fico_range_low,0.1202,13,0.0557,0.1033,-0.3704,1.0066,1.377,0.1812,descending,medium,9-15,high,numerical,,
7,fico_range_high,0.1202,13,0.0557,0.1033,-0.3704,1.0066,1.377,0.1812,descending,medium,9-15,high,numerical,,


Observation:
* Not surprisingly, the bureau score has high predictive power
* The two variables are likely highly correlated given the exact same binning - confirmed below (correlation ≈ 1.00)

In [16]:
df_train['fico_range_low'].corr(df_train['fico_range_high'])

np.float64(0.9999999332449054)

**Decision: keep one - fico_range_low**

In [17]:
# fico_range_low: retained
keep_vars += ['fico_range_low']
binning_summary_df.loc[binning_summary_df['variable'].isin(keep_vars), 'decision'] = 'keep'
binning_summary_df.loc[binning_summary_df['variable'].eq('fico_range_low'), 'reason'] = 'high iv'

# fico_range_high: dropped; this also starts the running list of dropped variables
drop_vars = ['fico_range_high']
binning_summary_df.loc[binning_summary_df['variable'].isin(drop_vars), 'decision'] = 'drop'
binning_summary_df.loc[binning_summary_df['variable'].eq('fico_range_high'), 'reason'] = 'highly correlated with fico_range_low'

#### 3.2.3 Weak IV (0.02 - 0.1)

In [18]:
weak_df = binning_summary_df[binning_summary_df['iv_class']=='weak']

# break down monotonic trend
weak_df['monotonic_trend'].value_counts()

monotonic_trend
ascending        17
descending        8
non-monotonic     6
Name: count, dtype: int64

**Non-monotonic**

In [19]:
weak_df[weak_df['monotonic_trend']=='non-monotonic'].sort_values(by='IV', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
35,avg_cur_bal,0.0506,14,0.0506,0.1792,-0.21,0.6272,0.8372,0.1173,non-monotonic,weak,9-15,high,numerical,,
63,tot_hi_cred_lim,0.043,16,0.0501,0.105,-0.203,0.5373,0.7403,0.1064,non-monotonic,weak,> 15,medium,numerical,,
18,tot_cur_bal,0.0392,15,0.0506,0.0977,-0.2254,0.4912,0.7166,0.1053,non-monotonic,weak,9-15,medium,numerical,,
2,installment,0.0283,7,0.0502,0.2593,-0.247,0.389,0.636,0.0971,non-monotonic,weak,4-8,medium,numerical,,
23,mths_since_rcnt_il,0.0231,5,0.0591,0.7219,-0.4251,0.0786,0.5037,0.0886,non-monotonic,weak,4-8,medium,numerical,,
28,max_bal_bc,0.0209,5,0.05,0.7146,-0.3184,0.0824,0.4008,0.0684,non-monotonic,weak,4-8,low,numerical,,


**tot_hi_cred_lim**: 🟨 Re-bin to smaller bin number 
* IV=0.043, 16 bins, WoE spread=0.74 
* bell-shaped bad rate as the value goes up - a unique pattern, but the increments are small

**avg_cur_bal**: ❌ Drop (redundant with tot_hi_cred_lim)
* IV=0.051, WoE spread=0.837
* High bin number + non-monotonic -> noisy, might be over-binned
* High correlation with tot_hi_cred_lim (0.81)

**tot_cur_bal:** ❌ Drop (redundant with tot_hi_cred_lim)
* IV=0.039, 15 bins, WoE spread=0.717 
* Highly correlated with tot_hi_cred_lim (0.985), so only one of the two is kept

**installment:** ❌ Drop (redundant with loan_amnt)
* is a calculated value from loan_amnt (ascending), term and int_rate
* high correlation with loan_amnt (0.954)

**mths_since_rcnt_il**: ✅ Keep (core)
* IV=0.023, 5 bins, WoE spread=0.504
* non-monotonic because the imputed bin (-1) has the lowest bad rate; beyond it the bad rate decreases as the value goes up. Good separation (18.6% - 27.5%)

**max_bal_bc**: 🟦 Keep (backup variable)
* IV=0.021, 5 bins, WoE spread=0.401
* non-monotonic because the imputed bin (-1) has the lowest bad rate; beyond it the bad rate decreases as the value goes up. Decent separation (18.6% - 23.4%)

In [20]:
# Record treatment for the non-monotonic weak-IV variables
backup_vars = []
review_vars = []

keep_vars.extend(['mths_since_rcnt_il'])
backup_vars.append('max_bal_bc')
review_vars.append('tot_hi_cred_lim')
drop_vars.extend(['avg_cur_bal', 'tot_cur_bal', 'installment'])

binning_summary_df.loc[binning_summary_df['variable'].isin(keep_vars), 'decision'] = 'keep'
binning_summary_df.loc[binning_summary_df['variable'].isin(backup_vars), 'decision'] = 'keep - backup'
binning_summary_df.loc[binning_summary_df['variable'].isin(drop_vars), 'decision'] = 'drop'
# NOTE(review): review_vars (tot_hi_cred_lim) receives a reason below but no 'decision' value
# in this cell - confirm that one is assigned after the variable has been re-binned.

binning_summary_df.loc[binning_summary_df['variable']=='mths_since_rcnt_il', 'reason'] = 'non-monotonic due to imputed value -1; good bad rate spread; no strong correlated with other il variables'
binning_summary_df.loc[binning_summary_df['variable']=='max_bal_bc', 'reason'] = 'non-monotonic due to imputed value -1; decent bad rate spread; no strong correlation with other bal variables'
binning_summary_df.loc[binning_summary_df['variable']=='tot_hi_cred_lim', 'reason'] = 're-bin to smaller bin number; non-monotonic due to bell-shaped bad rate pattern'
binning_summary_df.loc[binning_summary_df['variable']=='avg_cur_bal', 'reason'] = 'redundant, strong correlation with tot_hi_cred_lim; over-binned'
binning_summary_df.loc[binning_summary_df['variable']=='tot_cur_bal', 'reason'] = 'redundant, strong correlation with tot_hi_cred_lim and avg_cur_bal'
# installment is dropped in this cell as well, so it gets a reason like the other dropped variables
binning_summary_df.loc[binning_summary_df['variable']=='installment', 'reason'] = 'redundant, strong correlation with loan_amnt (0.95); derived from loan_amnt, term and int_rate'

In [21]:
# define a function to check correlation after excluding missing values

def corr_excl_miss(df, fmeta, vars_to_check):
    """
    Correlation matrix of the selected variables, computed on non-imputed rows only.

    A row is excluded when any selected variable equals the imputation value
    recorded for it in the feature metadata. Variables with no recorded
    imputation value never exclude rows.

    Parameters:
    - df: DataFrame holding the variables to correlate
    - fmeta: feature metadata with at least 'Variable_Name' and 'miss_impute_val' columns
    - vars_to_check: names of the variables to assess

    Returns:
    - correlation matrix (pd.DataFrame) of vars_to_check over the retained rows
    """
    # Metadata rows for the requested variables that carry an imputation value
    selected_meta = fmeta.loc[fmeta['Variable_Name'].isin(vars_to_check)]
    selected_meta = selected_meta.dropna(subset=['miss_impute_val'])
    impute_values = selected_meta.set_index('Variable_Name')['miss_impute_val'].to_dict()

    # Start by keeping every row, then clear the flag wherever an imputed value is found
    keep_row = np.ones(len(df), dtype=bool)
    for name in impute_values:
        keep_row &= df[name] != impute_values[name]

    return df.loc[keep_row, vars_to_check].corr()

In [22]:
# check potential correlation
corr_excl_miss(df_train, fmeta, ['installment', 'loan_amnt'])
corr_excl_miss(df_train, fmeta, ['tot_hi_cred_lim', 'tot_cur_bal', 'avg_cur_bal'])

Unnamed: 0,installment,loan_amnt
installment,1.0,0.9542
loan_amnt,0.9542,1.0


Unnamed: 0,tot_hi_cred_lim,tot_cur_bal,avg_cur_bal
tot_hi_cred_lim,1.0,0.9854,0.8136
tot_cur_bal,0.9854,1.0,0.8435
avg_cur_bal,0.8136,0.8435,1.0


**Monotonic Trend = Descending**

In [23]:
weak_df[weak_df['monotonic_trend']=='descending'].sort_values(by='IV', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
36,bc_open_to_buy,0.0537,14,0.05,0.1408,-0.2423,0.6573,0.8996,0.1262,descending,weak,9-15,high,numerical,,
65,total_bc_limit,0.0401,13,0.0505,0.1588,-0.1943,0.5878,0.7821,0.1104,descending,weak,9-15,medium,numerical,,
41,mo_sin_rcnt_tl,0.0349,11,0.0522,0.2352,-0.1862,0.4972,0.6834,0.0989,descending,weak,9-15,medium,numerical,,
42,mort_acc,0.0343,6,0.0548,0.4065,-0.1802,0.3891,0.5693,0.0851,descending,weak,4-8,medium,numerical,,
3,annual_inc,0.0313,13,0.0507,0.138,-0.2363,0.3997,0.636,0.0964,descending,weak,9-15,medium,numerical,,
43,mths_since_recent_bc,0.0278,13,0.0512,0.1444,-0.2047,0.3964,0.6011,0.0903,descending,weak,9-15,medium,numerical,,
40,mo_sin_rcnt_rev_tl_op,0.0271,13,0.0514,0.1853,-0.1691,0.4604,0.6295,0.0917,descending,weak,9-15,medium,numerical,,
30,total_rev_hi_lim,0.0263,10,0.0501,0.3401,-0.1077,0.4832,0.5909,0.0837,descending,weak,9-15,medium,numerical,,


In [24]:
desc_vars = weak_df[weak_df['monotonic_trend']=='descending']['variable']
df_train[desc_vars].corr()

Unnamed: 0,annual_inc,total_rev_hi_lim,bc_open_to_buy,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,total_bc_limit
annual_inc,1.0,0.4083,0.2261,0.0514,-0.03,0.3268,0.0427,0.388
total_rev_hi_lim,0.4083,1.0,0.6525,-0.0209,-0.0008,0.2472,-0.0346,0.8027
bc_open_to_buy,0.2261,0.6525,1.0,-0.0559,-0.0403,0.1332,-0.1034,0.8325
mo_sin_rcnt_rev_tl_op,0.0514,-0.0209,-0.0559,1.0,0.629,0.0353,0.5958,-0.0004
mo_sin_rcnt_tl,-0.03,-0.0008,-0.0403,0.629,1.0,-0.0342,0.38,0.0156
mort_acc,0.3268,0.2472,0.1332,0.0353,-0.0342,1.0,0.0613,0.2162
mths_since_recent_bc,0.0427,-0.0346,-0.1034,0.5958,0.38,0.0613,1.0,-0.0637
total_bc_limit,0.388,0.8027,0.8325,-0.0004,0.0156,0.2162,-0.0637,1.0


* **bc_open_to_buy**:  high WoE spread (0.90) and bad rate spread (12.6%) => ✅ Keep – strongest in group, great separation

* **total_bc_limit**: medium-high WoE spread (0.78) and 11% bad rate spread, but highly correlated with `bc_open_to_buy` (0.832) and `total_rev_hi_lim` (0.803)
    => ❌ Drop – highly correlated with other variables

* **mo_sin_rcnt_tl**: WoE spread 0.68 (medium); bad rate spread 9.9% => ✅ Keep – maybe related to recency of new tradelines
* **mort_acc**: WoE spread 0.57 (medium); bad rate spread 8.5%	=> ✅ Keep – number of mortgage accounts, known risk indicator
* **annual_inc**: WoE spread 0.64 (medium); bad rate spread 9.6% => ✅ Keep - known risk indicator 
* **mths_since_recent_bc**:	WoE spread 0.60 (medium); bad rate spread 9.0%	=> ✅ Keep – could tie into credit freshness
* **mo_sin_rcnt_rev_tl_op**: WoE spread 0.63 (medium); bad rate spread 9.2% => ✅ Keep – similar logic to above
* **total_rev_hi_lim**: WoE spread 0.59 (medium); bad rate spread 8.4%; moderately correlated with `bc_open_to_buy` (0.65) and tot_hi_cred_lim (0.51) => 🟦 Keep - backup

In [25]:
weak_df[weak_df['monotonic_trend']=='descending']['variable']

3                annual_inc
30         total_rev_hi_lim
36           bc_open_to_buy
40    mo_sin_rcnt_rev_tl_op
41           mo_sin_rcnt_tl
42                 mort_acc
43     mths_since_recent_bc
65           total_bc_limit
Name: variable, dtype: object

In [26]:
# Record treatment for the monotonic-descending weak-IV variables
keep_vars += [
    'annual_inc',
    'bc_open_to_buy',
    'mo_sin_rcnt_rev_tl_op',
    'mo_sin_rcnt_tl',
    'mort_acc',
    'mths_since_recent_bc',
]
backup_vars.extend(['total_rev_hi_lim'])
drop_vars.extend(['total_bc_limit'])

# Decisions are applied in keep -> backup -> drop order, so a later list wins on overlap
binning_summary_df.loc[binning_summary_df['variable'].isin(keep_vars), 'decision'] = 'keep'
binning_summary_df.loc[binning_summary_df['variable'].isin(backup_vars), 'decision'] = 'keep - backup'
binning_summary_df.loc[binning_summary_df['variable'].isin(drop_vars), 'decision'] = 'drop'

# Shared reason for the whole descending group first, then the two variable-specific overrides
binning_summary_df.loc[binning_summary_df['variable'].isin(desc_vars), 'reason'] = 'monotonic descending; good WOE & BR spread'
binning_summary_df.loc[binning_summary_df['variable'].eq('total_rev_hi_lim'), 'reason'] = 'monotonic descending; good WOE & BR spread; moderate correlation with bc_open_to_buy (0.65); retained for model-stage evaluation'
binning_summary_df.loc[binning_summary_df['variable'].eq('total_bc_limit'), 'reason'] = 'highly correlated with bc_open_to_buy (0.83) and total_rev_hi_lim (0.80)'

**Monotonic Trend = Ascending**

In [27]:
asc_df = weak_df[weak_df['monotonic_trend']=='ascending'].sort_values(by='IV', ascending=False)
asc_df

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
34,acc_open_past_24mths,0.0821,9,0.0697,0.1475,-0.5516,0.4403,0.9919,0.1633,ascending,weak,9-15,high,numerical,,
4,dti,0.0766,15,0.0503,0.0969,-0.5097,0.4075,0.9172,0.1506,ascending,weak,9-15,high,numerical,,
58,num_tl_op_past_12m,0.0599,6,0.0894,0.2431,-0.4484,0.3346,0.783,0.129,ascending,weak,4-8,medium,numerical,,
27,open_rv_24m,0.0318,5,0.0543,0.7515,-0.5157,0.086,0.6017,0.1081,ascending,weak,4-8,medium,numerical,,
0,loan_amnt,0.0317,8,0.0513,0.298,-0.2347,0.3507,0.5854,0.0901,ascending,weak,4-8,medium,numerical,,
49,num_actv_rev_tl,0.0305,10,0.0585,0.1508,-0.3671,0.2457,0.6128,0.1011,ascending,weak,9-15,medium,numerical,,
55,num_rev_tl_bal_gt_0,0.03,10,0.0524,0.1518,-0.4019,0.2483,0.6502,0.1084,ascending,weak,9-15,medium,numerical,,
29,all_util,0.0292,5,0.051,0.7707,-0.4297,0.0909,0.5206,0.0914,ascending,weak,4-8,medium,numerical,,
8,inq_last_6mths,0.0275,4,0.0551,0.5701,-0.3812,0.1357,0.5169,0.0884,ascending,weak,4-8,medium,numerical,,
60,percent_bc_gt_75,0.0259,7,0.0513,0.2866,-0.1884,0.2389,0.4273,0.067,ascending,weak,4-8,low,numerical,,


In [28]:
# Check correlation
asc_vars = weak_df[weak_df['monotonic_trend']=='ascending']['variable']
df_train[asc_vars].corr()

Unnamed: 0,loan_amnt,dti,inq_last_6mths,open_acc_6m,open_il_12m,open_il_24m,il_util,open_rv_12m,open_rv_24m,all_util,inq_last_12m,acc_open_past_24mths,bc_util,num_actv_rev_tl,num_rev_tl_bal_gt_0,num_tl_op_past_12m,percent_bc_gt_75
loan_amnt,1.0,0.0166,-0.0143,-0.0136,-0.007,0.0032,-0.0211,-0.0205,-0.0149,-0.0081,0.0012,0.0081,0.0585,0.1544,0.1543,-0.0155,0.0234
dti,0.0166,1.0,-0.0005,0.0374,0.0833,0.1096,0.0933,0.022,0.0336,0.0681,0.0347,0.1708,0.1802,0.2531,0.2547,0.1121,0.1649
inq_last_6mths,-0.0143,-0.0005,1.0,0.0869,0.0025,-0.0003,-0.0194,0.071,0.0596,-0.049,0.1687,0.2546,-0.0676,0.1065,0.089,0.2907,-0.0654
open_acc_6m,-0.0136,0.0374,0.0869,1.0,0.8067,0.7645,0.7562,0.8698,0.8163,0.7564,0.6407,0.2559,-0.1246,0.0749,0.068,0.3175,-0.1213
open_il_12m,-0.007,0.0833,0.0025,0.8067,1.0,0.9218,0.8307,0.6891,0.6833,0.8174,0.6462,0.2188,-0.0977,0.0068,0.0028,0.2627,-0.0945
open_il_24m,0.0032,0.1096,-0.0003,0.7645,0.9218,1.0,0.8218,0.6653,0.6675,0.7991,0.6148,0.2707,-0.0919,0.0101,0.0085,0.2308,-0.089
il_util,-0.0211,0.0933,-0.0194,0.7562,0.8307,0.8218,1.0,0.6983,0.6939,0.8913,0.5858,0.152,-0.0779,0.0083,0.0036,0.1535,-0.0781
open_rv_12m,-0.0205,0.022,0.071,0.8698,0.6891,0.6653,0.6983,1.0,0.9156,0.7084,0.6363,0.2943,-0.126,0.1288,0.1202,0.3643,-0.1257
open_rv_24m,-0.0149,0.0336,0.0596,0.8163,0.6833,0.6675,0.6939,0.9156,1.0,0.6959,0.6262,0.3662,-0.1257,0.1614,0.1533,0.3061,-0.1252
all_util,-0.0081,0.0681,-0.049,0.7564,0.8174,0.7991,0.8913,0.7084,0.6959,1.0,0.5747,0.0776,0.0259,0.0071,0.0071,0.0853,0.0077


In [29]:
# Check correlation (after removing imputed missing values)
corr_excl_miss(df_train, fmeta, asc_vars)

Unnamed: 0,loan_amnt,dti,inq_last_6mths,open_acc_6m,open_il_12m,open_il_24m,il_util,open_rv_12m,open_rv_24m,all_util,inq_last_12m,acc_open_past_24mths,bc_util,num_actv_rev_tl,num_rev_tl_bal_gt_0,num_tl_op_past_12m,percent_bc_gt_75
loan_amnt,1.0,-0.004,0.0067,-0.018,0.0043,0.0347,-0.0881,-0.0354,-0.0139,-0.0227,0.0133,0.022,0.0561,0.1664,0.155,-0.0131,0.0261
dti,-0.004,1.0,-0.0149,0.025,0.1509,0.2065,0.0244,-0.0038,0.0353,0.138,0.0131,0.1195,0.1725,0.2394,0.2479,0.0591,0.1502
inq_last_6mths,0.0067,-0.0149,1.0,0.3111,0.1057,0.0839,0.0485,0.2581,0.2344,-0.0572,0.3944,0.2381,-0.0806,0.1365,0.1043,0.2732,-0.0759
open_acc_6m,-0.018,0.025,0.3111,1.0,0.3541,0.2669,0.16,0.6225,0.4692,-0.0736,0.2021,0.536,-0.181,0.1996,0.192,0.723,-0.1678
open_il_12m,0.0043,0.1509,0.1057,0.3541,1.0,0.7625,0.3571,0.0241,0.0347,0.1752,0.2899,0.4174,-0.0915,-0.0508,-0.0493,0.5356,-0.0779
open_il_24m,0.0347,0.2065,0.0839,0.2669,0.7625,1.0,0.3217,0.0195,0.0553,0.194,0.2512,0.5553,-0.0769,-0.0397,-0.023,0.4141,-0.0666
il_util,-0.0881,0.0244,0.0485,0.16,0.3571,0.3217,1.0,0.032,0.0465,0.4884,0.1519,0.2008,-0.0689,-0.0788,-0.0721,0.209,-0.0631
open_rv_12m,-0.0354,-0.0038,0.2581,0.6225,0.0241,0.0195,0.032,1.0,0.7787,-0.2021,0.2367,0.6469,-0.187,0.3835,0.3702,0.8347,-0.1829
open_rv_24m,-0.0139,0.0353,0.2344,0.4692,0.0347,0.0553,0.0465,0.7787,1.0,-0.2164,0.2463,0.8442,-0.1911,0.4735,0.4646,0.6559,-0.1834
all_util,-0.0227,0.138,-0.0572,-0.0736,0.1752,0.194,0.4884,-0.2021,-0.2164,1.0,0.0394,-0.0824,0.5251,-0.1047,-0.0622,-0.0817,0.4358


<u>Top IV variables</u>
* **acc_open_past_24mths**: the highest IV of the group (0.082), WoE spread	0.992 (high), bad rate spread (0.163) => ✅ Keep - Strongest of this group
* **dti**: second highest IV (0.077), WoE spread 0.917 (high), good bad rate spread (0.151)	=> ✅ Keep - known risk indicator 
* **num_tl_op_past_12m**: 3rd highest IV (0.060), medium WoE & bad rate spread, overlap with `acc_open_past_24mths` (0.77) => 🟦 Keep as backup

In [30]:
# Record treatment for the top-IV monotonic-ascending variables
keep_vars.extend(['acc_open_past_24mths', 'dti'])
backup_vars.append('num_tl_op_past_12m')

binning_summary_df.loc[binning_summary_df['variable'].isin(keep_vars), 'decision'] = 'keep'
binning_summary_df.loc[binning_summary_df['variable'].isin(backup_vars), 'decision'] = 'keep - backup'

# Variables are named explicitly rather than taken positionally from asc_df,
# so the cell no longer depends on how asc_df happens to be sorted.
# The IVs in this group are 0.06-0.08, hence the 0.06 threshold in the reason text.
binning_summary_df.loc[binning_summary_df['variable'].isin(['acc_open_past_24mths', 'dti']), 'reason'] = 'monotonic ascending; iv>=0.06, good WOE & BR spread'
binning_summary_df.loc[binning_summary_df['variable']=='num_tl_op_past_12m', 'reason'] = 'backup variable; monotonic ascending; iv>=0.06, good WOE & BR spread; correlate with acc_open_past_24mths (0.74)'

<u>Inquiries and open accounts</u>
* **inq_last_6mths**: weak IV (0.027), okayish WoE spread (0.517), low correlation with the rest and low missing => ✅ Keep - known risk indicator commonly used in production

* **inq_last_12m**: 🟦 Keep - backup variable, included as 12m indicator  
    * similar IV (0.025) and WoE spread (0.509), low correlation with `inq_last_6mths` (0.169). 
    * moderate correlation (~0.6) with account opening variables due to being in the same **concurrent missing block**; correlation drops when excluding imputed records.
    * 78.5% of the values (0) in one bin

* **open_rv_24m**: 🟦 Keep - backup variable, include as one from the concurrent missing block.
    * highest IV (0.032) and WOE Spread (0.6) among the group, 
    * 5 bins with a 71.5% missing bin, gradual bad rate increments: 18.6% -> ~19.5% -> ~21% -> ~24% -> ~28% 
    * strongly correlated with open_rv_12m (0.92) and open_acc_6m (0.82) with imputation; strongly correlated with acc_open_past_24mths without imputation (0.84)
    
* **open_rv_12m**: ❌ drop - high missing, redundant with open_rv_24m
    * highly correlated with open_rv_24m, with a similar binning pattern

* **open_il_12m** & **open_il_24m**: ❌ drop - high missing, redundant with num_tl_op_past_12m
    * similar IV and WoE spread, 4 and 5 bins respectively, each with a 71.5% missing bin, similar bad rate increments;
    * strongly correlated among themselves, also correlated with open_acc_6m, all_util, il_util by around 0.8 each after imputation

* **open_acc_6m**: ❌ drop - high missing, redundant with acc_open_past_24mths
    * strongly correlated to all 12m and 24m open account variables 



*Recall from the Data Cleaning Notebook:*
* inq_last_6mths, only had 1 missing in the raw training set, imputed with 0; after imputation, 57% of the values are 0. Max=8 without capping.
* inq_last_12m had 767k missing values in the raw training data, and there's no obvious difference in bad rate between the null value group and 0 value group so missing was imputed with 0. After imputation, there's 78.5% zero values, which doesn't reconcile with the 6m variable. Max value is 49 and not capped.


In [31]:
# Check distribution - inquiries
df_train[['inq_last_6mths', 'inq_last_12m']].value_counts(normalize=True).head(10)

inq_last_6mths  inq_last_12m
0.0000          0.0000         0.4698
1.0000          0.0000         0.1961
2.0000          0.0000         0.0763
0.0000          1.0000         0.0418
3.0000          0.0000         0.0301
1.0000          1.0000         0.0246
0.0000          2.0000         0.0243
1.0000          2.0000         0.0191
0.0000          3.0000         0.0139
1.0000          3.0000         0.0124
Name: proportion, dtype: float64

In [32]:
# Check binning details - inquiries
var_list = asc_df[asc_df['variable'].str.contains("inq")]['variable']

for var in var_list:
    print(var,":")
    # Explicit display() so the table renders regardless of the ast_node_interactivity setting
    display(binning_tables[var])

inq_last_6mths :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 0.50)",612370,0.5701,503366,109004,0.178,0.1357,0.0101,0.0013
1,"[0.50, 1.50)",293025,0.2728,230603,62422,0.213,-0.0875,0.0021,0.0003
2,"[1.50, 2.50)",109607,0.102,83329,26278,0.2397,-0.2402,0.0063,0.0008
3,"[2.50, inf)",59236,0.0551,43457,15779,0.2664,-0.3812,0.0089,0.0011
Totals,,1074238,1.0,860755,213483,0.1987,,0.0275,0.0034


inq_last_12m :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 0.50)",843012,0.7848,686018,156994,0.1862,0.0804,0.005,0.0006
1,"[0.50, 1.50)",71577,0.0666,56007,15570,0.2175,-0.1141,0.0009,0.0001
2,"[1.50, 3.50)",90632,0.0844,68743,21889,0.2415,-0.2499,0.0057,0.0007
3,"[3.50, inf)",69017,0.0642,49987,19030,0.2757,-0.4285,0.0133,0.0017
Totals,,1074238,1.0,860755,213483,0.1987,,0.0248,0.0031


In [33]:
# Check binning details - open accounts
var_list = asc_df[asc_df['variable'].str.contains("open")]['variable']

for var in var_list:
    print(var,":")
    # Explicit display() so the table renders regardless of the ast_node_interactivity setting
    display(binning_tables[var])

acc_open_past_24mths :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 1.50)",150922,0.1405,130141,20781,0.1377,0.4403,0.0237,0.0029
1,"[1.50, 2.50)",141087,0.1313,119291,21796,0.1545,0.3056,0.0112,0.0014
2,"[2.50, 3.50)",158472,0.1475,130910,27562,0.1739,0.1638,0.0038,0.0005
3,"[3.50, 4.50)",150701,0.1403,121660,29041,0.1927,0.0383,0.0002,0.0
4,"[4.50, 5.50)",129407,0.1205,102399,27008,0.2087,-0.0615,0.0005,0.0001
5,"[5.50, 6.50)",101766,0.0947,78869,22897,0.225,-0.1575,0.0025,0.0003
6,"[6.50, 7.50)",76182,0.0709,57900,18282,0.24,-0.2415,0.0044,0.0006
7,"[7.50, 9.50)",90858,0.0846,67268,23590,0.2596,-0.3464,0.0112,0.0014
8,"[9.50, inf)",74843,0.0697,52317,22526,0.301,-0.5516,0.0247,0.003
Totals,,1074238,1.0,860755,213483,0.1987,,0.0821,0.0102


open_rv_24m :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 0.50)",807243,0.7515,657591,149652,0.1854,0.086,0.0054,0.0007
1,"[0.50, 1.50)",58880,0.0548,47594,11286,0.1917,0.0449,0.0001,0.0
2,"[1.50, 2.50)",58356,0.0543,45958,12398,0.2125,-0.0841,0.0004,0.0
3,"[2.50, 4.50)",80865,0.0753,60937,19928,0.2464,-0.2765,0.0062,0.0008
4,"[4.50, inf)",68894,0.0641,48675,20219,0.2935,-0.5157,0.0197,0.0024
Totals,,1074238,1.0,860755,213483,0.1987,,0.0318,0.0039


open_rv_12m :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",767653,0.7146,624926,142727,0.1859,0.0824,0.0047,0.0006
1,"[-0.50, 0.50)",98721,0.0919,79238,19483,0.1974,0.0087,0.0,0.0
2,"[0.50, 1.50)",91396,0.0851,71220,20176,0.2208,-0.133,0.0016,0.0002
3,"[1.50, 2.50)",57618,0.0536,43297,14321,0.2486,-0.2879,0.0048,0.0006
4,"[2.50, inf)",58850,0.0548,42074,16776,0.2851,-0.4748,0.0141,0.0017
Totals,,1074238,1.0,860755,213483,0.1987,,0.0252,0.0031


open_acc_6m :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",767654,0.7146,624927,142727,0.1859,0.0824,0.0047,0.0006
1,"[-0.50, 0.50)",123087,0.1146,98323,24764,0.2012,-0.0154,0.0,0.0
2,"[0.50, 1.50)",97311,0.0906,75044,22267,0.2288,-0.1793,0.0031,0.0004
3,"[1.50, inf)",86186,0.0802,62461,23725,0.2753,-0.4262,0.0164,0.002
Totals,,1074238,1.0,860755,213483,0.1987,,0.0243,0.003


open_il_12m :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",767653,0.7146,624926,142727,0.1859,0.0824,0.0047,0.0006
1,"[-0.50, 0.50)",152554,0.142,121233,31321,0.2053,-0.0408,0.0002,0.0
2,"[0.50, 1.50)",96309,0.0897,73353,22956,0.2384,-0.2325,0.0052,0.0006
3,"[1.50, inf)",57722,0.0537,41243,16479,0.2855,-0.4769,0.014,0.0017
Totals,,1074238,1.0,860755,213483,0.1987,,0.0241,0.003


open_il_24m :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",767653,0.7146,624926,142727,0.1859,0.0824,0.0047,0.0006
1,"[-0.50, 0.50)",74451,0.0693,59623,14828,0.1992,-0.0027,0.0,0.0
2,"[0.50, 1.50)",93127,0.0867,73015,20112,0.216,-0.1049,0.001,0.0001
3,"[1.50, 2.50)",64566,0.0601,49269,15297,0.2369,-0.2246,0.0032,0.0004
4,"[2.50, inf)",74441,0.0693,53922,20519,0.2756,-0.4281,0.0143,0.0018
Totals,,1074238,1.0,860755,213483,0.1987,,0.0233,0.0029


In [34]:
# Record treatment for the inquiry and open-account variables
keep_vars.append('inq_last_6mths')
backup_vars.extend(['inq_last_12m','open_rv_24m'])
drop_vars.extend(['open_acc_6m', 'open_rv_12m', 'open_il_12m', 'open_il_24m'])

binning_summary_df.loc[binning_summary_df['variable'].isin(keep_vars), 'decision'] = 'keep'
binning_summary_df.loc[binning_summary_df['variable'].isin(backup_vars), 'decision'] = 'keep - backup'
binning_summary_df.loc[binning_summary_df['variable'].isin(drop_vars), 'decision'] = 'drop'

# inq_last_6mths (a 6-month window, so it is recorded as the p6m indicator)
binning_summary_df.loc[binning_summary_df['variable'].isin(['inq_last_6mths']), 'reason'] = 'monotonic ascending; known risk indicator; include as a p6m indicator'

# open_rv_24m
binning_summary_df.loc[binning_summary_df['variable'].isin(['open_rv_24m']), 'reason'] = 'backup variable; monotonic ascending; large missing bin; the strongest predictor among the concurrent missing block 1 (12m/24m open accounts)'

# inq_last_12m
binning_summary_df.loc[binning_summary_df['variable'].isin(['inq_last_12m']), 'reason'] = 'backup variable; monotonic ascending; large missing bin; include as a p12m indicator'

# open_acc_6m/ open_rv_12m/ open_il_12m/ open_il_24m
binning_summary_df.loc[binning_summary_df['variable'].isin(['open_acc_6m', 'open_rv_12m', 'open_il_12m', 'open_il_24m']), 'reason'] = 'redundant with other account open variables,  high missing %, low additional predictive value'

<u>Total Accounts</u>
* num_rev_tl_bal_gt_0: weak IV (0.03) and decent WoE spread (0.65), 10 bins => ✅ Keep 
* num_actv_rev_tl: similar binning profile; extremely strong correlation with num_rev_tl_bal_gt_0. More than 85% of the values are the same => ❌ Drop - redundant with num_rev_tl_bal_gt_0

In [35]:
# Share of records for each (num_actv_rev_tl, num_rev_tl_bal_gt_0) value pair; show the 10 most common
_pair_cols = ['num_actv_rev_tl', 'num_rev_tl_bal_gt_0']
_pair_share = df_train[_pair_cols].value_counts(normalize=True)
_pair_share.head(10)

num_actv_rev_tl  num_rev_tl_bal_gt_0
4.0000           4.0000                0.1462
5.0000           5.0000                0.1353
3.0000           3.0000                0.1304
6.0000           6.0000                0.1109
7.0000           7.0000                0.0850
2.0000           2.0000                0.0830
8.0000           8.0000                0.0620
9.0000           9.0000                0.0438
10.0000          10.0000               0.0304
1.0000           1.0000                0.0289
Name: proportion, dtype: float64

In [36]:
# Show the binning table stored for num_rev_tl_bal_gt_0
binning_tables['num_rev_tl_bal_gt_0']

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 1.50)",62709,0.0584,52543,10166,0.1621,0.2483,0.0033,0.0004
1,"[1.50, 2.50)",91426,0.0851,76444,14982,0.1639,0.2355,0.0044,0.0005
2,"[2.50, 3.50)",144523,0.1345,119415,25108,0.1737,0.1652,0.0035,0.0004
3,"[3.50, 4.50)",163072,0.1518,132901,30171,0.185,0.0885,0.0012,0.0001
4,"[4.50, 5.50)",151973,0.1415,122565,29408,0.1935,0.0331,0.0002,0.0
5,"[5.50, 6.50)",125407,0.1167,99641,25766,0.2055,-0.0417,0.0002,0.0
6,"[6.50, 7.50)",96961,0.0903,76477,20484,0.2113,-0.0769,0.0005,0.0001
7,"[7.50, 9.50)",121980,0.1136,94340,27640,0.2266,-0.1666,0.0033,0.0004
8,"[9.50, 11.50)",59933,0.0558,45389,14544,0.2427,-0.2562,0.0039,0.0005
9,"[11.50, inf)",56254,0.0524,41040,15214,0.2705,-0.4019,0.0095,0.0012


In [37]:
# Record treatment: active revolving trade-line counts
keep_vars.append('num_rev_tl_bal_gt_0')
drop_vars.append('num_actv_rev_tl')

# Stamp the decision for every variable currently held in each tracking list
for _decision, _members in [('keep', keep_vars), ('drop', drop_vars)]:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_members), 'decision'] = _decision

# Rationale recorded next to each decision
_reasons = [
    (['num_rev_tl_bal_gt_0'],
     'monotonic ascending; decent bin range and WoE spread'),
    (['num_actv_rev_tl'],
     'redundant: strong correlation with num_rev_tl_bal_gt_0 (0.985) with almost identical binning profile'),
]
for _names, _reason in _reasons:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_names), 'reason'] = _reason

<u>Utility Variables</u>
Weak IV ranging from 0.023 to 0.029
* **all_util**: 5 bins with one bin containing 77% of records (part of concurrent missing block 1); strongly correlated with il_util after imputation (0.89) => 🟦 Keep as backup variable
* **il_util**: 4 bins with one 80% bin, more missing values in raw data; strongly correlated with all_util, and moderately correlated with open_rv_24m (0.69) => ❌ Drop - redundant with all_util
* **bc_util**: has the highest WoE spread of all 4 (0.54), binning pattern distinct from all_util => ✅ Keep - unique binning pattern
* **percent_bc_gt_75**: lowest WoE spread (0.43) among this group, strongly correlated with bc_util (0.84) => ❌ Drop - redundant with bc_util


In [38]:
# Utilisation variables compared in this section
var_list = ['all_util', 'il_util', 'bc_util', 'percent_bc_gt_75']

# Print each variable name followed by its binning table.
for var in var_list:
    print(var,":")
    # Bare expression: the table is rendered on each iteration, presumably because the
    # setup cell sets InteractiveShell.ast_node_interactivity = "all" - keep it a bare statement.
    binning_tables[var]

all_util :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 42.50)",827866,0.7707,675001,152865,0.1846,0.0909,0.0062,0.0008
1,"[42.50, 54.50)",54780,0.051,43335,11445,0.2089,-0.0628,0.0002,0.0
2,"[54.50, 67.50)",76726,0.0714,58392,18334,0.239,-0.2358,0.0043,0.0005
3,"[67.50, 78.50)",59626,0.0555,44031,15595,0.2615,-0.3563,0.0078,0.001
4,"[78.50, inf)",55240,0.0514,39996,15244,0.276,-0.4297,0.0107,0.0013
Totals,,1074238,1.0,860755,213483,0.1987,,0.0292,0.0036


il_util :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 52.50)",855271,0.7962,695149,160122,0.1872,0.0739,0.0043,0.0005
1,"[52.50, 69.50)",58075,0.0541,45803,12272,0.2113,-0.0772,0.0003,0.0
2,"[69.50, 80.50)",55640,0.0518,42442,13198,0.2372,-0.2262,0.0028,0.0004
3,"[80.50, inf)",105252,0.098,77361,27891,0.265,-0.3741,0.0153,0.0019
Totals,,1074238,1.0,860755,213483,0.1987,,0.0227,0.0028


bc_util :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 18.85)",110149,0.1025,92049,18100,0.1643,0.2322,0.0051,0.0006
1,"[18.85, 32.65)",95897,0.0893,80041,15856,0.1653,0.2247,0.0042,0.0005
2,"[32.65, 43.95)",102862,0.0958,84506,18356,0.1785,0.1326,0.0016,0.0002
3,"[43.95, 52.95)",93440,0.087,75969,17471,0.187,0.0755,0.0005,0.0001
4,"[52.95, 57.95)",55530,0.0517,44781,10749,0.1936,0.0327,0.0001,0.0
5,"[57.95, 63.95)",69517,0.0647,55760,13757,0.1979,0.0053,0.0,0.0
6,"[63.95, 72.85)",108745,0.1012,86932,21813,0.2006,-0.0116,0.0,0.0
7,"[72.85, 78.95)",77789,0.0724,61698,16091,0.2069,-0.0503,0.0002,0.0
8,"[78.95, 87.65)",120138,0.1118,94902,25236,0.2101,-0.0697,0.0006,0.0001
9,"[87.65, 91.95)",65955,0.0614,51557,14398,0.2183,-0.1187,0.0009,0.0001


percent_bc_gt_75 :


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 3.50)",247512,0.2304,207070,40442,0.1634,0.2389,0.0122,0.0015
1,"[3.50, 18.60)",55078,0.0513,45083,9995,0.1815,0.1122,0.0006,0.0001
2,"[18.60, 25.55)",82473,0.0768,67487,14986,0.1817,0.1106,0.0009,0.0001
3,"[25.55, 35.15)",88619,0.0825,71873,16746,0.189,0.0625,0.0003,0.0
4,"[35.15, 52.75)",173290,0.1613,138552,34738,0.2005,-0.0108,0.0,0.0
5,"[52.75, 68.30)",119347,0.1111,93726,25621,0.2147,-0.0973,0.0011,0.0001
6,"[68.30, inf)",307919,0.2866,236964,70955,0.2304,-0.1884,0.0108,0.0013
Totals,,1074238,1.0,860755,213483,0.1987,,0.0259,0.0032


In [39]:
# Record treatment: utilisation variables
keep_vars.extend(['bc_util'])
backup_vars.extend(['all_util'])
drop_vars.extend(['il_util', 'percent_bc_gt_75'])

# Stamp the decision for every variable currently held in each tracking list
for _decision, _members in [('keep', keep_vars), ('keep - backup', backup_vars), ('drop', drop_vars)]:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_members), 'decision'] = _decision

# Rationale recorded next to each decision
_reasons = [
    (['all_util'],
     'backup variable; umbrella utility rate variable, weak IV, part of the concurrent missing block 1'),
    (['bc_util'],
     'highest WoE spread in the utlity variable group, low missing'),
    (['il_util'],
     'weak IV; high correlation with all_util'),
    (['percent_bc_gt_75'],
     'weak IV; high correlation with bc_util'),
]
for _names, _reason in _reasons:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_names), 'reason'] = _reason

<u>loan_amount</u>

✅ Keep
* IV=0.032, WoE spread > 0.5, bad rate spread = 9%, clean ascending pattern
* Not strongly correlated with any other ascending variables

In [40]:
# Summary row for loan_amnt
_is_loan_amnt = binning_summary_df['variable'].eq('loan_amnt')
binning_summary_df.loc[_is_loan_amnt]

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
0,loan_amnt,0.0317,8,0.0513,0.298,-0.2347,0.3507,0.5854,0.0901,ascending,weak,4-8,medium,numerical,,


In [41]:
# Show the binning table stored for loan_amnt
binning_tables['loan_amnt']

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 3512.50)",55088,0.0513,46897,8191,0.1487,0.3507,0.0057,0.0007
1,"[3512.50, 7012.50)",173029,0.1611,144964,28065,0.1622,0.2477,0.0092,0.0011
2,"[7012.50, 9012.50)",101005,0.094,84025,16980,0.1681,0.2048,0.0037,0.0005
3,"[9012.50, 10012.50)",101633,0.0946,82920,18713,0.1841,0.0944,0.0008,0.0001
4,"[10012.50, 12012.50)",105869,0.0986,84029,21840,0.2063,-0.0468,0.0002,0.0
5,"[12012.50, 15012.50)",126924,0.1182,100203,26721,0.2105,-0.0725,0.0006,0.0001
6,"[15012.50, 28012.50)",320144,0.298,248789,71355,0.2229,-0.1453,0.0066,0.0008
7,"[28012.50, inf)",90546,0.0843,68928,21618,0.2388,-0.2347,0.005,0.0006
Totals,,1074238,1.0,860755,213483,0.1987,,0.0317,0.004


In [42]:
# Record treatment: loan_amnt is kept
keep_vars.extend(['loan_amnt'])

# Re-stamp 'keep' on everything in keep_vars, then store the rationale for loan_amnt
_kept_rows = binning_summary_df['variable'].isin(keep_vars)
binning_summary_df.loc[_kept_rows, 'decision'] = 'keep'

_loan_amnt_rows = binning_summary_df['variable'].isin(['loan_amnt'])
binning_summary_df.loc[_loan_amnt_rows, 'reason'] = 'monotonic, good WoE and bad rate spread; no strong correlation with other variables'

#### 3.1.4 Not Predictive

In [43]:
# Variables whose IV class is 'not predictive', highest IV first
_not_predictive = binning_summary_df['iv_class'].eq('not predictive')
np_df = binning_summary_df.loc[_not_predictive].sort_values('IV', ascending=False)
np_df

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
31,inq_fi,0.0198,4,0.0675,0.7146,-0.3708,0.0824,0.4532,0.0784,ascending,not predictive,4-8,low,numerical,,
39,mo_sin_old_rev_tl_op,0.0184,11,0.0511,0.1685,-0.259,0.1921,0.4511,0.0733,non-monotonic,not predictive,9-15,low,numerical,,
14,revol_util,0.0183,12,0.0503,0.1555,-0.1813,0.3295,0.5108,0.0778,ascending,not predictive,9-15,medium,numerical,,
20,open_act_il,0.0168,4,0.0695,0.7146,-0.2513,0.0824,0.3337,0.0559,ascending,not predictive,4-8,low,numerical,,
24,total_bal_il,0.0166,5,0.0514,0.7146,-0.2327,0.0824,0.3151,0.0525,ascending,not predictive,4-8,low,numerical,,
32,total_cu_tl,0.0158,4,0.0601,0.7146,-0.2095,0.0824,0.2919,0.0483,non-monotonic,not predictive,4-8,low,numerical,,
45,mths_since_recent_inq,0.0154,2,0.106,0.894,-0.04,0.3857,0.4257,0.0609,ascending,not predictive,< 4,low,numerical,,
67,mnths_since_earliest_cr_line,0.0113,12,0.0535,0.1807,-0.1735,0.1634,0.3369,0.0538,descending,not predictive,9-15,low,numerical,,
53,num_op_rev_tl,0.0102,12,0.0524,0.1076,-0.2232,0.1697,0.3929,0.0636,ascending,not predictive,9-15,low,numerical,,
48,num_actv_bc_tl,0.0101,6,0.0576,0.3463,-0.2612,0.0959,0.3571,0.0597,ascending,not predictive,4-8,low,numerical,,


<u>Potential Saves</u>
* **revol_util**: ❌ Drop
    * Strongly correlated to `bc_util`, similar binning pattern
* **inq_fi**: ❌ Drop
    * High missing (71.5%), high correlation with `inq_last_12m` (0.749) after imputation
* **mths_since_recent_inq**: 🟨 Review
    * low IV due to only 2 bins (missing or not missing), missing only 10% 
* **emp_length_num**: 🟨 Review 
    * Classic risk indicator but low IV & Spread, with 3 bins with almost the same bad rate. Re-bin to see if there's improvement

In [44]:
# 'Not predictive' variables with a monotonic trend and a WoE spread above 0.4, widest spread first
_is_monotonic = np_df['monotonic_trend'].isin(['ascending', 'descending'])
_wide_spread = np_df['woe_spread'].gt(0.4)
np_df.loc[_is_monotonic & _wide_spread].sort_values('woe_spread', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
14,revol_util,0.0183,12,0.0503,0.1555,-0.1813,0.3295,0.5108,0.0778,ascending,not predictive,9-15,medium,numerical,,
31,inq_fi,0.0198,4,0.0675,0.7146,-0.3708,0.0824,0.4532,0.0784,ascending,not predictive,4-8,low,numerical,,
45,mths_since_recent_inq,0.0154,2,0.106,0.894,-0.04,0.3857,0.4257,0.0609,ascending,not predictive,< 4,low,numerical,,
68,emp_length_num,0.0093,6,0.0556,0.3322,-0.3548,0.0652,0.42,0.0728,descending,not predictive,4-8,low,numerical,,


In [45]:
# Show the binning table stored for emp_length_num
binning_tables['emp_length_num']

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",59777,0.0556,44160,15617,0.2613,-0.3548,0.0078,0.001
1,"[-0.50, 1.50)",153096,0.1425,121973,31123,0.2033,-0.0284,0.0001,0.0
2,"[1.50, 3.50)",180947,0.1684,145245,35702,0.1973,0.009,0.0,0.0
3,"[3.50, 4.50)",62846,0.0585,50458,12388,0.1971,0.0102,0.0,0.0
4,"[4.50, 9.50)",260725,0.2427,209354,51371,0.197,0.0107,0.0,0.0
5,"[9.50, inf)",356847,0.3322,289565,67282,0.1885,0.0652,0.0014,0.0002
Totals,,1074238,1.0,860755,213483,0.1987,,0.0093,0.0012


In [46]:
# revol_util against the other utilisation variables
util_vars = ['all_util', 'il_util', 'bc_util', 'revol_util', 'percent_bc_gt_75']

# Binning summary of the utilisation variables, highest IV first
_util_rows = binning_summary_df['variable'].isin(util_vars)
binning_summary_df.loc[_util_rows].sort_values('IV', ascending=False)

# Pairwise correlation on the training data as-is (imputed values included)
df_train.loc[:, util_vars].corr()

# Pairwise correlation with the imputed values excluded
corr_excl_miss(df_train, fmeta, util_vars)

# Binning tables of the highly correlated pair
binning_tables['revol_util']
binning_tables['bc_util']

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
29,all_util,0.0292,5,0.051,0.7707,-0.4297,0.0909,0.5206,0.0914,ascending,weak,4-8,medium,numerical,keep - backup,backup variable; umbrella utility rate variabl...
60,percent_bc_gt_75,0.0259,7,0.0513,0.2866,-0.1884,0.2389,0.4273,0.067,ascending,weak,4-8,low,numerical,drop,weak IV; high correlation with bc_util
37,bc_util,0.0234,13,0.0517,0.1118,-0.3066,0.2322,0.5388,0.0878,ascending,weak,9-15,medium,numerical,keep,highest WoE spread in the utlity variable grou...
25,il_util,0.0227,4,0.0518,0.7962,-0.3741,0.0739,0.448,0.0778,ascending,weak,4-8,low,numerical,drop,weak IV; high correlation with all_util
14,revol_util,0.0183,12,0.0503,0.1555,-0.1813,0.3295,0.5108,0.0778,ascending,not predictive,9-15,medium,numerical,,


Unnamed: 0,all_util,il_util,bc_util,revol_util,percent_bc_gt_75
all_util,1.0,0.8913,0.0259,0.0291,0.0077
il_util,0.8913,1.0,-0.0779,-0.0959,-0.0781
bc_util,0.0259,-0.0779,1.0,0.8005,0.8415
revol_util,0.0291,-0.0959,0.8005,1.0,0.6972
percent_bc_gt_75,0.0077,-0.0781,0.8415,0.6972,1.0


Unnamed: 0,all_util,il_util,bc_util,revol_util,percent_bc_gt_75
all_util,1.0,0.5365,0.5407,0.6115,0.4498
il_util,0.5365,1.0,-0.0536,-0.07,-0.0479
bc_util,0.5407,-0.0536,1.0,0.8556,0.8454
revol_util,0.6115,-0.07,0.8556,1.0,0.7263
percent_bc_gt_75,0.4498,-0.0479,0.8454,0.7263,1.0


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 16.25)",74336,0.0692,63082,11254,0.1514,0.3295,0.0068,0.0008
1,"[16.25, 23.25)",54079,0.0503,45202,8877,0.1641,0.2334,0.0026,0.0003
2,"[23.25, 32.95)",103692,0.0965,85549,18143,0.175,0.1566,0.0023,0.0003
3,"[32.95, 38.75)",77490,0.0721,63126,14364,0.1854,0.0862,0.0005,0.0001
4,"[38.75, 42.65)",55053,0.0512,44415,10638,0.1932,0.0349,0.0001,0.0
5,"[42.65, 47.95)",78833,0.0734,63307,15526,0.1969,0.0112,0.0,0.0
6,"[47.95, 51.95)",61594,0.0573,49201,12393,0.2012,-0.0155,0.0,0.0
7,"[51.95, 56.95)",78573,0.0731,62379,16194,0.2061,-0.0457,0.0002,0.0
8,"[56.95, 63.75)",105024,0.0978,83021,22003,0.2095,-0.0663,0.0004,0.0001
9,"[63.75, 74.75)",154888,0.1442,121792,33096,0.2137,-0.0914,0.0012,0.0002


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 18.85)",110149,0.1025,92049,18100,0.1643,0.2322,0.0051,0.0006
1,"[18.85, 32.65)",95897,0.0893,80041,15856,0.1653,0.2247,0.0042,0.0005
2,"[32.65, 43.95)",102862,0.0958,84506,18356,0.1785,0.1326,0.0016,0.0002
3,"[43.95, 52.95)",93440,0.087,75969,17471,0.187,0.0755,0.0005,0.0001
4,"[52.95, 57.95)",55530,0.0517,44781,10749,0.1936,0.0327,0.0001,0.0
5,"[57.95, 63.95)",69517,0.0647,55760,13757,0.1979,0.0053,0.0,0.0
6,"[63.95, 72.85)",108745,0.1012,86932,21813,0.2006,-0.0116,0.0,0.0
7,"[72.85, 78.95)",77789,0.0724,61698,16091,0.2069,-0.0503,0.0002,0.0
8,"[78.95, 87.65)",120138,0.1118,94902,25236,0.2101,-0.0697,0.0006,0.0001
9,"[87.65, 91.95)",65955,0.0614,51557,14398,0.2183,-0.1187,0.0009,0.0001


In [47]:
# Inquiry-related variables
inq_vars = ['inq_fi', 'inq_last_6mths', 'inq_last_12m', 'mths_since_recent_inq']

# Binning summary of the inquiry variables, highest IV first
_inq_rows = binning_summary_df['variable'].isin(inq_vars)
binning_summary_df.loc[_inq_rows].sort_values('IV', ascending=False)

# Pairwise correlation on the training data as-is (imputed values included)
df_train.loc[:, inq_vars].corr()

# Pairwise correlation with the imputed values excluded
corr_excl_miss(df_train, fmeta, inq_vars)

# Binning tables of the two undecided inquiry variables
binning_tables['inq_fi']
binning_tables['mths_since_recent_inq']

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
8,inq_last_6mths,0.0275,4,0.0551,0.5701,-0.3812,0.1357,0.5169,0.0884,ascending,weak,4-8,medium,numerical,keep,monotonic ascending; known risk indicator; inc...
33,inq_last_12m,0.0248,4,0.0642,0.7848,-0.4285,0.0804,0.5089,0.0895,ascending,weak,4-8,medium,numerical,keep - backup,backup variable; monotonic ascending; large mi...
31,inq_fi,0.0198,4,0.0675,0.7146,-0.3708,0.0824,0.4532,0.0784,ascending,not predictive,4-8,low,numerical,,
45,mths_since_recent_inq,0.0154,2,0.106,0.894,-0.04,0.3857,0.4257,0.0609,ascending,not predictive,< 4,low,numerical,,


Unnamed: 0,inq_fi,inq_last_6mths,inq_last_12m,mths_since_recent_inq
inq_fi,1.0,0.0412,0.7491,-0.0553
inq_last_6mths,0.0412,1.0,0.1687,-0.4272
inq_last_12m,0.7491,0.1687,1.0,-0.1521
mths_since_recent_inq,-0.0553,-0.4272,-0.1521,1.0


Unnamed: 0,inq_fi,inq_last_6mths,inq_last_12m,mths_since_recent_inq
inq_fi,1.0,0.1692,0.5478,-0.1354
inq_last_6mths,0.1692,1.0,0.4014,-0.258
inq_last_12m,0.5478,0.4014,1.0,-0.2131
mths_since_recent_inq,-0.1354,-0.258,-0.2131,1.0


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",767653,0.7146,624926,142727,0.1859,0.0824,0.0047,0.0006
1,"[-0.50, 0.50)",157717,0.1468,124095,33622,0.2132,-0.0884,0.0012,0.0001
2,"[0.50, 1.50)",72506,0.0675,55558,16948,0.2337,-0.207,0.0031,0.0004
3,"[1.50, inf)",76362,0.0711,56176,20186,0.2643,-0.3708,0.0109,0.0013
Totals,,1074238,1.0,860755,213483,0.1987,,0.0198,0.0025


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",113833,0.106,97406,16427,0.1443,0.3857,0.014,0.0017
1,"[-0.50, inf)",960405,0.894,763349,197056,0.2052,-0.04,0.0014,0.0002
Totals,,1074238,1.0,860755,213483,0.1987,,0.0154,0.0019


In [48]:
# Record treatment: "potential saves" among the not-predictive variables
review_vars.extend(['mths_since_recent_inq', 'emp_length_num'])
drop_vars.extend(['revol_util', 'inq_fi'])

# Stamp the decision for every variable currently held in each tracking list
for _decision, _members in [('review', review_vars), ('drop', drop_vars)]:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_members), 'decision'] = _decision

# Rationale recorded next to each decision
_reasons = [
    (['mths_since_recent_inq'],
     're-bin and review; only 2 bins (missing vs. non-missing), non-missing account for 89%.'),
    (['emp_length_num'],
     're-bin and review; the middle three bins of have similar bad rate'),
    (['revol_util'],
     'weak IV; high correlation with bc_util'),
    (['inq_fi'],
     'weak IV; high missing; high correlation with inq_last_12m'),
]
for _names, _reason in _reasons:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_names), 'reason'] = _reason

<u>Common Risk Indicators</u>

* **mnths_since_earliest_cr_line**  ✅ Keep
    * Known risk indicator that behaves as expected (older credit history → lower risk). Low IV (0.011), but the pattern is monotonic and logical.

* **open_acc** ❌ Drop
    * IV: 0.007; Bins: 13; WoE spread: 0.36 - very low IV for a variable with 13 bins, despite ascending pattern
    * Correlated with num_rev_tl_bal_gt_0 by 0.63 and acc_open_past_24mths by 0.48

* **revol_bal** ❌ Drop
    * Low IV (0.005) and non-monotonic, the first 4 bins (85% of total volume) have almost the same bad rate (~20% bad rate)
    * Highly correlated with total_rev_hi_lim, which has higher IV (>0.2) and double the bad rate spread 

* **total_bal_ex_mort** 🟨 Review (re-bin to 3-4 bins)
    * not highly-correlated to other balance variables; arch-shaped bad rate (rises, then falls), suggesting a possible sweet spot for balance:
        * Low balances → maybe low income or thin file → moderate risk
        * Mid balances → more obligations → higher risk
        * Very high balances → maybe high earners → lower risk

* **mths_since_last_delinq**  🟨 Review (re-bin: separate missing and zero)
    * Very weak IV (0.002), small spread (0.145), and short binning range (5), 50% missing
    * Imputation bin: Missing + 0 has lower bad rate; remaining bins: Clean descending bad rate

* **total_acc** ❌ Drop
    * Very weak IV (0.001), very small WoE spread (0.077)

In [49]:
# Common risk indicators reviewed in this section
var_list = [
    'open_acc',
    'mnths_since_earliest_cr_line',
    'mths_since_last_delinq',
    'revol_bal',
    'total_acc',
    'total_bal_ex_mort',
]

# Binning summary of these variables, highest IV first
_listed_rows = binning_summary_df['variable'].isin(var_list)
binning_summary_df.loc[_listed_rows].sort_values('IV', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
67,mnths_since_earliest_cr_line,0.0113,12,0.0535,0.1807,-0.1735,0.1634,0.3369,0.0538,descending,not predictive,9-15,low,numerical,,
11,open_acc,0.0073,13,0.0516,0.0993,-0.1961,0.1598,0.3559,0.0573,ascending,not predictive,9-15,low,numerical,,
13,revol_bal,0.005,8,0.0545,0.2864,-0.0596,0.2575,0.3171,0.0475,non-monotonic,not predictive,4-8,low,numerical,,
64,total_bal_ex_mort,0.0036,11,0.0504,0.2036,-0.0621,0.1706,0.2327,0.0358,non-monotonic,not predictive,9-15,low,numerical,,
9,mths_since_last_delinq,0.0021,5,0.0502,0.498,-0.1045,0.0409,0.1454,0.0236,non-monotonic,not predictive,4-8,minimal,numerical,,
15,total_acc,0.0008,9,0.0633,0.2485,-0.0368,0.0398,0.0766,0.0122,non-monotonic,not predictive,9-15,minimal,numerical,,


In [50]:
# Print each variable in var_list (set in the previous cell) followed by its binning table.
for var in var_list:
    print(var)
    # Bare expression: the table is rendered on each iteration, presumably because the
    # setup cell sets InteractiveShell.ast_node_interactivity = "all" - keep it a bare statement.
    binning_tables[var]

open_acc


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 5.50)",88863,0.0827,73356,15507,0.1745,0.1598,0.002,0.0003
1,"[5.50, 6.50)",66452,0.0619,54349,12103,0.1821,0.1077,0.0007,0.0001
2,"[6.50, 7.50)",81318,0.0757,66165,15153,0.1863,0.0797,0.0005,0.0001
3,"[7.50, 8.50)",91496,0.0852,74266,17230,0.1883,0.0667,0.0004,0.0
4,"[8.50, 9.50)",96112,0.0895,77487,18625,0.1938,0.0314,0.0001,0.0
5,"[9.50, 10.50)",94192,0.0877,75612,18580,0.1973,0.0093,0.0,0.0
6,"[10.50, 11.50)",87040,0.081,69542,17498,0.201,-0.0144,0.0,0.0
7,"[11.50, 12.50)",78286,0.0729,62338,15948,0.2037,-0.031,0.0001,0.0
8,"[12.50, 13.50)",68282,0.0636,54341,13941,0.2042,-0.0338,0.0001,0.0
9,"[13.50, 15.50)",106662,0.0993,84628,22034,0.2066,-0.0486,0.0002,0.0


mnths_since_earliest_cr_line


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 81.50)",62184,0.0579,48018,14166,0.2278,-0.1735,0.0018,0.0002
1,"[81.50, 133.50)",194075,0.1807,151738,42337,0.2181,-0.1178,0.0026,0.0003
2,"[133.50, 145.50)",74397,0.0693,58378,16019,0.2153,-0.1011,0.0007,0.0001
3,"[145.50, 154.50)",57469,0.0535,45585,11884,0.2068,-0.0499,0.0001,0.0
4,"[154.50, 164.50)",64223,0.0598,50962,13261,0.2065,-0.048,0.0001,0.0
5,"[164.50, 176.50)",73488,0.0684,58767,14721,0.2003,-0.0099,0.0,0.0
6,"[176.50, 192.50)",86016,0.0801,69136,16880,0.1962,0.0157,0.0,0.0
7,"[192.50, 211.50)",84330,0.0785,68147,16183,0.1919,0.0435,0.0001,0.0
8,"[211.50, 234.50)",85280,0.0794,69160,16120,0.189,0.0621,0.0003,0.0
9,"[234.50, 254.50)",62208,0.0579,50725,11483,0.1846,0.0913,0.0005,0.0001


mths_since_last_delinq


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",534918,0.498,432058,102860,0.1923,0.0409,0.0008,0.0001
1,"[-0.50, 7.50)",53934,0.0502,42290,11644,0.2159,-0.1045,0.0006,0.0001
2,"[7.50, 17.50)",100263,0.0933,79147,21116,0.2106,-0.073,0.0005,0.0001
3,"[17.50, 32.50)",129003,0.1201,102667,26336,0.2042,-0.0337,0.0001,0.0
4,"[32.50, inf)",256120,0.2384,204593,51527,0.2012,-0.0153,0.0001,0.0
Totals,,1074238,1.0,860755,213483,0.1987,,0.0021,0.0003


revol_bal


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 6921.50)",307648,0.2864,245797,61851,0.201,-0.0145,0.0001,0.0
1,"[6921.50, 11130.50)",215074,0.2002,171108,43966,0.2044,-0.0354,0.0003,0.0
2,"[11130.50, 12889.50)",73859,0.0688,58467,15392,0.2084,-0.0596,0.0002,0.0
3,"[12889.50, 18403.50)",170289,0.1585,135377,34912,0.205,-0.039,0.0002,0.0
4,"[18403.50, 22403.50)",80584,0.075,64340,16244,0.2016,-0.0178,0.0,0.0
5,"[22403.50, 27577.50)",71393,0.0665,57520,13873,0.1943,0.0279,0.0001,0.0
6,"[27577.50, 42040.50)",96802,0.0901,78982,17820,0.1841,0.0946,0.0008,0.0001
7,"[42040.50, inf)",58589,0.0545,49164,9425,0.1609,0.2575,0.0033,0.0004
Totals,,1074238,1.0,860755,213483,0.1987,,0.005,0.0006


total_acc


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 16.50)",266952,0.2485,212364,54588,0.2045,-0.0358,0.0003,0.0
1,"[16.50, 19.50)",113415,0.1056,90747,22668,0.1999,-0.0071,0.0,0.0
2,"[19.50, 21.50)",77676,0.0723,62241,15435,0.1987,0.0001,0.0,0.0
3,"[21.50, 25.50)",149934,0.1396,120449,29485,0.1967,0.0131,0.0,0.0
4,"[25.50, 27.50)",67996,0.0633,54686,13310,0.1957,0.0188,0.0,0.0
5,"[27.50, 33.50)",164351,0.153,132567,31784,0.1934,0.0339,0.0002,0.0
6,"[33.50, 37.50)",76740,0.0714,61970,14770,0.1925,0.0398,0.0001,0.0
7,"[37.50, 43.50)",74501,0.0694,59978,14523,0.1949,0.024,0.0,0.0
8,"[43.50, inf)",82673,0.077,65753,16920,0.2047,-0.0368,0.0001,0.0
Totals,,1074238,1.0,860755,213483,0.1987,,0.0008,0.0001


total_bal_ex_mort


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 8822.50)",82062,0.0764,67869,14193,0.173,0.1706,0.0021,0.0003
1,"[8822.50, 14610.50)",85891,0.08,69552,16339,0.1902,0.0543,0.0002,0.0
2,"[14610.50, 17931.50)",54193,0.0504,43691,10502,0.1938,0.0313,0.0,0.0
3,"[17931.50, 21809.50)",66191,0.0616,53078,13113,0.1981,0.0039,0.0,0.0
4,"[21809.50, 27577.50)",100012,0.0931,80023,19989,0.1999,-0.0071,0.0,0.0
5,"[27577.50, 41882.50)",218753,0.2036,174167,44586,0.2038,-0.0317,0.0002,0.0
6,"[41882.50, 48221.50)",76541,0.0713,60559,15982,0.2088,-0.0621,0.0003,0.0
7,"[48221.50, 71466.50)",183847,0.1711,145982,37865,0.206,-0.0448,0.0003,0.0
8,"[71466.50, 84462.50)",58342,0.0543,46427,11915,0.2042,-0.0342,0.0001,0.0
9,"[84462.50, 129588.50)",93411,0.087,74676,18735,0.2006,-0.0115,0.0,0.0


In [51]:
# Every variable whose name contains 'bal' (this substring match also picks up num_rev_tl_bal_gt_0)
_has_bal = binning_summary_df['variable'].str.contains('bal')
bal_vars = binning_summary_df.loc[_has_bal, 'variable']

# Pairwise correlation on the training data as-is (imputed values included)
df_train[bal_vars].corr()

# Pairwise correlation with the imputed values excluded
corr_excl_miss(df_train, fmeta, bal_vars)

Unnamed: 0,revol_bal,tot_cur_bal,total_bal_il,max_bal_bc,avg_cur_bal,num_rev_tl_bal_gt_0,total_bal_ex_mort
revol_bal,1.0,0.4326,0.0345,0.2473,0.2902,0.3111,0.4881
tot_cur_bal,0.4326,1.0,0.1887,0.1641,0.8466,0.1351,0.514
total_bal_il,0.0345,0.1887,1.0,0.4543,0.097,0.0173,0.4332
max_bal_bc,0.2473,0.1641,0.4543,1.0,0.1252,0.0546,0.1725
avg_cur_bal,0.2902,0.8466,0.097,0.1252,1.0,-0.1079,0.2951
num_rev_tl_bal_gt_0,0.3111,0.1351,0.0173,0.0546,-0.1079,1.0,0.1592
total_bal_ex_mort,0.4881,0.514,0.4332,0.1725,0.2951,0.1592,1.0


Unnamed: 0,revol_bal,tot_cur_bal,total_bal_il,max_bal_bc,avg_cur_bal,num_rev_tl_bal_gt_0,total_bal_ex_mort
revol_bal,1.0,0.441,0.0831,0.5952,0.2941,0.3391,0.4803
tot_cur_bal,0.441,1.0,0.3867,0.356,0.8363,0.1136,0.5233
total_bal_il,0.0831,0.3867,1.0,0.0953,0.1959,0.0262,0.8977
max_bal_bc,0.5952,0.356,0.0953,1.0,0.2804,0.1223,0.3462
avg_cur_bal,0.2941,0.8363,0.1959,0.2804,1.0,-0.1506,0.2957
num_rev_tl_bal_gt_0,0.3391,0.1136,0.0262,0.1223,-0.1506,1.0,0.1685
total_bal_ex_mort,0.4803,0.5233,0.8977,0.3462,0.2957,0.1685,1.0


In [52]:
# Record treatment: common risk indicators
keep_vars.extend(['mnths_since_earliest_cr_line'])
review_vars.extend(['total_bal_ex_mort', 'mths_since_last_delinq'])
drop_vars.extend(['open_acc', 'total_acc', 'revol_bal'])

# Stamp the decision for every variable currently held in each tracking list (review, keep, then drop)
for _decision, _members in [('review', review_vars), ('keep', keep_vars), ('drop', drop_vars)]:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_members), 'decision'] = _decision

# Rationale recorded next to each decision
_reasons = [
    (['mnths_since_earliest_cr_line'],
     'known risk indicator; low IV, monotonic descending'),
    (['open_acc'],
     'low IV (<0.01),  correlated with num_rev_tl_bal_gt_0 (0.63) and acc_open_past_24mths (0.48)'),
    (['revol_bal'],
     'low IV and non-monotoic, highly correlated with total_rev_hi_lim'),
    (['total_bal_ex_mort'],
     're-bin and review; non-monotonic, bell-shaped bad rate pattern; not highly correlated with other balance vars'),
    (['mths_since_last_delinq'],
     're-bin and review; non-monotonic, descending bad rate except the imputation + 0 bin'),
    (['total_acc'],
     'very weak IV; very small WoE spread'),
]
for _names, _reason in _reasons:
    binning_summary_df.loc[binning_summary_df['variable'].isin(_names), 'reason'] = _reason

<u>Variables with Small Bin Numbers</u>

* **delinq_2yrs**: ❌ Drop
    * very weak IV (0.002) and bad rate spread (2.6%)
    * 3 bins: bad rate goes 19.5% -> 20.5% -> 22.4%, small spread

* **mths_since_last_major_derog**: 🟦 Keep as backup 
    * auto binning generated 3 bins: missing (72%), [0, 62) and [62, inf); ascending pattern, yet low bad rate spread (2.9%)
    * correlated with pub_rec (0.73), pub_rec_bankruptcies(0.69), and tax_liens (0.63) - covers all three

* **mths_since_last_record**: 🟨 Review - see if re-binning increases IV 
    * IV=0.0067, WoE spread 0.256, bad rate spread (4.3%) - the best performing of this group
    * auto binning generated 3 bins: missing (82%), [0, 69) (8%) and [69, inf) (9%); good bad rate separation, goes up with each bin: 19.3%, 21.7%, 23.6%
    * correlated with pub_rec (0.73), pub_rec_bankruptcies (0.79)

* **pub_rec**: ❌ Drop - redundant with mths_since_last_major_derog and mths_since_last_record
    * 6 unique values (0-5, capped), 83% of 0; auto-binning: two bins zero (19.3% bad rate) vs. non-zero (22.7%)
    * correlates with pub_rec_bankruptcies (0.69)
* **pub_rec_bankruptcies**: ❌ Drop - redundant with pub_rec and mths_since_last_major_derog
    * 4 unique values (0-3, capped), 88% of 0 (overlap with all 83% of 0 in pub_rec); auto_binning: two bins zero (19.5% bad rate) vs. non-zero (22.7%)
    * correlates with pub_rec (0.69)
* **tax_liens**: ❌ Drop - high missing; redundant with pub_rec and mths_since_last_major_derog
    * 5 unique values (0-4, capped), 97% of 0 (overlap with all 83% of 0 in pub_rec); auto_binning results in one bin
    * bad rate of 0 values: 19.8%; bad rate of non-zeros: 22.4%
    * correlates with  pub_rec (0.64)
* **num_tl_90g_dpd_24m**: ❌ Drop - low IV, low spread
    * 4 unique values + 1 imputed value, 92% of 0 & 2% of -1;  auto-binning: two bins zero and missing (19.7% bad rate) vs. non-zero (22.1%)
    * Not correlated with any other delinquency variables

In [53]:
# Variables with fewer than 4 bins that have no decision recorded yet, highest IV first
_few_bins = binning_summary_df['bins'].lt(4)
_undecided = binning_summary_df['decision'].isna()
binning_summary_df.loc[_few_bins & _undecided].sort_values('IV', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
10,mths_since_last_record,0.0067,3,0.0815,0.8294,-0.2195,0.0367,0.2562,0.043,ascending,not predictive,< 4,low,numerical,,
12,pub_rec,0.0062,2,0.1706,0.8294,-0.1679,0.0367,0.2046,0.0338,ascending,not predictive,< 4,low,numerical,,
16,mths_since_last_major_derog,0.0046,3,0.0659,0.7285,-0.1364,0.0422,0.1786,0.0293,ascending,not predictive,< 4,minimal,numerical,,
61,pub_rec_bankruptcies,0.0042,2,0.1225,0.8775,-0.1695,0.0251,0.1946,0.0323,ascending,not predictive,< 4,minimal,numerical,,
17,tot_coll_amt,0.0031,3,0.0525,0.8527,-0.1434,0.0235,0.1669,0.0276,non-monotonic,not predictive,< 4,minimal,numerical,,
5,delinq_2yrs,0.0019,3,0.0672,0.8019,-0.1427,0.0186,0.1613,0.0266,ascending,not predictive,< 4,minimal,numerical,,
57,num_tl_90g_dpd_24m,0.0012,2,0.0571,0.9429,-0.1367,0.0086,0.1453,0.024,ascending,not predictive,< 4,minimal,numerical,,
46,mths_since_recent_revol_delinq,0.0005,3,0.084,0.6525,-0.0352,0.0171,0.0523,0.0084,ascending,not predictive,< 4,minimal,numerical,,
62,tax_liens,0.0,1,1.0,1.0,0.0,0.0,0.0,0.0,descending,not predictive,< 4,minimal,numerical,,


In [54]:
# Record treatment
backup_vars.append('mths_since_last_major_derog')
review_vars.append('mths_since_last_record')
drop_vars.extend(['delinq_2yrs', 'pub_rec', 'pub_rec_bankruptcies', 'tax_liens', 'num_tl_90g_dpd_24m'])

# Stamp the decision for every variable in the (cumulative) lists. The order matters:
# a later assignment overwrites an earlier one if a variable sits in more than one list.
for _decision, _members in (
    ('keep - backup', backup_vars),
    ('review', review_vars),
    ('drop', drop_vars),
):
    binning_summary_df.loc[binning_summary_df['variable'].isin(_members), 'decision'] = _decision

# Rationale recorded next to each decision made in this step
for _variable, _reason in {
    'delinq_2yrs': 'very weak low IV (<0.1) and small bad rate spread (2.6%) ',
    'mths_since_last_major_derog': 'backup variables; low IV (<0.1), but covers charge-offs, collections, delinquencies)',
    'mths_since_last_record': 'backup variable, see if re-bin increases IV; low IV (<0.1), but covers public records, bankruptcies and tax liens; re-bin to see if increases IV',
    'pub_rec': 'very weak IV (<0.1); redundant with mths_since_last_record and mths_since_last_major_derog',
    'pub_rec_bankruptcies': 'very weak IV (<0.1); redundant with mths_since_last_record and mths_since_last_major_derog',
    'tax_liens': 'very weak IV (<0.1; redunant with mths_since_last_record)',
    'num_tl_90g_dpd_24m': 'very weak IV (<0.1); very small bad rate spread (2.4%)',
}.items():
    binning_summary_df.loc[binning_summary_df['variable'] == _variable, 'reason'] = _reason

In [55]:
# Check number of unique values
# Names of the low-bin (< 4 bins) variables that are still undecided
lowbin_vars = binning_summary_df.loc[
    lambda d: (d['bins'] < 4) & d['decision'].isnull(),
    'variable',
].tolist()

# Distinct-value count per variable in the training data
nuniq_dict = {name: df_train[name].nunique() for name in lowbin_vars}

nuniq_dict

{'tot_coll_amt': 9684, 'mths_since_recent_revol_delinq': 163}

In [56]:
# For variables with less than 10 unique values - check value distribution
# var_list keeps the names from nuniq_dict whose unique-value count is below 10.
# With the counts shown for this run (9684 and 163) the list is empty, so nothing is displayed.
var_list = [key for key, value in nuniq_dict.items() if value < 10]

# Share of rows per distinct value, ordered by the value itself.
# NOTE(review): the bare expression in the loop body is presumably rendered only because the
# notebook sets InteractiveShell.ast_node_interactivity = "all" - confirm before changing that setting.
for var in var_list:
    df_train[var].value_counts(normalize=True).sort_index()

In [57]:
# For variables with less than 10 unique values - check bad rate by original value 
# Mean of GB_FLAG within each distinct value of the variable, for every name in var_list.
# NOTE(review): like the previous cell, the loop body is a bare expression; it is presumably
# displayed only under the notebook's ast_node_interactivity = "all" setting - confirm.
for var in var_list:
    df_train.groupby(var)['GB_FLAG'].mean()

In [58]:
# Compare pub_rec, pub_rec_bankruptcies and tax_liens
# Copy of the training frame extended with 0/1 flags marking a count above zero
df_copy = df_train.assign(
    f_tax_liens_gt0=df_train['tax_liens'].gt(0).astype(int),
    f_pub_rec_gt0=df_train['pub_rec'].gt(0).astype(int),
    f_pub_rec_bkrptc_gt0=df_train['pub_rec_bankruptcies'].gt(0).astype(int),
)

# pub_rec vs. tax_liens: share of rows and bad rate per flag combination
df_copy.value_counts(subset=['f_pub_rec_gt0', 'f_tax_liens_gt0'], normalize=True)
df_copy.groupby(['f_pub_rec_gt0', 'f_tax_liens_gt0'])['GB_FLAG'].mean()

# pub_rec_bankruptcies vs. tax_liens: share of rows and bad rate per flag combination
df_copy.value_counts(subset=['f_pub_rec_bkrptc_gt0', 'f_tax_liens_gt0'], normalize=True)
df_copy.groupby(['f_pub_rec_bkrptc_gt0', 'f_tax_liens_gt0'])['GB_FLAG'].mean()


f_pub_rec_gt0  f_tax_liens_gt0
0              0                 0.8294
1              0                 0.1365
               1                 0.0341
Name: proportion, dtype: float64

f_pub_rec_gt0  f_tax_liens_gt0
0              0                 0.1930
1              0                 0.2275
               1                 0.2242
Name: GB_FLAG, dtype: float64

f_pub_rec_bkrptc_gt0  f_tax_liens_gt0
0                     0                 0.8501
1                     0                 0.1159
0                     1                 0.0275
1                     1                 0.0066
Name: proportion, dtype: float64

f_pub_rec_bkrptc_gt0  f_tax_liens_gt0
0                     0                 0.1939
                      1                 0.2227
1                     0                 0.2269
                      1                 0.2305
Name: GB_FLAG, dtype: float64

In [59]:
# Check current binning 
# For each remaining low-bin variable, print its name and then show its entry in binning_tables.
# NOTE(review): the table is a bare expression inside the loop; it is presumably rendered only
# because the notebook sets InteractiveShell.ast_node_interactivity = "all" - confirm before
# changing that setting (an explicit display(...) would not depend on it).
for var in lowbin_vars:
    print(var)
    binning_tables[var]

tot_coll_amt


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 28.50)",916042,0.8527,737395,178647,0.195,0.0235,0.0005,0.0001
1,"[28.50, 215.50)",56434,0.0525,43874,12560,0.2226,-0.1434,0.0011,0.0001
2,"[215.50, inf)",101762,0.0947,79486,22276,0.2189,-0.1222,0.0015,0.0002
Totals,,1074238,1.0,860755,213483,0.1987,,0.0031,0.0004


mths_since_recent_revol_delinq


Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, 0.50)",700903,0.6525,563510,137393,0.196,0.0171,0.0002,0.0
1,"[0.50, 53.50)",283132,0.2636,225479,57653,0.2036,-0.0305,0.0002,0.0
2,"[53.50, inf)",90203,0.084,71766,18437,0.2044,-0.0352,0.0001,0.0
Totals,,1074238,1.0,860755,213483,0.1987,,0.0005,0.0001


In [60]:
# Pairwise correlation between the remaining low-bin variables, computed on the stored
# values (i.e. with the imputed values left in)
df_train.loc[:, lowbin_vars].corr()

Unnamed: 0,tot_coll_amt,mths_since_recent_revol_delinq
tot_coll_amt,1.0,0.1244
mths_since_recent_revol_delinq,0.1244,1.0


In [61]:
# Delinquency variables: any name containing 'delinq', 'dlq' or 'dpd', strongest IV first
binning_summary_df.loc[
    lambda d: d['variable'].str.contains('delinq|dlq|dpd')
].sort_values(by='IV', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
9,mths_since_last_delinq,0.0021,5,0.0502,0.498,-0.1045,0.0409,0.1454,0.0236,non-monotonic,not predictive,4-8,minimal,numerical,review,"re-bin and review; non-monotonic, descending b..."
59,pct_tl_nvr_dlq,0.0019,6,0.0549,0.523,-0.0837,0.1013,0.185,0.0293,non-monotonic,not predictive,4-8,minimal,numerical,,
5,delinq_2yrs,0.0019,3,0.0672,0.8019,-0.1427,0.0186,0.1613,0.0266,ascending,not predictive,< 4,minimal,numerical,drop,very weak low IV (<0.1) and small bad rate spr...
44,mths_since_recent_bc_dlq,0.0012,4,0.0505,0.7525,-0.1096,0.0179,0.1275,0.0209,non-monotonic,not predictive,4-8,minimal,numerical,,
57,num_tl_90g_dpd_24m,0.0012,2,0.0571,0.9429,-0.1367,0.0086,0.1453,0.024,ascending,not predictive,< 4,minimal,numerical,drop,very weak IV (<0.1); very small bad rate sprea...
46,mths_since_recent_revol_delinq,0.0005,3,0.084,0.6525,-0.0352,0.0171,0.0523,0.0084,ascending,not predictive,< 4,minimal,numerical,,


In [62]:
# Delinquency variables correlation
# Names containing 'delinq', 'dlq' or 'dpd'
dlq_vars = binning_summary_df.loc[
    lambda d: d['variable'].str.contains('delinq|dlq|dpd'),
    'variable',
]

# Correlation with imputed value
df_train[dlq_vars].corr()

# Correlation excluding imputation
corr_excl_miss(df_train, fmeta, dlq_vars)

Unnamed: 0,delinq_2yrs,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_revol_delinq,num_tl_90g_dpd_24m,pct_tl_nvr_dlq
delinq_2yrs,1.0,-0.0796,0.066,0.0253,0.5555,-0.196
mths_since_last_delinq,-0.0796,1.0,0.5058,0.6502,-0.017,-0.151
mths_since_recent_bc_dlq,0.066,0.5058,1.0,0.7865,0.0646,-0.1414
mths_since_recent_revol_delinq,0.0253,0.6502,0.7865,1.0,0.0387,-0.1679
num_tl_90g_dpd_24m,0.5555,-0.017,0.0646,0.0387,1.0,0.2519
pct_tl_nvr_dlq,-0.196,-0.151,-0.1414,-0.1679,0.2519,1.0


Unnamed: 0,delinq_2yrs,mths_since_last_delinq,mths_since_recent_bc_dlq,mths_since_recent_revol_delinq,num_tl_90g_dpd_24m,pct_tl_nvr_dlq
delinq_2yrs,1.0,-0.6096,-0.5177,-0.5653,0.5623,-0.3005
mths_since_last_delinq,-0.6096,1.0,0.7705,0.8677,-0.28,0.2003
mths_since_recent_bc_dlq,-0.5177,0.7705,1.0,0.8901,-0.2033,0.0818
mths_since_recent_revol_delinq,-0.5653,0.8677,0.8901,1.0,-0.2098,0.1308
num_tl_90g_dpd_24m,0.5623,-0.28,-0.2033,-0.2098,1.0,-0.2156
pct_tl_nvr_dlq,-0.3005,0.2003,0.0818,0.1308,-0.2156,1.0


<u>Remaining Variables</u>

* **mo_sin_old_rev_tl_op** 🟨 Re-bin - combine the first two bins to see if that makes it monotonic, then compare vs. mnths_since_earliest_cr_line
    * highly correlated with `mnths_since_earliest_cr_line` (0.88) - has better quality.
    * IV = 0.018 (vs. 0.011), WoE spread = 0.45, bad rate spread = 7.3% (vs. 5.4%)
    * the 1st bin has a moderate bad rate; the remaining bins are descending

* **open_act_il** ❌ Drop
    * Redundant with open_acc (review) and all_util (corr 0.77) - keep the better-performing open_il_12m or open_il_24m if an il variable must be kept

* **total_bal_il** 🟦 Backup - check corr with other variables 
    * IV = 0.016, WoE spread = 0.32, bad rate spread 5.3% (18.6% - 23.9%)
    * not strongly correlated with other balance / limit variables 

* **total_il_high_credit_limit**  ❌ Drop
    * very weak IV (0.005) and WoE spread (0.26), non-monotonic, strongly correlated with total_bal_ex_mort (0.87)

* **total_cu_tl** ❌ Drop
    * IV = 0.016, WoE spread = 0.29, bad rate spread = 4.8%
    * 4 bins with 2 bins of similar bad rate (bin 2: 22.9%, bin 3: 22.4%)
    * in the same missing block as total_bal_il

* **num_op_rev_tl**:  ❌ Drop
    * IV = 0.01, WoE spread=0.39, 12 bins, bad rate spread = 6.4%
    * high correlation with `num_rev_tl_bal_gt_0` (0.81) and `open_acc` (0.80)

* **num_actv_bc_tl**: ❌ Drop
    * similar profile to `num_op_rev_tl` (corr = 0.68)
    * high correlation with `num_rev_tl_bal_gt_0` (0.79)

* **num_sats**: 🟨 Re-bin and review
    * IV = 0.01, WoE spread = 0.37, bad rate spread = 6.0%
    * Correlation with `num_rev_tl_bal_gt_0` (0.67) and `num_op_rev_tl`(0.82 dropped)
    * ascending pattern over 12 bins, but small increments - try rebinning to fewer bins

In [63]:
# months-variables: names containing 'mnths_', 'mths_' or 'mo_s'
mnth_vars = binning_summary_df.loc[
    lambda d: d['variable'].str.contains('mnths_|mths_|mo_s'),
    'variable',
]

# Binning summary of that group, strongest IV first
binning_summary_df[binning_summary_df['variable'].isin(mnth_vars)].sort_values(by='IV', ascending=False)

# Pairwise correlation on the stored values.
# For the variant that excludes imputed values, call corr_excl_miss(df_train, fmeta, mnth_vars).
df_train[mnth_vars].corr()

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
41,mo_sin_rcnt_tl,0.0349,11,0.0522,0.2352,-0.1862,0.4972,0.6834,0.0989,descending,weak,9-15,medium,numerical,keep,monotonic descending; good WOE & BR spread
43,mths_since_recent_bc,0.0278,13,0.0512,0.1444,-0.2047,0.3964,0.6011,0.0903,descending,weak,9-15,medium,numerical,keep,monotonic descending; good WOE & BR spread
40,mo_sin_rcnt_rev_tl_op,0.0271,13,0.0514,0.1853,-0.1691,0.4604,0.6295,0.0917,descending,weak,9-15,medium,numerical,keep,monotonic descending; good WOE & BR spread
23,mths_since_rcnt_il,0.0231,5,0.0591,0.7219,-0.4251,0.0786,0.5037,0.0886,non-monotonic,weak,4-8,medium,numerical,keep,non-monotonic due to imputed value -1; good ba...
39,mo_sin_old_rev_tl_op,0.0184,11,0.0511,0.1685,-0.259,0.1921,0.4511,0.0733,non-monotonic,not predictive,9-15,low,numerical,,
45,mths_since_recent_inq,0.0154,2,0.106,0.894,-0.04,0.3857,0.4257,0.0609,ascending,not predictive,< 4,low,numerical,review,re-bin and review; only 2 bins (missing vs. no...
67,mnths_since_earliest_cr_line,0.0113,12,0.0535,0.1807,-0.1735,0.1634,0.3369,0.0538,descending,not predictive,9-15,low,numerical,keep,"known risk indicator; low IV, monotonic descen..."
10,mths_since_last_record,0.0067,3,0.0815,0.8294,-0.2195,0.0367,0.2562,0.043,ascending,not predictive,< 4,low,numerical,review,"backup variable, see if re-bin increases IV; l..."
16,mths_since_last_major_derog,0.0046,3,0.0659,0.7285,-0.1364,0.0422,0.1786,0.0293,ascending,not predictive,< 4,minimal,numerical,keep - backup,"backup variables; low IV (<0.1), but covers ch..."
38,mo_sin_old_il_acct,0.004,11,0.0501,0.2128,-0.1027,0.1004,0.2031,0.0324,non-monotonic,not predictive,9-15,low,numerical,,


Unnamed: 0,mths_since_last_delinq,mths_since_last_record,mths_since_last_major_derog,mths_since_rcnt_il,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,mnths_since_earliest_cr_line
mths_since_last_delinq,1.0,0.0261,0.5553,0.0074,0.1249,0.0821,-0.0389,-0.0485,-0.0281,0.5058,-0.0127,0.6502,0.0895
mths_since_last_record,0.0261,1.0,0.049,0.0055,0.0605,0.0489,-0.0631,-0.0521,-0.0454,0.0041,-0.0195,-0.0025,0.0587
mths_since_last_major_derog,0.5553,0.049,1.0,0.017,0.1342,0.1055,-0.0458,-0.0517,-0.0359,0.4553,-0.024,0.4654,0.0991
mths_since_rcnt_il,0.0074,0.0055,0.017,1.0,0.0621,0.0352,0.0252,0.1127,0.0101,-0.0,0.0068,0.0026,0.0306
mo_sin_old_il_acct,0.1249,0.0605,0.1342,0.0621,1.0,0.2888,0.0729,0.0293,0.0595,0.093,0.008,0.0832,0.3349
mo_sin_old_rev_tl_op,0.0821,0.0489,0.1055,0.0352,0.2888,1.0,0.1031,0.0821,0.108,0.1316,0.0079,0.1214,0.8813
mo_sin_rcnt_rev_tl_op,-0.0389,-0.0631,-0.0458,0.0252,0.0729,0.1031,1.0,0.629,0.5958,-0.0286,0.0468,-0.0342,0.0617
mo_sin_rcnt_tl,-0.0485,-0.0521,-0.0517,0.1127,0.0293,0.0821,0.629,1.0,0.38,-0.0312,0.0716,-0.0388,0.0343
mths_since_recent_bc,-0.0281,-0.0454,-0.0359,0.0101,0.0595,0.108,0.5958,0.38,1.0,-0.0242,0.0256,-0.0208,0.0906
mths_since_recent_bc_dlq,0.5058,0.0041,0.4553,-0.0,0.093,0.1316,-0.0286,-0.0312,-0.0242,1.0,-0.0118,0.7865,0.1026


In [64]:
# Filter out the potential ones:
# undecided 'not predictive' variables with IV greater than 0.015 or WoE spread greater than 0.3
binning_summary_df.loc[
    lambda d: d['iv_class'].eq('not predictive')
    & d['decision'].isnull()
    & (d['IV'].gt(0.015) | d['woe_spread'].gt(0.3))
].sort_values(by='IV', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
39,mo_sin_old_rev_tl_op,0.0184,11,0.0511,0.1685,-0.259,0.1921,0.4511,0.0733,non-monotonic,not predictive,9-15,low,numerical,,
20,open_act_il,0.0168,4,0.0695,0.7146,-0.2513,0.0824,0.3337,0.0559,ascending,not predictive,4-8,low,numerical,,
24,total_bal_il,0.0166,5,0.0514,0.7146,-0.2327,0.0824,0.3151,0.0525,ascending,not predictive,4-8,low,numerical,,
32,total_cu_tl,0.0158,4,0.0601,0.7146,-0.2095,0.0824,0.2919,0.0483,non-monotonic,not predictive,4-8,low,numerical,,
53,num_op_rev_tl,0.0102,12,0.0524,0.1076,-0.2232,0.1697,0.3929,0.0636,ascending,not predictive,9-15,low,numerical,,
48,num_actv_bc_tl,0.0101,6,0.0576,0.3463,-0.2612,0.0959,0.3571,0.0597,ascending,not predictive,4-8,low,numerical,,
56,num_sats,0.008,12,0.0506,0.1339,-0.1981,0.1745,0.3726,0.0598,ascending,not predictive,9-15,low,numerical,,


In [65]:
# instalment related variables (il)
# 'il' is matched as a plain substring, so the utilisation variables (revol_util, bc_util,
# all_util, ...) are picked up as well as the instalment ones.
il_vars = binning_summary_df.loc[
    lambda d: d['variable'].str.contains('il'),
    'variable',
]

# Binning summary of that group, strongest IV first
binning_summary_df[binning_summary_df['variable'].isin(il_vars)].sort_values(by='IV', ascending=False)

# Pairwise correlation on the stored values.
# For the variant that excludes imputed values, call corr_excl_miss(df_train, fmeta, il_vars).
df_train[il_vars].corr()

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
29,all_util,0.0292,5,0.051,0.7707,-0.4297,0.0909,0.5206,0.0914,ascending,weak,4-8,medium,numerical,keep - backup,backup variable; umbrella utility rate variabl...
21,open_il_12m,0.0241,4,0.0537,0.7146,-0.4769,0.0824,0.5593,0.0996,ascending,weak,4-8,medium,numerical,drop,"redundant with other account open variables, ..."
37,bc_util,0.0234,13,0.0517,0.1118,-0.3066,0.2322,0.5388,0.0878,ascending,weak,9-15,medium,numerical,keep,highest WoE spread in the utlity variable grou...
22,open_il_24m,0.0233,5,0.0601,0.7146,-0.4281,0.0824,0.5105,0.0897,ascending,weak,4-8,medium,numerical,drop,"redundant with other account open variables, ..."
23,mths_since_rcnt_il,0.0231,5,0.0591,0.7219,-0.4251,0.0786,0.5037,0.0886,non-monotonic,weak,4-8,medium,numerical,keep,non-monotonic due to imputed value -1; good ba...
25,il_util,0.0227,4,0.0518,0.7962,-0.3741,0.0739,0.448,0.0778,ascending,weak,4-8,low,numerical,drop,weak IV; high correlation with all_util
14,revol_util,0.0183,12,0.0503,0.1555,-0.1813,0.3295,0.5108,0.0778,ascending,not predictive,9-15,medium,numerical,drop,weak IV; high correlation with bc_util
20,open_act_il,0.0168,4,0.0695,0.7146,-0.2513,0.0824,0.3337,0.0559,ascending,not predictive,4-8,low,numerical,,
24,total_bal_il,0.0166,5,0.0514,0.7146,-0.2327,0.0824,0.3151,0.0525,ascending,not predictive,4-8,low,numerical,,
66,total_il_high_credit_limit,0.0055,4,0.0551,0.7209,-0.092,0.1751,0.2671,0.0415,non-monotonic,not predictive,4-8,low,numerical,,


Unnamed: 0,revol_util,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,all_util,bc_util,mo_sin_old_il_acct,num_il_tl,total_il_high_credit_limit
revol_util,1.0,-0.0632,-0.1133,-0.1075,-0.0352,-0.0502,-0.0959,0.0291,0.8005,0.0401,0.0121,0.0274
open_act_il,-0.0632,1.0,0.7162,0.7382,0.3035,0.7464,0.7751,0.772,-0.0507,0.0836,0.273,0.2651
open_il_12m,-0.1133,0.7162,1.0,0.9218,0.2806,0.6282,0.8307,0.8174,-0.0977,0.0479,0.143,0.1462
open_il_24m,-0.1075,0.7382,0.9218,1.0,0.2365,0.6552,0.8218,0.7991,-0.0919,0.0552,0.1882,0.1802
mths_since_rcnt_il,-0.0352,0.3035,0.2806,0.2365,1.0,0.1804,0.2982,0.4883,-0.0306,0.0621,-0.0725,-0.0745
total_bal_il,-0.0502,0.7464,0.6282,0.6552,0.1804,1.0,0.6516,0.6364,-0.044,0.1115,0.2898,0.4745
il_util,-0.0959,0.7751,0.8307,0.8218,0.2982,0.6516,1.0,0.8913,-0.0779,0.0548,0.132,0.1453
all_util,0.0291,0.772,0.8174,0.7991,0.4883,0.6364,0.8913,1.0,0.0259,0.043,0.0939,0.1118
bc_util,0.8005,-0.0507,-0.0977,-0.0919,-0.0306,-0.044,-0.0779,0.0259,1.0,0.0656,0.0297,0.0292
mo_sin_old_il_acct,0.0401,0.0836,0.0479,0.0552,0.0621,0.1115,0.0548,0.043,0.0656,1.0,0.4167,0.2839


In [66]:
# account related variables (act): names containing the substring 'op' or 'ac'
act_vars = binning_summary_df.loc[
    lambda d: d['variable'].str.contains('op|ac'),
    'variable',
]

# Binning summary of that group, strongest IV first
binning_summary_df[binning_summary_df['variable'].isin(act_vars)].sort_values(by='IV', ascending=False)

# Pairwise correlation on the stored values.
# For the variant that excludes imputed values, call corr_excl_miss(df_train, fmeta, act_vars).
df_train[act_vars].corr()

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
34,acc_open_past_24mths,0.0821,9,0.0697,0.1475,-0.5516,0.4403,0.9919,0.1633,ascending,weak,9-15,high,numerical,keep,"monotonic ascending; iv>=0.6, good WOE & BR sp..."
58,num_tl_op_past_12m,0.0599,6,0.0894,0.2431,-0.4484,0.3346,0.783,0.129,ascending,weak,4-8,medium,numerical,keep - backup,"backup variable; monotonic ascending; iv>=0.6,..."
36,bc_open_to_buy,0.0537,14,0.05,0.1408,-0.2423,0.6573,0.8996,0.1262,descending,weak,9-15,high,numerical,keep,monotonic descending; good WOE & BR spread
42,mort_acc,0.0343,6,0.0548,0.4065,-0.1802,0.3891,0.5693,0.0851,descending,weak,4-8,medium,numerical,keep,monotonic descending; good WOE & BR spread
27,open_rv_24m,0.0318,5,0.0543,0.7515,-0.5157,0.086,0.6017,0.1081,ascending,weak,4-8,medium,numerical,keep - backup,backup variable; monotonic ascending; large mi...
49,num_actv_rev_tl,0.0305,10,0.0585,0.1508,-0.3671,0.2457,0.6128,0.1011,ascending,weak,9-15,medium,numerical,drop,redundant: strong correlation with num_rev_tl_...
40,mo_sin_rcnt_rev_tl_op,0.0271,13,0.0514,0.1853,-0.1691,0.4604,0.6295,0.0917,descending,weak,9-15,medium,numerical,keep,monotonic descending; good WOE & BR spread
26,open_rv_12m,0.0252,5,0.0536,0.7146,-0.4748,0.0824,0.5572,0.0992,ascending,weak,4-8,medium,numerical,drop,"redundant with other account open variables, ..."
19,open_acc_6m,0.0243,4,0.0802,0.7146,-0.4262,0.0824,0.5086,0.0894,ascending,weak,4-8,medium,numerical,drop,"redundant with other account open variables, ..."
21,open_il_12m,0.0241,4,0.0537,0.7146,-0.4769,0.0824,0.5593,0.0996,ascending,weak,4-8,medium,numerical,drop,"redundant with other account open variables, ..."


Unnamed: 0,open_acc,total_acc,open_acc_6m,open_act_il,open_il_12m,open_il_24m,open_rv_12m,open_rv_24m,acc_open_past_24mths,bc_open_to_buy,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mort_acc,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_op_rev_tl,num_rev_accts,num_tl_op_past_12m
open_acc,1.0,0.6983,0.1122,0.2171,0.0748,0.1003,0.1437,0.182,0.481,0.2902,0.165,0.1411,-0.2225,0.1205,0.0227,0.5161,0.6329,0.7954,0.6175,0.3535
total_acc,0.6983,1.0,0.0669,0.1287,0.0625,0.0969,0.0703,0.0935,0.421,0.2206,0.3558,0.2779,-0.1568,0.371,0.1527,0.2744,0.3764,0.5294,0.7316,0.3163
open_acc_6m,0.1122,0.0669,1.0,0.6361,0.8067,0.7645,0.8698,0.8163,0.2559,0.0782,0.036,0.0137,-0.1142,-0.0132,0.0582,0.0559,0.0749,0.1144,0.0643,0.3175
open_act_il,0.2171,0.1287,0.6361,1.0,0.7162,0.7382,0.58,0.5729,0.1341,0.0298,0.0836,-0.0197,-0.0045,-0.0354,0.0613,0.0094,0.0063,0.021,-0.0152,0.1234
open_il_12m,0.0748,0.0625,0.8067,0.7162,1.0,0.9218,0.6891,0.6833,0.2188,0.0463,0.0479,0.0105,-0.0182,-0.0125,0.0509,0.0045,0.0068,0.0349,0.004,0.2627
open_il_24m,0.1003,0.0969,0.7645,0.7382,0.9218,1.0,0.6653,0.6675,0.2707,0.0447,0.0552,0.005,-0.0219,-0.0049,0.0473,0.0059,0.0101,0.0417,0.0098,0.2308
open_rv_12m,0.1437,0.0703,0.8698,0.58,0.6891,0.6653,1.0,0.9156,0.2943,0.0911,0.0245,0.0094,-0.154,-0.0325,0.0619,0.0998,0.1288,0.175,0.1083,0.3643
open_rv_24m,0.182,0.0935,0.8163,0.5729,0.6833,0.6675,0.9156,1.0,0.3662,0.096,0.0186,-0.0009,-0.1538,-0.0384,0.0607,0.1247,0.1614,0.2205,0.1429,0.3061
acc_open_past_24mths,0.481,0.421,0.2559,0.1341,0.2188,0.2707,0.2943,0.3662,1.0,0.1007,0.0662,-0.0192,-0.388,0.0717,0.091,0.2237,0.3425,0.4466,0.3771,0.7389
bc_open_to_buy,0.2902,0.2206,0.0782,0.0298,0.0463,0.0447,0.0911,0.096,0.1007,1.0,0.0401,0.1877,-0.0559,0.1332,-0.0756,0.2464,0.1159,0.3443,0.2994,0.0934


In [67]:
# 'num' vars correlation with ones kept
var_list = [
    'num_actv_bc_tl',
    'num_op_rev_tl',
    'num_sats',
    'num_tl_op_past_12m',
    'num_actv_rev_tl',
    'num_rev_tl_bal_gt_0',
]

# Binning summary rows for these variables (in summary-table order)
binning_summary_df.loc[lambda d: d['variable'].isin(var_list)]

# Pairwise correlation on the stored values.
# For the variant that excludes imputed values, call corr_excl_miss(df_train, fmeta, var_list).
df_train.loc[:, var_list].corr()

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
48,num_actv_bc_tl,0.0101,6,0.0576,0.3463,-0.2612,0.0959,0.3571,0.0597,ascending,not predictive,4-8,low,numerical,,
49,num_actv_rev_tl,0.0305,10,0.0585,0.1508,-0.3671,0.2457,0.6128,0.1011,ascending,weak,9-15,medium,numerical,drop,redundant: strong correlation with num_rev_tl_...
53,num_op_rev_tl,0.0102,12,0.0524,0.1076,-0.2232,0.1697,0.3929,0.0636,ascending,not predictive,9-15,low,numerical,,
55,num_rev_tl_bal_gt_0,0.03,10,0.0524,0.1518,-0.4019,0.2483,0.6502,0.1084,ascending,weak,9-15,medium,numerical,keep,monotonic ascending; decent bin range and WoE ...
56,num_sats,0.008,12,0.0506,0.1339,-0.1981,0.1745,0.3726,0.0598,ascending,not predictive,9-15,low,numerical,,
58,num_tl_op_past_12m,0.0599,6,0.0894,0.2431,-0.4484,0.3346,0.783,0.129,ascending,weak,4-8,medium,numerical,keep - backup,"backup variable; monotonic ascending; iv>=0.6,..."


Unnamed: 0,num_actv_bc_tl,num_op_rev_tl,num_sats,num_tl_op_past_12m,num_actv_rev_tl,num_rev_tl_bal_gt_0
num_actv_bc_tl,1.0,0.6843,0.553,0.2252,0.821,0.8163
num_op_rev_tl,0.6843,1.0,0.8188,0.3853,0.8161,0.8199
num_sats,0.553,0.8188,1.0,0.3854,0.6631,0.6677
num_tl_op_past_12m,0.2252,0.3853,0.3854,1.0,0.3182,0.3071
num_actv_rev_tl,0.821,0.8161,0.6631,0.3182,1.0,0.9851
num_rev_tl_bal_gt_0,0.8163,0.8199,0.6677,0.3071,0.9851,1.0


In [68]:
# balance & limit related variables: names containing the substring 'bal' or 'lim'
bal_vars = binning_summary_df.loc[
    lambda d: d['variable'].str.contains('bal|lim'),
    'variable',
]

# Binning summary of that group, strongest IV first
binning_summary_df[binning_summary_df['variable'].isin(bal_vars)].sort_values(by='IV', ascending=False)

# Pairwise correlation on the stored values.
# For the variant that excludes imputed values, call corr_excl_miss(df_train, fmeta, bal_vars).
df_train[bal_vars].corr()

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
35,avg_cur_bal,0.0506,14,0.0506,0.1792,-0.21,0.6272,0.8372,0.1173,non-monotonic,weak,9-15,high,numerical,drop,"redundant, strong correlation with tot_hi_cred..."
63,tot_hi_cred_lim,0.043,16,0.0501,0.105,-0.203,0.5373,0.7403,0.1064,non-monotonic,weak,> 15,medium,numerical,review,re-bin to smaller bin number; non-monotonic du...
65,total_bc_limit,0.0401,13,0.0505,0.1588,-0.1943,0.5878,0.7821,0.1104,descending,weak,9-15,medium,numerical,drop,highly correlated with bc_open_to_buy (0.83) a...
18,tot_cur_bal,0.0392,15,0.0506,0.0977,-0.2254,0.4912,0.7166,0.1053,non-monotonic,weak,9-15,medium,numerical,drop,"redundant, strong correlation with tot_hi_cred..."
55,num_rev_tl_bal_gt_0,0.03,10,0.0524,0.1518,-0.4019,0.2483,0.6502,0.1084,ascending,weak,9-15,medium,numerical,keep,monotonic ascending; decent bin range and WoE ...
30,total_rev_hi_lim,0.0263,10,0.0501,0.3401,-0.1077,0.4832,0.5909,0.0837,descending,weak,9-15,medium,numerical,keep - backup,monotonic descending; good WOE & BR spread; mo...
28,max_bal_bc,0.0209,5,0.05,0.7146,-0.3184,0.0824,0.4008,0.0684,non-monotonic,weak,4-8,low,numerical,keep - backup,non-monotonic due to imputed value -1; decent ...
24,total_bal_il,0.0166,5,0.0514,0.7146,-0.2327,0.0824,0.3151,0.0525,ascending,not predictive,4-8,low,numerical,,
66,total_il_high_credit_limit,0.0055,4,0.0551,0.7209,-0.092,0.1751,0.2671,0.0415,non-monotonic,not predictive,4-8,low,numerical,,
13,revol_bal,0.005,8,0.0545,0.2864,-0.0596,0.2575,0.3171,0.0475,non-monotonic,not predictive,4-8,low,numerical,drop,"low IV and non-monotoic, highly correlated wit..."


Unnamed: 0,revol_bal,tot_cur_bal,total_bal_il,max_bal_bc,total_rev_hi_lim,avg_cur_bal,num_rev_tl_bal_gt_0,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
revol_bal,1.0,0.4326,0.0345,0.2473,0.804,0.2902,0.3111,0.458,0.4881,0.5296,0.1173
tot_cur_bal,0.4326,1.0,0.1887,0.1641,0.4252,0.8466,0.1351,0.9856,0.514,0.3232,0.4177
total_bal_il,0.0345,0.1887,1.0,0.4543,0.0685,0.097,0.0173,0.1844,0.4332,0.0542,0.4745
max_bal_bc,0.2473,0.1641,0.4543,1.0,0.2495,0.1252,0.0546,0.1804,0.1725,0.271,0.1064
total_rev_hi_lim,0.804,0.4252,0.0685,0.2495,1.0,0.231,0.3851,0.5085,0.4164,0.8027,0.1412
avg_cur_bal,0.2902,0.8466,0.097,0.1252,0.231,1.0,-0.1079,0.8173,0.2951,0.1562,0.2213
num_rev_tl_bal_gt_0,0.3111,0.1351,0.0173,0.0546,0.3851,-0.1079,1.0,0.1649,0.1592,0.281,0.0688
tot_hi_cred_lim,0.458,0.9856,0.1844,0.1804,0.5085,0.8173,0.1649,1.0,0.507,0.4057,0.4187
total_bal_ex_mort,0.4881,0.514,0.4332,0.1725,0.4164,0.2951,0.1592,0.507,1.0,0.2964,0.8707
total_bc_limit,0.5296,0.3232,0.0542,0.271,0.8027,0.1562,0.281,0.4057,0.2964,1.0,0.1134


In [69]:
# Record treatment
review_vars.extend(['mo_sin_old_rev_tl_op', 'num_sats'])
backup_vars.extend(['total_bal_il'])
drop_vars.extend(['open_act_il', 'total_il_high_credit_limit', 'total_cu_tl', 'num_op_rev_tl', 'num_actv_bc_tl'])

# Stamp the decision for every variable in the (cumulative) lists. The order matters:
# a later assignment overwrites an earlier one if a variable sits in more than one list.
for _decision, _members in (
    ('keep - backup', backup_vars),
    ('review', review_vars),
    ('drop', drop_vars),
):
    binning_summary_df.loc[binning_summary_df['variable'].isin(_members), 'decision'] = _decision

# Rationale recorded next to each decision made in this step
for _variable, _reason in {
    'total_bal_il': 'weak IV and spread, but not strongly correlated with other balance and limit variable',
    'mo_sin_old_rev_tl_op': 'similar profile but better quality than mnths_since_earliest_cr_line, keep one; non-monotonic, try combine the first two bins',
    'num_sats': 'capture different patterns to num_rev_tl_bal_gt_0; small bad rate increments among bins, try more coarse bins',
    'open_act_il': 'weak IV, highly correlated to open_il_12m and open_il_24m - keep one of those if must keep a il account number variable',
    'total_il_high_credit_limit': 'very weak IV, non-monotonic, strongly correlated with total_bal_ex_mort',
    'total_cu_tl': 'very weak IV, non-monotonic, in the same concurrent missing bloack as total_bal_il',
    'num_op_rev_tl': 'very weak IV, highly correlated with num_rev_tl_bal_gt_0',
    'num_actv_bc_tl': 'very weak IV, highly correlated with num_rev_tl_bal_gt_0',
}.items():
    binning_summary_df.loc[binning_summary_df['variable'] == _variable, 'reason'] = _reason

In [70]:
# Mark the rest as drop
# Capture the still-undecided rows once, *before* any assignment. Once 'decision' has been
# filled, decision.isnull() matches nothing, so re-evaluating the mask for the 'reason'
# assignment would silently leave the reason of every one of these rows unset.
undecided_mask = binning_summary_df['decision'].isnull()

# Keep the running drop list in step with the summary table
drop_vars.extend(binning_summary_df.loc[undecided_mask, 'variable'].tolist())

binning_summary_df.loc[undecided_mask, 'decision'] = 'drop'
binning_summary_df.loc[undecided_mask, 'reason'] = 'very weak IV (<0.01) and WoE spread (<0.2), not predictive'

### 3.3 Manual Binning

In [73]:
# Variables flagged for manual re-binning ('review'), strongest IV first
binning_summary_df.loc[
    lambda d: d['decision'].eq('review')
].sort_values(by='IV', ascending=False)

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason
63,tot_hi_cred_lim,0.043,16,0.0501,0.105,-0.203,0.5373,0.7403,0.1064,non-monotonic,weak,> 15,medium,numerical,review,re-bin to smaller bin number; non-monotonic du...
39,mo_sin_old_rev_tl_op,0.0184,11,0.0511,0.1685,-0.259,0.1921,0.4511,0.0733,non-monotonic,not predictive,9-15,low,numerical,review,similar profile but better quality than mnths_...
45,mths_since_recent_inq,0.0154,2,0.106,0.894,-0.04,0.3857,0.4257,0.0609,ascending,not predictive,< 4,low,numerical,review,re-bin and review; only 2 bins (missing vs. no...
68,emp_length_num,0.0093,6,0.0556,0.3322,-0.3548,0.0652,0.42,0.0728,descending,not predictive,4-8,low,numerical,review,re-bin and review; the middle three bins of ha...
56,num_sats,0.008,12,0.0506,0.1339,-0.1981,0.1745,0.3726,0.0598,ascending,not predictive,9-15,low,numerical,review,capture different patterns to num_rev_tl_bal_g...
10,mths_since_last_record,0.0067,3,0.0815,0.8294,-0.2195,0.0367,0.2562,0.043,ascending,not predictive,< 4,low,numerical,review,"backup variable, see if re-bin increases IV; l..."
64,total_bal_ex_mort,0.0036,11,0.0504,0.2036,-0.0621,0.1706,0.2327,0.0358,non-monotonic,not predictive,9-15,low,numerical,review,"re-bin and review; non-monotonic, bell-shaped ..."
9,mths_since_last_delinq,0.0021,5,0.0502,0.498,-0.1045,0.0409,0.1454,0.0236,non-monotonic,not predictive,4-8,minimal,numerical,review,"re-bin and review; non-monotonic, descending b..."


In [74]:
# tot_hi_cred_lim: manual bin edges, open-ended on both sides
cust_bins_tot_hi_cred_lim = [
    -np.inf,
    13500, 32500, 55500,
    150000, 200000, 250000, 450000,
    np.inf,
]

def custom_binner(var, cust_bins, X, y):
    """Bin one variable with user-supplied cut points via ``sc.woebin``.

    Parameters
    ----------
    var : str
        Column of ``X`` to bin.
    cust_bins : sequence
        Bin edges. The first and last entries (written as -inf / inf by the
        callers in this notebook) are removed before the edges are passed on.
    X : DataFrame
        Frame containing the ``var`` column.
    y : Series
        Target values; cast to int and paired with ``X[var]``.

    Returns
    -------
    The object returned by ``sc.woebin`` (callers index it by variable name).
    """
    # Only the interior break points are passed to woebin
    inner_breaks = cust_bins[1:-1]

    # Variable + integer target; rows with a missing value in either column are dropped
    frame = pd.DataFrame({var: X[var], 'target': y.astype(int)})
    frame = frame.dropna()

    # Enforce the supplied breaks for this variable
    return sc.woebin(frame, y='target', breaks_list={var: inner_breaks})

# Bin on the training set with the manual edges, then display the table for this variable
woe_table_tot_hi_cred_lim = custom_binner('tot_hi_cred_lim', cust_bins_tot_hi_cred_lim, X_train, y_train)
woe_table_tot_hi_cred_lim['tot_hi_cred_lim']

[INFO] creating woe binning ...


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,tot_hi_cred_lim,"[-inf,13500.0)",55225,0.0514,44641,10584,0.1917,-0.0451,0.0001,0.0413,13500.0,False
1,tot_hi_cred_lim,"[13500.0,32500.0)",114101,0.1062,88508,25593,0.2243,0.1535,0.0026,0.0413,32500.0,False
2,tot_hi_cred_lim,"[32500.0,55500.0)",165324,0.1539,127274,38050,0.2302,0.1868,0.0057,0.0413,55500.0,False
3,tot_hi_cred_lim,"[55500.0,150000.0)",301376,0.2805,234728,66648,0.2211,0.1353,0.0053,0.0413,150000.0,False
4,tot_hi_cred_lim,"[150000.0,200000.0)",94562,0.088,76134,18428,0.1949,-0.0244,0.0001,0.0413,200000.0,False
5,tot_hi_cred_lim,"[200000.0,250000.0)",86099,0.0801,70808,15291,0.1776,-0.1385,0.0015,0.0413,250000.0,False
6,tot_hi_cred_lim,"[250000.0,450000.0)",188482,0.1755,158620,29862,0.1584,-0.2757,0.0122,0.0413,450000.0,False
7,tot_hi_cred_lim,"[450000.0,inf)",69069,0.0643,60042,9027,0.1307,-0.5006,0.0138,0.0413,inf,False


In [75]:
# mo_sin_old_rev_tl_op
# Manual bin edges for the re-bin (7 bins); the resulting WoE table is displayed for review
cust_bins_mo_sin_old_rev_tl_op = [-np.inf, 78.50, 131.50, 145.50, 164.50, 233.50, 264.50, np.inf]

woe_table_mo_sin_old_rev_tl_op = custom_binner('mo_sin_old_rev_tl_op', cust_bins_mo_sin_old_rev_tl_op, X_train, y_train)
woe_table_mo_sin_old_rev_tl_op['mo_sin_old_rev_tl_op']

[INFO] creating woe binning ...


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,mo_sin_old_rev_tl_op,"[-inf,78.5)",129070,0.1202,98541,30529,0.2365,0.2225,0.0063,0.0181,78.5,False
1,mo_sin_old_rev_tl_op,"[78.5,131.5)",237918,0.2215,186090,51828,0.2178,0.116,0.0031,0.0181,131.5,False
2,mo_sin_old_rev_tl_op,"[131.5,145.5)",78523,0.0731,62247,16276,0.2073,0.0528,0.0002,0.0181,145.5,False
3,mo_sin_old_rev_tl_op,"[145.5,164.5)",101039,0.0941,81149,19890,0.1969,-0.0118,0.0,0.0181,164.5,False
4,mo_sin_old_rev_tl_op,"[164.5,233.5)",272824,0.254,221740,51084,0.1872,-0.0738,0.0014,0.0181,233.5,False
5,mo_sin_old_rev_tl_op,"[233.5,264.5)",76091,0.0708,62588,13503,0.1775,-0.1394,0.0013,0.0181,264.5,False
6,mo_sin_old_rev_tl_op,"[264.5,inf)",178773,0.1664,148400,30373,0.1699,-0.1921,0.0058,0.0181,inf,False


In [76]:
#  mths_since_recent_inq
# Manual bin edges; the first bin [-inf, 0) holds the negative values
# (presumably the -1 missing code mentioned in the flag review - confirm against the cleaning step)
cust_bins_mths_since_recent_inq = [-np.inf, 0, 2, 4, 7, 13, 19, np.inf]

woe_table_mths_since_recent_inq = custom_binner('mths_since_recent_inq', cust_bins_mths_since_recent_inq, X_train, y_train)
woe_table_mths_since_recent_inq['mths_since_recent_inq']

[INFO] creating woe binning ...


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,mths_since_recent_inq,"[-inf,0.0)",113833,0.106,97406,16427,0.1443,-0.3857,0.014,0.0357,0.0,False
1,mths_since_recent_inq,"[0.0,2.0)",199275,0.1855,151243,48032,0.241,0.2472,0.0122,0.0357,2.0,False
2,mths_since_recent_inq,"[2.0,4.0)",167090,0.1555,130205,36885,0.2207,0.1329,0.0029,0.0357,4.0,False
3,mths_since_recent_inq,"[4.0,7.0)",189742,0.1766,150943,38799,0.2045,0.0357,0.0002,0.0357,7.0,False
4,mths_since_recent_inq,"[7.0,13.0)",236397,0.2201,191622,44775,0.1894,-0.0596,0.0008,0.0357,13.0,False
5,mths_since_recent_inq,"[13.0,19.0)",113001,0.1052,93260,19741,0.1747,-0.1584,0.0025,0.0357,19.0,False
6,mths_since_recent_inq,"[19.0,inf)",54900,0.0511,46076,8824,0.1607,-0.2586,0.0032,0.0357,inf,False


In [77]:
# emp_length_num

# Exploration snippets used while choosing the breaks (left disabled):
# binning_tables['emp_length_num']
# df_train['emp_length_num'].value_counts().sort_index()
# pd.DataFrame(df_train.groupby('emp_length_num')['GB_FLAG'].mean())

cust_bins_emp_length_num = [-np.inf, 0, 2, 10, np.inf] # Collapsing the 3rd-5th auto bins into a single [2, 10) bin

woe_table_emp_length_num = custom_binner('emp_length_num', cust_bins_emp_length_num, X_train, y_train)
woe_table_emp_length_num['emp_length_num']

[INFO] creating woe binning ...


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,emp_length_num,"[-inf,0.0)",59777,0.0556,44160,15617,0.2613,0.3548,0.0078,0.0093,0.0,False
1,emp_length_num,"[0.0,2.0)",153096,0.1425,121973,31123,0.2033,0.0284,0.0001,0.0093,2.0,False
2,emp_length_num,"[2.0,10.0)",504518,0.4697,405057,99461,0.1971,-0.01,0.0,0.0093,10.0,False
3,emp_length_num,"[10.0,inf)",356847,0.3322,289565,67282,0.1885,-0.0652,0.0014,0.0093,inf,False


In [78]:
# num_sats -- Drop (re-binned here for review only; see the decision update cell)
# binning_tables['num_sats']

cust_bins_num_sats = [-np.inf, 5.50, 8.50, 10.50, 15.50, 21.50, np.inf] # 6 coarser manual bins

woe_table_num_sats = custom_binner('num_sats', cust_bins_num_sats, X_train, y_train)
woe_table_num_sats['num_sats']

[INFO] creating woe binning ...


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,num_sats,"[-inf,5.5)",104466,0.0972,86457,18009,0.1724,-0.1745,0.0028,0.0079,5.5,False
1,num_sats,"[5.5,8.5)",236638,0.2203,192479,44159,0.1866,-0.0779,0.0013,0.0079,8.5,False
2,num_sats,"[8.5,10.5)",187697,0.1747,150843,36854,0.1963,-0.015,0.0,0.0079,10.5,False
3,num_sats,"[10.5,15.5)",334103,0.311,265699,68404,0.2047,0.0373,0.0004,0.0079,15.5,False
4,num_sats,"[15.5,21.5)",154601,0.1439,121715,32886,0.2127,0.0856,0.0011,0.0079,21.5,False
5,num_sats,"[21.5,inf)",56733,0.0528,43562,13171,0.2322,0.1981,0.0022,0.0079,inf,False


In [79]:
# total_bal_ex_mort -- Drop (re-binned here for review only; see the decision update cell)

# binning_tables['total_bal_ex_mort']
cust_bins_total_bal_ex_mort = [-np.inf, 8822.50, 17931.50, 27577.50,  41882.50, 71466.50, 129588.50, np.inf] # 7 coarser manual bins


woe_table_total_bal_ex_mort = custom_binner('total_bal_ex_mort', cust_bins_total_bal_ex_mort, X_train, y_train)
woe_table_total_bal_ex_mort['total_bal_ex_mort']

[INFO] creating woe binning ...


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,total_bal_ex_mort,"[-inf,8822.5)",82062,0.0764,67869,14193,0.173,-0.1706,0.0021,0.0036,8822.5,False
1,total_bal_ex_mort,"[8822.5,17931.5)",140084,0.1304,113243,26841,0.1916,-0.0454,0.0003,0.0036,17931.5,False
2,total_bal_ex_mort,"[17931.5,27577.5)",166203,0.1547,133101,33102,0.1992,0.0027,0.0,0.0036,27577.5,False
3,total_bal_ex_mort,"[27577.5,41882.5)",218753,0.2036,174167,44586,0.2038,0.0317,0.0002,0.0036,41882.5,False
4,total_bal_ex_mort,"[41882.5,71466.5)",260388,0.2424,206541,53847,0.2068,0.0499,0.0006,0.0036,71466.5,False
5,total_bal_ex_mort,"[71466.5,129588.5)",151753,0.1413,121103,30650,0.202,0.0202,0.0001,0.0036,129588.5,False
6,total_bal_ex_mort,"[129588.5,inf)",54995,0.0512,44731,10264,0.1866,-0.0778,0.0003,0.0036,inf,False


In [None]:
# mths_since_last_delinq -- Drop (re-binned here for review only; see the decision update cell)
# binning_tables['mths_since_last_delinq']

# df_train['mths_since_last_delinq'].value_counts(normalize=True).sort_index()
# df_train.groupby('mths_since_last_delinq')['GB_FLAG'].mean()

cust_bins_mths_since_last_delinq = [-np.inf, 0, 21, 31, 51, 56, 61, 66, np.inf] # 8 manual bins; first bin [-inf, 0) holds negative values
woe_table_mths_since_last_delinq = custom_binner('mths_since_last_delinq', cust_bins_mths_since_last_delinq, X_train, y_train)
woe_table_mths_since_last_delinq['mths_since_last_delinq']

In [80]:
# Variables kept with their manual bins, and variables dropped after the re-binning attempt
manual_vars = ['tot_hi_cred_lim', 'mo_sin_old_rev_tl_op', 'mths_since_recent_inq', 'emp_length_num']
rebin_drop_vars = ['num_sats', 'mths_since_last_record', 'total_bal_ex_mort', 'mths_since_last_delinq']

# Append only names that are not already present so the cell can be re-run without
# piling up duplicates in drop_vars
newly_dropped = rebin_drop_vars + ['mnths_since_earliest_cr_line']
drop_vars.extend([v for v in newly_dropped if v not in drop_vars])

# Update binning decision for review variables
binning_summary_df['manual'] = 0
is_manual = binning_summary_df['variable'].isin(manual_vars)
binning_summary_df.loc[is_manual, 'manual'] = 1

binning_summary_df.loc[is_manual, 'decision'] = 'keep'
binning_summary_df.loc[binning_summary_df['variable'].isin(drop_vars), 'decision'] = 'drop'
binning_summary_df.loc[binning_summary_df['variable'].isin(rebin_drop_vars), 'reason'] = 'low IV; tried re-binning, no IV improvement'


# Update binning decision for related variables
# list.remove raises ValueError when the item is absent (e.g. on a second run), so guard it
if 'mnths_since_earliest_cr_line' in keep_vars:
    keep_vars.remove('mnths_since_earliest_cr_line')
is_earliest_cr_line = binning_summary_df['variable'] == 'mnths_since_earliest_cr_line'
binning_summary_df.loc[is_earliest_cr_line, 'decision'] = 'drop'
binning_summary_df.loc[is_earliest_cr_line, 'reason'] = 'known risk indicator but strongly correlated with mo_sin_old_rev_tl_op (0.88), which is more predictive'

In [81]:
# Record manually binned breaks & woes
manual_binning_tables = {}   # variable -> WoE table produced by custom_binner
custom_bins_dict = {}        # variable -> manual bin edges

for var in manual_vars:
    # Fetch the per-variable objects by name from the notebook namespace; a plain
    # dictionary lookup avoids running eval() on a constructed string
    manual_binning_tables.update(globals()[f'woe_table_{var}'])
    custom_bins_dict[var] = globals()[f'cust_bins_{var}']

## 4. WoE Binning (Categorical Variables)

### 4.1 Auto-binning

In [82]:
# Categorical predictors and the integer target side by side
df_cat_woe = X_train[cat_vars].assign(target=y_train.astype(int))

# Auto-bin the categorical variables with scorecardpy
woe_binning_cat = sc.woebin(dt=df_cat_woe, y='target', var_cat=cat_vars, positive='1')

[INFO] creating woe binning ...


### 4.2 Auto-binning Review

In [83]:
# Create a function to generate binning summary table 
def summarize_woe_binning(binning_dict, var_type='unknown'):
    """Build a one-row-per-variable summary from a dict of WoE bin tables.

    Parameters
    ----------
    binning_dict : dict
        Maps variable name -> bin table (DataFrame) with the columns
        ``count``, ``count_distr``, ``woe``, ``badprob`` and ``total_iv``.
    var_type : str, default 'unknown'
        Written unchanged into the ``type`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per variable with IV, bin count, bin-size / WoE / bad-rate
        ranges and their class labels. ``monotonic_trend``, ``decision`` and
        ``reason`` are NaN placeholders and ``manual`` is 0.
    """
    def _label(value, bands, fallback):
        # Label of the first band whose upper bound the value is strictly below
        for upper, name in bands:
            if value < upper:
                return name
        return fallback

    iv_bands = ((0.02, 'not predictive'), (0.1, 'weak'), (0.3, 'medium'))
    spread_bands = ((0.2, 'minimal'), (0.5, 'low'), (0.8, 'medium'))
    bin_count_bands = ((4, '< 4'), (9, '4-8'), (16, '9-15'))

    rows = []
    for name, table in binning_dict.items():
        # Empty bins do not take part in any statistic
        populated = table[table['count'] > 0].copy()

        n_bins = populated.shape[0]
        woe_lo = round(populated['woe'].min(), 4)
        woe_hi = round(populated['woe'].max(), 4)
        woe_range = round(woe_hi - woe_lo, 4)
        bad_lo = round(populated['badprob'].min(), 4)
        bad_hi = round(populated['badprob'].max(), 4)
        total_iv = round(populated['total_iv'].iloc[0], 6)

        rows.append({
            "variable": name,
            "IV": total_iv,
            "bins": n_bins,
            "min_bin_size": round(populated['count_distr'].min(), 4),
            "max_bin_size": round(populated['count_distr'].max(), 4),
            "min_woe": woe_lo,
            "max_woe": woe_hi,
            "woe_spread": woe_range,
            "bad_rate_spread": round(bad_hi - bad_lo, 4),
            "monotonic_trend": np.nan,
            "iv_class": _label(total_iv, iv_bands, 'strong'),
            "bin_num_group": _label(n_bins, bin_count_bands, '> 15'),
            "woe_spread_class": _label(woe_range, spread_bands, 'high'),
            "type": var_type,
            "decision": np.nan,
            "reason": np.nan,
            "manual": 0
        })

    return pd.DataFrame(rows)


# Summarise the auto-binned categorical variables (one row per variable) and display the result
cat_binning_summary = summarize_woe_binning(woe_binning_cat, var_type='categorical')
cat_binning_summary

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason,manual
0,home_ownership_grouped,0.0281,3,0.1059,0.494,-0.1724,0.1821,0.3545,0.0566,,weak,< 4,low,categorical,,,0
1,verification_status,0.0537,3,0.2883,0.391,-0.3815,0.207,0.5885,0.0889,,weak,< 4,medium,categorical,,,0
2,addr_state_grouped,0.0143,5,0.0626,0.4076,-0.2599,0.2088,0.4687,0.0736,,not predictive,4-8,low,categorical,,,0
3,purpose,0.0161,3,0.081,0.6798,-0.2131,0.0825,0.2956,0.0452,,not predictive,< 4,low,categorical,,,0


In [84]:
# Show the bin table of every categorical variable.
# NOTE(review): the bare expression inside the loop is only rendered because
# ast_node_interactivity is set to "all" in the display-settings cell - confirm before changing that setting
for var in cat_vars:
    woe_binning_cat[var]

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,verification_status,Not Verified,309708,0.2883,264855,44853,0.1448,-0.3815,0.0372,0.0537,Not Verified,False
1,verification_status,Source Verified,420054,0.391,331943,88111,0.2098,0.0679,0.0018,0.0537,Source Verified,False
2,verification_status,Verified,344476,0.3207,263957,80519,0.2337,0.207,0.0146,0.0537,Verified,False


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,purpose,"educational%,%wedding%,%car%,%credit_card",256915,0.2392,214022,42893,0.167,-0.2131,0.0102,0.0161,"educational%,%wedding%,%car%,%credit_card",False
1,purpose,"home_improvement%,%major_purchase",87017,0.081,71393,15624,0.1796,-0.1251,0.0012,0.0161,"home_improvement%,%major_purchase",False
2,purpose,"vacation%,%other%,%debt_consolidation%,%medica...",730306,0.6798,575340,154966,0.2122,0.0825,0.0047,0.0161,"vacation%,%other%,%debt_consolidation%,%medica...",False


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,addr_state_grouped,band 1,132367,0.1232,111116,21251,0.1605,-0.2599,0.0077,0.0143,band 1,False
1,addr_state_grouped,band 2,113692,0.1058,92885,20807,0.183,-0.1018,0.0011,0.0143,band 2,False
2,addr_state_grouped,band 3,323107,0.3008,259972,63135,0.1954,-0.021,0.0001,0.0143,band 3,False
3,addr_state_grouped,"band 4%,%band 5",437813,0.4076,345266,92547,0.2114,0.0777,0.0025,0.0143,"band 4%,%band 5",False
4,addr_state_grouped,band 6,67259,0.0626,51516,15743,0.2341,0.2088,0.0029,0.0143,band 6,False


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,home_ownership_grouped,MORTGAGE,530696,0.494,439048,91648,0.1727,-0.1724,0.0139,0.0281,MORTGAGE,False
1,home_ownership_grouped,"OWN%,%Other",113722,0.1059,90451,23271,0.2046,0.0367,0.0001,0.0281,"OWN%,%Other",False
2,home_ownership_grouped,RENT,429820,0.4001,331256,98564,0.2293,0.1821,0.014,0.0281,RENT,False


### 4.3 Manual Binning

Note that the `purpose` variable is binned rather coarsely; re-bin it and see whether that adds more granularity.

In [85]:
# Loan count and bad rate for each raw `purpose` value, riskiest first
purpose_target = df_train.groupby('purpose')['GB_FLAG']
purp_stats = purpose_target.agg(n_loans='count', bad_rate='mean').reset_index()
purp_stats = purp_stats.sort_values('bad_rate', ascending=False)

purp_stats

Unnamed: 0,purpose,n_loans,bad_rate
11,small_business,11175,0.2889
10,renewable_energy,662,0.2432
8,moving,6886,0.2312
5,house,4575,0.2302
7,medical,11061,0.2156
2,debt_consolidation,633983,0.2108
9,other,55534,0.2097
12,vacation,6430,0.1922
6,major_purchase,21082,0.1856
4,home_improvement,65935,0.1776


In [86]:
# manual bin `purpose`
# Each string is one bin; '%,%' separates the raw categories merged into that bin
# (same notation as in the auto-binning tables shown earlier)
cust_bins_purpose = {
    'purpose': [
        'small_business%,%renewable_energy%,%moving%,%house',
        'medical%,%debt_consolidation%,%other',
        'vacation%,%major_purchase%,%home_improvement',
        'educational%,%wedding%,%car%,%credit_card'
    ]
}

# Raw purpose (as string) with the integer target
df_purpose = pd.DataFrame({
    'purpose': df_train['purpose'].astype(str),
    'target': y_train.astype(int)
})

# Run woebin with the manual grouping enforced
woe_table_purpose = sc.woebin(
    df_purpose,
    y='target',
    breaks_list=cust_bins_purpose
)

# Record the manual bins once, after the binning call has succeeded
# (the original cell performed this identical update twice)
custom_bins_dict.update(cust_bins_purpose)

[INFO] creating woe binning ...


In [93]:
# Display the manually binned WoE table for `purpose`
woe_table_purpose['purpose']

Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,purpose,"small_business%,%renewable_energy%,%moving%,%h...",23298,0.0217,17263,6035,0.259,0.3433,0.0028,0.0179,"small_business%,%renewable_energy%,%moving%,%h...",False
1,purpose,"medical%,%debt_consolidation%,%other",700578,0.6522,552883,147695,0.2108,0.0743,0.0037,0.0179,"medical%,%debt_consolidation%,%other",False
2,purpose,"vacation%,%major_purchase%,%home_improvement",93447,0.087,76587,16860,0.1804,-0.1192,0.0012,0.0179,"vacation%,%major_purchase%,%home_improvement",False
3,purpose,"educational%,%wedding%,%car%,%credit_card",256915,0.2392,214022,42893,0.167,-0.2131,0.0102,0.0179,"educational%,%wedding%,%car%,%credit_card",False


In [89]:
# Combine the binning summary of numeric & categorical variables
# ignore_index=True gives the combined table a clean 0..n-1 index instead of repeating the
# row labels of both inputs, matching how the flag summary is appended later on
all_binning_summary_df = pd.concat([binning_summary_df, cat_binning_summary], axis=0, ignore_index=True)

In [98]:
# All four categorical variables are kept
keep_vars.extend(['purpose', 'verification_status','home_ownership_grouped','addr_state_grouped'])

summary_variable = all_binning_summary_df['variable']
all_binning_summary_df.loc[summary_variable.isin(keep_vars), 'decision'] = 'keep'
all_binning_summary_df.loc[summary_variable == 'purpose', 'manual'] = 1

# Reason recorded for each categorical variable
cat_reasons = {
    'verification_status': 'decent bad rate spread (8.9%); commonly used in risk modelling',
    'home_ownership_grouped': 'weak IV (0.028) but commonly used in risk modelling',
    'addr_state_grouped': 'decent bad rate spread (7.4%); geographic info, check correlation with high_risk_zip',
    'purpose': 're-binned to preserve the high risk group (small business, renewale energy etc.)',
}
for cat_name, cat_reason in cat_reasons.items():
    all_binning_summary_df.loc[all_binning_summary_df['variable'] == cat_name, 'reason'] = cat_reason


In [99]:
# Tally of decisions across numeric and categorical variables (flags are not merged in yet)
all_binning_summary_df['decision'].value_counts()

decision
drop             42
keep             23
keep - backup     8
Name: count, dtype: int64

## 5. WoE Binning (Binary Flags)

### 5.1 Auto Binning

In [100]:
# Integer-typed columns are treated as the binary flags
flag_vars = X_train.select_dtypes(include='int').columns.tolist()
len(flag_vars)

30

In [101]:
# Flag columns and the integer target side by side
df_flag_woe = X_train[flag_vars].assign(target=y_train.astype(int))

# Auto-bin the flag variables with scorecardpy
woe_binning_flag = sc.woebin(dt=df_flag_woe, y='target', var_cat=flag_vars, positive='1')

[INFO] creating woe binning ...


  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  init_bin = dtm.groupby('bin', group_keys=False)['y'].agg([n0, n1])\
  .groupby(['variable', 'bstbin', 'value'], group_keys=False)\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  .agg({'good':sum, 'bad':sum}).reset_index()\
  binning_1bst_brk = binning_1bst_brk.groupby(['variable', 'bstbin'], group_keys=False)\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
  .agg({'good':sum, 'bad':sum}).reset_index().assign(bin=lambda x: x['bstbin'])\
  .groupby(['variable', 'bstbin', 'value'], group_keys=False)\
  .agg({'good':sum, 'bad':sum}).reset_i

Binning on 1074238 rows and 31 columns in 00:00:55


In [102]:
# Create a binning summary table for flag variables 
# (one row per flag, built by the same helper used for the categorical variables)
flag_binning_summary = summarize_woe_binning(woe_binning_flag, var_type='flag')
flag_binning_summary

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason,manual
0,f_miss_il_util,0.0159,2,0.2485,0.7515,-0.0756,0.2102,0.2858,0.0473,,not predictive,< 4,low,flag,,,0
1,f_term_60,0.1996,2,0.2385,0.7615,-0.2895,0.7009,0.9904,0.1767,,medium,< 4,high,flag,,,0
2,f_miss_mths_since_last_major_derog,0.0046,2,0.2715,0.7285,-0.0422,0.1082,0.1504,0.0244,,not predictive,< 4,minimal,flag,,,0
3,f_miss_mths_since_recent_revol_delinq,0.0005,2,0.3483,0.6517,-0.0171,0.0315,0.0486,0.0078,,not predictive,< 4,minimal,flag,,,0
4,f_miss_block3,0.0,1,1.0,1.0,0.0,0.0,0.0,0.0,,not predictive,< 4,minimal,flag,,,0
5,f_tot_coll_amt_gt0,0.003,2,0.1475,0.8525,-0.0235,0.1296,0.1531,0.0252,,not predictive,< 4,minimal,flag,,,0
6,f_num_tl_30dpd_gt0,0.0,1,1.0,1.0,0.0,0.0,0.0,0.0,,not predictive,< 4,minimal,flag,,,0
7,f_hi_risk_zip,0.0057,2,0.0507,0.9493,-0.0182,0.3106,0.3288,0.057,,not predictive,< 4,low,flag,,,0
8,f_miss_mo_sin_old_il_acct,0.0005,2,0.0553,0.9447,-0.0893,0.0051,0.0944,0.0146,,not predictive,< 4,minimal,flag,,,0
9,F_CAPPED_DELQ,0.0,1,1.0,1.0,0.0,0.0,0.0,0.0,,not predictive,< 4,minimal,flag,,,0


### 5.2 Review Auto-binning 

* The auto-binning process resulted in just 1 bin for 16 binary flags, which I believe is because of extreme imbalance of the flags, for example only 0.4% - 3% of values being 1.
* Review the ones that did yield 2 bins first

#### 5.2.1 Two-bins

In [103]:
# Flags for which auto-binning produced two bins, highest IV first
has_two_bins = flag_binning_summary['bins'] == 2
flag_2bins = flag_binning_summary[has_two_bins].sort_values('IV', ascending=False)

# Shortlist: bad-rate spread above 4% and smallest bin above 5% of the sample
wide_spread = flag_2bins['bad_rate_spread'] > 0.04
large_enough = flag_2bins['min_bin_size'] > 0.05
flag_2bins[wide_spread & large_enough]

Unnamed: 0,variable,IV,bins,min_bin_size,max_bin_size,min_woe,max_woe,woe_spread,bad_rate_spread,monotonic_trend,iv_class,bin_num_group,woe_spread_class,type,decision,reason,manual
1,f_term_60,0.1996,2,0.2385,0.7615,-0.2895,0.7009,0.9904,0.1767,,medium,< 4,high,flag,,,0
0,f_miss_il_util,0.0159,2,0.2485,0.7515,-0.0756,0.2102,0.2858,0.0473,,not predictive,< 4,low,flag,,,0
18,f_miss_block1,0.0157,2,0.2854,0.7146,-0.0824,0.1904,0.2728,0.0449,,not predictive,< 4,low,flag,,,0
25,f_miss_mths_since_recent_inq,0.0154,2,0.106,0.894,-0.3857,0.04,0.4257,0.0609,,not predictive,< 4,low,flag,,,0
23,f_emp_length_missing,0.0083,2,0.0556,0.9444,-0.0233,0.3548,0.3781,0.0663,,not predictive,< 4,low,flag,,,0
7,f_hi_risk_zip,0.0057,2,0.0507,0.9493,-0.0182,0.3106,0.3288,0.057,,not predictive,< 4,low,flag,,,0


In [104]:
# Check overlap 
# Joint share of rows for each value combination: high-risk zip flag vs. state band,
# and the il_util missing flag vs. the block-1 missing flag
df_train[['f_hi_risk_zip', 'addr_state_grouped']].value_counts(normalize=True)
df_train[['f_miss_il_util', 'f_miss_block1']].value_counts(normalize=True)

f_hi_risk_zip  addr_state_grouped
0              band 3               0.2912
               band 4               0.2770
               band 1               0.1232
               band 5               0.1067
               band 2               0.1053
               band 6               0.0459
1              band 6               0.0167
               band 5               0.0129
               band 4               0.0109
               band 3               0.0096
               band 2               0.0005
               band 1               0.0000
Name: proportion, dtype: float64

f_miss_il_util  f_miss_block1
1               1               0.7146
0               0               0.2485
1               0               0.0369
Name: proportion, dtype: float64

In [105]:
# Check overlap between raw variable  vs. missing flag
# For each pair: bad rate by flag value, then the auto-binning table of the raw variable

# employment length
df_train.groupby('f_emp_length_missing')['GB_FLAG'].mean()
binning_tables['emp_length_num']

# mths_since_recent_inq
df_train.groupby('f_miss_mths_since_recent_inq')['GB_FLAG'].mean()
binning_tables['mths_since_recent_inq']

f_emp_length_missing
0   0.1950
1   0.2613
Name: GB_FLAG, dtype: float64

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",59777,0.0556,44160,15617,0.2613,-0.3548,0.0078,0.001
1,"[-0.50, 1.50)",153096,0.1425,121973,31123,0.2033,-0.0284,0.0001,0.0
2,"[1.50, 3.50)",180947,0.1684,145245,35702,0.1973,0.009,0.0,0.0
3,"[3.50, 4.50)",62846,0.0585,50458,12388,0.1971,0.0102,0.0,0.0
4,"[4.50, 9.50)",260725,0.2427,209354,51371,0.197,0.0107,0.0,0.0
5,"[9.50, inf)",356847,0.3322,289565,67282,0.1885,0.0652,0.0014,0.0002
Totals,,1074238,1.0,860755,213483,0.1987,,0.0093,0.0012


f_miss_mths_since_recent_inq
0   0.2052
1   0.1443
Name: GB_FLAG, dtype: float64

Unnamed: 0,Bin,Count,Count (%),Non-event,Event,Event rate,WoE,IV,JS
0,"(-inf, -0.50)",113833,0.106,97406,16427,0.1443,0.3857,0.014,0.0017
1,"[-0.50, inf)",960405,0.894,763349,197056,0.2052,-0.04,0.0014,0.0002
Totals,,1074238,1.0,860755,213483,0.1987,,0.0154,0.0019


In [106]:
# Record treatment

keep_vars.extend(['f_term_60'])
backup_vars.extend(['f_miss_block1','f_hi_risk_zip'])
drop_vars.extend(['f_emp_length_missing', 'f_miss_mths_since_recent_inq', 'f_miss_il_util'])

# Decisions are applied in this order, so a name present in several lists ends up with the last one
flag_binning_summary.loc[flag_binning_summary['variable'].isin(keep_vars), 'decision']='keep'
flag_binning_summary.loc[flag_binning_summary['variable'].isin(backup_vars), 'decision']='keep - backup'
flag_binning_summary.loc[flag_binning_summary['variable'].isin(drop_vars), 'decision']='drop'

# Kept / backup flags
flag_binning_summary.loc[flag_binning_summary['variable']=='f_term_60', 'reason']= 'binary transformation of the loan term; medium IV (0.2) and great bad rate spread (17%)'
flag_binning_summary.loc[flag_binning_summary['variable']=='f_miss_block1', 'reason']= 'flag for the biggest concurrent missing, low IV (0.01) and moderate bad rate spread (4.5%)'
flag_binning_summary.loc[flag_binning_summary['variable']=='f_hi_risk_zip', 'reason']= 'flag for zip code with bad rate >24%; low IV (0.01) and decent bad rate spread (5.7%); low overlap with the state variable'

# Dropped flags. The raw variable is emp_length_num (the original text named a
# non-existent 'emp_length_missing').
flag_binning_summary.loc[flag_binning_summary['variable']=='f_emp_length_missing', 'reason']= 'redundant: covered by emp_length_num, missing (-1) has its own bin'
flag_binning_summary.loc[flag_binning_summary['variable']=='f_miss_mths_since_recent_inq', 'reason']= 'redundant: covered by mths_since_recent_inq, missing (-1) has its own bin'
# The block-1 overlap belongs to f_miss_il_util (see the overlap check above); the original
# statement overwrote the f_miss_mths_since_recent_inq reason and left f_miss_il_util without one
flag_binning_summary.loc[flag_binning_summary['variable']=='f_miss_il_util', 'reason']= 'redundant: overlap with f_miss_block1'

#### 5.2.2 One bin

In [107]:
# Flags that auto-binning collapsed into a single bin
flag_1bin_vars = flag_binning_summary.loc[flag_binning_summary['bins'] == 1, 'variable']

summary_list = []

for var in flag_1bin_vars:
    # Row count and bad rate for each value of the flag, with the flag column renamed
    stats = (
        df_train.groupby(var)['GB_FLAG']
        .agg(count='count', bad_rate='mean')
        .reset_index()
        .rename(columns={var: 'flag_value'})
    )
    stats['variable'] = var
    summary_list.append(stats[['variable', 'flag_value', 'count', 'bad_rate']])

# Stack the per-flag tables
flag_summary_df = pd.concat(summary_list, ignore_index=True)

# One row per flag, with count_<v> / bad_rate_<v> columns for each flag value v
pivoted = flag_summary_df.pivot(index='variable', columns='flag_value', values=['count', 'bad_rate'])
pivoted.columns = [f"{metric}_{int(flag_value)}" for metric, flag_value in pivoted.columns]
pivoted = pivoted.reset_index()

# Share of rows with the flag set, and absolute bad-rate gap between the two values
total_count = len(df_train)
pivoted['pctg_1'] = pivoted['count_1'] / total_count
pivoted['bad_rate_spread'] = (pivoted['bad_rate_1'] - pivoted['bad_rate_0']).abs()

In [108]:
# Filter potentials
# Keep flags with more than 1,000 rows set to 1 and a bad-rate gap of at least 5 percentage points
pivoted[(pivoted['count_1'] > 1000) & (pivoted['bad_rate_spread'] >= 0.05)]

Unnamed: 0,variable,count_0,count_1,bad_rate_0,bad_rate_1,pctg_1,bad_rate_spread
0,F_CAPPED_AMOUNT,1065735.0,8503.0,0.1993,0.1328,0.0079,0.0665
2,F_CAPPED_INC,1068355.0,5883.0,0.1991,0.1237,0.0055,0.0754
4,F_CAPPED_NUMACCTS,1059466.0,14772.0,0.198,0.2486,0.0138,0.0505
5,F_OVERLIMIT,1041380.0,32858.0,0.1967,0.2632,0.0306,0.0665
8,f_collections_12_mths_ex_med_gt0,1057705.0,16533.0,0.1979,0.2493,0.0154,0.0514


Flags to keep:
* F_CAPPED_NUMACCTS
* F_OVERLIMIT
* f_collections_12_mths_ex_med_gt0

Backup:
* F_CAPPED_AMOUNT
* F_CAPPED_INC

In [None]:
# Manually bin flags chosen to keep 

# Same [0, 1] break list for every shortlisted flag
manual_flag_breaks = {
    flag_name: [0, 1]
    for flag_name in (
        'F_CAPPED_AMOUNT',
        'F_CAPPED_INC',
        'F_CAPPED_NUMACCTS',
        'F_OVERLIMIT',
        'f_collections_12_mths_ex_med_gt0',
    )
}

# Shortlisted flag columns plus the integer target
df_manual_flag = X_train[list(manual_flag_breaks)].assign(target=y_train.astype(int))

# Run woebin with the break list enforced
woe_binning_manual_flags = sc.woebin(
    dt=df_manual_flag, y='target', breaks_list=manual_flag_breaks, positive='1', bin_force=2
)


[INFO] creating woe binning ...
Binning on 1074238 rows and 6 columns in 00:00:14


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,F_CAPPED_AMOUNT,"[-inf,1.0)",1065735,0.9921,853381,212354,0.1993,0.0033,0.0,0.0016,1.0,False
1,F_CAPPED_AMOUNT,"[1.0,inf)",8503,0.0079,7374,1129,0.1328,-0.4824,0.0016,0.0016,inf,False


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,F_CAPPED_INC,"[-inf,1.0)",1068355,0.9945,855600,212755,0.1991,0.0026,0.0,0.0015,1.0,False
1,F_CAPPED_INC,"[1.0,inf)",5883,0.0055,5155,728,0.1237,-0.5632,0.0015,0.0015,inf,False


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,F_CAPPED_NUMACCTS,"[-inf,1.0)",1059466,0.9862,849655,209811,0.198,-0.0044,0.0,0.0013,1.0,False
1,F_CAPPED_NUMACCTS,"[1.0,inf)",14772,0.0138,11100,3672,0.2486,0.288,0.0012,0.0013,inf,False


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,F_OVERLIMIT,"[-inf,1.0)",1041380,0.9694,836546,204834,0.1967,-0.0128,0.0002,0.0047,1.0,False
1,F_OVERLIMIT,"[1.0,inf)",32858,0.0306,24209,8649,0.2632,0.365,0.0045,0.0047,inf,False


Unnamed: 0,variable,bin,count,count_distr,good,bad,badprob,woe,bin_iv,total_iv,breaks,is_special_values
0,f_collections_12_mths_ex_med_gt0,"[-inf,1.0)",1057705,0.9846,848344,209361,0.1979,-0.005,0.0,0.0015,1.0,False
1,f_collections_12_mths_ex_med_gt0,"[1.0,inf)",16533,0.0154,12411,4122,0.2493,0.292,0.0014,0.0015,inf,False


In [111]:
# Summarise the manually binned flags
manual_flag_binning_summary = summarize_woe_binning(woe_binning_manual_flags, var_type='flag')

# Replace the auto-binned rows of those flags with their manual versions
is_manual_flag = flag_binning_summary['variable'].isin(manual_flag_breaks.keys())
auto_flag_rows = flag_binning_summary[~is_manual_flag]
flag_binning_summary = pd.concat([auto_flag_rows, manual_flag_binning_summary], ignore_index=True)

In [112]:
# Record treatment for the manually binned flags
keep_vars.extend(['F_OVERLIMIT', 'F_CAPPED_NUMACCTS', 'f_collections_12_mths_ex_med_gt0'])
backup_vars.extend(['F_CAPPED_AMOUNT','F_CAPPED_INC'])

flag_variable = flag_binning_summary['variable']
flag_binning_summary.loc[flag_variable.isin(manual_flag_breaks.keys()), 'manual'] = 1

# 'keep' is applied first, then 'keep - backup'
flag_binning_summary.loc[flag_variable.isin(keep_vars), 'decision'] = 'keep'
flag_binning_summary.loc[flag_variable.isin(backup_vars), 'decision'] = 'keep - backup'

# Reason recorded for each manually binned flag
manual_flag_reasons = {
    'F_OVERLIMIT': '6.6% br spread; capture utility of any type greater than 100%, known risk indicator.',
    'F_CAPPED_NUMACCTS': '5.1% br spread; indicates very high number of accounts that had to be capped, known red flag for instability or aggressive credit usage.',
    'f_collections_12_mths_ex_med_gt0': '5.1% br spread, non-medical collections in the last year, meaningful and aligns with delinquency risk.',
    'F_CAPPED_AMOUNT': '6.6% br spread but bad bin less than 1%, risk of overfitting',
    'F_CAPPED_INC': '7.5% br spread but bad bin less than 1%, risk of overfitting',
}
for flag_name, flag_reason in manual_flag_reasons.items():
    flag_binning_summary.loc[flag_binning_summary['variable'] == flag_name, 'reason'] = flag_reason

In [113]:
# Treatment for the remaining flags: no decision yet -> drop; no reason yet -> generic reason
no_decision_yet = flag_binning_summary['decision'].isnull()
flag_binning_summary.loc[no_decision_yet, 'decision'] = 'drop'

no_reason_yet = flag_binning_summary['reason'].isnull()
flag_binning_summary.loc[no_reason_yet, 'reason'] = 'very low spread, not predictive'

In [114]:
# Final binning_summary
# The table is rebuilt from itself, so a second execution of this cell used to append the
# flag rows again. Remove any flag rows already present first; on a first run nothing is
# removed (assuming flag names do not clash with numeric/categorical variable names).
already_appended = all_binning_summary_df['variable'].isin(flag_binning_summary['variable'])
all_binning_summary_df = pd.concat(
    [all_binning_summary_df[~already_appended], flag_binning_summary],
    ignore_index=True
)

## 6. Correlation Check

In [115]:
# Tally of decisions across numeric, categorical and flag variables
all_binning_summary_df['decision'].value_counts()

decision
drop             64
keep             27
keep - backup    12
Name: count, dtype: int64

In [116]:
# Select the variables whose final decision is 'keep' and subset the training predictors
# NOTE(review): keep_vars is rebound here to a pandas Series, while earlier cells call
# keep_vars.extend()/.remove() on it (presumably a list) - those cells cannot be re-run
# after this one without restarting the kernel
keep_vars = all_binning_summary_df[all_binning_summary_df['decision']=='keep']['variable']
X_keep = X_train[keep_vars]

# dtype mix of the kept predictors
X_keep.dtypes.value_counts()

float64    19
object      4
int64       4
Name: count, dtype: int64

In [117]:
# Pearson correlation matrix over the float64 / int64 columns of X_keep
num_bi_vars = X_keep.select_dtypes(include=['float64', 'int64']).columns

X_keep.loc[:, num_bi_vars].corr(method='pearson')

Unnamed: 0,loan_amnt,int_rate,annual_inc,dti,fico_range_low,inq_last_6mths,mths_since_rcnt_il,acc_open_past_24mths,bc_open_to_buy,bc_util,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,num_rev_tl_bal_gt_0,tot_hi_cred_lim,emp_length_num,f_term_60,F_CAPPED_NUMACCTS,F_OVERLIMIT,f_collections_12_mths_ex_med_gt0
loan_amnt,1.0,0.134,0.4802,0.0166,0.1209,-0.0143,-0.0096,0.0081,0.2073,0.0585,0.1722,0.0548,0.0298,0.2326,0.0405,0.0062,0.1543,0.3507,0.1182,0.3807,0.0069,-0.0275,-0.022
int_rate,0.134,1.0,-0.1174,0.1778,-0.3975,0.2346,-0.0734,0.1956,-0.2918,0.2381,-0.1356,-0.114,-0.1336,-0.0743,-0.0909,-0.1138,0.077,-0.1273,-0.0026,0.4329,0.0235,0.0696,0.0131
annual_inc,0.4802,-0.1174,1.0,-0.2347,0.1167,0.0493,-0.0184,0.0768,0.2261,0.0025,0.1974,0.0514,-0.03,0.3268,0.0427,-0.0304,0.1085,0.571,0.1363,0.0807,0.0442,-0.0112,-0.008
dti,0.0166,0.1778,-0.2347,1.0,-0.0688,-0.0005,-0.0973,0.1708,-0.0713,0.1802,0.0522,-0.023,-0.093,-0.0454,-0.0063,0.0195,0.2547,0.0171,0.0134,0.077,0.0135,0.0114,-0.0033
fico_range_low,0.1209,-0.3975,0.1167,-0.0688,1.0,-0.0945,0.0139,-0.115,0.5058,-0.4479,0.0979,0.0936,0.0657,0.0955,0.0725,0.0313,-0.1795,0.1976,0.0218,-0.0052,-0.0415,-0.0709,-0.0695
inq_last_6mths,-0.0143,0.2346,0.0493,-0.0005,-0.0945,1.0,-0.0551,0.2546,0.0075,-0.0676,-0.009,-0.1842,-0.2173,0.0478,-0.1289,-0.4272,0.089,0.0282,0.0016,0.0221,0.0611,-0.0183,0.011
mths_since_rcnt_il,-0.0096,-0.0734,-0.0184,-0.0973,0.0139,-0.0551,1.0,-0.0807,0.0351,-0.0306,0.0352,0.0252,0.1127,-0.0318,0.0101,0.0068,0.009,-0.0135,-0.0218,-0.0493,0.0115,0.0658,0.0136
acc_open_past_24mths,0.0081,0.1956,0.0768,0.1708,-0.115,0.2546,-0.0807,1.0,0.1007,-0.1254,-0.0192,-0.388,-0.4177,0.0717,-0.3363,-0.0599,0.335,0.1103,0.0246,0.0552,0.1265,-0.0235,0.0185
bc_open_to_buy,0.2073,-0.2918,0.2261,-0.0713,0.5058,0.0075,0.0351,0.1007,1.0,-0.5017,0.1877,-0.0559,-0.0403,0.1332,-0.1034,-0.0055,0.1256,0.2662,0.0242,0.0044,0.0325,-0.0729,-0.0121
bc_util,0.0585,0.2381,0.0025,0.1802,-0.4479,-0.0676,-0.0306,-0.1254,-0.5017,1.0,0.0117,0.1109,0.1197,0.0154,0.1657,0.065,0.1529,0.0005,0.0322,0.0543,-0.0322,0.1319,-0.0307


In [None]:
# Check whether the high correlations are driven by values that are missing in both variables at once
# corr_excl_miss(df_train, fmeta, num_bi_vars)

Pairings with absolute correlation between 0.5 and 0.8:
* annual_inc - tot_hi_cred_lim: 0.571:
    * Medium correlation; could both reflect borrower financial capacity.
* fico_range_low - bc_open_to_buy: 0.5058:
    * Slightly surprising correlation; could be because higher FICO = higher available revolving credit
* mo_sin_rcnt_rev_tl_op - mo_sin_rcnt_tl: 0.629:
    * Likely measuring similar recency, but one is revolving-only, the other is all tradelines
* mo_sin_rcnt_rev_tl_op - mths_since_recent_bc: 0.5958:
    * Red flag — recent revolving TL open date vs recent BC date are probably overlapping heavily
* mort_acc - tot_hi_cred_lim: 0.5459:
    * Mortgages inflate total credit limit. Could be semi-redundant unless mort_acc’s predictive power is separate
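* bc_open_to_buy - bc_util: -0.5017:
    * Expected inverse relationship; the more of the bankcard limit is utilised, the less is left open to buy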

Compare the IVs within each pair to decide which variable to drop.

In [118]:
# Pairs flagged in the correlation review
correlated_pairs = [
    ('annual_inc', 'tot_hi_cred_lim'),
    ('fico_range_low', 'bc_open_to_buy'),
    ('mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl'),
    ('mo_sin_rcnt_rev_tl_op', 'mths_since_recent_bc'),
    ('mort_acc', 'tot_hi_cred_lim'),
    ('bc_open_to_buy', 'bc_util'),
]

# IV per variable, read from the binning summary
iv_lookup = all_binning_summary_df.set_index('variable')['IV'].to_dict()

# One row per pair: both IVs (None when a variable has no IV entry) and the
# pairwise correlation recomputed on df_train
iv_corr_table = [
    {
        'var1': left,
        'IV_var1': iv_lookup.get(left),
        'var2': right,
        'IV_var2': iv_lookup.get(right),
        'corr': round(df_train[[left, right]].corr().iloc[0, 1], 4),
    }
    for left, right in correlated_pairs
]

# Display ordered by signed correlation, largest first
iv_corr_df = pd.DataFrame(iv_corr_table)
iv_corr_df.sort_values(by='corr', ascending=False)




Unnamed: 0,var1,IV_var1,var2,IV_var2,corr
2,mo_sin_rcnt_rev_tl_op,0.0271,mo_sin_rcnt_tl,0.0349,0.629
3,mo_sin_rcnt_rev_tl_op,0.0271,mths_since_recent_bc,0.0278,0.5958
0,annual_inc,0.0313,tot_hi_cred_lim,0.043,0.571
4,mort_acc,0.0343,tot_hi_cred_lim,0.043,0.5459
1,fico_range_low,0.1202,bc_open_to_buy,0.0537,0.5058
5,bc_open_to_buy,0.0537,bc_util,0.0234,-0.5017


Decision:
* `mo_sin_rcnt_rev_tl_op` and `mths_since_recent_bc` are very close in IV (0.0271 vs 0.0278). Given that `mo_sin_rcnt_rev_tl_op` is also correlated with `mo_sin_rcnt_tl`, we'll move `mo_sin_rcnt_rev_tl_op` to backup
* `tot_hi_cred_lim` is a strong proxy for total available credit, but it overlaps in meaning with both income (`annual_inc`) and mortgage account count (`mort_acc`). Moving `tot_hi_cred_lim` to backup, rather than the other two, would:
    * Keep two “known” variables that have regulatory familiarity and business interpretability.
    * Reduce the double-correlation problem since those two are only moderately correlated with each other (~0.33)
* Keep `bc_open_to_buy` given higher IV and move `bc_util` to backup

In [119]:
# Move the weaker member of each correlated pair to backup and record why
_corr_backup_reasons = {
    'mo_sin_rcnt_rev_tl_op': 'decent IV; correlated to mo_sin_rcnt_tl (0.63) and mths_since_recent_bc (0.59)',
    'tot_hi_cred_lim': 'decent IV (0.043); correlated to annual_inc (0.57) and mort_acc (0.55). The other two were kept for interpretibility',
    'bc_util': 'correlated to bc_open_to_buy (0.50). Kept bc_open_to_buy for higher IV (0.05)',
}
_summary_var = all_binning_summary_df['variable']

all_binning_summary_df.loc[_summary_var.isin(list(_corr_backup_reasons)), 'decision'] = 'keep - backup'

for _var_name, _var_reason in _corr_backup_reasons.items():
    all_binning_summary_df.loc[_summary_var == _var_name, 'reason'] = _var_reason

In [120]:
# Correlation - categorical variables

def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical series.

    Builds the contingency table of ``x`` against ``y``, takes the chi-square
    statistic without continuity correction, and applies the small-sample
    correction to both phi-squared and the table dimensions before taking
    the square root.

    Parameters
    ----------
    x, y : array-like accepted by ``pd.crosstab``

    Returns
    -------
    float
        The corrected association measure. When the corrected denominator
        is zero (e.g. one input has a single level) the division is 0/0.
    """
    contingency = pd.crosstab(x, y)
    total = contingency.sum().sum()
    n_rows, n_cols = contingency.shape

    chi2_stat = chi2_contingency(contingency, correction=False)[0]
    phi_sq = chi2_stat / total

    # Subtract the expected bias and floor at zero
    bias = ((n_cols - 1) * (n_rows - 1)) / (total - 1)
    phi_sq_adj = max(0, phi_sq - bias)

    # Corrected table dimensions
    rows_adj = n_rows - ((n_rows - 1) ** 2) / (total - 1)
    cols_adj = n_cols - ((n_cols - 1) ** 2) / (total - 1)

    denom = min((cols_adj - 1), (rows_adj - 1))
    return np.sqrt(phi_sq_adj / denom)



# Cramér's V for every pair of categorical (object) columns in X_keep.
# The variable list now comes from X_keep itself rather than the earlier global `cat_vars`
# (which was built from X_train), so the check always covers exactly the kept variables.
# The pairs are built once and the loop runs over all of them instead of a fixed count of 6.
cate_vars = list(X_keep.select_dtypes(include=['object']).columns)
two_comb = list(combinations(cate_vars, 2))

for var_x, var_y in two_comb:
    x = X_keep[var_x]
    y = X_keep[var_y]

    print(f"{var_x} - {var_y}: {round(cramers_v(x, y), 4)}")


verification_status - purpose: 0.0611
verification_status - addr_state_grouped: 0.0096
verification_status - home_ownership_grouped: 0.0273
purpose - addr_state_grouped: 0.0165
purpose - home_ownership_grouped: 0.1087
addr_state_grouped - home_ownership_grouped: 0.0651


In [121]:
# Number of variables per decision after the correlation review
all_binning_summary_df.loc[:, 'decision'].value_counts()

decision
drop             64
keep             24
keep - backup    15
Name: count, dtype: int64

## 7. Consolidate WOE Transformation

Consolidate final variable list

In [149]:
# Final variable list: everything marked 'keep' or 'keep - backup'
keep_mask = all_binning_summary_df['decision'].isin(['keep', 'keep - backup'])
final_vars = all_binning_summary_df.loc[keep_mask, 'variable'].tolist()


def _restrict_to_final(candidates):
    """Return the candidates that appear in final_vars, in iteration order."""
    return [name for name in candidates if name in final_vars]


# Names of the manually binned variables, per group
num_manual_set = set(manual_binning_tables.keys())    # numeric (manual, scorecardpy)
cat_manual_set = set(woe_table_purpose.keys())        # {'purpose'} if that's the only one
if isinstance(woe_binning_manual_flags, dict):
    flag_manual_set = set(woe_binning_manual_flags.keys())
else:
    flag_manual_set = set()

# Only keep variables that are in the keep/backup list
num_auto_vars_all = _restrict_to_final(binning_tables.keys())
num_manual_vars = _restrict_to_final(manual_binning_tables.keys())

cat_auto_vars_all = _restrict_to_final(woe_binning_cat.keys())
cat_manual_vars = _restrict_to_final(cat_manual_set)

flag_auto_vars_all = _restrict_to_final(woe_binning_flag.keys())
flag_manual_vars = _restrict_to_final(flag_manual_set)

# De-dup with precedence manual > auto: an auto entry is dropped when a manual one exists
num_auto_vars = [name for name in num_auto_vars_all if name not in num_manual_set]
cat_auto_vars = [name for name in cat_auto_vars_all if name not in cat_manual_set]
flag_auto_vars = [name for name in flag_auto_vars_all if name not in flag_manual_set]


# Total number of variables across the six groups
len(num_auto_vars + num_manual_vars + cat_auto_vars + cat_manual_vars + flag_auto_vars + flag_manual_vars)


39

In [178]:
# Manifest mapping each final variable to the binning source it came from.
# Groups are listed in precedence order: a later group overwrites an earlier label.
woe_source_map = {
    var_name: source_label
    for source_label, members in (
        ('num_manual', num_manual_vars),
        ('num_auto', num_auto_vars),
        ('cat_manual', cat_manual_vars),
        ('cat_auto', cat_auto_vars),
        ('flag_manual', flag_manual_vars),
        ('flag_auto', flag_auto_vars),
    )
    for var_name in members
}

# Save to artifacts
with open("../artifacts/woe_source_map.json", "w") as manifest_file:
    json.dump(woe_source_map, manifest_file)


Build one scorecardpy bin dict with manual overriding rule

In [159]:
# Single bin dict for the scorecardpy-style tables. Entries are merged in the order
# listed, so a later group overrides an earlier one on a name clash.
# Auto-binned numeric variables are not added here.
bins_sc = {
    # numeric: manual only (scorecardpy-style)
    **{var: manual_binning_tables[var] for var in manual_binning_tables if var in num_manual_vars},
    # categorical: auto (minus manual), then manual
    **{var: woe_binning_cat[var] for var in woe_binning_cat if var in cat_auto_vars},
    **{var: woe_table_purpose[var] for var in cat_manual_vars},
    # flags: auto (minus manual), then manual when a manual dict exists
    **{var: woe_binning_flag[var] for var in woe_binning_flag if var in flag_auto_vars},
    **(
        {var: tab for var, tab in woe_binning_manual_flags.items() if var in flag_manual_vars}
        if isinstance(woe_binning_manual_flags, dict)
        else {}
    ),
}



Define woe-transformation function

In [160]:
# Define a uniform transformation function 

def transform_to_woe(df, *, bins_sc, num_auto_vars, woe_models, final_vars, suffix='_woe'):
    """Convert raw features to WoE columns using the two binning back-ends.

    Variables listed in ``bins_sc`` are transformed in one batch with
    ``sc.woebin_ply``; variables in ``num_auto_vars`` are transformed one by
    one with ``woe_models[v].transform(..., metric='woe')``. Variables that
    are not columns of ``df`` are skipped in both paths. ``df`` itself is
    never modified.

    Parameters
    ----------
    df : pd.DataFrame
        Raw feature frame.
    bins_sc : dict
        Bin tables keyed by variable name, passed to ``sc.woebin_ply`` as ``bins``.
    num_auto_vars : iterable of str
        Variables transformed through ``woe_models``.
    woe_models : dict
        Fitted binning objects keyed by variable name; each must expose
        ``transform(values, metric='woe')``.
    final_vars : iterable of str
        Variables to return, in this order. Names for which no WoE column
        was produced are silently left out.
    suffix : str, default '_woe'
        Appended to the variable name for the per-variable outputs and used to
        select the final columns.
        NOTE(review): the scorecardpy batch presumably names its outputs
        ``<var>_woe`` on its own; with a different suffix those columns would
        not be selected. Confirm before changing the default.

    Returns
    -------
    pd.DataFrame
        Columns ``<var><suffix>`` for the requested variables.

    Raises
    ------
    ValueError
        If the two back-ends produce the same output column name.
    """
    # scorecardpy batch (manual numeric, all categorical and all flags).
    # df[sc_vars] is already a new frame, so no copy of the full input is taken.
    sc_vars = [v for v in bins_sc if v in df.columns]
    if sc_vars:
        df_sc_woe = sc.woebin_ply(df[sc_vars], bins=bins_sc)
    else:
        df_sc_woe = pd.DataFrame(index=df.index)

    # Per-variable transform (auto numeric only); build the frame in one go
    # instead of inserting column by column.
    opt_cols = {
        v + suffix: woe_models[v].transform(df[v], metric='woe')
        for v in num_auto_vars
        if v in df.columns
    }
    df_opt_woe = pd.DataFrame(opt_cols, index=df.index)

    out = pd.concat([df_sc_woe, df_opt_woe], axis=1)

    # Sanity check: the same variable must not come out of both back-ends
    dupes = out.columns[out.columns.duplicated()].tolist()
    if dupes:
        raise ValueError(f"Duplicate WOE columns found: {dupes}. Check overlap rules.")

    # Keep only the requested final vars (as <var><suffix>), in final_vars order
    final_woe_cols = [v + suffix for v in final_vars if (v + suffix) in out.columns]
    return out[final_woe_cols]

In [None]:
# WoE-transform the training features with the consolidated bin definitions
_train_woe_args = {
    'bins_sc': bins_sc,
    'num_auto_vars': num_auto_vars,
    'woe_models': woe_models,
    'final_vars': final_vars,
    'suffix': '_woe',
}
X_train_woe = transform_to_woe(X_train, **_train_woe_args)

print(X_train_woe.shape)

(1074238, 39)


In [None]:
# WoE-transform the test features with the same bin definitions used for train
_test_woe_args = {
    'bins_sc': bins_sc,
    'num_auto_vars': num_auto_vars,
    'woe_models': woe_models,
    'final_vars': final_vars,
    'suffix': '_woe',
}
X_test_woe = transform_to_woe(X_test, **_test_woe_args)

print(X_test_woe.shape)

(205520, 39)


## 8. Save Transformed Data

In [182]:
# Sanity check before saving: same column names on both sets, then total missing counts
_train_cols, _test_cols = set(X_train_woe.columns), set(X_test_woe.columns)
assert not (_train_cols ^ _test_cols)

_missing_totals = [frame.isna().sum().sum() for frame in (X_train_woe, X_test_woe)]
print(*_missing_totals)

0 0


In [180]:
# Put the target back next to the transformed features (assigned by position)
for _woe_frame, _target in ((X_train_woe, y_train), (X_test_woe, y_test)):
    _woe_frame['GB_FLAG'] = _target.values

In [181]:
# Write the transformed sets and the binning metadata to CSV (no index column)
for _frame, _csv_path in (
    (X_train_woe, '../data/processed/train_woe.csv'),
    (X_test_woe, '../data/processed/test_woe.csv'),
    # binning meta data
    (all_binning_summary_df, '../dictionaries/binning_summary.csv'),
):
    _frame.to_csv(_csv_path, index=False)

In [None]:
# Persist the binning artefacts used by transform_to_woe

# bins_sc: manual numeric, all categorical and all flag bin tables
joblib.dump(value=bins_sc, filename="../artifacts/manual_bin_models.pkl")

# woe_models: fitted binning objects for the auto-binned numeric variables
joblib.dump(value=woe_models, filename="../artifacts/numeric_optbin_models.pkl")

['../artifacts/manual_bin_models.pkl']

['../artifacts/numeric_optbin_models.pkl']