In [1]:
import fs

RAW_PROJECT_DIR = fs.open_fs("../../data/raw/project")
INTERIM_PROJECT_DIR = fs.open_fs("../../data/interim/project")
RAW_FILE_NAME = RAW_PROJECT_DIR.getsyspath("hmeq.csv")
INTERIM_FILE_NAME = INTERIM_PROJECT_DIR.getsyspath("hmeq_clean.csv")

In [2]:
import pandas as pd

data = pd.read_csv(RAW_FILE_NAME)
data.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


### Dataset Description

In [3]:
data.shape

(5960, 13)

In [4]:
data.dtypes

BAD          int64
LOAN         int64
MORTDUE    float64
VALUE      float64
REASON      object
JOB         object
YOJ        float64
DEROG      float64
DELINQ     float64
CLAGE      float64
NINQ       float64
CLNO       float64
DEBTINC    float64
dtype: object

In [5]:
X = data.drop(columns=['BAD'])
y = data['BAD']

In [6]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categoric_features = X.select_dtypes(include=['object']).columns

In [7]:
data[numeric_features].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
LOAN,5960.0,18607.969799,11207.480417,1100.0,11100.0,16300.0,23300.0,89900.0
MORTDUE,5442.0,73760.8172,44457.609458,2063.0,46276.0,65019.0,91488.0,399550.0
VALUE,5848.0,101776.048741,57385.775334,8000.0,66075.5,89235.5,119824.25,855909.0
YOJ,5445.0,8.922268,7.573982,0.0,3.0,7.0,13.0,41.0
DEROG,5252.0,0.25457,0.846047,0.0,0.0,0.0,0.0,10.0
DELINQ,5380.0,0.449442,1.127266,0.0,0.0,0.0,0.0,15.0
CLAGE,5652.0,179.766275,85.810092,0.0,115.116702,173.466667,231.562278,1168.233561
NINQ,5450.0,1.186055,1.728675,0.0,0.0,1.0,2.0,17.0
CLNO,5738.0,21.296096,10.138933,0.0,15.0,20.0,26.0,71.0
DEBTINC,4693.0,33.779915,8.601746,0.524499,29.140031,34.818262,39.003141,203.312149


In [8]:
X[categoric_features].describe().T

Unnamed: 0,count,unique,top,freq
REASON,5708,2,DebtCon,3928
JOB,5681,6,Other,2388


In [9]:
import numpy as np

null_percentage = np.round(data.isnull().sum() * 100 / len(data), 2)

missing_values_df = pd.DataFrame({'missing_values_%': null_percentage})
missing_values_df

Unnamed: 0,missing_values_%
BAD,0.0
LOAN,0.0
MORTDUE,8.69
VALUE,1.88
REASON,4.23
JOB,4.68
YOJ,8.64
DEROG,11.88
DELINQ,9.73
CLAGE,5.17


### Imputing and Scaling Values

In [10]:
from sklearn.impute import SimpleImputer

# Categorical imputing
cat_imputer = SimpleImputer(strategy='most_frequent')
data[categoric_features] = cat_imputer.fit_transform(data[categoric_features])

# Numerical imputing
num_imputer = SimpleImputer(strategy='median')
data[numeric_features] = cat_imputer.fit_transform(data[numeric_features])

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [12]:
scaler = MinMaxScaler(feature_range=(10, 100))

data[numeric_features] = scaler.fit_transform(data[numeric_features])

In [13]:
label_encoders = {}

for column in categoric_features:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

In [14]:
data

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,10.000000,15.388176,13.293101,1,2,33.048780,10.0,10.0,17.269950,15.294118,21.408451,10.000000
1,1,10.202703,25.394466,16.411065,1,2,25.365854,10.0,22.0,19.385966,10.000000,27.746479,10.000000
2,1,10.405405,12.589594,10.923448,1,2,18.780488,10.0,10.0,21.514821,15.294118,22.676056,10.000000
3,1,10.405405,19.042635,15.519460,0,2,10.000000,10.0,10.0,17.896537,10.000000,30.281690,10.000000
4,0,10.608108,31.677011,21.038920,1,1,16.585366,10.0,10.0,17.190343,10.000000,27.746479,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5955,0,98.986486,22.498748,18.723401,0,2,45.121951,10.0,10.0,27.088008,10.000000,30.281690,25.794385
5956,0,99.087838,21.890125,19.015508,0,2,45.121951,10.0,10.0,26.077510,10.000000,29.014085,25.682377
5957,0,99.290541,21.769894,19.014128,0,2,42.926829,10.0,10.0,26.353898,10.000000,29.014085,25.547733
5958,0,99.898649,20.937792,18.901297,0,2,40.731707,10.0,10.0,26.478164,10.000000,30.281690,25.008185


In [15]:
data.to_csv(INTERIM_FILE_NAME, index=False)

In [16]:
INTERIM_FILE_NAME

'/Users/miguelsepulveda/Library/CloudStorage/OneDrive-Personal/MacBook/UP/MCD/Materias/ML II/MCD-ML2/data/interim/project/hmeq_clean.csv'

### Feature Selection

In [17]:
X = data.drop(columns=['BAD'])
y = data['BAD']

In [18]:
y.dtypes

dtype('int64')

In [19]:
from varclushi import VarClusHi

vc = VarClusHi(df = X[numeric_features], feat_list=list(numeric_features))
vc.varclus()

<varclushi.varclushi.VarClusHi at 0x12652c4d0>

In [20]:
rs = vc.rsquare
rs = rs.sort_values(by=['Cluster', 'RS_Ratio']).reset_index(drop = True)
rs['id'] = rs.groupby('Cluster').cumcount()+1

In [21]:
rs

Unnamed: 0,Cluster,Variable,RS_Own,RS_NC,RS_Ratio,id
0,0,VALUE,0.871099,0.062806,0.137539,1
1,0,MORTDUE,0.807589,0.057328,0.204112,2
2,0,LOAN,0.274709,0.017093,0.737904,3
3,1,DEROG,0.428612,0.001981,0.572522,1
4,1,DELINQ,0.406415,0.017831,0.604361,2
5,1,DEBTINC,0.372143,0.022922,0.642586,3
6,1,NINQ,0.195111,0.001715,0.806272,4
7,2,CLAGE,0.609029,0.030318,0.403195,1
8,2,CLNO,0.466744,0.096256,0.590052,2
9,2,YOJ,0.281599,0.000496,0.718757,3


In [22]:
from optbinning import BinningProcess

variable_names = list(X.columns)

binning_process = BinningProcess(variable_names)
binning_process.fit(X, y)
binning_process.information(print_level=1)

(CVXPY) Sep 12 11:49:23 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Version of ortools (9.6.2534) is too old. Expected >= 9.7.0.')
optbinning (Version 0.19.0)
Copyright (c) 2019-2024 Guillermo Navas-Palencia, Apache License 2.0

  Statistics
    Number of records                   5960
    Number of variables                   12
    Target type                       binary

    Number of numerical                   12
    Number of categorical                  0
    Number of selected                    12

  Time                                0.5594 sec



In [23]:
df_bins_num  = binning_process.summary().sort_values(by = 'iv', ascending = False)
df_bins_num  = df_bins_num[df_bins_num['dtype'] == 'numerical']
df_bins_num.rename(columns = {'name': 'Variable'}, inplace = True)
df_bins_num = df_bins_num.merge(rs, how = 'left', on = 'Variable')
df_bins_num = df_bins_num.sort_values(by=['Cluster', 'iv'], ascending = [True, False]).reset_index(drop = True)
df_bins_num['id'] = rs.groupby('Cluster').cumcount()+1
#df_bins_num.to_csv("./Outputs/clusters_iv_3.csv", index = False)

In [24]:
df_bins_num

Unnamed: 0,Variable,dtype,status,selected,n_bins,iv,js,gini,quality_score,Cluster,RS_Own,RS_NC,RS_Ratio,id
0,LOAN,numerical,OPTIMAL,True,9,0.197321,0.023593,0.215622,0.000558,0.0,0.274709,0.017093,0.737904,1.0
1,VALUE,numerical,OPTIMAL,True,5,0.185015,0.022173,0.2016,0.371548,0.0,0.871099,0.062806,0.137539,2.0
2,MORTDUE,numerical,OPTIMAL,True,5,0.0508,0.00632,0.120696,0.015357,0.0,0.807589,0.057328,0.204112,3.0
3,DEBTINC,numerical,OPTIMAL,True,8,1.993927,0.223384,0.672568,6e-06,1.0,0.372143,0.022922,0.642586,1.0
4,DELINQ,numerical,OPTIMAL,True,3,0.564005,0.065611,0.323877,0.363771,1.0,0.406415,0.017831,0.604361,2.0
5,DEROG,numerical,OPTIMAL,True,2,0.33617,0.039699,0.213664,0.278185,1.0,0.428612,0.001981,0.572522,3.0
6,NINQ,numerical,OPTIMAL,True,5,0.17286,0.020902,0.199171,0.432491,1.0,0.195111,0.001715,0.806272,4.0
7,CLAGE,numerical,OPTIMAL,True,8,0.253238,0.030915,0.272021,0.042524,2.0,0.609029,0.030318,0.403195,1.0
8,YOJ,numerical,OPTIMAL,True,6,0.110197,0.013632,0.179265,0.265513,2.0,0.281599,0.000496,0.718757,2.0
9,CLNO,numerical,OPTIMAL,True,6,0.08631,0.010662,0.147126,0.076296,2.0,0.466744,0.096256,0.590052,3.0


In [25]:
df_bins_num_ok = df_bins_num[df_bins_num['id']==1][['Variable', 'dtype', 'iv', 'n_bins', 'quality_score']]
df_bins_num_ok.sort_values(by = 'iv', inplace = True, ascending = False)
print(df_bins_num_ok.shape)
df_bins_num_ok.head()

(3, 5)


Unnamed: 0,Variable,dtype,iv,n_bins,quality_score
3,DEBTINC,numerical,1.993927,8,6e-06
7,CLAGE,numerical,0.253238,8,0.042524
0,LOAN,numerical,0.197321,9,0.000558


In [26]:
df_aux = X[numeric_features].copy()

In [27]:
df_bins_num_ok_thresh = df_bins_num_ok[df_bins_num_ok['iv']>=.02]
df_bins_num_ok_thresh.reset_index(drop = True, inplace = True )
#df_bins_num_ok_thresh.to_csv('./Outputs/bins_ok_threshold_3.csv', index = False)
df_bins_num_ok_thresh.shape

(3, 5)

In [28]:
df_bins_num_ok_thresh

Unnamed: 0,Variable,dtype,iv,n_bins,quality_score
0,DEBTINC,numerical,1.993927,8,6e-06
1,CLAGE,numerical,0.253238,8,0.042524
2,LOAN,numerical,0.197321,9,0.000558
