In [41]:
import numpy as np
from utils.log import LOGGER
import category_encoders as ce
import warnings
import pandas as pd
import time
from sklearn.model_selection import StratifiedShuffleSplit ,train_test_split
import xgboost as xgb
from skopt import BayesSearchCV

In [42]:
# Read the raw loan table. low_memory=False makes pandas parse the file in a
# single pass instead of chunk by chunk when inferring column dtypes.
df = pd.read_csv(
    filepath_or_buffer='../data/loan/loan.csv',
    low_memory=False,
)

# Frequency Encoding

In [43]:

# Build `df_freq`: the modelling frame in which `purpose` and `addr_state`
# are count (frequency) encoded.
#
# The frame is created with an explicit .copy() and every column is written
# back by plain assignment, so nothing here mutates a slice of `df`. That
# removes the SettingWithCopyWarning at its source, which is why the former
# blanket `warnings.filterwarnings("ignore")` is no longer issued.

# Raw columns carried into the modelling frame. Names that are absent from
# `df` are skipped silently, as the previous drop-by-difference logic did.
# NOTE(review): 'total_rec_interest' is not used again in this cell and is
# not part of the final column order - confirm it matches the CSV header.
keep_columns = [
    'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
    'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'loan_status', 'purpose', 'addr_state', 'dti',
    'delinq_2yrs', 'mths_since_last_delinq', 'total_acc', 'out_prncp',
    'total_pymnt', 'total_rec_prncp', 'total_rec_interest',
    'total_rec_late_fee', 'acc_now_delinq',
]

# Keep every row whose application_type is not "JOINT", restricted to the
# columns above (in their original order).
df_freq = df.loc[
    df["application_type"] != "JOINT",
    [col for col in df.columns if col in keep_columns],
].copy()

# Missing values in int64/float64 columns become 0.
# NOTE(review): this also turns a missing 'mths_since_last_delinq' into 0
# months - confirm that is the intended meaning.
numeric_cols = df_freq.select_dtypes(include=['int64', 'float64']).columns
df_freq[numeric_cols] = df_freq[numeric_cols].fillna(0)

LOGGER.info("Preprocessing data...")

# employment length in years; missing or unrecognised values count as 0.
# NOTE(review): '10+ years' is encoded as 11, not 10 - confirm this is intended.
emp_length_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 11,
}
df_freq['emp_length'] = df_freq['emp_length'].map(emp_length_map).fillna(0).astype(int)

# grades A..G -> 1..7 and sub-grades A1..G5 -> 1..35 (ordinal encoding)
grades = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
subgrades = {
    f'{letter}{step}': offset * 5 + step
    for offset, letter in enumerate('ABCDEFG')
    for step in range(1, 6)
}

df_freq['grade'] = df_freq['grade'].map(grades)
df_freq['sub_grade'] = df_freq['sub_grade'].map(subgrades)
# Pull the first run of digits out of the term text; expand=False returns a
# Series rather than a one-column DataFrame.
df_freq['term'] = df_freq['term'].str.extract(r'(\d+)', expand=False).astype(int)

# homeownership
homeownership = {
    'ANY': 0,
    'MORTGAGE': -1,
    'NONE': 0,
    'OTHER': 0,
    'OWN': 2,
    'RENT': 1
}
df_freq['home_ownership'] = df_freq['home_ownership'].map(homeownership)

# verification
verification = {
    'Not Verified': -1,
    'Source Verified': 1,
    'Verified': 2
}
df_freq['verification_status'] = df_freq['verification_status'].map(verification)

# loan_status -> binary target (1 = charged off / default / late, 0 = otherwise)
# NOTE(review): 'Does not meet the credit policy. Status:Charged Off' is
# labelled 0 while plain 'Charged Off' is labelled 1 - confirm this is
# deliberate. It is left unchanged so all encodings share the same labels.
l_stat = {
    'Charged Off': 1,
    'Default': 1,
    'Does not meet the credit policy. Status:Charged Off': 0,
    'Late (16-30 days)': 1,
    'Late (31-120 days)': 1,
    'Current': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0,
    'Fully Paid': 0,
    'In Grace Period': 0,
    'Issued': 0,
}
df_freq['loan_status'] = df_freq['loan_status'].map(l_stat)

LOGGER.info('Encoding categorical features...')

# Replace each category by its count, one encoder per column.
encoder1 = ce.CountEncoder(cols=['purpose'])
df_freq = encoder1.fit_transform(df_freq)

encoder2 = ce.CountEncoder(cols=['addr_state'])
df_freq = encoder2.fit_transform(df_freq)

LOGGER.info('Feature engineering in progress...')

# loan to income; a zero income gives +inf, which is capped at 2
df_freq['loan_to_income'] = (
    (df_freq['funded_amnt'] / df_freq['annual_inc']).round(2).replace(np.inf, 2)
)

# total interest: term in years * principal * annual rate (simple interest)
df_freq['total_interest'] = (
    (df_freq['term'] / 12) * df_freq['loan_amnt'] * (df_freq['int_rate'] / 100)
).round(2)

# loan performance: amount paid back minus amount funded
df_freq['loan_performance'] = (df_freq['total_pymnt'] - df_freq['funded_amnt']).round(2)

# repayment rate: amount paid back as a fraction of the amount funded
df_freq['repayment_rate'] = (df_freq['total_pymnt'] / df_freq['funded_amnt']).round(2)

# debt-to-income ratio monthly; a zero income gives +inf, which is capped at 1
df_freq['dti_month'] = (
    (df_freq['installment'] / (df_freq['annual_inc'] / 12)).round(3).replace(np.inf, 1)
)

# format column order (target last)
columns = ['loan_amnt', 'funded_amnt', 'term', 'int_rate',
           'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
           'annual_inc', 'verification_status', 'purpose', 'addr_state', 'dti',
           'delinq_2yrs', 'mths_since_last_delinq',
           'total_acc', 'out_prncp', 'total_pymnt', 'total_rec_prncp',
           'total_rec_late_fee', 'acc_now_delinq', 'loan_to_income',
           'total_interest', 'loan_performance', 'repayment_rate', 'dti_month', 'loan_status']

df_freq = df_freq[columns]


LOGGER.info("Preprocessing finished!")



2024-03-25 20:41:56,478 INFO -- Preprocessing data...
2024-03-25 20:41:57,753 INFO -- Encoding categorical features...
2024-03-25 20:41:59,018 INFO -- Feature engineering in progress...
2024-03-25 20:41:59,068 INFO -- Preprocessing finished!


In [44]:
# Preview the first five rows of the frequency-encoded frame (head defaults to n=5).
df_freq.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,total_pymnt,total_rec_prncp,total_rec_late_fee,acc_now_delinq,loan_to_income,total_interest,loan_performance,repayment_rate,dti_month,loan_status
0,5000.0,5000.0,36,10.65,162.87,2,7,11,1,24000.0,...,5861.071414,5000.0,0.0,0.0,0.21,1597.5,861.07,1.17,0.081,0
1,2500.0,2500.0,60,15.27,59.83,3,14,0,1,30000.0,...,1008.71,456.46,0.0,0.0,0.08,1908.75,-1491.29,0.4,0.024,1
2,2400.0,2400.0,36,15.96,84.33,3,15,11,1,12252.0,...,3003.653644,2400.0,0.0,0.0,0.2,1149.12,603.65,1.25,0.083,0
3,10000.0,10000.0,36,13.49,339.31,3,11,11,1,49200.0,...,12226.302212,10000.0,16.97,0.0,0.2,4047.0,2226.3,1.22,0.083,0
4,3000.0,3000.0,60,12.69,67.79,2,10,1,1,80000.0,...,3242.17,2233.1,0.0,0.0,0.04,1903.5,242.17,1.08,0.01,0


# Target Encoding

In [45]:

# Build `df_target`: the modelling frame in which `purpose` and `addr_state`
# are target encoded against `loan_status`.
#
# The frame is created with an explicit .copy() and every column is written
# back by plain assignment, so nothing here mutates a slice of `df`. That
# removes the SettingWithCopyWarning at its source, which is why the former
# blanket `warnings.filterwarnings("ignore")` is no longer issued.

# Raw columns carried into the modelling frame. Names that are absent from
# `df` are skipped silently, as the previous drop-by-difference logic did.
# NOTE(review): 'total_rec_interest' is not used again in this cell and is
# not part of the final column order - confirm it matches the CSV header.
keep_columns = [
    'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
    'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'loan_status', 'purpose', 'addr_state', 'dti',
    'delinq_2yrs', 'mths_since_last_delinq', 'total_acc', 'out_prncp',
    'total_pymnt', 'total_rec_prncp', 'total_rec_interest',
    'total_rec_late_fee', 'acc_now_delinq',
]

# Keep every row whose application_type is not "JOINT", restricted to the
# columns above (in their original order).
df_target = df.loc[
    df["application_type"] != "JOINT",
    [col for col in df.columns if col in keep_columns],
].copy()

# Missing values in int64/float64 columns become 0.
# NOTE(review): this also turns a missing 'mths_since_last_delinq' into 0
# months - confirm that is the intended meaning.
numeric_cols = df_target.select_dtypes(include=['int64', 'float64']).columns
df_target[numeric_cols] = df_target[numeric_cols].fillna(0)

LOGGER.info("Preprocessing data...")

# employment length in years; missing or unrecognised values count as 0.
# NOTE(review): '10+ years' is encoded as 11, not 10 - confirm this is intended.
emp_length_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 11,
}
df_target['emp_length'] = df_target['emp_length'].map(emp_length_map).fillna(0).astype(int)

# grades A..G -> 1..7 and sub-grades A1..G5 -> 1..35 (ordinal encoding)
grades = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
subgrades = {
    f'{letter}{step}': offset * 5 + step
    for offset, letter in enumerate('ABCDEFG')
    for step in range(1, 6)
}

df_target['grade'] = df_target['grade'].map(grades)
df_target['sub_grade'] = df_target['sub_grade'].map(subgrades)
# Pull the first run of digits out of the term text; expand=False returns a
# Series rather than a one-column DataFrame.
df_target['term'] = df_target['term'].str.extract(r'(\d+)', expand=False).astype(int)

# homeownership
homeownership = {
    'ANY': 0,
    'MORTGAGE': -1,
    'NONE': 0,
    'OTHER': 0,
    'OWN': 2,
    'RENT': 1
}
df_target['home_ownership'] = df_target['home_ownership'].map(homeownership)

# verification
verification = {
    'Not Verified': -1,
    'Source Verified': 1,
    'Verified': 2
}
df_target['verification_status'] = df_target['verification_status'].map(verification)

# loan_status -> binary target (1 = charged off / default / late, 0 = otherwise)
# NOTE(review): 'Does not meet the credit policy. Status:Charged Off' is
# labelled 0 while plain 'Charged Off' is labelled 1 - confirm this is
# deliberate. It is left unchanged so all encodings share the same labels.
l_stat = {
    'Charged Off': 1,
    'Default': 1,
    'Does not meet the credit policy. Status:Charged Off': 0,
    'Late (16-30 days)': 1,
    'Late (31-120 days)': 1,
    'Current': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0,
    'Fully Paid': 0,
    'In Grace Period': 0,
    'Issued': 0,
}
df_target['loan_status'] = df_target['loan_status'].map(l_stat)

#---------------------------------------------------
LOGGER.info('Encoding categorical features...')

# Replace each category by a statistic of the target, one encoder per column.
# NOTE(review): both encoders are fitted on every row of df_target, i.e.
# before any train/validation/test split, so rows later used for evaluation
# contribute their labels to the encoding (target leakage).
encoder1 = ce.TargetEncoder(cols=['purpose'])
df_target = encoder1.fit_transform(df_target, df_target['loan_status'])

encoder2 = ce.TargetEncoder(cols=['addr_state'])
df_target = encoder2.fit_transform(df_target, df_target['loan_status'])

LOGGER.info('Feature engineering in progress...')

# loan to income; a zero income gives +inf, which is capped at 2
df_target['loan_to_income'] = (
    (df_target['funded_amnt'] / df_target['annual_inc']).round(2).replace(np.inf, 2)
)

# total interest: term in years * principal * annual rate (simple interest)
df_target['total_interest'] = (
    (df_target['term'] / 12) * df_target['loan_amnt'] * (df_target['int_rate'] / 100)
).round(2)

# loan performance: amount paid back minus amount funded
df_target['loan_performance'] = (df_target['total_pymnt'] - df_target['funded_amnt']).round(2)

# repayment rate: amount paid back as a fraction of the amount funded
df_target['repayment_rate'] = (df_target['total_pymnt'] / df_target['funded_amnt']).round(2)

# debt-to-income ratio monthly; a zero income gives +inf, which is capped at 1
df_target['dti_month'] = (
    (df_target['installment'] / (df_target['annual_inc'] / 12)).round(3).replace(np.inf, 1)
)

# format column order (target last)
columns = ['loan_amnt', 'funded_amnt', 'term', 'int_rate',
           'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
           'annual_inc', 'verification_status', 'purpose', 'addr_state', 'dti',
           'delinq_2yrs', 'mths_since_last_delinq',
           'total_acc', 'out_prncp', 'total_pymnt', 'total_rec_prncp',
           'total_rec_late_fee', 'acc_now_delinq', 'loan_to_income',
           'total_interest', 'loan_performance', 'repayment_rate', 'dti_month', 'loan_status']

df_target = df_target[columns]


LOGGER.info("Preprocessing finished!")



2024-03-25 20:42:00,212 INFO -- Preprocessing data...
2024-03-25 20:42:01,293 INFO -- Encoding categorical features...
2024-03-25 20:42:02,759 INFO -- Feature engineering in progress...
2024-03-25 20:42:02,815 INFO -- Preprocessing finished!


In [46]:
# Preview the first five rows of the target-encoded frame (head defaults to n=5).
df_target.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,total_pymnt,total_rec_prncp,total_rec_late_fee,acc_now_delinq,loan_to_income,total_interest,loan_performance,repayment_rate,dti_month,loan_status
0,5000.0,5000.0,36,10.65,162.87,2,7,11,1,24000.0,...,5861.071414,5000.0,0.0,0.0,0.21,1597.5,861.07,1.17,0.081,0
1,2500.0,2500.0,60,15.27,59.83,3,14,0,1,30000.0,...,1008.71,456.46,0.0,0.0,0.08,1908.75,-1491.29,0.4,0.024,1
2,2400.0,2400.0,36,15.96,84.33,3,15,11,1,12252.0,...,3003.653644,2400.0,0.0,0.0,0.2,1149.12,603.65,1.25,0.083,0
3,10000.0,10000.0,36,13.49,339.31,3,11,11,1,49200.0,...,12226.302212,10000.0,16.97,0.0,0.2,4047.0,2226.3,1.22,0.083,0
4,3000.0,3000.0,60,12.69,67.79,2,10,1,1,80000.0,...,3242.17,2233.1,0.0,0.0,0.04,1903.5,242.17,1.08,0.01,0


# One Hot Encoding

In [47]:

# Build `df_OH`: the modelling frame in which `purpose` and `addr_state`
# are one-hot encoded.
#
# The frame is created with an explicit .copy() and every column is written
# back by plain assignment, so nothing here mutates a slice of `df`. That
# removes the SettingWithCopyWarning at its source, which is why the former
# blanket `warnings.filterwarnings("ignore")` is no longer issued.

# Raw columns carried into the modelling frame. Names that are absent from
# `df` are skipped silently, as the previous drop-by-difference logic did.
# NOTE(review): 'total_rec_interest' is not used again in this cell. Unlike
# the other encodings, this frame is not restricted to a fixed column order
# afterwards, so the column stays in df_OH if the CSV has it - confirm the
# name against the CSV header.
keep_columns = [
    'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment',
    'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc',
    'verification_status', 'loan_status', 'purpose', 'addr_state', 'dti',
    'delinq_2yrs', 'mths_since_last_delinq', 'total_acc', 'out_prncp',
    'total_pymnt', 'total_rec_prncp', 'total_rec_interest',
    'total_rec_late_fee', 'acc_now_delinq',
]

# Keep every row whose application_type is not "JOINT", restricted to the
# columns above (in their original order).
df_OH = df.loc[
    df["application_type"] != "JOINT",
    [col for col in df.columns if col in keep_columns],
].copy()

# Missing values in int64/float64 columns become 0.
# NOTE(review): this also turns a missing 'mths_since_last_delinq' into 0
# months - confirm that is the intended meaning.
numeric_cols = df_OH.select_dtypes(include=['int64', 'float64']).columns
df_OH[numeric_cols] = df_OH[numeric_cols].fillna(0)

LOGGER.info("Preprocessing data...")

# employment length in years; missing or unrecognised values count as 0.
# NOTE(review): '10+ years' is encoded as 11, not 10 - confirm this is intended.
emp_length_map = {
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 11,
}
df_OH['emp_length'] = df_OH['emp_length'].map(emp_length_map).fillna(0).astype(int)

# grades A..G -> 1..7 and sub-grades A1..G5 -> 1..35 (ordinal encoding)
grades = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
subgrades = {
    f'{letter}{step}': offset * 5 + step
    for offset, letter in enumerate('ABCDEFG')
    for step in range(1, 6)
}

df_OH['grade'] = df_OH['grade'].map(grades)
df_OH['sub_grade'] = df_OH['sub_grade'].map(subgrades)
# Pull the first run of digits out of the term text; expand=False returns a
# Series rather than a one-column DataFrame.
df_OH['term'] = df_OH['term'].str.extract(r'(\d+)', expand=False).astype(int)

# homeownership
homeownership = {
    'ANY': 0,
    'MORTGAGE': -1,
    'NONE': 0,
    'OTHER': 0,
    'OWN': 2,
    'RENT': 1
}
df_OH['home_ownership'] = df_OH['home_ownership'].map(homeownership)

# verification
verification = {
    'Not Verified': -1,
    'Source Verified': 1,
    'Verified': 2
}
df_OH['verification_status'] = df_OH['verification_status'].map(verification)

# loan_status -> binary target (1 = charged off / default / late, 0 = otherwise)
# NOTE(review): 'Does not meet the credit policy. Status:Charged Off' is
# labelled 0 while plain 'Charged Off' is labelled 1 - confirm this is
# deliberate. It is left unchanged so all encodings share the same labels.
l_stat = {
    'Charged Off': 1,
    'Default': 1,
    'Does not meet the credit policy. Status:Charged Off': 0,
    'Late (16-30 days)': 1,
    'Late (31-120 days)': 1,
    'Current': 0,
    'Does not meet the credit policy. Status:Fully Paid': 0,
    'Fully Paid': 0,
    'In Grace Period': 0,
    'Issued': 0,
}
df_OH['loan_status'] = df_OH['loan_status'].map(l_stat)

LOGGER.info('Encoding categorical features...')

# One 0/1 integer indicator column per distinct purpose / addr_state value.
df_OH = pd.get_dummies(df_OH, columns=['purpose', 'addr_state'], dtype=int)


LOGGER.info('Feature engineering in progress...')

# loan to income; a zero income gives +inf, which is capped at 2
df_OH['loan_to_income'] = (
    (df_OH['funded_amnt'] / df_OH['annual_inc']).round(2).replace(np.inf, 2)
)

# total interest: term in years * principal * annual rate (simple interest)
df_OH['total_interest'] = (
    (df_OH['term'] / 12) * df_OH['loan_amnt'] * (df_OH['int_rate'] / 100)
).round(2)

# loan performance: amount paid back minus amount funded
df_OH['loan_performance'] = (df_OH['total_pymnt'] - df_OH['funded_amnt']).round(2)

# repayment rate: amount paid back as a fraction of the amount funded
df_OH['repayment_rate'] = (df_OH['total_pymnt'] / df_OH['funded_amnt']).round(2)

# debt-to-income ratio monthly; a zero income gives +inf, which is capped at 1
df_OH['dti_month'] = (
    (df_OH['installment'] / (df_OH['annual_inc'] / 12)).round(3).replace(np.inf, 1)
)


LOGGER.info("Preprocessing finished!")


2024-03-25 20:42:03,882 INFO -- Preprocessing data...
2024-03-25 20:42:04,855 INFO -- Encoding categorical features...
2024-03-25 20:42:05,398 INFO -- Feature engineering in progress...
2024-03-25 20:42:05,423 INFO -- Preprocessing finished!


In [48]:
# Preview the first five rows of the one-hot-encoded frame (head defaults to n=5).
df_OH.head()

Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,...,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,loan_to_income,total_interest,loan_performance,repayment_rate,dti_month
0,5000.0,5000.0,36,10.65,162.87,2,7,11,1,24000.0,...,0,0,0,0,0,0.21,1597.5,861.07,1.17,0.081
1,2500.0,2500.0,60,15.27,59.83,3,14,0,1,30000.0,...,0,0,0,0,0,0.08,1908.75,-1491.29,0.4,0.024
2,2400.0,2400.0,36,15.96,84.33,3,15,11,1,12252.0,...,0,0,0,0,0,0.2,1149.12,603.65,1.25,0.083
3,10000.0,10000.0,36,13.49,339.31,3,11,11,1,49200.0,...,0,0,0,0,0,0.2,4047.0,2226.3,1.22,0.083
4,3000.0,3000.0,60,12.69,67.79,2,10,1,1,80000.0,...,0,0,0,0,0,0.04,1903.5,242.17,1.08,0.01


# Runtime Test: Frequency Encoding

In [49]:
# Runtime benchmark, frequency encoding: time a Bayesian hyper-parameter
# search for an XGBoost classifier on a 10% sample of `df_freq`.

# Stratified 10% subsample (the "test" side of the split). It is bound to its
# own name instead of overwriting `df_freq`, so re-running this cell always
# samples from the full frame rather than shrinking the data again.
df_freq_sample = train_test_split(df_freq,
                                  test_size=0.1,
                                  stratify=df_freq['loan_status'],
                                  random_state=77)[1]

# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
X = df_freq_sample.drop(columns='loan_status')
y = df_freq_sample['loan_status']

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=77)
LOGGER.info('Splitting data with StratifiedShuffleSplit...')

# Split data into train and test (50/50)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

LOGGER.info('Getting test, train and validation sets...')
# Further split test data into validation and test (50/50 again)
sss_val_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=77)
val_index, test_index = next(sss_val_test.split(X_test, y_test))
X_val, X_test = X_test.iloc[val_index], X_test.iloc[test_index]
y_val, y_test = y_test.iloc[val_index], y_test.iloc[test_index]

# The validation set is passed to fit() and drives early stopping.
eval_set = [(X_val, y_val)]

clf = xgb.sklearn.XGBClassifier(
    objective="binary:logistic",
    random_state=7777,  # replaces the legacy `seed` keyword
    eval_metric='auc',
    early_stopping_rounds=20)

# Plain lists are treated as categorical choices by BayesSearchCV.
param_space = {
    "learning_rate": [0.01, 0.03, 0.04, 0.05, 0.1, 0.25, 0.35, 0.5],
    "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "gamma": [0, 0.25, 0.5, 0.75, 1],
    "n_estimators": [100, 200, 300, 400, 450, 500]
}

LOGGER.info('initiating BayesSearchCV...')
# random_state fixes the sequence of candidates the optimiser proposes, so
# the search (and the score printed below) is repeatable between runs.
bayes_search1 = BayesSearchCV(clf, search_spaces=param_space,
                              n_iter=10, scoring='roc_auc',
                              cv=5, verbose=1,
                              n_jobs=-1,
                              random_state=77)

bayes_search1.fit(X_train, y_train, eval_set=eval_set, verbose=True)
end_time = time.perf_counter()
runtime1 = end_time - start_time  # Calculate runtime in seconds
print(f"Runtime of Frequency Encoding: {runtime1} seconds with best ROC_AUC of {bayes_search1.best_score_}")

2024-03-25 20:42:05,781 INFO -- Splitting data with StratifiedShuffleSplit...
2024-03-25 20:42:05,809 INFO -- Getting test, train and validation sets...
2024-03-25 20:42:05,821 INFO -- initiating BayesSearchCV...


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[0]	validation_0-auc:0.91263
[0]	validation_0-auc:0.91355
[0]	validation_0-auc:0.91232
[0]	validation_0-auc:0.91478
[0]	validation_0-auc:0.91533
[1]	validation_0-auc:0.91990
[1]	validation_0-auc:0.91519
[1]	validation_0-auc:0.91267
[1]	validation_0-auc:0.91783
[1]	validation_0-auc:0.92380
[2]	validation_0-auc:0.92174
[2]	validation_0-auc:0.91526
[2]	validation_0-auc:0.91491
[2]	validation_0-auc:0.93080
[3]	validation_0-auc:0.91501
[3]	validation_0-auc:0.91451
[3]	validation_0-auc:0.92157
[2]	validation_0-auc:0.93009
[3]	validation_0-auc:0.93114
[4]	validation_0-auc:0.91480
[3]	validation_0-auc:0.93013
[4]	validation_0-auc:0.92160
[4]	validation_0-auc:0.91518
[4]	validation_0-auc:0.93117
[5]	validation_0-auc:0.91565
[5]	validation_0-auc:0.92443
[4]	validation_0-auc:0.93063
[5]	validation_0-auc:0.91519
[5]	validation_0-auc:0.93135
[6]	validation_0-auc:0.92511
[5]	validation_0-auc:0.93056
[6]	validation_0-auc:0.92884
[6]	validatio

# Runtime Test: Target Encoding

In [50]:
# Runtime benchmark, target encoding: time a Bayesian hyper-parameter
# search for an XGBoost classifier on a 10% sample of `df_target`.

# Stratified 10% subsample (the "test" side of the split). It is bound to its
# own name instead of overwriting `df_target`, so re-running this cell always
# samples from the full frame rather than shrinking the data again.
df_target_sample = train_test_split(df_target,
                                    test_size=0.1,
                                    stratify=df_target['loan_status'],
                                    random_state=77)[1]

# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
X = df_target_sample.drop(columns='loan_status')
y = df_target_sample['loan_status']

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=77)
LOGGER.info('Splitting data with StratifiedShuffleSplit...')

# Split data into train and test (50/50)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

LOGGER.info('Getting test, train and validation sets...')
# Further split test data into validation and test (50/50 again)
sss_val_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=77)
val_index, test_index = next(sss_val_test.split(X_test, y_test))
X_val, X_test = X_test.iloc[val_index], X_test.iloc[test_index]
y_val, y_test = y_test.iloc[val_index], y_test.iloc[test_index]

# The validation set is passed to fit() and drives early stopping.
eval_set = [(X_val, y_val)]

clf = xgb.sklearn.XGBClassifier(
    objective="binary:logistic",
    random_state=7777,  # replaces the legacy `seed` keyword
    eval_metric='auc',
    early_stopping_rounds=20)

# Plain lists are treated as categorical choices by BayesSearchCV.
param_space = {
    "learning_rate": [0.01, 0.03, 0.04, 0.05, 0.1, 0.25, 0.35, 0.5],
    "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "gamma": [0, 0.25, 0.5, 0.75, 1],
    "n_estimators": [100, 200, 300, 400, 450, 500]
}

LOGGER.info('initiating BayesSearchCV...')
# random_state fixes the sequence of candidates the optimiser proposes, so
# the search (and the score printed below) is repeatable between runs.
bayes_search2 = BayesSearchCV(clf, search_spaces=param_space,
                              n_iter=10, scoring='roc_auc',
                              cv=5, verbose=1,
                              n_jobs=-1,
                              random_state=77)

bayes_search2.fit(X_train, y_train, eval_set=eval_set, verbose=True)
end_time = time.perf_counter()
runtime2 = end_time - start_time  # Calculate runtime in seconds
print(f"Runtime of Target Encoding: {runtime2} seconds with best ROC_AUC of {bayes_search2.best_score_}")


2024-03-25 20:42:44,139 INFO -- Splitting data with StratifiedShuffleSplit...
2024-03-25 20:42:44,164 INFO -- Getting test, train and validation sets...
2024-03-25 20:42:44,176 INFO -- initiating BayesSearchCV...


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[0]	validation_0-auc:0.91290
[0]	validation_0-auc:0.91136
[0]	validation_0-auc:0.91236
[1]	validation_0-auc:0.92028
[0]	validation_0-auc:0.91202
[0]	validation_0-auc:0.91128
[1]	validation_0-auc:0.91952
[1]	validation_0-auc:0.92020
[2]	validation_0-auc:0.92197
[1]	validation_0-auc:0.92063
[2]	validation_0-auc:0.92150
[1]	validation_0-auc:0.91692
[2]	validation_0-auc:0.92135
[3]	validation_0-auc:0.92389
[2]	validation_0-auc:0.92612
[3]	validation_0-auc:0.92263
[2]	validation_0-auc:0.92539
[3]	validation_0-auc:0.92129
[4]	validation_0-auc:0.92486
[4]	validation_0-auc:0.92256
[3]	validation_0-auc:0.92661
[3]	validation_0-auc:0.92624
[4]	validation_0-auc:0.92128
[5]	validation_0-auc:0.92768
[5]	validation_0-auc:0.92319
[4]	validation_0-auc:0.92581
[5]	validation_0-auc:0.92344
[4]	validation_0-auc:0.92567
[6]	validation_0-auc:0.92963
[6]	validation_0-auc:0.92853
[5]	validation_0-auc:0.92754
[6]	validation_0-auc:0.92627
[5]	validatio

# Runtime Test: One Hot Encoding

In [51]:
# Runtime benchmark, one-hot encoding: time a Bayesian hyper-parameter
# search for an XGBoost classifier on a 10% sample of `df_OH`.

# Stratified 10% subsample (the "test" side of the split). It is bound to its
# own name instead of overwriting `df_OH`, so re-running this cell always
# samples from the full frame rather than shrinking the data again.
df_OH_sample = train_test_split(df_OH,
                                test_size=0.1,
                                stratify=df_OH['loan_status'],
                                random_state=77)[1]

# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
X = df_OH_sample.drop(columns='loan_status')
y = df_OH_sample['loan_status']

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=77)
LOGGER.info('Splitting data with StratifiedShuffleSplit...')

# Split data into train and test (50/50)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

LOGGER.info('Getting test, train and validation sets...')
# Further split test data into validation and test (50/50 again)
sss_val_test = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=77)
val_index, test_index = next(sss_val_test.split(X_test, y_test))
X_val, X_test = X_test.iloc[val_index], X_test.iloc[test_index]
y_val, y_test = y_test.iloc[val_index], y_test.iloc[test_index]

# The validation set is passed to fit() and drives early stopping.
eval_set = [(X_val, y_val)]

clf = xgb.sklearn.XGBClassifier(
    objective="binary:logistic",
    random_state=7777,  # replaces the legacy `seed` keyword
    eval_metric='auc',
    early_stopping_rounds=20)

# Plain lists are treated as categorical choices by BayesSearchCV.
param_space = {
    "learning_rate": [0.01, 0.03, 0.04, 0.05, 0.1, 0.25, 0.35, 0.5],
    "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "gamma": [0, 0.25, 0.5, 0.75, 1],
    "n_estimators": [100, 200, 300, 400, 450, 500]
}

LOGGER.info('initiating BayesSearchCV...')
# random_state fixes the sequence of candidates the optimiser proposes, so
# the search (and the score printed below) is repeatable between runs.
bayes_search3 = BayesSearchCV(clf, search_spaces=param_space,
                              n_iter=10, scoring='roc_auc',
                              cv=5, verbose=1,
                              n_jobs=-1,
                              random_state=77)

bayes_search3.fit(X_train, y_train, eval_set=eval_set, verbose=True)
end_time = time.perf_counter()
runtime3 = end_time - start_time  # Calculate runtime in seconds
print(f"Runtime of One Hot Encoding: {runtime3} seconds with best ROC_AUC of {bayes_search3.best_score_}")


2024-03-25 20:43:47,643 INFO -- Splitting data with StratifiedShuffleSplit...
2024-03-25 20:43:47,700 INFO -- Getting test, train and validation sets...
2024-03-25 20:43:47,719 INFO -- initiating BayesSearchCV...


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[0]	validation_0-auc:0.91226
[0]	validation_0-auc:0.91242
[0]	validation_0-auc:0.91359
[0]	validation_0-auc:0.91359
[0]	validation_0-auc:0.91085
[1]	validation_0-auc:0.91947
[1]	validation_0-auc:0.92182
[1]	validation_0-auc:0.92083
[1]	validation_0-auc:0.92182
[1]	validation_0-auc:0.91868
[2]	validation_0-auc:0.92502
[2]	validation_0-auc:0.92453
[2]	validation_0-auc:0.92337
[2]	validation_0-auc:0.92696
[2]	validation_0-auc:0.92447
[3]	validation_0-auc:0.92602
[3]	validation_0-auc:0.92753
[3]	validation_0-auc:0.92542
[3]	validation_0-auc:0.92749
[3]	validation_0-auc:0.92621
[4]	validation_0-auc:0.92741
[4]	validation_0-auc:0.92716
[4]	validation_0-auc:0.92341
[4]	validation_0-auc:0.92889
[4]	validation_0-auc:0.92976
[5]	validation_0-auc:0.92990
[5]	validation_0-auc:0.92934
[5]	validation_0-auc:0.92812
[5]	validation_0-auc:0.92829
[5]	validation_0-auc:0.92991
[6]	validation_0-auc:0.92979
[6]	validation_0-auc:0.93038
[6]	validatio

In [52]:
# Side-by-side summary of the three search runtimes, one line per encoding.
print(
    f"Runtime of Frequency Encoding: {runtime1} seconds",
    f"Runtime of Target Encoding: {runtime2} seconds",
    f"Runtime of One Hot Encoding: {runtime3} seconds",
    sep="\n",
)


Runtime of Frequency Encoding: 37.88290977478027 seconds
Runtime of Target Encoding: 62.00072503089905 seconds
Runtime of One Hot Encoding: 136.36267805099487 seconds
