In [43]:
####
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
import yaml
import pickle
import joblib
import os
import src.utils as utils
import importlib
importlib.reload(utils)

## load configuration file
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)
root_path = os.getcwd()

In [3]:
## load constant variables
TARGET_COL = config['data_source']['target_col']
TEST_SZ = config['data_source']['test_size']
SEED_ST = config['data_source']['random_state']
NUM_COLS = config['data_source']['num_cols']
CAT_COLS = config['data_source']['cat_cols']
CORR_COLS = config['data_source']['corr_cols']
X_TRAIN_PATH = config['pickle_path']['X_train']
X_TEST_PATH = config['pickle_path']['X_test']
X_VALID_PATH = config['pickle_path']['X_valid']
y_TRAIN_PATH = config['pickle_path']['y_train']
y_TEST_PATH = config['pickle_path']['y_test']
y_VALID_PATH = config['pickle_path']['y_valid']

## Desirialize Pickle Data
var_name = ["X_train", "X_test", "X_valid", "y_train", "y_test", "y_valid"]
stock_list = [X_TRAIN_PATH, X_TEST_PATH, X_VALID_PATH, y_TRAIN_PATH, y_TEST_PATH, y_VALID_PATH]
for var_name, path in zip(var_name, stock_list):
    globals()[var_name] = utils.deserialize_data(path)

In [4]:
"""Testing Desirialize"""
print(f"Features train shape:{X_train.shape}")
print(f"\nFeatures columns:{X_train.columns}")
X_train.head()

Features train shape:(26064, 11)

Features columns:Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length'],
      dtype='object')


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
29762,45,37500,MORTGAGE,1.0,DEBTCONSOLIDATION,B,5000,11.49,0.13,N,16
2714,25,50000,RENT,5.0,PERSONAL,A,12000,7.88,0.24,N,2
50,24,78000,RENT,4.0,DEBTCONSOLIDATION,D,30000,,0.38,Y,4
28458,31,78504,RENT,2.0,EDUCATION,C,10000,11.41,0.13,N,7
3674,26,14000,RENT,2.0,VENTURE,B,4000,,0.29,N,3


In [5]:
X_train, y_train = utils.drop_duplicate_data(X_train, y_train)

drop_duplicate_data: Parameters have been validated

drop_duplicate_data: data shape before dropping : (26064, 11)

drop_duplicate_data: duplicated data have the following shape : (110, 11)

drop_duplicate_data: data shape have after dropping should be : (25954, 11)

Data shape after dropping duplicates are: (25954, 11)


In [6]:
subset_data = X_train.select_dtypes(include=np.number).columns.tolist()
print(subset_data)

['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']


In [7]:
subset_data = utils.median_imputation(X_train, subset_data, fit=True)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {'person_age': 26.0, 'person_income': 55000.0, 'person_emp_length': 4.0, 'loan_amnt': 8000.0, 'loan_int_rate': 10.99, 'loan_percent_income': 0.15, 'cb_person_cred_hist_length': 4.0}.


In [8]:
X_train = utils.median_imputation(X_train, subset_data, False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              707
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 2474
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



In [9]:
X_test = utils.median_imputation(X_test, subset_data, False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length              90
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 312
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



In [10]:
X_valid = utils.median_imputation(X_valid, subset_data, False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length              93
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 319
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



## Categorical Encoding

### X_train

In [31]:
## unpack OneHotEncoder Path
path_home_ownership = config['encoding_path']['home_owner']
path_loan_intent = config['encoding_path']['loan_intent']
path_loan_grade = config['encoding_path']['loan_grade']
path_default_history = config['encoding_path']['record_default']

In [24]:
## create variable for each categorical columns
person_home_ownership, loan_intent, loan_grade, cb_person_defailt_on_file = [], [], [], []
cat_list = [person_home_ownership, loan_intent, loan_grade, cb_person_defailt_on_file]

for idx in range(len(cat_list)):
    cat_list[idx].extend(X_train[CAT_COLS[idx]].to_list())

['MORTGAGE', 'RENT', 'RENT', 'RENT', 'RENT', 'MORTGAGE', 'OWN', 'MORTGAGE', 'RENT', 'MORTGAGE', 'RENT', 'MORTGAGE', 'RENT', 'RENT', 'RENT', 'MORTGAGE', 'RENT', 'MORTGAGE', 'MORTGAGE', 'OWN', 'RENT', 'RENT', 'MORTGAGE', 'RENT', 'RENT', 'OWN', 'MORTGAGE', 'RENT', 'RENT', 'RENT', 'MORTGAGE', 'RENT', 'RENT', 'RENT', 'RENT', 'RENT', 'MORTGAGE', 'RENT', 'MORTGAGE', 'RENT', 'OWN', 'RENT', 'MORTGAGE', 'MORTGAGE', 'MORTGAGE', 'RENT', 'MORTGAGE', 'RENT', 'RENT', 'MORTGAGE', 'RENT', 'OWN', 'OWN', 'MORTGAGE', 'MORTGAGE', 'RENT', 'RENT', 'MORTGAGE', 'MORTGAGE', 'RENT', 'MORTGAGE', 'RENT', 'MORTGAGE', 'RENT', 'RENT', 'MORTGAGE', 'RENT', 'MORTGAGE', 'OWN', 'MORTGAGE', 'RENT', 'RENT', 'MORTGAGE', 'OWN', 'RENT', 'MORTGAGE', 'MORTGAGE', 'MORTGAGE', 'MORTGAGE', 'RENT', 'MORTGAGE', 'MORTGAGE', 'MORTGAGE', 'MORTGAGE', 'MORTGAGE', 'RENT', 'MORTGAGE', 'MORTGAGE', 'RENT', 'RENT', 'RENT', 'RENT', 'RENT', 'RENT', 'MORTGAGE', 'RENT', 'RENT', 'OWN', 'OWN', 'RENT', 'MORTGAGE', 'RENT', 'RENT', 'OWN', 'RENT', 'MORTG

In [32]:
ohe_home_ownership = utils.create_onehot_encoder(person_home_ownership, path_home_ownership)

Fitted Categories are: ['MORTGAGE', 'OTHER', 'OWN', 'RENT']


In [33]:
ohe_loan_intent = utils.create_onehot_encoder(loan_intent, path_loan_intent)

Fitted Categories are: ['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE']


In [34]:
ohe_loan_grade = utils.create_onehot_encoder(loan_grade, path_loan_grade)

Fitted Categories are: ['A', 'B', 'C', 'D', 'E', 'F', 'G']


In [35]:
ohe_default_on_file = utils.create_onehot_encoder(cb_person_defailt_on_file, path_default_history)

Fitted Categories are: ['N', 'Y']


In [45]:
X_train = utils.ohe_transform(X_train, "person_home_ownership", "home_ownership", ohe_home_ownership)
X_train

ohe_transform: params have been validated.

ohe_transform: The list of column names before coding is ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length'].

ohe_transform: The list of column names after coding is ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].




Unnamed: 0,person_age,person_income,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT
29762,45,37500,1.0,DEBTCONSOLIDATION,B,5000,11.49,0.13,N,16,1.0,0.0,0.0,0.0
2714,25,50000,5.0,PERSONAL,A,12000,7.88,0.24,N,2,0.0,0.0,0.0,1.0
50,24,78000,4.0,DEBTCONSOLIDATION,D,30000,10.99,0.38,Y,4,0.0,0.0,0.0,1.0
28458,31,78504,2.0,EDUCATION,C,10000,11.41,0.13,N,7,0.0,0.0,0.0,1.0
3674,26,14000,2.0,VENTURE,B,4000,10.99,0.29,N,3,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647,22,47000,3.0,DEBTCONSOLIDATION,B,6850,10.65,0.15,N,4,0.0,0.0,0.0,1.0
1120,22,40000,1.0,DEBTCONSOLIDATION,B,17000,10.65,0.42,N,4,0.0,0.0,0.0,1.0
19531,27,34000,12.0,VENTURE,C,10625,13.47,0.31,Y,10,1.0,0.0,0.0,0.0
5581,26,74500,8.0,VENTURE,B,5000,11.36,0.07,N,2,0.0,0.0,0.0,1.0


In [48]:
X_train = utils.ohe_transform(X_train, "loan_intent", "loan_intent", ohe_loan_intent)
X_train

ohe_transform: params have been validated.

ohe_transform: The list of column names before coding is ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].

ohe_transform: The list of column names after coding is ['person_age', 'person_income', 'person_emp_length', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE'].




Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
29762,45,37500,1.0,B,5000,11.49,0.13,N,16,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2714,25,50000,5.0,A,12000,7.88,0.24,N,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
50,24,78000,4.0,D,30000,10.99,0.38,Y,4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
28458,31,78504,2.0,C,10000,11.41,0.13,N,7,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3674,26,14000,2.0,B,4000,10.99,0.29,N,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647,22,47000,3.0,B,6850,10.65,0.15,N,4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1120,22,40000,1.0,B,17000,10.65,0.42,N,4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
19531,27,34000,12.0,C,10625,13.47,0.31,Y,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5581,26,74500,8.0,B,5000,11.36,0.07,N,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [50]:
X_train = utils.ohe_transform(X_train, "loan_grade", "loan_grade", ohe_loan_grade)
X_train

ohe_transform: params have been validated.

ohe_transform: The list of column names before coding is ['person_age', 'person_income', 'person_emp_length', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE'].

ohe_transform: The list of column names after coding is ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_inte



Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,home_ownership_MORTGAGE,home_ownership_OTHER,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G
29762,45,37500,1.0,5000,11.49,0.13,N,16,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2714,25,50000,5.0,12000,7.88,0.24,N,2,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50,24,78000,4.0,30000,10.99,0.38,Y,4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28458,31,78504,2.0,10000,11.41,0.13,N,7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3674,26,14000,2.0,4000,10.99,0.29,N,3,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647,22,47000,3.0,6850,10.65,0.15,N,4,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1120,22,40000,1.0,17000,10.65,0.42,N,4,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19531,27,34000,12.0,10625,13.47,0.31,Y,10,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5581,26,74500,8.0,5000,11.36,0.07,N,2,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [52]:
X_train = utils.ohe_transform(X_train, "cb_person_default_on_file", "default_onfile", ohe_default_on_file)
X_train

ohe_transform: params have been validated.

ohe_transform: The list of column names before coding is ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE', 'loan_grade_A', 'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E', 'loan_grade_F', 'loan_grade_G'].

ohe_transform: The list of column names after coding is ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION', 'loan_intent_HOMEIM



Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,home_ownership_MORTGAGE,home_ownership_OTHER,home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,default_onfile_N,default_onfile_Y
29762,45,37500,1.0,5000,11.49,0.13,16,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2714,25,50000,5.0,12000,7.88,0.24,2,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50,24,78000,4.0,30000,10.99,0.38,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
28458,31,78504,2.0,10000,11.41,0.13,7,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3674,26,14000,2.0,4000,10.99,0.29,3,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7647,22,47000,3.0,6850,10.65,0.15,4,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1120,22,40000,1.0,17000,10.65,0.42,4,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19531,27,34000,12.0,10625,13.47,0.31,10,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
5581,26,74500,8.0,5000,11.36,0.07,2,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### X_valid

In [54]:
new_name = ["home_ownership","loan_intent","loan_grade","default_onfile"]
old_name = ["person_home_ownership", "loan_intent", "loan_grade", "cb_person_cred_hist_length"]
ohe_list = [ohe_home_ownership, ohe_loan_intent, ohe_loan_grade,ohe_default_on_file]

for idx in range(len(old_name)):
    X_valid_testing = utils.ohe_transform(X_valid, old_name[idx], new_name[idx], ohe_list[idx])

ohe_transform: params have been validated.

ohe_transform: The list of column names before coding is ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length'].

ohe_transform: The list of column names after coding is ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'].
ohe_transform: params have been validated.

ohe_transform: The list of column names before coding is ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length'].

ohe_trans



TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [1]:
print(X_train)

NameError: name 'X_train' is not defined