In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
import yaml
import os
import sys
sys.path.append(os.path.join(os.getcwd(), 'src'))
import pickle
import joblib
import src.utils as utils

In [2]:
# --- Configuration -----------------------------------------------------------
# Parse the project settings from config.yaml (looked up relative to the
# current working directory) and remember that directory as the project root.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
root_path = os.getcwd()

# --- Constants ---------------------------------------------------------------
# Modelling settings come from the `data_source` section of the config.
data_source_cfg = config['data_source']
TARGET_COL = data_source_cfg['target_col']
TEST_SZ = data_source_cfg['test_size']
SEED_ST = data_source_cfg['random_state']
NUM_COLS = data_source_cfg['num_cols']
CAT_COLS = data_source_cfg['cat_cols']
CORR_COLS = data_source_cfg['corr_cols']

# Locations of the serialized train / test / validation splits come from the
# `pickle_path` section.
pickle_path_cfg = config['pickle_path']
X_TRAIN_PATH = pickle_path_cfg['X_train']
X_TEST_PATH = pickle_path_cfg['X_test']
X_VALID_PATH = pickle_path_cfg['X_valid']
y_TRAIN_PATH = pickle_path_cfg['y_train']
y_TEST_PATH = pickle_path_cfg['y_test']
y_VALID_PATH = pickle_path_cfg['y_valid']

## Deserialize the pickled data splits
# The paths are listed in the same order as the names they are unpacked into
# below; keep the two in sync when adding or removing a split.
stock_list = [X_TRAIN_PATH, X_TEST_PATH, X_VALID_PATH, y_TRAIN_PATH, y_TEST_PATH, y_VALID_PATH]

# Bind every split to an explicit name instead of injecting names through
# globals(): the variables are visible to readers and linters, and no loop
# variable overwrites the list it is iterating over.
# NOTE(review): utils.deserialize_data presumably unpickles the file — only
# point these paths at trusted artefacts; confirm in src/utils.
X_train, X_test, X_valid, y_train, y_test, y_valid = (
    utils.deserialize_data(path) for path in stock_list
)

In [3]:
"""Testing Desirialize"""
# Sanity check on the loaded training features: report the shape and the
# column index in one print call (a blank line separates the two), then show
# the first rows as the cell's rich output.
print(
    f"Features train shape:{X_train.shape}",
    f"\nFeatures columns:{X_train.columns}",
    sep="\n",
)
X_train.head()

Features train shape:(26064, 11)

Features columns:Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',
       'cb_person_cred_hist_length'],
      dtype='object')


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
29762,45,37500,MORTGAGE,1.0,DEBTCONSOLIDATION,B,5000,11.49,0.13,N,16
2714,25,50000,RENT,5.0,PERSONAL,A,12000,7.88,0.24,N,2
50,24,78000,RENT,4.0,DEBTCONSOLIDATION,D,30000,,0.38,Y,4
28458,31,78504,RENT,2.0,EDUCATION,C,10000,11.41,0.13,N,7
3674,26,14000,RENT,2.0,VENTURE,B,4000,,0.29,N,3


In [4]:
# Drop duplicated rows from the training split. Features and target are passed
# in together and both are rebound to the helper's return values.
# NOTE(review): presumably the helper keeps X_train and y_train row-aligned
# after dropping — confirm in src/utils.
X_train, y_train = utils.drop_duplicate_data(X_train, y_train)

drop_duplicate_data: Parameters have been validated

drop_duplicate_data: data shape before dropping : (26064, 11)

drop_duplicate_data: duplicated data have the following shape : (110, 11)

drop_duplicate_data: data shape have after dropping should be : (25954, 11)

Data shape after dropping duplicates are: (25954, 11)


In [5]:
# Collect the names of all numeric feature columns; these are the columns the
# median imputation is fitted on next. Iterating a DataFrame yields its
# column labels, so list(...) gives the same list of names.
subset_data = list(X_train.select_dtypes(include="number"))
print(subset_data)

['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']


In [6]:
# Fit step: learn the imputation values from the training features only.
# `subset_data` enters this cell as the list of numeric column names and is
# rebound to the fitted result, which the imputation cells below pass back in.
#
# list(...) keeps the cell safe to re-run. The logged output shows the fitted
# result as a dict keyed by column name; if the cell is executed a second
# time, list(dict) yields those keys, so the helper still receives column
# names rather than the previously fitted values. For the original list this
# is just a shallow copy.
# NOTE(review): a dedicated name for the fitted medians would be clearer, but
# later cells read `subset_data`, so the name is kept here.
subset_data = utils.median_imputation(X_train, list(subset_data), fit=True)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {'person_age': 26.0, 'person_income': 55000.0, 'person_emp_length': 4.0, 'loan_amnt': 8000.0, 'loan_int_rate': 10.99, 'loan_percent_income': 0.15, 'cb_person_cred_hist_length': 4.0}.


In [7]:
# Transform step (training split): fill missing values using the result fitted
# in the previous cell. `fit=False` is spelled out as a keyword to mirror the
# `fit=True` call used for fitting.
X_train = utils.median_imputation(X_train, subset_data, fit=False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              707
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 2474
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



In [8]:
# Transform step (test split): reuse the result fitted on the training data so
# that nothing is learned from the test set. `fit=False` is passed by keyword
# to mirror the `fit=True` call used for fitting.
X_test = utils.median_imputation(X_test, subset_data, fit=False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length              90
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 312
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64



In [9]:
# Transform step (validation split): reuse the result fitted on the training
# data so that nothing is learned from the validation set. `fit=False` is
# passed by keyword to mirror the `fit=True` call used for fitting.
X_valid = utils.median_imputation(X_valid, subset_data, fit=False)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                      0
person_income                   0
person_home_ownership           0
person_emp_length              93
loan_intent                     0
loan_grade                      0
loan_amnt                       0
loan_int_rate                 319
loan_percent_income             0
cb_person_default_on_file       0
cb_person_cred_hist_length      0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

