In [14]:
# ---- IMPORTS ----

import os 
import sys 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

In [2]:
# --------------------- HANDLING DIRECTORY -------------------------

# Project root: the folder one level above the current working directory
parent_dir = Path.cwd().parent

# Data folder, and the dataset CSV file that lives inside it
data_dir = parent_dir.joinpath("data")
dataset_dir = data_dir.joinpath("Loan_Default.csv")

# Put the project root at the front of the import search path (only once,
# so re-running this cell does not add duplicate entries)
if sys.path.count(str(parent_dir)) == 0:
    sys.path.insert(0, str(parent_dir))

In [3]:
# importing custom modules

from modules.preprocess import clean_data
from modules.preprocess import preprocessing_pipeline

In [4]:
# Load the dataset CSV (path built in the directory-handling cell) into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_dir)

## Clean Data and Preprocess with Pipeline

In [5]:
# ------- Clean Data -------
# Run the project's cleaning helper on the loaded frame; the result is bound
# to a new name so `df` itself is not rebound by this cell.
df_clean = df.pipe(clean_data)

# Preview the first rows of the cleaned frame
df_clean.head()

Unnamed: 0,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,rate_of_interest,...,credit_type,Credit_Score,co-applicant_credit_type,submission_of_application,LTV,Region,Security_Type,Status,dtir1,age_numerical
0,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,116500,,...,EXP,758,CIB,to_inst,98.728814,south,direct,1,45.0,29.5
1,cf,Male,nopre,type2,p1,l1,nopc,b/c,206500,,...,EQUI,552,EXP,to_inst,,North,direct,1,,59.5
2,cf,Male,pre,type1,p1,l1,nopc,nob/c,406500,4.56,...,EXP,834,CIB,to_inst,80.019685,south,direct,0,46.0,39.5
3,cf,Male,nopre,type1,p4,l1,nopc,nob/c,456500,4.25,...,EXP,587,CIB,not_inst,69.3769,North,direct,0,42.0,49.5
4,cf,Joint,pre,type1,p1,l1,nopc,nob/c,696500,4.0,...,CRIF,602,EXP,not_inst,91.886544,North,direct,0,39.0,29.5


In [6]:
# ------ preprocessing pipeline ------
# Build the preprocessing pipeline with the project helper.
# NOTE(review): the helper is given the raw `df`, not `df_clean` — confirm this
# is intended (e.g. that cleaning is performed inside the pipeline itself).
pipeline = preprocessing_pipeline(df)

# Display the pipeline object
pipeline

In [10]:
# ---------- SPLIT DATASET -------------
# Target vector
y = df["Status"]

# Prediction matrix: every column except the target
X = df.drop(columns=["Status"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Rows x columns of the training matrix
X_train.shape


(118936, 33)

In [12]:
# ----------- TRANSFORM DATA ----------------

# Fit the pipeline on the training set only, and transform it in the same call
X_train_trans = pipeline.fit_transform(X_train)

# Transform the test set with the pipeline fitted above (no refitting here)
X_test_trans = pipeline.transform(X_test)

# Rows x columns of the transformed test matrix
X_test_trans.shape

(29734, 63)

In [13]:
# Display the pipeline again, this time after it has been fitted on the training set
pipeline

## Create Processed DataFrame Function

In [None]:
# ------- DEFINE FEATURE LISTS -------
# The order inside each list matters: later cells concatenate these lists to
# label the columns of the transformed matrix.

# Columns designated for the log transform
log_transform_cols = ["loan_amount", "property_value", "income", "Upfront_charges"]

# Numerical columns for imputation and scaling
numerical_cols = [
    "rate_of_interest", "Interest_rate_spread", "term",
    "Credit_Score", "LTV", "dtir1", "age_numerical",
]

# Categorical columns for imputation and One-Hot Encoding
categorical_cols = [
    "loan_limit", "Gender", "approv_in_adv", "loan_type", "loan_purpose",
    "Credit_Worthiness", "open_credit", "business_or_commercial",
    "Neg_ammortization", "interest_only", "lump_sum_payment",
    "construction_type", "occupancy_type", "Secured_by", "total_units",
    "credit_type", "co-applicant_credit_type", "submission_of_application",
    "Region", "Security_Type",
]

In [None]:
# Drill into the fitted pipeline: "preprocessor" step -> "cat" branch -> "onehot" step
onehot_transformer = (
    pipeline["preprocessor"]
    .named_transformers_["cat"]
    ["onehot"]
)

# Names of the encoded output columns, derived from the categorical input columns
onehot_transformer.get_feature_names_out(categorical_cols)

array(['loan_limit_cf', 'loan_limit_ncf', 'Gender_Female', 'Gender_Joint',
       'Gender_Male', 'Gender_Sex Not Available', 'approv_in_adv_nopre',
       'approv_in_adv_pre', 'loan_type_type1', 'loan_type_type2',
       'loan_type_type3', 'loan_purpose_p1', 'loan_purpose_p2',
       'loan_purpose_p3', 'loan_purpose_p4', 'Credit_Worthiness_l1',
       'Credit_Worthiness_l2', 'open_credit_nopc', 'open_credit_opc',
       'business_or_commercial_b/c', 'business_or_commercial_nob/c',
       'Neg_ammortization_neg_amm', 'Neg_ammortization_not_neg',
       'interest_only_int_only', 'interest_only_not_int',
       'lump_sum_payment_lpsm', 'lump_sum_payment_not_lpsm',
       'construction_type_mh', 'construction_type_sb',
       'occupancy_type_ir', 'occupancy_type_pr', 'occupancy_type_sr',
       'Secured_by_home', 'Secured_by_land', 'total_units_1U',
       'total_units_2U', 'total_units_3U', 'total_units_4U',
       'credit_type_CIB', 'credit_type_CRIF', 'credit_type_EQUI',
       'credit_

In [None]:
# Drill into the fitted pipeline: "preprocessor" step -> "cat" branch -> "onehot" step
onehot_transformer = (
    pipeline["preprocessor"]
    .named_transformers_["cat"]
    ["onehot"]
)

# One-hot output column names as a plain Python list
cat_cols = list(onehot_transformer.get_feature_names_out(categorical_cols))

# Full list of processed feature names: log-transform columns first, then the
# numerical columns, then the one-hot encoded columns
processed_feat = [*log_transform_cols, *numerical_cols, *cat_cols]

processed_feat

['loan_amount',
 'property_value',
 'income',
 'Upfront_charges',
 'rate_of_interest',
 'Interest_rate_spread',
 'term',
 'Credit_Score',
 'LTV',
 'dtir1',
 'age_numerical',
 'loan_limit_cf',
 'loan_limit_ncf',
 'Gender_Female',
 'Gender_Joint',
 'Gender_Male',
 'Gender_Sex Not Available',
 'approv_in_adv_nopre',
 'approv_in_adv_pre',
 'loan_type_type1',
 'loan_type_type2',
 'loan_type_type3',
 'loan_purpose_p1',
 'loan_purpose_p2',
 'loan_purpose_p3',
 'loan_purpose_p4',
 'Credit_Worthiness_l1',
 'Credit_Worthiness_l2',
 'open_credit_nopc',
 'open_credit_opc',
 'business_or_commercial_b/c',
 'business_or_commercial_nob/c',
 'Neg_ammortization_neg_amm',
 'Neg_ammortization_not_neg',
 'interest_only_int_only',
 'interest_only_not_int',
 'lump_sum_payment_lpsm',
 'lump_sum_payment_not_lpsm',
 'construction_type_mh',
 'construction_type_sb',
 'occupancy_type_ir',
 'occupancy_type_pr',
 'occupancy_type_sr',
 'Secured_by_home',
 'Secured_by_land',
 'total_units_1U',
 'total_units_2U',
 'tot

In [29]:
# Drill into the fitted pipeline: "preprocessor" step -> "cat" branch -> "onehot" step
onehot_transformer = (
    pipeline["preprocessor"]
    .named_transformers_["cat"]
    ["onehot"]
)

# One-hot output column names as a plain Python list
cat_cols = list(onehot_transformer.get_feature_names_out(categorical_cols))

# Full list of processed feature names: log-transform columns first, then the
# numerical columns, then the one-hot encoded columns.
# NOTE(review): this assumes the preprocessor emits its outputs in exactly this
# order — confirm against the pipeline definition in modules.preprocess.
processed_feat = [*log_transform_cols, *numerical_cols, *cat_cols]

# Wrap the transformed training matrix in a labelled DataFrame, reusing the
# original training-row index so rows can be traced back to X_train
df_processed = pd.DataFrame(
    data=X_train_trans,
    index=X_train.index,
    columns=processed_feat,
)

# Preview the first rows of the processed frame
df_processed.head()

Unnamed: 0,loan_amount,property_value,income,Upfront_charges,rate_of_interest,Interest_rate_spread,term,Credit_Score,LTV,dtir1,...,co-applicant_credit_type_CIB,co-applicant_credit_type_EXP,submission_of_application_not_inst,submission_of_application_to_inst,Region_North,Region_North-East,Region_central,Region_south,Security_Type_Indriect,Security_Type_direct
141245,-2.286107,-2.289657,-0.767228,0.377582,-1.089765,-0.838457,0.425386,-0.818669,-0.109128,-2.684156,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3507,1.168972,1.388673,0.305514,0.37363,-0.066885,-0.680613,0.425386,0.25087,-0.715008,0.524081,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
53688,-1.410402,-1.750828,-0.925248,0.442474,-0.834045,2.380745,-2.658811,-0.784168,0.707193,0.42059,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
46491,-0.248852,-0.61231,0.437874,0.37363,-0.087342,-0.08713,0.425386,-0.861796,0.713734,0.317098,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
54671,0.934909,1.193685,0.521759,0.262195,0.188835,-1.107733,0.425386,0.009361,-0.78257,-0.303851,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
