In [301]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import imblearn
import matplotlib.pyplot as plt

In [302]:
from datetime import datetime
from sklearn import preprocessing
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

In [303]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SequentialFeatureSelector
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVR

## Data Understanding

In [304]:
df = pd.read_csv("Data/loan.csv")

In [305]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ApplicationDate             20000 non-null  object 
 1   Age                         20000 non-null  int64  
 2   AnnualIncome                20000 non-null  int64  
 3   CreditScore                 20000 non-null  int64  
 4   EmploymentStatus            20000 non-null  object 
 5   EducationLevel              20000 non-null  object 
 6   Experience                  20000 non-null  int64  
 7   LoanAmount                  20000 non-null  int64  
 8   LoanDuration                20000 non-null  int64  
 9   MaritalStatus               20000 non-null  object 
 10  NumberOfDependents          20000 non-null  int64  
 11  HomeOwnershipStatus         20000 non-null  object 
 12  MonthlyDebtPayments         20000 non-null  int64  
 13  CreditCardUtilizationRate   200

In [306]:
df.head()

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
0,2018-01-01,45,39948,617,Employed,Master,22,13152,48,Married,...,3329.0,0.724972,11,126928,0.199652,0.22759,419.805992,0.181077,0,49.0
1,2018-01-02,38,39709,628,Employed,Associate,15,26045,48,Single,...,3309.083333,0.935132,3,43609,0.207045,0.201077,794.054238,0.389852,0,52.0
2,2018-01-03,47,40724,570,Employed,Bachelor,26,17627,36,Married,...,3393.666667,0.872241,6,5205,0.217627,0.212548,666.406688,0.462157,0,52.0
3,2018-01-04,58,69084,545,Employed,High School,34,37898,96,Single,...,5757.0,0.896155,5,99452,0.300398,0.300911,1047.50698,0.313098,0,54.0
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.17599,330.17914,0.07021,1,36.0


In [310]:
df[df["LoanApproved"] == 1]

Unnamed: 0,ApplicationDate,Age,AnnualIncome,CreditScore,EmploymentStatus,EducationLevel,Experience,LoanAmount,LoanDuration,MaritalStatus,...,MonthlyIncome,UtilityBillsPaymentHistory,JobTenure,NetWorth,BaseInterestRate,InterestRate,MonthlyLoanPayment,TotalDebtToIncomeRatio,LoanApproved,RiskScore
4,2018-01-05,37,103264,594,Employed,Associate,17,9184,36,Married,...,8605.333333,0.941369,5,227019,0.197184,0.175990,330.179140,0.070210,1,36.0
5,2018-01-06,37,178310,626,Self-Employed,Master,16,15433,72,Married,...,14859.166667,0.756079,5,27071,0.217433,0.217601,385.577074,0.075211,1,44.0
7,2018-01-08,49,97345,516,Employed,High School,23,19634,12,Divorced,...,8112.083333,0.933492,5,38621,0.226634,0.209113,1827.360055,0.260767,1,42.4
15,2018-01-16,33,56650,605,Employed,Doctorate,11,12652,36,Single,...,4720.833333,0.837214,2,250291,0.195152,0.216413,480.840828,0.173029,1,33.6
17,2018-01-18,43,142326,644,Employed,Bachelor,24,13499,96,Widowed,...,11860.500000,0.971063,5,69623,0.226499,0.223795,303.195231,0.090653,1,39.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19983,2072-09-17,40,61052,652,Unemployed,Master,18,10865,36,Single,...,5087.666667,0.867487,6,11129,0.169865,0.145219,374.100069,0.145863,1,36.8
19987,2072-09-21,31,77443,525,Employed,Bachelor,9,8237,36,Divorced,...,6453.583333,0.927929,7,8282,0.230737,0.247035,326.211559,0.099822,1,41.6
19988,2072-09-22,64,147351,631,Employed,High School,36,17094,108,Single,...,12279.250000,0.746710,2,48804,0.246594,0.225134,370.469135,0.070889,1,40.8
19990,2072-09-24,54,96264,621,Employed,Bachelor,28,38031,48,Single,...,8022.000000,0.512786,5,2604,0.222531,0.196153,1149.516437,0.185804,1,37.6


In [249]:
# pp = ProfileReport(df, title = "Exploration")

In [250]:
# pp.to_notebook_iframe()

## Data Selection
this was based on the correlation after data transformation, check on the markdown below

In [251]:
df = df[["MonthlyIncome", "LoanAmount", "InterestRate", "MonthlyLoanPayment", "CreditScore", "NumberOfDependents", "LoanDuration", "EducationLevel", "Age", "EmploymentStatus", "MaritalStatus", "LoanApproved"]]

In [252]:
df_pipe = df.copy()
df_pipe_full = df.copy()

## Data Transformation

In [253]:
from sklearn import preprocessing

### Education

In [254]:
df["EducationLevel"]

0             Master
1          Associate
2           Bachelor
3        High School
4          Associate
            ...     
19995    High School
19996      Associate
19997       Bachelor
19998    High School
19999      Associate
Name: EducationLevel, Length: 20000, dtype: object

In [255]:
df["EducationLevel"] = pd.Categorical(df["EducationLevel"],categories =  ['High School', "Associate", "Bachelor", "Master", 'Doctorate'], ordered = True)

In [256]:
df["EducationLevel"]

0             Master
1          Associate
2           Bachelor
3        High School
4          Associate
            ...     
19995    High School
19996      Associate
19997       Bachelor
19998    High School
19999      Associate
Name: EducationLevel, Length: 20000, dtype: category
Categories (5, object): ['High School' < 'Associate' < 'Bachelor' < 'Master' < 'Doctorate']

In [257]:
label_education = preprocessing.LabelEncoder()

In [258]:
df["EducationLevel"] = label_education.fit_transform(df["EducationLevel"])

### Marital Status

In [259]:
# Sparse
df["MaritalStatus"].unique()

array(['Married', 'Single', 'Divorced', 'Widowed'], dtype=object)

In [260]:
df["MaritalStatus"] = pd.Categorical(df["MaritalStatus"], categories = ['Married', 'Single', 'Divorced', 'Widowed'])

In [261]:
categorical_marital = preprocessing.OneHotEncoder(sparse_output=False)

In [262]:
encoded_marital = categorical_marital.fit_transform(df[["MaritalStatus"]])
encoded_marital = pd.DataFrame(encoded_marital, columns=categorical_marital.get_feature_names_out(['MaritalStatus']))

In [263]:
encoded_marital

Unnamed: 0,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed
0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0
...,...,...,...,...
19995,0.0,1.0,0.0,0.0
19996,0.0,1.0,0.0,0.0
19997,0.0,1.0,0.0,0.0
19998,0.0,1.0,0.0,0.0


In [264]:
df.drop("MaritalStatus", axis=1, inplace=True)
df = pd.concat([df, encoded_marital], axis=1)

### EmploymentStatus

In [265]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MonthlyIncome           20000 non-null  float64
 1   LoanAmount              20000 non-null  int64  
 2   InterestRate            20000 non-null  float64
 3   MonthlyLoanPayment      20000 non-null  float64
 4   CreditScore             20000 non-null  int64  
 5   NumberOfDependents      20000 non-null  int64  
 6   LoanDuration            20000 non-null  int64  
 7   EducationLevel          20000 non-null  int32  
 8   Age                     20000 non-null  int64  
 9   EmploymentStatus        20000 non-null  object 
 10  LoanApproved            20000 non-null  int64  
 11  MaritalStatus_Divorced  20000 non-null  float64
 12  MaritalStatus_Married   20000 non-null  float64
 13  MaritalStatus_Single    20000 non-null  float64
 14  MaritalStatus_Widowed   20000 non-null

In [266]:
# Sparse
df["EmploymentStatus"].unique()

array(['Employed', 'Self-Employed', 'Unemployed'], dtype=object)

In [267]:
df["EmploymentStatus"] = pd.Categorical(df["EmploymentStatus"], categories = ['Employed', 'Self-Employed', 'Unemployed'])

In [268]:
categorical_employment = preprocessing.OneHotEncoder(sparse_output=False)

In [269]:
encoded_employment = categorical_employment.fit_transform(df[["EmploymentStatus"]])
encoded_employment = pd.DataFrame(encoded_employment, columns=categorical_employment.get_feature_names_out(['EmploymentStatus']))

In [270]:
df.drop("EmploymentStatus", axis=1, inplace=True)
df = pd.concat([df, encoded_employment], axis=1)

In [271]:
df

Unnamed: 0,MonthlyIncome,LoanAmount,InterestRate,MonthlyLoanPayment,CreditScore,NumberOfDependents,LoanDuration,EducationLevel,Age,LoanApproved,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed,EmploymentStatus_Employed,EmploymentStatus_Self-Employed,EmploymentStatus_Unemployed
0,3329.000000,13152,0.227590,419.805992,617,2,48,4,45,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,3309.083333,26045,0.201077,794.054238,628,1,48,0,38,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,3393.666667,17627,0.212548,666.406688,570,2,36,1,47,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,5757.000000,37898,0.300911,1047.506980,545,1,96,3,58,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,8605.333333,9184,0.175990,330.179140,594,1,36,0,37,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2515.000000,24521,0.195574,905.767712,587,3,36,3,44,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19996,4103.833333,25818,0.199168,958.395633,567,5,36,0,56,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19997,4079.833333,37033,0.226766,945.427454,645,3,72,1,44,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19998,3418.750000,14760,0.264873,411.168284,560,3,72,3,60,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## EDA

In [272]:
import plotly.express as px

# Calculate correlation matrix
corr_matrix = round(df.corr(numeric_only= True, method="spearman"), 2)

# Create a Plotly heatmap
fig = px.imshow(corr_matrix,
                text_auto=True,   # Annotates the heatmap with correlation values
                aspect="auto",    # Aspect ratio
                color_continuous_scale="Viridis", # Color scale
                width=800,        # Width of the plot
                height=600        # Height of the plot
               )

# Show the plot
fig.show()


In [273]:
df.corr(numeric_only= True, method="spearman")["LoanApproved"][abs(df.corr(numeric_only= True, method="spearman")["LoanApproved"])>0.2]

MonthlyIncome         0.568802
LoanAmount           -0.260152
InterestRate         -0.307956
MonthlyLoanPayment   -0.222174
LoanApproved          1.000000
Name: LoanApproved, dtype: float64

In [274]:
abs(df.corr(numeric_only= True, method="spearman")["LoanApproved"])

MonthlyIncome                     0.568802
LoanAmount                        0.260152
InterestRate                      0.307956
MonthlyLoanPayment                0.222174
CreditScore                       0.142818
NumberOfDependents                0.000702
LoanDuration                      0.088465
EducationLevel                    0.029313
Age                               0.143402
LoanApproved                      1.000000
MaritalStatus_Divorced            0.005743
MaritalStatus_Married             0.001125
MaritalStatus_Single              0.000601
MaritalStatus_Widowed             0.007944
EmploymentStatus_Employed         0.005740
EmploymentStatus_Self-Employed    0.027026
EmploymentStatus_Unemployed       0.036616
Name: LoanApproved, dtype: float64

Temuan berdasarkan korelasi tinggi:
1. AnnualIncome
2. LoanAmount
3. InterestRate
4. MonthlyLoanPayment
5. CreditScore

Yang seharusnya dipakai juga:
1. NumberOfDependents (korelasi hampir 0)
2. LoanDuration (<0,1)
3. EducationLevel (<0,1)
4. EmploymentStatus
5. Age

## Feature Engineering

### Scaling loan amount and annual income to IDR

In [275]:
px.box(df["MonthlyIncome"])

According to BPS (2024), median of salary in Indonesia is Rp3.074.000, which is 755,33 times the median of this study case. So for that, we will scale AnnualIncome, LoanAmount, and MonthlyLoanPayment with 755,33.

In [276]:
df["MonthlyIncome"] = df["MonthlyIncome"]*755.33
df["LoanAmount"] = df["LoanAmount"]*755.33
df["MonthlyLoanPayment"] = df["MonthlyLoanPayment"]*755.33

In [277]:
df

Unnamed: 0,MonthlyIncome,LoanAmount,InterestRate,MonthlyLoanPayment,CreditScore,NumberOfDependents,LoanDuration,EducationLevel,Age,LoanApproved,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed,EmploymentStatus_Employed,EmploymentStatus_Self-Employed,EmploymentStatus_Unemployed
0,2.514494e+06,9934100.16,0.227590,317092.059606,617,2,48,4,45,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,2.499450e+06,19672569.85,0.201077,599772.987755,628,1,48,0,38,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,2.563338e+06,13314201.91,0.212548,503356.963403,570,2,36,1,47,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,4.348435e+06,28625496.34,0.300911,791213.447377,545,1,96,3,58,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,6.499866e+06,6936950.72,0.175990,249394.210182,594,1,36,0,37,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,1.899655e+06,18521446.93,0.195574,684153.526110,587,3,36,3,44,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19996,3.099748e+06,19501109.94,0.199168,723904.973617,567,5,36,0,56,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19997,3.081621e+06,27972135.89,0.226766,714109.718832,645,3,72,1,44,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19998,2.582284e+06,11148670.80,0.264873,310567.740107,560,3,72,3,60,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [278]:
scaler = preprocessing.RobustScaler()
df[["MonthlyIncome", "LoanAmount", "MonthlyLoanPayment", "CreditScore"]] = scaler.fit_transform(df[["MonthlyIncome", "LoanAmount", "MonthlyLoanPayment", "CreditScore"]])

In [279]:
df

Unnamed: 0,MonthlyIncome,LoanAmount,InterestRate,MonthlyLoanPayment,CreditScore,NumberOfDependents,LoanDuration,EducationLevel,Age,LoanApproved,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Widowed,EmploymentStatus_Employed,EmploymentStatus_Self-Employed,EmploymentStatus_Unemployed
0,-0.199736,-0.574214,0.227590,-0.498711,0.565217,2,48,4,45,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.205373,0.270675,0.201077,0.105884,0.724638,1,48,0,38,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.181434,-0.280963,0.212548,-0.100330,-0.115942,2,36,1,47,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.487418,1.047412,0.300911,0.515334,-0.478261,1,96,3,58,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,1.293531,-0.834240,0.175990,-0.643502,0.231884,1,36,0,37,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-0.430108,0.170806,0.195574,0.286356,0.130435,3,36,3,44,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19996,0.019551,0.255799,0.199168,0.371376,-0.159420,5,36,0,56,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19997,0.012759,0.990727,0.226766,0.350426,0.971014,3,72,1,44,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
19998,-0.174336,-0.468840,0.264873,-0.512665,-0.260870,3,72,3,60,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Data Pipeline

In [280]:
from sklearn import model_selection
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [281]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(df_pipe.drop("LoanApproved", axis=1), df_pipe["LoanApproved"], test_size=0.2)

In [282]:
preprocessor = ColumnTransformer(transformers=[
    ('education_level', preprocessing.OrdinalEncoder(categories=[['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']], dtype=int), ['EducationLevel']),
    ('marital_status', preprocessing.OneHotEncoder(sparse_output=False, categories=[['Married', 'Single', 'Divorced', 'Widowed']]), ['MaritalStatus']),
    ('employment_status', preprocessing.OneHotEncoder(sparse_output=False, categories=[['Employed', 'Self-Employed', 'Unemployed']]), ['EmploymentStatus']),
    ('num', preprocessing.RobustScaler(), ['MonthlyIncome', 'LoanAmount', 'MonthlyLoanPayment', 'CreditScore'])
], remainder='passthrough')



In [283]:
preprocessor.fit(X_train)

In [284]:
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [285]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.545119,0.794202,0.513879,-0.507246,0.274758,5.0,60.0,49.0
1,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,-0.352548,0.411531,0.268598,0.000000,0.291404,2.0,60.0,33.0
2,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.275939,0.011668,0.633559,-0.652174,0.200236,1.0,24.0,37.0
3,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.597684,-0.493177,-0.545602,0.463768,0.215228,2.0,60.0,35.0
4,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.477177,0.276282,0.352118,1.014493,0.183585,1.0,36.0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.129762,-0.657842,-0.613154,-1.492754,0.315374,1.0,84.0,30.0
15996,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.079176,-0.205945,-0.256050,0.072464,0.201590,2.0,48.0,42.0
15997,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.277352,0.592930,0.669197,0.608696,0.196720,1.0,36.0,56.0
15998,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.316516,0.324721,-0.184673,0.724638,0.182280,3.0,72.0,50.0


### Resampling

In [286]:
from imblearn import over_sampling

In [287]:
df_temp = pd.DataFrame(X_train).reset_index(drop=True)
df_temp = pd.concat([df_temp, pd.DataFrame(Y_train).reset_index(drop = True)], axis=1)

In [288]:
df_temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,LoanApproved
0,2.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.545119,0.794202,0.513879,-0.507246,0.274758,5.0,60.0,49.0,0
1,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,-0.352548,0.411531,0.268598,0.000000,0.291404,2.0,60.0,33.0,0
2,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.275939,0.011668,0.633559,-0.652174,0.200236,1.0,24.0,37.0,0
3,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.597684,-0.493177,-0.545602,0.463768,0.215228,2.0,60.0,35.0,0
4,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.477177,0.276282,0.352118,1.014493,0.183585,1.0,36.0,40.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15995,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.129762,-0.657842,-0.613154,-1.492754,0.315374,1.0,84.0,30.0,0
15996,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.079176,-0.205945,-0.256050,0.072464,0.201590,2.0,48.0,42.0,0
15997,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.277352,0.592930,0.669197,0.608696,0.196720,1.0,36.0,56.0,0
15998,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.316516,0.324721,-0.184673,0.724638,0.182280,3.0,72.0,50.0,0


In [289]:
fig = px.pie(Y_train, names="LoanApproved")
fig.show()

In [290]:
smote = over_sampling.SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)

In [291]:
fig = px.pie(Y_train_resampled, names="LoanApproved")
fig.show()

## Modelling

### All Models

In [300]:
# import lightgbm as lgb
# from sklearn.svm import SVC
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# import matplotlib.pyplot as plt

# # Initialize models
# models = {
#     'Random Forest': RandomForestClassifier(random_state=42),
#     'Ada Boost': AdaBoostClassifier(random_state=42),
#     'LightGBM': lgb.LGBMClassifier(random_state=42),
#     'SVM': SVC(random_state=42),
#     'XGBoost': XGBClassifier(random_state=42)
# }

# # Initialize dictionary to store classification reports
# classification_reports = {}
# model_names = []
# accuracies_train = []
# accuracies_test = []

# # Train and evaluate each model
# for model_name, model in models.items():
#     print(f"Training {model_name}...")
#     model.fit(X_train, Y_train.values.ravel())

#     # Predict on both training and test data
#     Y_train_pred = model.predict(X_train)
#     Y_test_pred = model.predict(X_test)

#     # Calculate confusion matrix and classification report for train data
#     confusion_train = confusion_matrix(Y_train, Y_train_pred)
#     classification_rep_train = classification_report(
#         Y_train, Y_train_pred, target_names=['Not Approved', 'Approved'], zero_division=1
#     )

#     # Calculate confusion matrix and classification report for test data
#     confusion_test = confusion_matrix(Y_test, Y_test_pred)
#     classification_rep_test = classification_report(
#         Y_test, Y_test_pred, target_names=['Not Approved', 'Approved'], zero_division=1
#     )

#     # Store the classification report in the dictionary for both train and test data
#     classification_reports[model_name] = {
#         'Train': classification_rep_train,
#         'Test': classification_rep_test
#     }

#     # Calculate accuracy for train and test data
#     accuracy_train = accuracy_score(Y_train, Y_train_pred)
#     accuracy_test = accuracy_score(Y_test, Y_test_pred)

#     # Store model names and accuracies
#     model_names.append(model_name)
#     accuracies_train.append(accuracy_train)
#     accuracies_test.append(accuracy_test)

#     # Print train classification report and accuracy
#     print(f"\nClassification Report for {model_name} (Train):")
#     print(classification_rep_train)
#     print(f"{model_name} Train Accuracy: {accuracy_train:.4f}")
#     print("=" * 50)

#     # Print test classification report and accuracy
#     print(f"\nClassification Report for {model_name} (Test):")
#     print(classification_rep_test)
#     print(f"{model_name} Test Accuracy: {accuracy_test:.4f}")
#     print("=" * 50)

#     # Feature importance (only for models that have feature_importances_)
#     if hasattr(model, 'feature_importances_'):  # Check if model has feature_importances_
#         features_importances = zip(model.feature_importances_, df_temp.drop("LoanApproved", axis=1).columns)
#         sorted_feature_importances = sorted(features_importances, reverse=True)

#         # Get top 15 predictors
#         top_15_predictors = sorted_feature_importances[:15]
#         values = [value for value, predictors in top_15_predictors]
#         predictors = [predictors for value, predictors in top_15_predictors]

#         # Plot the top 15 feature importances
#         plt.figure()
#         plt.title(f"{model_name} Feature Importances")
#         plt.bar(range(len(predictors)), values, color="r", align="center")
#         plt.xticks(range(len(predictors)), predictors, rotation=90)
#         plt.show()


Based on the above, lets pick LGBM as the best model.


## Full Pipeline

In [293]:
from sklearn import model_selection
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb

In [294]:
from sklearn.base import BaseEstimator, TransformerMixin

class MultiplyByFactor(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, factor=755.33):
        self.columns = columns
        self.factor = factor
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        if self.columns is not None:
            X_copy[self.columns] *= self.factor
        return X_copy


In [295]:
preprocessor = ColumnTransformer(transformers=[
    ('multiply', MultiplyByFactor(columns=["MonthlyIncome", "LoanAmount", "MonthlyLoanPayment"]), ["MonthlyIncome", "LoanAmount", "MonthlyLoanPayment"]),
    ('education_level', preprocessing.OrdinalEncoder(categories=[['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']], dtype=int), ['EducationLevel']),
    ('marital_status', preprocessing.OneHotEncoder(sparse_output=False, categories=[['Married', 'Single', 'Divorced', 'Widowed']]), ['MaritalStatus']),
    ('employment_status', preprocessing.OneHotEncoder(sparse_output=False, categories=[['Employed', 'Self-Employed', 'Unemployed']]), ['EmploymentStatus']),
    ('num', preprocessing.RobustScaler(), ['MonthlyIncome', 'LoanAmount', 'MonthlyLoanPayment', 'CreditScore'])
], remainder='passthrough')

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(random_state=42))
])

In [296]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(df_pipe_full.drop("LoanApproved", axis=1), df_pipe_full["LoanApproved"], test_size=0.2)

In [297]:
pipeline.fit(X_train, Y_train)

[LightGBM] [Info] Number of positive: 3877, number of negative: 12123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2112
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242312 -> initscore=-1.140043
[LightGBM] [Info] Start training from score -1.140043


In [298]:
# Fit the pipeline with training data
pipeline.fit(X_train, Y_train)

# Make predictions with the pipeline on X_test
Y_pred = pipeline.predict(X_test)

# Evaluate the predictions
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy:.4f}")

confusion = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", confusion)

classification_rep = classification_report(Y_test, Y_pred, target_names=['Not Approved', 'Approved'])
print("Classification Report:\n", classification_rep)

[LightGBM] [Info] Number of positive: 3877, number of negative: 12123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2112
[LightGBM] [Info] Number of data points in the train set: 16000, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242312 -> initscore=-1.140043
[LightGBM] [Info] Start training from score -1.140043
Accuracy: 0.9165
Confusion Matrix:
 [[2946  151]
 [ 183  720]]
Classification Report:
               precision    recall  f1-score   support

Not Approved       0.94      0.95      0.95      3097
    Approved       0.83      0.80      0.81       903

    accuracy                           0.92      4000
   macro avg       0.88      0.87      0.88      4000
weighted avg       0.92      0.92      0.92      4000



### Export Model

In [299]:
import joblib
joblib.dump(pipeline, 'Dashboard/Credit Risk.pkl')

['Credit Risk.pkl']

## Experimentation
Try and temper with the RF

In [236]:
from sklearn import model_selection
from sklearn import metrics

In [235]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(df.drop("LoanApproved", axis=1), df["LoanApproved"], test_size=0.2)

In [None]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
confusion = confusion_matrix(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred, target_names=['Good', 'Bad'], zero_division=1)
    
metrics.classification_reports[model_name] = classification_rep

accuracy = accuracy_score(Y_test, Y_pred)

model_names.append(model_name)
accuracies.append(accuracy)

print("\nClassification Report:")
print(classification_rep)
print(f"{model_name} Accuracy: {accuracy:.4f}")
print("=" * 50)

In [237]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
confusion = confusion_matrix(Y_test, Y_pred)
classification_rep = classification_report(Y_test, Y_pred, target_names=['Not Approved', 'Approved'], zero_division=1)

In [239]:
print(classification_rep)

              precision    recall  f1-score   support

Not Approved       0.92      0.95      0.94      3052
    Approved       0.83      0.72      0.77       948

    accuracy                           0.90      4000
   macro avg       0.87      0.84      0.85      4000
weighted avg       0.90      0.90      0.90      4000

