In [1]:
# Import necessary libraries
import warnings
from pathlib import Path

import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Silence warnings before the scikit-learn / LightGBM imports below.
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

In [2]:
# Load the pre-analysed loan dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='loan_analyzed.csv')
df.head(n=5)

Unnamed: 0,age_group,employment_status,credit_score,loan_amount_requested,annual_income,diff_income_to_expenses,debt_to_income_ratio,loan_approval_status
0,Middle Adult,Employed,743,24535,139901,9125.416667,0.141686,1
1,Middle Adult,Employed,468,8288,21162,-2277.5,0.86575,0
2,Middle Age,Self-Employed,389,10308,27815,-1135.083333,0.497969,0
3,Middle Adult,Self-Employed,778,33937,137853,8755.75,0.207525,1
4,Middle Adult,Employed,752,23360,81753,5164.75,0.107397,1


In [3]:
# Work on an independent copy so the raw frame `df` is left untouched.
df_scaled = df.copy(deep=True)

In [4]:
# Select categorical columns for encoding
cat_cols = ['age_group', 'employment_status']

# One-hot encode the categorical columns as a dense array. With
# handle_unknown='ignore', categories not seen during fit do not raise at
# transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Encode from the untouched `df` rather than from `df_scaled`: the previous
# version read the categorical columns from `df_scaled` and then dropped them
# from that same frame, so re-running the cell raised a KeyError.
# The source index is reused so the axis=1 concat below lines rows up
# one-to-one even if `df` does not carry a default RangeIndex.
df_encoded = pd.DataFrame(
    encoder.fit_transform(df[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

# Replace the original categorical columns with their encoded counterparts
df_scaled = pd.concat([df.drop(columns=cat_cols), df_encoded], axis=1)

In [5]:
# Display the frame after one-hot encoding to check the new indicator columns.
df_scaled

Unnamed: 0,credit_score,loan_amount_requested,annual_income,diff_income_to_expenses,debt_to_income_ratio,loan_approval_status,age_group_Early Retirement,age_group_Middle Adult,age_group_Middle Age,age_group_Young Adult,employment_status_Employed,employment_status_Self-Employed,employment_status_Unemployed
0,743,24535,139901,9125.416667,0.141686,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,468,8288,21162,-2277.500000,0.865750,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,389,10308,27815,-1135.083333,0.497969,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,778,33937,137853,8755.750000,0.207525,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,752,23360,81753,5164.750000,0.107397,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50494,381,15723,56085,2879.750000,0.212338,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
50495,726,21209,79062,5217.500000,0.312362,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50496,396,10540,57471,1347.250000,0.215187,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
50497,362,16765,49495,1250.583333,0.267502,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [6]:
# Save the fitted One-Hot Encoder to disk.
# `joblib` is imported in the imports cell at the top of the notebook instead
# of mid-notebook, so every dependency is visible in one place.
joblib.dump(encoder, 'one_hot_encoder.pkl')


['one_hot_encoder.pkl']

In [7]:
# Min-max scale the continuous feature columns.
scaler = MinMaxScaler()

# Columns to scale. The target `loan_approval_status` is intentionally NOT in
# this list: it was previously included (contradicting the original comment),
# which made the fitted scaler expect the label column as an input and
# converted the label to float.
# NOTE(review): the persisted scaler now expects these 5 columns; any consumer
# that passed 6 columns to the old pickle must be updated.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set min/max leak into the transform — consider fitting on the
# training split only.
cols_to_scale = ['credit_score', 'loan_amount_requested', 'annual_income',
                 'diff_income_to_expenses', 'debt_to_income_ratio']
df_scaled[cols_to_scale] = scaler.fit_transform(df_scaled[cols_to_scale])

In [8]:
# Save the fitted Min-Max Scaler to disk.
joblib.dump(value=scaler, filename='min_max_scaler.pkl')

['min_max_scaler.pkl']

In [9]:
# Display the frame after scaling to check the transformed numeric columns.
df_scaled

Unnamed: 0,credit_score,loan_amount_requested,annual_income,diff_income_to_expenses,debt_to_income_ratio,loan_approval_status,age_group_Early Retirement,age_group_Middle Adult,age_group_Middle Age,age_group_Young Adult,employment_status_Employed,employment_status_Self-Employed,employment_status_Unemployed
0,0.806922,0.558159,0.922324,0.815031,0.095621,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.306011,0.093946,0.008870,0.067841,0.584275,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.162113,0.151661,0.060051,0.142700,0.336068,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.870674,0.826795,0.906569,0.790808,0.140054,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.823315,0.524586,0.474994,0.555503,0.072480,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50494,0.147541,0.306380,0.277531,0.405776,0.143302,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
50495,0.775956,0.463128,0.454292,0.558960,0.210806,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50496,0.174863,0.158290,0.288194,0.305357,0.145225,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
50497,0.112933,0.336152,0.226835,0.299023,0.180531,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [10]:
# Normalise column names: spaces become underscores and everything is lower-cased
# (e.g. 'age_group_Middle Adult' -> 'age_group_middle_adult').
df_scaled.columns = [
    col_name.replace(' ', '_').lower() for col_name in df_scaled.columns
]

In [11]:
# Preview the first rows to confirm the renamed columns.
df_scaled.head(n=5)

Unnamed: 0,credit_score,loan_amount_requested,annual_income,diff_income_to_expenses,debt_to_income_ratio,loan_approval_status,age_group_early_retirement,age_group_middle_adult,age_group_middle_age,age_group_young_adult,employment_status_employed,employment_status_self-employed,employment_status_unemployed
0,0.806922,0.558159,0.922324,0.815031,0.095621,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.306011,0.093946,0.00887,0.067841,0.584275,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.162113,0.151661,0.060051,0.1427,0.336068,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.870674,0.826795,0.906569,0.790808,0.140054,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.823315,0.524586,0.474994,0.555503,0.07248,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [12]:
# Separate the target from the features.
y = df_scaled['loan_approval_status']
X = df_scaled.drop(columns=[y.name])

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class balance, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)


In [13]:
# LightGBM classifier with every hyperparameter spelled out explicitly.
# NOTE(review): the original header said "with Tuning", but no search is run
# in this cell and GridSearchCV is imported yet unused here — confirm whether
# a tuning step was intended.
lgbm_params = dict(
    # Boosting setup
    boosting_type='gbdt',
    objective=None,
    n_estimators=100,
    learning_rate=0.1,
    # Tree shape
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    # Row / column sampling
    subsample=1.0,
    subsample_freq=0,
    subsample_for_bin=200000,
    colsample_bytree=1.0,
    # Regularisation and class handling
    reg_alpha=0.0,
    reg_lambda=0.0,
    class_weight=None,
    # Bookkeeping
    importance_type='split',
    random_state=123,
    n_jobs=-1,
)
lgbm_model = LGBMClassifier(**lgbm_params)

In [14]:
# Fit the classifier on the training split.
lgbm_model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 26165, number of negative: 14234
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1284
[LightGBM] [Info] Number of data points in the train set: 40399, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.647665 -> initscore=0.608789
[LightGBM] [Info] Start training from score 0.608789


In [15]:
# Predict class labels for both splits (train first, then test).
train_predictions, test_predictions = (
    lgbm_model.predict(features) for features in (X_train, X_test)
)

In [16]:
# Score the held-out predictions with the F1 metric (not plain accuracy)
# and report it to four decimal places.
f1 = f1_score(y_true=y_test, y_pred=test_predictions)
print(f'F1-Score: {f1:.4f}')

F1-Score: 0.8883


In [17]:
# Persist the trained model under `models/`.
# joblib.dump does not create missing parent directories, so ensure the target
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
model_path = 'models/lgbm_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(lgbm_model, model_path)

['lgbm_model.pkl']