<a href="https://www.kaggle.com/code/adastroabyssosque/loan-approval-prediction-with-cascade-models?scriptVersionId=203843776" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [89]:
!pip install optuna



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# import data

In [None]:
loan_tr = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
loan_te = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
loan_tr.info()

## drop id

The `id` column of the test data is stored as `test_id` so it can be reused for the submission file.

In [None]:
loan_tr = loan_tr.drop(columns = ['id'])
loan_tr.columns

In [None]:
test_id = loan_te['id']
loan_te = loan_te.drop(columns = ['id'])
loan_te.columns

# Overview on train data by column

## min max scaler

In [None]:
def min_max_scaler(col):
    """Rescale ``col`` linearly so that its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    col : pandas.Series or numpy.ndarray
        Numeric values to rescale. Anything that offers ``.min()``, ``.max()``
        and element-wise arithmetic works.

    Returns
    -------
    Same container type as ``col``, holding ``(col - min) / (max - min)``.
    A constant column (``max == min``) comes back as all zeros instead of the
    NaNs a 0/0 division would produce. Missing values stay missing.
    """
    lowest = col.min()
    span = col.max() - lowest
    if span == 0:
        # Every value equals the minimum: avoid 0/0 -> NaN, keep float dtype.
        return (col - lowest) * 0.0
    return (col - lowest) / span

## styles first: create a color palette

In [None]:
import matplotlib.colors as colors

color_l = [ '#445E93', '#7EB2DD','#FCECC9','#FCB0B3','#F93943']
is_cat = colors.ListedColormap(name = 'imperial spring', colors = color_l)
is_uncat = colors.LinearSegmentedColormap.from_list(name = 'imperial spring', colors = color_l)
data = np.random.rand(10, 10)*10

In [None]:
# display to see outcome
from matplotlib import pyplot as plt

plt.imshow(data, cmap=is_uncat)
plt.colorbar()

In [None]:
plt.imshow(data, cmap = is_cat)
plt.colorbar()

## loan status

In [None]:
loan_tr.loan_status.head(5)

In [None]:
loan_tr.loan_status.unique()

In [None]:
loan_tr.loan_status.describe()

## age

In [None]:
loan_tr.person_age.describe()

In [None]:
loan_te.person_age.describe()

In [None]:
plt.scatter(loan_tr.person_age, loan_tr.loan_status, c = loan_tr.person_age, cmap = is_uncat)
plt.colorbar()

### A bit of manipulation on person age and credit history length
This is done because these two columns have a correlation of 0.87, and both have a high F-classification score with respect to the response variable. (For more information, see the previous versions of this script.)

In [None]:
loan_tr.cb_person_cred_hist_length.describe()

In [None]:
plt.scatter(loan_tr.cb_person_cred_hist_length, loan_tr.loan_status, c = loan_tr.cb_person_cred_hist_length, cmap = is_uncat)
plt.colorbar()

In [None]:
loan_tr.insert(0, 'age_cred_time_ratio', loan_tr.person_age/loan_tr.cb_person_cred_hist_length)
loan_tr['age_cred_time_ratio'] = min_max_scaler(loan_tr.age_cred_time_ratio)
loan_tr.age_cred_time_ratio.describe()

In [None]:
loan_tr.insert(0, 'emp_length_cred_time_ratio', loan_tr.person_emp_length/loan_tr.cb_person_cred_hist_length)

loan_tr['emp_length_cred_time_ratio'] = min_max_scaler(loan_tr.emp_length_cred_time_ratio)
loan_tr.emp_length_cred_time_ratio.describe()

In [None]:
loan_tr.insert(0, 'age_salary_ratio', loan_tr.person_age/loan_tr.person_income)

loan_tr['age_salary_ratio'] = min_max_scaler(loan_tr.age_salary_ratio)
loan_tr.age_salary_ratio.describe()

In [None]:
# Employment-length / credit-history-length ratio for the test set.
# It is rescaled with the min/max of the same ratio computed on the TRAINING
# data, so a given raw ratio maps to the same scaled value in both frames.
# Scaling each frame by its own extremes would make the feature incomparable
# between the data the models are fit on and the data they predict on.
# This relies on loan_tr.person_emp_length and
# loan_tr.cb_person_cred_hist_length still holding raw values at this point.
train_ratio = loan_tr.person_emp_length / loan_tr.cb_person_cred_hist_length
test_ratio = loan_te.person_emp_length / loan_te.cb_person_cred_hist_length
loan_te.insert(
    0,
    'emp_length_cred_time_ratio',
    (test_ratio - train_ratio.min()) / (train_ratio.max() - train_ratio.min()),
)
loan_te.emp_length_cred_time_ratio.describe()

In [None]:
# Age / credit-history-length ratio for the test set.
# It is rescaled with the min/max of the same ratio computed on the TRAINING
# data, so the feature has the same meaning in both frames (scaling the test
# frame by its own extremes would shift it relative to what the models saw).
# This relies on loan_tr.person_age and loan_tr.cb_person_cred_hist_length
# still holding raw values at this point.
train_ratio = loan_tr.person_age / loan_tr.cb_person_cred_hist_length
test_ratio = loan_te.person_age / loan_te.cb_person_cred_hist_length
loan_te.insert(
    0,
    'age_cred_time_ratio',
    (test_ratio - train_ratio.min()) / (train_ratio.max() - train_ratio.min()),
)
loan_te.age_cred_time_ratio.describe()

In [None]:
# Age / income ratio for the test set.
# It is rescaled with the min/max of the same ratio computed on the TRAINING
# data, so the feature has the same meaning in both frames (scaling the test
# frame by its own extremes would shift it relative to what the models saw).
# This relies on loan_tr.person_age and loan_tr.person_income still holding
# raw values at this point.
train_ratio = loan_tr.person_age / loan_tr.person_income
test_ratio = loan_te.person_age / loan_te.person_income
loan_te.insert(
    0,
    'age_salary_ratio',
    (test_ratio - train_ratio.min()) / (train_ratio.max() - train_ratio.min()),
)
loan_te.age_salary_ratio.describe()

In [None]:
# Min-max scale age and credit-history length in both frames, using the
# TRAINING minimum and maximum for train and test alike. The extremes are read
# before the train column is overwritten, so the test set gets exactly the
# same transformation instead of one based on its own min/max.
for col_name in ['person_age', 'cb_person_cred_hist_length']:
    col_min, col_max = loan_tr[col_name].min(), loan_tr[col_name].max()
    loan_tr[col_name] = (loan_tr[col_name] - col_min) / (col_max - col_min)
    loan_te[col_name] = (loan_te[col_name] - col_min) / (col_max - col_min)

## person_income

In [None]:
loan_tr.person_income.describe()

In [None]:
plt.scatter(loan_tr.person_income, loan_tr.loan_status, c = loan_tr.person_income, cmap = is_uncat)
plt.colorbar()

We can see that, at least in our train data, only people with lower income have loan status = 1.

In [None]:
loan_tr['person_income'] = min_max_scaler(loan_tr.person_income)
loan_tr['person_income'].head()

In [None]:
loan_te['person_income'] = min_max_scaler(loan_te.person_income)
loan_te['person_income'].head()

## person_home_ownership

In [None]:
loan_tr.columns

In [None]:
loan_tr.person_home_ownership.describe()

In [None]:
loan_tr = pd.get_dummies(loan_tr, columns = ['person_home_ownership'], prefix='OWN')

In [None]:
loan_te = pd.get_dummies(loan_te, columns = ['person_home_ownership'], prefix='OWN')

## person_emp_length

In [None]:
plt.scatter(loan_tr.person_emp_length, loan_tr.loan_status, c = loan_tr.person_emp_length, cmap = is_uncat)
plt.colorbar()

In [None]:
loan_tr['person_emp_length'] = min_max_scaler(loan_tr.person_emp_length)
loan_tr['person_emp_length'].head()

In [None]:
loan_te['person_emp_length'] = min_max_scaler(loan_te.person_emp_length)
loan_te['person_emp_length'].head()

## loan_intent

In [None]:
loan_tr.loan_intent.info()

In [None]:
loan_tr.groupby(by = 'loan_intent')['loan_intent'].count()

In [None]:
loan_te.groupby(by='loan_intent')['loan_intent'].count()

In [None]:
loan_tr = pd.get_dummies(loan_tr, columns = ['loan_intent'], prefix='INTENT')

In [None]:
loan_te = pd.get_dummies(loan_te, columns = ['loan_intent'], prefix='INTENT')

## loan grade

In [None]:
loan_tr.loan_grade.info()

In [None]:
loan_tr.loan_grade.unique()

In [None]:
loan_te.loan_grade.unique()

In [None]:
grade_dict = {'A':1,'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7}
loan_tr.loan_grade = loan_tr.loan_grade.apply(lambda x: grade_dict[x])

In [None]:
loan_te.loan_grade = loan_te.loan_grade.apply(lambda x: grade_dict[x])

### loan_int_rate and loan_grade
These columns have a correlation of 0.94 (see the previous version of this script for more information).

In [None]:
plt.scatter(loan_tr.loan_grade, loan_tr.loan_int_rate, c = loan_tr.loan_grade, cmap = is_uncat)
plt.colorbar()

In [None]:
plt.scatter(loan_te.loan_grade, loan_te.loan_int_rate, c = loan_te.loan_grade, cmap = is_uncat)
plt.colorbar()

## loan amnt

In [None]:
loan_tr.loan_amnt.describe()

In [None]:
plt.scatter(loan_tr.loan_amnt, loan_tr.loan_status, c = loan_tr.loan_amnt, cmap = is_uncat)
plt.colorbar()

In [None]:
loan_tr['loan_amnt'] = min_max_scaler(loan_tr.loan_amnt)
loan_tr['loan_amnt'].head()

In [None]:
loan_te['loan_amnt'] = min_max_scaler(loan_te.loan_amnt)
loan_te['loan_amnt'].head()

## loan_int_rate

In [None]:
plt.scatter(loan_tr.loan_int_rate, loan_tr.loan_status, c = loan_tr.loan_int_rate, cmap = is_uncat)
plt.colorbar()

In [None]:
loan_tr['loan_int_rate'] = min_max_scaler(loan_tr.loan_int_rate)
loan_tr['loan_int_rate'].head()

In [None]:
loan_te['loan_int_rate'] = min_max_scaler(loan_te.loan_int_rate)
loan_te['loan_int_rate'].head()

## loan_percent_income

In [None]:
loan_tr.loan_percent_income.describe()

In [None]:
plt.scatter(loan_tr.loan_percent_income, loan_tr.loan_status, c = loan_tr.loan_percent_income, cmap = is_uncat)

## cb_person_default_on_file

In [None]:
loan_tr.cb_person_default_on_file.describe()

In [None]:
loan_tr.cb_person_default_on_file.unique()

In [None]:
loan_tr = pd.get_dummies(loan_tr, columns = ['cb_person_default_on_file'], prefix = 'cb')

In [None]:
loan_tr.head(1)

In [None]:
loan_tr = loan_tr.drop(columns =['cb_Y'], axis=1)

In [None]:
loan_te = pd.get_dummies(loan_te, columns = ['cb_person_default_on_file'], prefix = 'cb')
loan_te = loan_te.drop(columns =['cb_Y'], axis=1)

# Analysis on correlation btw features and btw X and Y

## split X and Y

In [None]:
y = loan_tr['loan_status']
X = loan_tr.drop(columns=['loan_status'], axis=1)
X.head(1)

## corr matrix

In [None]:
y = loan_tr.pop('loan_status')

In [None]:
loan_tr.insert(loan_tr.shape[1],'loan_status', y)

In [None]:
import seaborn as sns

# Pairwise correlation of every column of loan_tr (features plus loan_status).
corr = loan_tr.corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once.
matrix = np.triu(corr)
# Size the figure from the correlation matrix itself rather than from the
# unrelated random demo array `data`; never go below the former 10x10 inches.
fig_side = max(10, 0.6 * corr.shape[0])
fig, ax = plt.subplots(figsize=(fig_side, fig_side))
sns.heatmap(corr, annot=True, cbar=False, fmt='.2f', cmap=is_uncat, ax=ax, mask=matrix)
plt.tight_layout()

## mutual information

In [None]:
from sklearn.feature_selection import mutual_info_classif
info = mutual_info_classif(X, y)

In [None]:
print(info)

In [None]:
plt.bar(X.columns, info)
tmp = plt.xticks(rotation=90)

Because loan_int_rate and loan_grade are highly correlated, we should drop one of them.

## f stats

In [None]:
from sklearn.feature_selection import f_classif

In [None]:
f_stats, p_val = f_classif(X, y)

In [None]:
plt.bar(X.columns, p_val)
tmp = plt.xticks(rotation=90)

# cascade model fitting and hyper parameter tuning

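1. XGBoost: gives the probability of each point belonging to each class -> training data for the final model
2. LightGBM: gives a second set of class probabilities -> training data for the final model
3. RandomForest: gives the final prediction from the two probability columns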

In [None]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

## HT tuning

In [None]:
import os
def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of the cascade model.

    For each of 5 stratified folds of the module-level ``X`` / ``y``:

    1. an XGBoost and a LightGBM classifier are fit on the training part;
    2. their class-0 probabilities become the two input features of a
       RandomForest, which is fit on the training part;
    3. the RandomForest's probability of class 1 on the validation part is
       scored with ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters of all three models and to
        report the running score for pruning.

    Returns
    -------
    float
        Mean ROC AUC over the folds.

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner decides, from the running mean reported after
        each fold, that the trial should be stopped early.
    """
    params = {
        'model_xgb': {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'learning_rate': trial.suggest_float('learning_rate_xgb', 5e-4, 0.05),
            'max_depth': trial.suggest_int('max_depth_xgb', 10, 25),
            'n_estimators': trial.suggest_int('n_estimators_xgb', 1000, 3000),
            'subsample': trial.suggest_float('subsample_xgb', 0.01, 0.4),
            'reg_lambda': trial.suggest_float('reg_lambda_xgb', 5, 10),
            'colsample_bytree': trial.suggest_float('colsample_bytree_xgb', 0.13, 0.5),
        },
        'model_lgbm': {
            'objective': 'binary',
            'metric': 'auc',
            'learning_rate': trial.suggest_float('learning_rate_lgbm', 1e-5, 1),
            'n_estimators': trial.suggest_int('n_estimators_lgbm', 100, 1000),
            'max_depth': trial.suggest_int('max_depth_lgbm', 3, 50),
            'num_leaves': trial.suggest_int('num_leaves_lgbm', 30, 300),
            'min_child_weight': trial.suggest_int('min_child_weight_lgbm', 20, 300),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf_lgbm', 50, 1000),
            'feature_fraction': trial.suggest_float('feature_fraction_lgbm', 0.1, 1),
            'bagging_fraction': trial.suggest_float('bagging_fraction_lgbm', 0.1, 1),
            'bagging_freq': trial.suggest_int('bagging_freq_lgbm', 1, 10),
            'reg_alpha': trial.suggest_float('reg_alpha_lgbm', 0, 10),
            'reg_lambda': trial.suggest_float('reg_lambda_lgbm', 0, 10),
            'verbose': -1,
        },
        'real_predictor': {
            'n_estimators': trial.suggest_int("n_estimators_rf", 100, 500),
            'max_depth': trial.suggest_int("max_depth_rf", 10, 15),
            'min_samples_split': trial.suggest_int("min_samples_split_rf", 2, 10),
            'min_samples_leaf': trial.suggest_int("min_samples_leaf_rf", 1, 5),
            'max_features': trial.suggest_float("max_features_rf", 0.1, 1.0),
            'bootstrap': True,
            'class_weight': 'balanced',
            'random_state': 42,
            'n_jobs': os.cpu_count(),
        },
    }

    scores = []

    skf = StratifiedKFold(n_splits=5)

    for fold_idx, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # 'balanced' class weights of the training part; the weight of class 1
        # is handed to XGBoost as scale_pos_weight.
        fold_classes = np.unique(y_train_fold)
        class_weights = class_weight.compute_class_weight('balanced',
                                                          classes=fold_classes,
                                                          y=y_train_fold)
        class_weights_dict = dict(zip(fold_classes, class_weights))

        prior_model_xgb = xgb.XGBClassifier(**params['model_xgb'],
                                            scale_pos_weight=class_weights_dict[1])
        prior_model_xgb.fit(X_train_fold, y_train_fold)
        proba_tr_xgb = prior_model_xgb.predict_proba(X_train_fold)
        proba_val_xgb = prior_model_xgb.predict_proba(X_val_fold)

        prior_model_lgb = lgb.LGBMClassifier(**params['model_lgbm'], class_weight='balanced')
        prior_model_lgb.fit(X_train_fold, y_train_fold)
        proba_tr_lgb = prior_model_lgb.predict_proba(X_train_fold)
        proba_val_lgb = prior_model_lgb.predict_proba(X_val_fold)

        # Second-stage features: class-0 probability from each first-stage model.
        # NOTE(review): the training-part probabilities are in-sample (the base
        # models were fit on the same rows), so the final model sees more
        # confident inputs in training than in validation; out-of-fold
        # probabilities would avoid that.
        proba_tr = pd.DataFrame({'xgb_proba': proba_tr_xgb[:, 0], 'lgb_proba': proba_tr_lgb[:, 0]})
        proba_val = pd.DataFrame({'xgb_proba': proba_val_xgb[:, 0], 'lgb_proba': proba_val_lgb[:, 0]})

        end_model = RandomForestClassifier(**params['real_predictor'])
        end_model.fit(proba_tr, y_train_fold)
        # ROC AUC is a ranking metric: score the probability of class 1, not
        # the hard 0/1 labels returned by predict().
        real_pred = end_model.predict_proba(proba_val)[:, 1]

        scores.append(roc_auc_score(y_val_fold, real_pred))

        # Report the running mean so the study's pruner can actually stop
        # unpromising trials; without a report a pruner never triggers.
        trial.report(float(np.mean(scores)), step=fold_idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(scores)


In [None]:
pruner = optuna.pruners.HyperbandPruner()
study = optuna.create_study(direction='maximize', pruner = pruner)

study.optimize(objective, n_trials=200)  

In [None]:
all_params= list(study.best_params.keys())
all_params

In [None]:
# Split the tuned hyper-parameters back into one dict per model. Each Optuna
# name ends in a model suffix (_xgb / _lgbm / _rf); selecting by suffix does
# not depend on the order in which the parameters were suggested.
best_params = study.best_params
all_params = list(best_params.keys())

xgb_params = [name for name in all_params if name.endswith('_xgb')]
xgb_params_v = {name.rsplit('_', 1)[0]: best_params[name] for name in xgb_params}
xgb_params_v['objective'] = 'binary:logistic'
xgb_params_v['eval_metric'] = 'auc'

lgb_params = [name for name in all_params if name.endswith('_lgbm')]
lgb_params_v = {name.rsplit('_', 1)[0]: best_params[name] for name in lgb_params}
lgb_params_v['objective'] = 'binary'
# Same key as in the tuning objective ('metric', not 'eval_metric').
lgb_params_v['metric'] = 'auc'
lgb_params_v['verbose'] = -1

rf_params = [name for name in all_params if name.endswith('_rf')]
rf_params_v = {name.rsplit('_', 1)[0]: best_params[name] for name in rf_params}
rf_params_v['bootstrap'] = True
rf_params_v['class_weight'] = 'balanced'
rf_params_v['random_state'] = 42
rf_params_v['n_jobs'] = os.cpu_count()

# Re-evaluate the best configuration with the same 5-fold scheme as the tuning.
skf = StratifiedKFold(n_splits=5)
scores = []

for train_index, val_index in skf.split(X, y):
    X_tr, X_te = X.iloc[train_index], X.iloc[val_index]
    y_tr, y_te = y.iloc[train_index], y.iloc[val_index]

    # Class weights must come from the training part of the fold; using the
    # validation labels would feed held-out information into the model.
    fold_classes = np.unique(y_tr)
    class_weights = class_weight.compute_class_weight('balanced',
                                                      classes=fold_classes,
                                                      y=y_tr)
    class_weights_dict = dict(zip(fold_classes, class_weights))

    prior_model_xgb = xgb.XGBClassifier(**xgb_params_v, scale_pos_weight=class_weights_dict[1])
    prior_model_xgb.fit(X_tr, y_tr)
    proba_tr_xgb = prior_model_xgb.predict_proba(X_tr)
    proba_te_xgb = prior_model_xgb.predict_proba(X_te)

    prior_model_lgb = lgb.LGBMClassifier(**lgb_params_v, class_weight='balanced')
    prior_model_lgb.fit(X_tr, y_tr)
    proba_tr_lgb = prior_model_lgb.predict_proba(X_tr)
    proba_te_lgb = prior_model_lgb.predict_proba(X_te)

    # Second-stage features: class-0 probability from each first-stage model.
    proba_tr = pd.DataFrame({'xgb_proba': proba_tr_xgb[:, 0], 'lgb_proba': proba_tr_lgb[:, 0]})
    proba_te = pd.DataFrame({'xgb_proba': proba_te_xgb[:, 0], 'lgb_proba': proba_te_lgb[:, 0]})

    final_pred = RandomForestClassifier(**rf_params_v)
    final_pred.fit(proba_tr, y_tr)
    # ROC AUC needs scores that rank the rows: use P(class 1), not hard labels.
    real_pred = final_pred.predict_proba(proba_te)[:, 1]
    scores.append(roc_auc_score(y_te, real_pred))
print(np.mean(scores))

## optuna visualization

In [None]:
from optuna import visualization as op_viz

op_viz.plot_optimization_history(study, error_bar=True)

In [None]:
op_viz.plot_param_importances(study)

In [None]:
op_viz.plot_slice(study)

# submission

In [None]:
# Fit the cascade on the full training data and predict the test set.

# Give the test frame exactly the training columns, in the training order.
# A dummy column that is absent from the test set means "category never
# occurs", so it is filled with 0 rather than NaN.
loan_te = loan_te.reindex(X.columns, axis=1, fill_value=0)

# 'balanced' class weights of the full training target; the weight of class 1
# is handed to XGBoost as scale_pos_weight.
target_classes = np.unique(y)
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=target_classes,
                                                  y=y)
class_weights_dict = dict(zip(target_classes, class_weights))

prior_model_xgb = xgb.XGBClassifier(**xgb_params_v,
                                    scale_pos_weight=class_weights_dict[1])
prior_model_xgb.fit(X, y)
proba_tr_xgb = prior_model_xgb.predict_proba(X)
proba_te_xgb = prior_model_xgb.predict_proba(loan_te)

prior_model_lgb = lgb.LGBMClassifier(**lgb_params_v, class_weight='balanced')
prior_model_lgb.fit(X, y)
proba_tr_lgb = prior_model_lgb.predict_proba(X)
proba_te_lgb = prior_model_lgb.predict_proba(loan_te)

# Second-stage features: class-0 probability from each first-stage model.
# NOTE(review): proba_tr is in-sample (both base models were fit on X), while
# proba_te is out-of-sample; out-of-fold training probabilities would make the
# two distributions match better.
proba_tr = pd.DataFrame({'xgb_proba': proba_tr_xgb[:, 0], 'lgb_proba': proba_tr_lgb[:, 0]})
proba_te = pd.DataFrame({'xgb_proba': proba_te_xgb[:, 0], 'lgb_proba': proba_te_lgb[:, 0]})

final_pred = RandomForestClassifier(**rf_params_v)
final_pred.fit(proba_tr, y)
# The model is scored with ROC AUC throughout this notebook, so the submission
# carries the predicted probability of loan_status == 1, not hard 0/1 labels.
real_pred = final_pred.predict_proba(proba_te)[:, 1]
real_pred

In [None]:
output = pd.DataFrame({'id':test_id, 'loan_status': real_pred})

In [None]:
output.to_csv('/kaggle/working/submission.csv', index=False)