In [219]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Loan Approval Prediction Data

In [220]:
is_show_viz = False

## Read Data

In [221]:
df = pd.read_csv('./raw_data/train.csv')

In [222]:
if is_show_viz == True:
    df.info()

In [223]:
if is_show_viz == True:
    print(df.loc[1])

## Separate categorical from numerical columns

In [224]:
cat_cols = ['person_home_ownership', 'loan_intent', 'loan_grade','cb_person_default_on_file']
num_cols = num_cols = [item for item in df.columns if item not in cat_cols]

## General structure of the data.

In [225]:
if is_show_viz == True:
    df.head()

### Overview of numerical columns

In [226]:
if is_show_viz == True:
    df[num_cols].describe()

#### => No extreme outliers.

### Plot categorical variables

In [227]:
if is_show_viz == True:
    for i in cat_cols:
        counts = df[i].value_counts()
        counts.plot(kind = 'bar')
        plt.ylabel('count')
        plt.title(i)
        plt.xticks(rotation=45)
        plt.show()

### Plot numerical variables

In [228]:
if is_show_viz == True:
    for i in num_cols:
        plt.hist(df[i], bins = 30, edgecolor = 'black')
        plt.ylabel('Frequency')
        plt.title(i)
        plt.show()

## Preprocessing

### Duplicates - No.

In [229]:
if is_show_viz == True:
    print(df.duplicated().sum())

### Missing values - No.

In [230]:
if is_show_viz == True:
    print(df.isna().sum())

### Overview of categorical data

In [231]:
if is_show_viz == True:
    for col in df[cat_cols]:
        print(f"{col}: {df[col].unique()}")

### Encode categorical ordinal data

In [232]:
df['cb_person_default_on_file'] = df['cb_person_default_on_file'].map({'N': 0, 'Y': 1})
# loan status
df['loan_grade'] = df['loan_grade'].map({'G': 1, 'F': 2, 'E': 3, 'D': 4, 'C': 5, 'B': 6 , 'A': 7}) # needs normalization later
df[cat_cols].head()

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,RENT,EDUCATION,6,0
1,OWN,MEDICAL,5,0
2,OWN,PERSONAL,7,0
3,RENT,VENTURE,6,0
4,RENT,MEDICAL,7,0


In [233]:
if is_show_viz == True:
    df.columns

### One-Hot-Encode categorical non-ordinal data

In [234]:
df = pd.get_dummies(df, columns = ['person_home_ownership','loan_intent'])
df.columns

Index(['id', 'person_age', 'person_income', 'person_emp_length', 'loan_grade',
       'loan_amnt', 'loan_int_rate', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length',
       'loan_status', 'person_home_ownership_MORTGAGE',
       'person_home_ownership_OTHER', 'person_home_ownership_OWN',
       'person_home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION',
       'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT',
       'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE'],
      dtype='object')

In [235]:
if is_show_viz == True:
    df.head()

## Vizualizations

### Correlations

In [236]:
if is_show_viz == True:
    corr_matrix = df.corr()
    plt.figure(figsize=(8,6))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.01f', linewidths = 1)

### Pivot Tables

In [237]:
if is_show_viz == True:
    pivot = pd.pivot_table(df, index = 'person_age', columns = 'cb_person_cred_hist_length', aggfunc='size', fill_value = 0)
    plt.figure(figsize=(12,8))
    sns.heatmap(pivot, cmap = 'YlGnBu', annot=True, fmt="d", linewidths = 0.5)
    plt.title('Frequency of Person Age and Credit History Length Combinations')
    plt.xlabel('Credit History Length')
    plt.ylabel('Person Age')
    plt.show()

In [238]:
if is_show_viz == True:
    pivot = pd.pivot_table(df, index = 'person_home_ownership_RENT', columns = 'person_home_ownership_MORTGAGE', aggfunc='size', fill_value = 0)
    plt.figure(figsize=(12,8))
    sns.heatmap(pivot, cmap = 'YlGnBu', annot=True, fmt="d", linewidths = 0.5)
    plt.title('Frequency of home ownership = Rent and home ownership = Mortgage')
    plt.xlabel('Mortgage')
    plt.ylabel('Rent')
    plt.show()

In [239]:
if is_show_viz == True:
    pivot = pd.pivot_table(df, index = 'loan_grade', columns = 'cb_person_default_on_file', aggfunc='size', fill_value = 0)
    plt.figure(figsize=(12,8))
    sns.heatmap(pivot, cmap = 'YlGnBu', annot=True, fmt="d", linewidths = 0.5)
    plt.title('Frequency of loangrade and default on file')
    plt.xlabel('default on file')
    plt.ylabel('loangrade')
    plt.show()

In [240]:
if is_show_viz == True:
    plt.figure(figsize=(12, 8))
    sns.violinplot(data=df, x='cb_person_default_on_file', y='loan_int_rate', hue='cb_person_default_on_file', palette='YlGnBu', legend=False)
    plt.title('Distribution of Interest Rates by Default Status')
    plt.xlabel('Default on File')
    plt.ylabel('Interest Rate')
    plt.show()

### Vizualizing Outliers

In [241]:
import math

if is_show_viz == True:
    stats_df = pd.DataFrame(index=df.select_dtypes(include=['float64', 'int64']).columns,
                        columns=['Q1', 'Q3', 'IQR'])
    #print(stats_df.index)
    num_features = len(stats_df.index)
    num_cols = len(stats_df.columns)
    num_rows = math.ceil(num_features / num_cols) 
    
    fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 5 * num_rows))
    axs = axs.flatten()
    
    for i, col in enumerate(stats_df.index):
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        stats_df.loc[col, 'Q1'] = Q1
        stats_df.loc[col, 'Q3'] = Q3
        stats_df.loc[col, 'IQR'] = IQR
        sns.boxplot(x=df[col], ax=axs[i])
        axs[i].set_title(f"{col} Boxplot")
    
    
    # Hide any unused subplots
    for j in range(i + 1, len(axs)):
        axs[j].axis('off')
    
    plt.tight_layout()
    plt.show()

### Removing outliers based on distance from the 25% and 75% quantiles. 

In [242]:
if is_show_viz == True:
    df.info()

In [243]:
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3.5 * IQR
    upper_bound = Q3 + 3.5 * IQR
    
    if col != 'loan_status':
        num_outliers_before = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        print(f"Number of outliers in {col} before removing: {num_outliers_before}")
    
        df = df.loc[lambda x: (x[col] >= lower_bound) & (x[col] <= upper_bound)]

        num_outliers_after = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        print(f"Number of outliers in {col} after removing: {num_outliers_after}")

Number of outliers in id before removing: 0
Number of outliers in id after removing: 0
Number of outliers in person_age before removing: 236
Number of outliers in person_age after removing: 0
Number of outliers in person_income before removing: 373
Number of outliers in person_income after removing: 0
Number of outliers in person_emp_length before removing: 39
Number of outliers in person_emp_length after removing: 0
Number of outliers in loan_grade before removing: 0
Number of outliers in loan_grade after removing: 0
Number of outliers in loan_amnt before removing: 0
Number of outliers in loan_amnt after removing: 0
Number of outliers in loan_int_rate before removing: 0
Number of outliers in loan_int_rate after removing: 0
Number of outliers in loan_percent_income before removing: 1
Number of outliers in loan_percent_income after removing: 0
Number of outliers in cb_person_default_on_file before removing: 8609
Number of outliers in cb_person_default_on_file after removing: 0
Number of

In [244]:
if is_show_viz == True:
    df.info()

## Feature Engineering

In [245]:
import numpy as np
df['employment_gap'] = df['person_age'] - df['person_emp_length']
df['loan_interest_amount'] = np.where(df['loan_amnt'] == 0, 0, df['loan_int_rate'] / 100 * df['loan_amnt'])
df['age_to_credit_history'] = np.where(df['cb_person_cred_hist_length'] == 0, df['person_age'] , df['person_age'] / df['cb_person_cred_hist_length'])
df['age_to_employment'] = np.where(df['person_emp_length'] == 0, df['person_age'], df['person_age'] / df['person_emp_length'])
df['age_to_rate'] = np.where(df['loan_int_rate'] == 0, df['person_age'], df['person_age'] / df['loan_int_rate'])
if is_show_viz == True:
    df.info()

## Training models
#### Normalization (MinMaxScaler, because no normal distribution and values should be between 0 and 1 and there are no extreme Outliers)

In [246]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier as lgbm

scaler = MinMaxScaler()
cols_to_norm = ['person_age', 'person_income', 'person_emp_length', 'loan_grade', 'loan_amnt', 
                'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length','loan_interest_amount','employment_gap',
                'employment_gap','loan_interest_amount','age_to_credit_history','age_to_employment', 'age_to_rate'
               ]
               
cols = [col for col in df.columns if col not in ['loan_status','id']]
X = df[cols]
y = df['loan_status'].astype('float64')

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, stratify=y)


# Normalization
X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])

model_lr = LogisticRegression(max_iter=7000)
model_lgbm = lgbm(device = 'gpu', n_jobs = 1, verbose = -1)

models = [model_lr, model_lgbm]
for model in models:
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]
    
    score = roc_auc_score(y_test, y_pred_proba)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print('roc_auc: ', score)
    print('accuracy: ', acc)
    print('Classification Report: \n', report)

roc_auc:  0.8966706370513832
accuracy:  0.9213300892133008
Classification Report: 
               precision    recall  f1-score   support

         0.0       0.93      0.98      0.96      8726
         1.0       0.77      0.46      0.57      1138

    accuracy                           0.92      9864
   macro avg       0.85      0.72      0.76      9864
weighted avg       0.91      0.92      0.91      9864

roc_auc:  0.9540985528169255
accuracy:  0.9613746958637469
Classification Report: 
               precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      8726
         1.0       0.94      0.71      0.81      1138

    accuracy                           0.96      9864
   macro avg       0.95      0.85      0.89      9864
weighted avg       0.96      0.96      0.96      9864



## Optuna tuning

In [247]:
import optuna
# Optuna Hyperparameter Tuning
# Define the objective function for Optuna
def objective(trial):
    param = {
        'device': 'gpu',
        'n_jobs': 1,
        'verbose': -1,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0, log=True),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }
    
    model = lgbm(**param)
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='roc_auc').mean()
    return score

# Create a study and run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

# Print the best trial
print('Best trial:')
trial = study.best_trial
print(f'  Value: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[I 2024-11-05 22:23:41,019] A new study created in memory with name: no-name-62413278-777f-4167-a97f-6850c37d28d7
[I 2024-11-05 22:23:47,667] Trial 0 finished with value: 0.9455155615824373 and parameters: {'learning_rate': 0.08845451621560302, 'n_estimators': 641, 'max_depth': 6, 'num_leaves': 118, 'feature_fraction': 0.7986872849264339, 'bagging_fraction': 0.9547740979028987, 'lambda_l1': 3.1057431867752032, 'lambda_l2': 2.079057344159426e-05}. Best is trial 0 with value: 0.9455155615824373.
[I 2024-11-05 22:23:52,062] Trial 1 finished with value: 0.946750637562152 and parameters: {'learning_rate': 0.06184787392255232, 'n_estimators': 546, 'max_depth': 5, 'num_leaves': 68, 'feature_fraction': 0.8733249685255821, 'bagging_fraction': 0.8557749563574011, 'lambda_l1': 4.851084479663115e-05, 'lambda_l2': 0.0012101926171261268}. Best is trial 1 with value: 0.946750637562152.
[I 2024-11-05 22:23:56,148] Trial 2 finished with value: 0.9411921244935902 and parameters: {'learning_rate': 0.1683

Best trial:
  Value: 0.9494729758528867
  Params: 
    learning_rate: 0.1567771264642283
    n_estimators: 487
    max_depth: 3
    num_leaves: 50
    feature_fraction: 0.8155317089131539
    bagging_fraction: 0.594724692444444
    lambda_l1: 1.8304584557138635
    lambda_l2: 0.0002825258688921562


## Train model with the best parameters

In [248]:
# Train model with the best parameters
best_params = study.best_params
best_model = lgbm(**best_params, device='gpu', n_jobs=1, verbose=-1)
best_model.fit(X_train, y_train)

# Evaluate the model
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

score = roc_auc_score(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print('Best LGBM model performance:')
print('roc_auc: ', score)
print('accuracy: ', acc)
print('Classification Report: \n', report)

Best LGBM model performance:
roc_auc:  0.9553011483770498
accuracy:  0.9603609083536091
Classification Report: 
               precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      8726
         1.0       0.92      0.71      0.81      1138

    accuracy                           0.96      9864
   macro avg       0.94      0.85      0.89      9864
weighted avg       0.96      0.96      0.96      9864

