In [1]:
# LOAN DEFAULT PREDICTION USING LIGHTGBM

import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


# Read the Excel dataset into a pandas DataFrame
data = pd.read_csv('accepted_2007_to_2018Q4.csv', low_memory=False)



def separate_features(dataframe):
    categorical_features = []
    continuous_features = []

    for column in dataframe.columns:
        if dataframe[column].dtype == 'object' or dataframe[column].dtype.name == 'category':
            categorical_features.append(column)
        elif dataframe[column].dtype == 'int64' or dataframe[column].dtype == 'float64':
            continuous_features.append(column)
        else:
            print(f'Unhandled data type in column "{column}": {dataframe[column].dtype}')

    return categorical_features, continuous_features



#print("Categorical Features:", cat_features)
#print("Continuous Features:", cont_features)


data = data[data['loan_status'].isin(['Fully Paid','Charged Off','Defaulted'])]
leakage_features = [
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_d',
    'last_pymnt_amnt',
    'next_pymnt_d',
    'last_credit_pull_d',
    'debt_settlement_flag',
    'debt_settlement_flag_date',
    'settlement_status',
    'settlement_date',
    'settlement_amount',
    'settlement_percentage',
    'settlement_term',
    'last_fico_range_high',
    'last_fico_range_low',
    'id',
    'url',
    'emp_title',
    'title'
]

# Drop the leakage features
data = data.drop(leakage_features, axis=1)

# Calculate the percentage of missing values for each feature
missing_values = data.isnull().sum() / len(data) * 100

# Identify columns with more than 90% missing values
columns_to_drop = missing_values[missing_values > 90].index

# Drop the identified columns
data = data.drop(columns_to_drop, axis=1)

categorical_features, continuous_features = separate_features(data)
categorical_features.remove('loan_status')

# fill numeric NAs with median
#median_values = data[continuous_features].median() 
#data[continuous_features] = data[continuous_features].fillna(median_values)
data[continuous_features] = data[continuous_features].fillna(-1)

# Fill categorical NAs with "missing" and convert all elements to strings
data[categorical_features] = data[categorical_features].fillna('missing').astype(str)

data.isna().sum().sum()

#Scaling
#scaler = StandardScaler()
#data[continuous_features] = scaler.fit_transform(data[continuous_features])

for feature in categorical_features:
    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])

# Preprocess loan status labels with custom encoding
custom_encoding = {
    'Fully Paid': 1,
    'Charged Off': 0,
    'Default': 0,
    
}
data['loan_status'] = data['loan_status'].replace(custom_encoding)


# Split the dataset into features (X) and labels (y)
X = data.drop('loan_status', axis=1)
y = data['loan_status']




X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Apply RandomUnderSampler to balance the training dataset
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)



# Scaling continuous features for the training, validation, and test sets
scaler = StandardScaler()
X_train_resampled[continuous_features] = scaler.fit_transform(X_train_resampled[continuous_features])
X_test[continuous_features] = scaler.transform(X_test[continuous_features])

# Define the hyperparameter search space
param_dist = {
    'num_leaves': [20, 31, 50],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.7, 0.9, 1],
    'lambda_l1': [0, 0.01, 0.1],
    'lambda_l2': [0, 0.01, 0.1],
    'n_estimators': [100, 500, 1000]
}

clf = lgb.LGBMClassifier()
random_search = RandomizedSearchCV(clf, param_dist, n_iter=50, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

# Perform random search with cross-validation
random_search.fit(X_train_resampled, y_train_resampled)

# Get the best hyperparameters found by the random search
best_hyperparameters = random_search.best_params_



Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [2]:
best_model = lgb.LGBMClassifier(**best_hyperparameters)
best_model.fit(X_train_resampled, y_train_resampled)

y_pred = best_model.predict(X_test)
y_pred_binary = y_pred.astype(int)

# Calculate the test accuracy
test_accuracy = accuracy_score(y_test, y_pred_binary)
test_precision = precision_score(y_test, y_pred_binary)
test_recall = recall_score(y_test, y_pred_binary)
test_f1_score = f1_score(y_test, y_pred_binary)

# Calculate the training accuracy
y_pred_train = best_model.predict(X_train)
y_pred_train_binary = y_pred_train.astype(int)
train_accuracy = accuracy_score(y_train, y_pred_train_binary)

# Eval metrics
print(f"Training accuracy: {train_accuracy}")
print(f"Test accuracy: {test_accuracy}")
print(f"Test precision: {test_precision}")
print(f"Test recall: {test_recall}")
print(f"Test F1-score: {test_f1_score}")


Training accuracy: 0.6153971946986196
Test accuracy: 0.6635756814414522
Test precision: 0.8954583104130706
Test recall: 0.6568541055990501
Test F1-score: 0.7578184153144355
