# Risk Assessment and Creditworthiness App



Import the necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, precision_score, classification_report, recall_score, f1_score, roc_auc_score, auc


To prevent the warning error from showing

In [3]:
import warnings
warnings.filterwarnings('ignore')

# Explanation of the Dataset

The definition of the variables are as follows;

person_age: The age of the borrower when securing the loan.

person_income: The borrower’s annual earnings at the time of the loan.

person_home_ownership: Type of home ownership of the applicant;
Rent: The borrower is currently renting a property.
Mortgage: The borrower has a mortgage on the property they own.
Own: The borrower owns a home outright.
Other: Other categories of home ownership that may be specific to the dataset.

person_emp_length: The amount of time in years that the borrower has been in employment.

loan_intent: Loan purpose - what applicant (borrower) wants to use the loan for.

loan_grade: A classification system based on credit history, collateral quality, and the likelihood of repayment of the principal and interest.
A: The borrower has a high creditworthiness, indicating low risk.
B: The borrower is relatively low-risk, but not as creditworthy as Grade A.
C: The borrower’s creditworthiness is moderate.
D: The borrower is considered to have a higher risk compared to previous grades.
E: The borrower’s creditworthiness is lower, indicating a higher risk.
F: The borrower poses a significant credit risk.
G: The borrower’s creditworthiness is the lowest, signifying the highest risk.

loan_amnt: Total amount of the loan applied for.

loan_int_rate: The interest rate of the loan.

loan_status: Target variable indicating Default as (1) or Non-default as (0). The loan_status variable is a crucial dependent variable. A default occurs when a borrower is unable to make timely payments, misses payments, or avoids or stops making payments on interest or principal owed;
0: Non-default - The borrower successfully repaid the loan as agreed, and there was no default.
1: Default - The borrower failed to repay the loan according to the agreed-upon terms and defaulted.

loan_percent_income: Ratio between the loan amount and the annual income of borrower.

cb_person_cred_hist_length: The number of years of personal history since the first loan was taken by the borrower.

cb_person_default_on_file: Indicates if the person has previously defaulted.


# The DataFrame is referred to as "dfrisk"

In [4]:
dfrisk = pd.read_csv('C:/Users/PC/Documents/sola/others/Telegram Desktop/credit_risk_dataset.csv')
dfrisk

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


In [5]:
dfrisk.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


Capture the data types of each column

In [6]:
dfrisk.dtypes

person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
dtype: object

# Preprocessing

Instead of making use of the encoder, we manually converted the categorical columns into Numerical columns

categorical_columns ==>> 'person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'

Column >> 'person_home_ownership'

In [7]:
dfrisk['person_home_ownership'].replace('RENT',0,inplace=True)
dfrisk['person_home_ownership'].replace('OWN',1,inplace=True)
dfrisk['person_home_ownership'].replace('MORTGAGE',2,inplace=True)
dfrisk['person_home_ownership'].replace('OTHER',3,inplace=True)

In [8]:
dfrisk['person_home_ownership'].value_counts()

person_home_ownership
0    16446
2    13444
1     2584
3      107
Name: count, dtype: int64

Column >> 'loan_intent'

In [9]:
dfrisk['loan_intent'].replace('PERSONAL',0,inplace=True)
dfrisk['loan_intent'].replace('EDUCATION',1,inplace=True)
dfrisk['loan_intent'].replace('MEDICAL',2,inplace=True)
dfrisk['loan_intent'].replace('VENTURE',3,inplace=True)
dfrisk['loan_intent'].replace('HOMEIMPROVEMENT',4,inplace=True)
dfrisk['loan_intent'].replace('DEBTCONSOLIDATION',5,inplace=True)

In [10]:
dfrisk['loan_intent'].value_counts()

loan_intent
1    6453
2    6071
3    5719
0    5521
5    5212
4    3605
Name: count, dtype: int64

Column >> 'loan_grade'

In [11]:
dfrisk['loan_grade'].replace('A',0,inplace=True)
dfrisk['loan_grade'].replace('B',1,inplace=True)
dfrisk['loan_grade'].replace('C',2,inplace=True)
dfrisk['loan_grade'].replace('D',3,inplace=True)
dfrisk['loan_grade'].replace('E',4,inplace=True)
dfrisk['loan_grade'].replace('F',5,inplace=True)
dfrisk['loan_grade'].replace('G',6,inplace=True)

In [12]:
dfrisk['loan_grade'].value_counts()

loan_grade
0    10777
1    10451
2     6458
3     3626
4      964
5      241
6       64
Name: count, dtype: int64

Column >> 'cb_person_default_on_file'

In [13]:
dfrisk['cb_person_default_on_file'].replace('Y',1,inplace=True)
dfrisk['cb_person_default_on_file'].replace('N',0,inplace=True)

In [14]:
dfrisk['cb_person_default_on_file'].value_counts()

cb_person_default_on_file
0    26836
1     5745
Name: count, dtype: int64

Generate the unique values of the converted categorical columns to confirm correctness

In [15]:
dfrisk['loan_intent'].unique()

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [16]:
dfrisk['loan_grade'].unique()

array([3, 1, 2, 0, 4, 5, 6], dtype=int64)

In [17]:
dfrisk['cb_person_default_on_file'].unique()

array([1, 0], dtype=int64)

In [18]:
dfrisk['person_home_ownership'].unique()

array([0, 1, 2, 3], dtype=int64)

Another glance at dataset, in order to further Preprocess

In [19]:
dfrisk.describe()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
count,32581.0,32581.0,32581.0,31686.0,32581.0,32581.0,32581.0,29465.0,32581.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,0.914429,4.789686,2.339769,1.218195,9589.371106,11.011695,0.218164,0.170203,0.17633,5.804211
std,6.348078,61983.12,0.960858,4.14263,1.678803,1.166336,6322.086646,3.240459,0.413006,0.106782,0.381106,4.055001
min,20.0,4000.0,0.0,0.0,0.0,0.0,500.0,5.42,0.0,0.0,0.0,2.0
25%,23.0,38500.0,0.0,2.0,1.0,0.0,5000.0,7.9,0.0,0.09,0.0,3.0
50%,26.0,55000.0,0.0,4.0,2.0,1.0,8000.0,10.99,0.0,0.15,0.0,4.0
75%,30.0,79200.0,2.0,7.0,4.0,2.0,12200.0,13.47,0.0,0.23,0.0,8.0
max,144.0,6000000.0,3.0,123.0,5.0,6.0,35000.0,23.22,1.0,0.83,1.0,30.0


Confirmation that all 4 categorical columns have been transformed into numerical...

In [20]:
print(dfrisk.dtypes)

person_age                      int64
person_income                   int64
person_home_ownership           int64
person_emp_length             float64
loan_intent                     int64
loan_grade                      int64
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file       int64
cb_person_cred_hist_length      int64
dtype: object


Checking the number of NaN values per column

In [21]:
dfrisk.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Replace the NaN values in the 'person_emp_length' column with the median value

In [22]:
dfrisk['person_emp_length'].fillna(dfrisk['person_emp_length'].median(), inplace=True)

Confirm that the 'person_emp_length' column has no NaN values anymore

In [23]:
dfrisk.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length                0
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Replace the NaN values in the 'loan_int_rate' column with the mode value

In [24]:
dfrisk['loan_int_rate'].fillna(dfrisk['loan_int_rate'].median(), inplace=True)

Confirm that the 'loan_int_rate' column has no NaN values anymore

In [25]:
dfrisk.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

Remove unrealistic fields from 2 identified columns - 144 years of age and 123 years of service

In [26]:
#Check Modal age in person_age by calculating the mode of 'person_age' column
mode_person_age = dfrisk['person_age'].mode()

# Update the 'person_age' column where the age is greater than 75 with the mode age
dfrisk.loc[dfrisk['person_age'] > 75, 'person_age'] = mode_person_age

In [27]:
# Calculate the Median value for 'person_emp_length' column
median_person_emp_length = dfrisk['person_emp_length'].median()

# Update the 'person_emp_length' column where it is greater than 50 with the median person_emp_length
dfrisk.loc[dfrisk['person_emp_length'] > 50, 'person_emp_length'] = median_person_emp_length

Confirm the removal of outliers, another look at the dataset

In [28]:
dfrisk

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22.0,59000,0,4.0,0,3,35000,16.02,1,0.59,1,3
1,21.0,9600,1,5.0,1,1,1000,11.14,0,0.10,0,2
2,25.0,9600,2,1.0,2,2,5500,12.87,1,0.57,0,3
3,23.0,65500,0,4.0,2,2,35000,15.23,1,0.53,0,2
4,24.0,54400,0,8.0,2,2,35000,14.27,1,0.55,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57.0,53000,2,1.0,0,2,5800,13.16,0,0.11,0,30
32577,54.0,120000,2,4.0,0,0,17625,7.49,0,0.15,0,19
32578,65.0,76000,0,3.0,4,1,35000,10.99,1,0.46,0,28
32579,56.0,150000,2,5.0,0,1,15000,11.48,0,0.10,0,26


In [29]:
dfrisk.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22.0,59000,0,4.0,0,3,35000,16.02,1,0.59,1,3
1,21.0,9600,1,5.0,1,1,1000,11.14,0,0.1,0,2
2,25.0,9600,2,1.0,2,2,5500,12.87,1,0.57,0,3
3,23.0,65500,0,4.0,2,2,35000,15.23,1,0.53,0,2
4,24.0,54400,0,8.0,2,2,35000,14.27,1,0.55,1,4


In [30]:
dfrisk.tail()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
32576,57.0,53000,2,1.0,0,2,5800,13.16,0,0.11,0,30
32577,54.0,120000,2,4.0,0,0,17625,7.49,0,0.15,0,19
32578,65.0,76000,0,3.0,4,1,35000,10.99,1,0.46,0,28
32579,56.0,150000,2,5.0,0,1,15000,11.48,0,0.1,0,26
32580,66.0,42000,0,2.0,2,1,6475,9.99,0,0.15,0,30


In [31]:
dfrisk.describe()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
count,32571.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0
mean,27.70965,66074.85,0.914429,4.760689,2.339769,1.218195,9589.371106,11.00962,0.218164,0.170203,0.17633,5.804211
std,6.167858,61983.12,0.960858,3.981013,1.678803,1.166336,6322.086646,3.081611,0.413006,0.106782,0.381106,4.055001
min,20.0,4000.0,0.0,0.0,0.0,0.0,500.0,5.42,0.0,0.0,0.0,2.0
25%,23.0,38500.0,0.0,2.0,1.0,0.0,5000.0,8.49,0.0,0.09,0.0,3.0
50%,26.0,55000.0,0.0,4.0,2.0,1.0,8000.0,10.99,0.0,0.15,0.0,4.0
75%,30.0,79200.0,2.0,7.0,4.0,2.0,12200.0,13.11,0.0,0.23,0.0,8.0
max,73.0,6000000.0,3.0,41.0,5.0,6.0,35000.0,23.22,1.0,0.83,1.0,30.0


In [32]:
dfrisk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32571 non-null  float64
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  int64  
 3   person_emp_length           32581 non-null  float64
 4   loan_intent                 32581 non-null  int64  
 5   loan_grade                  32581 non-null  int64  
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               32581 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  int64  
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(4), int64(8)
memory usage: 3.0 MB


In [33]:
print(dfrisk)

       person_age  person_income  person_home_ownership  person_emp_length  \
0            22.0          59000                      0                4.0   
1            21.0           9600                      1                5.0   
2            25.0           9600                      2                1.0   
3            23.0          65500                      0                4.0   
4            24.0          54400                      0                8.0   
...           ...            ...                    ...                ...   
32576        57.0          53000                      2                1.0   
32577        54.0         120000                      2                4.0   
32578        65.0          76000                      0                3.0   
32579        56.0         150000                      2                5.0   
32580        66.0          42000                      0                2.0   

       loan_intent  loan_grade  loan_amnt  loan_int_rate  loan_

# LogisticRegression

Import necessary Libraries

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score

Check data types

In [35]:
print(dfrisk.dtypes)

person_age                    float64
person_income                   int64
person_home_ownership           int64
person_emp_length             float64
loan_intent                     int64
loan_grade                      int64
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file       int64
cb_person_cred_hist_length      int64
dtype: object


Prepare features and target for Logistic Regression

In [36]:
dfrisk.corr()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
person_age,1.0,0.141424,0.033113,0.168988,0.027972,0.014779,0.052195,0.012065,-0.020128,-0.041733,0.00785,0.878456
person_income,0.141424,1.0,0.199315,0.137016,0.016113,-0.001022,0.26682,0.000746,-0.144449,-0.254471,-0.003613,0.117987
person_home_ownership,0.033113,0.199315,1.0,0.233845,0.017682,-0.119541,0.128938,-0.132323,-0.218714,-0.135376,-0.061556,0.025155
person_emp_length,0.168988,0.137016,0.233845,1.0,0.017365,-0.046959,0.111694,-0.052793,-0.08563,-0.058584,-0.029034,0.147934
loan_intent,0.027972,0.016113,0.017682,0.017365,1.0,0.013898,0.018725,0.006006,0.065381,-0.001939,0.010508,0.016905
loan_grade,0.014779,-0.001022,-0.119541,-0.046959,0.013898,1.0,0.145799,0.889929,0.37308,0.123021,0.537054,0.015069
loan_amnt,0.052195,0.26682,0.128938,0.111694,0.018725,0.145799,1.0,0.139483,0.105376,0.572612,0.039081,0.041967
loan_int_rate,0.012065,0.000746,-0.132323,-0.052793,0.006006,0.889929,0.139483,1.0,0.31936,0.114514,0.477146,0.015762
loan_status,-0.020128,-0.144449,-0.218714,-0.08563,0.065381,0.37308,0.105376,0.31936,1.0,0.379366,0.179141,-0.015529
loan_percent_income,-0.041733,-0.254471,-0.135376,-0.058584,-0.001939,0.123021,0.572612,0.114514,0.379366,1.0,0.03591,-0.03169


In [37]:
dfrisk.isna().sum()

person_age                    10
person_income                  0
person_home_ownership          0
person_emp_length              0
loan_intent                    0
loan_grade                     0
loan_amnt                      0
loan_int_rate                  0
loan_status                    0
loan_percent_income            0
cb_person_default_on_file      0
cb_person_cred_hist_length     0
dtype: int64

Fill the NaN values with the Median Value of the person_age column

In [38]:
dfrisk['person_age'].fillna(dfrisk['person_age'].median(), inplace=True)

In [39]:
dfrisk.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [40]:
X = dfrisk.drop(['loan_status'], axis=1)
y = dfrisk['loan_status']

Split the dataset >> dfrisk

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Perform standardization

In [42]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled[:5]

array([[ 5.8452654 , -0.31717677, -0.9521391 , -0.69239938, -1.39203199,
         0.6746644 , -0.76018887,  0.02663141, -0.65843875,  2.17447733,
         4.48497655],
       [-0.28026618, -0.63165151,  0.08886738, -1.19395427,  1.57848724,
         2.39492002, -0.17434095,  1.77173414,  1.49131123, -0.45988063,
        -0.69475502],
       [-0.76386077, -0.23855808,  1.12987386, -0.44162193, -1.39203199,
         0.6746644 ,  1.01318862,  0.68430072,  1.30437645,  2.17447733,
        -0.69475502],
       [-0.92505897, -0.1598765 ,  1.12987386,  0.31071041, -0.2038243 ,
        -1.04559122, -0.57018414, -1.01847675, -0.56497136, -0.45988063,
        -0.44810114],
       [-0.60266257,  0.20170655, -0.9521391 , -0.44162193, -1.39203199,
         0.6746644 , -0.41184686,  0.49872077, -0.75190614, -0.45988063,
        -0.69475502]])

Instantiate the LogisticRegression model

In [43]:
model = LogisticRegression(max_iter=1000)

Fit the model

In [44]:
model.fit(X_train_scaled, y_train)

Predict the model on the test set

In [45]:
y_pred = model.predict(X_test_scaled)

In [46]:
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [47]:
y_test

14668    0
24614    0
11096    0
10424    1
26007    1
        ..
31330    0
2862     0
14754    0
14170    0
24385    0
Name: loan_status, Length: 6517, dtype: int64

Evaluate model performance

In [48]:

accuracy = accuracy_score(y_test, y_pred)
print(f"The LogisticRegression Model performance Accuracy => {round(accuracy, 2)}")

The LogisticRegression Model performance Accuracy => 0.84


Let's take a look at other Evaluation Metrics

In [49]:
logisticRegression_report = classification_report(y_test, y_pred)
print("Classification Report for LogisticRegression \n")
print(f"{logisticRegression_report}")

Classification Report for LogisticRegression 

              precision    recall  f1-score   support

           0       0.86      0.95      0.90      5072
           1       0.72      0.47      0.56      1445

    accuracy                           0.84      6517
   macro avg       0.79      0.71      0.73      6517
weighted avg       0.83      0.84      0.83      6517



In [50]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r_square = r2_score(y_test, y_pred)

In [51]:
print(f"The LogisticRegression Model MSE => {round(mse, 2)}")
print(f"The LogisticRegression Model MAE => {round(mae, 2)}")
print(f"The LogisticRegression Model R-Square => {round(r_square, 2)}")

The LogisticRegression Model MSE => 0.16
The LogisticRegression Model MAE => 0.16
The LogisticRegression Model R-Square => 0.08


Save the LogisticRegression model

In [52]:
import joblib


joblib.dump(model, 'C:/Users/PC/Documents/sola/others/Telegram Desktop/logistic_regression_model.pkl')

['C:/Users/PC/Documents/sola/others/Telegram Desktop/logistic_regression_model.pkl']

# - - - End of LogisticRegression - - -

# XGBoost Regression 'XRegressor'

Define features and target

In [53]:

X = dfrisk.drop(columns=['loan_amnt'])
y = dfrisk['loan_amnt']

Split the dataset >> dfrisk

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Perform standardization

In [55]:

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled[:5]

array([[ 5.8452654 , -0.31717677, -0.9521391 , -0.69239938, -1.39203199,
         0.6746644 ,  0.02663141, -0.52686282, -0.65843875,  2.17447733,
         4.48497655],
       [-0.28026618, -0.63165151,  0.08886738, -1.19395427,  1.57848724,
         2.39492002,  1.77173414,  1.89802727,  1.49131123, -0.45988063,
        -0.69475502],
       [-0.76386077, -0.23855808,  1.12987386, -0.44162193, -1.39203199,
         0.6746644 ,  0.68430072, -0.52686282,  1.30437645,  2.17447733,
        -0.69475502],
       [-0.92505897, -0.1598765 ,  1.12987386,  0.31071041, -0.2038243 ,
        -1.04559122, -1.01847675, -0.52686282, -0.56497136, -0.45988063,
        -0.44810114],
       [-0.60266257,  0.20170655, -0.9521391 , -0.44162193, -1.39203199,
         0.6746644 ,  0.49872077, -0.52686282, -0.75190614, -0.45988063,
        -0.69475502]])

Import XGBoost Algorithm

In [56]:
import xgboost as xgb

Initialize and train the XGBoost regressor model


In [57]:
xgb_regressor = xgb.XGBRegressor(enable_categorical=True, n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
xgb_regressor.fit(X_train_scaled, y_train)

Predict on the test set

In [58]:
y_predxgb = xgb_regressor.predict(X_test_scaled)

In [59]:
y_predxgb

array([10331.591, 10265.318, 15647.186, ..., 10115.188,  6169.595,
       25432.598], dtype=float32)

In [60]:
y_test

14668    10000
24614    10000
11096    16000
10424    10000
26007    13000
         ...  
31330     9925
2862      3000
14754    10000
14170     6000
24385    25000
Name: loan_amnt, Length: 6517, dtype: int64

Evaluate the model

In [61]:
mse = mean_squared_error(y_test, y_predxgb)
print(f'The Mean Squared Error of XRegressor Model is {mse}')
#The Mean Squared Error of the XRegressor Model is 164505.5387713405

The Mean Squared Error of XRegressor Model is 168146.45510336247




Another Evaluation

In [62]:
rmse = mse ** 0.5
print(f'The Root Mean Squared Error of the XRegressor => {rmse}')
#The Root Mean Squared Error of the XRegressor => 405.5928238656849

The Root Mean Squared Error of the XRegressor => 410.0566486515765


Save the XGBoost regressor model

In [67]:

joblib.dump(xgb_regressor, 'C:/Users/PC/Documents/sola/others/Telegram Desktop/xgb_regressor_model.pkl')

['C:/Users/PC/Documents/sola/others/Telegram Desktop/xgb_regressor_model.pkl']

# - - - End of XGBoost - - -