In [18]:
import numpy as np
import pandas as pd

pd.__version__

'2.1.1'

In [2]:
df = pd.read_csv("dataset/credit_risk_dataset.csv")

In [3]:
df.shape

(32581, 12)

In [4]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [6]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


## Preprocessing data

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [8]:
df['person_emp_length'].fillna(df['person_emp_length'].mean(), inplace=True)
df['loan_int_rate'].fillna(df['loan_int_rate'].mean(), inplace=True)

categorical_cols = ["person_home_ownership", "loan_intent", "load_grade", "cb_person_default_on_file"]
df_encoded = pd.get_dummies(df, categorical_cols, drop_first=True)
df_encoded

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,load_grade_B,load_grade_C,load_grade_D,load_grade_E,load_grade_F,load_grade_G,cb_person_default_on_file_Y
0,22,59000,123.0,35000,16.02,1,0.59,3,False,False,...,False,True,False,False,False,True,False,False,False,True
1,21,9600,5.0,1000,11.14,0,0.10,2,False,True,...,False,False,False,True,False,False,False,False,False,False
2,25,9600,1.0,5500,12.87,1,0.57,3,False,False,...,True,False,False,False,True,False,False,False,False,False
3,23,65500,4.0,35000,15.23,1,0.53,2,False,False,...,True,False,False,False,True,False,False,False,False,False
4,24,54400,8.0,35000,14.27,1,0.55,4,False,False,...,True,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,False,False,...,False,True,False,False,True,False,False,False,False,False
32577,54,120000,4.0,17625,7.49,0,0.15,19,False,False,...,False,True,False,False,False,False,False,False,False,False
32578,65,76000,3.0,35000,10.99,1,0.46,28,False,False,...,False,False,False,True,False,False,False,False,False,False
32579,56,150000,5.0,15000,11.48,0,0.10,26,False,False,...,False,True,False,True,False,False,False,False,False,False


In [10]:
X = df_encoded.drop("loan_status", axis=1)
Y = df_encoded["loan_status"]


In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-0.90337383, -0.11414329, 28.93573761, ..., -0.08632538,
        -0.04436441,  2.16129414],
       [-1.06090423, -0.91114671,  0.05148097, ..., -0.08632538,
        -0.04436441, -0.46268575],
       [-0.43078263, -0.91114671, -0.92764637, ..., -0.08632538,
        -0.04436441, -0.46268575],
       ...,
       [ 5.87043346,  0.16012914, -0.4380827 , ..., -0.08632538,
        -0.04436441, -0.46268575],
       [ 4.45265984,  1.35402091,  0.05148097, ..., -0.08632538,
        -0.04436441, -0.46268575],
       [ 6.02796387, -0.38841572, -0.68286453, ..., -0.08632538,
        -0.04436441, -0.46268575]])

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=55)
len(X_train)

26064

In [13]:
model = RandomForestClassifier(random_state=55)

model.fit(X_train, Y_train)

In [14]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)

In [15]:
res = classification_report(Y_test, y_pred)
print(res)

              precision    recall  f1-score   support

           0       0.93      0.99      0.96      5103
           1       0.96      0.71      0.82      1414

    accuracy                           0.93      6517
   macro avg       0.94      0.85      0.89      6517
weighted avg       0.93      0.93      0.93      6517



### Export the model

In [16]:
import joblib

joblib.dump(model, 'models/model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')

['models/scaler.pkl']

In [41]:
import pickle

pickle.dump(model, open("models/credit_risk_predictor.pkl", "wb"))
pickle.dump(model, open("models/scaler.pkl", "wb"))