In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pickle

In [2]:
base_credit = pd.read_csv('../../database/raw_data/credit_risk_dataset.csv')

In [3]:
base_credit

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


Fixing inconsistent data.

In [4]:
base_credit = base_credit.drop(base_credit[base_credit["person_age"] >= 100].index)

In [5]:
base_credit = base_credit.drop(base_credit[base_credit["person_emp_length"] > 100].index)

Fixing missing data.

In [6]:
base_credit.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3115
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [7]:
base_credit.loc[pd.isnull(base_credit["person_emp_length"])]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
105,22,12600,MORTGAGE,,PERSONAL,A,2000,5.42,1,0.16,N,4
222,24,185000,MORTGAGE,,EDUCATION,B,35000,12.42,0,0.19,N,2
379,24,16800,MORTGAGE,,DEBTCONSOLIDATION,A,3900,,1,0.23,N,3
407,25,52000,RENT,,PERSONAL,B,24000,10.74,1,0.46,N,2
408,22,17352,MORTGAGE,,EDUCATION,C,2250,15.27,0,0.13,Y,3
...,...,...,...,...,...,...,...,...,...,...,...,...
32285,38,12000,OWN,,EDUCATION,A,4800,7.29,1,0.40,N,12
32328,51,18408,RENT,,PERSONAL,C,1000,14.65,1,0.05,Y,20
32360,70,39996,RENT,,MEDICAL,C,3600,15.23,0,0.09,Y,19
32453,56,32400,RENT,,MEDICAL,A,8575,7.51,0,0.26,N,18


In [8]:
base_credit["person_emp_length"].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  base_credit["person_emp_length"].fillna(0,inplace=True)


In [9]:
base_credit.loc[pd.isnull(base_credit["loan_int_rate"])]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
39,23,71500,RENT,3.0,DEBTCONSOLIDATION,D,30000,,1,0.42,N,4
50,24,78000,RENT,4.0,DEBTCONSOLIDATION,D,30000,,1,0.38,Y,4
57,23,277000,OWN,3.0,PERSONAL,A,35000,,0,0.13,N,4
59,24,12000,OWN,2.0,VENTURE,E,1750,,0,0.15,Y,3
62,26,263000,MORTGAGE,0.0,EDUCATION,B,10000,,1,0.04,N,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32547,53,4888,OWN,0.0,VENTURE,C,1400,,1,0.29,Y,28
32552,65,45900,RENT,2.0,EDUCATION,C,10000,,0,0.22,Y,19
32553,54,20000,RENT,2.0,MEDICAL,C,5000,,0,0.25,N,28
32569,51,60000,MORTGAGE,1.0,PERSONAL,A,7500,,0,0.13,N,23


In [10]:
base_credit["loan_int_rate"].mean()

np.float64(11.01152856512441)

In [11]:
base_credit["loan_int_rate"].fillna(base_credit["loan_int_rate"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  base_credit["loan_int_rate"].fillna(base_credit["loan_int_rate"].mean(), inplace=True)


Analyzing the relationship between features and the target variable.

In [12]:
pd.crosstab(base_credit['loan_grade'], base_credit['loan_status'])

loan_status,0,1
loan_grade,Unnamed: 1_level_1,Unnamed: 2_level_1
A,9703,1073
B,8747,1701
C,5117,1339
D,1485,2140
E,343,621
F,71,170
G,1,63


Applying Target Encoding to all categorical variables.

In [13]:
# Calculate mean target per category
mean_encoding = base_credit.groupby("person_home_ownership")["loan_status"].mean()

# Map means back to original data
base_credit["person_home_encoded"] = base_credit["person_home_ownership"].map(mean_encoding)

# Optional: Smoothing (blend with global mean)
global_mean = base_credit["loan_status"].mean()
alpha = 10  # Higher = more smoothing
base_credit["person_home_encoded"] = (
    base_credit["person_home_encoded"] * alpha + global_mean
) / (alpha + 1)

In [14]:
# Calculate mean target per category
mean_encoding = base_credit.groupby("loan_intent")["loan_status"].mean()

# Map means back to original data
base_credit["loan_intent_encoded"] = base_credit["loan_intent"].map(mean_encoding)

# Optional: Smoothing (blend with global mean)
global_mean = base_credit["loan_status"].mean()
alpha = 10  # Higher = more smoothing
base_credit["loan_intent_encoded"] = (
    base_credit["loan_intent_encoded"] * alpha + global_mean
) / (alpha + 1)

In [15]:
# Calculate mean target per category
mean_encoding = base_credit.groupby("loan_grade")["loan_status"].mean()

# Map means back to original data
base_credit["loan_grade_encoded"] = base_credit["loan_grade"].map(mean_encoding)

# Optional: Smoothing (blend with global mean)
global_mean = base_credit["loan_status"].mean()
alpha = 10  # Higher = more smoothing
base_credit["loan_grade_encoded"] = (
    base_credit["loan_grade_encoded"] * alpha + global_mean
) / (alpha + 1)

In [16]:
base_credit

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_encoded,loan_intent_encoded,loan_grade_encoded
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2,0.087735,0.176399,0.167840
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,0.134139,0.262568,0.208384
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,0.306849,0.262568,0.208384
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,0.306849,0.262568,0.208384
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2,0.087735,0.154544,0.110356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30,0.134139,0.200533,0.208384
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19,0.134139,0.200533,0.110356
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28,0.306849,0.257131,0.167840
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26,0.134139,0.200533,0.167840


In [17]:
base_credit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32574 entries, 1 to 32580
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32574 non-null  int64  
 1   person_income               32574 non-null  int64  
 2   person_home_ownership       32574 non-null  object 
 3   person_emp_length           32574 non-null  float64
 4   loan_intent                 32574 non-null  object 
 5   loan_grade                  32574 non-null  object 
 6   loan_amnt                   32574 non-null  int64  
 7   loan_int_rate               32574 non-null  float64
 8   loan_status                 32574 non-null  int64  
 9   loan_percent_income         32574 non-null  float64
 10  cb_person_default_on_file   32574 non-null  object 
 11  cb_person_cred_hist_length  32574 non-null  int64  
 12  person_home_encoded         32574 non-null  float64
 13  loan_intent_encoded         32574 no

In [18]:
base_credit = base_credit.drop(columns=["person_home_ownership","loan_intent","loan_grade","cb_person_default_on_file"])

Splitting data between feature variables and target variables

In [19]:
x_credit = base_credit.iloc[:, list(range(0,5)) + list(range(6,11))].values

In [20]:
y_credit = base_credit.iloc[:,5].values

Applying Standard Scaler to feature variables

In [21]:
scaler_credit = StandardScaler()
x_credit = scaler_credit.fit_transform(x_credit)

In [22]:
x_credit_training, x_credit_test, y_credit_training, y_credit_test = train_test_split(x_credit, y_credit, test_size = 0.25, random_state = 0)

In [23]:
with open('../../database/processed_data/credit.pkl', mode = 'wb') as f:
    pickle.dump([x_credit_training,y_credit_training, x_credit_test, y_credit_test], f)