In [115]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle

In [116]:
base_credit = pd.read_csv('../../database/raw_data/credit_risk_dataset.csv')

In [117]:
base_credit

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


Fixing inconsistent data.

In [118]:
base_credit = base_credit.drop(base_credit[base_credit["person_age"] >= 100].index)

In [119]:
base_credit = base_credit.drop(base_credit[base_credit["person_emp_length"] > 100].index)

Fixing missing data.

In [120]:
base_credit.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3115
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [121]:
base_credit.loc[pd.isnull(base_credit["person_emp_length"])]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
105,22,12600,MORTGAGE,,PERSONAL,A,2000,5.42,1,0.16,N,4
222,24,185000,MORTGAGE,,EDUCATION,B,35000,12.42,0,0.19,N,2
379,24,16800,MORTGAGE,,DEBTCONSOLIDATION,A,3900,,1,0.23,N,3
407,25,52000,RENT,,PERSONAL,B,24000,10.74,1,0.46,N,2
408,22,17352,MORTGAGE,,EDUCATION,C,2250,15.27,0,0.13,Y,3
...,...,...,...,...,...,...,...,...,...,...,...,...
32285,38,12000,OWN,,EDUCATION,A,4800,7.29,1,0.40,N,12
32328,51,18408,RENT,,PERSONAL,C,1000,14.65,1,0.05,Y,20
32360,70,39996,RENT,,MEDICAL,C,3600,15.23,0,0.09,Y,19
32453,56,32400,RENT,,MEDICAL,A,8575,7.51,0,0.26,N,18


In [122]:
base_credit["person_emp_length"].fillna(0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  base_credit["person_emp_length"].fillna(0,inplace=True)


In [123]:
base_credit.loc[pd.isnull(base_credit["loan_int_rate"])]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
39,23,71500,RENT,3.0,DEBTCONSOLIDATION,D,30000,,1,0.42,N,4
50,24,78000,RENT,4.0,DEBTCONSOLIDATION,D,30000,,1,0.38,Y,4
57,23,277000,OWN,3.0,PERSONAL,A,35000,,0,0.13,N,4
59,24,12000,OWN,2.0,VENTURE,E,1750,,0,0.15,Y,3
62,26,263000,MORTGAGE,0.0,EDUCATION,B,10000,,1,0.04,N,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32547,53,4888,OWN,0.0,VENTURE,C,1400,,1,0.29,Y,28
32552,65,45900,RENT,2.0,EDUCATION,C,10000,,0,0.22,Y,19
32553,54,20000,RENT,2.0,MEDICAL,C,5000,,0,0.25,N,28
32569,51,60000,MORTGAGE,1.0,PERSONAL,A,7500,,0,0.13,N,23


In [124]:
base_credit["loan_int_rate"].mean()

np.float64(11.01152856512441)

In [125]:
base_credit["loan_int_rate"].fillna(base_credit["loan_int_rate"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  base_credit["loan_int_rate"].fillna(base_credit["loan_int_rate"].mean(), inplace=True)


Analyzing the relationship between features and the target variable.

In [126]:
pd.crosstab(base_credit['loan_grade'], base_credit['loan_status'])

loan_status,0,1
loan_grade,Unnamed: 1_level_1,Unnamed: 2_level_1
A,9703,1073
B,8747,1701
C,5117,1339
D,1485,2140
E,343,621
F,71,170
G,1,63


Applying One Hot Encoding to all categorical variables.

In [127]:
base_credit = pd.get_dummies(base_credit)

In [128]:
base_credit

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
1,21,9600,5.0,1000,11.14,0,0.10,2,False,False,...,False,False,True,False,False,False,False,False,True,False
2,25,9600,1.0,5500,12.87,1,0.57,3,True,False,...,False,False,False,True,False,False,False,False,True,False
3,23,65500,4.0,35000,15.23,1,0.53,2,False,False,...,False,False,False,True,False,False,False,False,True,False
4,24,54400,8.0,35000,14.27,1,0.55,4,False,False,...,False,False,False,True,False,False,False,False,False,True
5,21,9900,2.0,2500,7.14,1,0.25,2,False,False,...,True,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,1.0,5800,13.16,0,0.11,30,True,False,...,False,False,False,True,False,False,False,False,True,False
32577,54,120000,4.0,17625,7.49,0,0.15,19,True,False,...,False,True,False,False,False,False,False,False,True,False
32578,65,76000,3.0,35000,10.99,1,0.46,28,False,False,...,False,False,True,False,False,False,False,False,True,False
32579,56,150000,5.0,15000,11.48,0,0.10,26,True,False,...,False,False,True,False,False,False,False,False,True,False


In [129]:
base_credit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32574 entries, 1 to 32580
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      32574 non-null  int64  
 1   person_income                   32574 non-null  int64  
 2   person_emp_length               32574 non-null  float64
 3   loan_amnt                       32574 non-null  int64  
 4   loan_int_rate                   32574 non-null  float64
 5   loan_status                     32574 non-null  int64  
 6   loan_percent_income             32574 non-null  float64
 7   cb_person_cred_hist_length      32574 non-null  int64  
 8   person_home_ownership_MORTGAGE  32574 non-null  bool   
 9   person_home_ownership_OTHER     32574 non-null  bool   
 10  person_home_ownership_OWN       32574 non-null  bool   
 11  person_home_ownership_RENT      32574 non-null  bool   
 12  loan_intent_DEBTCONSOLIDATION   32574

Splitting data between feature variables and target variables

In [130]:
x_credit = base_credit.iloc[:, list(range(0,5)) + list(range(6,27))].values

In [131]:
scaler_credit = StandardScaler()
x_credit = scaler_credit.fit_transform(x_credit)

In [132]:
y_credit = base_credit.iloc[:,5].values

In [133]:
x_credit_training, x_credit_test, y_credit_training, y_credit_test = train_test_split(x_credit, y_credit, test_size = 0.25, random_state = 0)

In [134]:
with open('../../database/processed_data/credit.pkl', mode = 'wb') as f:
    pickle.dump([x_credit_training,y_credit_training, x_credit_test, y_credit_test], f)