In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Apply seaborn's white-grid theme to every plot in this notebook.
# set_theme is the preferred interface; sns.set is only a legacy alias for it.
sns.set_theme(style='whitegrid')

In [5]:
# Input file, relative to the notebook's working directory.
# NOTE(review): the name has no extension; it is parsed as CSV below.
data_path = '../data/processed/eda_credit_risk_data'

# Working frame that the following cells clean, encode and split.
df = pd.read_csv(filepath_or_buffer=data_path)

In [8]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,age,year_income,home_status,work_experience,credit_purpose,credit_grade,credit_amount,percentage_rate,credit_status,credit_percent_income,cb_person_default,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [15]:
# Column dtypes and non-null counts.
# NOTE(review): this cell's execution count (15) is higher than those of the two
# median-fill cells that follow (10 and 14), so the saved output was presumably
# produced after the fills had already run. A top-to-bottom run may show
# different non-null counts here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         32581 non-null  int64  
 1   year_income                 32581 non-null  int64  
 2   home_status                 32581 non-null  object 
 3   work_experience             32581 non-null  float64
 4   credit_purpose              32581 non-null  object 
 5   credit_grade                32581 non-null  object 
 6   credit_amount               32581 non-null  int64  
 7   percentage_rate             32581 non-null  float64
 8   credit_status               32581 non-null  int64  
 9   credit_percent_income       32581 non-null  float64
 10  cb_person_default           32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [10]:
# Impute missing work_experience values with the column median.
# NOTE(review): the median is taken over the whole dataset, before the
# train/test split performed later in this notebook.
median_work_experience = df.loc[:, 'work_experience'].median()

df['work_experience'] = df.loc[:, 'work_experience'].fillna(value=median_work_experience)

In [14]:
# Impute missing percentage_rate values with the column median.
# NOTE(review): the median is taken over the whole dataset, before the
# train/test split performed later in this notebook.
median_percentage_rate = df.loc[:, 'percentage_rate'].median()

df['percentage_rate'] = df.loc[:, 'percentage_rate'].fillna(value=median_percentage_rate)

In [16]:
# Summary statistics of the numeric columns, used to spot implausible extremes
# (the saved output shows a maximum age of 144 and a maximum work_experience of 123).
df.describe()

Unnamed: 0,age,year_income,work_experience,credit_amount,percentage_rate,credit_status,credit_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.767994,9589.371106,11.00962,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.087372,6322.086646,3.081611,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,8.49,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.11,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [23]:
# Keep only rows with age of at most 100; larger values are treated as bad records.
# The original index labels are kept (no reset_index).
df = df.loc[df['age'] <= 100]

In [25]:
# Keep only rows with work_experience of at most 100; larger values are treated
# as bad records. The original index labels are kept (no reset_index).
df = df.loc[df['work_experience'] <= 100]

In [29]:
# Detach the target column from the frame (pop removes it in place and returns it)
# so that it can be re-attached at a different position.
column_to_move = df.pop(item="credit_status")

In [30]:
# Re-attach the popped target as the last column, under the temporary name
# "loan_status". The position is derived from the frame instead of being
# hard-coded as 11, so the cell still appends at the end (rather than raising
# or inserting mid-frame) if the number of feature columns changes.
df.insert(len(df.columns), "loan_status", column_to_move)

In [32]:
# Give the re-inserted target column its original name back.
df = df.rename({'loan_status': 'credit_status'}, axis='columns')

In [33]:
# Check the new column order: credit_status should now be the last column.
df.head()

Unnamed: 0,age,year_income,home_status,work_experience,credit_purpose,credit_grade,credit_amount,percentage_rate,credit_percent_income,cb_person_default,cb_person_cred_hist_length,credit_status
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.1,N,2,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3,1
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2,1
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4,1
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,0.25,N,2,1


In [39]:
# List the distinct grade labels present, to know which ones the encoding must cover.
df.loc[:, 'credit_grade'].unique()

array(['B', 'C', 'A', 'D', 'E', 'F', 'G'], dtype=object)

In [42]:
# One-hot encode the two nominal columns. drop_first=True omits the first level
# of each column, leaving it as the implicit baseline; dtype=int yields 0/1
# integers instead of booleans.
df_encoded = pd.get_dummies(df, columns=['home_status', 'credit_purpose'], drop_first=True, dtype=int)

# Ordinal encoding of the grade letters: G maps to the lowest code (0), A to the highest (6).
grade_map = {'G':0, 'F':1, 'E':2, 'D':3, 'C':4, 'B':5, 'A':6}
df_encoded['credit_grade'] = df_encoded['credit_grade'].map(grade_map)

# Binary encoding of the default flag.
default_map = {'N':0, 'Y':1}
df_encoded['cb_person_default'] = df_encoded['cb_person_default'].map(default_map)

# Series.map silently turns any label that is missing from the dict into NaN.
# Fail loudly here instead of letting NaNs leak into the saved feature files.
assert df_encoded[['credit_grade', 'cb_person_default']].notna().all().all(), (
    "credit_grade / cb_person_default contain values not covered by grade_map / default_map"
)

In [43]:
# Preview the encoded frame: dummy columns added, credit_grade and cb_person_default now numeric.
df_encoded.head()

Unnamed: 0,age,year_income,work_experience,credit_grade,credit_amount,percentage_rate,credit_percent_income,cb_person_default,cb_person_cred_hist_length,credit_status,home_status_OTHER,home_status_OWN,home_status_RENT,credit_purpose_EDUCATION,credit_purpose_HOMEIMPROVEMENT,credit_purpose_MEDICAL,credit_purpose_PERSONAL,credit_purpose_VENTURE
1,21,9600,5.0,5,1000,11.14,0.1,0,2,0,0,1,0,1,0,0,0,0
2,25,9600,1.0,4,5500,12.87,0.57,0,3,1,0,0,0,0,0,1,0,0
3,23,65500,4.0,4,35000,15.23,0.53,0,2,1,0,0,1,0,0,1,0,0
4,24,54400,8.0,4,35000,14.27,0.55,1,4,1,0,0,1,0,0,1,0,0
5,21,9900,2.0,6,2500,7.14,0.25,0,2,1,0,1,0,0,0,0,0,1


In [46]:
# Derived feature: credit-history length divided by the number of years past age 18.
# NOTE(review): presumably 18 is taken as the earliest age at which a credit
# history can start - confirm. A row with age == 18 would divide by zero.
df_encoded['credit_history_ratio'] = (
    df_encoded['cb_person_cred_hist_length']
    / (df_encoded['age'] - 18)
)

In [50]:
# Preview the frame with the new credit_history_ratio column.
# NOTE(review): the saved output has no credit_amount column, although the cell
# that drops it comes after this one. Execution counts (this cell: 50, drop cell: 49)
# indicate the drop was run first; on a top-to-bottom run this preview would
# still show credit_amount.
df_encoded.head()

Unnamed: 0,age,year_income,work_experience,credit_grade,percentage_rate,credit_percent_income,cb_person_default,cb_person_cred_hist_length,credit_status,home_status_OTHER,home_status_OWN,home_status_RENT,credit_purpose_EDUCATION,credit_purpose_HOMEIMPROVEMENT,credit_purpose_MEDICAL,credit_purpose_PERSONAL,credit_purpose_VENTURE,credit_history_ratio
1,21,9600,5.0,5,11.14,0.1,0,2,0,0,1,0,1,0,0,0,0,0.666667
2,25,9600,1.0,4,12.87,0.57,0,3,1,0,0,0,0,0,1,0,0,0.428571
3,23,65500,4.0,4,15.23,0.53,0,2,1,0,0,1,0,0,1,0,0,0.4
4,24,54400,8.0,4,14.27,0.55,1,4,1,0,0,1,0,0,1,0,0,0.666667
5,21,9900,2.0,6,7.14,0.25,0,2,1,0,1,0,0,0,0,0,1,0.666667


In [49]:
# Remove credit_amount from the modelling frame.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
df_encoded = df_encoded.drop(columns=['credit_amount'], errors='ignore')

In [52]:
# Feature matrix: every column except the target.
X = df_encoded.drop('credit_status', axis=1)

In [53]:
# Target vector: the credit_status column.
y = df_encoded.loc[:, 'credit_status']

In [56]:
# Hold out 30% of the rows for testing. stratify=y asks for the same class
# proportions in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [57]:
# (rows, columns) of the training features.
X_train.shape

(22801, 17)

In [58]:
# (rows, columns) of the held-out test features.
X_test.shape

(9773, 17)

In [60]:
# Mean of the target over all rows. Assuming credit_status only takes the
# values 0 and 1, this is the share of rows with credit_status == 1.
y.mean()

np.float64(0.21818014367286792)

In [61]:
# Mean of the target in the training part; should be close to y.mean() because the split is stratified.
y_train.mean()

np.float64(0.21819218455330908)

In [62]:
# Mean of the target in the test part; should be close to y.mean() because the split is stratified.
y_test.mean()

np.float64(0.21815205157065384)

In [64]:
# Destination folder for the split files (the trailing slash is required by the f-strings below).
output_dir = '../data/processed/'

# Write each part to its own CSV. index=False leaves the row labels out of the
# files, so features and targets line up by row position only.
for split, destination in (
    (X_train, f'{output_dir}X_train.csv'),
    (X_test, f'{output_dir}X_test.csv'),
    (y_train, f'{output_dir}y_train.csv'),
    (y_test, f'{output_dir}y_test.csv'),
):
    split.to_csv(destination, index=False)