In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data Validation Initial Inspection

In [2]:
credit_risk_dataset = pd.read_csv(
    filepath_or_buffer = "../data/raw/credit_risk_dataset.csv"
    )

In [3]:
credit_risk_dataset.head(5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [4]:
credit_risk_dataset.tail(5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.1,N,26
32580,66,42000,RENT,2.0,MEDICAL,B,6475,9.99,0,0.15,N,30


In [5]:
credit_risk_dataset.shape

(32581, 12)

In [6]:
credit_risk_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [7]:
credit_risk_dataset.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [8]:
credit_risk_dataset.index

RangeIndex(start=0, stop=32581, step=1)

In [9]:
credit_risk_dataset.axes

[RangeIndex(start=0, stop=32581, step=1),
 Index(['person_age', 'person_income', 'person_home_ownership',
        'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
        'loan_int_rate', 'loan_status', 'loan_percent_income',
        'cb_person_default_on_file', 'cb_person_cred_hist_length'],
       dtype='object')]

In [10]:
credit_risk_dataset.size

390972

In [11]:
target = credit_risk_dataset.loan_status

In [12]:
credit_risk_dataset.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [13]:
credit_risk_dataset.describe(include='O')

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
count,32581,32581,32581,32581
unique,4,6,7,2
top,RENT,EDUCATION,A,N
freq,16446,6453,10777,26836


#### note: Majority of people are around 27 years old who rent their living spaces and are getting high interest rates for education - most of them did not default on their loans, so there is an imbalance problem.

### Data Quality Assessment

#### Removing Impossible Values

In [14]:
credit_risk_dataset.person_emp_length.sort_values(ascending=False)

210      123.0
0        123.0
32355     41.0
32515     38.0
32428     34.0
         ...  
32285      NaN
32328      NaN
32360      NaN
32453      NaN
32471      NaN
Name: person_emp_length, Length: 32581, dtype: float64

In [15]:
credit_risk_dataset[credit_risk_dataset.person_emp_length > 41.0]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
210,21,192000,MORTGAGE,123.0,VENTURE,A,20000,6.54,0,0.1,N,4


In [16]:
print("size of dataset before filtering for impossible values from persons employment length:", credit_risk_dataset.shape)

size of dataset before filtering for impossible values from persons employment length: (32581, 12)


In [17]:
credit_risk_dataset = credit_risk_dataset[~(credit_risk_dataset.person_emp_length > 41.0)]

In [18]:
print("size of dataset after filtering impossible values from persons employment length:", credit_risk_dataset.shape)

size of dataset after filtering impossible values from persons employment length: (32579, 12)


In [None]:
credit_risk_dataset.person_age.sort_values(ascending=False)

32297    144
81       144
183      144
575      123
747      123
        ... 
9926      20
7748      20
7327      20
5648      20
3851      20
Name: person_age, Length: 32579, dtype: int64

In [20]:
credit_risk_dataset[credit_risk_dataset.person_age > 94]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
81,144,250000,RENT,4.0,VENTURE,C,4800,13.57,0,0.02,N,3
183,144,200000,MORTGAGE,4.0,EDUCATION,B,6000,11.86,0,0.03,N,2
575,123,80004,RENT,2.0,EDUCATION,B,20400,10.25,0,0.25,N,3
747,123,78000,RENT,7.0,VENTURE,B,20000,,0,0.26,N,4
32297,144,6000000,MORTGAGE,12.0,PERSONAL,C,5000,12.73,0,0.0,N,25


In [21]:
print("Shape of dataset before filtering out impossible age values", credit_risk_dataset.shape)

Shape of dataset before filtering out impossible age values (32579, 12)


In [22]:
credit_risk_dataset = credit_risk_dataset[~(credit_risk_dataset.person_age > 94)]

In [23]:
print("Shape of dataset after filtering out impossible age values", credit_risk_dataset.shape)

Shape of dataset after filtering out impossible age values (32574, 12)


In [100]:
credit_risk_dataset[credit_risk_dataset.person_emp_length > credit_risk_dataset.person_age]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length


#### Converting from Features data types from Object to Categorical (If needed)

In [120]:
credit_risk_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32574 entries, 1 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   person_age                  32574 non-null  int64   
 1   person_income               32574 non-null  int64   
 2   person_home_ownership       32574 non-null  category
 3   person_emp_length           31679 non-null  float64 
 4   loan_intent                 32574 non-null  category
 5   loan_grade                  32574 non-null  category
 6   loan_amnt                   32574 non-null  int64   
 7   loan_int_rate               29459 non-null  float64 
 8   loan_status                 32574 non-null  int64   
 9   loan_percent_income         32574 non-null  float64 
 10  cb_person_default_on_file   32574 non-null  object  
 11  cb_person_cred_hist_length  32574 non-null  int64   
dtypes: category(3), float64(3), int64(5), object(1)
memory usage: 2.6+ MB


In [79]:
credit_risk_dataset.person_home_ownership.value_counts()

person_home_ownership
RENT        16442
MORTGAGE    13441
OWN          2584
OTHER         107
Name: count, dtype: int64

In [84]:
credit_risk_dataset = credit_risk_dataset.astype(
    {"person_home_ownership": "category"}
    )

In [88]:
credit_risk_dataset.loan_intent.value_counts()

loan_intent
EDUCATION            6451
MEDICAL              6071
VENTURE              5716
PERSONAL             5519
DEBTCONSOLIDATION    5212
HOMEIMPROVEMENT      3605
Name: count, dtype: int64

In [89]:
credit_risk_dataset = credit_risk_dataset.astype(
    {"loan_intent": "category"}
    )

In [91]:
credit_risk_dataset.loan_grade.value_counts()

loan_grade
A    10776
B    10448
C     6456
D     3625
E      964
F      241
G       64
Name: count, dtype: int64

In [116]:
list2d = [[1,2,3,4,5], [6,7,8,9,10]]
list2d[1][2]

8

In [92]:
credit_risk_dataset = credit_risk_dataset.astype(
    {"loan_grade" : "category"}
)

In [27]:
categorical_columns = credit_risk_dataset.select_dtypes(include=['object'])

In [97]:
credit_risk_dataset.cb_person_default_on_file.value_counts()

cb_person_default_on_file
N    26830
Y     5744
Name: count, dtype: int64

In [128]:
credit_risk_dataset.cb_person_default_on_file.replace(
    {'Y': 1, 'N': 0},
    inplace=True
)

  credit_risk_dataset.cb_person_default_on_file.replace(


In [134]:
credit_risk_dataset = credit_risk_dataset.astype(
    {"cb_person_default_on_file": "bool"}
)

In [135]:
credit_risk_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32574 entries, 1 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   person_age                  32574 non-null  int64   
 1   person_income               32574 non-null  int64   
 2   person_home_ownership       32574 non-null  category
 3   person_emp_length           31679 non-null  float64 
 4   loan_intent                 32574 non-null  category
 5   loan_grade                  32574 non-null  category
 6   loan_amnt                   32574 non-null  int64   
 7   loan_int_rate               29459 non-null  float64 
 8   loan_status                 32574 non-null  int64   
 9   loan_percent_income         32574 non-null  float64 
 10  cb_person_default_on_file   32574 non-null  bool    
 11  cb_person_cred_hist_length  32574 non-null  int64   
dtypes: bool(1), category(3), float64(3), int64(5)
memory usage: 2.4 MB


In [111]:
result = lambda x: 1 if x == 'Y' else 0
result(credit_risk_dataset.cb_person_cred_hist_length)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [108]:
map(lambda x: return 1 if x == 'Y' else 0, credit_risk_dataset.cb_person_cred_hist_length)

SyntaxError: invalid syntax (2461999777.py, line 1)

In [37]:
for column in categorical_columns.columns:
    print(credit_risk_dataset[column].value_counts())

person_home_ownership
RENT        16442
MORTGAGE    13441
OWN          2584
OTHER         107
Name: count, dtype: int64
loan_intent
EDUCATION            6451
MEDICAL              6071
VENTURE              5716
PERSONAL             5519
DEBTCONSOLIDATION    5212
HOMEIMPROVEMENT      3605
Name: count, dtype: int64
loan_grade
A    10776
B    10448
C     6456
D     3625
E      964
F      241
G       64
Name: count, dtype: int64
cb_person_default_on_file
N    26830
Y     5744
Name: count, dtype: int64


In [38]:
credit_risk_dataset.loan_status.value_counts()

loan_status
0    25467
1     7107
Name: count, dtype: int64

In [None]:
credit_risk_dataset.person_home_ownership.value_counts()

#### Missing values

In [26]:
credit_risk_dataset.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3115
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

#### Duplicated Values

In [45]:
credit_risk_dataset.duplicated()

1        False
2        False
3        False
4        False
5        False
         ...  
32576    False
32577    False
32578    False
32579    False
32580    False
Length: 32574, dtype: bool

In [43]:
credit_risk_dataset[credit_risk_dataset.duplicated()]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
15975,23,42000,RENT,5.0,VENTURE,B,6000,9.99,0,0.14,N,4
15989,23,90000,MORTGAGE,7.0,EDUCATION,B,8000,10.36,0,0.09,N,3
15995,24,48000,MORTGAGE,4.0,MEDICAL,A,4000,5.42,0,0.08,N,4
16025,24,10000,RENT,8.0,PERSONAL,A,3000,7.90,1,0.30,N,3
16028,23,100000,MORTGAGE,7.0,EDUCATION,A,15000,7.88,0,0.15,N,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32010,42,39996,MORTGAGE,2.0,HOMEIMPROVEMENT,A,2500,5.42,0,0.06,N,12
32047,36,250000,RENT,2.0,DEBTCONSOLIDATION,A,20000,7.88,0,0.08,N,17
32172,49,120000,MORTGAGE,12.0,MEDICAL,B,12000,10.99,0,0.10,N,12
32259,39,40000,OWN,4.0,VENTURE,B,1000,10.37,0,0.03,N,16


#### Outlier Visualization

In [49]:
numeric_features = credit_risk_dataset.select_dtypes(include=["int64", "float64"]).columns

In [None]:
for column in numeric_features:
    plt.figure(figsize=(8, 6))
    plt.boxplot(credit_risk_dataset[column])
    plt.title(f'Histogram of {column}', fontsize=16)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

### Univariate Analysis

In [None]:
for numeric_feature in numeric_features:
    plt.figure(figsize=(10,6))
    plt.hist(x=numeric_feature, data=credit_risk_dataset)
    plt.title(f"Histogram of {numeric_feature}")
    plt.xlabel(numeric_feature)
    plt.ylabel("Frequency")
    plt.grid(visible=True)