# 1. Import the libraries

In [1]:
# Warnings
import warnings

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore the warnings
warnings.filterwarnings('ignore')



# 2. Load the dataset

In [2]:
# Read the Excel workbook into a DataFrame
df = pd.read_excel(io='jca_datasets.xlsx')

In [3]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
0,Bala,Suwarno,1208184902990002,Business,543,33840473,Retired,Widowed,5,Vice Mayor,...,1996-02-27,27,"Gg. Kebonjati No. 9, RUMFAKAR, BAKAUHENI, CILA...",5,13,24784,no,high,medium,37
1,Kawaca,Lailasari,3504174902520005,Business,469,22519044,Less than a year,Divorced,2,Private employee,...,1967-01-05,56,"Jalan Suniaraja No. 1, GAYA BARU ENAM, TUKDANA...",13,9,95371,no,high,medium,53
2,Cayadi,Usamah,3326155809710008,Personal,339,139646738,Unemployed,Single,0,Architect,...,1960-02-01,63,"Gg. Gardujati No. 16, DUDAKAWU, CILAWU, ACEH B...",5,2,24653,no,high,medium,36
3,Rahman,Pradipta,3309066801600001,Business,464,0,Retired,Divorced,5,Member of the Constitutional Court,...,1951-12-28,71,"Gang Laswi No. 47, MEJASEM, PAGEDANGAN, PESAWA...",12,12,35142,no,high,medium,57
4,Kasiran,Palastri,1502045109710008,Business,433,50955547,More than 5 years,Divorced,3,Pilot,...,1983-10-19,39,"Gang Rumah Sakit No. 80, GAPUK TUA, TANJUNG HA...",7,10,21174,no,high,high,93


In [4]:
# Show only the last name, NIK and credit limit columns
df.loc[:, ['last_name', 'NIK', 'credit_limit']]

Unnamed: 0,last_name,NIK,credit_limit
0,Suwarno,1208184902990002,14608894
1,Lailasari,3504174902520005,205969
2,Usamah,3326155809710008,35214516
3,Pradipta,3309066801600001,34907241
4,Palastri,1502045109710008,4196851
...,...,...,...
95,Nurdiyanti,1207204602510009,5139423
96,Simbolon,3511116208960005,22215292
97,Maryati,1472015609520003,631153
98,Mulyani,3529015803700007,40641858


# 3. Missing value

In [5]:
# Count the missing (null) entries in every column.
checkMissingValue = df.isnull().sum()

# Report the result. The per-column counts are printed separately from the
# message: adding a str to an integer Series raises a TypeError, so a
# concatenated message would crash exactly when missing values are present.
if checkMissingValue.any():
    print("There's a missing value")
    # Show only the columns that actually contain missing entries.
    print(checkMissingValue[checkMissingValue > 0])
else:
    print("There's not a missing value")

There's not a missing value


# 4. Duplicate value

In [6]:
# Flag every row that repeats an earlier row across all columns.
checkingDuplicatedValue = df.duplicated()

# Report the result. The duplicated rows are printed separately from the
# message: adding a str to a boolean Series raises a TypeError, so a
# concatenated message would crash exactly when duplicates are present.
if checkingDuplicatedValue.any():
    print("There's a duplicate value")
    # Show the offending rows so they can be inspected.
    print(df[checkingDuplicatedValue])
else:
    print("There's not a duplicate value")

There's not a duplicate value


# 5. Convert the datatype

In [7]:
# Inspect each column's dtype and non-null count before converting any types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 51 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   first_name                          100 non-null    object        
 1   last_name                           100 non-null    object        
 2   NIK                                 100 non-null    int64         
 3   customer_type                       100 non-null    object        
 4   credit_score                        100 non-null    int64         
 5   income                              100 non-null    int64         
 6   employment_status                   100 non-null    object        
 7   marital_status                      100 non-null    object        
 8   number_of_dependent                 100 non-null    int64         
 9   occupation                          100 non-null    object        
 10  job_position               

In [8]:
# Cast NIK and credit_limit to 64-bit integers
df[['NIK', 'credit_limit']] = df[['NIK', 'credit_limit']].astype(np.int64)

In [9]:
# Cast debt_to_income and credit_utilization to 64-bit floats
df[['debt_to_income', 'credit_utilization']] = df[['debt_to_income', 'credit_utilization']].astype(np.float64)

In [10]:
# Re-inspect the dtypes to confirm the astype conversions took effect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 51 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   first_name                          100 non-null    object        
 1   last_name                           100 non-null    object        
 2   NIK                                 100 non-null    int64         
 3   customer_type                       100 non-null    object        
 4   credit_score                        100 non-null    int64         
 5   income                              100 non-null    int64         
 6   employment_status                   100 non-null    object        
 7   marital_status                      100 non-null    object        
 8   number_of_dependent                 100 non-null    int64         
 9   occupation                          100 non-null    object        
 10  job_position               

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): identifier-like columns such as NIK, rt, rw and postal_code are
# included because they are stored as integers; their statistics are presumably
# not meaningful - confirm before interpreting them.
df.describe()

Unnamed: 0,NIK,credit_score,income,number_of_dependent,code_occupation_kbli,debt_to_income,ticket_size,tenor,principal_installment_amount,total_installment_amount,collateral_amount,credit_limit,credit_utilization,outstanding_debts,age,rt,rw,postal_code,risk_score
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,4157594000000000.0,447.55,51114870.0,2.38,37827.29,1.097872,51672300.0,59.82,2835140.0,2522505.0,50506290.0,19489050.0,1.099334,49038980.0,50.4,6.65,7.64,60689.33,49.81
std,2674111000000000.0,78.978756,46791770.0,1.662297,27815.509239,0.597199,27608670.0,46.495005,4963376.0,4930535.0,27445270.0,11976650.0,0.554385,28117760.0,16.150179,3.980698,3.394946,27842.862525,28.82784
min,1106227000000000.0,300.0,0.0,0.0,1230.0,0.0,3900000.0,3.0,50000.0,17777.0,1795430.0,205969.0,0.0,1155992.0,21.0,1.0,2.0,15326.0,0.0
25%,1751688000000000.0,399.25,7400576.0,1.0,11709.25,0.598515,27735000.0,24.0,415134.8,186318.8,30322970.0,7016826.0,0.630182,26038560.0,39.0,3.0,5.0,32745.0,24.75
50%,3327126000000000.0,452.5,38781570.0,2.0,35364.5,1.184615,55300000.0,54.0,1179130.0,758613.5,51504520.0,19820420.0,1.139055,44345900.0,51.5,6.0,8.0,62212.5,49.0
75%,6429137000000000.0,513.75,90334970.0,4.0,52133.75,1.66776,75625000.0,96.0,2462256.0,2016168.0,70161280.0,30869830.0,1.59475,74256040.0,64.0,10.0,10.0,88074.0,75.25
max,9206245000000000.0,575.0,159392400.0,5.0,94110.0,1.9926,99450000.0,180.0,28653330.0,28227800.0,99975310.0,40641860.0,1.99194,99699630.0,75.0,13.0,13.0,99573.0,96.0


# 6. Drop the name column

The `first_name` and `last_name` columns are not meaningfully related to the other columns in the dataset, so they are dropped.

In [12]:
# Remove the two name columns from the working frame
df = df.drop(columns=['first_name', 'last_name'])

# 7. Correlation

In [13]:
# Build a correlation matrix to see which features correlate most with the outcome.
# Only numeric/boolean columns are correlated: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns by default and raises on object columns.
# The matrix is computed and sorted once, then reused for both outputs.
correlation_matrix = (
    df.select_dtypes(include=['number', 'bool'])
    .corr()
    .sort_values('risk_score', ascending=False)
)

# Full matrix, rows ordered by their correlation with risk_score.
display(correlation_matrix)

# Single-column view of the correlations with risk_score, shaded by magnitude.
correlation = correlation_matrix[['risk_score']]
correlation.style.background_gradient(cmap='Blues')

Unnamed: 0,NIK,credit_score,income,number_of_dependent,code_occupation_kbli,debt_to_income,ticket_size,tenor,principal_installment_amount,total_installment_amount,collateral_amount,credit_limit,credit_utilization,outstanding_debts,age,rt,rw,postal_code,risk_score
risk_score,0.018094,0.097237,0.122199,-0.004169,0.05653,-0.021017,-0.01436,0.136438,-0.197793,-0.198525,-0.02013,-0.032913,-0.088734,0.174663,-0.042359,-0.051638,0.048215,0.102709,1.0
outstanding_debts,0.011322,-0.094461,0.092315,0.014693,0.194217,-0.055153,-0.108341,0.142096,-0.051048,-0.045902,-0.184726,0.208645,0.214056,1.0,-0.044786,0.050845,-0.117633,0.129979,0.174663
tenor,0.179756,0.080577,0.170404,0.144786,0.146021,-0.102201,-0.074442,1.0,-0.516167,-0.512734,-0.097468,0.014327,0.116347,0.142096,-0.025045,0.086595,0.034333,-0.030855,0.136438
income,0.154555,0.013715,1.0,0.020918,-0.076787,0.069123,-0.092675,0.170404,-0.068975,-0.064417,-0.072585,0.1584,-0.010323,0.092315,-0.00503,0.011643,0.007734,0.143922,0.122199
postal_code,-0.015576,0.151291,0.143922,0.016702,0.175181,-0.04538,-0.0236,-0.030855,-0.125413,-0.126502,0.096273,-0.042746,0.009614,0.129979,-0.005244,0.058065,-0.097606,1.0,0.102709
credit_score,-0.055003,1.0,0.013715,0.227593,0.098469,-0.099565,-0.024035,0.080577,-0.245035,-0.246624,0.016443,-0.160261,-0.077834,-0.094461,0.025373,0.102981,0.044973,0.151291,0.097237
code_occupation_kbli,-0.126703,0.098469,-0.076787,-0.045926,1.0,-0.112483,-0.04216,0.146021,-0.100442,-0.099174,-0.109189,-0.15255,0.01059,0.194217,-0.003936,0.151971,-0.092578,0.175181,0.05653
rw,0.064045,0.044973,0.007734,0.131878,-0.092578,0.142647,-0.179478,0.034333,0.061401,0.068557,-0.10307,0.076238,-0.139464,-0.117633,0.080581,0.015248,1.0,-0.097606,0.048215
NIK,1.0,-0.055003,0.154555,0.123927,-0.126703,-0.074118,-0.015646,0.179756,-0.019903,-0.01869,-0.050953,-0.10713,0.04699,0.011322,-0.080313,0.085876,0.064045,-0.015576,0.018094
number_of_dependent,0.123927,0.227593,0.020918,1.0,-0.045926,0.015933,0.065864,0.144786,-0.072323,-0.074522,0.131805,-0.044676,-0.133228,0.014693,-0.181052,0.110366,0.131878,0.016702,-0.004169


Unnamed: 0,risk_score
risk_score,1.0
outstanding_debts,0.174663
tenor,0.136438
income,0.122199
postal_code,0.102709
credit_score,0.097237
code_occupation_kbli,0.05653
rw,0.048215
NIK,0.018094
number_of_dependent,-0.004169


- Correlation >= 0.4: Yes
- Correlation < 0.4: No

# 8. Categorical encoding: Label Encoding

In [14]:
# Planning notes only: this cell evaluates a bare string literal, so no encoding
# is performed here. It lists the columns intended for label encoding, with the
# category values in parentheses.
# Translation of the Indonesian remarks inside the notes:
#   "Karena banyak unique value & banyak jumlah variabel" = "because there are many
#   unique values and many variables" (-> curse of dimensionality & overfitting);
#   for job_position, additionally "each company has a different order of job positions".
# NOTE(review): the df.head() output shows customer_type values 'Business' and
# 'Personal', not (Individual, Company) - confirm the actual categories.
'''
- customer_type (Individual, Company)
- employment_status (Unemployed, Less than a year, 1-2 years, 3-5 years, More than 5 years, Retired)
- occupation (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)
- job_position (Karena banyak unique value, banyak jumlah variabel, dan masing-masing perusahaan berbeda-beda order of job position nya -> Curse of Dimensionality & Overfit)
- credit_purpose (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)
- collateral (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)
- payment_history (Poor, Fair, Good, Excellent)
- length_of_credit_history (Less than a year, 1-2 years, 3-5 years, More than 5 years)
- last_credit_history - other_credit_history3 (none, performing loan, under attention, substandard, doubt, non-performing loan)
- bankcruptcy_or_foreclosure_history (No, Less than a year, More than a year)
- document_validity_KTP-NIB (fraud, not included, unreadable, expired, valid)
- legal_history (no, yes)
- address_match (no, yes)
- criminal_rate_location (low, medium, high)
- risk (low, medium, high)'''



'\n- customer_type (Individual, Company)\n- employment_status (Unemployed, Less than a year, 1-2 years, 3-5 years, More than 5 years, Retired)\n- occupation (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)\n- job_position (Karena banyak unique value, banyak jumlah variabel, dan masing-masing perusahaan berbeda-beda order of job position nya -> Curse of Dimensionality & Overfit)\n- credit_purpose (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)\n- collateral (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)\n- payment_history (Poor, Fair, Good, Excellent)\n- length_of_credit_history (Less than a year, 1-2 years, 3-5 years, More than 5 years)\n- last_credit_history - other_credit_history3 (none, performing loan, under attention, substandard, doubt, non-performing loan)\n- bankcruptcy_or_foreclosure_history (No, Less than a yea

# 9. Categorical encoding: One-Hot Encoding

In [15]:
# Planning notes only: this cell evaluates a bare string literal, so no encoding
# is performed here. It lists the columns intended for one-hot encoding, with the
# category values in parentheses.
'''
- marital_status (Single, Divorced, Widowed, Married)
- payment_method (Credit Card, Debit Card, Leasing, Cryptocurrency, Cash)
- types_of_credit (Medical, Investment, Credit, Car, Business, Home, Personal)'''

'\n- marital_status (Single, Divorced, Widowed, Married)\n- payment_method (Credit Card, Debit Card, Leasing, Cryptocurrency, Cash)\n- types_of_credit (Medical, Investment, Credit, Car, Business, Home, Personal)'