# 1. Import the libraries

In [1]:
# Warnings
import warnings

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation/FutureWarnings
# that may matter for later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')



# 2. Load the dataset

In [2]:
# Read the Excel workbook (path is relative to the notebook's working directory)
# into the DataFrame used by every later cell.
DATASET_PATH = 'jca_dataset.xlsx'
df = pd.read_excel(DATASET_PATH)

In [3]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
0,Prasetya,Maryadi,8103067000000000.0,Company,365,83580000,Retired,Married,3,Translator,...,1987-05-13,35,"Gang Cihampelas No. 86, LENGKONG, SUMOBITO, BA...",5,8,95129,no,medium,high,99
1,Saadat,Firgantoro,6207066000000000.0,Individual,328,16370000,Unemployed,Widowed,2,Shoe repairman,...,1990-08-12,32,"Gang Surapati No. 03, LADOLIMA BARAT, WONOSARI...",12,12,29566,yes,low,medium,84
2,Bajragin,Hutapea,9104036000000000.0,Individual,838,27800000,More than 5 years,Divorced,9,Student,...,1979-12-24,43,"Jalan Jend. A. Yani No. 3, DAYAH TUHA, NUSAWUN...",3,10,37172,yes,low,high,12
3,Karja,Namaga,5305084000000000.0,Company,762,60790000,Less than a year,Widowed,4,Retiree,...,1981-07-20,41,"Gg. Pacuan Kuda No. 21, GUNUNG TUA TONGA, RANO...",13,12,24458,yes,medium,medium,6
4,Calista,Zulaika,1108266000000000.0,Company,506,70360000,Less than a year,Married,3,Makeup artist,...,1992-05-07,30,"Jl. H.J Maemunah No. 51, TANGSI LAMA, TANJUNGB...",6,10,94473,no,medium,low,49


In [4]:
# Show the last name, NIK and credit limit columns side by side
preview_columns = ['last_name', 'NIK', 'credit_limit']
df[preview_columns]

Unnamed: 0,last_name,NIK,credit_limit
0,Maryadi,8.103067e+15,251000000
1,Firgantoro,6.207066e+15,49110000
2,Hutapea,9.104036e+15,83400000
3,Namaga,5.305084e+15,182000000
4,Zulaika,1.108266e+15,211000000
...,...,...,...
95,Halimah,1.307107e+15,110000000
96,Purnawati,1.707066e+15,69540000
97,Narpati,1.603067e+15,12510000
98,Wulandari,6.108105e+15,153000000


# 3. Missing value

In [5]:
# Count the missing (null) entries in each column
checkMissingValue = df.isnull().sum()

# Report which columns have gaps, or confirm that none do
if checkMissingValue.gt(0).any():
    # Render the per-column counts with to_string(): concatenating a str with an
    # integer Series raises TypeError, so the previous message crashed exactly
    # when a missing value was present. Only affected columns are listed.
    columns_with_gaps = checkMissingValue[checkMissingValue > 0]
    print("There's a missing value\n" + columns_with_gaps.to_string())
else:
    print("There's not a missing value")

There's not a missing value


# 4. Duplicate value

In [6]:
# Flag every row that exactly repeats an earlier row (boolean mask, one entry per row)
checkingDuplicatedValue = df.duplicated()

# Report the duplicated rows, or confirm that none exist
if checkingDuplicatedValue.any():
    # Build the message from plain values: concatenating a str with a boolean
    # Series raises TypeError, so the previous message crashed exactly when a
    # duplicate was present.
    duplicate_index = df.index[checkingDuplicatedValue].tolist()
    print(f"There's a duplicate value: {len(duplicate_index)} duplicated row(s) at index {duplicate_index}")
else:
    print("There's not a duplicate value")

There's not a duplicate value


# 5. Convert the datatype

In [7]:
# Print each column's non-null count and dtype so the columns that need a
# dtype conversion can be identified
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 49 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   first_name                          100 non-null    object        
 1   last_name                           100 non-null    object        
 2   NIK                                 100 non-null    float64       
 3   customer_type                       100 non-null    object        
 4   credit_score                        100 non-null    int64         
 5   income                              100 non-null    int64         
 6   employment_status                   100 non-null    object        
 7   marital_status                      100 non-null    object        
 8   number_of_dependent                 100 non-null    int64         
 9   occupation                          100 non-null    object        
 10  job_position               

In [8]:
# Cast NIK and credit_limit to 64-bit integers
# NOTE(review): NIK is float64 before this cast, so any digits beyond float
# precision are presumably already lost — confirm against the source workbook.
integer_columns = ['NIK', 'credit_limit']
df[integer_columns] = df[integer_columns].astype('int64')

In [9]:
# Cast debt_to_income and credit_utilization to 64-bit floats
ratio_columns = ['debt_to_income', 'credit_utilization']
df[ratio_columns] = df[ratio_columns].astype('float64')

In [10]:
# Re-inspect the column dtypes after the conversions above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 49 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   first_name                          100 non-null    object        
 1   last_name                           100 non-null    object        
 2   NIK                                 100 non-null    int64         
 3   customer_type                       100 non-null    object        
 4   credit_score                        100 non-null    int64         
 5   income                              100 non-null    int64         
 6   employment_status                   100 non-null    object        
 7   marital_status                      100 non-null    object        
 8   number_of_dependent                 100 non-null    int64         
 9   occupation                          100 non-null    object        
 10  job_position               

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

Unnamed: 0,NIK,credit_score,income,number_of_dependent,code_occupation_kbli,debt_to_income,ticket_size,tenor,collateral_amount,credit_limit,credit_utilization,outstanding_debts,age,rt,rw,postal_code,risk_score
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,4847624000000000.0,582.67,45935000.0,5.03,46421.02,0.121128,52608400.0,62.16,48087720.0,137792400.0,1.608387,51539290.0,46.66,6.56,6.38,56955.9,46.13
std,2669837000000000.0,170.679868,28399590.0,3.095957,28194.386211,0.263158,29608470.0,52.899699,28709550.0,85184630.0,1.778839,28401510.0,16.905095,3.880331,4.04465,27422.132702,29.558589
min,1108266000000000.0,300.0,3520000.0,0.0,1112.0,0.001615,1840000.0,3.0,1555758.0,10560000.0,0.035391,1941774.0,21.0,1.0,1.0,17520.0,2.0
25%,3202628000000000.0,441.25,20025000.0,2.75,23956.25,0.013716,26420000.0,24.0,21358250.0,60075000.0,0.444906,27952490.0,32.0,3.0,3.0,29797.0,20.0
50%,5157075000000000.0,594.5,44610000.0,5.0,47461.5,0.031476,54475000.0,48.0,51199150.0,134000000.0,1.089097,53820070.0,45.0,6.0,6.0,57274.0,42.0
75%,7223136000000000.0,728.5,71147500.0,7.0,64122.25,0.086599,79660000.0,84.0,73215920.0,213250000.0,2.096007,79147910.0,62.0,10.0,10.0,83592.25,72.0
max,9210000000000000.0,848.0,96680000.0,10.0,95240.0,2.005263,99580000.0,180.0,99043540.0,290000000.0,8.907609,97723820.0,75.0,13.0,13.0,99565.0,99.0


# 6. Drop the name column

The `first_name` and `last_name` columns are not related to the other columns in the dataset, so they are dropped.

In [16]:
# Remove the first_name and last_name columns from the frame
name_columns = ['first_name', 'last_name']
df = df.drop(columns=name_columns)

# 7. Correlation

In [18]:
# Build the correlation matrix and rank the features by how strongly they
# correlate with risk_score (highest correlation first).
# The frame still contains text columns (e.g. customer_type); on pandas >= 2.0
# DataFrame.corr() raises on those instead of skipping them, so the numeric
# columns are selected explicitly. The matrix is computed once and reused.
correlation_matrix = (
    df.select_dtypes(include=['number', 'bool'])
    .corr()
    .sort_values('risk_score', ascending=False)
)
display(correlation_matrix)

# Keep only the risk_score column and shade it so stronger correlations stand out
correlation = correlation_matrix[['risk_score']]
correlation.style.background_gradient(cmap='Blues')

Unnamed: 0,NIK,credit_score,income,number_of_dependent,code_occupation_kbli,debt_to_income,ticket_size,tenor,collateral_amount,credit_limit,credit_utilization,outstanding_debts,age,rt,rw,postal_code,risk_score
risk_score,-0.074137,-0.023619,0.036823,0.10272,-0.036443,-0.014136,0.133184,0.084547,-0.075937,0.036466,0.042898,0.002572,-0.116609,-0.214027,-0.024412,-0.016451,1.0
ticket_size,-0.026757,-0.094565,0.075734,0.13018,0.095142,0.013658,1.0,0.189908,0.007932,0.076295,0.378269,-0.016104,0.029757,-0.075482,0.016546,0.192089,0.133184
number_of_dependent,-0.108542,0.182286,-0.095934,1.0,0.177531,0.092528,0.13018,-0.16933,-0.17513,-0.09581,-0.006342,-0.151631,0.103258,0.064171,-0.103365,0.126479,0.10272
tenor,0.123237,-0.107601,-0.09275,-0.16933,-0.109737,-0.292656,0.189908,1.0,0.17208,-0.092883,0.188811,0.076497,-0.135865,-0.0092,0.099373,-0.048348,0.084547
credit_utilization,-0.117022,-0.273022,-0.550294,-0.006342,-0.005668,0.246031,0.378269,0.188811,-0.153478,-0.550204,1.0,0.011546,-0.113484,0.162974,-0.038678,0.091307,0.042898
income,0.03932,0.026532,1.0,-0.095934,-0.011465,-0.175158,0.075734,-0.09275,0.213162,0.999997,-0.550294,0.145571,0.033957,-0.25734,0.136608,0.010908,0.036823
credit_limit,0.039153,0.026885,0.999997,-0.09581,-0.011921,-0.175108,0.076295,-0.092883,0.212881,1.0,-0.550204,0.145222,0.034284,-0.257221,0.136998,0.010899,0.036466
outstanding_debts,0.044774,-0.054894,0.145571,-0.151631,0.037834,-0.052041,-0.016104,0.076497,0.002292,0.145222,0.011546,1.0,-0.02529,0.056004,0.075402,0.003264,0.002572
debt_to_income,-0.1084,0.017587,-0.175158,0.092528,-0.008747,1.0,0.013658,-0.292656,-0.066766,-0.175108,0.246031,-0.052041,-0.037063,-0.055222,0.057976,-0.081025,-0.014136
postal_code,0.008342,-0.080529,0.010908,0.126479,0.065188,-0.081025,0.192089,-0.048348,-0.065238,0.010899,0.091307,0.003264,-0.031173,-0.06798,-0.100822,1.0,-0.016451


Unnamed: 0,risk_score
risk_score,1.0
ticket_size,0.133184
number_of_dependent,0.10272
tenor,0.084547
credit_utilization,0.042898
income,0.036823
credit_limit,0.036466
outstanding_debts,0.002572
debt_to_income,-0.014136
postal_code,-0.016451


- Correlation of 0.4 or higher: Yes
- Correlation below 0.4: No

# 8. Categorical encoding: Label Encoding

In [None]:
# Planning note kept as a bare string: the columns listed for label encoding,
# each followed by its category values in parentheses.
# NOTE(review): the order of the listed values is presumably the intended
# encoding order — confirm before implementing.
# English gloss of the Indonesian remarks inside the note:
#   "Karena banyak unique value & banyak jumlah variabel"
#       -> "because there are many unique values and a large number of variables"
#   "masing-masing perusahaan berbeda-beda order of job position nya"
#       -> "each company orders its job positions differently"
'''
- customer_type (Individual, Company)
- employment_status (Unemployed, Less than a year, 1-2 years, 3-5 years, More than 5 years, Retired)
- occupation (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)
- job_position (Karena banyak unique value, banyak jumlah variabel, dan masing-masing perusahaan berbeda-beda order of job position nya -> Curse of Dimensionality & Overfit)
- credit_purpose (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)
- collateral (Karena banyak unique value & banyak jumlah variabel -> Curse of Dimensionality & Overfit)(Combination)
- payment_history (Poor, Fair, Good, Excellent)
- length_of_credit_history (Less than a year, 1-2 years, 3-5 years, More than 5 years)
- last_credit_history - other_credit_history3 (none, performing loan, under attention, substandard, doubt, non-performing loan)
- bankcruptcy_or_foreclosure_history (No, Less than a year, More than a year)
- document_validity_KTP-NIB (fraud, not included, unreadable, expired, valid)
- legal_history (no, yes)
- address_match (no, yes)
- criminal_rate_location (low, medium, high)
- risk (low, medium, high)'''



# 9. Categorical encoding: One-Hot Encoding

In [None]:
# Planning note kept as a bare string: the columns listed for one-hot
# encoding, each followed by its category values in parentheses.
'''
- marital_status (Single, Divorced, Widowed, Married)
- payment_method (Credit Card, Debit Card, Leasing, Cryptocurrency, Cash)
- types_of_credit (Medical, Investment, Credit, Car, Business, Home, Personal)'''