# 1. Import the libraries

In [1]:
# Warnings
import warnings

# Random
import random

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Requests
import requests

# Datetime and Dateutil
from datetime import datetime
from datetime import date
from dateutil.relativedelta import relativedelta

# Generate new dummy data
from faker import Faker
from faker_credit_score import CreditScore

# Ignore the warnings
warnings.filterwarnings('ignore')



# 2. Generate Faker & Random

In [2]:
# Set the Faker locale to Indonesia
fake = Faker('id_ID')

# Faker & random seed
Faker.seed(123)
random.seed(123)

# Add provider for Credit Score
fake.add_provider(CreditScore)

# 3. Create a Dataframe

In [3]:
df = pd.DataFrame(columns=['first_name', 'last_name', 'NIK', 'customer_type', 'credit_score', 'income', 'employment_status',
                           'marital_status', 'number_of_dependent', 'occupation', 'job_position', 'code_occupation_kbli',
                           'types_of_occupation_kbli', 'debt_to_income', 'credit_purpose', 'ticket_size', 'tenor',
                           'principal_installment_amount', 'total_installment_amount', 'collateral', 'collateral_amount', 
                           'payment_history', 'payment_method', 'credit_limit',
                           'credit_utilization', 'length_of_credit_history', 'last_credit_history', 'other_credit_history1',
                           'other_credit_history2', 'other_credit_history3', 'types_of_credit', 'outstanding_debts',
                           'bankcruptcy_or_foreclosure_history', 'document_validity_KTP', 'document_validity_NPWP',
                           'document_validity_SIUP', 'document_validity_akta_notaris', 'document_validity_SKDP',
                           'document_validity_TDP', 'document_validity_NIB', 'legal_history', 'date_of_birth', 'age',
                           'address', 'rt', 'rw', 'postal_code', 'address_match', 'criminal_rate_location', 'risk', 'risk_score'])

# 4. Loading the data

In [4]:
# Loading the column value for NIK
kode_wilayah_df = pd.read_csv('kode_wilayah.csv')

# Loading the column value for occupation from KTP
occupation_ktp_df = pd.read_excel('occupation_ktp.xlsx')

# Loading the column value for job_position from the internet
job_position_df = pd.read_excel('job_position.xlsx')

# Loading the column value for credit_purpose from the excel
credit_purpose_df = pd.read_excel('credit_purpose.xlsx')

# Loading the column value for code_occupation_kbli & type_of_occupation_kbli from KBLI
code_occupation_kbli_df = pd.read_csv('kbli_data.csv', usecols=['Kode'])
types_of_occupation_kbli_df = pd.read_csv('kbli_data.csv', usecols=['Judul'])

In [5]:
# Check the kode wilayah data
kode_wilayah_df.head()

Unnamed: 0,kode_provinsi,nama_provinsi,kode_kabupaten,nama_kabupaten,kode_kecamatan,nama_kecamatan
0,11,ACEH,11.01,KAB. ACEH SELATAN,11.01.01,Bakongan
1,11,ACEH,11.01,KAB. ACEH SELATAN,11.01.02,Kluet Utara
2,11,ACEH,11.01,KAB. ACEH SELATAN,11.01.03,Kluet Selatan
3,11,ACEH,11.01,KAB. ACEH SELATAN,11.01.04,Labuhan Haji
4,11,ACEH,11.01,KAB. ACEH SELATAN,11.01.05,Meukek


In [6]:
# Generate NIK
def generate_nik(gender=None):
    # kode provinsi
    # provinsi = random.choice(['11','12','13','14','15','16','17','18','19','21','31','32','33','34','35','36','51','52','53','61','62','63','64','65','71','72','73','74','75','76','81','82','91', '94'])
    
    # # kode kabupaten/kota
    # kabupaten = random.choice(['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64','65','66','67','68','69','70','71','72','73','74','75','76','77','78','79','80','81','82','83','84','85','86','87','88','89','90','91','92','93','94','95','96','97','98','99'])
    
    # # kode kecamatan
    # kecamatan = random.choice(['01','02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42','43','44','45','46','47','48','49','50','51','52','53','54','55','56','57','58','59','60','61','62','63','64','65','66','67','68','69','70','71','72','73','74','75','76','77','78','79','80','81','82','83','84','85','86','87','88','89','90','91','92','93','94','95','96','97','98','99'])
    
    # kode wilayah
    # xxyyzz
    # xx: kode provinsi, yy: kode kabupaten/kota, zz: kode kecamatan
    wilayah = random.choice(kode_wilayah_df['kode_kecamatan']).replace('.', '')
    
    # tanggal lahir
    if (gender == 'L'):
        tanggal = str(random.randint(1,31))
    else:
        tanggal = str(random.randint(1,31) + 40)
        
    bulan = str(random.randint(1,12))
    tahun = str(random.randint(1950, 2000))[2:]
    
    # nomor urut
    nomor_urut = fake.numerify(text='#')
    
    # menggabungkan kode wilayah, tanggal lahir, dan nomor urut
    # nik = provinsi + kabupaten + kecamatan + tanggal.zfill(2) + bulan.zfill(2) + tahun + nomor_urut
    nik = wilayah + tanggal.zfill(2) + bulan.zfill(2) + tahun + nomor_urut.zfill(4)
    
    return nik

In [7]:
# Loading a KBLI data
kbli_df = pd.read_csv('kbli_data.csv')

In [8]:
# Loading a postal code data
pos_code_df = pd.read_csv('tbl_kodepos.csv')

In [9]:
# Check the post code data
pos_code_df.head()

Unnamed: 0,id,kelurahan,kecamatan,kabupaten,provinsi,kodepos
0,1,GAMBIR,GAMBIR,JAKARTA PUSAT,DKI JAKARTA,10110
1,2,KEBON KELAPA,GAMBIR,JAKARTA PUSAT,DKI JAKARTA,10120
2,3,PETOJO UTARA,GAMBIR,JAKARTA PUSAT,DKI JAKARTA,10130
3,4,DURI PULO,GAMBIR,JAKARTA PUSAT,DKI JAKARTA,10140
4,5,CIDENG,GAMBIR,JAKARTA PUSAT,DKI JAKARTA,10150


In [10]:
# Create an occupation list
occupation_list = occupation_ktp_df['occupation'].tolist()

# Create a job position list
job_position_list = job_position_df['job_position'].tolist()

# Create a credit purpose list
credit_purpose_list = credit_purpose_df['credit_purpose'].tolist()

# Create a code occupation KBLI list
code_occupation_kbli_list = code_occupation_kbli_df['Kode'].tolist()

# Create a type of occupation KBLI list
types_of_occupation_kbli_list = types_of_occupation_kbli_df['Judul'].tolist()

In [11]:
job_position_list

['Chairman of the Board of Directors',
 'Vice Chairman of the Board',
 'Board of Directors (Members)',
 'CEO',
 'Other C-level',
 'President',
 'Vice President',
 'Manager',
 'Permanent employee',
 'Temporary employee',
 'Contract employee',
 'Part time employee',
 'Freelance',
 'Not working']

In [12]:
# Create a tenor list
tenor_list = [3, 6, 12, 24, 36, 48, 60, 72, 84, 96, 120, 180]

In [13]:
# Create a kelurahan code list
kelurahan_list = []
for i in pos_code_df['kelurahan']:
	kelurahan_list.append(i)

# Create a kecamatan code list
kecamatan_list = []
for i in pos_code_df['kecamatan']:
	kecamatan_list.append(i)

# Create a kabupaten list
kabupaten_list = []
for i in pos_code_df['kabupaten']:
	kabupaten_list.append(i)    

# Create a provinsi list
provinsi_list = []
for i in pos_code_df['provinsi']:
	provinsi_list.append(i)        

# Create a postal code list
kode_pos_list = []
for i in pos_code_df['kodepos']:
	kode_pos_list.append(i)

In [14]:
# Dictionary for Code and Type KBLI
mapping_code_dict = dict(zip(kbli_df['Kode'], kbli_df['Judul']))

'''# Dictionary for Kelurahan and Kode Pos
mapping_kel_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['kelurahan']))

# Dictionary for Kecamatan and Kode Pos
mapping_kec_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['kecamatan']))

# Dictionary for Kabupaten and Kode Pos
mapping_kab_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['kabupaten']))

# Dictionary for Provinsi and Kode Pos
mapping_pro_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['provinsi']))'''

"# Dictionary for Kelurahan and Kode Pos\nmapping_kel_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['kelurahan']))\n\n# Dictionary for Kecamatan and Kode Pos\nmapping_kec_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['kecamatan']))\n\n# Dictionary for Kabupaten and Kode Pos\nmapping_kab_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['kabupaten']))\n\n# Dictionary for Provinsi and Kode Pos\nmapping_pro_dict = dict(zip(pos_code_df['kodepos'], pos_code_df['provinsi']))"

# 6. Generate the Dummy Data

In [15]:
# Looping the dummy data
df['first_name'] = [fake.first_name() for _ in range(100)]
df['last_name'] = [fake.last_name() for _ in range(100)]
df['NIK'] = [generate_nik() for _ in range(100)]
df['customer_type'] = [fake.random_element(elements=('Personal', 'Business')) for _ in range(100)]
df['credit_score'] = [fake.credit_score() for _ in range(100)]
df['income'] = [fake.random_int(min=3000000, max=2000000000, step=10000) for _ in range(100)]
df['employment_status'] = [fake.random_element(elements=('Unemployed', 'Less than a year', '1-2 years', '3-5 years', 'More than 5 years', 'Retired')) for _ in range(100)]
df['marital_status'] = [fake.random_element(elements=('Single', 'Divorced', 'Widowed', 'Married')) for _ in range(100)]
df['number_of_dependent'] = [fake.random_int(min=0, max=5) for _ in range(100)]
df['occupation'] = fake.random_elements(elements=occupation_list, length=100)
df['job_position'] = fake.random_elements(elements=job_position_list, length=100)
df['code_occupation_kbli'] = fake.random_elements(elements=code_occupation_kbli_list, length=100)
df['types_of_occupation_kbli'] = df['code_occupation_kbli'].map(mapping_code_dict)
df['debt_to_income'] = [fake.pyfloat(min_value=0, max_value=1) for _ in range(100)]
df['credit_purpose'] = fake.random_elements(elements=credit_purpose_list, length=100)
df['ticket_size'] = [fake.random_int(min=1000000, max=100000000, step=10000) for _ in range(100)]
df['tenor'] = fake.random_elements(elements=tenor_list, length=100)
df['principal_installment_amount'] = [df['ticket_size'][_]/df['tenor'][_] for _ in range(100)]
df['total_installment_amount'] = [abs((df['ticket_size'][_]*0.18/12)/(1-(1+0.18/12)**(df['tenor'][_]))) for _ in range(100)]
df['collateral'] = [fake.random_element(elements=('Account Receivable', 'Equipment', 'Life Insurance Policy', 'Real Estate', 'Intellectual Property', 'Artworks', 'Vehicles', 'Stocks')) for _ in range(100)]
df['collateral_amount'] = [fake.random_int(min=1000000, max=100000000) for _ in range(100)]
df['payment_history'] = [fake.random_element(elements=('Poor', 'Fair', 'Good', 'Excellent')) for _ in range(100)]
df['payment_method'] = [fake.random_element(elements=('Credit Card', 'Debit Card', 'Leasing', 'Cryptocurrency', 'Cash')) for _ in range(100)]
df['credit_limit'] = [df['income'][_] for _ in range(100)]
df['credit_utilization'] = [round(df['ticket_size'][_]/df['credit_limit'][_], 5) for _ in range(100)]
df['length_of_credit_history'] = [fake.random_element(elements=('Less than a year', '1-2 years', '3-5 years', 'More than 5 years')) for _ in range(100)]
df['last_credit_history'] = [fake.random_element(elements=('none', 'performing loan', 'under attention', 'substandard', 'doubt', 'non-performing loan')) for _ in range(100)]
df['other_credit_history1'] = [fake.random_element(elements=('none', 'performing loan', 'under attention', 'substandard', 'doubt', 'non-performing loan')) for _ in range(100)]
df['other_credit_history2'] = [fake.random_element(elements=('none', 'performing loan', 'under attention', 'substandard', 'doubt', 'non-performing loan')) for _ in range(100)]
df['other_credit_history3'] = [fake.random_element(elements=('none', 'performing loan', 'under attention', 'substandard', 'doubt', 'non-performing loan')) for _ in range(100)]
df['types_of_credit'] = [fake.random_element(elements=('Medical', 'Investment', 'Credit', 'Car', 'Business', 'Home', 'Personal')) for _ in range(100)]
df['outstanding_debts'] = [fake.random_int(min=1000000, max=100000000) for _ in range(100)]
df['bankcruptcy_or_foreclosure_history'] = [fake.random_element(elements=('No', 'Less than a year', 'More than a year')) for _ in range(100)]
df['document_validity_KTP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable', 'expired', 'valid')) for _ in range(100)]
df['document_validity_NPWP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable', 'expired', 'valid')) for _ in range(100)]
df['document_validity_SIUP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable', 'expired', 'valid')) for _ in range(100)]
df['document_validity_akta_notaris'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable', 'expired', 'valid')) for _ in range(100)]
df['document_validity_SKDP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable', 'expired', 'valid')) for _ in range(100)]
df['document_validity_TDP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable', 'expired', 'valid')) for _ in range(100)]
df['document_validity_NIB'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable', 'expired', 'valid')) for _ in range(100)]
df['legal_history'] = [fake.random_element(elements=('no', 'yes')) for _ in range(100)]
df['date_of_birth'] = [fake.date_of_birth(minimum_age=21, maximum_age=75) for _ in range(100)]

# today
today = date.today()

df['age'] = [(today.year - x.year - ((today.month, today.day) < (x.month, x.day))) for x in df['date_of_birth']]
df['address'] = df.apply(lambda row: fake.street_address() +
                         ', ' + fake.random_element(elements=kelurahan_list) +
                         ', ' + fake.random_element(elements=kecamatan_list) +
                         ', ' + fake.random_element(elements=kabupaten_list) +
                         ', ' + fake.random_element(elements=provinsi_list) +
                         ', RT ' + str(fake.random_int(min=1, max=13, step=1)) + 
                         '/RW ' + str(fake.random_int(min=1, max=13, step=1)) +
                         ', ' + str(fake.random_element(elements=kode_pos_list)), axis=1)
df['rt'] = df['address'].str.extract(r'RT\s+(\d+)\s*/')
df['rw'] = df['address'].str.extract(r'RW (\d+)', expand=False)
df['postal_code'] = df['address'].str[-5:]
df['address_match'] = [fake.random_element(elements=('no', 'yes')) for _ in range(100)]
df['criminal_rate_location'] = [fake.random_element(elements=('low', 'medium', 'high')) for _ in range(100)]
df['risk'] = [fake.random_element(elements=('low', 'medium', 'high')) for _ in range(100)]
df['risk_score'] = [str(fake.random_int(min=1, max=100, step=1)) for _ in range(100)]

In [16]:
df.head()

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
0,Bala,Suwarno,1208184902990002,Business,343,960160000,Retired,Widowed,5,Vice Mayor,...,1996-02-27,27,"Gg. Kebonjati No. 9, RUMFAKAR, BAKAUHENI, CILA...",5,13,24784,yes,medium,medium,30
1,Kawaca,Lailasari,3504174902520005,Business,442,198570000,Less than a year,Divorced,2,Private employee,...,1967-01-05,56,"Jalan Suniaraja No. 1, GAYA BARU ENAM, TUKDANA...",13,9,95371,yes,high,medium,99
2,Cayadi,Usamah,3326155809710008,Personal,409,569720000,Unemployed,Single,0,Architect,...,1960-02-01,63,"Gg. Gardujati No. 16, DUDAKAWU, CILAWU, ACEH B...",5,2,24653,no,high,medium,9
3,Rahman,Pradipta,3309066801600001,Business,444,238110000,Retired,Divorced,5,Member of the Constitutional Court,...,1951-12-28,71,"Gang Laswi No. 47, MEJASEM, PAGEDANGAN, PESAWA...",12,12,35142,yes,high,medium,14
4,Kasiran,Palastri,1502045109710008,Business,661,1154640000,More than 5 years,Divorced,3,Pilot,...,1983-10-19,39,"Gang Rumah Sakit No. 80, GAPUK TUA, TANJUNG HA...",7,10,21174,no,high,high,14


In [17]:
df[df['customer_type'] == 'Personal'].head(3)

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
2,Cayadi,Usamah,3326155809710008,Personal,409,569720000,Unemployed,Single,0,Architect,...,1960-02-01,63,"Gg. Gardujati No. 16, DUDAKAWU, CILAWU, ACEH B...",5,2,24653,no,high,medium,9
6,Bahuwarna,Nugroho,3514136502880003,Personal,595,1405290000,Less than a year,Divorced,0,Governor,...,1948-11-10,74,"Gang M.H Thamrin No. 060, BUNGUR, TANIMBAR SEL...",13,4,57571,yes,low,medium,61
7,Prabu,Mangunsong,3326034301700009,Personal,518,819470000,More than 5 years,Married,4,Carpenter,...,1977-01-26,46,"Jalan Moch. Toha No. 607, KARANGAREN, SERAM UT...",8,7,98815,no,medium,medium,90


In [18]:
df[['last_credit_history', 'other_credit_history1', 'other_credit_history2', 'other_credit_history3']]

Unnamed: 0,last_credit_history,other_credit_history1,other_credit_history2,other_credit_history3
0,doubt,performing loan,substandard,non-performing loan
1,non-performing loan,substandard,doubt,non-performing loan
2,under attention,substandard,under attention,doubt
3,substandard,under attention,substandard,non-performing loan
4,non-performing loan,under attention,under attention,performing loan
...,...,...,...,...
95,none,none,doubt,doubt
96,doubt,under attention,non-performing loan,none
97,under attention,substandard,none,none
98,performing loan,substandard,none,none


In [19]:
df[['document_validity_KTP', 'document_validity_NPWP', 'document_validity_SIUP', 'document_validity_akta_notaris', 'document_validity_SKDP', 'document_validity_TDP', 'document_validity_NIB', 'legal_history']]

Unnamed: 0,document_validity_KTP,document_validity_NPWP,document_validity_SIUP,document_validity_akta_notaris,document_validity_SKDP,document_validity_TDP,document_validity_NIB,legal_history
0,fraud,not included,expired,not included,unreadable,fraud,fraud,no
1,not included,valid,valid,unreadable,expired,valid,unreadable,yes
2,unreadable,unreadable,unreadable,valid,expired,expired,unreadable,no
3,fraud,not included,fraud,unreadable,fraud,valid,unreadable,yes
4,unreadable,expired,fraud,unreadable,valid,unreadable,valid,yes
...,...,...,...,...,...,...,...,...
95,unreadable,unreadable,valid,valid,valid,fraud,fraud,yes
96,unreadable,fraud,not included,unreadable,valid,expired,unreadable,yes
97,expired,fraud,fraud,fraud,unreadable,expired,valid,yes
98,valid,expired,valid,expired,valid,unreadable,expired,no


In [20]:
# Conditioning for risk column
'''if df[['last_credit_history', 'other_credit_history1', 'other_credit_history2', 'other_credit_history3']].eq('non-performing loan').any().any():
    df['risk'] = 'high'
elif df[['last_credit_history', 'other_credit_history1', 'other_credit_history2', 'other_credit_history3']].isin(['under attention', 'substandard', 'doubt']).any().any():
    df['risk'] = 'medium'
else:
    df['risk'] = 'low'''

"if df[['last_credit_history', 'other_credit_history1', 'other_credit_history2', 'other_credit_history3']].eq('non-performing loan').any().any():\n    df['risk'] = 'high'\nelif df[['last_credit_history', 'other_credit_history1', 'other_credit_history2', 'other_credit_history3']].isin(['under attention', 'substandard', 'doubt']).any().any():\n    df['risk'] = 'medium'\nelse:\n    df['risk'] = 'low"

In [21]:
# Conditioning for risk column
'''if df[['document_validity_KTP', 'document_validity_NPWP', 'document_validity_SIUP', 'document_validity_akta_notaris', 'document_validity_SKDP', 'document_validity_TDP', 'document_validity_NIB', 'legal_history']].eq('fraud').any().any():
    df['risk'] = 'high'
elif df[['document_validity_KTP', 'document_validity_NPWP', 'document_validity_SIUP', 'document_validity_akta_notaris', 'document_validity_SKDP', 'document_validity_TDP', 'document_validity_NIB', 'legal_history']].isin(['not included', 'unreadable', 'expired']).any().any():
    df['risk'] = 'medium'
else:
    df['risk'] = 'low'''

"if df[['document_validity_KTP', 'document_validity_NPWP', 'document_validity_SIUP', 'document_validity_akta_notaris', 'document_validity_SKDP', 'document_validity_TDP', 'document_validity_NIB', 'legal_history']].eq('fraud').any().any():\n    df['risk'] = 'high'\nelif df[['document_validity_KTP', 'document_validity_NPWP', 'document_validity_SIUP', 'document_validity_akta_notaris', 'document_validity_SKDP', 'document_validity_TDP', 'document_validity_NIB', 'legal_history']].isin(['not included', 'unreadable', 'expired']).any().any():\n    df['risk'] = 'medium'\nelse:\n    df['risk'] = 'low"

In [22]:
# Insert new column value for risk_score
low_risk_score = np.random.randint(low=0, high=33, size=(df['risk'] == 'low').sum())
medium_risk_score = np.random.randint(low=34, high=66, size=(df['risk'] == 'medium').sum())
high_risk_score = np.random.randint(low=67, high=100, size=(df['risk'] == 'high').sum())

risk_score = np.empty_like(df['risk'], dtype=int)
risk_score[df['risk'] == 'low'] = low_risk_score
risk_score[df['risk'] == 'medium'] = medium_risk_score
risk_score[df['risk'] == 'high'] = high_risk_score

df['risk_score'] = risk_score

In [23]:
df[df['job_position'] == 'Not working']

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
7,Prabu,Mangunsong,3326034301700009,Personal,518,819470000,More than 5 years,Married,4,Carpenter,...,1977-01-26,46,"Jalan Moch. Toha No. 607, KARANGAREN, SERAM UT...",8,7,98815,no,medium,medium,38
23,Michelle,Andriani,3307115807540000,Personal,570,1010510000,Retired,Married,4,Fisherman/fisheries,...,1965-09-03,57,"Gg. Jayawijaya No. 2, SALUBUA, JAYANTI, YAHUKI...",3,6,96181,no,medium,medium,39
29,Damu,Thamrin,9113507001900009,Business,411,1063900000,3-5 years,Widowed,4,Mayor,...,1995-06-22,27,"Jalan Rajiman No. 8, KARANG REJO, MUARA BENGKA...",12,12,24382,yes,high,medium,62
33,Edi,Anggriawan,7601095501500008,Business,359,275450000,Retired,Married,5,Governor,...,1954-01-22,69,"Jalan Jakarta No. 137, SUKA MAJU, SEMARANG TEN...",7,9,86591,no,high,medium,49
48,Ophelia,Tampubolon,1221096711530008,Personal,717,1853180000,3-5 years,Single,1,Indonesian National Police,...,1947-12-13,75,"Jalan Yos Sudarso No. 0, BALAYON, KELILA, SEMA...",5,8,69467,yes,medium,high,82
52,Lasmono,Marpaung,1408075806830009,Personal,467,485410000,Less than a year,Married,1,Carpenter,...,1969-11-13,53,"Jalan Moch. Ramdan No. 601, BIOPIS, GOLEWA BAR...",3,4,41181,no,high,medium,42
58,Yessi,Winarsih,3216075401600000,Personal,472,9220000,Retired,Married,4,Lawyer,...,1976-08-03,46,"Jalan Suniaraja No. 5, TANJUNG MANGKALIHAT, SE...",10,7,87261,yes,low,high,84
76,Ganep,Lazuardi,9102596610880002,Business,716,977290000,Retired,Divorced,2,State-owned enterprise employee,...,1951-12-31,71,"Jl. Pacuan Kuda No. 0, SIMANUNGKALIT, TANO TOM...",13,13,28711,no,high,medium,56
78,Mursita,Marbun,1406076301730006,Personal,554,1976710000,3-5 years,Married,0,Vice Mayor,...,1977-10-01,45,"Gg. Setiabudhi No. 4, BAGOR, TANAH RUBUH, BANY...",13,6,98463,no,low,high,96


In [24]:
df.head()

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
0,Bala,Suwarno,1208184902990002,Business,343,960160000,Retired,Widowed,5,Vice Mayor,...,1996-02-27,27,"Gg. Kebonjati No. 9, RUMFAKAR, BAKAUHENI, CILA...",5,13,24784,yes,medium,medium,37
1,Kawaca,Lailasari,3504174902520005,Business,442,198570000,Less than a year,Divorced,2,Private employee,...,1967-01-05,56,"Jalan Suniaraja No. 1, GAYA BARU ENAM, TUKDANA...",13,9,95371,yes,high,medium,53
2,Cayadi,Usamah,3326155809710008,Personal,409,569720000,Unemployed,Single,0,Architect,...,1960-02-01,63,"Gg. Gardujati No. 16, DUDAKAWU, CILAWU, ACEH B...",5,2,24653,no,high,medium,36
3,Rahman,Pradipta,3309066801600001,Business,444,238110000,Retired,Divorced,5,Member of the Constitutional Court,...,1951-12-28,71,"Gang Laswi No. 47, MEJASEM, PAGEDANGAN, PESAWA...",12,12,35142,yes,high,medium,57
4,Kasiran,Palastri,1502045109710008,Business,661,1154640000,More than 5 years,Divorced,3,Pilot,...,1983-10-19,39,"Gang Rumah Sakit No. 80, GAPUK TUA, TANJUNG HA...",7,10,21174,no,high,high,93


In [25]:
# Insert new column value for debt to income
df['debt_to_income'] = [round(df['total_installment_amount'][_]/df['income'][_], 5) for _ in range(100)]

In [26]:
df[['ticket_size', 'tenor', 'principal_installment_amount', 'total_installment_amount', 'debt_to_income']]

Unnamed: 0,ticket_size,tenor,principal_installment_amount,total_installment_amount,debt_to_income
0,39530000,84,4.705952e+05,2.378851e+05,0.00025
1,79340000,36,2.203889e+06,1.678231e+06,0.00845
2,63440000,180,3.524444e+05,7.005111e+04,0.00012
3,57510000,180,3.195000e+05,6.350314e+04,0.00027
4,31160000,180,1.731111e+05,3.440720e+04,0.00003
...,...,...,...,...,...
95,52590000,6,8.765000e+06,8.442021e+06,0.03287
96,39940000,96,4.160417e+05,1.886452e+05,0.00015
97,34170000,84,4.067857e+05,2.056295e+05,0.00021
98,13450000,60,2.241667e+05,1.397916e+05,0.00007


In [27]:
# Dictionary of income ranges for each job position
income_ranges = {
    "Chairman of the Board of Directors": (90000000, 160000000),
    "Vice Chairman of the Board" : (80000000, 125000000),
    "Board of Directors (Members)" : (24000000, 61250000),
    "CEO" : (90000000, 160000000),
    "Other C-level" : (50000000, 120000000),
    "President" : (56000000, 160000000),
    "Vice President" : (34000000, 84000000),
    "Manager" : (50000000, 125000000),
    "Permanent employee" : (3000000, 54000000),
    "Temporary employee" : (3000000, 6000000),
    "Contract employee" : (3000000, 20000000),
    "Part time employee" : (3000000, 5000000),
    "Freelance" : (3000000, 50000000),
    "Not working" : (0, 0)
}

# Insert new column value for job_position
df['job_position'] = [fake.random_element(elements=list(income_ranges.keys())) for _ in range(100)]

# Insert new column value for income
df['income'] = [fake.random_int(min=income_ranges[position][0], max=income_ranges[position][1]) 
                   for position in df['job_position']]

In [28]:
df[['income', 'job_position']]

Unnamed: 0,income,job_position
0,33840473,Freelance
1,22519044,Permanent employee
2,139646738,CEO
3,0,Not working
4,50955547,Other C-level
...,...,...
95,90481832,CEO
96,38192056,Permanent employee
97,11540315,Contract employee
98,108335982,Other C-level


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 51 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   first_name                          100 non-null    object 
 1   last_name                           100 non-null    object 
 2   NIK                                 100 non-null    object 
 3   customer_type                       100 non-null    object 
 4   credit_score                        100 non-null    int64  
 5   income                              100 non-null    int64  
 6   employment_status                   100 non-null    object 
 7   marital_status                      100 non-null    object 
 8   number_of_dependent                 100 non-null    int64  
 9   occupation                          100 non-null    object 
 10  job_position                        100 non-null    object 
 11  code_occupation_kbli                100 non-nu

In [30]:
# Insert new column value for credit limit

'''if (df['income'] < 10000000).any():
    df['credit_limit'] = df['credit_limit'] * 3
else:
    if df['credit_score'] >= 800 and df['debt_to_income'] < 0.36 and df['last_credit_history'] == 'none' or df['last_credit_history'] == 'performing loan' and df['tenor'] >= 12:
        df['credit_limit'] = df['credit_limit'] * 6
    
    elif df['credit_score'] >= 740 and df['debt_to_income'] == 0.36 and df['last_credit_history'] == 'none' or df['last_credit_history'] == 'performing loan' and df['tenor'] >= 12:
        df['credit_limit'] = df['credit_limit'] * 5

    elif df['credit_score'] >= 670 and df['debt_to_income'] > 0.36 or df['debt_to_income'] <= 0.4 and df['last_credit_history'] == 'none' or df['last_credit_history'] == 'performing loan' and df['tenor'] >= 12:
        df['credit_limit'] = df['credit_limit'] * 4
    else:
        df['credit_limit'] = df['credit_limit'] * 3'''

df['credit_limit'] = [df['income'][_] * 0.45 * 8 for _ in range(100)]

In [31]:
df[df['job_position'] == 'Not working']

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
3,Rahman,Pradipta,3309066801600001,Business,444,0,Retired,Divorced,5,Member of the Constitutional Court,...,1951-12-28,71,"Gang Laswi No. 47, MEJASEM, PAGEDANGAN, PESAWA...",12,12,35142,yes,high,medium,57
18,Joko,Rajasa,9103136210810008,Business,408,0,More than 5 years,Single,5,Merchant/trader,...,2000-10-23,22,"Gang Suniaraja No. 014, KERTOSONO, SAPARUA, ND...",9,5,95761,yes,low,high,78
48,Ophelia,Tampubolon,1221096711530008,Personal,717,0,3-5 years,Single,1,Indonesian National Police,...,1947-12-13,75,"Jalan Yos Sudarso No. 0, BALAYON, KELILA, SEMA...",5,8,69467,yes,medium,high,82
60,Sarah,Hutapea,5321014506820007,Business,518,0,3-5 years,Widowed,2,Mason,...,1952-07-25,70,"Gang Lembong No. 2, BUJUR TIMUR, SINGKAWANG TE...",13,13,99565,no,high,low,17
73,Anita,Nasyidah,7373046710810006,Personal,811,0,3-5 years,Widowed,4,Makeup artist,...,1973-12-10,49,"Jalan Surapati No. 05, BOTING, MESUJI MAKMUR, ...",1,12,95775,yes,low,medium,49
84,Karen,Hasanah,3324195705610008,Personal,781,0,Unemployed,Divorced,1,Pilot,...,1961-04-26,61,"Jalan Waringin No. 520, RENGAS, TUGU, SINJAI, ...",9,11,61272,yes,medium,low,8
94,Jagapati,Manullang,5102065905710007,Personal,367,0,Retired,Widowed,1,Not working,...,1960-04-22,62,"Jl. Wonoayu No. 320, DINDOK, COT GIREK, SUMBA ...",8,13,20772,no,high,low,19


In [32]:
df[['income', 'credit_score', 'debt_to_income', 'last_credit_history', 'tenor', 'credit_limit']]

Unnamed: 0,income,credit_score,debt_to_income,last_credit_history,tenor,credit_limit
0,33840473,343,0.00025,doubt,84,121825702.8
1,22519044,442,0.00845,non-performing loan,36,81068558.4
2,139646738,409,0.00012,under attention,180,502728256.8
3,0,444,0.00027,substandard,180,0.0
4,50955547,661,0.00003,non-performing loan,180,183439969.2
...,...,...,...,...,...,...
95,90481832,494,0.03287,none,6,325734595.2
96,38192056,527,0.00015,doubt,96,137491401.6
97,11540315,563,0.00021,under attention,84,41545134.0
98,108335982,719,0.00007,performing loan,60,390009535.2


In [33]:
df.head()

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
0,Bala,Suwarno,1208184902990002,Business,343,33840473,Retired,Widowed,5,Vice Mayor,...,1996-02-27,27,"Gg. Kebonjati No. 9, RUMFAKAR, BAKAUHENI, CILA...",5,13,24784,yes,medium,medium,37
1,Kawaca,Lailasari,3504174902520005,Business,442,22519044,Less than a year,Divorced,2,Private employee,...,1967-01-05,56,"Jalan Suniaraja No. 1, GAYA BARU ENAM, TUKDANA...",13,9,95371,yes,high,medium,53
2,Cayadi,Usamah,3326155809710008,Personal,409,139646738,Unemployed,Single,0,Architect,...,1960-02-01,63,"Gg. Gardujati No. 16, DUDAKAWU, CILAWU, ACEH B...",5,2,24653,no,high,medium,36
3,Rahman,Pradipta,3309066801600001,Business,444,0,Retired,Divorced,5,Member of the Constitutional Court,...,1951-12-28,71,"Gang Laswi No. 47, MEJASEM, PAGEDANGAN, PESAWA...",12,12,35142,yes,high,medium,57
4,Kasiran,Palastri,1502045109710008,Business,661,50955547,More than 5 years,Divorced,3,Pilot,...,1983-10-19,39,"Gang Rumah Sakit No. 80, GAPUK TUA, TANJUNG HA...",7,10,21174,no,high,high,93


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 51 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   first_name                          100 non-null    object 
 1   last_name                           100 non-null    object 
 2   NIK                                 100 non-null    object 
 3   customer_type                       100 non-null    object 
 4   credit_score                        100 non-null    int64  
 5   income                              100 non-null    int64  
 6   employment_status                   100 non-null    object 
 7   marital_status                      100 non-null    object 
 8   number_of_dependent                 100 non-null    int64  
 9   occupation                          100 non-null    object 
 10  job_position                        100 non-null    object 
 11  code_occupation_kbli                100 non-nu

In [35]:
df['credit_utilization']

0     0.04117
1     0.39956
2     0.11135
3     0.24153
4     0.02699
       ...   
95    0.20477
96    0.03097
97    0.03489
98    0.00679
99    0.03223
Name: credit_utilization, Length: 100, dtype: float64

In [36]:
# Convert principal_installment_amount, total_installment_amount, credit_limit column to int64
df[['principal_installment_amount', 'total_installment_amount', 'credit_limit']] = df[['principal_installment_amount', 'total_installment_amount', 'credit_limit']].astype('int64')

In [37]:
df[['principal_installment_amount', 'total_installment_amount', 'credit_limit']]

Unnamed: 0,principal_installment_amount,total_installment_amount,credit_limit
0,470595,237885,121825702
1,2203888,1678231,81068558
2,352444,70051,502728256
3,319500,63503,0
4,173111,34407,183439969
...,...,...,...
95,8765000,8442021,325734595
96,416041,188645,137491401
97,406785,205629,41545134
98,224166,139791,390009535


In [38]:
df[['ticket_size', 'tenor', 'income', 'debt_to_income']]

Unnamed: 0,ticket_size,tenor,income,debt_to_income
0,39530000,84,33840473,0.00025
1,79340000,36,22519044,0.00845
2,63440000,180,139646738,0.00012
3,57510000,180,0,0.00027
4,31160000,180,50955547,0.00003
...,...,...,...,...
95,52590000,6,90481832,0.03287
96,39940000,96,38192056,0.00015
97,34170000,84,11540315,0.00021
98,13450000,60,108335982,0.00007


In [39]:
# Conditioning for risk
# Hapus 
if (df['risk'] == 'low').any():
    df['credit_score'] = [fake.random_int(min=300, max=579) for _ in range(100)]
    df['job_position'] = 'Not working'
    df['debt_to_income'] = [round(fake.pyfloat(min_value=0.5, max_value=2.0), 5) for _ in range(100)]
    df['payment_history'] = [fake.random_element(elements=('Poor', 'Fair')) for _ in range(100)]
    df['credit_limit'] = [fake.random_int(min=0, max=41300000) for _ in range(100)]
    df['credit_utilization'] = [round(fake.pyfloat(min_value=0.5, max_value=2.0), 5) for _ in range(100)]
    df['last_credit_history'] = [fake.random_element(elements=('none', 'doubt', 'non-performing loan')) for _ in range(100)]
    df['other_credit_history1'] = [fake.random_element(elements=('none', 'doubt', 'non-performing loan')) for _ in range(100)]
    df['other_credit_history2'] = [fake.random_element(elements=('none', 'doubt', 'non-performing loan')) for _ in range(100)] 
    df['other_credit_history3'] = [fake.random_element(elements=('none', 'doubt', 'non-performing loan')) for _ in range(100)]
    df['bankcruptcy_or_foreclosure_history'] = 'none'
    df['document_validity_KTP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable')) for _ in range(100)]
    df['document_validity_NPWP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable')) for _ in range(100)]
    df['document_validity_SIUP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable')) for _ in range(100)]
    df['document_validity_akta_notaris'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable')) for _ in range(100)]
    df['document_validity_SKDP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable')) for _ in range(100)]
    df['document_validity_TDP'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable')) for _ in range(100)]
    df['document_validity_NIB'] = [fake.random_element(elements=('fraud', 'not included', 'unreadable')) for _ in range(100)]
    df['legal_history'] = 'yes'
    df['address_match'] = 'no'
    df['criminal_rate_location'] = 'high'
    
elif (df['risk'] == 'medium').any():
    df['credit_score'] = [fake.random_int(min=580, max=669) for _ in range(100)]
    df['job_position'] = [fake.random_element(elements=('Chairman of the Board of Directors', 'Vice Chairman of the Board', 'Board of Directors (Members)', 'CEO', 'Other C-level', 'President', 'Vice President', 'Manager', 'Permanent employee', 'Temporary employee', 'Contract employee', 'Part time employee', 'Freelance')) for _ in range(100)]
    df['debt_to_income'] = [round(fake.pyfloat(min_value=0.36, max_value=0.49), 5) for _ in range(100)]
    df['payment_history'] = 'Good'
    df['credit_limit'] = [fake.random_int(min=41300001, max=116280000) for _ in range(100)]
    df['credit_utilization'] = [round(fake.pyfloat(min_value=0.31, max_value=0.49), 5) for _ in range(100)]
    df['last_credit_history'] = [fake.random_element(elements=('substandard', 'under attention')) for _ in range(100)]
    df['other_credit_history1'] = [fake.random_element(elements=('substandard', 'under attention')) for _ in range(100)]
    df['other_credit_history2'] = [fake.random_element(elements=('substandard', 'under attention')) for _ in range(100)] 
    df['other_credit_history3'] = [fake.random_element(elements=('substandard', 'under attention')) for _ in range(100)]
    df['bankcruptcy_or_foreclosure_history'] = [fake.random_element(elements=('Less than a year', 'More than a year')) for _ in range(100)]
    df['document_validity_KTP'] = [fake.random_element(elements=('unreadable', 'expired')) for _ in range(100)]
    df['document_validity_NPWP'] = [fake.random_element(elements=('unreadable', 'expired')) for _ in range(100)]
    df['document_validity_SIUP'] = [fake.random_element(elements=('unreadable', 'expired')) for _ in range(100)]
    df['document_validity_akta_notaris'] = [fake.random_element(elements=('unreadable', 'expired')) for _ in range(100)]
    df['document_validity_SKDP'] = [fake.random_element(elements=('unreadable', 'expired')) for _ in range(100)]
    df['document_validity_TDP'] = [fake.random_element(elements=('unreadable', 'expired')) for _ in range(100)]
    df['document_validity_NIB'] = [fake.random_element(elements=('unreadable', 'expired')) for _ in range(100)]
    df['legal_history'] = [fake.random_element(elements=('no', 'yes')) for _ in range(100)]
    df['address_match'] = [fake.random_element(elements=('no', 'yes')) for _ in range(100)]
    df['criminal_rate_location'] = 'medium'

else:
    df['credit_score'] = [fake.random_int(min=670, max=850) for _ in range(100)]
    df['job_position'] = [fake.random_element(elements=('Chairman of the Board of Directors', 'Vice Chairman of the Board', 'Board of Directors (Members)', 'CEO', 'Other C-level', 'President', 'Vice President', 'Manager', 'Permanent employee', 'Temporary employee', 'Contract employee', 'Part time employee', 'Freelance')) for _ in range(100)]
    df['debt_to_income'] = [round(fake.pyfloat(min_value=0, max_value=0.35), 5) for _ in range(100)]
    df['payment_history'] = [fake.random_element(elements=('Excellent')) for _ in range(100)]
    df['credit_limit'] = [fake.random_int(min=116280001, max=300000000) for _ in range(100)]
    df['credit_utilization'] = [round(fake.pyfloat(min_value=0, max_value=0.3)) for _ in range(100)]
    df['last_credit_history'] = [fake.random_element(elements=('none', 'performing loan')) for _ in range(100)]
    df['other_credit_history1'] = [fake.random_element(elements=('none', 'performing loan')) for _ in range(100)]
    df['other_credit_history2'] = [fake.random_element(elements=('none', 'performing loan')) for _ in range(100)] 
    df['other_credit_history3'] = [fake.random_element(elements=('none', 'performing loan')) for _ in range(100)]
    df['bankcruptcy_or_foreclosure_history'] = [fake.random_element(elements=('Less than a year', 'More than a year')) for _ in range(100)]
    df['document_validity_KTP'] = [fake.random_element(elements=('not included', 'valid')) for _ in range(100)]
    df['document_validity_NPWP'] = [fake.random_element(elements=('not included', 'valid')) for _ in range(100)]
    df['document_validity_SIUP'] = [fake.random_element(elements=('not included', 'valid')) for _ in range(100)]
    df['document_validity_akta_notaris'] = [fake.random_element(elements=('not included', 'valid')) for _ in range(100)]
    df['document_validity_SKDP'] = [fake.random_element(elements=('not included', 'valid')) for _ in range(100)]
    df['document_validity_TDP'] = [fake.random_element(elements=('not included', 'valid')) for _ in range(100)]
    df['document_validity_NIB'] = [fake.random_element(elements=('not included', 'valid')) for _ in range(100)]
    df['legal_history'] = 'no'
    df['address_match'] = 'yes'
    df['criminal_rate_location'] = 'high'

In [40]:
df[df['credit_score'] > 500]

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
0,Bala,Suwarno,1208184902990002,Business,543,33840473,Retired,Widowed,5,Vice Mayor,...,1996-02-27,27,"Gg. Kebonjati No. 9, RUMFAKAR, BAKAUHENI, CILA...",5,13,24784,no,high,medium,37
7,Prabu,Mangunsong,3326034301700009,Personal,513,22698656,More than 5 years,Married,4,Carpenter,...,1977-01-26,46,"Jalan Moch. Toha No. 607, KARANGAREN, SERAM UT...",8,7,98815,no,high,medium,38
9,Jamalia,Handayani,1302086203580001,Personal,525,55278387,Less than a year,Single,3,Ambassador,...,1969-08-08,53,"Jl. Jakarta No. 095, HAPALAH, NEKAMESE, KUDUS,...",11,10,76252,no,high,high,72
13,Galuh,Anggriawan,3216185109800003,Business,503,48308087,Unemployed,Widowed,4,Lawyer,...,1993-02-14,30,"Gg. S. Parman No. 241, PARIPPUNG, DOLOK BATU N...",13,3,22465,no,high,low,29
15,Mursita,Hasanah,5302265906500004,Business,541,74817836,More than 5 years,Single,1,Journalist,...,1948-02-25,75,"Jalan S. Parman No. 327, SALEK/SALEH AGUNG (MA...",1,12,52193,no,high,low,12
18,Joko,Rajasa,9103136210810008,Business,503,0,More than 5 years,Single,5,Merchant/trader,...,2000-10-23,22,"Gang Suniaraja No. 014, KERTOSONO, SAPARUA, ND...",9,5,95761,no,high,high,78
19,Gamanto,Thamrin,5272026207730001,Personal,518,93903272,3-5 years,Single,3,Regional-owned enterprise employee,...,1963-08-27,59,"Jl. Waringin No. 075, PASURUHAN LOR, TRAGAH, K...",7,12,37354,no,high,high,89
20,Ade,Saputra,5203064211610003,Personal,503,72707504,1-2 years,Divorced,4,Event promoter,...,1983-06-12,39,"Gg. Ciumbuleuit No. 2, ANGGALOMELAI (ANGGOLOME...",1,8,56191,no,high,low,15
28,Usman,Melani,3504045809960001,Business,544,3102453,Retired,Divorced,1,Vice Governor,...,1959-04-24,63,"Jalan Sukajadi No. 7, MON JAMBEE, TONDANO BARA...",4,3,80761,no,high,medium,59
29,Damu,Thamrin,9113507001900009,Business,535,15425745,3-5 years,Widowed,4,Mayor,...,1995-06-22,27,"Jalan Rajiman No. 8, KARANG REJO, MUARA BENGKA...",12,12,24382,no,high,medium,62


In [41]:
df['risk_score'].value_counts()

49    5
93    5
81    4
72    3
36    3
     ..
37    1
38    1
39    1
1     1
96    1
Name: risk_score, Length: 66, dtype: int64

In [42]:
df['risk'].value_counts()

high      35
low       33
medium    32
Name: risk, dtype: int64

In [43]:
df[df['risk'] == 'low']

Unnamed: 0,first_name,last_name,NIK,customer_type,credit_score,income,employment_status,marital_status,number_of_dependent,occupation,...,date_of_birth,age,address,rt,rw,postal_code,address_match,criminal_rate_location,risk,risk_score
12,Bala,Gunawan,3528066701990003,Personal,409,22913187,Retired,Widowed,4,Farm laborer/gardener,...,1988-04-05,35,"Jl. Jakarta No. 0, JUA, MAGELANG TENGAH, SIMAL...",4,7,44163,no,high,low,30
13,Galuh,Anggriawan,3216185109800003,Business,503,48308087,Unemployed,Widowed,4,Lawyer,...,1993-02-14,30,"Gg. S. Parman No. 241, PARIPPUNG, DOLOK BATU N...",13,3,22465,no,high,low,29
15,Mursita,Hasanah,5302265906500004,Business,541,74817836,More than 5 years,Single,1,Journalist,...,1948-02-25,75,"Jalan S. Parman No. 327, SALEK/SALEH AGUNG (MA...",1,12,52193,no,high,low,12
17,Mulyanto,Damanik,3513226209900000,Business,300,80191718,Retired,Divorced,1,Member of the Constitutional Court,...,1955-10-12,67,"Gg. Rajawali Barat No. 2, AWILINAN, SUMBER, SU...",6,8,55851,no,high,low,5
20,Ade,Saputra,5203064211610003,Personal,503,72707504,1-2 years,Divorced,4,Event promoter,...,1983-06-12,39,"Gg. Ciumbuleuit No. 2, ANGGALOMELAI (ANGGOLOME...",1,8,56191,no,high,low,15
21,Teddy,Fujiati,7405256702810008,Business,456,6226821,1-2 years,Single,3,Farm laborer/gardener,...,1978-06-13,44,"Gang Setiabudhi No. 55, MANETWATI, TANJUNG SAK...",5,8,91966,no,high,low,30
22,Cayadi,Usada,7202014903710003,Business,460,103325559,More than 5 years,Married,5,Researcher,...,1969-02-28,54,"Gang Indragiri No. 641, MALOINGGEN, BATANG ONA...",7,5,66126,no,high,low,16
24,Perkasa,Adriansyah,7405035507860003,Business,301,40512554,3-5 years,Married,2,Mason,...,2001-08-12,21,"Gg. Pasir Koja No. 767, PALOAN, JARAI, PANDEGL...",5,4,64419,no,high,low,31
25,Cahyono,Kurniawan,8204095101620004,Business,300,69977889,Retired,Married,4,State-owned enterprise employee,...,1965-05-30,57,"Jl. Rungkut Industri No. 9, KAPU, DURENAN, BUO...",11,10,45652,no,high,low,9
26,Adinata,Safitri,6205064311730009,Personal,442,8691661,1-2 years,Single,1,Household caregiver,...,1974-03-05,49,"Gang Kendalsari No. 4, BELO, CISARUA, HALMAHER...",12,9,15326,no,high,low,1


# 7. Move to Excel

In [44]:
# Convert to Excel
# jca_data = df.to_excel('jca_datasets.xlsx', index=False)