<a href="https://colab.research.google.com/github/0alfajar/MachineLearningProject/blob/main/Credit_Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Dependencies

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

Collecting Data and Processing Data

In [18]:
# Load the raw credit-scoring workbook (path under /content, the Colab working directory).
df = pd.read_excel(io='/content/credit_scoring_dqlab.xlsx')

In [19]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,kode_kontrak,pendapatan_setahun_juta,kpr_aktif,durasi_pinjaman_bulan,jumlah_tanggungan,rata_rata_overdue,risk_rating
0,AGR-000001,295,YA,48,5,61 - 90 days,4
1,AGR-000011,271,YA,36,5,61 - 90 days,4
2,AGR-000030,159,TIDAK,12,0,0 - 30 days,1
3,AGR-000043,210,YA,12,3,46 - 60 days,3
4,AGR-000049,165,TIDAK,36,0,31 - 45 days,2


In [20]:
# Inspect column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   kode_kontrak             900 non-null    object
 1   pendapatan_setahun_juta  900 non-null    int64 
 2   kpr_aktif                900 non-null    object
 3   durasi_pinjaman_bulan    900 non-null    int64 
 4   jumlah_tanggungan        900 non-null    int64 
 5   rata_rata_overdue        900 non-null    object
 6   risk_rating              900 non-null    int64 
dtypes: int64(4), object(3)
memory usage: 49.3+ KB


In [21]:
# Show the overdue bucket next to the risk rating to compare the two columns.
print(df.loc[:, ['rata_rata_overdue', 'risk_rating']])

    rata_rata_overdue  risk_rating
0        61 - 90 days            4
1        61 - 90 days            4
2         0 - 30 days            1
3        46 - 60 days            3
4        31 - 45 days            2
..                ...          ...
895         > 90 days            5
896      46 - 60 days            3
897       0 - 30 days            1
898      31 - 45 days            2
899       0 - 30 days            1

[900 rows x 2 columns]


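Semakin lama overdue, semakin besar risk rating. Pada sampel di atas setiap rentang `rata_rata_overdue` selalu berpasangan dengan satu nilai `risk_rating`, sehingga keduanya membawa informasi yang sama; jadi cukup pilih salah satu.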

In [22]:
# Drop the columns that are not used as features.
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# the original inplace drop raised KeyError on a second run because the
# columns were already gone.
df = df.drop(columns=['kode_kontrak', 'rata_rata_overdue'], errors='ignore')

In [23]:
# Convert kpr_aktif from its 'YA' / 'TIDAK' labels to boolean.
# An explicit mapping is used instead of a bare .astype(bool): on an object
# column astype(bool) turns every non-empty string (and NaN) into True, so a
# typo or an unexpected label would silently become "has an active mortgage".
# True/False are mapped to themselves so the cell can be re-run after the
# column has already been converted.
kpr_mapping = {'YA': True, 'TIDAK': False, True: True, False: False}
kpr_flags = df['kpr_aktif'].map(kpr_mapping)

# Values missing from the mapping come back as NaN; fail loudly on them.
unmapped = kpr_flags.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'kpr_aktif'].unique()
    raise ValueError(f'Unexpected kpr_aktif values: {unexpected!r}')

df['kpr_aktif'] = kpr_flags.astype(bool)

In [24]:
# Re-check the schema after dropping columns and converting kpr_aktif.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   pendapatan_setahun_juta  900 non-null    int64
 1   kpr_aktif                900 non-null    bool 
 2   durasi_pinjaman_bulan    900 non-null    int64
 3   jumlah_tanggungan        900 non-null    int64
 4   risk_rating              900 non-null    int64
dtypes: bool(1), int64(4)
memory usage: 29.1 KB


Separate Features and Label

In [25]:
# Columns used as model inputs; risk_rating is the prediction target.
feature_column = [
    'pendapatan_setahun_juta',
    'kpr_aktif',
    'durasi_pinjaman_bulan',
    'jumlah_tanggungan',
]
X, y = df[feature_column], df['risk_rating']

Split into training and testing data

In [27]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Modelling

Hyperparameter tuning

In [30]:
# Search grid: candidate tree depths 1 through 10 (inclusive).
tunned_params = [{'max_depth': list(range(1, 11))}]

In [31]:
# Grid-search the tree depth with cross-validation, scored by macro-averaged F1.
# The tree is seeded because scikit-learn permutes the features randomly at
# each split; without a fixed random_state the CV scores (and possibly the
# selected max_depth) can differ from one run to the next.
model = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    tunned_params,
    scoring='f1_macro',
)

In [32]:
# Run the grid search on the training split only; the test split stays untouched.
model.fit(X_train, y_train)

Model Evaluation

In [34]:
# Report the cross-validated score (mean and standard deviation) for every
# parameter combination tried by the grid search.
print('Hasil nilai uji saat tunning : ')
means, stds, params = (
    model.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print(f'{mean} ({stdev}) with: {param}')

Hasil nilai uji saat tunning : 
0.291289872927024 (0.0018976964619631977) with: {'max_depth': 1}
0.6209947033582414 (0.029400651466515534) with: {'max_depth': 2}
0.7680884367658256 (0.019226809336145897) with: {'max_depth': 3}
0.8557177963398649 (0.02705749361746386) with: {'max_depth': 4}
0.8627437572982473 (0.027205205780398898) with: {'max_depth': 5}
0.8497846046146375 (0.031411567247156456) with: {'max_depth': 6}
0.8481234292488422 (0.009609212795887346) with: {'max_depth': 7}
0.8421229468600867 (0.023889915817817013) with: {'max_depth': 8}
0.8457685544613925 (0.01849623578039975) with: {'max_depth': 9}
0.835677067742654 (0.0188553391682634) with: {'max_depth': 10}


In [35]:
# Best parameter combination found by the grid search.
model.best_params_

{'max_depth': 5}

In [36]:
# Best model: the estimator fitted with the best parameters.
model.best_estimator_

In [38]:
# Predict with the best estimator from the grid search, then measure accuracy
# on the training and testing splits.
best_tree = model.best_estimator_
train_pred, test_pred = best_tree.predict(X_train), best_tree.predict(X_test)
train_accuracy = metrics.accuracy_score(y_train, train_pred)
test_accuracy = metrics.accuracy_score(y_test, test_pred)

In [40]:
# Print training and testing accuracy, one per line.
print(
    f'Akurasi data training : {train_accuracy}',
    f'Akurasi data testing : {test_accuracy}',
    sep='\n',
)

Akurasi data training : 0.9013888888888889
Akurasi data testing : 0.85
