In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

#Load dataset
file_path = r"/bank_credit_scoring.csv"
df = pd.read_csv(file_path)

print("Dataset awal:")
print(df.head())

Dataset awal:
   Задолженность  Просрочка, дни  Первоначльный лимит   BIRTHDATE      SEX  \
0        6063.50               3               7000.0  1983-07-08  Мужской   
1        3765.04               0               5000.0  1987-12-19  Женский   
2        2067.66               0               2650.0  1966-02-28  Женский   
3        2370.39               0               3000.0  1972-02-18  Женский   
4        2280.55               0               3000.0  1997-02-23  Женский   

                   EDU   INCOME  TERM Рейтинг кредитной истории  \
0  Среднее специальное   703.07    60                        A1   
1               Высшее  1693.68    60                        B2   
2              Среднее   724.49    60                        C2   
3  Среднее специальное  1045.84    60                        A3   
4               Высшее  1092.65    60                        B3   

              LV_AREA LV_SETTLEMENTNAME                INDUSTRYNAME   PDN  \
0  Гомельская область         КОСАКОВ

In [4]:
#Preprocessing Data
#Menghapus kolom yang tidak relevan (contoh: CLIENTID)
df_preprocessed = df.drop(columns=['CLIENTID'], errors='ignore')

#Mengubah BIRTHDATE menjadi tahun kelahiran untuk analisis lebih lanjut
df_preprocessed['BIRTHYEAR'] = pd.to_datetime(df_preprocessed['BIRTHDATE']).dt.year
df_preprocessed = df_preprocessed.drop(columns=['BIRTHDATE'])

#Memastikan tipe data kolom numerik
numeric_cols = df_preprocessed.select_dtypes(include=['number']).columns
df_preprocessed[numeric_cols] = df_preprocessed[numeric_cols].astype(float)

#Menampilkan data setelah preprocessing
print("\nData setelah preprocessing:")
print(df_preprocessed.head())


Data setelah preprocessing:
   Задолженность  Просрочка, дни  Первоначльный лимит      SEX  \
0        6063.50             3.0               7000.0  Мужской   
1        3765.04             0.0               5000.0  Женский   
2        2067.66             0.0               2650.0  Женский   
3        2370.39             0.0               3000.0  Женский   
4        2280.55             0.0               3000.0  Женский   

                   EDU   INCOME  TERM Рейтинг кредитной истории  \
0  Среднее специальное   703.07  60.0                        A1   
1               Высшее  1693.68  60.0                        B2   
2              Среднее   724.49  60.0                        C2   
3  Среднее специальное  1045.84  60.0                        A3   
4               Высшее  1092.65  60.0                        B3   

              LV_AREA LV_SETTLEMENTNAME                INDUSTRYNAME   PDN  \
0  Гомельская область         КОСАКОВКА                         АПК  0.98   
1                

In [8]:
#Menangani Missing Data
#Memeriksa missing values sebelum penanganan
print("\nMissing values sebelum penanganan:")
print(df_preprocessed.isnull().sum())

#Menghapus baris dengan missing values
df_cleaned = df_preprocessed.dropna()

#Memeriksa missing values setelah penanganan
print("\nMissing values setelah penanganan:")
print(df_cleaned.isnull().sum())


Missing values sebelum penanganan:
Задолженность                   0
Просрочка, дни                  0
Первоначльный лимит             0
SEX                             0
EDU                             0
INCOME                          0
TERM                            0
Рейтинг кредитной истории     932
LV_AREA                      2123
LV_SETTLEMENTNAME              46
INDUSTRYNAME                    0
PDN                             0
SCORINGMARK                  1634
UNDERAGECHILDRENCOUNT           0
VELCOMSCORING                9531
FAMILYSTATUS                    0
BIRTHYEAR                       0
dtype: int64

Missing values setelah penanganan:
Задолженность                0
Просрочка, дни               0
Первоначльный лимит          0
SEX                          0
EDU                          0
INCOME                       0
TERM                         0
Рейтинг кредитной истории    0
LV_AREA                      0
LV_SETTLEMENTNAME            0
INDUSTRYNAME               

In [9]:
#Normalisasi
from sklearn.preprocessing import MinMaxScaler

#Memilih kolom numerik
numeric_cols = df_cleaned.select_dtypes(include=['number']).columns

#Normalisasi menggunakan MinMaxScaler
scaler_minmax = MinMaxScaler()
df_normalized = df_cleaned.copy()
df_normalized[numeric_cols] = scaler_minmax.fit_transform(df_cleaned[numeric_cols])

#Menampilkan data setelah normalisasi
print("\nData setelah normalisasi:")
print(df_normalized.head())


Data setelah normalisasi:
      Задолженность  Просрочка, дни  Первоначльный лимит      SEX  \
8536       0.066892             0.0             0.051282  Мужской   
8546       0.341143             0.0             0.366667  Мужской   
8601       0.164983             0.0             0.166667  Мужской   
8611       0.044593             0.0             0.025641  Мужской   
8626       0.022297             0.0             0.000000  Мужской   

                      EDU    INCOME  TERM Рейтинг кредитной истории  \
8536              Среднее  0.094686   1.0                        E1   
8546              Среднее  0.068303   1.0                        D2   
8601  Среднее специальное  0.062197   1.0                        B3   
8611  Среднее специальное  0.018609   1.0                        D3   
8626              Среднее  0.016725   1.0                        C3   

                LV_AREA LV_SETTLEMENTNAME         INDUSTRYNAME       PDN  \
8536    Минская область           БОРИСОВ         Произ

In [10]:
#Standarisasi
from sklearn.preprocessing import StandardScaler

#Standarisasi menggunakan StandardScaler
scaler_standard = StandardScaler()
df_standardized = df_normalized.copy()
df_standardized[numeric_cols] = scaler_standard.fit_transform(df_normalized[numeric_cols])

#Menampilkan data setelah standarisasi
print("\nData setelah standarisasi:")
print(df_standardized.head())


Data setelah standarisasi:
      Задолженность  Просрочка, дни  Первоначльный лимит      SEX  \
8536      -0.410850       -0.205966            -0.419549  Мужской   
8546       1.966777       -0.205966             2.048951  Мужской   
8601       0.439556       -0.205966             0.483561  Мужской   
8611      -0.604170       -0.205966            -0.620241  Мужской   
8626      -0.797470       -0.205966            -0.820932  Мужской   

                      EDU    INCOME      TERM Рейтинг кредитной истории  \
8536              Среднее  1.390066  0.557622                        E1   
8546              Среднее  0.674402  0.557622                        D2   
8601  Среднее специальное  0.508778  0.557622                        B3   
8611  Среднее специальное -0.673602  0.557622                        D3   
8626              Среднее -0.724688  0.557622                        C3   

                LV_AREA LV_SETTLEMENTNAME         INDUSTRYNAME       PDN  \
8536    Минская область       

In [11]:
#Verifikasi Akhir
#Memeriksa missing values
missing_values = df_standardized.isnull().sum()
print("\nMissing values akhir:")
print(missing_values[missing_values > 0])

#Memeriksa duplikat
duplicates = df_standardized.duplicated().sum()
print(f"\nJumlah data duplikat: {duplicates}")


Missing values akhir:
Series([], dtype: int64)

Jumlah data duplikat: 0


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# Memuat dataset
file_path = r"/bank_credit_scoring.csv"    # Pastikan nama file sesuai dengan yang diunggah
df = pd.read_csv(file_path)

# Menghapus kolom yang tidak relevan
df = df.drop(columns=['CLIENTID'], errors='ignore')

# Mengubah BIRTHDATE menjadi BIRTHYEAR (jika ada)
if 'BIRTHDATE' in df.columns:
    df['BIRTHYEAR'] = pd.to_datetime(df['BIRTHDATE']).dt.year
    df = df.drop(columns=['BIRTHDATE'])

# Menangani missing values
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Encoding kolom kategorikal
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Memisahkan fitur (X) dan target (y)
X = df.drop(columns=['FAMILYSTATUS'])  # Ganti 'FAMILYSTATUS' sesuai kolom target Anda
y = df['FAMILYSTATUS']

# Normalisasi dan standarisasi
numeric_cols_X = X.select_dtypes(include=['float64', 'int64']).columns
scaler_minmax = MinMaxScaler()
X[numeric_cols_X] = scaler_minmax.fit_transform(X[numeric_cols_X])
scaler_standard = StandardScaler()
X[numeric_cols_X] = scaler_standard.fit_transform(X[numeric_cols_X])

# Membagi data menjadi train dan test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data siap untuk pelatihan.")

Data siap untuk pelatihan.


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
import pandas as pd

# Inisialisasi model
rf_model = RandomForestClassifier(random_state=42)
nb_model = GaussianNB()

# Melatih Random Forest dan mengukur waktu
start_time = time.time()
rf_model.fit(X_train, y_train)
rf_time = time.time() - start_time
rf_pred = rf_model.predict(X_test)

# Melatih Naive Bayes dan mengukur waktu
start_time = time.time()
nb_model.fit(X_train, y_train)
nb_time = time.time() - start_time
nb_pred = nb_model.predict(X_test)

# Menghitung metrik evaluasi
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_precision = precision_score(y_test, rf_pred, average='weighted', zero_division=0)
rf_recall = recall_score(y_test, rf_pred, average='weighted', zero_division=0)
rf_f1 = f1_score(y_test, rf_pred, average='weighted', zero_division=0)

nb_accuracy = accuracy_score(y_test, nb_pred)
nb_precision = precision_score(y_test, nb_pred, average='weighted', zero_division=0)
nb_recall = recall_score(y_test, nb_pred, average='weighted', zero_division=0)
nb_f1 = f1_score(y_test, nb_pred, average='weighted', zero_division=0)

# Menyusun hasil dalam tabel
results = {
    'Model': ['Random Forest', 'Naive Bayes'],
    'Accuracy': [rf_accuracy, nb_accuracy],
    'Precision': [rf_precision, nb_precision],
    'Recall': [rf_recall, nb_recall],
    'F1-Score': [rf_f1, nb_f1],
    'Waktu (detik)': [rf_time, nb_time]
}
results_df = pd.DataFrame(results)

print("\nHasil Perbandingan Model:")
print(results_df)


Hasil Perbandingan Model:
           Model  Accuracy  Precision    Recall  F1-Score  Waktu (detik)
0  Random Forest  0.764929   0.764323  0.764929  0.764624       4.091902
1    Naive Bayes  0.646851   0.676329  0.646851  0.655620       0.014561
