In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn import over_sampling
from sklearn.metrics import roc_curve,auc, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from prettytable import PrettyTable
from bayes_opt import BayesianOptimization


pd.set_option('display.max_rows', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
df_final_1 = pd.read_csv('df_final_1_v1.1.csv.gz')
df_final_2 = pd.read_csv('df_final_2_v1.1.csv.gz')

# Scenario 1

In this dataset, all missing values (after merging) are removed 

# Train Test Split

In [3]:
X = df_final_1.drop(columns=['TARGET'])
y = df_final_1['TARGET']

In [4]:
y.value_counts() / len(y)*100

TARGET
0.0    92.353584
1.0     7.646416
Name: count, dtype: float64

The label TARGET exhibits extreme class imbalance.

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Dimension of X_train : ', X_train.shape)
print(f'Dimensi of X_test    : ', X_test.shape)
print(f'Dimensi of y_train   : ', y_train.shape)
print(f'Dimensi of y_test    : ', y_test.shape)

Dimension of X_train :  (100972, 178)
Dimensi of X_test    :  (25244, 178)
Dimensi of y_train   :  (100972,)
Dimensi of y_test    :  (25244,)


# Handling Class Imbalance

Class imbalance needs to be addressed; otherwise, the model will be biased toward the majority class if left unhandled.

In [6]:
# Synthetic Minority Oversampling Technique (SMOTE)
X_train_over, y_train_over = over_sampling.SMOTE(sampling_strategy=1).fit_resample(X_train, y_train)

Synthetic Minority Oversampling Technique (SMOTE) is an effective technique for mitigating the effects of extreme class imbalance, leading to more robust and accurate predictive models. SMOTE generates synthetic samples for the minority class, helping to balance the class distribution. This can improve the model's ability to learn from the underrepresented class.

# Standardization

Certain algorithms are sensitive to outliers and varying data scales, which can lead to misinterpretations of the data. Algorithms that depend on distance calculations, such as k-nearest neighbors (KNN), support vector machines (SVM), and logistic regression, should operate on a common scale.

In [7]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_over)
X_test_scaled = scaler.transform(X_test)

Now we have **X_train_over**, **X_train_scaled**, **X_test**, **X_test_scaled**, **y_train_over**, and **y_test** that will be used in the tested algorithms.

# Def function for Model

In [8]:
# Untuk evaluasi model 

def model_evaluation(y_true, y_pred, y_pred_proba, model_name):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    f2 = fbeta_score(y_true, y_pred, beta=2.0)
    auc = roc_auc_score(y_true, y_pred_proba[:, 1])
    
    results = pd.DataFrame([[model_name, acc, prec, rec, f1, f2, auc]],
                       columns=["Model", "Accuracy", "Precision", "Recall",
                                "F1 Score", "F2 Score", "roc_auc_score"])
    return results

In [9]:
# Untuk cek overfitting vs underfitting atau best fit
def check_model_fit(classifier, X_train, X_val, y_train, y_val, model_name="Model"):
    """
    Membandingkan performa model pada data training dan validation
    untuk mendeteksi overfitting/underfitting
    """
    # Prediksi pada data training
    y_train_pred = classifier.predict(X_train)
    y_train_pred_proba = classifier.predict_proba(X_train)
    
    # Prediksi pada data validation
    y_val_pred = classifier.predict(X_val)
    y_val_pred_proba = classifier.predict_proba(X_val)
    
    # Metrics untuk training
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_precision = precision_score(y_train, y_train_pred)
    train_recall = recall_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred)
    train_f2 = fbeta_score(y_train, y_train_pred, beta=2.0)
    train_roc_auc = roc_auc_score(y_train, y_train_pred_proba[:, 1])

    # Metrics untuk validation
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_precision = precision_score(y_val, y_val_pred)
    val_recall = recall_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    val_f2 = fbeta_score(y_val, y_val_pred, beta=2.0)
    val_roc_auc = roc_auc_score(y_val, y_val_pred_proba[:, 1])
    

    # Membuat DataFrame perbandingan
    comparison = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1', 'F2', 'ROC AUC'],
        'Training': [train_accuracy, train_precision, train_recall, train_f1, train_f2, train_roc_auc],
        'Validation': [val_accuracy, val_precision, val_recall, val_f1, val_f2, val_roc_auc],
        'Difference': [train_accuracy - val_accuracy, 
                      train_precision - val_precision,
                      train_recall - val_recall,
                      train_f1 - val_f1,
                      train_f2 - val_f2,
                      train_roc_auc - val_roc_auc]
    })
    
    print(f"\nModel Fit Analysis for {model_name}:")
    print(comparison.round(4))
    
    # Analisis overfitting/underfitting
    avg_diff = comparison['Difference'].mean()
    
    print("\nAnalisis:")
    if (train_roc_auc - val_roc_auc) > 0.01:  # Threshold bisa disesuaikan
        print("⚠️ Model menunjukkan tanda OVERFITTING")
        print(f"    selisih metrics training-validation: {(train_roc_auc - val_roc_auc):.4f}")
        print("   - Model terlalu 'menghapal' data training")
        print("   - Performa di validation jauh lebih rendah")
    elif (train_roc_auc - val_roc_auc) < 0.01:  # Threshold bisa disesuaikan
        print("⚠️ Model menunjukkan tanda UNDERFITTING")
        print(f"   ROC-AUC training: {train_roc_auc:.4f}")
        print("   - Model terlalu simpel")
        print("   - Performa rendah bahkan di data training")
    else:
        print("✅ Model memiliki fit yang baik")
        print(f"   Rata-rata selisih metrics: {(train_roc_auc - val_roc_auc):.4f}")
        print("   - Performa seimbang antara training dan validation")



## Logistic Regression

In [10]:
classifier_logreg = LogisticRegression(class_weight='balanced', random_state=42)
classifier_logreg.fit(X_train_scaled, y_train_over)
y_pred = classifier_logreg.predict(X_test_scaled)
y_pred_proba = classifier_logreg.predict_proba(X_test_scaled)

# Evaluasi
results_logreg = model_evaluation(y_test, y_pred, y_pred_proba, "Logistic Regression")
results_logreg

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Logistic Regression,0.697354,0.155711,0.668566,0.252592,0.40306,0.751725


In [11]:
check_model_fit(
    classifier_logreg,
    X_train_scaled,
    X_test_scaled,
    y_train_over,
    y_test,
    "Logistic Regression"
)


Model Fit Analysis for Logistic Regression:
      Metric  Training  Validation  Difference
0   Accuracy    0.7085      0.6974      0.0112
1  Precision    0.7069      0.1557      0.5512
2     Recall    0.7124      0.6686      0.0439
3         F1    0.7097      0.2526      0.4571
4         F2    0.7113      0.4031      0.3083
5    ROC AUC    0.7764      0.7517      0.0247

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.0247
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## Decision Tree

In [12]:
classifier_dt = DecisionTreeClassifier()
classifier_dt.fit(X_train_over, y_train_over)
y_pred = classifier_dt.predict(X_test)
y_pred_proba = classifier_dt.predict_proba(X_test)

# Evaluasi
classifier_dt = model_evaluation(y_test, y_pred, y_pred_proba, "Decision Tree")
classifier_dt

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Decision Tree,0.849271,0.135409,0.180218,0.154632,0.169031,0.542453


In [13]:
classifier_dt = DecisionTreeClassifier()
classifier_dt.fit(X_train_over, y_train_over)
y_pred = classifier_dt.predict(X_test)
y_pred_proba = classifier_dt.predict_proba(X_test)
check_model_fit(
    classifier_dt,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "Decision Tree"
)


Model Fit Analysis for Decision Tree:
      Metric  Training  Validation  Difference
0   Accuracy       1.0      0.8477      0.1523
1  Precision       1.0      0.1328      0.8672
2     Recall       1.0      0.1792      0.8208
3         F1       1.0      0.1526      0.8474
4         F2       1.0      0.1675      0.8325
5    ROC AUC       1.0      0.5411      0.4589

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.4589
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## Random Forest

In [14]:
classifier_rf = RandomForestClassifier()
classifier_rf.fit(X_train_over, y_train_over)
y_pred = classifier_rf.predict(X_test)
y_pred_proba = classifier_rf.predict_proba(X_test)

# Evaluasi
classifier_rf = model_evaluation(y_test, y_pred, y_pred_proba, "Random Forest")
classifier_rf

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Random Forest,0.923625,0.615385,0.004143,0.00823,0.00517,0.712299


In [15]:
classifier_rf = RandomForestClassifier()
classifier_rf.fit(X_train_over, y_train_over)
y_pred = classifier_rf.predict(X_test)
y_pred_proba = classifier_rf.predict_proba(X_test)

check_model_fit(
    classifier_rf,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "Random Forest"
)


Model Fit Analysis for Random Forest:
      Metric  Training  Validation  Difference
0   Accuracy       1.0      0.9234      0.0766
1  Precision       1.0      0.4000      0.6000
2     Recall       1.0      0.0021      0.9979
3         F1       1.0      0.0041      0.9959
4         F2       1.0      0.0026      0.9974
5    ROC AUC       1.0      0.7127      0.2873

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2873
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## Light GBM (Gradient Boosting Machines)

In [16]:
X_train_lgbm = X_train_over.copy()
X_test_lgbm = X_test.copy()

X_train_lgbm.columns = X_train_lgbm.columns.str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '', regex=True)
X_test_lgbm.columns = X_test_lgbm.columns.str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '', regex=True)

classifier_lgbm = LGBMClassifier()
classifier_lgbm.fit(X_train_lgbm, y_train_over)
y_pred = classifier_lgbm.predict(X_test_lgbm)
y_pred_proba = classifier_lgbm.predict_proba(X_test_lgbm)

# Evaluasi
classifier_lgbm = model_evaluation(y_test, y_pred, y_pred_proba, "Light GBM")
classifier_lgbm

[LightGBM] [Info] Number of positive: 93252, number of negative: 93252
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 43928
[LightGBM] [Info] Number of data points in the train set: 186504, number of used features: 175
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Light GBM,0.92311,0.439024,0.018643,0.035768,0.023059,0.75513


In [17]:
classifier_lgbm = LGBMClassifier()
classifier_lgbm.fit(X_train_lgbm, y_train_over)
y_pred = classifier_lgbm.predict(X_test_lgbm)
y_pred_proba = classifier_lgbm.predict_proba(X_test_lgbm)

check_model_fit(
    classifier_lgbm,
    X_train_lgbm,
    X_test_lgbm,
    y_train_over,
    y_test,
    "Light GBM"
)

[LightGBM] [Info] Number of positive: 93252, number of negative: 93252
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43928
[LightGBM] [Info] Number of data points in the train set: 186504, number of used features: 175
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Model Fit Analysis for Light GBM:
      Metric  Training  Validation  Difference
0   Accuracy    0.9592      0.9231      0.0361
1  Precision    0.9990      0.4390      0.5600
2     Recall    0.9194      0.0186      0.9007
3         F1    0.9575      0.0358      0.9218
4         F2    0.9343      0.0231      0.9112
5    ROC AUC    0.9848      0.7551      0.2297

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2297
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## Ada Boost

In [18]:
classifier_ab = AdaBoostClassifier()
classifier_ab.fit(X_train_over, y_train_over)
y_pred = classifier_ab.predict(X_test)
y_pred_proba = classifier_ab.predict_proba(X_test)

# Evaluasi
classifier_ab = model_evaluation(y_test, y_pred, y_pred_proba, "Ada Boost")
classifier_ab

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Ada Boost,0.908453,0.266585,0.112377,0.158106,0.127079,0.68279


In [19]:
classifier_ab = AdaBoostClassifier()
classifier_ab.fit(X_train_over, y_train_over)
y_pred = classifier_ab.predict(X_test)
y_pred_proba = classifier_ab.predict_proba(X_test)

check_model_fit(
    classifier_ab,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "Ada Boost"
)


Model Fit Analysis for Ada Boost:
      Metric  Training  Validation  Difference
0   Accuracy    0.9354      0.9085      0.0270
1  Precision    0.9697      0.2666      0.7031
2     Recall    0.8990      0.1124      0.7866
3         F1    0.9330      0.1581      0.7749
4         F2    0.9123      0.1271      0.7852
5    ROC AUC    0.9701      0.6828      0.2873

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2873
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## XGBoost

In [20]:
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X_train_over, y_train_over)
y_pred = classifier_xgb.predict(X_test)
y_pred_proba = classifier_xgb.predict_proba(X_test)

# Evaluasi
classifier_xgb = model_evaluation(y_test, y_pred, y_pred_proba, "XGB Boost")
classifier_xgb

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,XGB Boost,0.921922,0.402913,0.042983,0.077679,0.052333,0.736213


In [21]:
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X_train_over, y_train_over)
y_pred = classifier_xgb.predict(X_test)
y_pred_proba = classifier_xgb.predict_proba(X_test)

check_model_fit(
    classifier_xgb,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "XGB Boost"
)


Model Fit Analysis for XGB Boost:
      Metric  Training  Validation  Difference
0   Accuracy    0.9648      0.9219      0.0429
1  Precision    0.9990      0.4029      0.5961
2     Recall    0.9306      0.0430      0.8877
3         F1    0.9636      0.0777      0.8859
4         F2    0.9435      0.0523      0.8912
5    ROC AUC    0.9924      0.7362      0.2562

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2562
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


# Hyperparameter Tuning

## Light GBM

In [22]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score

# Define the parameter space
param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'num_leaves': randint(20, 100),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'min_child_samples': randint(1, 50),
    'class_weight': ['balanced', {0: 1, 1: 11.3}]
}

# Create base model
model = LGBMClassifier(random_state=42)

# Setup RandomizedSearchCV with ROC AUC scoring
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='roc_auc',  # Use ROC AUC for optimization
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search.fit(X_train_over, y_train_over)

# Print best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best CV ROC AUC score:", random_search.best_score_)

# Get best model
best_model = random_search.best_estimator_

# Calculate metrics for both train and test sets
def get_metrics(model, X, y):
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]
    
    return {
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'roc_auc': roc_auc_score(y, y_pred_proba)
    }

# Get metrics for both sets
train_metrics = get_metrics(best_model, X_train_over, y_train_over)
test_metrics = get_metrics(best_model, X_test, y_test)

# Print metrics comparison
print("\nMetrics Comparison:")
print(f"{'Metric':<15} {'Train':>10} {'Test':>10} {'Difference':>10}")
print("-" * 45)
for metric in train_metrics.keys():
    diff = train_metrics[metric] - test_metrics[metric]
    print(f"{metric:<15} {train_metrics[metric]:>10.3f} {test_metrics[metric]:>10.3f} {diff:>10.3f}")

# Print detailed classification report for test set
print("\nDetailed Classification Report on Test Set:")
print(classification_report(y_test, best_model.predict(X_test)))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 93252, number of negative: 93252
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43917
[LightGBM] [Info] Number of data points in the train set: 186504, number of used features: 174
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best parameters: {'class_weight': 'balanced', 'colsample_bytree': np.float64(0.6557975442608167), 'learning_rate': np.float64(0.09764339456056544), 'max_depth': 9, 'min_child_samples': 47, 'n_estimators': 150, 'num_leaves': 74, 'subsample': np.float64(0.9932923543227152)}
Best CV ROC AUC score: 0.9732743473208721

Metrics Comparison:
Metric               Train       Test Difference
---------------------------------------------
accuracy             0.964      0.923      0.041
precision        

In [23]:
classifier_lgbm = LGBMClassifier()
classifier_lgbm.fit(X_train_lgbm, y_train_over)
y_pred = classifier_lgbm.predict(X_test_lgbm)
y_pred_proba = classifier_lgbm.predict_proba(X_test_lgbm)

check_model_fit(
    classifier_lgbm,
    X_train_lgbm,
    X_test_lgbm,
    y_train_over,
    y_test,
    "Light GBM"
)

[LightGBM] [Info] Number of positive: 93252, number of negative: 93252
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43928
[LightGBM] [Info] Number of data points in the train set: 186504, number of used features: 175
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Model Fit Analysis for Light GBM:
      Metric  Training  Validation  Difference
0   Accuracy    0.9592      0.9231      0.0361
1  Precision    0.9990      0.4390      0.5600
2     Recall    0.9194      0.0186      0.9007
3         F1    0.9575      0.0358      0.9218
4         F2    0.9343      0.0231      0.9112
5    ROC AUC    0.9848      0.7551      0.2297

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2297
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


# Scenario 2

In this dataset, all missing values (after merging) are filled with median

# Train Test Split

In [24]:
X = df_final_2.drop(columns=['TARGET'])
y = df_final_2['TARGET']

In [25]:
y.value_counts() / len(y)*100

TARGET
0.0    91.908901
1.0     8.091099
Name: count, dtype: float64

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Dimension of X_train : ', X_train.shape)
print(f'Dimensi of X_test    : ', X_test.shape)
print(f'Dimensi of y_train   : ', y_train.shape)
print(f'Dimensi of y_test    : ', y_test.shape)

Dimension of X_train :  (243180, 178)
Dimensi of X_test    :  (60796, 178)
Dimensi of y_train   :  (243180,)
Dimensi of y_test    :  (60796,)


# Handling Class Imbalance

In [27]:
# Synthetic Minority Oversampling Technique (SMOTE)
X_train_over, y_train_over = over_sampling.SMOTE(sampling_strategy=1).fit_resample(X_train, y_train)

# Standardization

In [28]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_over)
X_test_scaled = scaler.transform(X_test)

Now we have **X_train_over**, **X_train_scaled**, **X_test**, **X_test_scaled**, **y_train_over**, and **y_test** that will be used in the tested algorithms.

## Logistic Regression

In [29]:
classifier_logreg = LogisticRegression(class_weight='balanced', random_state=42)
classifier_logreg.fit(X_train_scaled, y_train_over)
y_pred = classifier_logreg.predict(X_test_scaled)
y_pred_proba = classifier_logreg.predict_proba(X_test_scaled)

# Evaluasi
results_logreg = model_evaluation(y_test, y_pred, y_pred_proba, "Logistic Regression")
results_logreg

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Logistic Regression,0.698566,0.165778,0.676704,0.266314,0.418649,0.752318


In [30]:
check_model_fit(
    classifier_logreg,
    X_train_scaled,
    X_test_scaled,
    y_train_over,
    y_test,
    "Logistic Regression"
)


Model Fit Analysis for Logistic Regression:
      Metric  Training  Validation  Difference
0   Accuracy    0.7072      0.6986      0.0086
1  Precision    0.7041      0.1658      0.5384
2     Recall    0.7147      0.6767      0.0380
3         F1    0.7094      0.2663      0.4431
4         F2    0.7126      0.4186      0.2939
5    ROC AUC    0.7739      0.7523      0.0216

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.0216
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## Decision Tree

In [31]:
classifier_dt = DecisionTreeClassifier()
classifier_dt.fit(X_train_over, y_train_over)
y_pred = classifier_dt.predict(X_test)
y_pred_proba = classifier_dt.predict_proba(X_test)

# Evaluasi
classifier_dt = model_evaluation(y_test, y_pred, y_pred_proba, "Decision Tree")
classifier_dt

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Decision Tree,0.842901,0.135764,0.175788,0.153205,0.166001,0.538682


In [32]:
classifier_dt = DecisionTreeClassifier()
classifier_dt.fit(X_train_over, y_train_over)
y_pred = classifier_dt.predict(X_test)
y_pred_proba = classifier_dt.predict_proba(X_test)
check_model_fit(
    classifier_dt,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "Decision Tree"
)


Model Fit Analysis for Decision Tree:
      Metric  Training  Validation  Difference
0   Accuracy       1.0      0.8428      0.1572
1  Precision       1.0      0.1382      0.8618
2     Recall       1.0      0.1805      0.8195
3         F1       1.0      0.1566      0.8434
4         F2       1.0      0.1701      0.8299
5    ROC AUC       1.0      0.5408      0.4592

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.4592
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## Random Forest

In [33]:
classifier_rf = RandomForestClassifier()
classifier_rf.fit(X_train_over, y_train_over)
y_pred = classifier_rf.predict(X_test)
y_pred_proba = classifier_rf.predict_proba(X_test)

# Evaluasi
classifier_rf = model_evaluation(y_test, y_pred, y_pred_proba, "Random Forest")
classifier_rf

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Random Forest,0.919255,0.6,0.003662,0.00728,0.004571,0.721932


In [34]:
classifier_rf = RandomForestClassifier()
classifier_rf.fit(X_train_over, y_train_over)
y_pred = classifier_rf.predict(X_test)
y_pred_proba = classifier_rf.predict_proba(X_test)

check_model_fit(
    classifier_rf,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "Random Forest"
)


Model Fit Analysis for Random Forest:
      Metric  Training  Validation  Difference
0   Accuracy       1.0      0.9192      0.0808
1  Precision       1.0      0.5806      0.4194
2     Recall       1.0      0.0037      0.9963
3         F1       1.0      0.0073      0.9927
4         F2       1.0      0.0046      0.9954
5    ROC AUC       1.0      0.7224      0.2776

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2776
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## Light GBM (Gradient Boosting Machines)

In [35]:
X_train_lgbm = X_train_over.copy()
X_test_lgbm = X_test.copy()

X_train_lgbm.columns = X_train_lgbm.columns.str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '', regex=True)
X_test_lgbm.columns = X_test_lgbm.columns.str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '', regex=True)

classifier_lgbm = LGBMClassifier()
classifier_lgbm.fit(X_train_lgbm, y_train_over)
y_pred = classifier_lgbm.predict(X_test_lgbm)
y_pred_proba = classifier_lgbm.predict_proba(X_test_lgbm)

# Evaluasi
classifier_lgbm = model_evaluation(y_test, y_pred, y_pred_proba, "Light GBM")
classifier_lgbm

[LightGBM] [Info] Number of positive: 223500, number of negative: 223500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 44142
[LightGBM] [Info] Number of data points in the train set: 447000, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Light GBM,0.918876,0.458537,0.019125,0.036719,0.02366,0.760155


In [36]:
classifier_lgbm = LGBMClassifier()
classifier_lgbm.fit(X_train_lgbm, y_train_over)
y_pred = classifier_lgbm.predict(X_test_lgbm)
y_pred_proba = classifier_lgbm.predict_proba(X_test_lgbm)

check_model_fit(
    classifier_lgbm,
    X_train_lgbm,
    X_test_lgbm,
    y_train_over,
    y_test,
    "Light GBM"
)

[LightGBM] [Info] Number of positive: 223500, number of negative: 223500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 44142
[LightGBM] [Info] Number of data points in the train set: 447000, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Model Fit Analysis for Light GBM:
      Metric  Training  Validation  Difference
0   Accuracy    0.9554      0.9189      0.0365
1  Precision    0.9982      0.4585      0.5397
2     Recall    0.9125      0.0191      0.8933
3         F1    0.9534      0.0367      0.9167
4         F2    0.9284      0.0237      0.9047
5    ROC AUC    0.9808      0.7602      0.2207

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2207
   - Model terlalu 'menghapal'

## Ada Boost

In [37]:
classifier_ab = AdaBoostClassifier()
classifier_ab.fit(X_train_over, y_train_over)
y_pred = classifier_ab.predict(X_test)
y_pred_proba = classifier_ab.predict_proba(X_test)

# Evaluasi
classifier_ab = model_evaluation(y_test, y_pred, y_pred_proba, "Ada Boost")
classifier_ab

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,Ada Boost,0.900717,0.239181,0.104578,0.145527,0.117841,0.685228


In [38]:
classifier_ab = AdaBoostClassifier()
classifier_ab.fit(X_train_over, y_train_over)
y_pred = classifier_ab.predict(X_test)
y_pred_proba = classifier_ab.predict_proba(X_test)

check_model_fit(
    classifier_ab,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "Ada Boost"
)


Model Fit Analysis for Ada Boost:
      Metric  Training  Validation  Difference
0   Accuracy    0.9290      0.9007      0.0283
1  Precision    0.9668      0.2392      0.7276
2     Recall    0.8885      0.1046      0.7839
3         F1    0.9260      0.1455      0.7805
4         F2    0.9031      0.1178      0.7853
5    ROC AUC    0.9669      0.6852      0.2816

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2816
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


## XGBoost

In [39]:
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X_train_over, y_train_over)
y_pred = classifier_xgb.predict(X_test)
y_pred_proba = classifier_xgb.predict_proba(X_test)

# Evaluasi
classifier_xgb = model_evaluation(y_test, y_pred, y_pred_proba, "XGB Boost")
classifier_xgb

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,F2 Score,roc_auc_score
0,XGB Boost,0.918613,0.466119,0.046185,0.084043,0.056336,0.75276


In [40]:
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X_train_over, y_train_over)
y_pred = classifier_xgb.predict(X_test)
y_pred_proba = classifier_xgb.predict_proba(X_test)

check_model_fit(
    classifier_xgb,
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    "XGB Boost"
)


Model Fit Analysis for XGB Boost:
      Metric  Training  Validation  Difference
0   Accuracy    0.9586      0.9186      0.0400
1  Precision    0.9978      0.4661      0.5317
2     Recall    0.9193      0.0462      0.8731
3         F1    0.9570      0.0840      0.8729
4         F2    0.9340      0.0563      0.8777
5    ROC AUC    0.9868      0.7528      0.2341

Analisis:
⚠️ Model menunjukkan tanda OVERFITTING
    selisih metrics training-validation: 0.2341
   - Model terlalu 'menghapal' data training
   - Performa di validation jauh lebih rendah


# Hypeparameter Tuning

## Light GBM

In [41]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score

# Define the parameter space
param_dist = {
    'n_estimators': randint(100, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'num_leaves': randint(20, 100),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'min_child_samples': randint(1, 50),
    'class_weight': ['balanced', {0: 1, 1: 11.3}]
}

# Create base model
model = LGBMClassifier(random_state=42)

# Setup RandomizedSearchCV with ROC AUC scoring
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring='roc_auc',  # Use ROC AUC for optimization
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
random_search.fit(X_train_over, y_train_over)

# Print best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best CV ROC AUC score:", random_search.best_score_)

# Get best model
best_model = random_search.best_estimator_

# Calculate metrics for both train and test sets
def get_metrics(model, X, y):
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]
    
    return {
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'roc_auc': roc_auc_score(y, y_pred_proba)
    }

# Get metrics for both sets
train_metrics = get_metrics(best_model, X_train_over, y_train_over)
test_metrics = get_metrics(best_model, X_test, y_test)

# Print metrics comparison
print("\nMetrics Comparison:")
print(f"{'Metric':<15} {'Train':>10} {'Test':>10} {'Difference':>10}")
print("-" * 45)
for metric in train_metrics.keys():
    diff = train_metrics[metric] - test_metrics[metric]
    print(f"{metric:<15} {train_metrics[metric]:>10.3f} {test_metrics[metric]:>10.3f} {diff:>10.3f}")

# Print detailed classification report for test set
print("\nDetailed Classification Report on Test Set:")
print(classification_report(y_test, best_model.predict(X_test)))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 223500, number of negative: 223500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.155983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44142
[LightGBM] [Info] Number of data points in the train set: 447000, number of used features: 176
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Best parameters: {'class_weight': 'balanced', 'colsample_bytree': np.float64(0.9186171947440931), 'learning_rate': np.float64(0.06503043695984914), 'max_depth': 7, 'min_child_samples': 21, 'n_estimators': 202, 'num_leaves': 94, 'subsample': np.float64(0.7836995567863468)}
Best CV ROC AUC score: 0.968591413599988

Metrics Comparison:
Metric               Train       Test Difference
---------------------------------------------
accuracy             0.957      0.919      0.038
precision       

Light GBM give the best performance between other algorithms but after we try to hyperparameter tuning, the score still overfitting and bias to class 0. Although the accuracy is 0.919, but the score is misleading due to class imbalance extreme. Next, we will try hyperparameter tuning on XGBoost as the second best score.

## Logistic Regression

In [42]:
from scipy.stats import uniform, loguniform

# Parameter tuning
param_dist = {
    'C': loguniform(1e-3, 1e-1),
    'max_iter': [200],
    'solver': ['liblinear'],
    'penalty': ['l1'],
    'class_weight': ['balanced']
}

# Define scoring metrics for RandomizedSearchCV
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'roc_auc': 'roc_auc'
}

# Create RandomizedSearchCV
lr = LogisticRegression(random_state=42)
random_search = RandomizedSearchCV(
    lr,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring=scoring,
    refit='roc_auc',  # Optimize for ROC AUC
    random_state=42,
    n_jobs=-1
)

# Fit the model
random_search.fit(X_train_over, y_train_over)

# Get best model
best_model = random_search.best_estimator_

def evaluate_model(model, X, y, dataset_name):
    # Get predictions
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred_proba)
    conf_matrix = confusion_matrix(y, y_pred)
    class_report = classification_report(y, y_pred)
    
    print(f"\n=== {dataset_name} Set Metrics ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    
    
    print("\nConfusion Matrix:")
    print(conf_matrix)
    
    print("\nClassification Report:")
    print(class_report)

# Print best parameters
print("Best Parameters:", random_search.best_params_)
print("Best ROC AUC Score:", random_search.best_score_)

# Evaluate on both train and test sets
evaluate_model(best_model, X_train_over, y_train_over, "Training")
evaluate_model(best_model, X_test, y_test, "Testing")


Best Parameters: {'C': np.float64(0.07969454818643935), 'class_weight': 'balanced', 'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'}
Best ROC AUC Score: 0.7737343916039817

=== Training Set Metrics ===
Accuracy: 0.7076
Precision: 0.7041
Recall: 0.7163
ROC AUC: 0.7744

Confusion Matrix:
[[156204  67296]
 [ 63402 160098]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.71      0.70      0.71    223500
         1.0       0.70      0.72      0.71    223500

    accuracy                           0.71    447000
   macro avg       0.71      0.71      0.71    447000
weighted avg       0.71      0.71      0.71    447000


=== Testing Set Metrics ===
Accuracy: 0.6982
Precision: 0.1659
Recall: 0.6787
ROC AUC: 0.7540

Confusion Matrix:
[[39111 16770]
 [ 1579  3336]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.70      0.81     55881
         1.0       0.17      0.68      0.