In [56]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split

from xgboost import XGBClassifier

In [57]:
train_val = pd.read_csv("../dataset/cooked/train_val.data")
test = pd.read_csv("../dataset/cooked/test.data")

print("Train_val shape:", train_val.shape)
print("Test shape:", test.shape)

Train_val shape: (455, 32)
Test shape: (114, 32)


In [58]:
# Pisahkan fitur dan label
X_train_val = train_val.drop(columns=["Diagnosis"])
y_train_val = train_val["Diagnosis"]

X_test = test.drop(columns=["Diagnosis"])
y_test = test["Diagnosis"]

# Encode label ke angka
y_train_val = y_train_val.map({'B':0, 'M':1})
y_test = y_test.map({'B':0, 'M':1})

# Scaling
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val)
X_test_scaled = scaler.transform(X_test)

X_train_val_scaled.shape, X_test_scaled.shape


((455, 31), (114, 31))

In [59]:
param_bounds = {
    "n_estimators": (50, 1200),
    "learning_rate": (0.001, 0.3),
    "max_depth": (3, 15),
    "min_child_weight": (1, 15),
    "subsample": (0.5, 1.0),
    "colsample_bytree": (0.5, 1.0),
    "gamma": (0, 10),
    "reg_alpha": (0, 10),
    "reg_lambda": (0, 10)
}

In [60]:
def wolf_to_params(wolf, bounds):
    names = list(bounds.keys())
    params = {}
    for i, key in enumerate(names):
        low, high = bounds[key]
        value = low + wolf[i] * (high - low)
        params[key] = int(value) if key in ["n_estimators", "max_depth", "min_child_weight"] else float(value)
    return params

In [61]:
def fitness(params, X, y):
    model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        tree_method='hist',
        n_jobs=-1,
        **params
    )
    
    score = cross_val_score(model, X, y, cv=3, scoring="roc_auc").mean()
    return score

In [62]:
def mutate_wolf(wolf, rate=0.4, strength=0.2):
    if np.random.rand() < rate:
        wolf = wolf + np.random.uniform(-strength, strength, size=len(wolf))
    return np.clip(wolf, 0, 1)

In [63]:



def gwo_hyperopt(X, y, param_bounds, n_wolves=10, max_iter=20):
    dim = len(param_bounds)
    wolves = np.random.rand(n_wolves, dim)

    alpha = beta = delta = None
    alpha_score = beta_score = delta_score = -np.inf

    # inisialisasi aman
    alpha = wolves[0].copy()
    beta  = wolves[1].copy()
    delta = wolves[2].copy()

    for it in range(max_iter):
        print(f"=== Iterasi {it+1}/{max_iter} ===")

        for i in range(n_wolves):
            params = wolf_to_params(wolves[i], param_bounds)
            score = fitness(params, X, y)

            if score > alpha_score:
                delta, delta_score = beta, beta_score
                beta, beta_score = alpha, alpha_score
                alpha, alpha_score = wolves[i].copy(), score
            elif score > beta_score:
                delta, delta_score = beta, beta_score
                beta, beta_score = wolves[i].copy(), score
            elif score > delta_score:
                delta, delta_score = wolves[i].copy(), score

        print(f"Alpha Score: {alpha_score:.4f}")

        # NON-LINEAR DECAY (fix: pakai it, bukan iter)
        a = 2 * (1 - (it / max_iter)**2)

        for i in range(n_wolves):

            # update berdasarkan alpha
            r1, r2 = np.random.rand(), np.random.rand()
            A1 = 2 * a * r1 - a
            C1 = 2 * r2
            D_alpha = np.abs(C1 * alpha - wolves[i])
            X1 = alpha - A1 * D_alpha

            # beta
            r1, r2 = np.random.rand(), np.random.rand()
            A2 = 2 * a * r1 - a
            C2 = 2 * r2
            D_beta = np.abs(C2 * beta - wolves[i])
            X2 = beta - A2 * D_beta

            # delta
            r1, r2 = np.random.rand(), np.random.rand()
            A3 = 2 * a * r1 - a
            C3 = 2 * r2
            D_delta = np.abs(C3 * delta - wolves[i])
            X3 = delta - A3 * D_delta

            # update posisi serigala
            wolves[i] = (X1 + X2 + X3) / 3

            # normalisasi kembali
            wolves[i] = np.clip(wolves[i], 0, 1)

            # mutation
            wolves[i] = mutate_wolf(wolves[i], rate=0.15, strength=0.25)

    best_params = wolf_to_params(alpha, param_bounds)
    return best_params


In [64]:
best_params = gwo_hyperopt(
    X_train_val_scaled, 
    y_train_val, 
    param_bounds,
    n_wolves=20,
    max_iter=50
)

best_params

=== Iterasi 1/50 ===
Alpha Score: 0.9901
=== Iterasi 2/50 ===
Alpha Score: 0.9936
=== Iterasi 3/50 ===
Alpha Score: 0.9940
=== Iterasi 4/50 ===
Alpha Score: 0.9940
=== Iterasi 5/50 ===
Alpha Score: 0.9940
=== Iterasi 6/50 ===
Alpha Score: 0.9940
=== Iterasi 7/50 ===
Alpha Score: 0.9940
=== Iterasi 8/50 ===
Alpha Score: 0.9940
=== Iterasi 9/50 ===
Alpha Score: 0.9940
=== Iterasi 10/50 ===
Alpha Score: 0.9940
=== Iterasi 11/50 ===
Alpha Score: 0.9940
=== Iterasi 12/50 ===
Alpha Score: 0.9940
=== Iterasi 13/50 ===
Alpha Score: 0.9940
=== Iterasi 14/50 ===
Alpha Score: 0.9942
=== Iterasi 15/50 ===
Alpha Score: 0.9942
=== Iterasi 16/50 ===
Alpha Score: 0.9943
=== Iterasi 17/50 ===
Alpha Score: 0.9943
=== Iterasi 18/50 ===
Alpha Score: 0.9943
=== Iterasi 19/50 ===
Alpha Score: 0.9943
=== Iterasi 20/50 ===
Alpha Score: 0.9943
=== Iterasi 21/50 ===
Alpha Score: 0.9943
=== Iterasi 22/50 ===
Alpha Score: 0.9943
=== Iterasi 23/50 ===
Alpha Score: 0.9943
=== Iterasi 24/50 ===
Alpha Score: 0.9943
=

{'n_estimators': 595,
 'learning_rate': 0.09889995915404967,
 'max_depth': 9,
 'min_child_weight': 1,
 'subsample': 0.681548352719294,
 'colsample_bytree': 0.5091953469833224,
 'gamma': 0.03543470351396573,
 'reg_alpha': 0.4548909493873457,
 'reg_lambda': 0.33070077431464534}

In [65]:
best_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    n_jobs=-1,
    **best_params
)

best_model.fit(X_train_val_scaled, y_train_val)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5091953469833224
,device,
,early_stopping_rounds,
,enable_categorical,False


In [66]:
pred_prob = best_model.predict_proba(X_test_scaled)[:,1]
pred_label = best_model.predict(X_test_scaled)

acc = accuracy_score(y_test, pred_label)
auc = roc_auc_score(y_test, pred_prob)
loss = log_loss(y_test, pred_prob)

print("Accuracy :", acc)
print("AUC      :", auc)
print("Logloss  :", loss)

Accuracy : 0.956140350877193
AUC      : 0.9963624338624338
Logloss  : 0.0832694164692546
