In [6]:
import numpy as np
import pandas as pd
import random

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    roc_auc_score
)

from gwo import GreyWolfOptimizer

In [7]:
# Train/validation pool: "Diagnosis" is the target column, every other column is a feature.
trainValData = pd.read_csv("../dataset/train_val.data")
X_trainVal = trainValData.drop("Diagnosis", axis=1)
y_trainVal = trainValData["Diagnosis"]

# Test set, split into features/target the same way.
testData = pd.read_csv("../dataset/test.data")
X_test = testData.drop("Diagnosis", axis=1)
y_test = testData["Diagnosis"]

# Carve a 20% validation fold out of the pool, stratified on the target,
# with a fixed seed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainVal, y_trainVal, test_size=0.2, stratify=y_trainVal, random_state=1
)

In [8]:
# Reference model with hand-picked hyperparameters, trained on the whole
# train+validation pool and scored on the test set.
baselineModel = GradientBoostingClassifier(
    n_estimators=100, max_depth=3, learning_rate=0.1, subsample=1.0, random_state=1
)
y_pred_baseline = baselineModel.fit(X_trainVal, y_trainVal).predict(X_test)

# One entry per metric, each computed from the hard test-set predictions.
baselineResults = {
    label: scorer(y_test, y_pred_baseline)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
}

baselineResults

{'Accuracy': 0.9824561403508771,
 'Precision': 0.9761904761904762,
 'Recall': 0.9761904761904762,
 'F1': 0.9761904761904762}

In [9]:
def gradientBoostObjectiveFunction(position):
    """Score one candidate hyperparameter vector for the optimizer.

    Parameters
    ----------
    position : indexable
        ``position[0]`` -> n_estimators and ``position[1]`` -> max_depth
        (both truncated with ``int``), ``position[2]`` -> learning_rate,
        ``position[3]`` -> subsample (both passed through unchanged).

    Returns
    -------
    The validation log loss plus 0.1 * (1 - validation ROC AUC), computed
    from the class-1 probabilities of a GradientBoostingClassifier fitted
    on ``X_train``/``y_train`` and evaluated on ``X_val``/``y_val``.
    Smaller values mean a better candidate.
    """
    candidate = GradientBoostingClassifier(
        n_estimators=int(position[0]),
        max_depth=int(position[1]),
        learning_rate=position[2],
        subsample=position[3],
        random_state=1,
    )

    # Second column of predict_proba, used as the positive-class probability.
    val_proba = candidate.fit(X_train, y_train).predict_proba(X_val)[:, 1]

    loss = log_loss(y_val, val_proba)
    auc_gap = 1 - roc_auc_score(y_val, val_proba)
    return loss + auc_gap * 0.1

In [10]:
# Seed the global RNGs before the optimizer is built. Every estimator in this
# notebook is pinned with random_state=1, but the optimizer call takes no seed,
# so without this the search (and the reported best parameters) can differ
# from run to run.
# NOTE(review): assumes gwo.GreyWolfOptimizer draws from Python's `random`
# and/or NumPy's global RNG -- confirm against the gwo module.
random.seed(1)
np.random.seed(1)

# Search space, one entry per dimension, in the order the objective function
# reads them: n_estimators, max_depth, learning_rate, subsample.
gwo = GreyWolfOptimizer(
    objectiveFunction=gradientBoostObjectiveFunction,
    lowerBound=[50, 2, 0.01, 0.6],
    upperBound=[300, 8, 0.3, 1.0],
    dimension=4
)

gwoResult = gwo.optimize()
bestParams = gwoResult["bestPosition"]

# The first two entries are shown truncated with int(), matching how the
# objective function converts them.
print("\n=== BEST PARAMETER FROM GWO ===")
print("n_estimators :", int(bestParams[0]))
print("max_depth   :", int(bestParams[1]))
print("learning_rate:", bestParams[2])
print("subsample :", bestParams[3])

Iterasi 1/50 | Best Fitness: 0.063271
Iterasi 2/50 | Best Fitness: 0.056620
Iterasi 3/50 | Best Fitness: 0.056620
Iterasi 4/50 | Best Fitness: 0.056620
Iterasi 5/50 | Best Fitness: 0.056620
Iterasi 6/50 | Best Fitness: 0.056620
Iterasi 7/50 | Best Fitness: 0.056620
Iterasi 8/50 | Best Fitness: 0.054371


KeyboardInterrupt: 

In [None]:
# Decode the optimizer's best position into estimator keyword arguments.
# Requires `bestParams` from the optimizer cell; the first two entries are
# truncated with int(), the last two are used as-is.
bestParamsDict = dict(
    n_estimators=int(bestParams[0]),
    max_depth=int(bestParams[1]),
    learning_rate=bestParams[2],
    subsample=bestParams[3],
)

# Refit with the tuned settings on the whole train+validation pool, then
# predict the test set.
finalModel = GradientBoostingClassifier(random_state=1, **bestParamsDict)
y_pred_gwo = finalModel.fit(X_trainVal, y_trainVal).predict(X_test)

# Same four metrics, in the same order, as the baseline results.
gwoResults = {
    label: scorer(y_test, y_pred_gwo)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
}

gwoResults

In [None]:
# Side-by-side metric table: one row per model, one column per metric.
# NOTE(review): the second row is labelled "AGWO" while the optimizer imported
# in this notebook is GreyWolfOptimizer -- confirm which name is intended.
comparison = pd.DataFrame([baselineResults, gwoResults]).rename(
    index={0: "Baseline GB", 1: "GB + AGWO"}
)

comparison