In [8]:
import numpy as np
import pandas as pd
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    log_loss,
    roc_auc_score
)

from gwo import GreyWolfOptimizer
# from numba import njit

In [9]:
# Load the two pre-split CSV files: one used for training/validation,
# one kept aside for the final test evaluation.
trainValData = pd.read_csv("../dataset/train_val.data")
testData = pd.read_csv("../dataset/test.data")

# Split each frame into its feature columns (everything except "Diagnosis")
# and its target column ("Diagnosis").
X_trainVal, y_trainVal = trainValData.drop("Diagnosis", axis=1), trainValData["Diagnosis"]
X_test, y_test = testData.drop("Diagnosis", axis=1), testData["Diagnosis"]

In [10]:
# Hold out 20% of the train/validation data as a validation set.
# The split is stratified on the target and uses a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainVal, y_trainVal, stratify=y_trainVal, test_size=0.2, random_state=1
)

In [11]:
# Reference random forest with hand-picked hyper-parameters; its test scores
# are later compared against the GWO-tuned model.
baselineModel = RandomForestClassifier(
    random_state=1,
    # forest size and tree shape
    n_estimators=100, max_depth=5,
    # split / leaf constraints
    min_samples_split=2, min_samples_leaf=1,
    # fraction of features considered at each split
    max_features=0.8,
)

In [12]:
# Train the baseline on the full train+validation data and predict the test set.
baselineModel.fit(X_trainVal, y_trainVal)
y_pred_baseline = baselineModel.predict(X_test)

# Collect the test-set metrics, keyed by display name.
baselineResults = {
    metric_name: metric_fn(y_test, y_pred_baseline)
    for metric_name, metric_fn in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
}

baselineResults

{'Accuracy': 0.9473684210526315,
 'Precision': 0.9285714285714286,
 'Recall': 0.9285714285714286,
 'F1': 0.9285714285714286}

In [13]:
def randomForestObjectiveFunction(position):
    """Score one GWO candidate: fit a random forest and return its validation loss.

    Parameters
    ----------
    position : sequence of 5 numbers
        Candidate hyper-parameters, read by index as
        [n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features].
        The first four are truncated to integers with ``int()``; the fifth is
        used as a float fraction of the features.

    Returns
    -------
    float
        ``log_loss + 0.1 * (1 - roc_auc)`` computed on ``X_val`` / ``y_val``
        after fitting on ``X_train`` / ``y_train``. Smaller is better.
    """
    # Truncate to int, then clamp to the smallest value scikit-learn accepts, so
    # a position that drifts slightly outside the search bounds yields a valid
    # model instead of raising and aborting the whole optimisation.
    n_estimators = max(1, int(position[0]))
    max_depth = max(1, int(position[1]))
    min_samples_split = max(2, int(position[2]))
    min_samples_leaf = max(1, int(position[3]))

    # A float max_features is a fraction of the features and must lie in
    # (0.0, 1.0]; clamp on both sides (the lower clamp is a small positive value).
    min_fraction = 1e-3
    max_features = min(max(float(position[4]), min_fraction), 1.0)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=1
    )

    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (class 1 for 0/1 labels).
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    # Log loss is the main term; (1 - AUC) is added with a 0.1 weight.
    fitness = log_loss(y_val, y_pred_proba) + (1 - roc_auc_score(y_val, y_pred_proba)) * 0.1
    return fitness

In [14]:
# Seed the global random generators so repeated runs of the search are comparable.
# NOTE(review): GreyWolfOptimizer's source is not visible here; seeding only
# helps if it draws from the `random` or `numpy.random` global state — confirm.
random.seed(1)
np.random.seed(1)

gwo = GreyWolfOptimizer(
    objectiveFunction=randomForestObjectiveFunction,
    # Search-space bounds, in the order the objective function reads them:
    # [n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features]
    lowerBound=[50, 2, 2, 1, 0.6],
    upperBound=[300, 8, 10, 5, 1.0],
    dimension=5
)

# Every objective evaluation fits a random forest, so this cell can take a long time.
gwoResult = gwo.optimize()

# Best position found, indexed the same way as the bounds above.
bestParams = gwoResult["bestPosition"]

KeyboardInterrupt: 

In [None]:
# Report the best position found by GWO; the four integer hyper-parameters are
# truncated with int(), max_features is shown as stored.
print("\n=== BEST PARAMETER FROM GWO ===")
print(
    "\n".join(
        label + " " + str(value)
        for label, value in (
            ("n_estimators :", int(bestParams[0])),
            ("max_depth   :", int(bestParams[1])),
            ("min_samples_split :", int(bestParams[2])),
            ("min_samples_leaf  :", int(bestParams[3])),
            ("max_features      :", bestParams[4]),
        )
    )
)

In [None]:
# Decode the best GWO position into RandomForestClassifier keyword arguments.
# max_features is capped at 1.0, as randomForestObjectiveFunction does when it
# scores a candidate, so the final model matches the one that was evaluated.
# The remaining guards keep each value within the range scikit-learn accepts.
bestParamsDict = {
    "n_estimators": max(1, int(bestParams[0])),
    "max_depth": max(1, int(bestParams[1])),
    "min_samples_split": max(2, int(bestParams[2])),
    "min_samples_leaf": max(1, int(bestParams[3])),
    # A float max_features must lie in (0.0, 1.0].
    "max_features": min(max(float(bestParams[4]), 1e-3), 1.0)
}

In [None]:
# Build the tuned forest from the decoded GWO hyper-parameters.
# bestParamsDict holds exactly the five tuned keyword arguments
# (n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features).
finalModel = RandomForestClassifier(**bestParamsDict, random_state=1)

In [None]:
# Train the tuned model on the full train+validation data and predict the test set.
finalModel.fit(X_trainVal, y_trainVal)
y_pred_gwo = finalModel.predict(X_test)

# Same four test-set metrics as the baseline, keyed by display name.
gwoResults = {
    metric_name: metric_fn(y_test, y_pred_gwo)
    for metric_name, metric_fn in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
}

In [None]:
# Side-by-side table of test metrics: one row per model, one column per metric.
comparison = pd.DataFrame(
    data=[baselineResults, gwoResults],
    index=pd.Index(["Baseline RF", "Random Forest + GWO"]),
)

comparison