In [10]:
import numpy as np
import time
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import ydf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from treeple.datasets import make_trunk_classification


def train_and_evaluate(model, model_name, X_train, X_test, y_train, y_test):
    """
    Train a given model and evaluate its accuracy and training time.
    """
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    y_pred = model.predict(X_test)
    if isinstance(y_pred[0], np.ndarray):  # Some models return probabilities
        y_pred = np.argmax(y_pred, axis=1)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name}: Accuracy = {accuracy:.4f}, Training Time = {train_time:.4f} sec")
    
    return accuracy, train_time

In [9]:
n_samples = 1000
n_dim = 784
X, y = make_trunk_classification(n_samples=n_samples, n_dim=n_dim, n_informative=600, seed=0)


# Train-Test Split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# xgboost

In [None]:
start_time = time.time()
xgb_model = xgb.XGBRFRegressor(random_state=42).fit(
    X_train, y_train)
train_time = time.time() - start_time
y_pred = xgb_model.predict(X_test)

y_pred = (y_pred >= 0.5).astype(int)  # Assign class 1 if prediction >= 0.5, else class 0

accuracy = accuracy_score(y_test, y_pred)
print(f"xgboost: Accuracy = {accuracy:.4f}, Training Time = {train_time:.4f} sec")

xgboost: Accuracy = 0.7500, Training Time = 2.2643 sec


array([1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0.,
       0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1.,
       1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1.])

In [None]:


# --- Step 3: Train and Evaluate Different Gradient Boosting Models ---

# XGBoost
xgb_model = xgb.XGBRFRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss', tree_method="hist")
acc_xgb, time_xgb = train_and_evaluate(xgb_model, "XGBoost")

# LightGBM
lgb_model = lgb.LGBMClassifier(n_estimators=100, max_depth=6, learning_rate=0.1)
acc_lgb, time_lgb = train_and_evaluate(lgb_model, "LightGBM")

# CatBoost (doesn't require one-hot encoding)
cb_model = cb.CatBoostClassifier(n_estimators=100, depth=6, learning_rate=0.1, verbose=0)
acc_cb, time_cb = train_and_evaluate(cb_model, "CatBoost")

# YDF 
ydf_model = ydf.GradientBoostedTreesLearner().train(X_train).train(X_train)


Parameters: { "use_label_encoder" } are not used.



XGBoost: Accuracy = 0.7850, Training Time = 1.5620 sec
[LightGBM] [Info] Number of positive: 400, number of negative: 400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007735 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 199920
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 784
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[WinError 2] 系统找不到指定的文件。
  File "c:\Users\clark\anaconda3\envs\treeple_new\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\clark\anaconda3\envs\treeple_new\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\clark\anaconda3\envs\treeple_new\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\clark\anaconda3\envs\treeple_new\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,






LightGBM: Accuracy = 0.7800, Training Time = 0.3862 sec
CatBoost: Accuracy = 0.7800, Training Time = 2.4365 sec


TypeError: GradientBoostedTreesLearner.__init__() missing 1 required positional argument: 'label'

In [None]:

# --- Step 4: Save Results ---
results = np.array([
    ["XGBoost", acc_xgb, time_xgb],
    ["LightGBM", acc_lgb, time_lgb],
    ["CatBoost", acc_cb, time_cb],
    ["YDF", acc_ydf, train_time_ydf]
])

np.savetxt("./result/trunk_simulation_results.txt", results, fmt="%s", delimiter=",", header="Model,Accuracy,Training_Time", comments="")
print("\nResults saved to ./result/trunk_simulation_results.txt")

# --- Step 5: Plot Accuracy vs. Training Time ---
import matplotlib.pyplot as plt

model_names = ["XGBoost", "LightGBM", "CatBoost", "YDF"]
accuracies = [acc_xgb, acc_lgb, acc_cb, acc_ydf]
train_times = [time_xgb, time_lgb, time_cb, train_time_ydf]

plt.figure(figsize=(10, 6))
plt.scatter(train_times, accuracies, color=["red", "blue", "green", "purple"], s=100)

for i, name in enumerate(model_names):
    plt.text(train_times[i], accuracies[i], name, fontsize=12, ha='right')

plt.xlabel("Training Time (sec)", fontsize=16)
plt.ylabel("Accuracy", fontsize=16)
plt.title("Trunk Simulation: Accuracy vs. Training Time for Boosting Models", fontsize=18)
plt.grid(True)
plt.show()
