In [1]:
import optuna

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

from xgboost import XGBClassifier

In [3]:
from sklearn.metrics import accuracy_score
def my_accuracy(y_true, y_pred):
    if len(y_pred.shape)>1:
        true_labels = np.argmax(y_pred, axis=1)
    else:
        true_labels=y_pred

    return -accuracy_score(y_true, true_labels)

In [4]:
prefix = "../"

In [5]:
train_per_class = 20
round = 0
dataset="citeseer"

In [6]:
input = [Path(prefix+f"/snakemake_base/splits/{dataset}_planetoid/{train_per_class}_500_rest_0.npz").resolve().absolute(),
         Path(prefix+f"snakemake_base/aggregated_datasets/{dataset}_planetoid_{round}.pkl").resolve().absolute()]

In [7]:
splits = np.load(input[0])
train_mask = splits["train_mask"]
val_mask = splits["val_mask"]

df  = pd.read_pickle(input[1])
train_df = df[train_mask]
#print("number_of_columns", len(df.columns))
X_train = train_df.drop("labels", axis=1)

y_train = train_df["labels"]
print(df.shape)

(3327, 3704)


In [8]:
val_df = df[val_mask]
X_val = val_df.drop("labels", axis=1)
y_val = val_df["labels"]

In [9]:
from xgboost.callback import TrainingCallback, LearningRateScheduler
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.NaN)
dval = xgb.DMatrix(X_val, label=y_val, missing=np.NaN)


In [10]:
class LinearScheduler(TrainingCallback):
    def __init__(self, attr,  start_val, step, stop_val, timespan=1, silent=True, offset=0):
        self.attr = attr
        self.timespan = timespan
        self.start_val = start_val
        self.curr_val=start_val
        self.stop_val = stop_val
        self.step=step
        self.silent=silent
        self.offset=offset

        if step<0:
            self.agg = max
        else:
            self.agg = min

    def before_training(self, model):
        if not self.silent:
            print("new "+self.attr, self.curr_val)
        model.set_param(self.attr, self.curr_val)
        return model
        
    def after_iteration(self, model, epoch, evals_log):
        #print(model.attributes())
        if epoch>self.offset and epoch % (self.timespan) == 0:
            new_val = self.agg(self.stop_val, self.curr_val+self.step)
            if new_val!=self.curr_val:
                self.curr_val=new_val
                if not self.silent:
                    print("new "+self.attr, self.curr_val)
                model.set_param(self.attr, self.curr_val)


class ExponentialScheduler(TrainingCallback):
    def __init__(self, attr,  start_val, factor, stop_val, timespan=1, silent=True, offset=0):
        self.attr = attr
        self.timespan = timespan
        self.start_val = start_val
        self.curr_val=start_val
        self.stop_val = stop_val
        self.factor=factor
        self.silent=silent
        self.offset=offset

        if factor>1:
            self.agg = min
        else:
            self.agg = max

    def before_training(self, model):
        if not self.silent:
            print("new "+self.attr, self.curr_val)
        model.set_param(self.attr, self.curr_val)
        return model
        
    def after_iteration(self, model, epoch, evals_log):
        #print(model.attributes())
        if epoch>self.offset and epoch % (self.timespan) == 0:
            new_val = self.agg(self.stop_val, self.curr_val*self.factor)
            if new_val!=self.curr_val:
                self.curr_val=new_val
                if not self.silent:
                    print("new "+self.attr, self.curr_val)
                model.set_param(self.attr, self.curr_val)

In [11]:
num_classes = len(np.bincount(y_train))

In [18]:
# 1. Define an objective function to be maximized.
def objective(trial):
    lr_scheduler = ExponentialScheduler("learning_rate",
                                        start_val=trial.suggest_float("init_lr", 0.01, 1),
                                        factor=trial.suggest_categorical("lr_factor",[0.99, 1]),
                                        stop_val = 0.01,
                                        timespan=1
                                       )
    start_subsample = trial.suggest_float("start_subsample", 0.1,1)
    stop_subsample = trial.suggest_float("stop_subsample", start_subsample,1)
    sample_scheduler = LinearScheduler("subsample",
                                        start_val=start_subsample,
                                        step=0.1,
                                        stop_val = stop_subsample,
                                        timespan=num_classes*trial.suggest_int("timespan_subsample",1,3),
                                        offset = num_classes*trial.suggest_int("timespan_offset",1,3)
                                       )

    start_weight = trial.suggest_int("start_weight", 1,10)
    stop_weight = trial.suggest_int("stop_weight", 1,10)
    step_weight = 1 if stop_subsample>=start_subsample else -1
    weight_scheduler = LinearScheduler("min_child_weight",
                                        start_val=start_weight,
                                        step=step_weight,
                                        stop_val = stop_weight,
                                        timespan=num_classes*trial.suggest_int("timespan_weight",1,3)
                                       ) 
    #tree_size_scheduler = LinearScheduler("max_depth",1,1,10, timespan=10)
    colsample =  trial.suggest_float("colsample", 0.1,1)
    params = dict( 
        n_estimators=500,
        max_depth=trial.suggest_int("max_depth", 1,15),
        learning_rate=1,
        objective='multi:softmax',
        random_state=0,
        eval_metric=my_accuracy,
        disable_default_eval_metric= 1,
        n_jobs=1,
        early_stopping_rounds=trial.suggest_int("early_stopping_rounds", 1,100),
 #       min_child_weight=1,#child_scheduler.min_child_weight,
    #    multi_strategy='multi_output_tree',
    #    tree_method="approx",
    #    gamma=1,
 #       subsample=sample,
        colsample_bytree = colsample,
        #max_delta_step=0.2
 #       reg_lambda = 1,
 #       reg_alpha = 0.5,
        colsample_bylevel = colsample,
        colsample_bynode = colsample,
        callbacks = (lr_scheduler, sample_scheduler, weight_scheduler),
#        booster =  trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        reg_lambda= trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        reg_alpha= trial.suggest_float('alpha', 1e-8, 1.0, log=True)
    )
    #bst = xgb.train(params, dtrain)
    bst = XGBClassifier( **params)

    result = bst.fit(dtrain.get_data(),dtrain.get_label(),
        eval_set=[(dval.get_data(),dval.get_label())],
        verbose=False)
    return bst.best_score

In [19]:
storage = optuna.storages.JournalStorage(
    optuna.storages.JournalFileStorage("./journal.log"),
)

study = optuna.create_study(
    storage=storage,  # Specify the storage URL here.
    study_name=f"{dataset}-{round}-{train_per_class}",
    load_if_exists=True,
    direction='minimize'
)

  storage = optuna.storages.JournalStorage(
[I 2024-01-26 18:32:27,462] Using an existing study with name 'citeseer-0-20' instead of creating a new one.


In [20]:
# 3. Create a study object and optimize the objective function.
#study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-01-26 18:32:29,412] Trial 4 finished with value: -0.342 and parameters: {'init_lr': 0.8648429572587255, 'lr_factor': 0.99, 'start_subsample': 0.8016280755212755, 'stop_subsample': 0.9306811301724154, 'timespan_subsample': 2, 'timespan_offset': 3, 'start_weight': 7, 'stop_weight': 6, 'timespan_weight': 1, 'colsample': 0.7864231761677293, 'max_depth': 15, 'early_stopping_rounds': 81, 'lambda': 1.6414526376328532e-06, 'alpha': 0.012651222632663078}. Best is trial 1 with value: -0.538.
[I 2024-01-26 18:32:32,442] Trial 5 finished with value: -0.53 and parameters: {'init_lr': 0.2157628241402139, 'lr_factor': 1, 'start_subsample': 0.17806763192988395, 'stop_subsample': 0.45397184552732195, 'timespan_subsample': 2, 'timespan_offset': 1, 'start_weight': 4, 'stop_weight': 2, 'timespan_weight': 3, 'colsample': 0.5578920610836502, 'max_depth': 15, 'early_stopping_rounds': 76, 'lambda': 0.017604897443959093, 'alpha': 0.0002883978961552386}. Best is trial 1 with value: -0.538.
[I 2024-01-26

KeyboardInterrupt: 