In [1]:
import optuna

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

In [3]:
from graph_description.training_utils import my_accuracy, LinearScheduler, ExponentialScheduler

In [4]:
# Relative location (from the notebook's working directory) of the directory
# that contains `snakemake_base/`; it is prepended to the input file paths
# built further down.
prefix = "../"

In [5]:
# Experiment configuration. These three values are interpolated into the input
# file names and into the Optuna study name further down.
# presumably the number of labelled training nodes per class -- TODO confirm
train_per_class = 20
# NOTE(review): `round` shadows the Python builtin `round()` for the rest of
# the notebook; the name is kept because later cells reference it.
round = 0
dataset="citeseer"

In [6]:
# Resolve the two input files relative to `prefix`:
#   input[0] -> .npz archive read with np.load (train/val masks)
#   input[1] -> pickled DataFrame read with pd.read_pickle
# The components are joined with pathlib rather than string concatenation so
# the result no longer depends on `prefix` ending with "/" (previously one
# path carried a leading "/" and the other did not). `.resolve()` already
# returns an absolute path, so the extra `.absolute()` call was dropped.
# NOTE(review): `input` shadows the builtin `input()`; the name is kept because
# later cells index into it.
# NOTE(review): the trailing `_0` in the split file name is hard-coded while
# the dataset file name uses `round` -- confirm that this is intended.
input = [
    Path(
        prefix,
        "snakemake_base",
        "splits",
        f"{dataset}_planetoid",
        f"{train_per_class}_500_rest_0.npz",
    ).resolve(),
    Path(
        prefix,
        "snakemake_base",
        "aggregated_datasets",
        f"{dataset}_planetoid_{round}.pkl",
    ).resolve(),
]

In [7]:
# Read the split masks from the .npz archive and the full dataset from the
# pickle file.
splits = np.load(input[0])
train_mask, val_mask = splits["train_mask"], splits["val_mask"]

df = pd.read_pickle(input[1])

# Training subset: the "labels" column is the target, every other column is
# used as a feature.
train_df = df[train_mask]
y_train = train_df["labels"]
X_train = train_df.drop(columns="labels")

print(df.shape)

(3327, 3704)


In [8]:
# Validation subset selected by `val_mask`; "labels" is the target column and
# the remaining columns are the features.
val_df = df[val_mask]
y_val = val_df["labels"]
X_val = val_df.drop(columns="labels")

In [9]:
from xgboost import XGBClassifier
import xgboost as xgb

# Wrap the training and validation data in XGBoost DMatrix objects, with NaN
# marking missing feature values.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0 and
# raises AttributeError there; the value passed to XGBoost is unchanged.
dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
dval = xgb.DMatrix(X_val, label=y_val, missing=np.nan)


In [10]:
# Number of classes, taken as the length of the label histogram over the
# training labels (np.bincount yields one bin per integer from 0 up to the
# largest label present).
num_classes = np.bincount(y_train).shape[0]

In [11]:
from graph_description.training_utils import xgb_objective
from functools import partial

# Pre-bind the fixed inputs of `xgb_objective` so that the resulting callable
# only needs whatever argument(s) `study.optimize` supplies.
objective = partial(
    xgb_objective,
    num_classes=num_classes,
    dtrain=dtrain,
    dval=dval,
)

In [12]:
# Keep the trial history in a journal file next to the notebook so that
# re-running this cell picks up the existing study of the same name.
storage = optuna.storages.JournalStorage(optuna.storages.JournalFileStorage("./journal.log"))

# The study name encodes dataset, round and training-set size.
# NOTE(review): the logged trial values are negative (e.g. -0.584), so
# `xgb_objective` presumably returns a negated score -- confirm that
# "minimize" is the intended direction.
study = optuna.create_study(
    study_name=f"{dataset}-{round}-{train_per_class}",
    storage=storage,  # the JournalStorage object created above (not a URL)
    direction="minimize",
    load_if_exists=True,  # reuse the stored study instead of raising if it exists
)

  storage = optuna.storages.JournalStorage(
[I 2024-01-26 23:30:12,685] Using an existing study with name 'citeseer-0-20' instead of creating a new one.


In [13]:
# Run 100 more trials of `objective` on the study created above.
study.optimize(func=objective, n_trials=100)

[I 2024-01-26 23:30:16,567] Trial 87 finished with value: -0.552 and parameters: {'init_lr': 0.08158310639537916, 'lr_factor': 0.99, 'start_subsample': 0.9218747090804481, 'stop_subsample': 0.9395238182287228, 'timespan_subsample': 1, 'timespan_offset': 1, 'start_weight': 7, 'stop_weight': 1, 'timespan_weight': 2, 'colsample': 0.5798556718989539, 'max_depth': 6, 'early_stopping_rounds': 52, 'lambda': 0.0013115063687358464, 'alpha': 9.824433799576479e-05}. Best is trial 59 with value: -0.584.
[I 2024-01-26 23:30:19,056] Trial 88 finished with value: -0.576 and parameters: {'init_lr': 0.10596843159800869, 'lr_factor': 0.99, 'start_subsample': 0.8909799159504164, 'stop_subsample': 0.9319236579265341, 'timespan_subsample': 1, 'timespan_offset': 1, 'start_weight': 6, 'stop_weight': 1, 'timespan_weight': 2, 'colsample': 0.5456736761959801, 'max_depth': 4, 'early_stopping_rounds': 72, 'lambda': 0.00811305497930896, 'alpha': 0.001611904104243642}. Best is trial 59 with value: -0.584.
[I 2024-0