In [1]:
import optuna

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd

In [3]:
from graph_description.training_utils import my_accuracy, LinearScheduler, ExponentialScheduler

In [4]:
prefix = "../"

In [5]:
train_per_class = 20
round = 0
dataset="citeseer"

In [6]:
input = [Path(prefix+f"/snakemake_base/splits/{dataset}_planetoid/{train_per_class}_500_rest_0.npz").resolve().absolute(),
         Path(prefix+f"snakemake_base/aggregated_datasets/{dataset}_planetoid_{round}_dense.pkl").resolve().absolute()]

In [7]:
splits = np.load(input[0])
train_mask = splits["train_mask"]
val_mask = splits["val_mask"]

df  = pd.read_pickle(input[1])
train_df = df[train_mask]
#print("number_of_columns", len(df.columns))
X_train = train_df.drop("labels", axis=1)

y_train = train_df["labels"]
print(df.shape)

(3327, 3704)


In [25]:
def load_dataset_splitted(path_splits, path_df, return_train=True, return_val=True, return_test=False, return_full=False):
    splits = np.load(input[0])
    df  = pd.read_pickle(input[1])

    def get_by_split(split_name):
        mask = splits[split_name]
        mask_df = df[mask]
        X = mask_df.drop("labels", axis=1)
        y = mask_df["labels"]
        return X, y

    out = tuple()
    if return_train:
        out += get_by_split("train_mask")
    if return_val:
        out += get_by_split("val_mask")
    if return_test:
        out += get_by_split("test_mask")
    if return_full:
        out +=(df,)
    print(len(out))
    return out

In [26]:
(X_train, y_train, X_val, y_val)=load_dataset_splitted(input[0], input[1])

4


In [8]:
val_df = df[val_mask]
X_val = val_df.drop("labels", axis=1)
y_val = val_df["labels"]

In [9]:
num_classes = len(np.bincount(y_train))

In [10]:
from imodels import RuleFitClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

In [11]:
import warnings
warnings.filterwarnings("ignore", message="invalid value encountered in scalar subtract")
warnings.filterwarnings("ignore", message="overflow encountered in reduce")

In [12]:
max_rules=10

In [13]:
def rule_objective(trial, max_rules):
    params = dict(
        max_rules=int(max_rules),
        cv=False,
        random_state=0,
        tree_size = trial.suggest_int('tree_size',2,100),
        memory_par = trial.suggest_float('memory_par',1e-3,100,log=True), # learning rate
        lin_trim_quantile = trial.suggest_float('lin_trim_quantile',0,1),
        exp_rand_tree_size = trial.suggest_categorical('exp_rand_tree_size',[False, True]),
#        alpha = trial.suggest_float('alpha',1e-4, 10, log=True),
    )
    clf = OneVsRestClassifier(RuleFitClassifier(**params))
    clf.fit(X_train, y_train)
    prediction = clf.predict(X_val)
    return accuracy_score(prediction, y_val)

In [14]:
from functools import partial
objective = partial(rule_objective, max_rules=max_rules)

In [15]:
import os
try:
    this_file = Path(__file__)
except NameError:
    this_file = Path(os.path.abspath(''))
if this_file.stem in ("notebooks", "scripts"):
    root_folder = this_file.parent
else:
    root_folder = this_file

In [16]:
journal_path = root_folder/"hyper_param_journal.log"
print("journal_path", journal_path)
storage = optuna.storages.JournalStorage(
    optuna.storages.JournalFileStorage(str(journal_path)),
)

study = optuna.create_study(
    storage=storage,  # Specify the storage URL here.
    study_name=f"{dataset}-{round}-{train_per_class}-rulefit{max_rules}",
    load_if_exists=True,
    direction='maximize'
)

journal_path /home/stamm/projects/graph_description/hyper_param_journal.log


  storage = optuna.storages.JournalStorage(
[I 2024-01-31 11:59:13,303] Using an existing study with name 'citeseer-0-20-rulefit10' instead of creating a new one.


In [17]:
# 3. Create a study object and optimize the objective function.
#study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-01-31 11:59:30,803] Trial 100 finished with value: 0.514 and parameters: {'tree_size': 9, 'memory_par': 0.0070015451611187395, 'lin_trim_quantile': 0.9091285252016981, 'exp_rand_tree_size': False}. Best is trial 31 with value: 0.572.
[I 2024-01-31 11:59:47,083] Trial 101 finished with value: 0.49 and parameters: {'tree_size': 15, 'memory_par': 70.55239642306952, 'lin_trim_quantile': 0.670421342166295, 'exp_rand_tree_size': True}. Best is trial 31 with value: 0.572.
[I 2024-01-31 12:00:01,683] Trial 102 finished with value: 0.456 and parameters: {'tree_size': 14, 'memory_par': 17.94850050116941, 'lin_trim_quantile': 0.8000480581594751, 'exp_rand_tree_size': True}. Best is trial 31 with value: 0.572.
[I 2024-01-31 12:00:22,936] Trial 103 finished with value: 0.544 and parameters: {'tree_size': 11, 'memory_par': 0.0017909209500775174, 'lin_trim_quantile': 0.7390399854161661, 'exp_rand_tree_size': True}. Best is trial 31 with value: 0.572.
[I 2024-01-31 12:00:57,447] Trial 104 fini