In [123]:
from pathlib import Path
import numpy as np
import pandas as pd

from xgboost import XGBClassifier

In [None]:
class my_class():
    def __init__(self, d):
        self.d=d
    def __getattr__(self, key):
        return self.d[key]

In [387]:
from sklearn.metrics import accuracy_score
def my_accuracy(y_true, y_pred):
    if len(y_pred.shape)>1:
        true_labels = np.argmax(y_pred, axis=1)
    else:
        true_labels=y_pred

    return -accuracy_score(y_true, true_labels)

In [None]:
score_dict = {}

In [16]:
prefix = "../"

In [792]:
stopping_round=100
dataset = "citeseer"
wildcards = my_class({'dataset': dataset, 'group': 'planetoid', 'round': '1', 'num_train_per_class': '50', 'num_val': '500', 'num_test': 'rest', 'split_seed': '0', 'n_estimators': '5000', 'max_depth': '1', 'clf_seed': '0', 'early_stopping_rounds': f'_{stopping_round}'})

In [807]:
train_per_class = 20

In [808]:
input = [Path(prefix+f"/snakemake_base/splits/{dataset}_planetoid/{train_per_class}_500_rest_0.npz").resolve().absolute(),
         Path(prefix+f"snakemake_base/aggregated_datasets/{dataset}_planetoid_0.pkl").resolve().absolute()]

In [809]:
splits = np.load(input[0])
train_mask = splits["train_mask"]
val_mask = splits["val_mask"]

df  = pd.read_pickle(input[1])
train_df = df[train_mask]
#print("number_of_columns", len(df.columns))
X_train = train_df.drop("labels", axis=1)

y_train = train_df["labels"]
print(df.shape)

(3327, 3704)


In [810]:
from xgboost.callback import TrainingCallback, LearningRateScheduler

In [811]:

class GammaScheduler(TrainingCallback):
    def __init__(self, gamma, decay=0.9):
        self.gamma=gamma
        self.decay=decay
    def after_iteration(self, model, epoch, evals_log):
        #print(model.attributes())
        self.gamma*=self.decay
        model.set_param("min_split_loss", self.gamma)
        #print(dir(model))
        #model.gamma = model.gamma*0.95
        #print(evals_log)

class ChildWeightScheduler(TrainingCallback):
    def __init__(self, min_child_weight, timespan=10, min_size=1):
        self.min_child_weight=int(min_child_weight)
        self.timespan=timespan
        self.min_size=min_size
    def after_iteration(self, model, epoch, evals_log):
        #print(model.attributes())
        if epoch>0 and epoch % self.timespan == 0:
            self.min_child_weight=max(self.min_size, self.min_child_weight-1)
            print("new_child_weight", self.min_child_weight)
            model.set_param("min_child_weight", self.min_child_weight)

In [832]:
class LRScheduler(TrainingCallback):
    def __init__(self, gamma, decay=0.95, min_lr=0.03):
        self.start_gamma=gamma
        self.gamma=gamma
        self.decay=decay
        self.min_lr = min_lr
        self.min_message=True
    def after_iteration(self, model, epoch, evals_log):
        #print(model.attributes())
        new_gamma = max(self.gamma * self.decay, self.min_lr)
        if new_gamma==self.min_lr and self.min_message:
            self.min_message=False
            print("min lr reached")
        self.gamma=new_gamma
        model.set_param("learning_rate", self.gamma)
        #print(dir(model))
        #model.gamma = model.gamma*0.95
        #print(evals_log)

        
class ChildWeightSampleScheduler(TrainingCallback):
    def __init__(self, timespan=10, start_sample=0.3, sample_step=0.05, max_sample=1.0):
        self.start_sample=start_sample
        self.sample=start_sample
        self.sample_step = sample_step
        self.min_child_weight=int(max(min(self.sample*train_per_class/2.3,10),3))
        self.timespan=timespan
        self.min_size=max(int(self.sample*train_per_class/7),3)
        self.max_sample=max_sample
        
        
    def after_iteration(self, model, epoch, evals_log):
        #print(model.attributes())
        if epoch>0 and epoch % self.timespan == 0:
            new_size = max(self.min_size, self.min_child_weight-1)
            if new_size<self.min_child_weight:
                self.min_child_weight=new_size
                print("new_child_weight", self.min_child_weight)
                model.set_param("min_child_weight", self.min_child_weight)
            else:
                if self.sample==self.max_sample:
                    return
                self.sample+=self.sample_step
                new_sample = min(self.sample, self.max_sample)

                self.sample=new_sample
                print("new sample", self.sample)
                print(self.sample, train_per_class, self.sample*train_per_class)
                model.set_param("subsample", self.sample)
                #model.set_param("colsample_bylevel", self.sample)
                #model.set_param("colsample_bynode", self.sample)
                #model.set_param("colsample_bytree", self.sample)
                
                
                #self.min_child_weight=int(self.sample*train_per_class/7)
                #self.min_size=int(self.sample*train_per_class/7)
                #
                #print("new_child_weight", self.min_child_weight)
                #model.set_param("min_child_weight", self.min_child_weight)


In [833]:
# best at 50:
50,
  ('n_estimators', 5000),
  ('max_depth', 15),
  ('learning_rate', 0.3),
  ('objective', 'multi:softmax'),
  ('random_state', '0'),
  ('eval_metric', <function __main__.my_accuracy(y_true, y_pred)>),
  ('disable_default_eval_metric', 1),
  ('n_jobs', 1),
  ('early_stopping_rounds', 100),
  ('min_child_weight', 3),
  ('subsample', 0.1),
  ('colsample_bytree', 0.5),
  ('colsample_bylevel', 0.5),
  ('colsample_bynode', 0.5),
  ('callbacks',
   (<__main__.ChildWeightSampleScheduler at 0x7ff0407de8f0>,
    <__main__.LRScheduler at 0x7ff040a40940>)),
  ('start_sample', 0.1),
  ('sample', 0.9),
  ('sample_step', 0.05),
  ('min_child_weight', 3),
  ('timespan', 5),
  ('min_size', 5),
  ('max_sample', 0.9),
  ('start_gamma', 0.3),
  ('gamma', 0.01),
  ('decay', 0.97),
  ('min_lr', 0.01)): -0.628,

# best at 100
 100,
  ('n_estimators', 5000),
  ('max_depth', 10),
  ('learning_rate', 0.03),
  ('objective', 'multi:softmax'),
  ('random_state', '0'),
  ('eval_metric', <function __main__.my_accuracy(y_true, y_pred)>),
  ('disable_default_eval_metric', 1),
  ('n_jobs', 1),
  ('early_stopping_rounds', 150),
  ('min_child_weight', 10),
  ('subsample', 0.4),
  ('colsample_bytree', 0.4),
  ('colsample_bylevel', 0.4),
  ('colsample_bynode', 0.4),
  ('callbacks',
   (<__main__.ChildWeightSampleScheduler at 0x7ff041e6f640>,))): -0.716,
 ('citeseer',
  100,
  ('n_estimators', 5000),
  ('max_depth', 10),
  ('learning_rate', 0.03),
  ('objective', 'multi:softmax'),
  ('random_state', '0'),
  ('eval_metric', <function __main__.my_accuracy(y_true, y_pred)>),
  ('disable_default_eval_metric', 1),
  ('n_jobs', 1),
  ('early_stopping_rounds', 50),
  ('min_child_weight', 10),
  ('subsample', 0.4),
  ('colsample_bytree', 0.4),
  ('colsample_bylevel', 0.4),
  ('colsample_bynode', 0.4),
  ('callbacks',
   (<__main__.ChildWeightSampleScheduler at 0x7ff042f960e0>,))): -0.716,

IndentationError: unexpected indent (2521361648.py, line 2)

In [836]:
val_df = df[val_mask]
X_val = val_df.drop("labels", axis=1)
y_val = val_df["labels"]
sample={10:1,20:0.5,50:0.45, 100:0.3}[train_per_class]
child_scheduler = ChildWeightSampleScheduler(timespan=5, start_sample=.45, sample_step=0.01, max_sample=0.7 )
lr_scheduler = LRScheduler(0.2, 0.99, min_lr=0.05)
print(child_scheduler.min_child_weight)
params = dict( 
    n_estimators=int(wildcards.n_estimators),
    max_depth=15,
    learning_rate=0.2,
    objective='multi:softmax',
    random_state=wildcards.clf_seed,
    eval_metric=my_accuracy,
    disable_default_eval_metric= 1,
    n_jobs=1,
    early_stopping_rounds=int(wildcards.early_stopping_rounds[1:]),
    min_child_weight=child_scheduler.min_child_weight,
#    tree_method="approx",
#    gamma=1,
    subsample=sample,
    colsample_bytree = 0.5,
    #max_delta_step=0.2
#    reg_lambda = 50,
#    reg_alpha = 1.5,
    colsample_bylevel = 0.5,
    colsample_bynode = 0.5,
    callbacks = (child_scheduler,lr_scheduler)
)
bst = XGBClassifier(**params

)
result = bst.fit(X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=True)
#print(dir(bst))
#print("EVAL RESULTS", bst.evals_result())
train_score = my_accuracy(y_train, bst.predict(X_train))
print("Best_iteration", bst.best_iteration, f"delta={bst.best_score-train_score:.3}", "best_score", bst.best_score)
score_dict[(dataset, train_per_class,)+tuple(params.items())+tuple(child_scheduler.__dict__.items())+tuple(lr_scheduler.__dict__.items())]=(bst.best_score)

3
[0]	validation_0-my_accuracy:-0.20000	validation_1-my_accuracy:-0.09000
[1]	validation_0-my_accuracy:-0.23333	validation_1-my_accuracy:-0.10400
[2]	validation_0-my_accuracy:-0.26667	validation_1-my_accuracy:-0.10800
[3]	validation_0-my_accuracy:-0.26667	validation_1-my_accuracy:-0.19200
[4]	validation_0-my_accuracy:-0.17500	validation_1-my_accuracy:-0.15600
new sample 0.46
0.46 20 9.200000000000001
[5]	validation_0-my_accuracy:-0.20833	validation_1-my_accuracy:-0.16000
[6]	validation_0-my_accuracy:-0.29167	validation_1-my_accuracy:-0.16200
[7]	validation_0-my_accuracy:-0.32500	validation_1-my_accuracy:-0.17200
[8]	validation_0-my_accuracy:-0.33333	validation_1-my_accuracy:-0.24400
[9]	validation_0-my_accuracy:-0.42500	validation_1-my_accuracy:-0.38400
new sample 0.47000000000000003
0.47000000000000003 20 9.4
[10]	validation_0-my_accuracy:-0.35833	validation_1-my_accuracy:-0.32800
[11]	validation_0-my_accuracy:-0.45000	validation_1-my_accuracy:-0.37200
[12]	validation_0-my_accuracy:-0

In [824]:
child_scheduler.__dict__

{'start_sample': 0.45,
 'sample': 0.51,
 'sample_step': 0.01,
 'min_child_weight': 3,
 'timespan': 5,
 'min_size': 3,
 'max_sample': 0.7}

In [720]:
print(dataset, train_per_class)

citeseer 50


In [670]:
0.98**200

0.0175879466057215

In [783]:
score_dict

{(20,
  ('n_estimators', 5000),
  ('max_depth', 1),
  ('learning_rate', 0.03),
  ('objective', 'multi:softmax'),
  ('random_state', '0'),
  ('eval_metric', <function __main__.my_accuracy(y_true, y_pred)>),
  ('disable_default_eval_metric', 1),
  ('n_jobs', 1),
  ('early_stopping_rounds', 50),
  ('min_child_weight', 2),
  ('gamma', 2),
  ('colsample_bytree', 0.5),
  ('reg_lambda', 5),
  ('colsample_bylevel', 0.5),
  ('colsample_bynode', 0.5)): 0.352,
 (20,
  ('n_estimators', 5000),
  ('max_depth', 1),
  ('learning_rate', 0.03),
  ('objective', 'multi:softmax'),
  ('random_state', '0'),
  ('eval_metric', <function __main__.my_accuracy(y_true, y_pred)>),
  ('disable_default_eval_metric', 1),
  ('n_jobs', 1),
  ('early_stopping_rounds', 50),
  ('min_child_weight', 2),
  ('gamma', 2),
  ('subsample', 0.5),
  ('colsample_bytree', 0.5),
  ('reg_lambda', 5),
  ('colsample_bylevel', 0.5),
  ('colsample_bynode', 0.5)): 0.286,
 (50,
  ('n_estimators', 5000),
  ('max_depth', 1),
  ('learning_rate'