In [1]:
from py_boost import SketchBoost
import numpy as np

In [2]:
from importlib import reload
import helper
reload(helper)

<module 'helper' from '/home/davinci/term3/ml_proj/helper.py'>

In [3]:
image_dir = './data/validation'
label_dir = './data/validation_labels'

y_MF, y_BP, y_CC = helper.load_dataset(image_dir, label_dir, cut_per_set=None)

In [4]:
y_MF.shape, y_BP.shape, y_CC.shape

((2904, 489), (2904, 1943), (2904, 320))

In [5]:
X = np.load('./data/vectorization/validation.npz', allow_pickle=True)
X = X['arr_0']

In [6]:
X_train, X_test, y_train, y_test = helper.split_dataset(X, y_BP, test_size=.3)

In [7]:
y_train.shape, y_test.shape

((2032, 1720), (872, 1720))

# PyBoost

Let's play around with different `lr` and `gd_steps`.

In [None]:
lr_values = [1e-2, 1e-3, 1e-4]
gd_steps_values = [5, 10, 20]

best_score = -np.inf
best_params = {}

for lr in lr_values:
    for gd_steps in gd_steps_values:
        model_mf = SketchBoost(
            loss='multilabel', metric='f1', ntrees=20_000,
            lr=lr, es=1_000, lambda_l2=1, gd_steps=gd_steps,
            min_data_in_leaf=10, max_bin=256, max_depth=5,
            verbose=1_000
        )

        model_mf.fit(X_train, y_train, eval_sets=[{'X': X_test, 'y': y_test}])

        y_pred = model_mf.predict(np.array(X_test))
        score = helper.count_f1_max(y_pred, y_test)
        print(f"lr: {lr}, gd_steps: {gd_steps}, F1 Score: {score:.5f}")

        if score > best_score:
            best_score = score
            best_params = {'lr': lr, 'gd_steps': gd_steps}

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_score:.5f}")

[21:19:36] Stdout logging level is INFO.
[21:19:36] GDBT train starts. Max iter 20000, early stopping rounds 1000
[21:19:36] Iter 0; Sample 0, F1_score = 0.0; 
[21:20:02] Iter 1000; Sample 0, F1_score = 0.02305357720150229; 
[21:20:28] Iter 2000; Sample 0, F1_score = 0.02360407592352108; 
[21:20:53] Iter 3000; Sample 0, F1_score = 0.023907885599021537; 
[21:21:19] Iter 4000; Sample 0, F1_score = 0.025682312305476004; 
[21:21:44] Iter 5000; Sample 0, F1_score = 0.026198808118212923; 
[21:22:08] Iter 6000; Sample 0, F1_score = 0.02620830849288307; 
[21:22:33] Iter 7000; Sample 0, F1_score = 0.026311099548206776; 
[21:22:57] Iter 8000; Sample 0, F1_score = 0.0265534027349849; 
[21:23:23] Iter 9000; Sample 0, F1_score = 0.026601241274630302; 
[21:23:48] Iter 10000; Sample 0, F1_score = 0.026678989332000903; 
[21:24:12] Iter 11000; Sample 0, F1_score = 0.026740382141597987; 
[21:24:36] Iter 12000; Sample 0, F1_score = 0.027238117120430224; 
[21:25:02] Iter 13000; Sample 0, F1_score = 0.0273

Since best params are `lr = 0.01` and `gd_steps = 10` we will use them.

In [None]:
model_mf = SketchBoost(
            loss='multilabel', metric='f1', ntrees=20_000,
            lr=.01, es=1_000, lambda_l2=1, gd_steps=10,
            min_data_in_leaf=10, max_bin=256, max_depth=5,
            verbose=1_000
        )

model_mf.fit(X_train, y_train, eval_sets=[{'X': X_test, 'y': y_test}])

y_pred = model_mf.predict(np.array(X_test))
score = helper.count_f1_max(y_pred, y_test)
print(f"lr: {lr}, gd_steps: {gd_steps}, Score: {score:.5f}")

[22:04:46] Stdout logging level is INFO.
[22:04:46] GDBT train starts. Max iter 20000, early stopping rounds 1000
[22:04:46] Iter 0; Sample 0, F1_score = 0.0; 
[22:05:18] Iter 1000; Sample 0, F1_score = 0.023958054620118796; 
[22:05:50] Iter 2000; Sample 0, F1_score = 0.02624877893669352; 
[22:06:20] Iter 3000; Sample 0, F1_score = 0.02731273944178301; 
[22:06:52] Iter 4000; Sample 0, F1_score = 0.027519415673867877; 
[22:07:24] Iter 5000; Sample 0, F1_score = 0.02764096171108459; 
[22:07:55] Iter 6000; Sample 0, F1_score = 0.02782178168325158; 
[22:08:26] Iter 7000; Sample 0, F1_score = 0.0277995541155098; 
[22:08:42] Early stopping at iter 7482, best iter 6482, best_score 0.027832133049662353
lr: 0.0001, gd_steps: 20, F1 Score: 0.21381


# PyBoost with ICA dim red

Let's check if ICA increses score, as before.

In [22]:
from sklearn.decomposition import FastICA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

n_components = 256
pipeline_transform = Pipeline([
    ('scaler', StandardScaler()),
    ('ica', FastICA(n_components=n_components, random_state=42, max_iter=2_000)),
])

pipeline_transform.fit(X_train)

X_train_transformed = pipeline_transform.transform(X_train)
X_test_transformed = pipeline_transform.transform(X_test)



In [23]:
model_mf = SketchBoost(
            loss='multilabel', metric='f1', ntrees=20_000,
            lr=.01, es=1_000, lambda_l2=1, gd_steps=10,
            min_data_in_leaf=10, max_bin=256, max_depth=5,
            verbose=1_000
        )

model_mf.fit(X_train_transformed, y_train, eval_sets = [{'X': X_test_transformed, 'y': y_test}])

[22:13:37] Stdout logging level is INFO.
[22:13:37] GDBT train starts. Max iter 20000, early stopping rounds 1000
[22:13:38] Iter 0; Sample 0, F1_score = 0.0; 
[22:14:02] Iter 1000; Sample 0, F1_score = 0.02127316340148577; 
[22:14:29] Iter 2000; Sample 0, F1_score = 0.021717544033652277; 
[22:14:53] Iter 3000; Sample 0, F1_score = 0.021862141923949797; 
[22:15:18] Iter 4000; Sample 0, F1_score = 0.022013651024497177; 
[22:15:42] Iter 5000; Sample 0, F1_score = 0.022054798613758655; 
[22:15:53] Early stopping at iter 5390, best iter 4390, best_score 0.02207241589716595


<py_boost.gpu.sketch_boost.SketchBoost at 0x7f08b8b6eec0>

In [24]:
y_pred = model_mf.predict(np.array(X_test))
print(f"{helper.count_f1_max(y_pred, y_test):.5f}")

0.10608


So, ICA works best with classic ML then.