In [1]:
import optuna
import pandas as pd
import numpy as np
import sys
sys.path.append('../..')
from helper.lazy_regression import TooLazyForRegression
from helper.feature_extract import extract_highest_amplitude_features_with_mp
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import time
import functools
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
data = pd.read_table('../../data/data_spg.txt', sep=' ')
data = extract_highest_amplitude_features_with_mp(df=data, sensor_types=[['G01', 'G02']], create_one_sensor_feature=True, n_processes=4, keep_columns=False)

INFO || Extracting Max Features for types: ['G01', 'G02']


In [15]:
data_max = data.drop(['start_time'], axis=1)
X, y = data_max.drop(['size_mm'], axis=1), data_max['size_mm']
y =y.astype(np.str)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 102)

In [2]:
def objective(trial):
    #dtrain = xgb.DMatrix(X_train, label=y_train)
    #dvalid = xgb.DMatrix(X_test, label=y_test)
    
    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 15, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    scores = cross_val_score(estimator=xgb.XGBRegressor(**param), 
                             X=X_train, y=y_train, cv=5, scoring='r2')
    score = np.mean(scores)    
    return score

In [15]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600, n_jobs=-1)

[32m[I 2021-11-18 23:18:40,574][0m A new study created in memory with name: no-name-f40227f5-e2d9-4029-a23f-2c7ba8d0793e[0m
[32m[I 2021-11-18 23:18:51,122][0m Trial 10 finished with value: 0.691742962191477 and parameters: {'booster': 'gblinear', 'lambda': 7.570538066602389e-06, 'alpha': 2.408102706002304e-07, 'subsample': 0.3807958395336593, 'colsample_bytree': 0.811691190733562}. Best is trial 10 with value: 0.691742962191477.[0m
[32m[I 2021-11-18 23:18:51,200][0m Trial 1 finished with value: 0.6919960264168279 and parameters: {'booster': 'gblinear', 'lambda': 1.1274948140448687e-05, 'alpha': 1.2515333649168662e-05, 'subsample': 0.6459729265281196, 'colsample_bytree': 0.5748757480373319}. Best is trial 1 with value: 0.6919960264168279.[0m
[32m[I 2021-11-18 23:18:51,245][0m Trial 19 finished with value: 0.691956871897386 and parameters: {'booster': 'gblinear', 'lambda': 4.517851353665061e-06, 'alpha': 4.2740019762417737e-05, 'subsample': 0.6180497254077049, 'colsample_bytre

In [10]:
def objective(trial):
    #dtrain = xgb.DMatrix(X_train, label=y_train)
    #dvalid = xgb.DMatrix(X_test, label=y_test)
    
    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "gpu_hist",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 15, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    scores = cross_val_score(estimator=xgb.XGBRegressor(**param), 
                             X=X_train, y=y_train, cv=5, scoring='r2')
    score = np.mean(scores)    
    return score

In [11]:
start = time.time()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600, n_jobs=-1)
end = time.time()
print(end - start)

[32m[I 2021-11-19 08:27:05,997][0m A new study created in memory with name: no-name-12412e53-7c1c-49dd-afa5-c1463d9b9fd3[0m
[32m[I 2021-11-19 08:27:15,128][0m Trial 17 finished with value: 0.6903664157878662 and parameters: {'booster': 'gblinear', 'lambda': 0.00021753814349399054, 'alpha': 1.22322402995512e-06, 'subsample': 0.27276088517898256, 'colsample_bytree': 0.6589401396065396}. Best is trial 17 with value: 0.6903664157878662.[0m
[32m[I 2021-11-19 08:27:15,215][0m Trial 2 finished with value: 0.6928262833683515 and parameters: {'booster': 'gblinear', 'lambda': 1.1843250783744273e-06, 'alpha': 0.00140463897412675, 'subsample': 0.868365733592126, 'colsample_bytree': 0.529552054146663}. Best is trial 2 with value: 0.6928262833683515.[0m
[32m[I 2021-11-19 08:27:15,266][0m Trial 11 finished with value: 0.6906752768180897 and parameters: {'booster': 'gblinear', 'lambda': 0.022406905885162545, 'alpha': 9.98047800460738e-06, 'subsample': 0.6310995110069253, 'colsample_bytree':

1526.514310836792


In [6]:
def objective(X_train, y_train, X_test, y_test, trial):
    #param_list
    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "gpu_hist",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 15, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    xgboost_tuna = xgb.XGBClassifier(
        random_state=42, 
        **param
    )
    
    xgboost_tuna.fit(X_train, y_train)
    y_pred = xgboost_tuna.predict(X_test)
    
    return r2_score(y_pred,y_test)

In [9]:
start = time.time()
study = optuna.create_study(direction="maximize")
study.optimize(functools.partial(objective, X_train, y_train, X_test, y_test), n_trials=50)
end = time.time()
print(end - start)

[32m[I 2021-11-19 08:26:09,142][0m A new study created in memory with name: no-name-8a2097b2-140d-4d95-80cd-e60eb904e2fb[0m
[33m[W 2021-11-19 08:26:09,143][0m Trial 0 failed because of the following error: TypeError('objective() takes 1 positional argument but 5 were given')[0m
Traceback (most recent call last):
  File "C:\Users\Firat\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
TypeError: objective() takes 1 positional argument but 5 were given


TypeError: objective() takes 1 positional argument but 5 were given

In [12]:
start = time.time()
model= xgb.XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R2-Score:",r2_score(y_pred,y_test))
end = time.time()
print("Time:",end - start)

R2-Score: 0.7667383684682142
Time: 1.3453047275543213


# Optuna mit filtered Features

In [88]:
data_max = data[["max_centroid_frequency_G","max_mab_G","velocity","size_mm"]]
X, y = data_max.drop(['size_mm'], axis=1), np.log(data_max['size_mm'])
y =y.astype(np.str)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 102)

In [89]:
start = time.time()
model= xgb.XGBRegressor(tree_method="gpu_hist")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R2-Score:",r2_score(y_pred,y_test))
end = time.time()
print("Time:",end - start)

R2-Score: 0.8121151145016201
Time: 0.657667875289917


In [10]:
def objective(trial):
    #dtrain = xgb.DMatrix(X_train, label=y_train)
    #dvalid = xgb.DMatrix(X_test, label=y_test)
    
    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "gpu_hist",
        "gpu_id": 0,
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 15, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    scores = cross_val_score(estimator=xgb.XGBRegressor(**param), 
                             X=X_train, y=y_train, cv=5, scoring='r2')
    score = np.mean(scores)    
    return score

In [11]:
start = time.time()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
end = time.time()
print(end - start)

[32m[I 2021-11-19 11:20:37,689][0m A new study created in memory with name: no-name-f3af1ee0-ba7b-443c-9eab-0c040010abd9[0m
[32m[I 2021-11-19 11:20:38,400][0m Trial 0 finished with value: 0.6163269720314869 and parameters: {'booster': 'gblinear', 'lambda': 1.7669525334998298e-06, 'alpha': 3.8278731945671986e-07, 'subsample': 0.7093317233635006, 'colsample_bytree': 0.2563989128660562}. Best is trial 0 with value: 0.6163269720314869.[0m
[32m[I 2021-11-19 11:20:39,000][0m Trial 1 finished with value: 0.6163278684813588 and parameters: {'booster': 'gblinear', 'lambda': 6.240621490580263e-08, 'alpha': 2.2254945193935147e-08, 'subsample': 0.9709451166435186, 'colsample_bytree': 0.22649178207429924}. Best is trial 1 with value: 0.6163278684813588.[0m
[32m[I 2021-11-19 11:20:39,597][0m Trial 2 finished with value: 0.6163284428345003 and parameters: {'booster': 'gblinear', 'lambda': 1.4643344611306307e-05, 'alpha': 5.183233084496974e-08, 'subsample': 0.9927633392666535, 'colsample_by

478.7807705402374


# Randomsearch

In [None]:
random_grid = {
        'max_depth':np.arange(10,15,1),
        'learning_rate': np.arange(0.1, 0.6, 0.1),
        'booster': ['gbtree', 'gblinear', 'dart'],
        'gamma':np.arange(0.1, 1.1, 0.1),
        'reg_alpha':np.arange(0.1, 1.1, 0.1),
        'reg_lambda':np.arange(0.1, 1.1, 0.1),
        'importance_type':['gain ', 'weight', 'cover'],
        'tree_method' : ['gpu_hist']
    }

In [17]:
def hyperParameterTuning_RandomizedSearchCV(X_train, y_train):
    param = {

        # defines booster, gblinear for linear functions.
        "booster": ["gbtree", "gblinear", "dart"],
        # L2 regularization weight.
        "lambda": np.arange(0.1, 1.1, 0.1),
        # L1 regularization weight.
        "alpha": np.arange(0.1, 1.1, 0.1),
        # sampling ratio for training data.
        "subsample": np.arange(0.1, 1.1, 0.1),
        # sampling according to each tree.
        "colsample_bytree": np.arange(0.1, 1.1, 0.1),
        
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = np.arange(3, 15, 1)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = np.arange(2, 10, 0.5)
        param["eta"] = np.arange(0.1, 1.1, 0.1)
        # defines how selective algorithm is.
        param["gamma"] = np.arange(0.1, 1.1, 0.1)
        param["grow_policy"] = ["depthwise", "lossguide"]

    if param["booster"] == "dart":
        param["sample_type"] = ["uniform", "weighted"]
        param["normalize_type"] = ["tree", "forest"]
        param["rate_drop"] = np.arange(0.1, 1.1, 0.1)
        param["skip_drop"] = np.arange(0.1, 1.1, 0.1)



    random_search = RandomizedSearchCV(estimator=XGBRegressor(tree_method = "gpu_hist"),
                                      param_distributions=param,
                                      cv=5, verbose=2, n_iter=100)
    random_search.fit(X,y)
    return print(random_search.best_params_,"\n",random_search.best_score_)

In [18]:
start = time.time()
hyperParameterTuning_RandomizedSearchCV(X, y)
end = time.time()
print(end - start)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END alpha=0.7000000000000001, booster=dart, colsample_bytree=1.0, lambda=0.8, subsample=0.5; total time=   5.2s
[CV] END alpha=0.7000000000000001, booster=dart, colsample_bytree=1.0, lambda=0.8, subsample=0.5; total time=   5.9s
[CV] END alpha=0.7000000000000001, booster=dart, colsample_bytree=1.0, lambda=0.8, subsample=0.5; total time=   4.9s
[CV] END alpha=0.7000000000000001, booster=dart, colsample_bytree=1.0, lambda=0.8, subsample=0.5; total time=   5.7s
[CV] END alpha=0.7000000000000001, booster=dart, colsample_bytree=1.0, lambda=0.8, subsample=0.5; total time=   5.7s
[CV] END alpha=0.6, booster=dart, colsample_bytree=0.5, lambda=0.5, subsample=0.30000000000000004; total time=   5.7s
[CV] END alpha=0.6, booster=dart, colsample_bytree=0.5, lambda=0.5, subsample=0.30000000000000004; total time=   6.0s
[CV] END alpha=0.6, booster=dart, colsample_bytree=0.5, lambda=0.5, subsample=0.30000000000000004; total time=   6.1

# Gpu supported parameter

In [69]:
def hyperParameterTuning_RandomizedSearchCV(X_train, y_train):
    param = {
        "subsample": np.arange(0.05, 1.05, 0.05),
        "colsample_bytree": np.arange(0.05, 1.05, 0.05),
        "colsample_bylevel": np.arange(0.05, 1.05, 0.05),
        "max_bin": np.arange(50, 500, 5),
        "gamma": np.arange(0.1, 1.1, 0.1), #range: [0,∞]
        # L1 regularization weight.
        "alpha": np.arange(0.1, 1.1, 0.1),
        # sampling ratio for training data.
        "subsample": np.arange(0.05, 1.05, 0.05),
        # sampling according to each tree.
            
    }

    random_search = RandomizedSearchCV(estimator=XGBRegressor(tree_method = "gpu_hist"),
                                      param_distributions=param,
                                      cv=5, verbose=2, n_iter=20)
    random_search.fit(X,y)
    return print(random_search.best_params_,"\n",random_search.best_score_)

In [70]:
start = time.time()
hyperParameterTuning_RandomizedSearchCV(X, y)
end = time.time()
print(end - start)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END alpha=0.6, colsample_bylevel=0.2, colsample_bytree=0.45, gamma=0.4, max_bin=310, subsample=0.7000000000000001; total time=   0.4s
[CV] END alpha=0.6, colsample_bylevel=0.2, colsample_bytree=0.45, gamma=0.4, max_bin=310, subsample=0.7000000000000001; total time=   0.3s
[CV] END alpha=0.6, colsample_bylevel=0.2, colsample_bytree=0.45, gamma=0.4, max_bin=310, subsample=0.7000000000000001; total time=   0.3s
[CV] END alpha=0.6, colsample_bylevel=0.2, colsample_bytree=0.45, gamma=0.4, max_bin=310, subsample=0.7000000000000001; total time=   0.3s
[CV] END alpha=0.6, colsample_bylevel=0.2, colsample_bytree=0.45, gamma=0.4, max_bin=310, subsample=0.7000000000000001; total time=   0.3s
[CV] END alpha=0.8, colsample_bylevel=0.05, colsample_bytree=0.2, gamma=0.8, max_bin=420, subsample=0.15000000000000002; total time=   0.3s
[CV] END alpha=0.8, colsample_bylevel=0.05, colsample_bytree=0.2, gamma=0.8, max_bin=420, subsample=0.1

# long ride optuna gpu

In [90]:
def objective(trial):
    #dtrain = xgb.DMatrix(X_train, label=y_train)
    #dvalid = xgb.DMatrix(X_test, label=y_test)
    
    param = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # use exact for small dataset.
        "tree_method": "gpu_hist",
        "gpu_id": 0,
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        
    }

    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 15, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    scores = cross_val_score(estimator=xgb.XGBRegressor(**param), 
                             X=X_train, y=y_train, cv=5, scoring='r2')
    score = np.mean(scores)    
    return score

In [91]:
start = time.time()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=500)
end = time.time()
print(end - start)

[32m[I 2021-11-19 12:58:12,362][0m A new study created in memory with name: no-name-ba99ac6c-ce92-48f5-8483-1bc97ff41e16[0m
[32m[I 2021-11-19 12:58:14,438][0m Trial 0 finished with value: -31.68576040621415 and parameters: {'booster': 'gbtree', 'lambda': 2.184442738188155e-08, 'alpha': 5.2393972956926794e-08, 'subsample': 0.5187137811974085, 'colsample_bytree': 0.9579260449736255, 'max_depth': 5, 'min_child_weight': 6, 'eta': 7.103468011792766e-08, 'gamma': 4.270037084528077e-07, 'grow_policy': 'lossguide'}. Best is trial 0 with value: -31.68576040621415.[0m
[32m[I 2021-11-19 12:58:15,491][0m Trial 1 finished with value: -31.685559139673224 and parameters: {'booster': 'gbtree', 'lambda': 0.011508073692161672, 'alpha': 0.8306677104104288, 'subsample': 0.6523842265934384, 'colsample_bytree': 0.9516143296468023, 'max_depth': 3, 'min_child_weight': 10, 'eta': 1.0127034228506854e-07, 'gamma': 1.1625929395087546e-08, 'grow_policy': 'lossguide'}. Best is trial 1 with value: -31.685559

13409.569886922836


Trial 454 finished with value: 0.8383125767132482 and parameters: {'booster': 'dart', 'lambda': 1.5152615156514128e-07, 'alpha': 0.26592360933017956, 'subsample': 0.918877889986739, 'colsample_bytree': 0.8140490039889527, 'max_depth': 7, 'min_child_weight': 2, 'eta': 0.10644218612472013, 'gamma': 7.640534597687832e-08, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 2.6396107867776647e-05, 'skip_drop': 5.144366755254109e-08}. Best is trial 454 with value: 0.8383125767132482.

In [3]:
data_max = data[["max_centroid_frequency_G","max_mab_G","velocity","size_mm"]]
X, y = data_max.drop(['size_mm'], axis=1), np.log(data_max['size_mm'])
y =y.astype(np.str)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 42)

In [4]:
parameters={
    'tree_method': 'gpu_hist',
    'booster': 'dart',
    'lambda': 1.5152615156514128e-07,
    'alpha': 0.26592360933017956,
    'subsample': 0.918877889986739,
    'colsample_bytree': 0.8140490039889527,
    'max_depth': 7,
    'min_child_weight': 2,
    'eta': 0.10644218612472013,
    'gamma': 7.640534597687832e-08,
    'grow_policy': 'depthwise',
    'sample_type': 'weighted',
    'normalize_type': 'tree',
    'rate_drop': 2.6396107867776647e-05,
    'skip_drop': 5.144366755254109e-08} 

In [5]:
start = time.time()
model= xgb.XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R2-Score:",r2_score(y_pred,y_test))
end = time.time()
print("Time:",end - start)

R2-Score: 0.8076197604491866
Time: 5.762835502624512


In [6]:
start = time.time()
model= xgb.XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R2-Score:",r2_score(y_pred,y_test))
end = time.time()
print("Time:",end - start)

R2-Score: 0.8097026835392929
Time: 1.2158234119415283


# Best model from optuna with all data

In [14]:
data_max = data.drop(['start_time'], axis=1)
X, y = data_max.drop(['size_mm'], axis=1), np.log(data_max['size_mm'])
y =y.astype(np.str)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 102)

In [15]:
start = time.time()
model= xgb.XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R2-Score:",r2_score(y_pred,y_test))
end = time.time()
print("Time:",end - start)

R2-Score: 0.8367933756400202
Time: 5.387394428253174


In [16]:
start = time.time()
model= xgb.XGBRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R2-Score:",r2_score(y_pred,y_test))
end = time.time()
print("Time:",end - start)

R2-Score: 0.833721206253053
Time: 1.3403022289276123


In [19]:
parameters={'subsample': 0.2, 'lambda': 1.0, 'colsample_bytree': 0.8, 'booster': 'dart', 'alpha': 1.0}

In [20]:
start = time.time()
model= xgb.XGBRegressor(**parameters)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("R2-Score:",r2_score(y_pred,y_test))
end = time.time()
print("Time:",end - start)

R2-Score: 0.8184503695961582
Time: 2.7519032955169678
