In [12]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
)

df = pd.read_csv("../stats/combined.csv")
X = df.drop(["chr", "start", "end", "cnv_type", "BAM_CREF_SKIP", "BAM_CSOFT_CLIP", "BAM_CHARD_CLIP", "BAM_CPAD", "BAM_CEQUAL", "BAM_CDIFF", "BAM_CBACK"], axis=1)
lbl_e = LabelEncoder()
y = lbl_e.fit_transform(df["cnv_type"])

# Podział na zbiór treningowy i testowy
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
#Define the objective function to optimize
results = []
def objective(trial):
    # Define the hyperparameters to search over
    model_type = trial.suggest_categorical('model_type', ['RandomForest', 'LightGBM', 'XGBoost'])
    n_estimators = trial.suggest_int('n_estimators', 10, 100, step=10)
    max_depth = trial.suggest_int('max_depth', 10, 100, step=10)
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced', {0: 1, 1: 3, 2: 3}])
    log_transform = trial.suggest_categorical('log_transform', [True, False])
    standard_scaler = trial.suggest_categorical('standard_scaler', [True, False])
    undersampling = trial.suggest_categorical('undersampling', [True, False])

    # Preprocess the data based on hyperparameters
    if undersampling:
        # Undersampling klas mniejszościowych (przykład)
        count = Counter(y_train)
        classes_resampling = {2: int(count[2] * 0.3)}
        under = RandomUnderSampler(
            sampling_strategy=classes_resampling, random_state=42
        )
        X_res, y_res = under.fit_resample(X_train, y_train)
    else:
        X_res, y_res = X_train, y_train

    if standard_scaler:
        scaler = StandardScaler()
        X_res = scaler.fit_transform(X_res)
        x_test_res = scaler.transform(x_test)
    
    if log_transform:
        X_res = np.log1p(X_res)
        x_test_res = np.log1p(x_test)


    if model_type == 'RandomForest':
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, class_weight=class_weight, random_state=42, n_jobs=-1)
    elif model_type == 'LightGBM':
        model = LGBMClassifier(n_estimators=n_estimators, max_depth=max_depth, class_weight=class_weight, random_state=42, n_jobs=-1)
    elif model_type == 'XGBoost':
        model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, scale_pos_weight=class_weight, random_state=42, n_jobs=-1)


    # Trenowanie modelu
    model.fit(X_res, y_res)

    # Przewidywanie na zbiorze testowym
    if standard_scaler or log_transform:
        y_pred = model.predict(x_test_res)
    else:
        y_pred = model.predict(x_test)
    fbeta = fbeta_score(y_test, y_pred, beta = 3, average='macro')


    results.append({
        'model': model_type,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'class_weight': class_weight,
        'log_transform': log_transform,
        'standard_scaler': standard_scaler,
        'undersampling': undersampling,
        'fbeta': fbeta,
        'classification_report': classification_report(y_test, y_pred, zero_division=True),
        'confusion_matrix': confusion_matrix(y_test, y_pred)
    })

    return fbeta

# Load your data and preprocess it
# ...

# Split the data into training and testing sets
# ...

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params
best_accuracy = study.best_value

print("Najlepsze hiperparametry:")
print(best_params)
print("Najlepsza dokładność:", best_accuracy)


[I 2023-10-10 01:12:55,673] A new study created in memory with name: no-name-4ffb36d8-dd42-4026-b923-26b4ad45a132


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1768
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -2.282466
[LightGBM] [Info] Start training from score -1.173089
[LightGBM] [Info] Start training from score -0.530080


[I 2023-10-10 01:13:09,469] Trial 0 finished with value: 0.22361611448493712 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 30, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': True, 'undersampling': True}. Best is trial 0 with value: 0.22361611448493712.
  X_res = np.log1p(X_res)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1758
[LightGBM] [Info] Number of data points in the train set: 8570985, number of used features: 9
[LightGBM] [Info] Start training from score -3.146749
[LightGBM] [Info] Start training from score -2.037372
[LightGBM] [Info] Start training from score -0.190389


[I 2023-10-10 01:13:27,388] Trial 1 finished with value: 0.24116456895847774 and parameters: {'model_type': 'LightGBM', 'n_estimators': 50, 'max_depth': 20, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': True, 'undersampling': False}. Best is trial 1 with value: 0.24116456895847774.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1758
[LightGBM] [Info] Number of data points in the train set: 8570985, number of used features: 9
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2023-10-10 01:13:48,327] Trial 2 finished with value: 0.8813943362681701 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 100, 'class_weight': 'balanced', 'log_transform': False, 'standard_scaler': True, 'undersampling': False}. Best is trial 2 with value: 0.8813943362681701.
[I 2023-10-10 01:16:56,899] Trial 3 finished with value: 0.856019306408058 and parameters: {'model_type': 'XGBoost', 'n_estimators': 50, 'max_depth': 50, 'class_weight': None, 'log_transform': False, 'standard_scaler': False, 'undersampling': False}. Best is trial 2 with value: 0.8813943362681701.
Parameters: { "scale_pos_weight" } are not used.

[I 2023-10-10 01:20:52,909] Trial 4 finished with value: 0.8555617598656159 and parameters: {'model_type': 'XGBoost', 'n_estimators': 40, 'max_depth': 100, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': False}. Best is trial 2 with value: 0.8813943362681701.
[I 2023-10-10 01:21:29,829] Tr

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030958 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -2.282466
[LightGBM] [Info] Start training from score -1.173089
[LightGBM] [Info] Start training from score -0.530080


[I 2023-10-10 01:23:33,343] Trial 7 finished with value: 0.8838238844387808 and parameters: {'model_type': 'LightGBM', 'n_estimators': 40, 'max_depth': 10, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 7 with value: 0.8838238844387808.
[I 2023-10-10 01:23:52,696] Trial 8 finished with value: 0.8794311319689303 and parameters: {'model_type': 'RandomForest', 'n_estimators': 20, 'max_depth': 30, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 7 with value: 0.8838238844387808.
[I 2023-10-10 01:24:29,640] Trial 9 finished with value: 0.8717792294942143 and parameters: {'model_type': 'RandomForest', 'n_estimators': 40, 'max_depth': 60, 'class_weight': 'balanced', 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 7 with value: 0.8838238844387808.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -2.282466
[LightGBM] [Info] Start training from score -1.173089
[LightGBM] [Info] Start training from score -0.530080


[I 2023-10-10 01:24:47,965] Trial 10 finished with value: 0.8838269238248437 and parameters: {'model_type': 'LightGBM', 'n_estimators': 90, 'max_depth': 10, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 10 with value: 0.8838269238248437.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -2.282466
[LightGBM] [Info] Start training from score -1.173089
[LightGBM] [Info] Start training from score -0.530080


[I 2023-10-10 01:25:07,893] Trial 11 finished with value: 0.8838121501880014 and parameters: {'model_type': 'LightGBM', 'n_estimators': 100, 'max_depth': 10, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 10 with value: 0.8838269238248437.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -2.282466
[LightGBM] [Info] Start training from score -1.173089
[LightGBM] [Info] Start training from score -0.530080


[I 2023-10-10 01:25:26,251] Trial 12 finished with value: 0.8838269238248437 and parameters: {'model_type': 'LightGBM', 'n_estimators': 90, 'max_depth': 10, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 10 with value: 0.8838269238248437.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -2.282466
[LightGBM] [Info] Start training from score -1.173089
[LightGBM] [Info] Start training from score -0.530080


[I 2023-10-10 01:25:44,597] Trial 13 finished with value: 0.883827084889295 and parameters: {'model_type': 'LightGBM', 'n_estimators': 90, 'max_depth': 30, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 13 with value: 0.883827084889295.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -2.282466
[LightGBM] [Info] Start training from score -1.173089
[LightGBM] [Info] Start training from score -0.530080


[I 2023-10-10 01:26:02,979] Trial 14 finished with value: 0.883827084889295 and parameters: {'model_type': 'LightGBM', 'n_estimators': 90, 'max_depth': 30, 'class_weight': {0: 1, 1: 3, 2: 3}, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 13 with value: 0.883827084889295.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:26:20,153] Trial 15 finished with value: 0.8838288670834564 and parameters: {'model_type': 'LightGBM', 'n_estimators': 80, 'max_depth': 40, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:26:36,185] Trial 16 finished with value: 0.8838243195249453 and parameters: {'model_type': 'LightGBM', 'n_estimators': 70, 'max_depth': 60, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028756 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:26:53,795] Trial 17 finished with value: 0.8838288670834564 and parameters: {'model_type': 'LightGBM', 'n_estimators': 80, 'max_depth': 40, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.
[I 2023-10-10 01:29:18,317] Trial 18 finished with value: 0.8595844092164064 and parameters: {'model_type': 'RandomForest', 'n_estimators': 70, 'max_depth': 40, 'class_weight': None, 'log_transform': False, 'standard_scaler': False, 'undersampling': False}. Best is trial 15 with value: 0.8838288670834564.
[I 2023-10-10 01:31:09,368] Trial 19 finished with value: 0.8692430648851124 and parameters: {'model_type': 'XGBoost', 'n_estimators': 70, 'max_depth': 40, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:31:26,508] Trial 20 finished with value: 0.8838288670834564 and parameters: {'model_type': 'LightGBM', 'n_estimators': 80, 'max_depth': 60, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:31:43,853] Trial 21 finished with value: 0.8838288670834564 and parameters: {'model_type': 'LightGBM', 'n_estimators': 80, 'max_depth': 70, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:32:01,155] Trial 22 finished with value: 0.8838288670834564 and parameters: {'model_type': 'LightGBM', 'n_estimators': 80, 'max_depth': 70, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:32:21,122] Trial 23 finished with value: 0.8838118914678783 and parameters: {'model_type': 'LightGBM', 'n_estimators': 100, 'max_depth': 40, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030640 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:32:38,303] Trial 24 finished with value: 0.8838288670834564 and parameters: {'model_type': 'LightGBM', 'n_estimators': 80, 'max_depth': 80, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:32:54,306] Trial 25 finished with value: 0.8838243195249453 and parameters: {'model_type': 'LightGBM', 'n_estimators': 70, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1758
[LightGBM] [Info] Number of data points in the train set: 8570985, number of used features: 9
[LightGBM] [Info] Start training from score -2.130622
[LightGBM] [Info] Start training from score -2.119857
[LightGBM] [Info] Start training from score -0.272875


[I 2023-10-10 01:33:00,990] Trial 26 finished with value: 0.8482436161952434 and parameters: {'model_type': 'LightGBM', 'n_estimators': 10, 'max_depth': 70, 'class_weight': None, 'log_transform': False, 'standard_scaler': True, 'undersampling': False}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029839 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:33:17,930] Trial 27 finished with value: 0.8838288670834564 and parameters: {'model_type': 'LightGBM', 'n_estimators': 80, 'max_depth': 60, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.
[I 2023-10-10 01:34:41,083] Trial 28 finished with value: 0.8763391882254888 and parameters: {'model_type': 'RandomForest', 'n_estimators': 100, 'max_depth': 40, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.
Parameters: { "scale_pos_weight" } are not used.

[I 2023-10-10 01:35:11,369] Trial 29 finished with value: 0.19222211369384126 and parameters: {'model_type': 'XGBoost', 'n_estimators': 60, 'max_depth': 20, 'class_weight': 'balanced', 'log_transform': True, 'standard_scaler': True, 'undersampling': True}. Best is trial 15 with value: 0.8838288670834564.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.038113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:35:26,096] Trial 30 finished with value: 0.8838367214151394 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:35:40,558] Trial 31 finished with value: 0.8838367214151394 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:35:55,195] Trial 32 finished with value: 0.8838367214151394 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:36:09,607] Trial 33 finished with value: 0.8838367214151394 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.
  X_res = np.log1p(X_res)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058831 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1758
[LightGBM] [Info] Number of data points in the train set: 8570985, number of used features: 9
[LightGBM] [Info] Start training from score -2.130622
[LightGBM] [Info] Start training from score -2.119857
[LightGBM] [Info] Start training from score -0.272875


[I 2023-10-10 01:36:30,045] Trial 34 finished with value: 0.19580140734942988 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': True, 'undersampling': False}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1755
[LightGBM] [Info] Number of data points in the train set: 8570985, number of used features: 9
[LightGBM] [Info] Start training from score -2.130622
[LightGBM] [Info] Start training from score -2.119857
[LightGBM] [Info] Start training from score -0.272875


[I 2023-10-10 01:36:48,764] Trial 35 finished with value: 0.8625456343431935 and parameters: {'model_type': 'LightGBM', 'n_estimators': 50, 'max_depth': 50, 'class_weight': None, 'log_transform': False, 'standard_scaler': False, 'undersampling': False}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2023-10-10 01:37:02,485] Trial 36 finished with value: 0.8813324828274819 and parameters: {'model_type': 'LightGBM', 'n_estimators': 50, 'max_depth': 50, 'class_weight': 'balanced', 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.
[I 2023-10-10 01:40:42,230] Trial 37 finished with value: 0.8653955598243425 and parameters: {'model_type': 'XGBoost', 'n_estimators': 60, 'max_depth': 80, 'class_weight': None, 'log_transform': False, 'standard_scaler': True, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1755
[LightGBM] [Info] Number of data points in the train set: 8570985, number of used features: 9
[LightGBM] [Info] Start training from score -2.130622
[LightGBM] [Info] Start training from score -2.119857
[LightGBM] [Info] Start training from score -0.272875


[I 2023-10-10 01:41:01,668] Trial 38 finished with value: 0.862536092821505 and parameters: {'model_type': 'LightGBM', 'n_estimators': 50, 'max_depth': 60, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': False}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1768
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2023-10-10 01:41:15,501] Trial 39 finished with value: 0.19222184360151207 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 20, 'class_weight': 'balanced', 'log_transform': True, 'standard_scaler': True, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.
[I 2023-10-10 01:41:44,817] Trial 40 finished with value: 0.872441703882617 and parameters: {'model_type': 'RandomForest', 'n_estimators': 30, 'max_depth': 90, 'class_weight': None, 'log_transform': False, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:42:00,589] Trial 41 finished with value: 0.8838243195249453 and parameters: {'model_type': 'LightGBM', 'n_estimators': 70, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:42:15,154] Trial 42 finished with value: 0.8838367214151394 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 40, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:42:29,670] Trial 43 finished with value: 0.8838367214151394 and parameters: {'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035865 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:42:41,158] Trial 44 finished with value: 0.8838324483403422 and parameters: {'model_type': 'LightGBM', 'n_estimators': 40, 'max_depth': 30, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.
[I 2023-10-10 01:45:09,094] Trial 45 finished with value: 0.8664284527249789 and parameters: {'model_type': 'XGBoost', 'n_estimators': 50, 'max_depth': 60, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.369557
[LightGBM] [Info] Start training from score -1.358792
[LightGBM] [Info] Start training from score -0.715783


[I 2023-10-10 01:45:22,586] Trial 46 finished with value: 0.8838343689297449 and parameters: {'model_type': 'LightGBM', 'n_estimators': 50, 'max_depth': 40, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031785 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1765
[LightGBM] [Info] Number of data points in the train set: 4004094, number of used features: 9
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2023-10-10 01:45:34,762] Trial 47 finished with value: 0.8813324097750904 and parameters: {'model_type': 'LightGBM', 'n_estimators': 40, 'max_depth': 70, 'class_weight': 'balanced', 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.
[I 2023-10-10 01:46:21,897] Trial 48 finished with value: 0.880851006346771 and parameters: {'model_type': 'RandomForest', 'n_estimators': 60, 'max_depth': 30, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}. Best is trial 30 with value: 0.8838367214151394.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1755
[LightGBM] [Info] Number of data points in the train set: 8570985, number of used features: 9
[LightGBM] [Info] Start training from score -2.130622
[LightGBM] [Info] Start training from score -2.119857
[LightGBM] [Info] Start training from score -0.272875


[I 2023-10-10 01:46:46,237] Trial 49 finished with value: 0.8625457333638478 and parameters: {'model_type': 'LightGBM', 'n_estimators': 70, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': False}. Best is trial 30 with value: 0.8838367214151394.


Najlepsze hiperparametry:
{'model_type': 'LightGBM', 'n_estimators': 60, 'max_depth': 50, 'class_weight': None, 'log_transform': True, 'standard_scaler': False, 'undersampling': True}
Najlepsza dokładność: 0.8838367214151394


In [15]:
pd.DataFrame(results).to_csv("results.csv", index=False)