In [1]:

import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import*
import optuna
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
dataset = read_csv(url, header=None)
data = dataset.values
x_data, y_data = data[:, :-1],data[:, -1]
x_data = pd.DataFrame(x_data.astype('float64'))
#Label Encoding Target
le = preprocessing.LabelEncoder()
le.fit(y_data)
y_data = le.transform(y_data)
y_data = pd.DataFrame(y_data.astype('float64'))
##compare a validation set with the kfold output from optuna 
features, test_features, target, test_target = train_test_split(x_data, y_data, test_size=0.3, random_state=17)
x_data_lgb = x_data.to_numpy()
y_data_lgb = y_data.to_numpy()


def objective(trial):
    params = {
        'device' : 'gpu',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'verbose' : -100 #leave out verbose parameter to show INFO logs
    }
    # Perform k-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, valid_idx in kf.split(x_data_lgb, y_data_lgb):
        dtrain = lgb.Dataset(x_data_lgb[train_idx], label=y_data_lgb[train_idx])
        dvalid = lgb.Dataset(x_data_lgb[valid_idx], label=y_data_lgb[valid_idx])
        model = lgb.train(params, dtrain, valid_sets=[dvalid], num_boost_round=50)
        valid_pred = model.predict(x_data_lgb[valid_idx])
        score = log_loss(y_data_lgb[valid_idx], valid_pred)
        scores.append(score)
    return np.mean(scores)
study = optuna.create_study(direction='minimize')
# Suppress INFO logs by setting the logging level to WARNING
optuna.logging.set_verbosity(optuna.logging.ERROR)
study.optimize(objective, n_trials=10)
# Access the best hyperparameters and corresponding log loss
best_params = study.best_params
best_log_loss = study.best_value

print(best_params)
print(best_log_loss)



[I 2024-04-17 16:55:58,118] A new study created in memory with name: no-name-a676303e-9c39-49e9-b54d-622267b503eb


{'lambda_l1': 7.634190995582307, 'lambda_l2': 8.073694938705138, 'num_leaves': 8, 'feature_fraction': 0.6392025951065148, 'bagging_fraction': 0.9182770389118676, 'bagging_freq': 1, 'min_child_samples': 25}
0.5368546535643052
