In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import optuna
import pandas as pd
from tqdm import tqdm
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
X = pd.read_csv("./data/train_feat.csv")
Y = pd.read_csv("./data/train_output.csv")

In [3]:
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.2, random_state=507)

In [4]:
def objective(trial):
    params = {
        "verbose": -1,
        "metric": "mape",
        "num_iterations": trial.suggest_int("num_iterations", 100, 600), 
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 64, 2**10),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
        "max_bin": trial.suggest_int("max_bin", 255, 512),
    }

    model = LGBMRegressor(**params)
    model.fit(x_train, y_train)
    pred = model.predict(x_valid)
    mape = mean_absolute_percentage_error(y_valid, pred)
    # print(f"MAPE: {mape*100}")
    return mape

In [5]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2023-11-11 17:00:53,660] A new study created in memory with name: no-name-4389f503-eed8-495c-8a93-eef59eec7b82
[I 2023-11-11 17:00:54,257] Trial 0 finished with value: 0.10486679696854902 and parameters: {'num_iterations': 151, 'learning_rate': 0.05134320919484361, 'num_leaves': 414, 'subsample': 0.174738299754772, 'colsample_bytree': 0.8401435810538883, 'min_data_in_leaf': 20, 'max_bin': 274}. Best is trial 0 with value: 0.10486679696854902.
[I 2023-11-11 17:00:55,286] Trial 1 finished with value: 0.10163216727694346 and parameters: {'num_iterations': 386, 'learning_rate': 0.04923059788024086, 'num_leaves': 760, 'subsample': 0.45122120910748653, 'colsample_bytree': 0.8407956218754967, 'min_data_in_leaf': 90, 'max_bin': 343}. Best is trial 1 with value: 0.10163216727694346.
[I 2023-11-11 17:00:57,309] Trial 2 finished with value: 0.1038678027228092 and parameters: {'num_iterations': 443, 'learning_rate': 0.1337899326108272, 'num_leaves': 1023, 'subsample': 0.9855414755268451, 'colsa

In [6]:
print('Best hyperparameters:', study.best_params)
print('Best MAPE:', study.best_value)

Best hyperparameters: {'num_iterations': 552, 'learning_rate': 0.013436493534440383, 'num_leaves': 985, 'subsample': 0.764884507170416, 'colsample_bytree': 0.6573794617181706, 'min_data_in_leaf': 13, 'max_bin': 384}
Best MAPE: 0.09383876878890121


In [7]:
pred.to_csv('data/pred.csv', index=False)

NameError: name 'pred' is not defined