In [None]:
import optuna
import pandas as pd, numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load Data and Preprocessing

In [None]:
# https://www.kaggle.com/code/alvinleenh/gridworld-sales-eda-ctb-prediction
# Read the supplemental cities table first so it can be joined onto each split as that split is loaded.
sup = pd.read_csv('/kaggle/input/predicting-sales-quantity-in-our-dynamic-gridworld/supplemental_cities.csv')
submission = pd.read_csv('/kaggle/input/predicting-sales-quantity-in-our-dynamic-gridworld/sample_submission.csv')

# Join on 'city_id' with the pandas default (inner) merge, so rows whose city_id
# has no match in `sup` are not kept.
train = pd.read_csv('/kaggle/input/predicting-sales-quantity-in-our-dynamic-gridworld/train.csv').merge(sup, on='city_id')
test = pd.read_csv('/kaggle/input/predicting-sales-quantity-in-our-dynamic-gridworld/test.csv').merge(sup, on='city_id')

In [None]:
# Preview only the first rows of the merged training frame instead of rendering the whole frame.
train.head()

In [None]:
for data in [train, test]:
    # Re-running this cell used to raise KeyError because 'city_id' had already
    # been dropped in place; skip frames that were processed on an earlier run.
    if 'city_id' not in data.columns:
        continue
    # Split the 'city_id' column ("x/y/z") into three integer columns
    data[['city_x', 'city_y', 'city_z']] = data['city_id'].str.split('/', expand=True).astype(int)
    # The raw string id is no longer needed once its parts are separate columns.
    data.drop(columns='city_id', inplace=True)

In [None]:
# Target: the 'quantity' column of the training frame.
y = train['quantity']
# Features: every other column of the training frame.
X = train.drop('quantity', axis=1)

In [None]:
# Hold out 10% of the training rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.1,
)

# Evaluation sets handed to XGBoost during fitting: the training part first,
# the held-out part last.
eval_set = [
    (X_train, y_train),
    (X_test, y_test),
]

In [None]:

def objective(trial):
    """Optuna objective: fit an XGBRegressor with sampled hyperparameters and return hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``, ``max_leaves``
        and ``learning_rate``.

    Returns
    -------
    float
        Root mean squared error of the fitted model on the module-level
        ``(X_test, y_test)`` hold-out split (lower is better).

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``eval_set`` variables.
    """
    xgb_parameters = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'max_leaves': trial.suggest_int('max_leaves', 20, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.9),
    }
    # early_stopping_rounds is an estimator parameter (XGBoost >= 1.6); passing it
    # to fit() is deprecated and no longer accepted by newer XGBoost releases.
    model = XGBRegressor(early_stopping_rounds=5, **xgb_parameters)

    # XGBoost uses the last entry of eval_set for early stopping.
    model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    y_pred = model.predict(X_test)
    # Take the square root explicitly instead of `squared=False`, which newer
    # scikit-learn releases no longer accept.
    score = float(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(score)
    return score

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
study = optuna.create_study(direction='minimize', study_name='XGB')
study.optimize(objective, n_trials=1000)

In [None]:
# Export the study's trials to a DataFrame for inspection.
trials_df = study.trials_dataframe()

In [None]:
# Show the trials ordered from the lowest to the highest objective value.
trials_df.sort_values(by='value')

In [None]:
# Display the hyperparameters of the study's best trial.
study.best_params