In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [2]:
# Load the pre-folded training data (it carries a `kfold` column), the test
# set and the submission template.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the row id, the label and the fold marker is a feature;
# feature columns whose name starts with "cat" are treated as categorical.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() gives an independent frame: new columns are assigned to df_test
# below, and writing into a column slice triggers SettingWithCopyWarning.
df_test = df_test[useful_features].copy()

# Take the fold ids from the data rather than hard-coding 5, so that every
# row ends up in exactly one validation part and the test average divides by
# the real number of folds.
fold_ids = sorted(df["kfold"].unique())

# Out-of-fold target (mean) encoding of each categorical column:
#   * rows of fold k get the per-category target mean computed on all the
#     other folds, so a row's own target never feeds its own encoding;
#   * the test set gets the average of the per-fold encodings.
# Categories that do not occur in the fitting part map to NaN.
for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in fold_ids:
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean").to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        test_fold_feat = df_test[col].map(feat)
        if temp_test_feat is None:
            temp_test_feat = test_fold_feat
        else:
            temp_test_feat = temp_test_feat + test_fold_feat

    # Guard keeps the cell runnable even if the frame has no folds at all.
    if temp_df:
        df_test[f"tar_enc_{col}"] = temp_test_feat / len(fold_ids)
        # Rows are now ordered fold by fold; ignore_index avoids the duplicate
        # index labels that concatenating the reset-index parts would leave.
        df = pd.concat(temp_df, ignore_index=True)


# Refresh the feature lists so they include the new tar_enc_* columns, and
# give df_test the same column order as the training features.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features].copy()

In [3]:
# run() appends the validation predictions of every trial to this list, so it
# grows by one array per Optuna trial.
final_predictions = []
# Not written to by any code visible in this cell.
scores = []


def run(trial, fold=0):
    """Optuna objective: train an XGBoost regressor and score one held-out fold.

    Hyperparameters are sampled from ``trial``; the model is fitted on all rows
    whose ``kfold`` differs from ``fold`` and evaluated on the rows where it
    equals ``fold``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        fold: Value of ``df.kfold`` to hold out for validation. Defaults to 0,
            which is what the original hard-coded.

    Returns:
        RMSE of the model's predictions on the held-out fold.

    Side effects:
        Appends the validation predictions to the module-level
        ``final_predictions`` list.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and matches the call style used for learning_rate above.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy(): the categorical columns are overwritten below, and assigning
    # into a column slice of another frame raises SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training part only, then apply it to validation.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,  # NOTE(review): device ordinal 1 -- confirm this matches the target machine
        predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    final_predictions.append(preds_valid)
    # sqrt(MSE) instead of squared=False: that keyword is deprecated and
    # removed in newer scikit-learn, while this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    return rmse

In [4]:
# Minimise the validation RMSE returned by run(). The sampler is seeded so its
# random draws are the same on every Restart & Run All; the trial scores can
# still vary if the model training itself is not deterministic.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(run, n_trials=10)

[32m[I 2021-09-04 14:10:17,577][0m A new study created in memory with name: no-name-6fc4468b-0613-400d-8e61-78269c0cac58[0m


[0]	validation_0-rmse:7.63534
[1000]	validation_0-rmse:0.72073
[2000]	validation_0-rmse:0.71996
[2073]	validation_0-rmse:0.71995


[32m[I 2021-09-04 14:10:39,932][0m Trial 0 finished with value: 0.7199329449691756 and parameters: {'learning_rate': 0.01875533650797952, 'reg_lambda': 2.2506405295295566e-05, 'reg_alpha': 0.002985147693490237, 'subsample': 0.3420725639032903, 'colsample_bytree': 0.20808619404301743, 'max_depth': 7}. Best is trial 0 with value: 0.7199329449691756.[0m


[0]	validation_0-rmse:7.23814
[783]	validation_0-rmse:0.72199


[32m[I 2021-09-04 14:10:47,090][0m Trial 1 finished with value: 0.7207733644988662 and parameters: {'learning_rate': 0.0703337432475698, 'reg_lambda': 0.000992596761534797, 'reg_alpha': 0.00014971266201840226, 'subsample': 0.4034888082166066, 'colsample_bytree': 0.14729062173253402, 'max_depth': 6}. Best is trial 0 with value: 0.7199329449691756.[0m


[0]	validation_0-rmse:7.59723
[1000]	validation_0-rmse:0.72214
[1947]	validation_0-rmse:0.72177


[32m[I 2021-09-04 14:11:00,618][0m Trial 2 finished with value: 0.7216220645872072 and parameters: {'learning_rate': 0.02374358855656539, 'reg_lambda': 3.544519416645159e-07, 'reg_alpha': 3.49333326120028e-08, 'subsample': 0.1809567234977425, 'colsample_bytree': 0.3734011179474789, 'max_depth': 6}. Best is trial 0 with value: 0.7199329449691756.[0m


[0]	validation_0-rmse:6.57598
[369]	validation_0-rmse:0.73731


[32m[I 2021-09-04 14:11:05,098][0m Trial 3 finished with value: 0.7278588837530366 and parameters: {'learning_rate': 0.15672279974768688, 'reg_lambda': 3.6155217551924306e-07, 'reg_alpha': 3.556836777301639e-05, 'subsample': 0.2701148863965662, 'colsample_bytree': 0.6938084871622449, 'max_depth': 6}. Best is trial 0 with value: 0.7199329449691756.[0m


[0]	validation_0-rmse:7.40620
[1000]	validation_0-rmse:0.72008
[1579]	validation_0-rmse:0.71995


[32m[I 2021-09-04 14:11:14,940][0m Trial 4 finished with value: 0.7198280185719244 and parameters: {'learning_rate': 0.04859894094611356, 'reg_lambda': 1.2991878358044703e-07, 'reg_alpha': 0.19755873782670785, 'subsample': 0.903924438756917, 'colsample_bytree': 0.7594754112388074, 'max_depth': 5}. Best is trial 4 with value: 0.7198280185719244.[0m


[0]	validation_0-rmse:7.28576
[1000]	validation_0-rmse:0.71982
[1549]	validation_0-rmse:0.71976


[32m[I 2021-09-04 14:11:22,174][0m Trial 5 finished with value: 0.7196697288789504 and parameters: {'learning_rate': 0.06415103177137244, 'reg_lambda': 3.558371376725628, 'reg_alpha': 9.795339970249134e-06, 'subsample': 0.20032877560092, 'colsample_bytree': 0.14245364366071778, 'max_depth': 4}. Best is trial 5 with value: 0.7196697288789504.[0m


[0]	validation_0-rmse:7.67803
[1000]	validation_0-rmse:0.72572
[2000]	validation_0-rmse:0.72184
[3000]	validation_0-rmse:0.72024
[4000]	validation_0-rmse:0.71928
[5000]	validation_0-rmse:0.71874
[6000]	validation_0-rmse:0.71841
[6999]	validation_0-rmse:0.71827


[32m[I 2021-09-04 14:11:46,528][0m Trial 6 finished with value: 0.7182668489342742 and parameters: {'learning_rate': 0.013213246765445165, 'reg_lambda': 0.17448566123187914, 'reg_alpha': 0.001024018077221822, 'subsample': 0.44480859206754253, 'colsample_bytree': 0.2265809905715841, 'max_depth': 4}. Best is trial 6 with value: 0.7182668489342742.[0m


[0]	validation_0-rmse:7.57730
[1000]	validation_0-rmse:0.72093
[2000]	validation_0-rmse:0.72032
[2026]	validation_0-rmse:0.72035


[32m[I 2021-09-04 14:12:03,175][0m Trial 7 finished with value: 0.7202124480872998 and parameters: {'learning_rate': 0.026344985887918435, 'reg_lambda': 0.1061107953033534, 'reg_alpha': 0.4051014405518432, 'subsample': 0.7302404188227508, 'colsample_bytree': 0.9536313865176592, 'max_depth': 6}. Best is trial 6 with value: 0.7182668489342742.[0m


[0]	validation_0-rmse:7.49140
[1000]	validation_0-rmse:0.73074
[2000]	validation_0-rmse:0.72807
[3000]	validation_0-rmse:0.72670
[4000]	validation_0-rmse:0.72581
[5000]	validation_0-rmse:0.72498
[6000]	validation_0-rmse:0.72445
[6999]	validation_0-rmse:0.72382


[32m[I 2021-09-04 14:12:17,757][0m Trial 8 finished with value: 0.7238029661984259 and parameters: {'learning_rate': 0.03744181191383959, 'reg_lambda': 0.000371987621834347, 'reg_alpha': 0.0007558923236813879, 'subsample': 0.10524876911641057, 'colsample_bytree': 0.7301902477829995, 'max_depth': 1}. Best is trial 6 with value: 0.7182668489342742.[0m


[0]	validation_0-rmse:6.04834
[1000]	validation_0-rmse:0.72505
[2000]	validation_0-rmse:0.72229
[3000]	validation_0-rmse:0.72111
[4000]	validation_0-rmse:0.72052
[5000]	validation_0-rmse:0.71991
[6000]	validation_0-rmse:0.71961
[6999]	validation_0-rmse:0.71910


[32m[I 2021-09-04 14:12:32,861][0m Trial 9 finished with value: 0.7190864695832339 and parameters: {'learning_rate': 0.22503325691364856, 'reg_lambda': 3.7632988603423764e-06, 'reg_alpha': 33.94342400947907, 'subsample': 0.25676993146380006, 'colsample_bytree': 0.7743793805567915, 'max_depth': 1}. Best is trial 6 with value: 0.7182668489342742.[0m


In [5]:
# Display the hyperparameters of the best (lowest objective value) trial.
study.best_trial.params

{'learning_rate': 0.013213246765445165,
 'reg_lambda': 0.17448566123187914,
 'reg_alpha': 0.001024018077221822,
 'subsample': 0.44480859206754253,
 'colsample_bytree': 0.2265809905715841,
 'max_depth': 4}