# 4.0_GPB1_tune.py
import pandas as pd
import numpy as np
import optuna
from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import gpboost as gpb
# Set print options
np.set_printoptions(suppress=True, precision=6, edgeitems = 7)
pd.options.display.float_format = '{:.6f}'.format
pd.set_option('display.max_columns', None)
random_state = 1923
# Read data
df = pd.read_csv("./InputData/full_data.csv")
# Reindex data from 0:N
df = df.reset_index(drop = True)
# Drop rows with extreme durations (above 3 hours)
high_end = 10800 # 3 hours in seconds
df = df[df["duration"] <= high_end]
# Split features & target, drop non-feature columns
X = df.drop(["created_at", "actual_delivery_time", "duration"], axis = 1)
y = df.duration
# Separate the grouping variable (store_id) from the features
G = X["store_id"]
X = X.drop(["store_id"], axis = 1)
# Cast the grouping variable to string
G = G.astype("str")
# Train - val - test split, 60 - 20 - 20 (sequential slices, no shuffling)
train_end = int(len(df) * 0.6)
val_end = train_end + int(len(df) * 0.2)
X_train, X_val, X_test = X[:train_end], X[train_end:val_end], X[val_end:]
G_train, G_val, G_test = G[:train_end], G[train_end:val_end], G[val_end:]
y_train, y_val, y_test = y[:train_end], y[train_end:val_end], y[val_end:]
# Target encoder for non-group categoricals, without hierarchy
encoder = TargetEncoder(cols = ["market_id", "order_protocol", "store_primary_category"])
# Preprocess data
X_train = encoder.fit_transform(X_train, y_train)
X_val = encoder.transform(X_val)
X_test = encoder.transform(X_test)
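# Optional sanity check (an addition, not in the original script): confirm the
# splits partition the data and that encoding left no non-numeric feature columns.
# The second assert assumes the three encoded columns were the only categoricals.
assert len(X_train) + len(X_val) + len(X_test) == len(df)
assert X_train.select_dtypes(exclude = "number").shape[1] == 0, \
    "Non-numeric feature columns remain after target encoding"
print(f"Train / val / test sizes: {len(X_train)} / {len(X_val)} / {len(X_test)}")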
# Objective function
def objective_gpb(trial):
    # Define hyperparameter space
    learning_rate = trial.suggest_float("learning_rate", 0.05, 1)
    num_leaves = trial.suggest_int("num_leaves", 2**2, 2**10)
    #max_depth = trial.suggest_int("max_depth", 2, 20) # Max depth of 20 is too restrictive for LightGBM
    min_child_samples = trial.suggest_int("min_child_samples", 10, 1000, log = True)
    min_child_weight = trial.suggest_float("min_child_weight", 0.001, 20, log = True)
    reg_alpha = trial.suggest_float("l1_reg", 0, 1)
    reg_lambda = trial.suggest_float("l2_reg", 0, 2)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.25, 1)
    # Create model
    #callbacks = [gpb.early_stopping(50)]
    model = gpb.GPBoostRegressor(
        n_jobs = 10,
        #device_type = "gpu",
        n_estimators = 300,
        num_leaves = num_leaves,
        random_state = random_state,
        #max_depth = max_depth,
        max_depth = -1,
        min_child_samples = min_child_samples,
        learning_rate = learning_rate,
        min_child_weight = min_child_weight,
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        colsample_bytree = colsample_bytree
    )
    # Create random effects model
    gp_model = gpb.GPModel(
        group_data = G_train, # Random intercepts for each group
        likelihood = "gaussian",
        seed = random_state
    )
    gp_model.set_prediction_data(group_data_pred = G_val)
    # Train model with early stopping
    model.fit(
        X_train,
        y_train,
        gp_model = gp_model,
        eval_set = [(X_val, y_val)],
        early_stopping_rounds = 50,
        #callbacks = callbacks,
        verbose = False)
    # Report best number of rounds
    trial.set_user_attr("n_rounds", (model.best_iteration_ + 1))
    return model.best_score_['valid_0']['l2']
# Create study
study_gpb = optuna.create_study(
    sampler = optuna.samplers.TPESampler(seed = random_state),
    study_name = "tune_gpb",
    direction = "minimize"
)
# Perform study
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_gpb.optimize(
    objective_gpb,
    n_trials = 500,
    show_progress_bar = True)
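# Brief summary of the best trial (an addition for convenience; the full log is
# saved below). best_value, best_params and best_trial are standard Optuna attributes.
print("Best validation L2:", study_gpb.best_value)
print("Best hyperparameters:", study_gpb.best_params)
print("Best n. of rounds:", study_gpb.best_trial.user_attrs["n_rounds"])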
# Save tuning log
trials_gpb = study_gpb.trials_dataframe().sort_values("value", ascending = True)
trials_gpb.to_csv("./ModifiedData/trials_gpb1.csv", index = False)
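# Sketch of a possible follow-up step (an addition, not part of the tuning run):
# refit on the training set with the best hyperparameters and best number of
# rounds, then score the test set with the metrics imported above. This assumes
# GPBoost's scikit-learn predict() accepts group_data_pred and returns a dict
# with a "response_mean" key when a gp_model is attached, as in GPBoost's
# scikit-learn examples.
params = study_gpb.best_params
model_final = gpb.GPBoostRegressor(
    n_jobs = 10,
    n_estimators = study_gpb.best_trial.user_attrs["n_rounds"],
    num_leaves = params["num_leaves"],
    random_state = random_state,
    max_depth = -1,
    min_child_samples = params["min_child_samples"],
    learning_rate = params["learning_rate"],
    min_child_weight = params["min_child_weight"],
    reg_alpha = params["l1_reg"],
    reg_lambda = params["l2_reg"],
    colsample_bytree = params["colsample_bytree"]
)
gp_model_final = gpb.GPModel(group_data = G_train, likelihood = "gaussian", seed = random_state)
model_final.fit(X_train, y_train, gp_model = gp_model_final)
pred = model_final.predict(X_test, group_data_pred = G_test)
y_pred = pred["response_mean"] # Assumed dict output, per GPBoost sklearn examples
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_pred))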