In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

from skopt import BayesSearchCV
from skopt.space import Integer, Real

from utils import CustomEarlyStopper

In [2]:
# Load the prepared modelling dataset from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='prepared_data_ml.csv')

In [3]:
# Set aside user_id / curr_orig / country_reg in their own frame (an independent
# copy), then remove them from `df` so that only the remaining columns are kept.
df_remain = df.loc[:, ['user_id', 'curr_orig', 'country_reg']].copy()
df = df.drop(['user_id', 'curr_orig', 'country_reg'], axis=1)

In [4]:
# Show the column labels that remain in `df` after user_id, curr_orig and
# country_reg were moved out of it.
df.columns

Index(['weeks', 'total_avrg', 'lower_bound', 'upper_bound', 'variation',
       'deposit_count', 'total_summ', 'lifetime', 'gross_summ', 'outliers',
       'y'],
      dtype='object')

In [5]:
# Display the modelling frame; pandas abbreviates the output to the leading and
# trailing rows.
# NOTE(review): the previewed rows repeat the same feature values and differ only
# in `y` — confirm this repetition is intended, since a purely random train/test
# split can then place near-identical rows on both sides.
df

Unnamed: 0,weeks,total_avrg,lower_bound,upper_bound,variation,deposit_count,total_summ,lifetime,gross_summ,outliers,y
0,2,28.323333,24.630123,35.763723,0.181707,13.0,392.56,2604.0,27.841667,0.0,32.16
1,2,28.323333,24.630123,35.763723,0.181707,13.0,392.56,2604.0,27.841667,0.0,32.16
2,2,28.323333,24.630123,35.763723,0.181707,13.0,392.56,2604.0,27.841667,0.0,37.51
3,2,28.323333,24.630123,35.763723,0.181707,13.0,392.56,2604.0,27.841667,0.0,32.16
4,2,28.323333,24.630123,35.763723,0.181707,13.0,392.56,2604.0,27.841667,0.0,53.11
...,...,...,...,...,...,...,...,...,...,...,...
173869,5,23.273333,20.531894,26.014772,0.124038,3.0,69.82,29.0,24.101667,0.0,24.56
173870,5,23.273333,20.531894,26.014772,0.124038,3.0,69.82,29.0,24.101667,0.0,12.29
173871,5,23.273333,20.531894,26.014772,0.124038,3.0,69.82,29.0,24.101667,0.0,12.31
173872,5,14.363333,14.357850,14.368816,0.100000,3.0,43.09,29.0,14.361667,0.0,14.28


In [6]:
# Hyper-parameter search space handed to BayesSearchCV.
# Integer/Real describe inclusive numeric ranges; plain lists are treated as a
# fixed set of candidate values.
param_dist = dict(
    # Ensemble size and tree shape.
    n_estimators=Integer(32, 128),
    max_depth=Integer(2, 10),
    min_child_weight=Integer(1, 5),
    # Row / column sampling fractions.
    subsample=Real(0.1, 0.5),
    colsample_bytree=Real(0.1, 0.9),
    # Split penalty and L1 / L2 regularisation candidates.
    gamma=Real(0, 0.5),
    reg_alpha=[0, 0.01, 0.1, 1],
    reg_lambda=[1, 1.5, 2, 3],
    # Shrinkage applied to each boosting step.
    learning_rate=Real(0.001, 0.1),
)

In [7]:
# Separate the regression target `y` from the feature columns.
y = df['y']
X = df.drop(['y'], axis=1)

# Hold out 20% of the rows for the final evaluation.
# random_state is pinned so that Restart & Run All reproduces the same split
# (and therefore comparable metrics); without it every run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Base estimator whose hyper-parameters are tuned below.
model = XGBRegressor()

# Bayesian hyper-parameter search over `param_dist`: at most 20 candidate
# settings, each evaluated with 5-fold cross-validation on all available cores.
# No `scoring` is given, so candidates are ranked by the estimator's default
# `score` method.
# random_state is pinned so the sequence of sampled candidates -- and hence the
# selected best parameters -- is reproducible across kernel restarts.
opt = BayesSearchCV(
    model,
    param_dist,
    n_iter=20,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Project-local callback from `utils`; presumably ends the search once 3
# consecutive iterations bring no improvement -- TODO confirm against utils.py.
early_stop = CustomEarlyStopper(no_improvement_rounds=3)
opt.fit(X_train, y_train, callback=early_stop)

# Keep the estimator refitted with the best parameters found; `model` now refers
# to the tuned regressor used by the evaluation cell.
model = opt.best_estimator_

In [9]:
# Score the tuned model on the held-out test split.
y_pred = model.predict(X_test)

# Compute the error metrics up front, then report them together.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root of the mean squared error
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Best params:', opt.best_params_)
print('RMSE:', rmse)
print('MAE:', mae)
print('R2:', r2)

Best params: OrderedDict([('colsample_bytree', 0.5585947436732303), ('gamma', 0.2599470885983259), ('learning_rate', 0.06669835545118918), ('max_depth', 7), ('min_child_weight', 1), ('n_estimators', 126), ('reg_alpha', 0), ('reg_lambda', 1), ('subsample', 0.446344012260508)])
RMSE: 36.33715212135232
MAE: 15.476672522190952
R2: 0.7243085879838832
