In [21]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 100)

data = pd.read_csv('../data/processed_data.csv')

print(f'Data shape - {data.shape}')
data.head()

Data shape - (81877, 7)


Unnamed: 0,Region,City,Location,Floor,Area,Rooms,Price
0,ivano-frankovsk,galich,"Галич, Івано-Франківська область",Middle,36.8,1,339.673913
1,ivano-frankovsk,burshtyn,"Бурштин, Івано-Франківська область",Last,50.0,3,296.0
2,ivano-frankovsk,burshtyn,"Бурштин, Івано-Франківська область",Middle,39.4,2,380.71066
3,ivano-frankovsk,burshtyn,"Бурштин, Івано-Франківська область",Middle,50.0,1,280.0
4,ivano-frankovsk,kalush,"Калуш, Івано-Франківська область",Middle,34.6,1,423.699422


##  My own train_test_split

In [22]:
def train_val_split(data, random_state):
    df = data.copy()
#     if df.Location:
    df.drop('Location', axis=1, inplace=True)
    
    cities = df.City.unique()
    small_np = []
    big_np = []

    for i in cities:
        if df[df.City == i].shape[0] == 1:
            small_np.append(i)

        elif df[df.City == i].shape[0] > 1:
            big_np.append(i)


    X_train1, y_train1 = df[df.City.isin(small_np)].drop('Price', axis=1), df[df.City.isin(small_np)].Price
    df = df[df.City.isin(big_np)]

    X_train, X_test, y_train, y_test = train_test_split(df.drop(['Price'], axis=1),
                                                        df.Price, test_size=0.21, random_state=random_state, stratify=df.City)

    X_train_final = pd.concat([X_train1, X_train])
    y_train_final = pd.concat([y_train1, y_train])
    
    
    return X_train_final, X_test, y_train_final, y_test

# Optuna | XGBoost pipeline

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from optuna import create_study
from xgboost import XGBRegressor

from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
def objective(trial):
    
    X_train, X_test, y_train, y_test = train_val_split(data, random_state=42)

    params = {
        'tree_method':'gpu_hist',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
        'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
        'learning_rate': trial.suggest_float("learning_rate", 0.002, 0.1, log=True),
        'n_estimators': trial.suggest_int("n_estimators", 2500, 6000, 500),
        'max_depth': trial.suggest_int("max_depth", 4, 12, 1),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    
    model = XGBRegressor(**params)  
    
#     model.fit(X_train, y_train) #, early_stopping_rounds=100, eval_set=[(X_test, y_test)], eval_metric='rmse', verbose=False)
    
    encoder = ColumnTransformer([('encoder', OneHotEncoder(sparse_output=False), ['Region', 'Floor', 'City'])],
                        remainder='passthrough')

    pipe_optuna = Pipeline(
        steps=[
            ('encoder', encoder),
            ('model', model)
        ])
    
    pipe_optuna.fit(X_train, y_train)
    
    rmse = mean_squared_error(y_test, pipe_optuna.predict(X_test), squared=False)
    
    return rmse

In [20]:
opt_time = 60 * 60

study_xgbr = create_study(study_name="XGBRegressor Optuna Optimization", direction="minimize")
study_xgbr.optimize(objective, n_jobs=-1, timeout=opt_time)

print(f'Best value: {study_xgbr.best_value}')
# print(f'Best trial: {study.best_trial}')
print(f'Number of finished trials: {len(study_xgbr.trials)}')
print(f"Наилучшие значения гиперпараметров {study_xgbr.best_params}")

[32m[I 2023-06-26 21:46:40,679][0m A new study created in memory with name: XGBRegressor Optuna Optimization[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.

  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsamp

[32m[I 2023-06-26 21:54:36,675][0m Trial 2 finished with value: 355.5829483635719 and parameters: {'lambda': 0.22808785819895017, 'alpha': 0.6474540006189071, 'subsample': 1.0, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.8, 'learning_rate': 0.002409602017058074, 'n_estimators': 4000, 'max_depth': 7, 'min_child_weight': 80}. Best is trial 0 with value: 352.91170488374087.[0m
[32m[I 2023-06-26 21:54:38,937][0m Trial 4 finished with value: 352.23647112372726 and parameters: {'lambda': 5.535521556336007, 'alpha': 0.019367934315758444, 'subsample': 0.2, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.9000000000000001, 'learning_rate': 0.08188954700961704, 'n_estimators': 2500, 'max_depth': 12, 'min_child_weight': 113}. Best is trial 4 with value: 352.23647112372726.[0m
[32m[I 2023-06-26 21:54:44,773][0m Trial 7 finished with value: 356.7712801829759 and parameters: {'lambda': 0.0016122716349590862, 'alpha': 0.023257937664538788, 'subsample': 0.4, 'colsample_bytree': 0.7, 'colsam

[32m[I 2023-06-26 21:55:36,757][0m Trial 1 finished with value: 353.3528823273409 and parameters: {'lambda': 0.03156116028658455, 'alpha': 1.299960679526242, 'subsample': 0.5, 'colsample_bytree': 0.6000000000000001, 'colsample_bylevel': 1.0, 'learning_rate': 0.002988669905577964, 'n_estimators': 6000, 'max_depth': 6, 'min_child_weight': 22}. Best is trial 4 with value: 352.23647112372726.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 21:56:56,907][0m Trial 6 finished with value: 351.1346741246525 and parameters: {'lambda': 0.03176480337287356, 'alpha': 0.32229089120125615, 'subsample': 0.5, 'colsample_bytree': 0.6000000000000001, 'colsampl

[32m[I 2023-06-26 21:59:59,771][0m Trial 9 finished with value: 354.02711692485906 and parameters: {'lambda': 0.011709121858123443, 'alpha': 0.16268758227189423, 'subsample': 1.0, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8, 'learning_rate': 0.04872088947738585, 'n_estimators': 4500, 'max_depth': 6, 'min_child_weight': 278}. Best is trial 6 with value: 351.1346741246525.[0m
[32m[I 2023-06-26 22:00:06,927][0m Trial 14 finished with value: 353.97051521548474 and parameters: {'lambda': 7.073173816098202, 'alpha': 0.030997867322956982, 'subsample': 0.9000000000000001, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.5, 'learning_rate': 0.03566454073696287, 'n_estimators': 3000, 'max_depth': 6, 'min_child_weight': 235}. Best is trial 6 with value: 351.1346741246525.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree':

[32m[I 2023-06-26 22:04:49,371][0m Trial 12 finished with value: 352.2622867734045 and parameters: {'lambda': 0.3795862751843082, 'alpha': 4.166321932154421, 'subsample': 0.7, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.8, 'learning_rate': 0.015343673899735538, 'n_estimators': 6000, 'max_depth': 7, 'min_child_weight': 133}. Best is trial 6 with value: 351.1346741246525.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:04:58,866][0m Trial 13 finished with value: 352.41815649346205 and parameters: {'lambda': 2.123082357554227, 'alpha': 1.6774973386068592, 'subsample': 0.4, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8, 'learning_rate

  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:11:19,215][0m Trial 17 finished with value: 352.91401301639223 and parameters: {'lambda': 0.03992599145143583, 'alpha': 0.003814440345949655, 'subsample': 0.9000000000000001, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.5, 'learning_rate': 0.012231610137369406, 'n_estimators': 5500, 'max_depth': 12, 'min_child_weight': 174}. Best is trial 15 with value: 351.12321652283345.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.

[32m[I 2023-06-26 22:14:48,034][0m Trial 26 finished with value: 350.78646792685106 and parameters: {'lambda': 0.009510015667168439, 'alpha': 0.07564549303265711, 'subsample': 0.4, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.5, 'learning_rate': 0.06740639982002028, 'n_estimators': 5000, 'max_depth': 9, 'min_child_weight': 20}. Best is trial 24 with value: 350.3776124993405.[0m
[32m[I 2023-06-26 22:14:50,837][0m Trial 25 finished with value: 348.62109059412 and parameters: {'lambda': 0.008044471945644024, 'alpha': 0.0984332201094881, 'subsample': 0.4, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.5, 'learning_rate': 0.025520256682260222, 'n_estimators': 5000, 'max_depth': 9, 'min_child_weight': 5}. Best is trial 25 with value: 348.62109059412.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_dis

[32m[I 2023-06-26 22:17:07,481][0m Trial 23 finished with value: 352.3907842820532 and parameters: {'lambda': 0.013636779743313633, 'alpha': 0.0018970729484311182, 'subsample': 0.5, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.4, 'learning_rate': 0.09770447718993708, 'n_estimators': 5000, 'max_depth': 12, 'min_child_weight': 5}. Best is trial 21 with value: 348.41524766577925.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:19:58,101][0m Trial 28 finished with value: 350.4525032429966 and parameters: {'lambda': 0.009755972356212651, 'alpha': 0.11631260862222281, 'subsample': 0.6000000000000001, 'colsample_bytree': 1.0, 'colsample_bylev

[32m[I 2023-06-26 22:24:46,034][0m Trial 30 finished with value: 349.3751550258608 and parameters: {'lambda': 0.010896994226455615, 'alpha': 0.0846604802721195, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.4, 'learning_rate': 0.02377210823436398, 'n_estimators': 5000, 'max_depth': 9, 'min_child_weight': 4}. Best is trial 21 with value: 348.41524766577925.[0m
[32m[I 2023-06-26 22:24:47,795][0m Trial 31 finished with value: 349.86936569225554 and parameters: {'lambda': 0.008925531415910131, 'alpha': 0.08955949215074195, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.4, 'learning_rate': 0.02063268420727819, 'n_estimators': 5000, 'max_depth': 9, 'min_child_weight': 6}. Best is trial 21 with value: 348.41524766577925.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  '

[32m[I 2023-06-26 22:27:51,729][0m Trial 34 finished with value: 353.1000224125209 and parameters: {'lambda': 0.00744528234646234, 'alpha': 0.11499750082758603, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.4, 'learning_rate': 0.025888888963509094, 'n_estimators': 5500, 'max_depth': 9, 'min_child_weight': 54}. Best is trial 21 with value: 348.41524766577925.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:30:02,325][0m Trial 35 finished with value: 355.87684446124024 and parameters: {'lambda': 0.004825869149766778, 'alpha': 0.16423649575433166, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 

[32m[I 2023-06-26 22:34:59,015][0m Trial 40 finished with value: 353.28934743463725 and parameters: {'lambda': 0.021531189813567633, 'alpha': 0.20402372525302775, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.4, 'colsample_bylevel': 0.30000000000000004, 'learning_rate': 0.027968588968718563, 'n_estimators': 5500, 'max_depth': 8, 'min_child_weight': 31}. Best is trial 21 with value: 348.41524766577925.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:35:10,476][0m Trial 37 finished with value: 354.7923375025592 and parameters: {'lambda': 0.1399492221761421, 'alpha': 0.9164591883858084, 'subsample': 0.30000000000000004, 'colsample_

  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:37:08,321][0m Trial 43 finished with value: 356.20774508680216 and parameters: {'lambda': 0.019102329911696102, 'alpha': 0.7159254434841238, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.30000000000000004, 'colsample_bylevel': 0.2, 'learning_rate': 0.03139123227114827, 'n_estimators': 4500, 'max_depth': 8, 'min_child_weight': 29}. Best is trial 21 with value: 348.41524766577925.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_by

  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:42:42,988][0m Trial 49 finished with value: 355.9267441218842 and parameters: {'lambda': 0.004463316145582611, 'alpha': 0.430957957017274, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'colsample_bylevel': 0.2, 'learning_rate': 0.01860166134578193, 'n_estimators': 4500, 'max_depth': 10, 'min_child_weight': 30}. Best is trial 21 with value: 348.41524766577925.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  '

[32m[I 2023-06-26 22:43:36,996][0m Trial 50 finished with value: 351.2752128652833 and parameters: {'lambda': 0.0048982314464525245, 'alpha': 0.5440508918792534, 'subsample': 0.5, 'colsample_bytree': 0.30000000000000004, 'colsample_bylevel': 0.7, 'learning_rate': 0.018702146382134216, 'n_estimators': 4500, 'max_depth': 10, 'min_child_weight': 30}. Best is trial 21 with value: 348.41524766577925.[0m
  'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.2, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.2, 1.0, 0.1),
  'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 1.0, 0.1),
[32m[I 2023-06-26 22:45:10,163][0m Trial 51 finished with value: 354.27098962402613 and parameters: {'lambda': 0.005273121681473061, 'alpha': 0.33800014869516404, 'subsample': 0.2, 'colsample_bytree': 0.30000000000000004

Best value: 348.41524766577925
Number of finished trials: 60
Наилучшие значения гиперпараметров {'lambda': 0.01973328702317252, 'alpha': 6.20425684193659, 'subsample': 0.5, 'colsample_bytree': 0.5, 'colsample_bylevel': 0.4, 'learning_rate': 0.022436637559530896, 'n_estimators': 5000, 'max_depth': 12, 'min_child_weight': 3}


# Pipeline

In [29]:
# Region City Floor Area Rooms(int64) | optuna | 60 minutes | 70 trials | xgb_v1
params = {'lambda': 0.03223912653294917, 'alpha': 0.004698857584966701, 'subsample': 1.0, 'colsample_bytree': 0.5,
          'colsample_bylevel': 0.9000000000000001, 'learning_rate': 0.05757942646087289, 'n_estimators': 2500,
          'max_depth': 12, 'min_child_weight': 1}

# Region City Floor Area Rooms(int64) Offers | optuna | 60 minutes | 60 trials | xgb_v3
# params = {'lambda': 0.01973328702317252, 'alpha': 6.20425684193659, 'subsample': 0.5, 'colsample_bytree': 0.5,
#           'colsample_bylevel': 0.4, 'learning_rate': 0.022436637559530896, 'n_estimators': 5000, 'max_depth': 12,
#           'min_child_weight': 3}



xgb_model = XGBRegressor(**params) 


enc = ColumnTransformer([('encoder', OneHotEncoder(sparse_output=False), ['Region', 'Floor', 'City'])],
                        remainder='passthrough')

pipe = Pipeline(
    steps=[
        ('encoder', enc),
        ('model', xgb_model)
    ])

# Training pipeline

In [31]:
%%time

X_train, X_test, y_train, y_test = train_val_split(data, random_state=42)

pipe.fit(X_train, y_train)
print(f'rmse = {mean_squared_error(pipe.predict(X_test)*X_test.Area, y_test*X_test.Area, squared=False)}')

rmse = 21313.43496451753
CPU times: total: 2h 45min 30s
Wall time: 22min 30s


# Train on the full dataset

In [33]:
pipe.fit(data.drop(['Price', 'Location'], axis=1), data['Price'])

# Save model

In [35]:
import joblib
import pickle

joblib.dump(pipe, '../models/xgb_v1.joblib')
# joblib.dump(pipe1, '../models/xgb_v1.pkl')
# s = pickle.dumps(pipe1)

['../models/xgb_v1.joblib']