In [1]:
import pandas as pd 
import pickle
import numpy as np
import random 

from ml_optfit.ml_optfit import HyperOptim
from sklearn.metrics import mean_squared_error
import lightgbm as lgb 
import xgboost as xg
SEED=42
random.seed(SEED)
np.random.seed(seed=SEED)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('../data/modelling_data/train.csv')
valid = pd.read_csv('../data/modelling_data/valid.csv')

In [3]:
features = train.select_dtypes(include=[np.number, bool]).columns
features = [i for i in features if i not in ['prezzo', 'log_prezzo', 'log_m2'] and 'emb_' not in i]
target='prezzo'

### Lightgbm Model

In [None]:
hyperopt=HyperOptim(direction='minimize', 
                    train=train, 
                    valid=valid, 
                    features=features, 
                    target=target, 
                    prediction_type='regression',
                    evaluation_func=mean_squared_error)

lightgbm_hyper_dict = {
                    'n_estimators':
                            {'type': 'int',
                            'low': 500,
                            'high':2000,
                            'log':False,
                            'step':100},
                    'boosting_type':
                            {'type':'class',
                             'values':['gbdt']},
                    'num_leaves':
                            {'type':'int',
                             'low':5,
                             'high':31},
                    'learning_rate':
                            {'type':'float',
                             'low':0.001,
                             'high':0.1,
                             'log':True},
                    'reg_lambda':
                            {'type':'float',
                             'low':0,
                             'high':10},
                    'reg_alpha':
                            {'type':'float',
                             'low':0,
                             'high':10},
                    'subsample':
                            {'type':'float',
                             'low':0,
                             'high':1},
                    'subsample_freq':
                            {'type':'int',
                             'low':0,
                             'high':2},
                    'min_child_samples':
                            {'type':'int',
                             'low':1,
                             'high':20},
                    'n_jobs':
                            {'type':'class',
                             'values':[-1]}
                            }

study, best_hyper, best_model=hyperopt.optimize_model(model_type=lgb.LGBMRegressor, 
                       study_name='lightgbm', 
                       hyperparam_dict=lightgbm_hyper_dict, 
                       multivariate=True, 
                       n_trials=150)

Best trial: 102. Best value: 2.22638e+10: 100%|██████████| 150/150 [03:41<00:00,  1.48s/it]


In [53]:
with open('artifacts/lightgbm_reg/lightgbm_regressor.pkl','wb') as f:
    pickle.dump(best_model, f)
    
with open('artifacts/lightgbm_reg/study.pkl','wb') as f:
    pickle.dump(study, f)

### XGBoost Regressor

In [22]:
hyperopt=HyperOptim(direction='minimize', 
                    train=train, 
                    valid=valid, 
                    features=features, 
                    target=target, 
                    prediction_type='regression',
                    evaluation_func=mean_squared_error)

xgboost_hyper_dict = {
                    'n_estimators':
                            {'type': 'int',
                            'low': 500,
                            'high':2000,
                            'log':False,
                            'step':100},
                    'max_depth':
                            {'type':'int',
                             'low':4,
                             'high':8},
                    'max_leaves':
                            {'type':'int',
                             'low':5,
                             'high':31},
                    'learning_rate':
                            {'type':'float',
                             'low':0.001,
                             'high':0.1,
                             'log':True},
                    'reg_lambda':
                            {'type':'float',
                             'low':0,
                             'high':10},
                    'reg_alpha':
                            {'type':'float',
                             'low':0,
                             'high':10},
                    'subsample':
                            {'type':'float',
                             'low':0,
                             'high':1},
                    'grow_policy':
                            {'type':'class',
                             'values':['depthwise','lossguide']},
                    'n_jobs':
                            {'type':'class',
                             'values':[-1]},
                    'objective':
                            {'type':'class',
                             'values':['reg:linear']}
                            }

study, best_hyper, best_model=hyperopt.optimize_model(model_type=xg.XGBRegressor, 
                       study_name='xgboost', 
                       hyperparam_dict=xgboost_hyper_dict, 
                       multivariate=True, 
                       n_trials=150)

  0%|          | 0/150 [00:00<?, ?it/s]

Best trial: 126. Best value: 2.25762e+10: 100%|██████████| 150/150 [03:25<00:00,  1.37s/it]


In [26]:
with open('artifacts/xgboost_reg/xgboost_regressor.pkl','wb') as f:
    pickle.dump(best_model, f)
    
with open('artifacts/xgboost_reg/study.pkl','wb') as f:
    pickle.dump(study, f)