In [1]:
import pandas as pd 
import pickle
import numpy as np
import random 
from collections import defaultdict

from ml_optfit.ml_optfit import HyperOptim
from sklearn.metrics import d2_pinball_score
import lightgbm as lgb 
import xgboost as xg
SEED=42
random.seed(SEED)
np.random.seed(seed=SEED)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv('../data/modelling_data/train.csv')
valid = pd.read_csv('../data/modelling_data/valid.csv')

In [3]:
features = train.select_dtypes(include=[np.number, bool]).columns
features = [i for i in features if i not in ['prezzo', 'log_prezzo', 'log_m2'] and 'emb_' not in i]
target='prezzo'
target_quantiles = [0.05, 0.5, 0.95]

### Lightgbm Model

In [4]:
saving_dict = defaultdict(dict)
for target_q in target_quantiles:
    hyperopt=HyperOptim(direction='maximize', 
                        train=train, 
                        valid=valid, 
                        features=features, 
                        target=target, 
                        prediction_type='quantile',
                        evaluation_func=d2_pinball_score,
                        target_quantile=target_q)

    lightgbm_hyper_dict = {
                        'n_estimators':
                                {'type': 'int',
                                'low': 500,
                                'high':2500,
                                'log':False,
                                'step':100},
                        'boosting_type':
                                {'type':'class',
                                'values':['gbdt']},
                        'num_leaves':
                                {'type':'int',
                                'low':5,
                                'high':60},
                        'min_data_in_leaf':
                                {'type':'int',
                                'low':5,
                                'high':50},
                        'learning_rate':
                                {'type':'float',
                                'low':0.0001,
                                'high':0.1,
                                'log':True},
                        'reg_lambda':
                                {'type':'float',
                                'low':0,
                                'high':20},
                        'reg_alpha':
                                {'type':'float',
                                'low':0,
                                'high':20},
                        'subsample':
                                {'type':'float',
                                'low':0,
                                'high':1},
                        'subsample_freq':
                                {'type':'int',
                                'low':0,
                                'high':3},
                        'n_jobs':
                                {'type':'class',
                                'values':[-1]},
                        'objective':
                                {'type':'class',
                                'values':['quantile']},
                        'alpha':
                                {'type':'class',
                                'values':[target_q]}
                                }

    study, best_hyper, best_model=hyperopt.optimize_model(model_type=lgb.LGBMRegressor, 
                        study_name='lightgbm', 
                        hyperparam_dict=lightgbm_hyper_dict, 
                        multivariate=True, 
                        n_trials=200)
    saving_dict[f'lightgbm_q{target_q}']['study']=study
    saving_dict[f'lightgbm_q{target_q}']['model']=best_model

Best trial: 126. Best value: 0.343702: 100%|██████████| 200/200 [05:43<00:00,  1.72s/it]
Best trial: 61. Best value: 0.535741: 100%|██████████| 200/200 [18:28<00:00,  5.54s/it]
Best trial: 198. Best value: 0.637093: 100%|██████████| 200/200 [08:18<00:00,  2.49s/it]


In [5]:
with open('artifacts/lightgbm_reg/quantile_lightgbm.pkl','wb') as f:
    pickle.dump(saving_dict, f)

### XGBoost Regressor

In [6]:
saving_dict = defaultdict(dict)
for target_q in target_quantiles:
    hyperopt=HyperOptim(direction='maximize', 
                            train=train, 
                            valid=valid, 
                            features=features, 
                            target=target, 
                            prediction_type='quantile',
                            evaluation_func=d2_pinball_score,
                            target_quantile=target_q)

    xgboost_hyper_dict = {
                        'n_estimators':
                                {'type': 'int',
                                'low': 500,
                                'high':2500,
                                'log':False,
                                'step':100},
                        'max_depth':
                                {'type':'int',
                                'low':4,
                                'high':12},
                        'max_leaves':
                                {'type':'int',
                                'low':5,
                                'high':45},
                        'learning_rate':
                                {'type':'float',
                                'low':0.0001,
                                'high':0.1,
                                'log':True},
                        'reg_lambda':
                                {'type':'float',
                                'low':0,
                                'high':20},
                        'reg_alpha':
                                {'type':'float',
                                'low':0,
                                'high':20},
                        'subsample':
                                {'type':'float',
                                'low':0,
                                'high':1},
                        'grow_policy':
                                {'type':'class',
                                'values':['depthwise','lossguide']},
                        'n_jobs':
                                {'type':'class',
                                'values':[-1]},
                        'objective':
                                {'type':'class',
                                'values':['reg:quantileerror']},
                        'quantile_alpha':
                                {'type':'class',
                                'values':[target_q]}
                                }

    study, best_hyper, best_model=hyperopt.optimize_model(model_type=xg.XGBRegressor, 
                        study_name='xgboost', 
                        hyperparam_dict=xgboost_hyper_dict, 
                        multivariate=True, 
                        n_trials=200)
    saving_dict[f'xgboost_q{target_q}']['study']=study
    saving_dict[f'xgboost_q{target_q}']['model']=best_model

Best trial: 152. Best value: 0.346362: 100%|██████████| 200/200 [07:19<00:00,  2.20s/it]
Best trial: 128. Best value: 0.531284: 100%|██████████| 200/200 [14:51<00:00,  4.46s/it]
Best trial: 143. Best value: 0.630138: 100%|██████████| 200/200 [07:19<00:00,  2.20s/it]


In [7]:
with open('artifacts/xgboost_reg/quantile_xgboost.pkl','wb') as f:
    pickle.dump(saving_dict, f)

### Model Stacking

In [8]:
from sklearn.linear_model import QuantileRegressor

with open('artifacts/lightgbm_reg/quantile_lightgbm.pkl', 'rb') as fp:
    quantile_lightgbm = pickle.load(fp)

with open('artifacts/xgboost_reg/quantile_xgboost.pkl', 'rb') as fp:
    quantile_xgboost = pickle.load(fp)

In [9]:
q_05_lightgbm_pred = quantile_lightgbm['lightgbm_q0.05']['model'].predict(valid[features])
q_50_lightgbm_pred = quantile_lightgbm['lightgbm_q0.5']['model'].predict(valid[features])
q_95_lightgbm_pred = quantile_lightgbm['lightgbm_q0.95']['model'].predict(valid[features])

q_05_xgboost_pred = quantile_xgboost['xgboost_q0.05']['model'].predict(valid[features])
q_50_xgboost_pred = quantile_xgboost['xgboost_q0.5']['model'].predict(valid[features])
q_95_xgboost_pred = quantile_xgboost['xgboost_q0.95']['model'].predict(valid[features])



In [10]:
q_05_df = pd.DataFrame({'lightgbm_05':q_05_lightgbm_pred, 'xgboost_05':q_05_xgboost_pred})
q_05_df['target']=valid[target]
model05 = QuantileRegressor(quantile=0.05)
model05.fit(q_05_df[['lightgbm_05', 'xgboost_05']], q_05_df['target'])
print('Q05 Pinball', d2_pinball_score(q_05_df['target'], model05.predict(q_05_df[['lightgbm_05', 'xgboost_05']]), alpha=0.05))

q_50_df = pd.DataFrame({'lightgbm_50':q_50_lightgbm_pred, 'xgboost_50':q_50_xgboost_pred})
q_50_df['target']=valid[target]
model50 = QuantileRegressor(quantile=0.50)
model50.fit(q_50_df[['lightgbm_50', 'xgboost_50']], q_50_df['target'])
print('Q50 Pinball', d2_pinball_score(q_50_df['target'], model50.predict(q_50_df[['lightgbm_50', 'xgboost_50']]), alpha=0.50))

q_95_df = pd.DataFrame({'lightgbm_95':q_95_lightgbm_pred, 'xgboost_95':q_95_xgboost_pred})
q_95_df['target']=valid[target]
model95 = QuantileRegressor(quantile=0.95)
model95.fit(q_95_df[['lightgbm_95', 'xgboost_95']], q_95_df['target'])
print('Q95 Pinball', d2_pinball_score(q_95_df['target'], model95.predict(q_95_df[['lightgbm_95', 'xgboost_95']]), alpha=0.95))

Q05 Pinball 0.3576784343293722
Q50 Pinball 0.5376238475745536
Q95 Pinball 0.6465627059720779
