In [1]:
import pandas as pd 
import pickle 
import lightgbm
import numpy as np
from sklearn.metrics import root_mean_squared_error, r2_score, d2_pinball_score,mean_absolute_error, mean_absolute_percentage_error


### Refit on Train + Val and Eval on Test

In [2]:
# Load the pickled tuning artifact; its per-quantile 'study' entries are read
# further down to rebuild the models.
# NOTE: pickle.load can execute arbitrary code - only open artifacts you trust.
with open('artifacts/lightgbm_reg/quantile_lightgbm.pkl', 'rb') as fp:
    quantile_lightgbm = pickle.load(fp)

train = pd.read_csv('../data/modelling_data/train.csv')
valid = pd.read_csv('../data/modelling_data/valid.csv')
# Shuffle train + validation with a fixed seed so the row order (and therefore
# the refit and the test metrics computed from it) is the same on every
# Restart & Run All.
train_valid = pd.concat([train, valid], axis=0).sample(frac=1, random_state=42)
test = pd.read_csv('../data/modelling_data/test.csv')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Model inputs: every numeric/boolean column of the training frame except the
# price columns, log_m2, and anything whose name contains 'emb_'.
target = 'prezzo'
excluded_columns = {'prezzo', 'log_prezzo', 'log_m2'}
numeric_columns = train.select_dtypes(include=[np.number, bool]).columns
features = [
    column
    for column in numeric_columns
    if column not in excluded_columns and 'emb_' not in column
]

In [4]:
# Refit one LightGBM regressor per stored study key on train + validation,
# re-using the best parameters recorded in that key's study.
refit_by_key = {}
for study_key in ('lightgbm_q0.05', 'lightgbm_q0.5', 'lightgbm_q0.95'):
    tuned_params = quantile_lightgbm[study_key]['study'].best_params
    quantile_model = lightgbm.LGBMRegressor(**tuned_params)
    quantile_model.fit(train_valid[features], train_valid[target])
    refit_by_key[study_key] = quantile_model

model_05 = refit_by_key['lightgbm_q0.05']
model_50 = refit_by_key['lightgbm_q0.5']
model_95 = refit_by_key['lightgbm_q0.95']
# Keep the last fitted model as the cell's final expression, as before.
model_95

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 13887, number of used features: 38
[LightGBM] [Info] Start training from score 85000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 927
[LightGBM] [Info] Number of data points in the train set: 13887, number of used features: 38
[LightGBM] [Info] Start training from score 275000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is no

In [5]:
def get_evaluation_metrics(y_true, y_pred, alpha=0.5):
    """Print RMSE, MAE, R2, D2 pinball score and MAPE for a set of predictions.

    RMSE and MAE are printed rounded to two decimals; R2, the pinball score
    and MAPE are multiplied by 100, rounded to two decimals and suffixed
    with '%'.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted values, compared element-wise with ``y_true``.
    alpha : quantile level forwarded to ``d2_pinball_score`` (default 0.5).

    Returns
    -------
    None. The metrics are only written to stdout.
    """
    def as_percent(value):
        return f'{round(100*value,2)}%'

    rmse = root_mean_squared_error(y_true=y_true, y_pred=y_pred)
    print('RMSE:', round(rmse,2))

    mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
    print('MAE:', round(mae,2))

    r2 = r2_score(y_true=y_true, y_pred=y_pred)
    print('R2:', as_percent(r2))

    pinball_d2 = d2_pinball_score(y_true=y_true, y_pred=y_pred, alpha=alpha)
    print('Pinball Score', as_percent(pinball_d2))

    mape = mean_absolute_percentage_error(y_true=y_true, y_pred=y_pred)
    print('MAPE:', as_percent(mape))
    
def coverage_fraction(y, y_low, y_high):
    """Print the share of observations lying inside a prediction interval.

    An observation counts as covered when ``y_low <= y <= y_high`` (both
    bounds inclusive). The share is printed as a percentage rounded to two
    decimals; nothing is returned.
    """
    inside_interval = np.logical_and(y >= y_low, y <= y_high)
    coverage_pct = round(100*np.mean(inside_interval),2)
    print('Coverage Fraction:', f'{coverage_pct}%')

In [6]:
# Score the test set with each of the three quantile models, slicing the
# feature columns once and re-using them.
test_features = test[features]
q_05_lightgbm_pred, q_50_lightgbm_pred, q_95_lightgbm_pred = (
    fitted_model.predict(test_features)
    for fitted_model in (model_05, model_50, model_95)
)

In [10]:
# Full metric report for the 0.5-quantile predictions on the test set.
y_test = test[target]
get_evaluation_metrics(y_test, q_50_lightgbm_pred)

# Outer quantiles: pinball score at their own quantile level, then the share
# of test targets falling between the two outer predictions.
print('\nOther Quantiles')
tail_reports = (
    ('Pinball Score Q05', q_05_lightgbm_pred, 0.05),
    ('Pinball Score Q95', q_95_lightgbm_pred, 0.95),
)
for report_label, tail_pred, tail_alpha in tail_reports:
    tail_d2 = d2_pinball_score(y_true=y_test, y_pred=tail_pred, alpha=tail_alpha)
    print(report_label, f'{round(100*tail_d2,2)}%')
coverage_fraction(y_test, q_05_lightgbm_pred, q_95_lightgbm_pred)

RMSE: 122184.36
MAE: 69701.6
R2: 81.76%
Pinball Score 60.15%
MAPE: 23.16%

Other Quantiles
Pinball Score Q05 39.56%
Pinball Score Q95 72.78%
Coverage Fraction: 87.86%


### Final refit on Train + Val + Test

In [11]:
# Final refit on every available row (train + validation + test).
# The shuffle is seeded so the row order fed to LightGBM - and hence the models
# that get pickled afterwards - does not change from one run to the next.
train_valid_test = pd.concat([train,valid,test],axis=0).sample(frac=1, random_state=42)

# Each model is rebuilt from the best parameters stored in its own study.
model_05 = lightgbm.LGBMRegressor(**quantile_lightgbm['lightgbm_q0.05']['study'].best_params)
model_05.fit(train_valid_test[features], train_valid_test[target])

model_50 = lightgbm.LGBMRegressor(**quantile_lightgbm['lightgbm_q0.5']['study'].best_params)
model_50.fit(train_valid_test[features], train_valid_test[target])

model_95 = lightgbm.LGBMRegressor(**quantile_lightgbm['lightgbm_q0.95']['study'].best_params)
model_95.fit(train_valid_test[features], train_valid_test[target])

# Persist the three refitted models, one pickle file per quantile.
artifacts_to_write = (
    ('artifacts/refitted_lightgbm/model_05.pkl', model_05),
    ('artifacts/refitted_lightgbm/model_50.pkl', model_50),
    ('artifacts/refitted_lightgbm/model_95.pkl', model_95),
)
for artifact_path, fitted_model in artifacts_to_write:
    with open(artifact_path,'wb') as f:
        pickle.dump(fitted_model, f)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 15551, number of used features: 38
[LightGBM] [Info] Start training from score 84000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 925
[LightGBM] [Info] Number of data points in the train set: 15551, number of used features: 38
[LightGBM] [Info] Start training from score 270000.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003005 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is no