In [51]:
import pandas as pd
from pycaret.regression import setup, compare_models, blend_models, finalize_model, plot_model, predict_model, add_metric, evaluate_model,tune_model
import random

VERSION = "baseline_v0"

def preprocess(df):
    
    df['day'] = df["dates"].dt.dayofweek
    df['week'] = df["dates"].dt.isocalendar().week
    df['year'] = df["dates"].dt.year
    df['month'] = df["dates"].dt.month
    
    return df


In [52]:
train = pd.concat([pd.read_csv("second_group_sku.csv", parse_dates=["dates"]), pd.read_csv("first_group_sku.csv", parse_dates=["dates"]),
                   pd.read_csv("third_group_sku.csv", parse_dates=["dates"])]).sort_values(by = ["dates","SKU"])

target = "price_per_sku"
FEATURES = [c for c in main_train_data.columns if c not in ['dates', "num_purchases", target]]

train = train.dropna(subset=[target])
train = train.reset_index().drop(['index'], axis = 1)
test = pd.read_csv("test.csv", parse_dates=["dates"]).sort_values(by = ["dates","SKU"])

In [53]:
main_train_data = preprocess(train)
main_test_data = preprocess(test)

In [54]:
main_train_data

Unnamed: 0,dates,SKU,price_per_sku,num_purchases,day,week,year,month
0,2018-01-17,60870,109.182,7,2,3,2018,1
1,2018-01-18,60870,109.182,7,3,3,2018,1
2,2018-01-19,60870,109.182,8,4,3,2018,1
3,2018-01-21,60870,109.182,7,6,3,2018,1
4,2018-01-22,60870,109.315,8,0,4,2018,1
...,...,...,...,...,...,...,...,...
6694,2019-12-23,38767,74.536,7,0,52,2019,12
6695,2019-12-23,80631,92.257,5,0,52,2019,12
6696,2019-12-23,84395,140.815,10,0,52,2019,12
6697,2019-12-24,10334,101.094,9,1,52,2019,12


In [57]:
setup(data=main_train_data.drop(['dates', "num_purchases"], axis = 1),
          target=target,
          fold = 5)

best_models  = compare_models(sort = 'RMSE', n_select = 3)

blender = blend_models(estimator_list=best_models)

tuned_blender = tune_model(blender)

pred = predict_model(tuned_blender, data= main_test_data.drop(['dates'], axis = 1))

Unnamed: 0,Description,Value
0,Session id,1785
1,Target,price_per_sku
2,Target type,Regression
3,Original data shape,"(6699, 6)"
4,Transformed data shape,"(6699, 6)"
5,Transformed train set shape,"(4689, 6)"
6,Transformed test set shape,"(2010, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dt,Decision Tree Regressor,0.3433,2.1167,1.4373,0.9993,0.0113,0.0026,0.008
rf,Random Forest Regressor,0.6293,2.426,1.556,0.9992,0.0129,0.0054,0.056
xgboost,Extreme Gradient Boosting,0.8423,2.7658,1.6527,0.9991,0.014,0.0075,0.028
catboost,CatBoost Regressor,1.1812,5.4023,2.3156,0.9982,0.0201,0.0107,0.344
et,Extra Trees Regressor,0.9777,6.23,2.4781,0.998,0.0179,0.0082,0.042
lightgbm,Light Gradient Boosting Machine,1.3379,7.175,2.6724,0.9976,0.0234,0.0122,0.018
knn,K Neighbors Regressor,2.1585,10.6721,3.2652,0.9965,0.0297,0.0201,0.008
gbr,Gradient Boosting Regressor,3.0097,23.9109,4.8861,0.9921,0.0369,0.0258,0.034
ada,AdaBoost Regressor,18.1066,441.5978,21.0043,0.8554,0.1975,0.1776,0.016
ridge,Ridge Regression,30.6773,3062.2077,55.3157,0.0005,0.3313,0.2601,0.006


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5602,1.835,1.3546,0.9994,0.011,0.0048
1,0.6144,2.2104,1.4868,0.9993,0.0119,0.0052
2,0.5487,1.6946,1.3018,0.9994,0.0111,0.0048
3,0.5852,1.8934,1.376,0.9994,0.0109,0.0049
4,0.5633,1.4373,1.1989,0.9995,0.0096,0.0049
Mean,0.5744,1.8142,1.3436,0.9994,0.0109,0.0049
Std,0.0233,0.253,0.0942,0.0001,0.0007,0.0001


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.5007,1.516,1.2312,0.9995,0.0098,0.0042
1,0.5847,2.3799,1.5427,0.9993,0.0123,0.0049
2,0.5019,1.604,1.2665,0.9994,0.0109,0.0044
3,0.5387,1.8193,1.3488,0.9994,0.0105,0.0045
4,0.5105,1.2709,1.1273,0.9996,0.009,0.0044
Mean,0.5273,1.718,1.3033,0.9994,0.0105,0.0045
Std,0.0318,0.3747,0.1392,0.0001,0.0011,0.0002


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [56]:
pred

Unnamed: 0,SKU,day,week,year,month,prediction_label
0,98255,0,51,2018,12,66.848859
1,98255,4,51,2018,12,66.925391
2,98255,5,51,2018,12,66.887894
3,98255,6,51,2018,12,66.903453
4,98255,0,52,2018,12,67.059008
...,...,...,...,...,...,...
170,80631,1,1,2019,12,86.265813
166,83329,1,1,2019,12,82.108742
172,84395,1,1,2019,12,134.420586
162,89272,1,1,2019,12,119.951156


In [58]:
pred['dates'] = main_test_data['dates']
pred = pred.sort_values(by=["dates"])
pred = pred.rename(columns={"prediction_label":"price_per_sku"}).reset_index()
pred[['dates', 'SKU', 'price_per_sku']].to_csv(f'{VERSION}.csv', index=False)


In [59]:
pred

Unnamed: 0,index,SKU,day,week,year,month,price_per_sku,dates
0,0,98255,0,51,2018,12,67.008377,2018-12-17
1,1,98255,4,51,2018,12,67.030037,2018-12-21
2,2,98255,5,51,2018,12,67.021258,2018-12-22
3,3,98255,6,51,2018,12,67.034671,2018-12-23
4,4,98255,0,52,2018,12,67.278432,2018-12-24
...,...,...,...,...,...,...,...,...
170,171,19632,1,1,2019,12,148.203630,2019-12-31
171,165,17812,1,1,2019,12,72.863926,2019-12-31
172,168,10334,1,1,2019,12,96.588445,2019-12-31
173,174,49661,1,1,2019,12,159.257009,2019-12-31
