In [1]:
import pandas as pd 
import numpy as np 

from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, SeasonalRollingMean
from mlforecast.target_transforms import Differences
from hierarchicalforecast.utils import aggregate
import lightgbm as lgb 
from copy import copy
from hierarchicalforecast.utils import aggregate
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.methods import BottomUp, TopDown, MinTrace, ERM, OptimalCombination
from IPython.display import display 


# Day columns d_1 .. d_700 are dropped from the wide sales table before reshaping.
skip = [f"d_{day}" for day in range(1, 701)]
HORIZON = 28  # forecast horizon
EXTRA_FEATURES = [
    'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
]

# Hierarchy levels we want to work with, listed from coarsest to finest.
spec = [
    ['state_id', 'store_id', 'cat_id'],
    ['state_id', 'store_id', 'cat_id', 'dept_id'],
    ['state_id', 'store_id', 'cat_id', 'dept_id', 'item_id'],
]

# Raw inputs.
calendar = pd.read_csv("/home/jupyter/mawa/data/calendar.csv")
row_sales = pd.read_csv("/home/jupyter/mawa/data/sales_train_evaluation.csv")
prices = pd.read_csv('/home/jupyter/mawa/data/sell_prices.csv')

# Composite "state@store" key; the forecasting loop iterates over its unique values.
row_sales['state_id@store_id'] = (
    row_sales['state_id'].astype(str) + "@" + row_sales['store_id'].astype(str)
)
all_state_and_store = np.unique(row_sales['state_id@store_id'])

# Reshape from wide (one column per day) to long (one row per id and day),
# attach the calendar attributes and parse the date column.
train = (
    row_sales
    .drop(columns=skip)
    .melt(
        id_vars=["id", "item_id", "dept_id", "cat_id",
                 "store_id", "state_id", "state_id@store_id"],
        var_name='d',
        value_name="y",
    )
    .merge(calendar, how="inner", on=["d"])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Encode the calendar features as integer category codes
# (missing values become their own 'nan' category).
for col in EXTRA_FEATURES:
    train[col] = train[col].fillna('nan').astype("category").cat.codes.astype(int)

# Preview the last rows of the long training frame.
train.tail(5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,state_id@store_id,d,y,date,...,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
37838085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,WI@WI_3,d_1941,1,2016-05-22,...,2,5,2016,30,4,3,2,0,0,0
37838086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,WI@WI_3,d_1941,0,2016-05-22,...,2,5,2016,30,4,3,2,0,0,0
37838087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,WI@WI_3,d_1941,2,2016-05-22,...,2,5,2016,30,4,3,2,0,0,0
37838088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,WI@WI_3,d_1941,0,2016-05-22,...,2,5,2016,30,4,3,2,0,0,0
37838089,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,WI@WI_3,d_1941,1,2016-05-22,...,2,5,2016,30,4,3,2,0,0,0


### Future dataframe (exogenous features)

In [None]:
import gc
import os

# Build the exogenous feature table (prices + calendar features) once and cache it.
# The cache is a pickle *file*, so its presence must be tested with isfile():
# isdir() is never true for it, which forced a full rebuild on every run and
# left the read branch below unreachable.
if not os.path.isfile('exog_data.pkl'):
    # One row per (item, store, date): weekly prices expanded to daily dates,
    # enriched with the hierarchy columns taken from the training frame.
    X_df = (prices
        .merge(calendar[["wm_yr_wk", "date"]], on='wm_yr_wk')
        .merge(train[["id", "store_id", "item_id", "cat_id", "dept_id", "state_id"]].drop_duplicates(),
               on=['item_id', 'store_id'], how="left")
    )

    # Aggregate the price features at every hierarchy level listed in `spec`.
    X_all_df = []
    for ind_spec in spec:
        pdf = (X_df
               .groupby(ind_spec + ["date"])
               .agg(sell_price = ('sell_price', 'mean'))
               .reset_index()
               .rename(columns={'date':'ds'})
               .sort_values(by="ds")
               .assign(
                   # Series identifier: the level's key columns joined with '/'.
                   unique_id = lambda x : x[ind_spec].agg('/'.join, axis=1),
                   # Ratio of the price to the previous row's price within the same
                   # series (rows are sorted by ds); the first row of each series
                   # has no predecessor and is filled with 0.
                   momentum_price = lambda x : (x['sell_price'] / x.groupby(ind_spec)['sell_price']
                                                .transform('shift')
                                               ).fillna(0),
               )
               .set_index('unique_id')
              )
        X_all_df.append(pdf)

    X_all_df_ = pd.concat(X_all_df).reset_index()[['unique_id', "ds", "sell_price", "momentum_price"]]
    # Attach the calendar event / SNAP features for each date.
    X_all_df_ = X_all_df_.merge(calendar.rename(columns={"date":"ds"})[["ds", 'event_name_1',
                                                        'event_name_2', 'event_type_1',
                                                        'event_type_2', 'snap_CA', 'snap_TX',
                                                        'snap_WI']], how="left", on=["ds"])
    # Encode the calendar features as integer category codes.
    for col in EXTRA_FEATURES:
        X_all_df_[col] = X_all_df_[col].fillna('nan').astype("category").cat.codes.astype(int)

    X_all_df_.to_pickle('exog_data.pkl')
    # Release the large intermediates; only X_all_df_ is needed afterwards.
    del X_all_df, pdf, X_df
    gc.collect()
else:
    # Reuse the cached table written by a previous run of this cell.
    X_all_df_ = pd.read_pickle("exog_data.pkl")

# Set up the reconciliation framework

In [2]:
# Forecast columns that are averaged into the bottom-level blend.
bottom_fcst = [
    'LGBMRegressor',
    'LGBMRegressor/BottomUp',
    'LGBMRegressor/TopDown_method-forecast_proportions',
    'LGBMRegressor/MinTrace_method-mint_shrink',
    'LGBMRegressor/OptimalCombination_method-ols',
]

# Reconciliation methods applied to the base predictions
# (ERM(method='reg') is intentionally left out).
reconcilers = [
    BottomUp(),
    TopDown(method='forecast_proportions'),
    MinTrace(method='mint_shrink'),
    OptimalCombination(method='ols'),
]
hrec = HierarchicalReconciliation(reconcilers=reconcilers)

# Basic LightGBM parameters.
model_params = dict(
    verbose=-1,
    force_col_wise=True,
    num_leaves=128,
    n_estimators=500,
)

# Forecaster template: weekly lags 7, 14, ..., 56, an expanding mean on lag 1,
# and the same set of rolling statistics on lags 7, 14 and 28.
fcst = MLForecast(
    models=[lgb.LGBMRegressor(**model_params)],
    freq='D',
    lags=list(range(7, 57, 7)),
    lag_transforms={
        1: [ExpandingMean()],
        **{
            lag: [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)]
            for lag in (7, 14, 28)
        },
    },
    date_features=['month', 'day', 'dayofweek', 'quarter', 'week'],
    num_threads=70,
)

# Calendar keyed by a parsed `ds` column, with the event / SNAP features
# replaced by their category codes.
merge_calendar = calendar.rename(columns={"date": "ds"})
merge_calendar = merge_calendar.assign(ds=pd.to_datetime(merge_calendar['ds']))

for col in EXTRA_FEATURES:
    merge_calendar[col] = (
        merge_calendar[col]
        .fillna('nan')
        .astype("category")
        .cat.codes
    )

def hierarchical_forecast(train, store, extra, spec, h, exog):
    """Forecast every hierarchy level of one store and reconcile the forecasts.

    One copy of the module-level ``fcst`` template is fitted per level of
    ``spec`` so that each level gets its own static features. The per-level
    forecasts and fitted values are then reconciled with the module-level
    ``hrec`` and the bottom-level rows are returned.

    Parameters
    ----------
    train : pandas.DataFrame
        Long-format sales frame. Must contain ``state_id@store_id``, ``date``
        and the columns named in ``spec``; ``aggregate`` is expected to produce
        the ``y`` target from it.
    store : str
        Value of ``state_id@store_id`` selecting the store to process.
    extra : list of str
        Dynamic feature columns, read from ``exog``, used at fit and predict time.
    spec : list of list of str
        Hierarchy levels. The last entry is used as the bottom level
        (assumes the list is ordered coarse -> fine, as the module-level
        ``spec`` is).
    h : int
        Forecast horizon passed to ``predict``.
    exog : pandas.DataFrame
        Exogenous features with ``unique_id``, ``ds`` and the ``extra`` columns,
        covering both the training period and the forecast period.

    Returns
    -------
    pandas.DataFrame
        Reconciled bottom-level forecasts with an added ``blend_fcst`` column
        (row-wise mean of the ``bottom_fcst`` columns) and ``unique_id``
        rewritten as ``<last id part>_<second id part>_evaluation``.
    """
    # Hierarchical target frame (Y_df), summing matrix (S_df) and a dict
    # mapping each level name to its series ids (tags), for this store only.
    Y_df, S_df, tags = aggregate(
        train.loc[train['state_id@store_id'] == store].rename(columns={'date': 'ds'}),
        spec
    )
    # Series ids of the most granular level.
    all_series_at_bottom = tags['/'.join(spec[-1])]

    # Restrict the exogenous table to this store's series and parse `ds` once.
    # The left merges below only ever match these ids, so this is equivalent to
    # merging against the full table but avoids copying it for every level.
    store_exog = exog.loc[exog['unique_id'].isin(Y_df.index.unique())]
    store_exog = store_exog.assign(ds=pd.to_datetime(store_exog['ds']))

    # Future features start the day after the last observed date
    # (one-day step, matching the daily frequency of the `fcst` template).
    forecast_start = pd.to_datetime(Y_df['ds']).max() + pd.Timedelta(days=1)
    future_exog = store_exog.loc[
        store_exog['ds'] >= forecast_start,
        ['unique_id', 'ds'] + extra,
    ]

    fcst_ = []
    fitted = []

    # Each level is fitted separately rather than in one shared model, which
    # lets every level use its own key columns as static features.
    for series in spec:
        # NOTE(review): copy() is shallow, so internal state may be shared with
        # the `fcst` template; this relies on each level being fitted and
        # predicted before the next one starts.
        forecast_model = copy(fcst)
        fitted_ds = (Y_df
                     .loc[tags['/'.join(series)]]
                     .reset_index()
                     .merge(store_exog, how="left", on=["ds", "unique_id"])
                     )
        # Recover the level's key columns from the '/'-joined unique_id.
        level_keys = (fitted_ds['unique_id']
                      .str.split('/', expand=True)
                      .set_axis(series, axis=1))
        fitted_ds = pd.concat(
            (fitted_ds, level_keys), axis=1
        )[['unique_id', 'ds', 'y'] + series + extra]

        # Static features are passed to the model as integer category codes.
        for col in series:
            fitted_ds[col] = fitted_ds[col].fillna('nan').astype("category").cat.codes.astype(int)

        # Fit the model, keeping the in-sample fitted values for reconciliation.
        forecast_model.fit(
            fitted_ds,
            id_col='unique_id',
            time_col='ds',
            target_col='y',
            fitted=True,
            static_features=series,
        )
        # Forecast h steps ahead using the future values of the dynamic features.
        Y_hat_df = forecast_model.predict(h, X_df=future_exog)
        fcst_.append(Y_hat_df)
        # In-sample fitted values of this level.
        Y_fitted_df = forecast_model.forecast_fitted_values()
        fitted.append(Y_fitted_df)

    # Reconcile the forecasts of all levels together.
    Y_rec_df = hrec.reconcile(
        Y_hat_df=pd.concat(fcst_),
        Y_df=pd.concat(fitted),
        S=S_df,
        tags=tags
    )

    def _submission_id(frame):
        # Rebuild the id from the last and the second '/'-separated parts of
        # unique_id, e.g. "<item_id>_<store_id>_evaluation" for the bottom level.
        parts = frame['unique_id'].str.split('/', expand=True)
        return parts.iloc[:, -1] + '_' + parts.iloc[:, 1] + '_evaluation'

    bottom_level = (Y_rec_df
                    .loc[all_series_at_bottom]
                    .assign(blend_fcst=lambda x: x[bottom_fcst].mean(axis=1))
                    .reset_index(names="unique_id")
                    .assign(unique_id=_submission_id)
                    )
    return bottom_level


In [47]:
# Run the hierarchical forecast store by store and collect the bottom-level results.
all_fcst = []
for store in all_state_and_store:
    print(f"running for {store}")
    btm = hierarchical_forecast(
        train,
        store,
        extra=EXTRA_FEATURES,
        spec=spec,
        h=HORIZON,
        exog=X_all_df_
    )
    all_fcst.append(btm)

# Concatenate once, then both persist and keep the result
# (previously the full list was concatenated twice).
hier_fcst = pd.concat(all_fcst)
hier_fcst.to_pickle('all_fcst_hier.pkl')
hier_fcst.head()

running for CA@CA_1
running for CA@CA_2
running for CA@CA_3
running for CA@CA_4
running for TX@TX_1
running for TX@TX_2
running for TX@TX_3
running for WI@WI_1
running for WI@WI_2
running for WI@WI_3


Unnamed: 0,unique_id,index,ds,LGBMRegressor,index/BottomUp,LGBMRegressor/BottomUp,index/TopDown_method-forecast_proportions,LGBMRegressor/TopDown_method-forecast_proportions,index/MinTrace_method-mint_shrink,LGBMRegressor/MinTrace_method-mint_shrink,index/OptimalCombination_method-ols,LGBMRegressor/OptimalCombination_method-ols,blend_fcst
0,FOODS_1_001_CA_1_evaluation,0,2016-05-23,0.959109,0.0,0.959109,0.0,0.836653,-3049.679773,0.873283,-3046.143386,0.636093,0.85285
1,FOODS_1_001_CA_1_evaluation,1,2016-05-24,0.735005,1.0,0.735005,1.767324e-08,0.718589,-3049.673476,0.752092,-3046.139923,0.782667,0.744672
2,FOODS_1_001_CA_1_evaluation,2,2016-05-25,0.693631,2.0,0.693631,1.366277e-07,0.665206,-3049.667179,0.671126,-3046.13646,0.637267,0.672172
3,FOODS_1_001_CA_1_evaluation,3,2016-05-26,0.673162,3.0,0.673162,4.460956e-07,0.650177,-3049.660882,0.63037,-3046.132996,0.61442,0.648258
4,FOODS_1_001_CA_1_evaluation,4,2016-05-27,0.808946,4.0,0.808946,1.024028e-06,0.704992,-3049.654586,0.72161,-3046.129533,0.541779,0.717255


In [10]:
# Share of zero-sales days per item over the trailing window.
# The window is selected with a strict ">" so that it spans exactly
# ZERO_WINDOW_DAYS days; with ">=" it covered one extra day while the count was
# still divided by 90, so the frequency could exceed 1.
ZERO_WINDOW_DAYS = 90
zero_frq = (train
 .loc[train['date'] > train['date'].max() - pd.DateOffset(days=ZERO_WINDOW_DAYS)]
 .query("y == 0")
 .groupby('id')
 .size()
 .to_frame('nb')
 .assign(freq = lambda x : x["nb"] / ZERO_WINDOW_DAYS)
 .reset_index()
)

# NOTE(review): this rebinds `fcst`, which hierarchical_forecast() copies as its
# MLForecast template; the forecasting cells must be re-created before being
# re-run after this cell.
fcst = pd.read_pickle('all_fcst_hier.pkl')
fcst['yhat'] = fcst['LGBMRegressor/BottomUp'].clip(0, None)

# Items absent from zero_frq (no zero-sales day in the window) get a NaN `freq`
# from the left merge, so the comparison below is False and their clipped
# forecast is kept. Positive forecasts of items that sold nothing on at least
# 95% of the window are forced to 0.
submission_frame = (
    fcst
    .merge(zero_frq.rename(columns={"id":"unique_id"}), how="left", on=["unique_id"])
    .assign(yhat=lambda x : np.where(
        (x["LGBMRegressor/BottomUp"].clip(0, None) > 0) & (x["freq"] >= 0.95),
        0,
        x["LGBMRegressor/BottomUp"].clip(0, None)))
    .rename(columns={"unique_id":"id", "ds":"date"})
)

# NOTE(review): make_submission is not defined in the visible part of the
# notebook; it must be defined or imported before this cell runs.
make_submission(submission_frame, fcst_col="yhat", filename="v3hier")

In [27]:
# Same submission as the previous cell: clip the bottom-up forecast at zero and
# force it to 0 for items whose zero-sales frequency is at least 0.95.
bottom_up_col = "LGBMRegressor/BottomUp"
submission_input = (
    fcst
    .merge(zero_frq.rename(columns={"id": "unique_id"}), how="left", on=["unique_id"])
    .assign(
        yhat=lambda frame: np.where(
            (frame[bottom_up_col].clip(0, None) > 0) & (frame["freq"] >= 0.95),
            0,
            frame[bottom_up_col].clip(0, None),
        )
    )
    .rename(columns={"unique_id": "id", "ds": "date"})
)
make_submission(submission_input, fcst_col="yhat", filename="v3hier")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation["id"] = evaluation["id"].str.replace("evaluation", "validation")
