In [14]:
import pandas as pd 
import numpy as np 

skip = [f"d_{i}" for i in range(1, 700 + 1)] 
HORIZON = 28 # horizon de forecast
CAT_COLS = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
# spec de la hierarchie que l'on veut travaillé 
spec = [
    ['state_id', 'store_id', 'cat_id'], 
    ['state_id', 'store_id', 'cat_id', 'dept_id'],
    ['state_id', 'store_id', 'cat_id', 'dept_id', 'item_id'],
]

calendar = pd.read_csv("C:/Users/n000193384/Downloads/awa_mooc/src/data/calendar.csv", 
                       parse_dates=['date'],
                       usecols=['date', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2', 
                                'event_type_1', 'event_type_2',
                                'snap_CA', 'snap_TX', 'snap_WI']
)

event_cols = [k for k in calendar if k.startswith('event')]
for col in event_cols:
    calendar[col] = calendar[col].fillna('nan')
    
row_sales = pd.read_csv("C:/Users/n000193384/Downloads/awa_mooc/src/data/sales_train_evaluation.csv").sample(100)
prices = pd.read_csv('C:/Users/n000193384/Downloads/awa_mooc/src/data/sell_prices.csv')

row_sales['state_id@store_id'] = row_sales['state_id'].astype(str) + "@" + row_sales['store_id'].astype(str)
all_state_and_store = np.unique(row_sales['state_id@store_id'])

# formattage ligne et renommage des colonnes pour respecter les conventions de la librairie.
train = (row_sales
         .drop(skip, axis=1)
         .pipe(pd.melt, 
               id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id", "state_id@store_id"],
               var_name='d',
               value_name="y"
               )
         .merge(calendar, how="left", on=["d"])
         .merge(prices, on=["store_id", "item_id", "wm_yr_wk"])
        )

st_idx = (train
          .loc[(train['y'] > 0) & train['sell_price'].notnull()]
          .groupby('id', as_index=False)
          .agg(start_date = ('date', 'min'))
)

train = train.merge(st_idx, how="left", on=["id"]).query('date>=start_date')

future_cal = calendar[calendar['date'] >  train['date'].max()]
future_prices = prices[prices['wm_yr_wk'] >= train['wm_yr_wk'].max()].copy()
future_prices['id'] = future_prices['item_id'].astype(str) + '_' + future_prices['store_id'].astype(str) + '_evaluation'
X_df = future_prices.merge(future_cal, on='wm_yr_wk').drop(columns=['store_id', 'item_id', 'wm_yr_wk', 'd'])

# extract all series id
train.tail(5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,state_id@store_id,d,y,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,start_date
115688,HOBBIES_2_149_TX_1_evaluation,HOBBIES_2_149,HOBBIES_2,HOBBIES,TX_1,TX,TX@TX_1,d_1941,0,2016-05-22,11617,,,,,0,0,0,0.97,2013-05-07
115689,FOODS_3_718_CA_4_evaluation,FOODS_3_718,FOODS_3,FOODS,CA_4,CA,CA@CA_4,d_1940,0,2016-05-21,11617,,,,,0,0,0,1.48,2012-12-29
115690,FOODS_3_718_CA_4_evaluation,FOODS_3_718,FOODS_3,FOODS,CA_4,CA,CA@CA_4,d_1941,1,2016-05-22,11617,,,,,0,0,0,1.48,2012-12-29
115691,HOUSEHOLD_2_446_WI_2_evaluation,HOUSEHOLD_2_446,HOUSEHOLD_2,HOUSEHOLD,WI_2,WI,WI@WI_2,d_1940,0,2016-05-21,11617,,,,,0,0,0,25.97,2013-02-12
115692,HOUSEHOLD_2_446_WI_2_evaluation,HOUSEHOLD_2_446,HOUSEHOLD_2,HOUSEHOLD,WI_2,WI,WI@WI_2,d_1941,0,2016-05-22,11617,,,,,0,0,0,25.97,2013-02-12


In [30]:
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, SeasonalRollingMean
import lightgbm as lgb 

# paramètre basique pour le lightgbm
model_params = {
    'verbose': -1,
    'force_col_wise': True,
    'num_leaves': 100,
    'n_estimators': 100,
}

fcst = MLForecast(
    models=[lgb.LGBMRegressor(**model_params)],
    freq='D',
    lags=[7 * (i+1) for i in range(8)],
    lag_transforms = {
        1:  [ExpandingMean()],
        7:  [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)],
        14: [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)],
        28: [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)],
    },
    date_features=['month', 'day', 'dayofweek', 'quarter', 'week'],    
    num_threads=-1,
)

fcst.fit(
    train,
    id_col='id',
    time_col='date',
    target_col='y',
    static_features=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
)

bottom_level_forecast = fcst.predict(28, X_df=X_df)
bottom_level_forecast.head()



Unnamed: 0,id,date,LGBMRegressor
0,FOODS_1_001_CA_1_evaluation,2016-05-23,0.777874
1,FOODS_1_001_CA_1_evaluation,2016-05-24,0.702040
2,FOODS_1_001_CA_1_evaluation,2016-05-25,0.624894
3,FOODS_1_001_CA_1_evaluation,2016-05-26,0.684752
4,FOODS_1_001_CA_1_evaluation,2016-05-27,0.700597
...,...,...,...
853715,HOUSEHOLD_2_516_WI_3_evaluation,2016-06-15,0.151102
853716,HOUSEHOLD_2_516_WI_3_evaluation,2016-06-16,0.162570
853717,HOUSEHOLD_2_516_WI_3_evaluation,2016-06-17,0.188832
853718,HOUSEHOLD_2_516_WI_3_evaluation,2016-06-18,0.204748


In [None]:
toplevel = (train
            .groupby(['state_id', 'store_id', 'cat_id', 'date'], as_index=False)
            .agg(y=('y', 'sum'))
            .assign(unique_id = lambda x : x['state_id'].astype(str) + '@' + x['store_id'].astype(str) + '@' + x['cat_id'].astype('str'))
            .rename(columns={"date":"ds"})
)[["unique_id", "ds", "y"]]


from statsforecast import StatsForecast
from statsforecast.models import (
    AutoETS,
    DynamicOptimizedTheta,
    AutoCES,
    AutoARIMA, 
    MSTL,
    SeasonalWindowAverage
)

# on s'attend à un effet hebdomadaire.
EXPECTED_SEASONAL_VALUE = 30
HORIZON = 28

# Les modèles qui vont être "fitté"
models = [
    AutoETS(season_length=EXPECTED_SEASONAL_VALUE),
    DynamicOptimizedTheta(season_length=EXPECTED_SEASONAL_VALUE),
    AutoCES(season_length=EXPECTED_SEASONAL_VALUE),
    MSTL(
        season_length=[EXPECTED_SEASONAL_VALUE, 90], # seasonalities of the time series
        trend_forecaster=AutoARIMA() # model used to forecast trend
    )
]

# le wrapper.
wrapper_models = StatsForecast( 
    models=models,
    freq='D', 
    n_jobs=-1,
    fallback_model=SeasonalWindowAverage(season_length=EXPECTED_SEASONAL_VALUE, window_size=HORIZON)
)

toplevel_forecast = wrapper_models.forecast(df=train, h=HORIZON).assign(blend_yhat=lambda x : x.clip(0, None).median(axis=1))


In [31]:
Y_hat_df = bottom_level_forecast.append(toplevel)

from hierarchicalforecast.utils import aggregate
from hierarchicalforecast.core import HierarchicalReconciliation
from hierarchicalforecast.methods import BottomUp, TopDown, MinTrace, ERM, OptimalCombination

bottom_fcst = [
    'LGBMRegressor', 
    'LGBMRegressor/BottomUp',
    'LGBMRegressor/TopDown_method-forecast_proportions',
    'LGBMRegressor/MinTrace_method-ols', 
    'LGBMRegressor/MinTrace_method-mint_shrink'
    ]
    
# Reconcile the base predictions
reconcilers = [
    BottomUp(),
    TopDown(method='forecast_proportions'),
    MinTrace(method='ols'),
    MinTrace(method='mint_shrink'),
    ERM('reg'),
    OptimalCombination("ols")
]
hrec = HierarchicalReconciliation(reconcilers=reconcilers)

apd = []
for store in all_store:
    Y_df, S_df, tags = aggregate(
        train.loc[train['state_id@store_id'] == store].rename(columns={'date':'ds'}),
        spec
    )
    # get series
    all_series_at_bottom = tags['state_id/store_id/cat_id/dept_id/item_id']
    
    Y_rec_df = hrec.reconcile(Y_hat_df=Y_hat_df, 
                              Y_df=train[['unique_id', 'ds', 'y']].append(toplevel),  # Y_fitted_df,
                              S=S_df,
                              tags=tags)

    rec_df = (Y_rec_df
                    .loc[all_series_at_bottom]
                    .assign(blend_fcst=lambda x : x[bottom_fcst].mean(axis=1))
                    .reset_index(names="unique_id")
                     .assign(unique_id= lambda x : (x['unique_id'].str.split('/', expand=True).iloc[:, -1]
                                + '_' +
                                x['unique_id'].str.split('/', expand=True).iloc[:, 1]) 
                                + '_evaluation'
        )
                   )
    apd.append(rec_df)

In [32]:
def make_submission(test, fcst_col="", filename=""):
    # private leaderboard
    submission = pd.read_csv("/home/jupyter/mawa/data/sample_submission.csv")
    predictions = test[['id', 'date', fcst_col]]
    predictions = pd.pivot(predictions, index='id', columns='date', values=fcst_col).reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    # public leaderboard set to 0
    evaluation_rows = [row for row in submission['id'] if 'evaluation' in row]
    evaluation = submission[submission['id'].isin(evaluation_rows)]
    evaluation["id"] = evaluation["id"].str.replace("evaluation", "validation")

    validation = submission[['id']].merge(predictions, on='id')
    final = pd.concat([validation, evaluation])
    final.iloc[:, 1:] = final.iloc[:, 1:].astype(np.float32)
    final.to_csv(f'submission_{filename}.csv.gz', index=False, compression="gzip")



make_submission((preds
                 .reset_index()
                 .assign(yhat=lambda x: pd.to_numeric(np.around(x["LGBMRegressor"], 2).clip(0, None)))
                 .assign(id=lambda x : x['id'])
                ),
                fcst_col="yhat", filename=f"mlforecast"
               )


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation["id"] = evaluation["id"].str.replace("evaluation", "validation")
