In [1]:
import forecast_engine.modelling.ml_forecast.ml_forecast as ml_forecast
import forecast_engine.modelling.ml_forecast.stats_forecast as sf_forecast
import forecast_engine.modelling.ml_forecast.smape as smape
import forecast_engine.modelling.ml_forecast.skus_to_exclude as skus_exclude
import forecast_engine.modelling.ml_forecast.output_prep as output_prep
import forecast_engine.modelling.ml_forecast.mape_prep as mape_prep
import forecast_engine.modelling.utils.config as config
import forecast_engine.preprocess.nodes as preprocess
import forecast_engine.modelling.ml_forecast.ml_forecast_tuned as ml_forecast_tuned
import forecast_engine.modelling.ml_forecast.hyper_parameter_optuna as tune_models
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')


LOAD CONFIG

In [107]:
# Load config
import datetime

# Run timestamp, truncated to the minute.
now = str(datetime.datetime.now().replace(second=0, microsecond=0))

# Catalog (paths, run metadata) and parameter configs are read relative to the working directory.
config_catalog = config.load_catalog_params(os.getcwd(), 'conf/catalogs')
config_parameters = config.load_catalog_params(os.getcwd(), 'conf/parameters')
version = str(config_catalog['data_preprocessing']['version'])
run_level = config_catalog['data_preprocessing']['run_level']

# Output Paths: <cwd>/<model_output_path>/<version>/{processed_data,final_results,mape_summary}
model_output_path = "/".join([os.getcwd(), config_catalog['output_file_path']['model_output_path'], version])
newpath = model_output_path
processed_data_path = "/".join([newpath, 'processed_data'])
final_results_path = "/".join([newpath, 'final_results'])
mape_summary_path = "/".join([newpath, 'mape_summary'])

# exist_ok=True creates each directory independently, so a partially created
# output tree (parent present, a sub-folder missing) is repaired instead of
# being skipped, and re-running the cell is harmless.
os.makedirs(newpath, exist_ok=True)
os.makedirs(processed_data_path, exist_ok=True)
os.makedirs(final_results_path, exist_ok=True)
os.makedirs(mape_summary_path, exist_ok=True)
   

# Inputs
# Country selector plus the input file locations; every path from the catalog
# is prefixed with the current working directory.
country = config_catalog['data_preprocessing']['country']
actual_sales = os.getcwd() + "/" + config_catalog['data_preprocessing']['actual_sales_file_path']

# External driver files.
weather = os.getcwd() + "/" + config_catalog['data_preprocessing']['drivers']['weather']
macro = os.getcwd() + "/" + config_catalog['data_preprocessing']['drivers']['macro']
asi = os.getcwd() + "/" + config_catalog['data_preprocessing']['drivers']['asi']
ndvi = os.getcwd() + "/" + config_catalog['data_preprocessing']['drivers']['ndvi']
vhi = os.getcwd() + "/" + config_catalog['data_preprocessing']['drivers']['vhi']
    



# Forecast Engine Parameters
# All engine settings live under one section of the parameters config.
engine_settings = config_parameters['global_forecast_engineSettings']

cores_to_use = engine_settings['cores_to_use']
shap_plots = engine_settings['shap_plots']
train_end_date = pd.to_datetime(engine_settings['train_end_date'])
forecast_start_date = pd.to_datetime(engine_settings['forecast_start_date'])
train_start_date = pd.to_datetime(engine_settings['train_start_date'])
# When no usable start date is configured (None / NaT), train on the three
# years ending at train_end_date. An explicit check replaces the previous
# try/except, which discarded its result and never triggered for NaT.
if pd.isna(train_start_date):
    train_start_date = train_end_date - pd.DateOffset(years=3)
forecast_horizon = engine_settings['forecast_horizon']
validation_period = engine_settings['validation_period']

intermittence_thresh = engine_settings['intermittence_thresh']
min_month_thresh = engine_settings['min_month_thresh']
# The seasonal length handed to the univariate models reuses the minimum-history threshold.
season_length = min_month_thresh

use_external_features = engine_settings['use_external_features']
if use_external_features == 1:
    external_feature_names = engine_settings['external_feature_names']
else:
    external_feature_names = []

multivariate_models = engine_settings['multivariate_models']
univariate_models = engine_settings['univariate_models']

hyperparameter_tuning = engine_settings['hyperparameter_tuning']
# Truthiness test (rather than `== 1`) matches the guards used by the tuning
# and training cells, so the search spaces exist whenever those cells need them.
if hyperparameter_tuning:
    tune_space = engine_settings['hyperparameter_tune_space']
    params_xgb = tune_space['params_xgb']
    params_rf = tune_space['params_rf']
    params_lsvr = tune_space['params_lsvr']
    params_histgb = tune_space['params_histgb']
    params_ridge = tune_space['params_ridge']
    params_knn = tune_space['params_knn']
    params_mlp = tune_space['params_mlp']

LOAD INPUTS

In [64]:
# Load input files
sales_data, df, raw_data = preprocess.read_prep_actuals(actual_sales, run_level)
wthr_data = preprocess.read_prep_weather(weather)
# The loaded macro frame gets its own name: rebinding `macro` (the file path
# from the config cell) to a DataFrame made this cell fail when re-run.
macro_data = preprocess.read_macro_files(macro, country=country)
agi = preprocess.read_agi_files(asi, ndvi, vhi, country=country)
df_model = preprocess.model_data_combine(df, wthr_data, macro_data, agi)

# Holiday join key: month number followed by the third '-'-separated token of
# unique_id (presumably the country code - verify against create_holiday).
df_model['key'] = df_model['ds'].dt.month.astype(str) + df_model['unique_id'].apply(lambda x: x.split('-')[2])
df_holi = preprocess.create_holiday(list(raw_data['Country'].unique()))
df_model = pd.merge(df_model, df_holi[['key', 'holiday']], on='key', how='left').drop('key', axis=1)

# Keep the target, the series id, the date and the configured external features only.
df_model = df_model[['y', 'unique_id', 'ds'] + external_feature_names]

In [65]:
# Drop SKUs
df_model = skus_exclude.skus_to_exclude(df_model, min_month_thresh)

# Materials present in the actuals but no longer in the modelling frame.
# The surviving ids are collected once into a set (the original rebuilt the
# list for every material), and the column is named at construction so an
# empty result still carries a 'unique_id' column.
kept_ids = set(df_model['unique_id'].unique())
dropped = pd.DataFrame({'unique_id': [material for material in df['Material'].unique() if material not in kept_ids]})
dropped['category'] = f'actuals length <{min_month_thresh} months'

In [66]:
# Flag SKUs
dates, act_dates = preprocess.prep_dates_list(df)

# Keep the monthly columns (labelled "YYYY - MM") that fall inside
# [train_start_date, forecast_start_date). Labels are parsed to pandas
# Timestamps so both sides of the comparison share a type: ordering a
# datetime.date against a Timestamp raises TypeError from pandas 2.0 onwards.
month_columns = df.set_index('Material').columns.to_list()
dates = [
    col for col in month_columns
    if train_start_date <= pd.to_datetime(col, format="%Y - %m") < forecast_start_date
]
df = df.set_index('Material')[dates].reset_index()

# Per-series intermittence score and the resulting regular/irregular flag.
df['intermittence'] = df.apply(lambda x: preprocess.intermittence(x), axis=1)
df['seq - flag'] = df.apply(lambda x: preprocess.seq_flag(x, intermittence_thresh, dates, df.columns), axis=1)
SKU_flags = df[['Material', 'seq - flag']]
regular_series = list(SKU_flags[SKU_flags['seq - flag'] == 1]['Material'])

# Snapshot (flags included) for the local pipeline, then strip the helper
# columns from the frame used by the global pipeline.
df_local = df.copy()
df = df.drop(columns=['intermittence', 'seq - flag'])

In [105]:
# Configure training and validation data
train = df_model.loc[(df_model['ds'] >= train_start_date) & (df_model['ds'] <= train_end_date)].reset_index(drop=True)
train.fillna(0,inplace=True)
if use_external_features == 0 : train = train[['unique_id', 'ds', 'y']]
valid = df_model.loc[(df_model['ds'] > train_end_date) & 
            (df_model['ds'] < forecast_start_date)].reset_index(drop=True)
train_uni = train[['y','unique_id','ds']]

In [7]:
# Fraction of the cross-series mean volume below which a series is routed to the sparse list.
min_vol_thresh = 0.3

In [8]:
# Mean volume per series over the training rows whose date string contains "2022".
# NOTE(review): the year is hard-coded - confirm it should not follow train_end_date.
in_2022 = train["ds"].astype(str).str.contains("2022")
train_grp = train[in_2022].groupby('unique_id')['y'].mean().reset_index()

# A series is sparse when its mean falls below min_vol_thresh times the mean of all series means.
volume_cutoff = train_grp['y'].mean() * min_vol_thresh
sparse_data = list(train_grp.loc[train_grp['y'] < volume_cutoff, 'unique_id'])

# Sparse series are removed from the list handled by the global models.
regular_series = [series_id for series_id in regular_series if series_id not in sparse_data]

TRAINING

In [11]:
if hyperparameter_tuning:
    # Search for the best hyperparameters on the train/validation split.
    best_params = tune_models.hyper_paramerter_tune(
        train,
        valid,
        regular_series,
        forecast_horizon,
        external_feature_names,
    )

    # Regroup the tuned values per model family: for every key declared in a
    # family's search space, take the tuned value from best_params.
    hyper_paramerter_space = {
        family: {param: best_params[param] for param in space}
        for family, space in (
            ('params_xgb', params_xgb),
            ('params_rf', params_rf),
            ('params_lsvr', params_lsvr),
            ('params_histgb', params_histgb),
            ('params_ridge', params_ridge),
            ('params_knn', params_knn),
            ('params_mlp', params_mlp),
        )
    }

In [12]:
# Fit the multivariate (ML) models over the validation horizon, using the
# tuned hyperparameters when tuning is enabled.
if not hyperparameter_tuning:
    ml_, model = ml_forecast.ml_forecast(
        train,
        regular_series,
        multivariate_models,
        horizon=validation_period,
        processed_data_path=processed_data_path,
        static_features=external_feature_names,
    )
else:
    ml_, model = ml_forecast_tuned.ml_forecast(
        train,
        regular_series,
        multivariate_models,
        hyper_paramerter_space=hyper_paramerter_space,
        horizon=validation_period,
        processed_data_path=processed_data_path,
        static_features=external_feature_names,
    )

# Fit the univariate (statistical) models over the same horizon.
sf_ = sf_forecast.sf_forecast(
    train_uni,
    regular_series,
    univariate_models,
    season_length=season_length,
    horizon=validation_period,
)

# Combine both forecast sets with the actuals and pick the best model per series.
final_model_fit = output_prep.output_prep(ml_, sf_, sales_data)
best_model_df, valid_predictions = mape_prep.mape_prep(final_model_fit)

# Series with a complete best-model row are kept; series without a validation
# MAPE are recorded separately with the reason.
skus = list(best_model_df.dropna()['unique_id'].unique())
missing_mape = best_model_df['validation_mape'].isna()
skus_dropped = pd.DataFrame(best_model_df[missing_mape]['unique_id'])
skus_dropped['category'] = 'no actuals found for validation period'


  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)


OOS PREDICTION

In [13]:
# Refit on every observation before the forecast start, zero-filling gaps.
train = df_model[df_model['ds'] < forecast_start_date].reset_index(drop=True)
train = train.fillna(0)
if use_external_features == 0:
    train = train[['unique_id', 'ds', 'y']]
# Univariate input: series id, target and date only.
train_uni = train[['unique_id', 'y', 'ds']]

# Multivariate out-of-sample forecast over the full forecast horizon.
if not hyperparameter_tuning:
    ml_oos, model = ml_forecast.ml_forecast(
        train,
        regular_series,
        multivariate_models,
        horizon=forecast_horizon,
        processed_data_path=processed_data_path,
        static_features=external_feature_names,
    )
else:
    ml_oos, model = ml_forecast_tuned.ml_forecast(
        train,
        regular_series,
        multivariate_models,
        hyper_paramerter_space=hyper_paramerter_space,
        horizon=forecast_horizon,
        processed_data_path=processed_data_path,
        static_features=external_feature_names,
    )

# Univariate out-of-sample forecast over the same horizon.
sf_oos = sf_forecast.sf_forecast(
    train_uni,
    regular_series,
    univariate_models,
    season_length=season_length,
    horizon=forecast_horizon,
)

  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / (1 + e)
  fits = y / 

In [14]:
# Optional SHAP plots for the fitted model, switched on by the `shap_plots` setting.
if shap_plots:
    preprocess.shap_plots(
        train,
        model,
        best_model_df,
        regular_series,
        processed_data_path,
    )

In [15]:
# Composite string key (series id + date) shared by both forecast tables.
ml_oos['key'] = ml_oos['unique_id'] + ml_oos['ds'].astype(str)

# NOTE(review): reset_index() runs on every execution of this cell, so a
# second run changes sf_oos again - confirm before re-running.
sf_oos = sf_oos.reset_index()
sf_oos['key'] = sf_oos['unique_id'] + sf_oos['ds'].astype(str)

# Left-join the univariate forecasts onto the multivariate ones via the key.
final_model_fit = ml_oos.merge(
    sf_oos.drop(columns=['unique_id', 'ds']),
    on='key',
    how='left',
)


In [19]:
# Assemble the out-of-sample output for the retained series.
final_output = output_prep.prep_oos(
    ml_oos,
    sf_oos,
    best_model_df,
    skus,
)

# Post-process the assembled output relative to the forecast start date.
final_output_1 = preprocess.post_process(
    final_output,
    best_model_df,
    forecast_start_date,
)

In [22]:
# SAP-formatted export built from the raw actuals and the final forecast at the configured run level.
sap_output = preprocess.prep_sap_output(
    raw_data,
    final_output,
    run_level,
)

LOCAL PIPELINE

In [24]:
import re
import os
import pandas as pd
import numpy as np


import forecast_engine.modelling.utils.nodes as utils
import forecast_engine.modelling.local_pipeline.forecast_engine.forecast_engine as forecast_engine





import forecast_engine.modelling.local_pipeline.models.ml_models.naive_models as naive_models
import forecast_engine.modelling.local_pipeline.models.ml_models.holt_winters as holt_winters
import forecast_engine.modelling.local_pipeline.models.ml_models.croston as croston

import forecast_engine.modelling.utils.config as config
import forecast_engine.modelling.utils.logger as logger
import forecast_engine.preprocess.nodes as preprocess

In [51]:
#Flags
# Switches and window sizes passed to forecast_engine.forecast_engine.
debug_flag = 0
debug_models = []
outlier_flag = 1
select_variable_flag = 0
all_at_once_flag = 1
months_to_forecast = 12
max_validation = 6
validation_window = 3
single_series = 0
multivariate_flag = 0
ts_feature_flag = 0


# Candidate models for the local (per-series) pipeline, keyed by model name.
# Stored under its own name: reusing `univariate_models` would overwrite the
# config-driven value that the global pipeline passes to sf_forecast, breaking
# any re-run of those cells after this one.
local_univariate_models = {
    # Holt-Winters / ETS variants
    'hw_ets_r_MNN': holt_winters.hw_ets_r_MNN,
    'hw_ets_ATN': holt_winters.hw_ets_ATN,
    'hw_ets_AFN': holt_winters.hw_ets_AFN,
    'hw_ets_NFN': holt_winters.hw_ets_NFN,
    'hw_ets_r_MAN': holt_winters.hw_ets_r_MAN,
    # Naive baselines
    'mean_model': naive_models.mean_model,
    'median_model': naive_models.median_model,
    'snaive': naive_models.snaive,
    'naive': naive_models.naive,
    'naive_ets': naive_models.naive_ets,
    # Croston
    'croston': croston.croston,
}


list_of_models = local_univariate_models.copy()

In [67]:
# Remove the intermittence helper column and zero-fill gaps for the local pipeline.
# errors='ignore' keeps the cell re-runnable: the in-place drop raised KeyError
# on a second execution because the column was already gone.
df_local = df_local.drop(columns=['intermittence'], errors='ignore').fillna(0)

In [68]:
# Materials that have rows in sales_data for year '2022'.
# Kept under its own name so the `skus` list produced by the validation step
# (and consumed by output_prep.prep_oos) is not overwritten.
# NOTE(review): the year is hard-coded - confirm it should not follow the config dates.
skus_2022 = list(sales_data[sales_data['year'] == '2022']['Material'].unique())

# Sparse series from the local frame, with the id column renamed to 'index'.
# rename() without inplace avoids mutating a slice of df_local
# (SettingWithCopy), which the original in-place rename did.
df_sparse = (
    df_local[df_local['Material'].isin(sparse_data)]
    .rename(columns={'Material': 'index'})
)
df_sparse = df_sparse[df_sparse['index'].isin(skus_2022)].reset_index(drop=True)

In [70]:
%%capture
# Series whose local forecast failed, plus the reason for each failure.
lst_error = []
error_details = {}

# NOTE(review): only the first 10 rows of df_sparse are processed - this looks
# like a debugging limit; confirm before a full run.
keys = list(df_sparse.head(10)['index'].unique())

# Empty frames fix the column layout of each result table.
oos = pd.DataFrame(columns = ['index','DATE','Method','Forecast'])
models = pd.DataFrame(columns = ['models', 'smape','index'])
fitted_values = pd.DataFrame(columns = ['date_', 'forecast', 'method','index'])
ensemble = pd.DataFrame(columns= ['index','date_','forecast','method'])

# Per-series pieces are collected in lists and concatenated once after the
# loop instead of growing each DataFrame on every iteration.
oos_parts, model_parts, fitted_parts, ensemble_parts = [], [], [], []
for i in keys:
    try:
        df_output = df_sparse[df_sparse['index'] == i].apply(lambda x: forecast_engine.forecast_engine(x, months_to_forecast,debug_flag,debug_models,list_of_models, single_series,  all_at_once_flag, validation_window, max_validation, outlier_flag, select_variable_flag),axis = 1)
        # First (only) engine result for this series; .iloc[0] replaces the
        # deprecated Series.ravel()[0].
        result = df_output.iloc[0]
        oos_parts.append(result['output'][['index','DATE','Method','Forecast']])

        model_list = result['dataf']
        model_list['index'] = i
        model_parts.append(model_list)

        # Fitted values of the model listed under label 0 of model_list.
        df_fitted_values_list = result['fitted_values_list']
        df_fitted_values_list = df_fitted_values_list[0][model_list['models'][0]]['forecast']
        df_fitted_values_list['index'] = i
        fitted_parts.append(df_fitted_values_list)

        ensemble_parts.append(result['df_ensemble'][['index','date_','forecast','method']])

    except Exception as err:
        # Still best-effort per series, but the cause is recorded and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        lst_error.append(i)
        error_details[i] = repr(err)

oos = pd.concat([oos, *oos_parts])
models = pd.concat([models, *model_parts])
fitted_values = pd.concat([fitted_values, *fitted_parts])
ensemble = pd.concat([ensemble, *ensemble_parts])

    

In [None]:
def post_process_sparse(oos,models,run_level,raw_data,forecast_start_date):
    """Turn the sparse-series forecasts into the wide table and the SAP output.

    Parameters
    ----------
    oos : DataFrame with 'index', 'DATE' ("%Y - %m" strings), 'Method' and
        'Forecast' columns.
    models : DataFrame providing 'smape' per 'index'; distinct pairs are
        left-joined onto `oos`.
    run_level, raw_data : forwarded unchanged to preprocess.prep_sap_output.
    forecast_start_date : rows whose parsed DATE is earlier are discarded.

    Returns
    -------
    tuple
        (wide frame with one column per forecast date, result of
        preprocess.prep_sap_output on the long frame).
    """
    scores = models[['smape', 'index']].drop_duplicates()
    long_df = oos.merge(scores, on='index', how='left')

    # Parse the month labels and keep the forecast period only.
    long_df['DATE'] = long_df['DATE'].apply(lambda label: datetime.datetime.strptime(label, "%Y - %m"))
    long_df = long_df.loc[long_df['DATE'] >= forecast_start_date].reset_index(drop=True)

    long_df = long_df.rename(
        columns={
            'index': 'unique_id',
            'Method': 'variable',
            'smape': 'validation_mape',
            'DATE': 'ds',
            'Forecast': 'value',
        }
    )
    # Anything that is not strictly positive becomes 0.
    long_df['value'] = long_df['value'].apply(lambda v: v if v > 0 else 0)

    final_output_1_sparse = (
        long_df
        .pivot(index=['unique_id', 'variable', 'validation_mape'], columns="ds", values='value')
        .reset_index()
        .rename_axis(None, axis=1)
    )
    sap_output_sparse = preprocess.prep_sap_output(raw_data, long_df.copy(), run_level)
    return final_output_1_sparse, sap_output_sparse

In [99]:
# Wide forecast table and SAP export for the sparse series.
final_output_1_sparse, sap_output_sparse = post_process_sparse(
    oos,
    models,
    run_level,
    raw_data,
    forecast_start_date,
)


In [128]:
def valid_predictions_sprase_prep(fitted_values,sales_data,train_end_date,forecast_start_date):
    """Join fitted values with actuals and keep the validation window.

    Parameters
    ----------
    fitted_values : DataFrame with 'index' (series id), 'date_' ("%Y - %m"
        strings) and 'forecast' columns. Not modified.
    sales_data : DataFrame with 'Material', 'version' and 'y' columns;
        'Material' + 'version' is matched against 'index' + 'date_'. Not modified.
    train_end_date, forecast_start_date : exclusive bounds of the validation
        window; anything accepted by pd.Timestamp.

    Returns
    -------
    DataFrame
        Rows of `fitted_values` strictly inside the window, with the added
        'key' and 'y' columns, missing values filled with 0 and forecasts
        that are not strictly positive set to 0.
    """
    # Work on copies so the caller's frames are left untouched.
    fitted = fitted_values.copy()
    fitted['key'] = fitted['index'] + fitted['date_']
    actuals = sales_data.copy()
    actuals['key'] = actuals['Material'] + actuals['version']

    merged = pd.merge(fitted, actuals[['key', 'y']], on='key', how='left').fillna(0)

    # Compare Timestamp against Timestamp: ordering datetime.date objects
    # against a pandas Timestamp raises TypeError from pandas 2.0 onwards.
    period = pd.to_datetime(merged['date_'], format="%Y - %m")
    in_window = (period > pd.Timestamp(train_end_date)) & (period < pd.Timestamp(forecast_start_date))
    validation_predictions_sparse = merged.loc[in_window].reset_index(drop=True)

    validation_predictions_sparse['forecast'] = validation_predictions_sparse['forecast'].apply(lambda x: x if x > 0 else 0)
    return validation_predictions_sparse

In [131]:
# Build the sparse validation predictions here: the notebook never assigned
# `validation_predictions_sparse`, so this cell only worked with a value left
# over in the kernel. Rebuilding it also makes the cell safe to re-run.
validation_predictions_sparse = valid_predictions_sprase_prep(fitted_values, sales_data, train_end_date, forecast_start_date)

# Split the "YYYY - MM" label into its year and month parts.
validation_predictions_sparse['year'] = validation_predictions_sparse['date_'].apply(lambda x : x.split(' - ')[0])
validation_predictions_sparse['month'] = validation_predictions_sparse['date_'].apply(lambda x : x.split(' - ')[1])

# Align column names and order with the global pipeline's valid_predictions.
validation_predictions_sparse = validation_predictions_sparse.rename(columns = {'forecast':'Forecast','date_':'ds','index':'unique_id'})
validation_predictions_sparse = validation_predictions_sparse[valid_predictions.columns.to_list()]

PIPELINE DEBUG

In [1]:
import warnings
warnings.filterwarnings('ignore')
import forecast_engine.modelling.thunderbird_global as thunderbird
import forecast_engine.modelling.utils.logger as logger

In [3]:
# Stage 1 - data preprocessing: load inputs, drop SKUs, flag SKUs.
logger.logger.info('loading data preprocessing pipeline...')
thunderbird.thunderbird_global.instantiate_inputs()
thunderbird.thunderbird_global.drop_skus()
thunderbird.thunderbird_global.flag_skus()
logger.logger.info('data preprocessing pipeline complete...')

# Stage 2 - model parameters.
logger.logger.info('loading model parameters...')
thunderbird.thunderbird_global.thunder_fuel()
logger.logger.info('model parameters loaded...')

In [3]:
# Stage 3 - hyperparameter tuning.
logger.logger.info('hyperparameter tuning started...')
thunderbird.thunderbird_global.tune_engine()
logger.logger.info('hyperparameter tuning complete...')

2023-04-07 22:12:28,503 | INFO | hyperparameter tuning started...
2023-04-07 22:12:28,504 | INFO |  engine not tuned
2023-04-07 22:12:28,505 | INFO | hyperparameter tuning complete...


In [19]:
# Stage 4 - model fitting.
logger.logger.info('fitting models started...')
thunderbird.thunderbird_global.fit_models()
logger.logger.info('fitting models complete...')

In [20]:
# Stage 5 - out-of-sample prediction.
logger.logger.info('oos prediction started...')
thunderbird.thunderbird_global.predict_oos()
logger.logger.info('oos prediction complete...')

In [21]:
# Stage 6 - post-processing.
# NOTE(review): no call sits between the two post-processing log lines -
# confirm whether a post-processing step is missing here.
logger.logger.info('running post processing pipeline...')

logger.logger.info('post processing pipeline complete...')

# Stage 7 - persist the model outputs.
logger.logger.info('Saving model outputs...')
thunderbird.thunderbird_global.generate_results()
logger.logger.info('Thunder bird forecast engine run complete.')
logger.logger.info("Thunder bird forecast engine run complete.")