##  FUNCTIONS

### DATA QUALITY

In [43]:
def data_quality(x):
    
    #types
    temp = x.astype({'month': 'O', 'wday': 'O'})             
    
    #Impute nulls
    temp.loc[x['event_name_1'].isna(),'event_name_1'] = 'no_event'
    
    def impute_mode(records):
        #mode of price in that product
        mode = records.sell_price.mode()[0]
        #Impute nulls
        records.loc[records.sell_price.isna(),'sell_price'] = mode
    
        return(records)

    temp = temp.groupby('item_id').apply(impute_mode)
      
    return(temp)

### Creation of variables

In [44]:
def make_variables(x):
    
    # INTERMITTENT DEMAND
    
    def stock_break(sales, n = 5):
        zero_sales = pd.Series(np.where(sales == 0,1,0))
        num_zeros = zero_sales.rolling(n).sum()
        stock_break = np.where(num_zeros == n,1,0)
        return(stock_break)
    
    x = x.sort_values(by = ['store_id','item_id','date'])
    x['stock_break_3'] = x.groupby(['store_id','item_id']).sales.transform(lambda x: stock_break(x, 3)).values
    x['stock_break_7'] = x.groupby(['store_id','item_id']).sales.transform(lambda x: stock_break(x,7)).values
    x['stock_break_15'] = x.groupby(['store_id','item_id']).sales.transform(lambda x: stock_break(x,15)).values
    
    
    #LAGS
    
    def make_lags(x, variable, num_lags = 7):
        lags = pd.DataFrame()
        for cada in range(1,num_lags+1):
            lags[variable + '_lag_'+ str(cada)] = x[variable].shift(cada)
        return(lags)
    
    lags_sell_price_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: make_lags(x = x, variable = 'sell_price', num_lags= 7))
    
    lags_stock_break_3_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: make_lags(x = x, variable = 'stock_break_3', num_lags= 1))
    
    lags_stock_break_7_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: make_lags(x = x, variable = 'stock_break_7', num_lags= 1))
    
    lags_stock_break_15_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: make_lags(x = x, variable = 'stock_break_15', num_lags= 1))
    
    lags_sales_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: make_lags(x = x, variable = 'sales', num_lags= 15))
    
    
    # MOBILE WINDOWS
    
    def min_movil(x, variable, num_periods = 7):
        minm = pd.DataFrame()
        for cada in range(2,num_periods+1):
            minm[variable + '_minm_' + str(cada)] = x[variable].shift(1).rolling(cada).min()
        return(minm)
    
    def mean_movil(x, variable, num_periods = 7):
        mm = pd.DataFrame()
        for cada in range(2,num_periods+1):
            mm[variable + '_mm_' + str(cada)] = x[variable].shift(1).rolling(cada).mean()
        return(mm)
    
    def max_movil(x, variable, num_periods = 7):
        maxm = pd.DataFrame()
        for cada in range(2,num_periods+1):
            maxm[variable + '_maxm_' + str(cada)] = x[variable].shift(1).rolling(cada).max()
        return(maxm)
    
    min_movil_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: min_movil(x = x, variable = 'sales', num_periods= 15))
    
    mean_movil_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: mean_movil(x = x, variable = 'sales', num_periods= 15))
    
    max_movil_x = x.groupby(['store_id','item_id'])\
                    .apply(lambda x: max_movil(x = x, variable = 'sales', num_periods= 15))
    
    
    #JOIN GENERATED DATAFRAMES
    
    x_join = pd.concat([x,
                      lags_sell_price_x,
                      lags_stock_break_3_x,
                      lags_stock_break_7_x,
                      lags_stock_break_15_x,
                      lags_sales_x,
                      min_movil_x,
                      mean_movil_x,
                      max_movil_x], axis = 1)

    x_join.dropna(inplace=True)
    
    x_join.drop(columns = ['sell_price','stock_break_3','stock_break_7','stock_break_15'],
                  inplace=True)
    
    #Create a single variable for the product
    x_join.insert(loc=0,column='producto',value=x_join.store_id + '_'+ x_join.item_id)
    x_join = x_join.drop(columns = ['store_id','item_id'])
    
    return(x_join)

###  Variable transformation

In [45]:
def transform_variables(x,y=None,mode = 'training'):
    
    '''
    I have modified this function so that it works for both training and execution:

    * Including the mode parameter, which defaults to training
    * Making the y parameter optional, since in execution it is not used

    When used in training mode apply the fit_transform method and save the objects.

    While when used in run mode it loads the objects and applies only the transform method.
    '''
    
    x.reset_index(inplace = True)

    #ENCODERS
    name_ohe = 'ohe_retail.pickle'
    name_te = 'te_retail.pickle'
    path_ohe = path + '/04_Models/' + name_ohe
    path_te = path + '/04_Models/' + name_te
    
    #ONE HOT ENCODING
    var_ohe = ['event_name_1']
    if mode == 'training':
        #If it is in training apply fit_transform and save the encoder
        ohe = OneHotEncoder(sparse = False, handle_unknown='ignore')
        ohe_x = ohe.fit_transform(x[var_ohe])
        ohe_x = pd.DataFrame(ohe_x, columns = ohe.get_feature_names_out())
        with open(path_ohe, mode='wb') as file:
           pickle.dump(ohe, file)
    else:
        # If it is running it retrieves the save and just applies transform
        with open(path_ohe, mode='rb') as file:
            ohe = pickle.load(file)
        ohe_x = ohe.transform(x[var_ohe])
        ohe_x = pd.DataFrame(ohe_x, columns = ohe.get_feature_names_out())

    #TARGET ENCODING    
    var_te = ['month','wday','weekday']
    if mode == 'training':
        # MAKE SURE Y HAS THE SAME REGISTERS AS X
        y.reset_index(inplace = True, drop = True)
        y = y.loc[y.index.isin(x.index)]
        # If it is in training apply fit_transform and save the encoder
        te = TargetEncoder(min_samples_leaf=100, return_df = False)
        te_x = te.fit_transform(x[var_te], y = y)
        names_te = [variable + '_te' for variable in var_te]
        te_x = pd.DataFrame(te_x, columns = names_te)
        with open(path_te, mode='wb') as file:
           pickle.dump(te, file)
    else:
        # If it is running it retrieves the save and just applies transform
        with open(path_te, mode='rb') as file:
            te = pickle.load(file)
        te_x = te.transform(x[var_te])
        names_te = [variable + '_te' for variable in var_te]
        te_x = pd.DataFrame(te_x, columns = names_te)
    
      

    #INTEGRATE, CLEAN AND RETURN THE DATAFRAME
    #Delete the already transformed originals
    x = x.drop(columns=['event_name_1','month','wday','weekday'])

    x = pd.concat([x,ohe_x,te_x], axis=1).set_index('date')

    return(x)

### Variables Pre-selection

In [46]:
def variables_selection(x,y):
    
    '''
    only for fit.
    '''
    #DROP COLUMN PRODUCTO AND THE INDEX
    x.reset_index(drop = True,inplace = True)
    x.drop(columns='producto',inplace = True)
    
    # verify x and y need to have same size
    y = y.loc[y.index.isin(x.index)]
    

    mutual_selector = mutual_info_regression(x,y)
    position_variable_limit = 70
    ranking_mi = pd.DataFrame(mutual_selector, index = x.columns).reset_index()
    ranking_mi.columns = ['variable','importance_mi']
    ranking_mi = ranking_mi.sort_values(by = 'importance_mi', ascending = False)
    ranking_mi['ranking_mi'] = np.arange(0,ranking_mi.shape[0])
    in_mi = ranking_mi.iloc[0:position_variable_limit].variable
    x_mi = x[in_mi].copy()

    return(x_mi)

### Modelling

#### FIT

In [47]:
def modelling(x_producto, y):
    
    '''
    This function is what does the individual modeling.

    Receives the x and y data of a product.

    Find the optimal parameters for that product.

    Returns the best model.
    '''
      
    
    #Exclude product as a modeling variable
    var_model = x_producto.columns.to_list()[2:]
    
    #Define cross validation
    time_cv = TimeSeriesSplit(5, test_size = 8)
    
    #Define algorithm grid
    pipe = Pipeline([('algorithm',HistGradientBoostingRegressor())])
    
    grid = [ 
         {'algorithm': [HistGradientBoostingRegressor()]
#          'algoritmo__learning_rate': [0.01,0.025,0.05,0.1],
#          'algoritmo__max_iter': [50,100,200],
#          'algoritmo__max_depth': [5,10,20,50],
#          'algoritmo__scoring': ['neg_mean_absolute_error'],
#          'algoritmo__l2_regularization': [0,0.25,0.5,0.75,1]
         }
                       
    ]
           
    #make models
    random_search = RandomizedSearchCV(estimator = pipe,
                                   param_distributions = grid, 
                                   n_iter = 1, 
                                   cv = time_cv, 
                                   scoring = 'neg_mean_absolute_error', 
                                   verbose = 0,
                                   n_jobs = -1)
    
    model = random_search.fit(x_producto[var_model],y)
    
    #Retrain the best over all datas
    final_model = model.best_estimator_.fit(x_producto[var_model],y)

    return(final_model)

In [48]:
def run_training(df):
    
    '''
    This function goes through all the products and calls modelling() to create a total list with all the models of all the products.

    It receives the dataframe of the x's already cleaned and segmented by product, and also the target.

    It does not return anything, but saves the object already trained with all the models on disk.
    '''
    
    list_products = list(df.producto.unique())
    
    list_models =[] 
    
    for cada in list_products:
        
        #Rename
        producto = cada
        target = 'sales'

        x = df.loc[df.producto == producto].copy().drop(columns=target).copy()
        y = df.loc[df.producto == producto,'sales'].copy()

        x = transform_variables(x,y)
        x = variables_selection(x,y)
        
        #call a la function de modelling
        model = modelling(x,y)
        
        #Add final model to the list
        list_models.append((producto,model))
        
    #save list of fit models
    name_models = 'list_models_retail.pickle'
    path_models = path + '/04_Models/' + name_models
    with open(path_models, mode='wb') as file:
       pickle.dump(list_models, file)

#### EXECUTION

In [49]:
#Run prediction
def run_execution(df):
    
    '''
    This function makes the forecast for each product, but only for one day.

    Receives the new dataset to predict.
    
    It must have the structure of the file DatosParaProduccion.csv in the Validation folder.

    It goes through each product, loading its corresponding model, selecting its data, and making predictions.

    Returns the prediction for all the products but ONLY FOR THE DAY THAT IS TOUCHED.
    '''
    
    #upload models
    name_models = 'list_models_retail.pickle'
    path_models = path + '/04_Models/' + name_models
    with open(path_models, mode='rb') as file:
       list_models = pickle.load(file)
    
    predictions_df = pd.DataFrame(columns=['date','producto','sales','predictions'])
    
    for cada in range(0,len(list_models)):

        producto = list_models[cada][0]
        model = list_models[cada][1]
        variables = model[0].feature_names_in_
        target = 'sales'
        
        x = df.loc[df.producto == producto].copy().drop(columns=target).copy()
        y = df.loc[df.producto == producto,'sales'].copy()
        
        date = df.reset_index().copy()
        date = date.loc[date.producto == producto,'date'].values

        #Transformation of variables
        x = transform_variables(x, mode = 'execution')
        
        #Variables Selection
        x = x[variables]
        
        #Predictions
        predictions = pd.DataFrame(data={'date': date,
                                          'producto': producto,
                                          'sales': y,
                                          'prediction': model.predict(x)})

        predictions['prediction'] = predictions.prediction.astype('int')

        predictions_df = pd.concat([predictions_df,predictions])
    
    predictions_df = predictions_df.loc[predictions_df.index == predictions_df.index.min()]
    return(predictions_df)

In [50]:
def forecast_recursive(x):
    '''
    This function is the one that applies the recursive forecast to predict 8 days.
    
    Receives the new dataset to predict.

    It must have the structure of the file DatosParaProduccion.csv in the Validation folder.
    
    Since to apply recursion:

    * It will predict the first day for which it has all the information (ie 15 days from the oldest day)
    * When finished, it saves the sales prediction in the record to be predicted and eliminates the records of the oldest day in the data frame.
    * Therefore in the next iteration it will predict the next day.

    For example:

    If the oldest day in the dataset is 12/09/2015 then the first day you can predict
    
    (and of which we no longer have data) is 12/24/2015.

    When you predict the data of 24 for each product you overwrite it as your sales
    
    and removes all records from day 09.

    Then the oldest day becomes the 10th and therefore the day to predict is the 25th.

    And so until the end of 8 cycles to predict the week we want.
    '''

    
    for cada in range(0,8):
        step1_df = data_quality(x)
        step2_df = make_variables(step1_df)
        
        #predict
        f = run_execution(step2_df)
        f['store_id'] = f.producto.str[:4]
        f['item_id'] = f.producto.str[5:]

        #Update sales data with prediction
        x.loc[(x.index.isin(f.date)) & (x.store_id.isin(f.store_id)) & (x.item_id.isin(f.item_id)),'sales'] = f.prediction
                                                              
        #Drop last day of x
        x = x.loc[x.index != x.index.min()]
        
    return(x)

## PROCESS

### RETRAINING

In [51]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

from sklearn.feature_selection import mutual_info_regression

from sklearn.model_selection import TimeSeriesSplit

from sklearn.pipeline import Pipeline

from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

#upload data
path = '../../'
name_data_file = 'work.csv'
full_path = path + '/02_Data/03_Work/'+ name_data_file
df = pd.read_csv(full_path,sep=',',parse_dates=['date'],index_col='date')

#Select variables used
final_variables = ['store_id',
                     'item_id',
                     'event_name_1',                     
                     'month',
                     'sell_price',                      
                     'wday',
                     'weekday',
                     'sales']

df = df[final_variables]

step1_df = data_quality(df)
step2_df = make_variables(step1_df)

run_training(step2_df)

### EVALUATION

In [52]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

from sklearn.feature_selection import mutual_info_regression

from sklearn.model_selection import TimeSeriesSplit

from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

#Upload data
path = '../../'
name_data_file = 'validation.csv'
full_path = path + '/02_Data/02_Validation/' + name_data_file
df = pd.read_csv(full_path,sep=',',parse_dates=['date'],index_col='date')

#Select only used
final_variables = ['store_id',
                     'item_id',
                     'event_name_1',                     
                     'month',
                     'sell_price',                      
                     'wday',
                     'weekday',
                     'sales']


df = df[final_variables]

step1_df = data_quality(df)
step2_df = make_variables(step1_df)

forecast_1day = run_execution(step2_df)

print('MAE = ', mean_absolute_error(forecast_1day.sales,forecast_1day.prediction))

forecast_1day

MAE =  5.2


Unnamed: 0,date,producto,sales,predictions,prediction
2015-12-16,2015-12-16,CA_3_FOODS_3_090,0,,-10.0
2015-12-16,2015-12-16,CA_3_FOODS_3_120,52,,51.0
2015-12-16,2015-12-16,CA_3_FOODS_3_202,20,,17.0
2015-12-16,2015-12-16,CA_3_FOODS_3_252,36,,35.0
2015-12-16,2015-12-16,CA_3_FOODS_3_288,35,,23.0
2015-12-16,2015-12-16,CA_3_FOODS_3_329,64,,43.0
2015-12-16,2015-12-16,CA_3_FOODS_3_555,30,,26.0
2015-12-16,2015-12-16,CA_3_FOODS_3_586,76,,63.0
2015-12-16,2015-12-16,CA_3_FOODS_3_587,29,,33.0
2015-12-16,2015-12-16,CA_3_FOODS_3_714,19,,17.0


### EXECUTION

In [56]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

from sklearn.feature_selection import mutual_info_regression

from sklearn.model_selection import TimeSeriesSplit

from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

#Upload data
path = '../../'
name_data_file = 'DataForProduction.csv'
full_path = path + '/02_Data/02_Validation/' + name_data_file
df = pd.read_csv(full_path,sep=';',parse_dates=['date'],index_col='date')

#Seleccionar solo las que se han usado
final_variables = ['store_id',
                     'item_id',
                     'event_name_1',                     
                     'month',
                     'sell_price',                      
                     'wday',
                     'weekday',
                     'sales']

df = df[final_variables]

#run predictions
forecast = forecast_recursive(df)

forecast.sort_values(by = ['store_id','item_id'])

Unnamed: 0_level_0,store_id,item_id,event_name_1,month,sell_price,wday,weekday,sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-12-17,CA_3,FOODS_3_090,,12,1.00,6,Thursday,0
2015-12-18,CA_3,FOODS_3_090,,12,1.00,7,Friday,1
2015-12-19,CA_3,FOODS_3_090,,12,1.00,1,Saturday,0
2015-12-20,CA_3,FOODS_3_090,,12,1.00,2,Sunday,6
2015-12-21,CA_3,FOODS_3_090,,12,1.00,3,Monday,4
...,...,...,...,...,...,...,...,...
2015-12-27,CA_4,FOODS_3_714,,12,1.58,2,Sunday,7
2015-12-28,CA_4,FOODS_3_714,,12,1.58,3,Monday,9
2015-12-29,CA_4,FOODS_3_714,,12,1.58,4,Tuesday,7
2015-12-30,CA_4,FOODS_3_714,,12,1.58,5,Wednesday,8
