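# Calculate Training Metrics for the Naive Method

The naive method predicts each week's `UNITS` as the previous week's observed value; this notebook computes the training MAE and RMSE of that forecast for every store/UPC pair and writes them to CSV.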

In [1]:
from datetime import timedelta
import itertools
import json
from math import sqrt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import yaml
import mlflow
from datetime import datetime
from time import time

# Get the current working directory (where the notebook was opened)
# and go up two levels to reach the project root.
current_dir = Path.cwd()
proj_path = current_dir.parent.parent

# Make the code in src/ importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
import sys
src_dir = os.path.join(proj_path, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Custom functions and classes
# from sarima import *
from utils import *

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Dataset catalog: only the 'breakfast' section of conf/catalog.yml is kept.
# It holds the dataset-related paths used throughout the notebook.
catalog_path = os.path.join(proj_path, 'conf/catalog.yml')
with open(catalog_path, "r") as f:
    catalog = yaml.safe_load(f)['breakfast']

# Parameters: the full conf/params.yml mapping (dataset creation and model
# parameters); later cells index into it by dataset name.
params_path = os.path.join(proj_path, 'conf/params.yml')
with open(params_path, "r") as f:
    params = yaml.safe_load(f)

In [2]:
# Step 1: read the merged dataset from the location named in the catalog,
# parse WEEK_END_DATE as datetimes and shift every date forward by 3 days
# (the "correction"; NOTE(review): the rationale for the 3-day offset is not
# shown in this notebook -- confirm against the dataset documentation).
merged_csv = os.path.join(proj_path,
                          catalog['output_dir']['dir'],
                          catalog['output_dir']['merged'])
merged_data = pd.read_csv(merged_csv)
week_end = pd.to_datetime(merged_data['WEEK_END_DATE'])
merged_data['WEEK_END_DATE'] = week_end + timedelta(days=3)

# Step 2: build the train/valid/test date folds from the experiment dates
# in params, then display them as the cell output.
date_ranges = make_dates(params['breakfast']['experiment_dates'])
date_ranges

  and should_run_async(code)


Unnamed: 0,train_start,train_end,valid_start,valid_end,test_start,test_end
0,2009-01-17,2010-12-04,2010-12-11,2011-01-01,2011-01-08,2011-01-29
1,2009-02-14,2011-01-01,2011-01-08,2011-01-29,2011-02-05,2011-02-26
2,2009-03-14,2011-01-29,2011-02-05,2011-02-26,2011-03-05,2011-03-26
3,2009-04-11,2011-02-26,2011-03-05,2011-03-26,2011-04-02,2011-04-23
4,2009-05-09,2011-03-26,2011-04-02,2011-04-23,2011-04-30,2011-05-21
5,2009-06-06,2011-04-23,2011-04-30,2011-05-21,2011-05-28,2011-06-18
6,2009-07-04,2011-05-21,2011-05-28,2011-06-18,2011-06-25,2011-07-16
7,2009-08-01,2011-06-18,2011-06-25,2011-07-16,2011-07-23,2011-08-13
8,2009-08-29,2011-07-16,2011-07-23,2011-08-13,2011-08-20,2011-09-10
9,2009-09-26,2011-08-13,2011-08-20,2011-09-10,2011-09-17,2011-10-08


In [3]:
# Compute the training error of the naive (last-value) forecast for every
# store/UPC combination listed in params, print it, and save it to one CSV
# per combination under <results dir>/breakfast.
stores = list(params['breakfast']['dataset']['store_ids'].keys())
upcs = list(params['breakfast']['dataset']['upc_ids'].keys())
store_upc_pairs = list(itertools.product(stores, upcs))

# Loop-invariant work, hoisted out of the loop:
# the training window spans from the first fold's train_start to the last
# fold's train_end, and is the same for every store/UPC pair.
train_window_start = date_ranges['train_start'].iloc[0]
train_window_end = date_ranges['train_end'].iloc[-1]
in_train_window = ((merged_data['WEEK_END_DATE'] >= train_window_start) &
                   (merged_data['WEEK_END_DATE'] <= train_window_end))

# Output directory is shared by all pairs; create_folder comes from utils
# (presumably it creates the directory if missing -- verify in utils).
fdir = os.path.join(proj_path, catalog['results']['dir'], 'breakfast')
create_folder(fdir)

for store_id, upc_id in store_upc_pairs:

    # Keep only this store/UPC's rows inside the training window.
    df_filtered = merged_data[in_train_window &
                              (merged_data['STORE_NUM'] == store_id) &
                              (merged_data['UPC'] == upc_id)]

    # The naive forecast uses the previous week's value, so the rows must be
    # in chronological order; do not rely on the CSV's row order. A stable
    # sort keeps the original relative order of rows sharing the same date.
    df_filtered = df_filtered.sort_values('WEEK_END_DATE', kind='mergesort')

    units = df_filtered['UNITS'].values
    if len(units) < 2:
        # With fewer than two observations there is no (previous, current)
        # pair to score, and the sklearn metrics raise on empty input.
        print(f'Skipping {store_id}_{upc_id}: fewer than 2 training observations')
        continue

    # Prediction for week t is the observed value of week t-1.
    y_pred = units[:-1]
    y_true = units[1:]

    # Compute each metric once and reuse it for both printing and saving.
    train_mae = mean_absolute_error(y_true, y_pred)
    train_rmse = sqrt(mean_squared_error(y_true, y_pred))

    print(f'Training MAE product category {store_id}_{upc_id}: {train_mae}')
    print(f'rmse: {train_rmse}')

    fname = os.path.join(fdir, f'naive_training_{store_id}_{upc_id}.csv')
    pd.DataFrame({'train_mae': [train_mae],
                  'train_rmse': [train_rmse]}).to_csv(fname, index=False)
    

Training MAE product category 2277_1600027527: 58.21917808219178
rmse: 120.5924871010026
Training MAE product category 2277_3800031838: 38.486301369863014
rmse: 74.97839871571414
Training MAE product category 2277_1111009477: 28.958904109589042
rmse: 39.876865267491176
Training MAE product category 2277_7192100339: 26.575342465753426
rmse: 41.46974470656164
Training MAE product category 389_1600027527: 51.69178082191781
rmse: 119.19452846686528
Training MAE product category 389_3800031838: 37.0
rmse: 72.0281527912562
Training MAE product category 389_1111009477: 17.541095890410958
rmse: 23.34391475662091
Training MAE product category 389_7192100339: 19.431506849315067
rmse: 31.186030416689537
Training MAE product category 25229_1600027527: 37.1986301369863
rmse: 86.2657966475293
Training MAE product category 25229_3800031838: 29.958904109589042
rmse: 61.6463622008853
Training MAE product category 25229_1111009477: 14.013698630136986
rmse: 18.68484111977495
Training MAE product category

  and should_run_async(code)
