# Calculate Training metrics for Naive Method


In [1]:
from datetime import timedelta
import itertools
import json
from math import sqrt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import yaml
import mlflow
from datetime import datetime
from time import time

# The notebook is expected to be opened from a folder two levels below the
# project root, so the root is the grandparent of the working directory.
current_dir = Path.cwd()
proj_path = current_dir.parent.parent

# Put the project's `src` folder on the import path so its modules can be
# imported from this notebook
import sys
src_dir = os.path.join(proj_path, 'src')
sys.path.append(src_dir)

# Custom functions and classes
# from sarima import *
from utils import *

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Catalog contains all the paths related to datasets; only its 'olist'
# section is kept.
# The encoding is passed explicitly: without it open() falls back to the
# platform locale, which can mis-decode non-ASCII text in the YAML files.
with open(os.path.join(proj_path, 'conf/catalog.yml'), "r", encoding="utf-8") as f:
    catalog = yaml.safe_load(f)['olist']

# Params contains all of the dataset creation parameters and model parameters
with open(os.path.join(proj_path, 'conf/params.yml'), "r", encoding="utf-8") as f:
    params = yaml.safe_load(f)

In [2]:
# Step 1: Read the transactions file and parse the approval timestamps into
# a proper datetime column
transactions_path = os.path.join(proj_path,
                                 catalog['output_dir']['dir'],
                                 catalog['output_dir']['transactions'])
merged_data = pd.read_csv(transactions_path).assign(
    order_approved_at=lambda frame: pd.to_datetime(frame['order_approved_at'])
)

# Step 2: Build the date folds from the experiment settings
date_ranges = make_dates(params['olist']['experiment_dates'])
date_ranges

  and should_run_async(code)


Unnamed: 0,train_start,train_end,valid_start,valid_end,test_start,test_end
0,2017-01-01,2017-12-03,2017-12-10,2017-12-31,2018-01-07,2018-01-28
1,2017-01-29,2017-12-31,2018-01-07,2018-01-28,2018-02-04,2018-02-25
2,2017-02-26,2018-01-28,2018-02-04,2018-02-25,2018-03-04,2018-03-25
3,2017-03-26,2018-02-25,2018-03-04,2018-03-25,2018-04-01,2018-04-22
4,2017-04-23,2018-03-25,2018-04-01,2018-04-22,2018-04-29,2018-05-20
5,2017-05-21,2018-04-22,2018-04-29,2018-05-20,2018-05-27,2018-06-17
6,2017-06-18,2018-05-20,2018-05-27,2018-06-17,2018-06-24,2018-07-15
7,2017-07-16,2018-06-17,2018-06-24,2018-07-15,2018-07-22,2018-08-12


In [3]:
# Naive baseline: every observation is "predicted" by the observation right
# before it, so the training error is the error between the series and itself
# shifted by one step.
# NOTE(review): this relies on the rows of each category already being in
# chronological order in the transactions file -- confirm, or sort by
# 'order_approved_at' upstream.

# Loop-invariant values, resolved once instead of once per category:
# the training window runs from the first fold's train_start to the last
# fold's train_end, and every category writes into the same results folder.
train_window_start = date_ranges['train_start'].iloc[0]
train_window_end = date_ranges['train_end'].iloc[-1]
fdir = os.path.join(proj_path, catalog['results']['dir'])
create_folder(fdir)

for prod_cat in params['olist']['product_categories']:

    # Filter product category and dates
    df_filtered = merged_data[merged_data['product_category_name'] == prod_cat].copy()
    df_train = df_filtered[(df_filtered['order_approved_at'] >= train_window_start) &
                           (df_filtered['order_approved_at'] <= train_window_end)]

    # One-step shift: the value at position i-1 is the forecast for position i
    y_pred = df_train['payment_value'].values[:-1]
    y_true = df_train['payment_value'].values[1:]

    # Compute each metric once and reuse it for both the printout and the CSV
    train_mae = mean_absolute_error(y_true, y_pred)
    train_rmse = sqrt(mean_squared_error(y_true, y_pred))

    print(f'Training MAE product category {prod_cat}: {train_mae}')
    print(train_rmse)

    # One single-row CSV per category holding the two training metrics
    fname = os.path.join(fdir, f'naive_training_{prod_cat}.csv')
    pd.DataFrame({'train_mae': [train_mae],
                  'train_rmse': [train_rmse]}).to_csv(fname, index=False)


Training MAE product category bed_bath_table: 34.18421052631579
51.86318032751376
Training MAE product category health_beauty: 23.88157894736842
36.76078431097322
Training MAE product category sports_leisure: 21.276315789473685
29.36902234811725
Training MAE product category furniture_decor: 26.486842105263158
40.18264223320942
Training MAE product category housewares: 21.06578947368421
28.917532470985943
Training MAE product category watches_gifts: 17.105263157894736
28.77498913987632
Training MAE product category telephony: 12.81578947368421
18.55149305950446


  and should_run_async(code)
