# Baseline I: SARIMA


In [6]:
from datetime import timedelta
import itertools
import json
import numpy as np
import os
import pandas as pd
from pathlib import Path
import yaml
import mlflow
from datetime import datetime
from time import time

# Resolve the project root: the notebook is expected to be opened from a
# directory two levels below it, so go up two levels from the working directory.
current_dir = Path.cwd()
proj_path = current_dir.parent.parent

# Make the code in src importable from this notebook.
# The membership check keeps the cell idempotent: re-running it must not
# append the same entry to sys.path again.
import sys
src_path = os.path.join(proj_path, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Custom functions and classes
from sarima import *
from utils import *
from metrics import *

# Catalog: all the paths related to datasets (only the 'olist' section is kept)
catalog_path = os.path.join(proj_path, 'conf/catalog.yml')
with open(catalog_path, "r") as catalog_file:
    catalog = yaml.safe_load(catalog_file)['olist']

# Params: all of the dataset creation parameters and model parameters
params_path = os.path.join(proj_path, 'conf/params.yml')
with open(params_path, "r") as params_file:
    params = yaml.safe_load(params_file)

In [3]:
# Step 1: Load the transactions table from the catalog's output directory.
# Only the raw CSV is read here; the date column is parsed in a later cell.
transactions_csv = os.path.join(
    proj_path,
    catalog['output_dir']['dir'],
    catalog['output_dir']['transactions'],
)
merged_data = pd.read_csv(transactions_csv)

In [4]:
# Preview the first three rows to sanity-check the load.
# NOTE(review): the `Unnamed: 0` / `Unnamed: 0.1` columns shown in the output look
# like row indexes written out by earlier `to_csv` calls — confirm, and consider
# dropping them (or writing the upstream CSV with `index=False`).
merged_data.head(3)

Unnamed: 0.1,Unnamed: 0,product_category_name,order_approved_at,payment_value
0,0,agro_industry_and_commerce,2017-01-29,2
1,1,agro_industry_and_commerce,2017-02-05,2
2,2,agro_industry_and_commerce,2017-02-12,2


In [5]:
# Parse the order-approval dates (loaded from the CSV as plain strings) into
# datetimes so they can be compared against the fold boundaries below.
approved_at = pd.to_datetime(merged_data['order_approved_at'])
merged_data['order_approved_at'] = approved_at

# Step 2: Create date folds (one row per fold: train / valid / test boundaries)
experiment_dates = params['olist']['experiment_dates']
date_ranges = make_dates(experiment_dates)
date_ranges

Unnamed: 0,train_start,train_end,valid_start,valid_end,test_start,test_end
0,2017-01-01,2017-12-03,2017-12-10,2017-12-31,2018-01-07,2018-01-28
1,2017-01-29,2017-12-31,2018-01-07,2018-01-28,2018-02-04,2018-02-25
2,2017-02-26,2018-01-28,2018-02-04,2018-02-25,2018-03-04,2018-03-25
3,2017-03-26,2018-02-25,2018-03-04,2018-03-25,2018-04-01,2018-04-22
4,2017-04-23,2018-03-25,2018-04-01,2018-04-22,2018-04-29,2018-05-20
5,2017-05-21,2018-04-22,2018-04-29,2018-05-20,2018-05-27,2018-06-17
6,2017-06-18,2018-05-20,2018-05-27,2018-06-17,2018-06-24,2018-07-15
7,2017-07-16,2018-06-17,2018-06-24,2018-07-15,2018-07-22,2018-08-12


In [46]:
def _rows_between(frame, start, end, date_col='order_approved_at'):
    """Return the rows of ``frame`` whose ``date_col`` lies in [start, end], bounds inclusive."""
    in_range = (frame[date_col] >= start) & (frame[date_col] <= end)
    return frame[in_range]


# Candidate SARIMA orders. The grid is the same for every fold and every
# category, so it is built once up front: every (p, d, q) with each order in
# {0, 1}, crossed with every seasonal (P, D, Q, s) with s in {2, 3, 4}
# -> 8 * 24 = 192 combinations.
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
spdq = list(itertools.product(p, d, q, [2,3,4]))
all_params = list(itertools.product(pdq, spdq))

# Initialize mlflow tracking (the store location does not depend on the category)
create_folder(os.path.join(proj_path, 'mlruns'))
mlflow.set_tracking_uri(os.path.join(proj_path, 'mlruns'))

for prod_cat in params['olist']['product_categories']:
    print(f'Processing product category: {prod_cat}')

    # One mlflow experiment per product category
    mlflow.set_experiment(prod_cat)

    start_timer = time()
    lt_preds = []
    nd_preds = []
    used_params_folds = []

    # Rows of this category only; filtered once and reused by every fold
    cat_data = merged_data[merged_data['product_category_name']==prod_cat].copy()

    for _, train_start, train_end, valid_start, valid_end, test_start, test_end in date_ranges.itertuples():

        # Slice the category data into the three windows of this fold
        df_train = _rows_between(cat_data, train_start, train_end)
        df_valid = _rows_between(cat_data, valid_start, valid_end)
        df_test  = _rows_between(cat_data, test_start, test_end)

        # Fit on the training window and pick the best candidate from
        # `all_params` using the validation window
        model = SklearnSarima(df_train['payment_value'].values)
        model.fit_best_params(df_valid['payment_value'].values, all_params)

        # "lt": forecast the whole test window in one call, without test values.
        # "nd": the model is handed the actual test values
        # NOTE(review): presumably a rolling one-step-ahead forecast — confirm in src/sarima.py
        lt_predictions = model.predict(df_test.shape[0])
        nd_predictions = model.fit_predict(df_test['payment_value'].values)

        lt_preds.extend(lt_predictions)
        nd_preds.extend(nd_predictions)

        used_params = model.get_params()
        used_params_folds.append(used_params)

    # Ground truth over the full experiment test range (all folds together)
    df_filtered = _rows_between(cat_data,
                                params['olist']['experiment_dates']['test_start'],
                                params['olist']['experiment_dates']['test_end']).copy()

    y_true = df_filtered['payment_value'].values
    nd_preds_flat = np.array(nd_preds).flatten()

    # Fail early with a readable message if the concatenated fold predictions do
    # not line up with the ground truth (e.g. a week is missing for this category).
    # Without this the mismatch only surfaces later, when the results frame is built.
    if not (len(lt_preds) == len(nd_preds_flat) == len(y_true)):
        raise ValueError(
            f'Prediction/target length mismatch for {prod_cat}: '
            f'{len(lt_preds)} lt preds, {len(nd_preds_flat)} nd preds, '
            f'{len(y_true)} targets'
        )

    lt_metrics = get_metrics(y_true, lt_preds)
    nd_metrics = get_metrics(y_true, nd_preds)

    fdir = os.path.join(proj_path, catalog['results']['dir'])
    fname = os.path.join(fdir, f'exp1_sarima_{prod_cat}.csv')
    create_folder(fdir)

    save_data = pd.DataFrame({'y_true': y_true,
                              'nd_preds': nd_preds_flat,
                              'lt_preds': lt_preds,
                              'dates': df_filtered['order_approved_at'].values})

    # NOTE(review): the row index is written as an extra unnamed column; kept
    # as-is because downstream readers of this file may rely on that layout.
    save_data.to_csv(fname)
    duration_min = int((time() - start_timer) // 60)
    with mlflow.start_run():
        mlflow.log_param('Product Category',prod_cat)
        mlflow.log_param('SARIMA_Params_Criterion', used_params_folds)
        # Original, unprefixed keys are kept so existing run comparisons keep working.
        # NOTE(review): both dicts come from the same get_metrics, so if their key
        # names coincide the nd values are recorded on top of the lt values here.
        mlflow.log_metrics(lt_metrics)
        mlflow.log_metrics(nd_metrics)
        # Prefixed copies make the two evaluation modes distinguishable in the run.
        mlflow.log_metrics({f'lt_{name}': value for name, value in lt_metrics.items()})
        mlflow.log_metrics({f'nd_{name}': value for name, value in nd_metrics.items()})
        mlflow.log_artifact(fname)
        mlflow.log_metric('time', duration_min)

Finding best parameters:   0%|          | 0/192 [00:00<?, ?it/s]

Processing product category: bed_bath_table


Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.59it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.25it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.12it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.87it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.76it/s]
Finding best parameters: 100%|██████████| 192/192 [00:18<00:00, 10.66it/s]
Finding best parameters: 100%|██████████| 192/192 [00:18<00:00, 10.46it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.81it/s]
Finding best parameters:   0%|          | 0/192 [00:00<?, ?it/s]

Processing product category: health_beauty


Finding best parameters: 100%|██████████| 192/192 [00:14<00:00, 13.02it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.80it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.93it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.77it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.69it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.29it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.07it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.79it/s]
Finding best parameters:   3%|▎         | 5/192 [00:00<00:05, 34.66it/s]

Processing product category: sports_leisure


Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.50it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.27it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.55it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.76it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.37it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.98it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 10.92it/s]
Finding best parameters: 100%|██████████| 192/192 [00:18<00:00, 10.65it/s]
Finding best parameters:   3%|▎         | 5/192 [00:00<00:05, 36.01it/s]

Processing product category: furniture_decor


Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.26it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.33it/s]
Finding best parameters: 100%|██████████| 192/192 [00:13<00:00, 14.11it/s]
Finding best parameters: 100%|██████████| 192/192 [00:13<00:00, 14.19it/s]
Finding best parameters: 100%|██████████| 192/192 [00:13<00:00, 14.41it/s]
Finding best parameters: 100%|██████████| 192/192 [00:13<00:00, 13.77it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.83it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.56it/s]
Finding best parameters:   3%|▎         | 5/192 [00:00<00:04, 43.18it/s]

Processing product category: housewares


Finding best parameters: 100%|██████████| 192/192 [00:14<00:00, 13.58it/s]
Finding best parameters: 100%|██████████| 192/192 [00:14<00:00, 13.06it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.36it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.20it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.75it/s]
Finding best parameters: 100%|██████████| 192/192 [00:14<00:00, 13.09it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.22it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.23it/s]
Finding best parameters:   0%|          | 0/192 [00:00<?, ?it/s]

Processing product category: watches_gifts


Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.18it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.59it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.49it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.19it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.05it/s]
Finding best parameters: 100%|██████████| 192/192 [00:14<00:00, 13.04it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.92it/s]
Finding best parameters: 100%|██████████| 192/192 [00:20<00:00,  9.43it/s]
Finding best parameters:   3%|▎         | 5/192 [00:00<00:04, 43.95it/s]

Processing product category: telephony


Finding best parameters: 100%|██████████| 192/192 [00:13<00:00, 13.89it/s]
Finding best parameters: 100%|██████████| 192/192 [00:14<00:00, 12.98it/s]
Finding best parameters: 100%|██████████| 192/192 [00:13<00:00, 14.10it/s]
Finding best parameters: 100%|██████████| 192/192 [00:13<00:00, 13.74it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.49it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.23it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.74it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.98it/s]
Finding best parameters:   3%|▎         | 5/192 [00:00<00:05, 33.63it/s]

Processing product category: garden_tools


Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.10it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.22it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.47it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.38it/s]
Finding best parameters: 100%|██████████| 192/192 [00:16<00:00, 11.47it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.00it/s]
Finding best parameters: 100%|██████████| 192/192 [00:15<00:00, 12.00it/s]
Finding best parameters: 100%|██████████| 192/192 [00:17<00:00, 11.04it/s]
