# Global Setting

In [147]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.stattools import acovf
from sklearn.model_selection import TimeSeriesSplit
from prophet import Prophet
import itertools
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_plotly
import xgboost as xgb
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

##### Load data

In [140]:
# Read the two CSV inputs used by the rest of the notebook.
# Paths are relative to the notebook's working directory.
sales = pd.read_csv(filepath_or_buffer='../data/sales_products_ts.csv')
products_info = pd.read_csv(filepath_or_buffer='../data/products_statistics.csv')

Define error functions

In [5]:
def rmse(y_true, y_pred):
    """Root mean squared error between observed and predicted values.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    float
        Square root of scikit-learn's ``mean_squared_error(y_true, y_pred)``.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def mape(y_true, y_pred):
    """Mean absolute percentage error between observed and predicted values.

    Thin wrapper around scikit-learn's ``mean_absolute_percentage_error``;
    the value is returned exactly as scikit-learn computes it.

    NOTE(review): scikit-learn documents this metric as a fraction (1.0 means
    100 %), not a percentage -- confirm downstream code expects that scale.
    """
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score

def wape(y_true, y_pred):
    """Weighted absolute percentage error, expressed in percent.

    WAPE = 100 * sum(|y_true - y_pred|) / sum(|y_true|)

    The previous implementation divided each absolute error by the
    autocovariance of ``y_true`` at the matching lag, which is not WAPE.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must have the same shape as ``y_true``.

    Returns
    -------
    float
        WAPE in percent, or ``nan`` when ``sum(|y_true|)`` is zero (the
        metric is undefined in that case).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Convert to plain arrays so pandas inputs are compared positionally
    # rather than being re-aligned on their index.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if actual.shape != forecast.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {forecast.shape}"
        )

    total_actual = np.abs(actual).sum()
    if total_actual == 0:
        return np.nan
    return np.abs(actual - forecast).sum() / total_actual * 100


Define metrics dataframe

In [6]:
# Empty results table; one row per (id, model) evaluation is expected to be
# added later.
# NOTE(review): the last column is named 'mase', while the metric helper
# defined in this notebook is `wape` -- confirm which metric is intended.
metrics = pd.DataFrame(
    data=None,
    columns=['id', 'model', 'rmse', 'mape', 'mase'],
)

Replace outliers function

In [41]:
def replace_outliers(df):
    """Cap outliers in the ``quantity`` column at the 1.5 * IQR fences.

    Values above ``Q3 + 1.5 * IQR`` are set to that upper fence and values
    below ``Q1 - 1.5 * IQR`` are set to the lower fence. The quartiles are
    computed over every row of ``df`` (missing values are skipped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``quantity`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``quantity`` capped. The input frame is left
        untouched, so callers must use the returned frame.
    """
    # Series.quantile uses linear interpolation by default, matching the
    # original np.quantile / np.percentile(interpolation='linear') calls
    # without relying on NumPy's deprecated ``interpolation`` keyword.
    q1, q3 = df['quantity'].quantile([0.25, 0.75])
    iqr = q3 - q1
    min_limit = q1 - 1.5 * iqr
    max_limit = q3 + 1.5 * iqr

    capped = df.copy()
    # clip() returns a new Series, so an integer column can be upcast when a
    # fence is fractional instead of forcing a float into an int column.
    capped['quantity'] = capped['quantity'].clip(lower=min_limit, upper=max_limit)
    return capped


Preprocessing data

In [90]:
def preprocessing(df):
    """Prepare the raw sales frame for modelling.

    Steps: parse ``date`` to datetime, sort by date, add a ``log`` column
    with the natural log of ``quantity``, keep only the modelling columns,
    and cap ``quantity`` outliers via ``replace_outliers``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``id``, ``date`` and ``quantity`` columns.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The prepared frame with columns ``id``, ``date``, ``quantity``,
        ``log``, and the unique ids in their order of first appearance in
        the input.
    """
    # Read the ids from the argument, not from the notebook-level `sales`
    # variable, so the function works on any frame passed in.
    id_list = df['id'].unique()

    # Work on a copy so the caller's frame is not modified.
    prepared = df.copy()
    prepared['date'] = pd.to_datetime(prepared['date'])
    prepared = prepared.sort_values('date')

    # NOTE(review): `log` is computed before outliers are capped, so it
    # reflects the raw quantities, not the capped ones -- confirm intended.
    prepared['log'] = np.log(prepared['quantity'])

    # Explicit copy so the outlier step never writes into a view.
    prepared = prepared[['id', 'date', 'quantity', 'log']].copy()

    # NOTE(review): outlier limits are computed over all ids together, not
    # per product -- confirm intended.
    prepared = replace_outliers(prepared)
    return prepared, id_list

Plot the time series

In [151]:
def initial_plots(df, item):
    """Plot a product's raw and log-transformed series side by side.

    The figure is saved to ``../scale_outputs/<item>/images/ts_plot.png``
    (the directory is created if missing), shown, and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``date``, ``quantity`` and ``log`` columns.
    item : hashable
        Identifier used in the panel titles and in the output path.

    Returns
    -------
    matplotlib.figure.Figure
        The two-panel figure.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    ax_raw, ax_log = axes
    dates = df.date

    # Left panel: series as stored in `quantity`.
    ax_raw.plot(dates, df['quantity'])
    ax_raw.set_title(f'Original Serie - {item}')

    # Right panel: log-transformed series, drawn in red.
    ax_log.plot(dates, df['log'], color='r')
    ax_log.set_title(f'Log Transformation - {item}')

    fig.tight_layout()

    # Make sure the per-item output folder exists before saving.
    os.makedirs(f'../scale_outputs/{item}/images/', exist_ok=True)
    fig.savefig(f'../scale_outputs/{item}/images/ts_plot.png')
    plt.show()
    return fig

Train-test split

In [150]:
def split(df, ndays=None):
    """Chronological train/test split on the ``date`` column.

    The first ~70 % of the covered time span (rounded to whole weeks) goes
    to the training set and the remainder to the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``date`` column.
    ndays : optional
        Currently unused, as in the original implementation. Kept, now with
        a default, so both ``split(df)`` and ``split(df, ndays)`` work.

    Returns
    -------
    (split_date, train, test)
        ``split_date`` is the first date of the test set; ``train`` holds
        rows with ``date < split_date`` and ``test`` the rest.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if df.empty:
        raise ValueError("cannot split an empty DataFrame")

    first_date = df['date'].min()
    last_date = df['date'].max()

    weeks_train = round((last_date - first_date).days / 7 * 0.7)
    # Count the training weeks forward from the start of the series.
    # Subtracting them from the last date would leave only ~30 % for training.
    split_date = first_date + timedelta(weeks=weeks_train)

    # Filter on 'date', the column used everywhere else in this function.
    # The previous 'fecha' lookups raised KeyError on a frame without it.
    train = df[df['date'] < split_date]
    test = df[df['date'] >= split_date]
    return split_date, train, test

In [143]:
# Re-binds `sales` to the frame returned by preprocessing(); `id_list` receives
# the second return value (the unique ids).
# NOTE(review): the name `sales` is overwritten, so re-running this cell feeds
# the already-processed frame back through preprocessing().
sales, id_list = preprocessing(sales)

Multiple time series loop

In [None]:
# For each product: isolate its rows, plot the series, and split it
# chronologically into train and test sets.
for item in id_list:
    one_product = sales[sales['id'] == item]
    one_product = one_product.drop_duplicates()
    ts_plot = initial_plots(one_product, item)
    # Split the current product's frame; `df` is not defined at notebook
    # scope. `ndays` is passed explicitly because split() declares it, even
    # though split() does not use it.
    split_date, train, test = split(one_product, ndays=None)
    

In [None]:
# Select the rows of a single example product.
# NOTE(review): `id_ex` is a string; if the `id` column was parsed as integers
# this comparison matches nothing -- confirm the column dtype.
id_ex = '17410306'
m = sales.loc[sales['id'] == id_ex]

