In [48]:
#######
# cmd #
#######
# ! python -m pip install --upgrade pip
# ! pip install --pre pandas==2.0.0rc0

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import warnings

# pandas 2.0.0rc0 selected the Arrow backend through a global option combined
# with `use_nullable_dtypes=True`. Stable pandas 2.x removed that option and
# replaced both with the readers' `dtype_backend` keyword, so use whichever
# spelling the installed pandas understands.
try:
    pd.options.mode.dtype_backend = 'pyarrow'
except (AttributeError, KeyError):  # pandas' OptionError derives from both
    _READ_CSV_KWARGS = {'engine': 'pyarrow', 'dtype_backend': 'pyarrow'}
else:
    _READ_CSV_KWARGS = {'engine': 'pyarrow', 'use_nullable_dtypes': True}

############################################
############### DATA IMPORT ################
############################################
# 1m 15.9s w/ old pandas
# 5.9s w/ new pandas

# NOTE: machine-specific absolute path; point it at the local copy of the Kaggle data.
#INPUT_DIR = 'C:/Users/u00378/Desktop/PIBIC_2021-2022/data_from_kaggle'
INPUT_DIR = 'C:/Users/Igor/Desktop/PIBIC/PIBIC_2021-2022/data_from_kaggle'

calendar = pd.read_csv(f'{INPUT_DIR}/calendar.csv', **_READ_CSV_KWARGS)
selling_prices = pd.read_csv(f'{INPUT_DIR}/sell_prices.csv', **_READ_CSV_KWARGS)
sample_submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv', **_READ_CSV_KWARGS)
sales_train_val = pd.read_csv(f'{INPUT_DIR}/sales_train_validation.csv', **_READ_CSV_KWARGS)  # sales_train d_1 through d_1913
sales_train_eva = pd.read_csv(f'{INPUT_DIR}/sales_train_evaluation.csv', **_READ_CSV_KWARGS)  # sales_train d_1 through d_1941

############
### Note ###
############
# sales_train_eva contains everything in sales_train_val and additionally the sales observations for days d_1914 - d_1941
# id = ..._validation => up to d_1913
# id = ..._evaluation => up to d_1941

############
### Goal ###
############
# validation part of submission sample => cross validation w/ d_1 to d_1913 => calculate sMAPE e MASE w/ d_1914 to d_1941
# evaluation part of submission sample => cross validation w/ d_1 to d_1941 => calculate M5 final score in kaggle by concatenating these parts

In [42]:
# Labels of the daily sales columns: d_1 ... d_1941.
cols = [f"d_{day}" for day in range(1, 1942)]

# Validation frame plus the last 28 day columns (d_1914 ... d_1941) taken from
# the evaluation frame. Concatenating along axis=1 aligns rows on the index,
# so both frames are expected to list the same series in the same row order.
sales_train = pd.concat([sales_train_val, sales_train_eva[cols[-28:]]], axis=1)

# Strip the '_validation' suffix from the ids with the vectorized string
# accessor; regex=False makes the pattern a literal substring.
sales_train['id'] = sales_train['id'].str.replace('_validation', '', regex=False)

In [90]:
def filter_item_store(item_id, store_id):
    '''
    Daily sales at the most disaggregated level: one item_id in one store_id.

    Selects the rows of the global `sales_train` whose 'item_id' and
    'store_id' equal the arguments, then sums every day column listed in the
    global `cols` over those rows.

    Returns a pandas Series of dtype 'int64[pyarrow]' indexed by
    `calendar['date']` without its last 28 entries.
    '''
    is_item = sales_train['item_id'] == item_id
    is_store = sales_train['store_id'] == store_id
    matching_rows = sales_train[is_item & is_store]
    daily_totals = matching_rows[cols].sum().values

    dates = calendar['date'][:-28]
    return pd.Series(daily_totals, index=dates, dtype='int64[pyarrow]')


def filter_dept_store(dept_id, store_id):
    '''
    Daily sales of every item of a dept_id sold in a store_id.

    Selects the rows of the global `sales_train` whose 'dept_id' and
    'store_id' equal the arguments, then sums every day column listed in the
    global `cols` over those rows.

    Returns a pandas Series of dtype 'int64[pyarrow]' indexed by
    `calendar['date']` without its last 28 entries.
    '''
    is_dept = sales_train['dept_id'] == dept_id
    is_store = sales_train['store_id'] == store_id
    matching_rows = sales_train[is_dept & is_store]
    daily_totals = matching_rows[cols].sum().values

    dates = calendar['date'][:-28]
    return pd.Series(daily_totals, index=dates, dtype='int64[pyarrow]')

def filter_store(store_id):
    '''
    Daily sales of every item sold in a store_id.

    Selects the rows of the global `sales_train` whose 'store_id' equals the
    argument, then sums every day column listed in the global `cols` over
    those rows.

    Returns a pandas Series of dtype 'int64[pyarrow]' indexed by
    `calendar['date']` without its last 28 entries.
    '''
    is_store = sales_train['store_id'] == store_id
    store_rows = sales_train[is_store]
    daily_totals = store_rows[cols].sum().values

    dates = calendar['date'][:-28]
    return pd.Series(daily_totals, index=dates, dtype='int64[pyarrow]')


def no_filter():
    '''
    Daily sales at the most aggregated level: Walmart as a whole.

    Sums every day column listed in the global `cols` over all rows of the
    global `sales_train`.

    Returns a pandas Series of dtype 'int64[pyarrow]' indexed by
    `calendar['date']` without its last 28 entries.
    '''
    all_days = sales_train[cols]
    daily_totals = all_days.sum().values

    dates = calendar['date'][:-28]
    return pd.Series(daily_totals, index=dates, dtype='int64[pyarrow]')