In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [None]:
# Load the raw sales file and rename its columns to the (unique_id, ds, y)
# layout that the rest of the notebook works with; 'ds' becomes the index.
data_original = (
    pd.read_csv('../in/datos_originales_empresa.csv')
    .rename(columns={'material': 'unique_id', 'fecha_semana': 'ds', 'cantidad': 'y'})
    .set_index('ds')
)
# Parse the index as timestamps so the date arithmetic further down works.
data_original.index = pd.to_datetime(data_original.index)
# 'ds' here refers to the index level, which sort_values accepts next to columns.
data_original = data_original.sort_values(by=['unique_id', 'ds'])
data_original.head(5)

## Data Preprocessing

#### 1. Complete dates & Impute zeros

In [7]:
# For every product, build a gap-free 7-day calendar between its first and
# last observed date and fill the weeks that are missing from the raw data.
frames_complete = []

# groupby splits the frame in one pass; sort=False keeps the products in
# order of first appearance, i.e. the same order `.unique()` would give.
for product, df_one_id in data_original.groupby('unique_id', sort=False):
    new_dates = pd.date_range(df_one_id.index.min().date(), df_one_id.index.max().date(), freq='7D')
    # Dates absent from the source show up as all-NaN rows after reindexing.
    # The new index is unnamed, so the result no longer carries the 'ds' name.
    df_one_id = df_one_id.reindex(new_dates)
    df_one_id['unique_id'] = product
    # Every remaining NaN (in any column) is replaced by 0.
    df_one_id = df_one_id.fillna(0)
    frames_complete.append(df_one_id)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously accumulated rows on every iteration (quadratic cost).
df_complete_date_range = pd.concat(frames_complete) if frames_complete else pd.DataFrame()

df_complete_date_range.head()


In [8]:
def cumsum_restart_zero(column):
    """Return a running sum of ``column`` that starts over at every zero.

    Parameters
    ----------
    column : iterable of numbers
        Values to accumulate, in order.

    Returns
    -------
    list
        One entry per input value: the sum of the values seen since the most
        recent zero (the zero itself yields 0).
    """
    running_totals = []
    running = 0
    for value in column:
        # A zero discards whatever has been accumulated so far.
        running = (0 if value == 0 else running) + value
        running_totals.append(running)
    return running_totals

In [None]:
# Flag the weeks with a positive quantity (1) versus the rest (0) ...
sold_mask = df_complete_date_range['y'] > 0
df_complete_date_range['week_with_sale'] = np.where(sold_mask, 1, 0)
# ... and count, per product, how many flagged weeks have occurred in a row.
df_complete_date_range['cumulative_weeks'] = (
    df_complete_date_range
    .groupby('unique_id')['week_with_sale']
    .transform(cumsum_restart_zero)
)
df_complete_date_range

#### 2. Exclude test period

In [None]:
# Rows that precede a product's first run of MIN_STREAK_WEEKS consecutive
# weeks with sales are treated as its test period and dropped.
MIN_STREAK_WEEKS = 9

frames_no_test = []

# groupby splits the frame in one pass; sort=False keeps the products in
# order of first appearance, i.e. the same order `.unique()` would give.
for _, df_one_id in df_complete_date_range.groupby('unique_id', sort=False):
    # Earliest date on which the streak counter reaches the threshold.
    index_9w = df_one_id.index[df_one_id['cumulative_weeks'] == MIN_STREAK_WEEKS].min()
    # Step back to the first week of that streak.
    index_1w = index_9w - timedelta(weeks=MIN_STREAK_WEEKS - 1)
    # A product that never reaches the threshold gives NaT here; the
    # comparison below is then False for every row, so it contributes no rows.
    frames_no_test.append(df_one_id[df_one_id.index >= index_1w])

# Concatenate once at the end instead of re-copying the accumulated rows on
# every iteration.
df_without_test_period = pd.concat(frames_no_test) if frames_no_test else pd.DataFrame()

df_without_test_period

#### 3. Products selection

Steps 1 and 2 take several minutes to run, so instead of recomputing them we load their pre-computed result (`df_without_test_period.csv`) from disk.

In [None]:
# To refresh the cached file from the in-memory frame, uncomment:
#df_without_test_period.to_csv('../out/id_selection/df_without_test_period.csv')
df_without_test_period = (
    pd.read_csv('../out/id_selection/df_without_test_period.csv')
    # The first CSV column has no header, so read_csv labels it 'Unnamed: 0';
    # it is renamed to 'ds' and used as the index.
    .rename(columns={'Unnamed: 0': 'ds'})
    .set_index('ds')
)
df_without_test_period.index = pd.to_datetime(df_without_test_period.index)
df_without_test_period

In [None]:
# Thresholds used by the three selection checks below.
MAX_ZERO_PCT = 20                      # share of zero weeks must stay below this
HISTORY_START_BEFORE = '2021-01-01'    # first record must be earlier than this date
RECENT_SALES_WEEKS = 8                 # last record must fall within this many weeks of the end

# Move the dates into a 'ds' column only if that has not been done yet.
# An unconditional in-place reset_index made this cell fail when re-run
# (the index is then a RangeIndex, so the date arithmetic below breaks).
# rename_axis also covers a frame whose date index is unnamed.
if 'ds' not in df_without_test_period.columns:
    df_without_test_period = df_without_test_period.rename_axis('ds').reset_index()

# Latest date in the data and the cut-off for "recent" activity.
t = df_without_test_period['ds'].max()
t2 = t - timedelta(weeks=RECENT_SALES_WEEKS)

by_id = df_without_test_period.groupby('unique_id')

# One row per product, indexed by unique_id.
df_summary = pd.DataFrame()
df_summary['min_date'] = by_id['ds'].min()
df_summary['max_date'] = by_id['ds'].max()
# Span between first and last record, in whole weeks.
df_summary['lifetime'] = ((df_summary['max_date'] - df_summary['min_date']).dt.days / 7).round()
df_summary['n_records'] = by_id['y'].count()
# Products without any zero week are missing from this count, hence the fillna.
df_summary['n_zero'] = (
    df_without_test_period[df_without_test_period['y'] == 0]
    .groupby('unique_id')['y']
    .count()
)
df_summary['n_zero'] = df_summary['n_zero'].fillna(0)
df_summary['% zero'] = ((df_summary['n_zero'] / df_summary['n_records']) * 100).round(1)
df_summary['n_consecutive_weeks'] = by_id['cumulative_weeks'].max()

# Individual checks (1 = passed, 0 = failed).
df_summary['ok_zeros'] = np.where(df_summary['% zero'] < MAX_ZERO_PCT, 1, 0)
df_summary['ok_2years'] = np.where(df_summary['min_date'] < HISTORY_START_BEFORE, 1, 0)
df_summary['ok_sales_last_2months'] = np.where(df_summary['max_date'] > t2, 1, 0)
# A product is kept as a time series only when all three checks pass.
df_summary['time_series'] = np.where(
    (df_summary['ok_zeros'] == 1)
    & (df_summary['ok_2years'] == 1)
    & (df_summary['ok_sales_last_2months'] == 1),
    1,
    0,
)

df_summary

In [4]:
# Keep the products whose 'time_series' flag is 1, i.e. all three checks passed.
passed_all_checks = df_summary['time_series'].eq(1)
id_selected = df_summary.loc[passed_all_checks]
# Number of distinct selected product ids.
id_selected.index.unique().size

604

In [6]:
# Optional export, currently disabled: uncomment to write the selected ids and
# the complete per-product summary to CSV.
#id_selected.to_csv('../out/id_selection/id_selected.csv')
#df_summary.to_csv('../out/id_selection/ids_summary_complete.csv')

In [None]:

# Intentionally left without code.
# This cell used to filter `df_without_outliers` by the selected ids, but that
# frame is only created later, in step 4 ("Replace Outliers"), so running the
# notebook top to bottom on a fresh kernel raised NameError here.
# The filter is not needed: the next cell restricts `df_without_test_period`
# to the selected ids, and `df_without_outliers` is built from that frame.

In [10]:
# Restrict the sales history to the products that passed the selection.
df_without_test_period = df_without_test_period.loc[
    lambda frame: frame['unique_id'].isin(id_selected.index.unique())
]

604

#### 4. Replace Outliers

In [11]:
def replace_outliers(df, target):
    """Cap the values of ``df[target]`` at the 1.5 * IQR (Tukey) fences.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper limit and
    values below ``Q1 - 1.5 * IQR`` by that lower limit; the quartiles are
    computed over the whole ``target`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to cap. It is not modified.
    target : str
        Name of the column to cap.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the capped column.
    """
    # Work on a copy: writing into the caller's frame mutated its data and,
    # when the caller passed a slice, triggered SettingWithCopyWarning.
    capped = df.copy()
    if capped.empty:
        # No values means no quartiles to compute and nothing to cap.
        return capped

    q1, q3 = np.quantile(capped[target], [0.25, 0.75])
    iqr = q3 - q1
    max_limit = q3 + (1.5 * iqr)
    min_limit = q1 - (1.5 * iqr)

    too_high = capped[target] > max_limit
    too_low = capped[target] < min_limit
    capped.loc[too_high, target] = max_limit
    capped.loc[too_low, target] = min_limit

    return capped

In [14]:
# Cap the outliers of 'y' separately for each product, so the limits are
# derived from that product's own distribution.
frames_capped = []

# groupby splits the frame in one pass; sort=False keeps the products in
# order of first appearance, i.e. the same order `.unique()` would give.
for _, df_one_id in df_without_test_period.groupby('unique_id', sort=False):
    # Pass an independent copy so that capping never writes into a slice of
    # `df_without_test_period`.
    frames_capped.append(replace_outliers(df_one_id.copy(), 'y'))

# Concatenate once at the end instead of re-copying the accumulated rows on
# every iteration.
df_without_outliers = pd.concat(frames_capped) if frames_capped else pd.DataFrame()

df_without_outliers

In [None]:
# Re-key the frame by product id and keep only the date and value columns.
df_without_outliers = (
    df_without_outliers
    .reset_index()
    .set_index('unique_id')
    .loc[:, ['ds', 'y']]
)
df_without_outliers

#### 5. Adjust Series' Length

In [None]:
# Keep the rows dated from 2019-01-01 (inclusive) up to 2023-01-01 (exclusive).
df_without_outliers = df_without_outliers.loc[
    lambda frame: (frame['ds'] >= '2019-01-01') & (frame['ds'] < '2023-01-01')
]
#df_without_outliers.to_csv('../out/sales_files/weekly_sales_selected_loop_without_test_outliers_2019-2022.csv')
df_without_outliers

* The log transformation is applied in the modeling file rather than here, for practicality.