# Opportunity 2

## Imports

In [19]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
from fbprophet import Prophet

# Install a global "ignore" filter: no warning of any category is shown after this line.
# NOTE(review): this may also hide pandas deprecation / SettingWithCopy warnings
# coming from the cells below — consider narrowing the filter to specific categories.
warnings.simplefilter('ignore')

## Data setup

In [11]:
def store_id_from_path(parquet_file: Path) -> str:
    """Return the store id embedded in a ``sales-<id>.parquet.gzip`` file name.

    Only the final path component is inspected, so the result does not depend
    on the containing directory or on the platform's path separator.
    """
    return parquet_file.name.replace('sales-', '').replace('.parquet.gzip', '')


data_dir = Path('data')
stores_data = {}

# One entry per parquet file found in data_dir, keyed by store id.
for parquet_file in data_dir.glob('*.parquet*'):
    # Drop the columns that the rest of the notebook never reads.
    sales_df = pd.read_parquet(parquet_file).drop(columns=['SALES_DIST', 'SOLD_TO', 'SHIP_TO'])
    stores_data[store_id_from_path(parquet_file)] = {'sales_df': sales_df}

stores_data

{'7003984': {'sales_df':            BILL_DATE  MATERIAL  PIEZAS
  8511      2019-03-03      2641       1
  8512      2019-03-03     12916       1
  8513      2019-03-03      2464       1
  9478      2019-03-03     11074       1
  9479      2019-03-03     20306       1
  ...              ...       ...     ...
  141730332 2020-09-17     31907       2
  141730333 2020-09-17     36526       2
  141730334 2020-09-17     36879       1
  141730335 2020-09-17     39950       5
  141730336 2020-09-17     44388       1
  
  [4333 rows x 3 columns]},
 '7007091': {'sales_df':            BILL_DATE  MATERIAL  PIEZAS
  17613     2019-03-03      9184       1
  17614     2019-03-03       495       1
  17615     2019-03-03       552       1
  18706     2019-03-03      2130       1
  41200     2019-03-03     12064       1
  ...              ...       ...     ...
  187137484 2021-03-30     32003       1
  187137485 2021-03-30     32334       1
  187516552 2021-03-31      7285       1
  187516553 2021-03-3

## Get first and last date for each store

In [12]:
# Record the earliest and latest billing date found in each store's sales.
for data in stores_data.values():
    bill_dates = data['sales_df']['BILL_DATE']
    # min()/max() read the extremes directly and skip missing dates, so the
    # full frame does not have to be sorted just to look at its two ends.
    data['oldest_date'] = bill_dates.min()
    data['newest_date'] = bill_dates.max()

## Top 10 most sold products per store

In [13]:
# For every store, total the units sold per material and keep the ten
# materials with the highest totals (largest first).
for store_record in stores_data.values():
    units_per_material = store_record['sales_df'].groupby(['MATERIAL'])['PIEZAS'].sum()
    ranked_materials = units_per_material.sort_values(ascending=False)
    store_record['top_products'] = ranked_materials.iloc[:10]

stores_data

{'7003984': {'sales_df':            BILL_DATE  MATERIAL  PIEZAS
  8511      2019-03-03      2641       1
  8512      2019-03-03     12916       1
  8513      2019-03-03      2464       1
  9478      2019-03-03     11074       1
  9479      2019-03-03     20306       1
  ...              ...       ...     ...
  141730332 2020-09-17     31907       2
  141730333 2020-09-17     36526       2
  141730334 2020-09-17     36879       1
  141730335 2020-09-17     39950       5
  141730336 2020-09-17     44388       1
  
  [4333 rows x 3 columns],
  'oldest_date': Timestamp('2019-03-03 00:00:00'),
  'newest_date': Timestamp('2020-09-17 00:00:00'),
  'top_products': MATERIAL
  1564     600
  4373      90
  3439      85
  12504     80
  3008      77
  6623      75
  27752     71
  4077      60
  7904      58
  4414      55
  Name: PIEZAS, dtype: int64},
 '7007091': {'sales_df':            BILL_DATE  MATERIAL  PIEZAS
  17613     2019-03-03      9184       1
  17614     2019-03-03       495       1

## Get daily sales of top 10 products of each store

In [14]:
# Build, for each store, one daily series per top product covering every
# calendar day between the store's first and last billing date.
for store, data in stores_data.items():
    # Continuous daily calendar for this store; days without sales are filled with 0 below.
    idx = pd.date_range(data['oldest_date'], data['newest_date'])
    sales_df = data['sales_df']
    data['top_products_sales'] = {}
    # Only the material ids (the index of top_products) are needed here.
    # Series.iteritems() no longer exists in pandas >= 2.0, so iterate the index directly.
    for product in data['top_products'].index:
        # Units sold per billing date for this product only.
        product_series = (
            sales_df[sales_df["MATERIAL"] == product]
            .groupby('BILL_DATE')['PIEZAS'].sum()
            .to_frame(name='PIEZAS')
        )

        product_series.index = pd.DatetimeIndex(product_series.index)
        # Align to the full calendar, then expose the date as a regular column again.
        product_series = (
            product_series
            .reindex(idx, fill_value=0)
            .rename_axis("BILL_DATE")
            .reset_index()
        )
        data['top_products_sales'][product] = product_series

stores_data

{'7003984': {'sales_df':            BILL_DATE  MATERIAL  PIEZAS
  8511      2019-03-03      2641       1
  8512      2019-03-03     12916       1
  8513      2019-03-03      2464       1
  9478      2019-03-03     11074       1
  9479      2019-03-03     20306       1
  ...              ...       ...     ...
  141730332 2020-09-17     31907       2
  141730333 2020-09-17     36526       2
  141730334 2020-09-17     36879       1
  141730335 2020-09-17     39950       5
  141730336 2020-09-17     44388       1
  
  [4333 rows x 3 columns],
  'oldest_date': Timestamp('2019-03-03 00:00:00'),
  'newest_date': Timestamp('2020-09-17 00:00:00'),
  'top_products': MATERIAL
  1564     600
  4373      90
  3439      85
  12504     80
  3008      77
  6623      75
  27752     71
  4077      60
  7904      58
  4414      55
  Name: PIEZAS, dtype: int64,
  'top_products_sales': {1564:      BILL_DATE  PIEZAS
   0   2019-03-03       0
   1   2019-03-04       0
   2   2019-03-05       0
   3   2019-03

## Forecast the next 7 days for each of the top 10 products at each store

In [20]:
# Number of days to forecast past the last observed date of each series.
FORECAST_DAYS = 7

# Fit one Prophet model per (store, top product) and keep the model and its
# forecast for the next FORECAST_DAYS days under 'models' / 'predictions'.
for store in stores_data:
    print(f'Store: {store}')
    stores_data[store]['models'] = {}
    stores_data[store]['predictions'] = {}
    for product in stores_data[store]['top_products_sales']:
        print(f'Product: {product}')
        # Prophet expects the columns to be named 'ds' and 'y'. Rename on a copy so
        # the frames stored in 'top_products_sales' keep their original column names.
        daily_sales = stores_data[store]['top_products_sales'][product].copy()
        daily_sales.columns = ['ds', 'y']
        # NOTE(review): the input has one row per day; daily_seasonality is normally
        # meant for sub-daily data — confirm this setting is intended.
        m = Prophet(interval_width=0.95, daily_seasonality=True, yearly_seasonality=False)
        stores_data[store]['models'][product] = m.fit(daily_sales)
        future = m.make_future_dataframe(periods=FORECAST_DAYS, freq='D')
        # Keep only the future rows; copy() so later edits do not write into a slice
        # of the full forecast frame.
        forecast = m.predict(future).tail(FORECAST_DAYS).copy()
        stores_data[store]['predictions'][product] = forecast
        print(forecast[['ds', 'yhat']])

Store: 7003984
Product: 1564
            ds      yhat
565 2020-09-18  0.114441
566 2020-09-19 -1.133410
567 2020-09-20 -1.149122
568 2020-09-21  2.550615
569 2020-09-22 -1.147955
570 2020-09-23  0.085674
571 2020-09-24  0.086254
Product: 4373
            ds      yhat
565 2020-09-18 -0.208219
566 2020-09-19 -0.208249
567 2020-09-20 -0.087529
568 2020-09-21  0.035591
569 2020-09-22  0.158712
570 2020-09-23 -0.210780
571 2020-09-24  0.158646
Product: 3439
            ds      yhat
565 2020-09-18 -0.034916
566 2020-09-19 -0.159642
567 2020-09-20 -0.161692
568 2020-09-21  0.023219
569 2020-09-22  0.084877
570 2020-09-23 -0.038344
571 2020-09-24  0.208192
Product: 12504
            ds      yhat
565 2020-09-18  0.835303
566 2020-09-19  0.584523
567 2020-09-20  0.588943
568 2020-09-21  0.964738
569 2020-09-22  0.592457
570 2020-09-23  0.968239
571 2020-09-24  0.595947
Product: 3008
            ds      yhat
565 2020-09-18 -0.089792
566 2020-09-19 -0.089462
567 2020-09-20  0.191037
568 2020-09-21

            ds      yhat
750 2021-03-25 -0.026933
751 2021-03-26 -0.026945
752 2021-03-27 -0.026957
753 2021-03-28  0.468308
754 2021-03-29 -0.026979
755 2021-03-30 -0.026991
756 2021-03-31  0.500553
Product: 8649
            ds      yhat
750 2021-03-25  0.035914
751 2021-03-26 -0.001472
752 2021-03-27 -0.001455
753 2021-03-28  0.438056
754 2021-03-29 -0.001420
755 2021-03-30 -0.001403
756 2021-03-31  0.526957
Store: 7023126
Product: 2541
            ds      yhat
751 2021-03-24  0.191714
752 2021-03-25  0.276026
753 2021-03-26  0.360338
754 2021-03-27  0.088698
755 2021-03-28  0.088706
756 2021-03-29  0.330796
757 2021-03-30  0.470020
Product: 29725
            ds      yhat
751 2021-03-24  0.055221
752 2021-03-25 -0.066851
753 2021-03-26 -0.076600
754 2021-03-27 -0.198666
755 2021-03-28 -0.189698
756 2021-03-29 -0.004528
757 2021-03-30 -0.116238
Product: 29258
            ds      yhat
751 2021-03-24  0.129816
752 2021-03-25  0.279484
753 2021-03-26  0.214685
754 2021-03-27  0.103262
75

## Round forecasted values and turn negative ones into 0

In [23]:
# Turn the raw forecasts into non-negative whole units.
for store in stores_data:
    print(f'Store: {store}')
    for product in stores_data[store]['top_products_sales']:
        print(f'Product: {product}')
        forecast = stores_data[store]['predictions'][product]
        # Touch the 'yhat' column only. A boolean row mask assigned 0 would overwrite
        # every column of the matching rows, including the 'ds' dates.
        # Clipping before rounding gives the same values as rounding first and then
        # zeroing non-positive results.
        forecast['yhat'] = forecast['yhat'].clip(lower=0).round()
        print(forecast['yhat'])

Store: 7003984
Product: 1564
565    0.0
566    0.0
567    0.0
568    3.0
569    0.0
570    0.0
571    0.0
Name: yhat, dtype: float64
Product: 4373
565    0.0
566    0.0
567    0.0
568    0.0
569    0.0
570    0.0
571    0.0
Name: yhat, dtype: float64
Product: 3439
565    0.0
566    0.0
567    0.0
568    0.0
569    0.0
570    0.0
571    0.0
Name: yhat, dtype: float64
Product: 12504
565    1.0
566    1.0
567    1.0
568    1.0
569    1.0
570    1.0
571    1.0
Name: yhat, dtype: float64
Product: 3008
565    0.0
566    0.0
567    0.0
568    0.0
569    0.0
570    0.0
571    0.0
Name: yhat, dtype: float64
Product: 6623
565    0.0
566    0.0
567    0.0
568    0.0
569    0.0
570    0.0
571    0.0
Name: yhat, dtype: float64
Product: 27752
565    0.0
566    0.0
567    0.0
568    0.0
569    0.0
570    0.0
571    0.0
Name: yhat, dtype: float64
Product: 4077
565    0.0
566    0.0
567    0.0
568    0.0
569    0.0
570    0.0
571    0.0
Name: yhat, dtype: float64
Product: 7904
565    0.0
566    0.0
567

515    0.0
516    0.0
517    0.0
518    0.0
519    0.0
520    0.0
521    0.0
Name: yhat, dtype: float64
