In [2]:
import pandas as pd

# Load the raw bakery sales export. read_csv already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('../raw_data/bakerysales.csv')
df['date'] = pd.to_datetime(df['date'])

# Spread the quantities into one column per article (149 columns for this
# dataset). No `index` is passed to pivot, so the existing row index is kept:
# each row holds its quantity in the column of its own article and NaN in
# every other article column.
pivot = df[['article', 'Quantity']]
products = pivot.pivot(columns='article', values='Quantity')

# Join the wide table back onto the original rows (index to index) and replace
# the NaN values with zeros, so every row carries a quantity for every article.
# Note: fillna is applied to every column of the merged frame, not only to the
# article columns.
data = df.merge(products, left_index=True, right_index=True)
data = data.fillna(value=0)

# Top 7 products (representing 68% of the volume sold). The names are listed
# once here; the snake_case column names are derived from them so the selection
# and the renaming cannot drift apart.
TOP_PRODUCTS = [
    'TRADITIONAL BAGUETTE',
    'CROISSANT',
    'COUPE',
    'PAIN AU CHOCOLAT',
    'BAGUETTE',
    'BANETTE',
    'CEREAL BAGUETTE',
]

# Keep only the date and the top products, index by date, and rename the
# product columns to lower-case, underscore-separated identifiers
# (e.g. 'PAIN AU CHOCOLAT' -> 'pain_au_chocolat').
data_target = (
    data[['date', *TOP_PRODUCTS]]
    .set_index('date')
    .rename(columns={name: name.lower().replace(' ', '_') for name in TOP_PRODUCTS})
)


In [3]:
# Display the date-indexed quantities of the selected products
# (pandas truncates the long frame to its first and last rows).
data_target


Unnamed: 0_level_0,traditional_baguette,croissant,coupe,pain_au_chocolat,baguette,banette,cereal_baguette
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2021-01-02,0.0,0.0,0.0,3.0,0.0,0.0,0.0
2021-01-02,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2021-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-02,5.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
2022-09-30,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2022-09-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-30,0.0,0.0,2.0,0.0,0.0,0.0,0.0
2022-09-30,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Blank out the zero quantities (they become NaN), then discard the rows that
# have no value left in any product column. The calls are chained and the
# result is rebound to the name, rather than mutating the frame with
# inplace=True, so the cell gives the same result when it is re-run.
data_target = (
    data_target
    .where(data_target != 0)
    .dropna(axis=0, how='all')
)
data_target


Unnamed: 0_level_0,traditional_baguette,croissant,coupe,pain_au_chocolat,baguette,banette,cereal_baguette
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02,,,,,1.0,,
2021-01-02,,,,3.0,,,
2021-01-02,,,,2.0,,,
2021-01-02,5.0,,,,,,
2021-01-02,,,,,2.0,,
...,...,...,...,...,...,...,...
2022-09-30,5.0,,,,,,
2022-09-30,,,1.0,,,,
2022-09-30,,,2.0,,,,
2022-09-30,1.0,,,,,,
