In [1]:
import pandas as pd
import numpy as np
from datetime import datetime


In [2]:

import pandas as pd

# Load the raw bakery sales export.
df = pd.read_csv('../raw_data/bakerysales.csv')

# Combine the separate 'date' and 'time' text columns into one parsed timestamp.
df['date_time'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# Spread the quantities out so that every article gets its own column
# (149 columns according to the original author's note). Each row keeps its
# original index, so a row only has a value in the column of its own article.
pivot = df[['article', 'Quantity']]
products = pivot.pivot(columns='article', values='Quantity')

# Attach the per-article columns to the sales rows (aligned on the row index)
# and turn the gaps left by the pivot into zeros.
data = df.merge(products, left_index=True, right_index=True).fillna(0)

# The seven products kept as targets (68% of the volume sold, per the original
# author's note), mapped from their raw article name to a snake_case column name.
PRODUCT_COLUMNS = {
    'TRADITIONAL BAGUETTE': 'traditional_baguette',
    'CROISSANT': 'croissant',
    'COUPE': 'coupe',
    'PAIN AU CHOCOLAT': 'pain_au_chocolat',
    'BAGUETTE': 'baguette',
    'BANETTE': 'banette',
    'CEREAL BAGUETTE': 'cereal_baguette',
}

# Sum the quantities per 60-minute bin; 'date_time' becomes the index.
hourly_sales = (
    data[['date_time', *PRODUCT_COLUMNS]]
    .resample('60min', on='date_time')
    .sum()
    .rename(columns=PRODUCT_COLUMNS)
)

# Remove the hours in which none of the target products has a non-zero total:
# blank out the zeros, drop the rows that end up entirely blank, then restore
# the remaining blanks to zero.
data_target = (
    hourly_sales
    .where(hourly_sales != 0)
    .dropna(axis=0, how='all')
    .fillna(0)
)


In [32]:
# Preview the hourly sales table: indexed by date_time, one column per target product.
data_target


Unnamed: 0_level_0,traditional_baguette,croissant,coupe,pain_au_chocolat,baguette,banette,cereal_baguette
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-02 08:00:00,0.0,0.0,0.0,3.0,1.0,0.0,0.0
2021-01-02 09:00:00,22.0,25.0,1.0,8.0,2.0,4.0,0.0
2021-01-02 10:00:00,33.0,21.0,8.0,17.0,4.0,2.0,4.0
2021-01-02 11:00:00,32.0,16.0,9.0,7.0,15.0,18.0,3.0
2021-01-02 12:00:00,41.0,3.0,17.0,10.0,20.0,16.0,12.0
...,...,...,...,...,...,...,...
2022-09-30 12:00:00,22.0,1.0,9.0,2.0,8.0,4.0,0.0
2022-09-30 15:00:00,0.0,0.0,0.0,3.0,0.0,0.0,0.0
2022-09-30 16:00:00,14.0,6.0,7.0,8.0,0.0,0.0,3.0
2022-09-30 17:00:00,5.0,3.0,2.0,3.0,0.0,1.0,2.0


In [38]:
# Load the hourly weather export. `data_weather` stays untouched as the raw
# frame; every derived column is added to `df_weather` only.
data_weather = pd.read_csv('../raw_data/open-meteo-paris.csv')


def cyclical_encode(values, period):
    """Project periodic values onto the unit circle.

    Parameters
    ----------
    values : pandas.Series or array-like of numbers
        Position of each observation inside the cycle (e.g. weekday 0-6).
    period : int or float
        Length of one full cycle, in the same unit as ``values``.

    Returns
    -------
    tuple
        ``(sin, cos)`` of ``2 * pi * values / period``; both have the same
        shape (and index, for a Series) as ``values``.
    """
    radians = 2 * np.pi * values / period
    return np.sin(radians), np.cos(radians)


# Parse the 'YYYY-MM-DDTHH:MM' strings of the 'time' column in one vectorized call.
timestamp = pd.to_datetime(data_weather['time'], format='%Y-%m-%dT%H:%M')

# Day of week (Monday=0 ... Sunday=6) over a 7-day cycle.
day_of_week_sin, day_of_week_cos = cyclical_encode(timestamp.dt.weekday, 7)
# Month (1 ... 12) over a 12-month cycle.
month_sin, month_cos = cyclical_encode(timestamp.dt.month, 12)

# Replace the raw 'time' text column by the parsed timestamp (used as index)
# and append the cyclical calendar features after the weather measurements.
df_weather = (
    data_weather
    .drop(columns=['time'])
    .assign(
        timestamp=timestamp,
        day_of_week_sin=day_of_week_sin,
        day_of_week_cos=day_of_week_cos,
        month_sin=month_sin,
        month_cos=month_cos,
    )
    .set_index('timestamp')
)

# Holiday dates as 'YYYY-MM-DD' strings.
# NOTE(review): presumably the French public holidays for the covered period
# (the weather file is for Paris) — verify against an official calendar.
holidays = [
    '2021-01-01',
    '2021-04-05',
    '2021-05-01',
    '2021-05-08',
    '2021-05-13',
    '2021-05-24',
    '2021-07-14',
    '2021-08-15',
    '2021-11-01',
    '2021-11-11',
    '2021-12-25',
    '2022-01-01',
    '2022-04-18',
    '2022-05-01',
    '2022-05-08',
    '2022-05-26',
    '2022-06-06',
    '2022-07-14',
    '2022-08-15',
]

# Flag every row whose calendar day is a holiday (1) or not (0). The index is
# truncated to midnight so that all hours of a holiday match its date.
holiday_dates = pd.to_datetime(holidays)
df_weather['isHoliday'] = (
    df_weather.index.normalize().isin(holiday_dates).astype('int64')
)


In [30]:
# Sanity check: formatting the first index entry with strftime yields a plain `str`,
# the same type as the entries of the `holidays` list it is compared against.
type(df_weather.index[0].strftime('%Y-%m-%d'))


str

In [39]:
# Left-join the hourly sales onto the weather frame (index against index):
# every weather timestamp is kept, and timestamps without a matching sales row
# get missing values in the product columns.
merged_data = df_weather.join(data_target, how='left')

# Replace every missing value with zero.
# NOTE(review): the fill applies to all columns, so a gap in a weather
# measurement would also become 0 — confirm the weather file has no gaps.
final_data = merged_data.fillna(0)


In [13]:
# Moves the current index of `final_data` into a regular column, in place.
# NOTE(review): this cell's execution count (13) is older than the cell that
# builds `final_data` (39), and the recorded preview and CSV read-back below
# still show `timestamp` as the index / first CSV column, so it appears this
# was not applied to the saved frame. On a top-to-bottom run it would turn the
# index into a column, and `to_csv(..., index=True)` would then also write the
# integer index as an extra unnamed column. Confirm whether this cell should
# be kept.
final_data.reset_index(inplace = True)


In [40]:
# Preview the joined frame: weather and calendar features followed by the product quantity columns.
final_data


Unnamed: 0_level_0,temperature_2m (°C),relative_humidity_2m (%),apparent_temperature (°C),precipitation (mm),rain (mm),wind_speed_10m (km/h),wind_speed_100m (km/h),day_of_week_sin,day_of_week_cos,month_sin,month_cos,isHoliday,traditional_baguette,croissant,coupe,pain_au_chocolat,baguette,banette,cereal_baguette
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-01-01 00:00:00,-1.4,97,-4.7,0.0,0.0,6.1,11.6,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-01 01:00:00,-0.1,97,-3.2,0.0,0.0,6.1,12.9,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-01 02:00:00,-0.9,97,-3.9,0.0,0.0,5.1,11.2,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-01 03:00:00,-1.5,97,-4.7,0.0,0.0,5.6,10.3,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-01 04:00:00,-1.4,97,-4.5,0.0,0.0,4.8,10.0,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30 19:00:00,15.1,55,11.4,0.0,0.0,18.1,30.6,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-30 20:00:00,14.6,56,11.0,0.0,0.0,18.1,31.2,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-30 21:00:00,14.5,57,10.7,0.0,0.0,19.3,32.4,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2022-09-30 22:00:00,13.5,68,9.6,0.5,0.5,22.7,37.3,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Persist the dataset; index = True also writes the frame's index as the first CSV column.
final_data.to_csv('../raw_data/hourly_final_dataset.csv', index = True)


In [46]:
# Read the exported file back to check what was written.
pd.read_csv('../raw_data/hourly_final_dataset.csv')


Unnamed: 0,timestamp,temperature_2m (°C),relative_humidity_2m (%),apparent_temperature (°C),precipitation (mm),rain (mm),wind_speed_10m (km/h),wind_speed_100m (km/h),day_of_week_sin,day_of_week_cos,month_sin,month_cos,isHoliday,traditional_baguette,croissant,coupe,pain_au_chocolat,baguette,banette,cereal_baguette
0,2021-01-01 00:00:00,-1.4,97,-4.7,0.0,0.0,6.1,11.6,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2021-01-01 01:00:00,-0.1,97,-3.2,0.0,0.0,6.1,12.9,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-01-01 02:00:00,-0.9,97,-3.9,0.0,0.0,5.1,11.2,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-01-01 03:00:00,-1.5,97,-4.7,0.0,0.0,5.6,10.3,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-01-01 04:00:00,-1.4,97,-4.5,0.0,0.0,4.8,10.0,-0.433884,-0.900969,0.5,8.660254e-01,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15307,2022-09-30 19:00:00,15.1,55,11.4,0.0,0.0,18.1,30.6,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15308,2022-09-30 20:00:00,14.6,56,11.0,0.0,0.0,18.1,31.2,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15309,2022-09-30 21:00:00,14.5,57,10.7,0.0,0.0,19.3,32.4,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15310,2022-09-30 22:00:00,13.5,68,9.6,0.5,0.5,22.7,37.3,-0.433884,-0.900969,-1.0,-1.836970e-16,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
