In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from datetime import datetime


In [2]:
# Load the raw bakery sales and parse the 'date' column into datetimes
df = pd.read_csv('../raw_data/bakerysales.csv')
df['date'] = pd.to_datetime(df['date'])

# Spread the quantities into one column per article (wide layout).
# No index is passed to pivot, so the result keeps df's row index: each row holds
# its quantity in the column of its own article and NaN in all the other columns.
# (The original notes mention 149 article columns.)
pivot = df.loc[:, ['article', 'Quantity']]
products = pivot.pivot(columns='article', values='Quantity')

# Put the wide article columns back next to the original rows (matched on the
# row index) and replace every missing value with 0.
data = df.merge(products, left_index=True, right_index=True).fillna(value=0)

# The seven best-selling articles (68% of the volume sold, per the original notes)
TOP_PRODUCTS = [
    'TRADITIONAL BAGUETTE',
    'CROISSANT',
    'COUPE',
    'PAIN AU CHOCOLAT',
    'BAGUETTE',
    'BANETTE',
    'CEREAL BAGUETTE',
]
# 'PAIN AU CHOCOLAT' -> 'pain_au_chocolat', and so on
snake_case_names = {name: name.lower().replace(' ', '_') for name in TOP_PRODUCTS}

# Keep only those articles, indexed by date, under snake_case column names
data_target = (
    data[['date', *TOP_PRODUCTS]]
    .set_index('date')
    .rename(columns=snake_case_names)
)

# Discard the rows in which none of the seven articles has a non-zero quantity:
# zeros are masked to NaN, all-NaN rows are dropped, remaining NaN go back to 0.
data_target = (
    data_target
    .where(data_target != 0)
    .dropna(axis=0, how='all')
    .fillna(value=0)
)


In [3]:
# Function for converting a date string in dd/mm/yyyy format to a datetime.date
def convert_to_date(date_string):
    """Parse a ``dd/mm/yyyy`` string into a ``datetime.date``.

    Args:
        date_string: Date text made of day, month and four-digit year
            separated by "/".

    Returns:
        The matching ``datetime.date``.

    Raises:
        ValueError: If the text does not match the ``dd/mm/yyyy`` layout
            (including text with extra "/"-separated parts after the year).
    """
    # strptime understands the day-first layout directly, so there is no need to
    # split the text and rebuild it as yyyy-mm-dd before parsing.
    return datetime.strptime(date_string, '%d/%m/%Y').date()

# Load weather data
data_weather = pd.read_csv('../raw_data/paris_weather.csv')
# NOTE: df_weather is a second name for the same frame (no copy is made), so the
# in-place edits applied to df_weather below are visible through data_weather too.
df_weather = data_weather

# Parse the dd/mm/yyyy text in DATE into a datetime column with one vectorised
# call; the explicit day-first format avoids a row-by-row Python conversion and
# any ambiguity between day and month.
df_weather['date'] = pd.to_datetime(df_weather['DATE'], format='%d/%m/%Y')


# Encode the day of the week as a cyclical sine/cosine feature pair
# A week repeats every 7 days
period = 7
# Weekday number of each date (.dt.weekday) expressed as an angle in radians
weekday_angle = 2 * np.pi * df_weather['date'].dt.weekday / period
# Sine and cosine of that angle are the two stored features
df_weather['day_of_week_sin'] = np.sin(weekday_angle)
df_weather['day_of_week_cos'] = np.cos(weekday_angle)
# The raw DATE text column is not needed any more ('date' holds the parsed value)
df_weather.drop(columns=['DATE'], inplace=True)


# Encode the calendar month as a cyclical sine/cosine feature pair
# A year repeats every 12 months
period = 12
# Month number of each date (.dt.month) expressed as an angle in radians
month_angle = 2 * np.pi * df_weather['date'].dt.month / period
# Sine and cosine of that angle are the two stored features
df_weather['month_sin'] = np.sin(month_angle)
df_weather['month_cos'] = np.cos(month_angle)


# Generates periodical features for SUNRISE and SUNSET
# Encode the SUNRISE time of day as a cyclical sine/cosine feature pair
# Parse the SUNRISE text a single time and reuse the result for both the hour
# and the minute (parsing it twice repeats the work and any parser warning).
# TODO(review): pass an explicit format= once the SUNRISE text layout is confirmed.
sunrise_parsed = pd.to_datetime(df_weather['SUNRISE'])
# Minutes elapsed since midnight
sunrise_minutes = sunrise_parsed.dt.hour * 60 + sunrise_parsed.dt.minute
# A day repeats every 1440 minutes
period = 1440
# Convert the minutes since midnight to an angle in radians
sunrise_angle = 2 * np.pi * sunrise_minutes / period
# Sine and cosine of that angle are the two stored features
df_weather['sunrise_time_sin'] = np.sin(sunrise_angle)
df_weather['sunrise_time_cos'] = np.cos(sunrise_angle)
# The raw SUNRISE text column is replaced by the two features above
df_weather.drop(columns=['SUNRISE'], inplace=True)


# Encode the SUNSET time of day as a cyclical sine/cosine feature pair
# Parse the SUNSET text a single time and reuse the result for both the hour
# and the minute (parsing it twice repeats the work and any parser warning).
# TODO(review): pass an explicit format= once the SUNSET text layout is confirmed.
sunset_parsed = pd.to_datetime(df_weather['SUNSET'])
# Minutes elapsed since midnight
sunset_minutes = sunset_parsed.dt.hour * 60 + sunset_parsed.dt.minute
# A day repeats every 1440 minutes
period = 1440
# Convert the minutes since midnight to an angle in radians
sunset_angle = 2 * np.pi * sunset_minutes / period
# Sine and cosine of that angle are the two stored features
df_weather['sunset_time_sin'] = np.sin(sunset_angle)
df_weather['sunset_time_cos'] = np.cos(sunset_angle)
# The raw SUNSET text column is replaced by the two features above
df_weather.drop(columns=['SUNSET'], inplace=True)


# Use the parsed 'date' column as the row index (changed in place so the frame
# object itself is updated rather than replaced)
df_weather.set_index('date', inplace=True)


# Numeric score for each French weather verdict found in OPINION
# (4 = 'météo idéale' down to 0 = 'météo très défavorable')
dict_meteo = {
    'météo idéale': 4,
    'météo favorable': 3,
    'météo correcte': 2,
    'météo défavorable': 1,
    'météo très défavorable': 0,
}
# pop() takes the OPINION column out of the frame and returns its values, which
# are then translated through the dictionary into the new 'opinion' column.
# A verdict that is not a key of dict_meteo ends up as NaN.
df_weather['opinion'] = df_weather.pop('OPINION').map(dict_meteo)


# List of public-holiday dates, from 2021-01-01 through 2022-08-15
# NOTE(review): the list stops at 2022-08-15, so any later date is flagged 0 --
# extend it if the data runs past that point.
holidays = [
    '2021-1-1',
    '2021-4-5',
    '2021-5-1',
    '2021-5-8',
    '2021-5-13',
    '2021-5-24',
    '2021-7-14',
    '2021-8-15',
    '2021-11-1',
    '2021-11-11',
    '2021-12-25',
    '2022-1-1',
    '2022-4-18',
    '2022-5-1',
    '2022-5-8',
    '2022-5-26',
    '2022-6-6',
    '2022-7-14',
    '2022-8-15',
]
# Turn each date string into a pandas Timestamp so it can be compared with the index
holidays = [pd.to_datetime(holiday) for holiday in holidays]
# Add isHoliday column to df_weather: 1 when the row's index date is in the list,
# otherwise 0. Index.isin tests the whole index at once instead of scanning the
# list from Python for every row.
df_weather['isHoliday'] = df_weather.index.isin(holidays).astype('int64')


  df_weather['sunrise_time'] = pd.to_datetime(df_weather['SUNRISE']).dt.hour*60+pd.to_datetime(df_weather['SUNRISE']).dt.minute
  df_weather['sunrise_time'] = pd.to_datetime(df_weather['SUNRISE']).dt.hour*60+pd.to_datetime(df_weather['SUNRISE']).dt.minute
  df_weather['sunset_time'] = pd.to_datetime(df_weather['SUNSET']).dt.hour*60+pd.to_datetime(df_weather['SUNSET']).dt.minute
  df_weather['sunset_time'] = pd.to_datetime(df_weather['SUNSET']).dt.hour*60+pd.to_datetime(df_weather['SUNSET']).dt.minute


In [4]:
# Attach the weather/calendar columns to every sales row by matching the two
# frames on their index (merge's default join keeps only index values found in both)
merged_data = data_target.merge(df_weather, left_index=True, right_index=True)

# Columns left out of the reduced feature frame
dropped_features = [
    'WINDSPEED_MAX_KMH',
    'PRECIP_TOTAL_DAY_MM',
    'HUMIDITY_MAX_PERCENT',
    'VISIBILITY_AVG_KM',
    'PRESSURE_MAX_MB',
    'CLOUDCOVER_AVG_PERCENT',
    'DEWPOINT_MAX_C',
    'WEATHER_CODE_MORNING',
    'WEATHER_CODE_NOON',
    'WEATHER_CODE_EVENING',
    'TOTAL_SNOW_MM',
    'SUNHOUR',
    'day_of_week_sin',
    'day_of_week_cos',
    'isHoliday',
]
# Reduced copy of merged_data without the columns listed above
f_data = merged_data.drop(columns=dropped_features)


In [5]:
# One frame per product: the product's own sales column plus every non-product
# column of merged_data (the six other product columns are removed).
# NOTE(review): these frames start from merged_data, not from the reduced f_data
# built in the previous cell -- confirm that is intended.
product_columns = [
    'traditional_baguette',
    'croissant',
    'coupe',
    'pain_au_chocolat',
    'baguette',
    'banette',
    'cereal_baguette',
]


def _only_product(kept):
    """Return merged_data without the sales columns of every product except `kept`."""
    return merged_data.drop(columns=[name for name in product_columns if name != kept])


traditional_baguette = _only_product('traditional_baguette')
croissant = _only_product('croissant')
coupe = _only_product('coupe')
pain_au_chocolat = _only_product('pain_au_chocolat')
baguette = _only_product('baguette')
banette = _only_product('banette')
cereal_baguette = _only_product('cereal_baguette')


In [8]:
# Pairwise Pearson correlation between traditional-baguette sales and every
# other column of its frame, displayed as a colour-coded table.
# NOTE(review): in the saved output every correlation with traditional_baguette
# is below 0.04 in absolute value; the upstream cells never aggregate quantities
# per day, so rows are presumably individual sale lines -- consider correlating
# daily totals instead.
tradi_corr = traditional_baguette.corr(method='pearson')
tradi_corr.style.background_gradient(cmap='coolwarm')


Unnamed: 0,traditional_baguette,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,PRESSURE_MAX_MB,CLOUDCOVER_AVG_PERCENT,HEATINDEX_MAX_C,DEWPOINT_MAX_C,WINDTEMP_MAX_C,WEATHER_CODE_MORNING,WEATHER_CODE_NOON,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,TEMPERATURE_NIGHT_C,day_of_week_sin,day_of_week_cos,month_sin,month_cos,sunrise_time_sin,sunrise_time_cos,sunset_time_sin,sunset_time_cos,opinion,isHoliday
traditional_baguette,1.0,0.037959,0.038446,7.4e-05,0.037953,0.038395,0.03805,-0.006339,-0.0265,0.007935,-0.013912,-0.011257,0.036302,0.021159,0.036596,-0.000459,-0.004659,-0.00086,-0.006985,0.035811,0.027719,0.038795,-0.020013,0.004614,-0.031232,-0.029423,0.034105,0.028611,0.025392,0.031176,0.02898,0.016638
MAX_TEMPERATURE_C,0.037959,1.0,0.912663,-0.262793,0.894349,0.987247,0.989089,-0.1282,-0.457572,0.302385,-0.115748,-0.457769,0.992481,0.671989,0.901122,-0.138334,-0.225632,-0.233939,-0.169933,0.947804,0.702525,0.905653,0.087096,-0.018121,-0.615011,-0.6205,0.664373,0.522072,0.685082,0.709805,0.781218,0.019312
MIN_TEMPERATURE_C,0.038446,0.912663,1.0,-0.189369,0.98413,0.92884,0.912036,-0.037633,-0.289938,0.205966,-0.272192,-0.199953,0.926806,0.82567,0.985775,-0.001569,-0.096755,-0.068944,-0.16998,0.876652,0.551697,0.979194,0.109095,-0.035125,-0.69714,-0.591531,0.619465,0.581499,0.58301,0.637586,0.674242,0.023265
WINDSPEED_MAX_KMH,7.4e-05,-0.262793,-0.189369,1.0,-0.159092,-0.237886,-0.27286,0.18121,0.093372,-0.135543,-0.169468,0.284516,-0.26104,-0.112,-0.234673,0.158206,0.229384,0.249449,0.068632,-0.245897,-0.220228,-0.165702,-0.072563,0.017702,0.210087,0.070077,-0.071675,-0.044201,-0.134105,-0.097931,-0.233707,0.010776
TEMPERATURE_MORNING_C,0.037953,0.894349,0.98413,-0.159092,1.0,0.911094,0.883872,0.025034,-0.264095,0.150125,-0.306157,-0.148091,0.908771,0.843111,0.989045,0.032162,-0.059476,-0.035091,-0.151089,0.850478,0.513991,0.99007,0.109515,-0.044506,-0.690821,-0.579644,0.600238,0.578274,0.568474,0.621033,0.648188,0.022837
TEMPERATURE_NOON_C,0.038395,0.987247,0.92884,-0.237886,0.911094,1.0,0.978028,-0.108428,-0.447105,0.290484,-0.149299,-0.416753,0.983253,0.689134,0.919191,-0.118697,-0.22116,-0.188491,-0.161574,0.941714,0.683429,0.919182,0.094236,-0.031436,-0.625405,-0.625383,0.661432,0.544537,0.675873,0.704676,0.774031,0.012155
TEMPERATURE_EVENING_C,0.03805,0.989089,0.912036,-0.27286,0.883872,0.978028,1.0,-0.146705,-0.446774,0.310918,-0.103633,-0.46363,0.984276,0.666721,0.895964,-0.144218,-0.227836,-0.244469,-0.173711,0.946093,0.713323,0.893247,0.084175,-0.006633,-0.608495,-0.628343,0.664621,0.534765,0.685126,0.71475,0.778651,0.017549
PRECIP_TOTAL_DAY_MM,-0.006339,-0.1282,-0.037633,0.18121,0.025034,-0.108428,-0.146705,1.0,0.276284,-0.425587,-0.262538,0.377882,-0.119057,0.129926,-0.00219,0.389274,0.437958,0.385576,0.035686,-0.162568,-0.201332,0.013264,-0.030538,-0.029742,0.096956,-0.058451,-0.039195,0.126346,-0.034682,-0.003847,-0.259987,-0.008351
HUMIDITY_MAX_PERCENT,-0.0265,-0.457572,-0.289938,0.093372,-0.264095,-0.447105,-0.446774,0.276284,1.0,-0.407586,-0.095613,0.55231,-0.415222,0.150413,-0.256399,0.413132,0.335408,0.300693,0.073683,-0.484554,-0.374429,-0.298585,-0.001712,0.015093,0.202246,0.138517,-0.268976,-0.001655,-0.217855,-0.249899,-0.40607,0.021717
VISIBILITY_AVG_KM,0.007935,0.302385,0.205966,-0.135543,0.150125,0.290484,0.310918,-0.425587,-0.407586,1.0,0.265138,-0.501267,0.299224,0.016858,0.181445,-0.494254,-0.467718,-0.364669,-0.260452,0.362305,0.421483,0.171177,0.072613,0.009726,-0.170432,-0.195267,0.346536,0.153666,0.156201,0.237042,0.363539,-0.057338


In [None]:
# Pairwise Pearson correlation between coupe sales and every other column of
# its frame, displayed as a colour-coded table
coupe_corr = coupe.corr(method='pearson')
coupe_corr.style.background_gradient(cmap='coolwarm')


In [None]:
# Pairwise Pearson correlation between pain-au-chocolat sales and every other
# column of its frame, displayed as a colour-coded table
pain_choc_corr = pain_au_chocolat.corr(method='pearson')
pain_choc_corr.style.background_gradient(cmap='coolwarm')


In [None]:
# Pairwise Pearson correlation between baguette sales and every other column of
# its frame, displayed as a colour-coded table
baguette_corr = baguette.corr(method='pearson')
baguette_corr.style.background_gradient(cmap='coolwarm')


In [None]:
# Pairwise Pearson correlation between banette sales and every other column of
# its frame, displayed as a colour-coded table
banette_corr = banette.corr(method='pearson')
banette_corr.style.background_gradient(cmap='coolwarm')


In [None]:
# Pairwise Pearson correlation between cereal-baguette sales and every other
# column of its frame, displayed as a colour-coded table
cereal_bag_corr = cereal_baguette.corr(method='pearson')
cereal_bag_corr.style.background_gradient(cmap='coolwarm')
