In [1]:
import pandas as pd
import numpy as np
from functools import reduce

import datetime
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt
# figure size in inches
rcParams['figure.figsize'] = 30,15
pd.set_option('display.max_columns', None)


In [2]:
df_sales = pd.read_csv ('data/raw_data/sales_clean.csv')
df_climate = pd.read_csv('data/raw_data/climate_clean.csv')
df_holidays = pd.read_csv('data/raw_data/holidays_clean.csv')

In [3]:
df_climate

Unnamed: 0,date,average_temp,total_precip_mm,did_rain,did_snow
0,2015-09-21,20.9,0.0,0,0
1,2015-09-22,20.4,0.0,0,0
2,2015-09-23,19.0,0.0,0,0
3,2015-09-24,19.9,0.0,0,0
4,2015-09-25,21.4,0.0,0,0
...,...,...,...,...,...
2055,2021-05-07,28.0,0.0,0,0
2056,2021-05-08,29.0,0.0,0,0
2057,2021-05-09,22.0,1.5,1,0
2058,2021-05-10,18.0,0.0,0,0


In [4]:
df_holidays

Unnamed: 0,date,day_type,holiday_type,holiday_name
0,2015-09-21,laborable,,
1,2015-09-22,laborable,,
2,2015-09-23,laborable,,
3,2015-09-24,laborable,,
4,2015-09-25,laborable,,
...,...,...,...,...
2055,2021-05-07,laborable,,
2056,2021-05-08,sábado,,
2057,2021-05-09,domingo,,
2058,2021-05-10,laborable,,


In [5]:
df1= pd.merge(df_sales, df_climate, how="right", on="date")

In [6]:
df1

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow
0,2015-09-21,233.00,Monday,September,21,2015,20.9,0.0,0,0
1,2015-09-22,95.80,Tuesday,September,22,2015,20.4,0.0,0,0
2,2015-09-23,156.50,Wednesday,September,23,2015,19.0,0.0,0,0
3,2015-09-24,141.80,Thursday,September,24,2015,19.9,0.0,0,0
4,2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
2055,2021-05-07,2154.00,Friday,May,7,2021,28.0,0.0,0,0
2056,2021-05-08,6241.70,Saturday,May,8,2021,29.0,0.0,0,0
2057,2021-05-09,6611.10,Sunday,May,9,2021,22.0,1.5,1,0
2058,2021-05-10,946.40,Monday,May,10,2021,18.0,0.0,0,0


In [7]:
df2 = pd.merge(df1, df_holidays, how="right", on="date")

In [8]:
df2

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name
0,2015-09-21,233.00,Monday,September,21,2015,20.9,0.0,0,0,laborable,,
1,2015-09-22,95.80,Tuesday,September,22,2015,20.4,0.0,0,0,laborable,,
2,2015-09-23,156.50,Wednesday,September,23,2015,19.0,0.0,0,0,laborable,,
3,2015-09-24,141.80,Thursday,September,24,2015,19.9,0.0,0,0,laborable,,
4,2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0,laborable,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055,2021-05-07,2154.00,Friday,May,7,2021,28.0,0.0,0,0,laborable,,
2056,2021-05-08,6241.70,Saturday,May,8,2021,29.0,0.0,0,0,sábado,,
2057,2021-05-09,6611.10,Sunday,May,9,2021,22.0,1.5,1,0,domingo,,
2058,2021-05-10,946.40,Monday,May,10,2021,18.0,0.0,0,0,laborable,,


In [9]:
df2['holiday_type'] = df2['holiday_type'].str.upper()
df2['holiday_name'] = df2['holiday_name'].str.upper()

In [10]:
df2.sample(40)

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name
1025,2018-07-12,1468.2,Thursday,July,12,2018,26.3,0.0,0,0,laborable,,
543,2017-03-17,2011.5,Friday,March,17,2017,10.9,0.0,0,0,laborable,,
112,2016-01-11,595.3,Monday,January,11,2016,9.9,11.18,1,0,laborable,,
38,2015-10-29,0.0,Thursday,October,29,2015,14.1,0.0,0,0,laborable,,
239,2016-05-17,234.7,Tuesday,May,17,2016,17.6,0.0,0,0,laborable,,
1822,2020-09-16,744.0,Wednesday,September,16,2020,23.5,0.0,1,0,laborable,,
1816,2020-09-10,457.8,Thursday,September,10,2020,24.1,0.0,0,0,laborable,,
357,2016-09-12,896.5,Monday,September,12,2016,25.9,0.0,1,0,laborable,,
1451,2019-09-11,1665.5,Wednesday,September,11,2019,20.1,0.0,0,0,laborable,,
516,2017-02-18,7437.6,Saturday,February,18,2017,6.7,0.0,0,0,sábado,,


In [11]:
df2['is_closed'] = df2['total_sales'].apply(lambda x: 1 if x == 0 else 0)


In [12]:
df2['is_lockdown'] = df2['date'].apply(lambda x: 1 if x >= '2020-03-13' and x <= '2020-06-26' else 0)


In [13]:
df2['is_curfew'] = df2['date'].apply(lambda x: 1 if x >= '2020-03-13' and x <= '2021-05-09' else 0)


In [14]:
df2.head()

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name,is_closed,is_lockdown,is_curfew
0,2015-09-21,233.0,Monday,September,21,2015,20.9,0.0,0,0,laborable,,,0,0,0
1,2015-09-22,95.8,Tuesday,September,22,2015,20.4,0.0,0,0,laborable,,,0,0,0
2,2015-09-23,156.5,Wednesday,September,23,2015,19.0,0.0,0,0,laborable,,,0,0,0
3,2015-09-24,141.8,Thursday,September,24,2015,19.9,0.0,0,0,laborable,,,0,0,0
4,2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0,laborable,,,0,0,0


In [15]:
df2.corr()

Unnamed: 0,total_sales,day,year,average_temp,total_precip_mm,did_rain,did_snow,is_closed,is_lockdown,is_curfew
total_sales,1.0,-0.030031,-0.013859,-0.265996,0.071107,0.108535,0.002959,-0.260383,-0.247142,-0.158114
day,-0.030031,1.0,-0.023629,0.012277,-0.021252,-0.007165,0.021833,0.034367,0.0158,0.002079
year,-0.013859,-0.023629,1.0,-0.046058,0.007636,0.044646,0.006112,0.226014,0.272899,0.68838
average_temp,-0.265996,0.012277,-0.046058,1.0,-0.106723,-0.205768,-0.095801,0.040597,0.0492,-0.000332
total_precip_mm,0.071107,-0.021252,0.007636,-0.106723,1.0,0.457043,0.093648,0.054079,0.049235,0.034876
did_rain,0.108535,-0.007165,0.044646,-0.205768,0.457043,1.0,0.141058,0.094187,0.095855,0.062359
did_snow,0.002959,0.021833,0.006112,-0.095801,0.093648,0.141058,1.0,0.058615,0.041986,0.020193
is_closed,-0.260383,0.034367,0.226014,0.040597,0.054079,0.094187,0.058615,1.0,0.949148,0.430816
is_lockdown,-0.247142,0.0158,0.272899,0.0492,0.049235,0.095855,0.041986,0.949148,1.0,0.458189
is_curfew,-0.158114,0.002079,0.68838,-0.000332,0.034876,0.062359,0.020193,0.430816,0.458189,1.0


In [16]:
df2.to_csv("data/db_load_files/clean_data.csv", index=False)

In [17]:
df3 = df2.set_index("date")


In [18]:
df3

Unnamed: 0_level_0,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name,is_closed,is_lockdown,is_curfew
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-09-21,233.00,Monday,September,21,2015,20.9,0.0,0,0,laborable,,,0,0,0
2015-09-22,95.80,Tuesday,September,22,2015,20.4,0.0,0,0,laborable,,,0,0,0
2015-09-23,156.50,Wednesday,September,23,2015,19.0,0.0,0,0,laborable,,,0,0,0
2015-09-24,141.80,Thursday,September,24,2015,19.9,0.0,0,0,laborable,,,0,0,0
2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0,laborable,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07,2154.00,Friday,May,7,2021,28.0,0.0,0,0,laborable,,,0,0,1
2021-05-08,6241.70,Saturday,May,8,2021,29.0,0.0,0,0,sábado,,,0,0,1
2021-05-09,6611.10,Sunday,May,9,2021,22.0,1.5,1,0,domingo,,,0,0,1
2021-05-10,946.40,Monday,May,10,2021,18.0,0.0,0,0,laborable,,,0,0,0


In [19]:
df3['year'] = df3.year.astype('category')

In [20]:
del df3['day']
del df3['holiday_type']
del df3['holiday_name' ]

In [21]:
del df3['did_snow']


In [22]:
df4 = pd.get_dummies(df3 ,dummy_na=True)

In [23]:
df4

Unnamed: 0_level_0,total_sales,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_nan,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,month_name_nan,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,year_nan,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,day_type_nan
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
2015-09-21,233.00,20.9,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-22,95.80,20.4,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-23,156.50,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-24,141.80,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-25,1095.15,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07,2154.00,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2021-05-08,6241.70,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2021-05-09,6611.10,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2021-05-10,946.40,18.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [24]:
del df4['day_type_nan']
del df4['month_name_nan']
del df4['day_of_week_nan']
del df4['year_nan']

In [25]:
df4

Unnamed: 0_level_0,total_sales,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
2015-09-21,233.00,20.9,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-22,95.80,20.4,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-23,156.50,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-24,141.80,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-25,1095.15,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07,2154.00,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2021-05-08,6241.70,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2021-05-09,6611.10,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2021-05-10,946.40,18.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [26]:
#add total sales lag

number_lags=1
for lag in range(1, number_lags + 1):
    df4['prev_sales'] = df4.total_sales.shift(lag)

df4.dropna(subset = ["prev_sales"], inplace=True)



In [27]:
number_lags=1
for lag in range(1, number_lags + 1):
    df4['is_post_holiday'] = df4.day_type_festivo.shift(lag)

df4.dropna(subset = ["is_post_holiday"], inplace=True)

In [28]:
df4['is_pre_holiday'] = df4.day_type_festivo.shift(-1)
df4.dropna(subset = ["is_pre_holiday"], inplace=True)

In [29]:
df4

Unnamed: 0_level_0,total_sales,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
2015-09-23,156.50,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0
2015-09-24,141.80,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0
2015-09-25,1095.15,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0
2015-09-26,2588.05,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0
2015-09-27,1316.90,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,649.40,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0
2021-05-07,2154.00,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0
2021-05-08,6241.70,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0
2021-05-09,6611.10,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0


In [30]:
df4.corr()

Unnamed: 0,total_sales,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday
total_sales,1.0,-0.265401,0.070753,0.107756,-0.260874,-0.247607,-0.158908,0.061382,-0.229944,0.552109,0.243729,-0.198493,-0.243472,-0.185655,0.008122,-0.116339,0.156834,0.077383,0.110025,-0.118126,-0.073275,0.035087,-0.068697,0.040252,-0.028058,-0.052284,-0.037045,-0.00309,0.006472,0.06197,0.088307,-0.18205,0.077631,0.241963,0.165263,-0.65488,0.545216,0.513002,0.087132,0.052128
average_temp,-0.265401,1.0,-0.106484,-0.205288,0.040832,0.049427,0.000128,0.004268,0.001329,-0.001832,-0.003456,0.006336,-0.007747,0.00109,-0.077256,0.409125,-0.320523,-0.268473,-0.379292,0.440144,0.302942,-0.190076,0.122291,-0.195127,0.023458,0.237393,-0.085706,0.028521,0.05533,0.005609,0.036277,0.034119,-0.175347,-0.00171,-0.075289,0.033945,-0.001045,-0.262695,-0.070514,-0.076861
total_precip_mm,0.070753,-0.106484,1.0,0.456924,0.053975,0.049137,0.034658,-0.001027,0.002885,0.02851,-0.003156,-0.017682,-0.010256,0.000711,0.117671,-0.042243,-0.033296,-0.007403,-0.026415,-0.036932,-0.05559,0.030353,0.009356,0.035375,0.028145,-0.030228,-0.01345,0.028376,-0.049706,0.036935,-0.021778,0.002766,0.017185,-0.001887,0.006219,-0.020022,0.025342,0.089086,0.003858,0.001483
did_rain,0.107756,-0.205288,0.456924,1.0,0.093982,0.095663,0.061895,0.011449,0.014452,-0.000562,0.005444,-0.012573,-0.002637,-0.015576,0.14941,-0.135674,0.001137,-0.01041,0.015795,-0.095857,-0.046582,0.041448,0.014794,0.101058,0.034119,-0.094744,-0.044061,0.031551,-0.105111,0.090208,-0.036336,0.020559,0.037488,0.008273,0.010337,-0.008265,-0.003082,0.079444,-0.005594,0.005026
is_closed,-0.260874,0.040832,0.053975,0.093982,1.0,0.949144,0.430734,0.001665,-0.004332,0.001665,-0.004332,0.001665,0.002009,0.001665,0.146791,-0.070106,-0.07743,-0.073711,-0.070112,-0.070106,0.141015,0.061629,0.167029,-0.076049,-0.004241,-0.070837,0.042086,-0.114251,-0.114061,-0.114061,-0.114061,0.467496,-0.055159,-0.003323,0.002356,0.002938,-0.001964,-0.259431,0.002356,0.002356
is_lockdown,-0.247607,0.049427,0.049137,0.095663,0.949144,1.0,0.458121,0.005339,-0.000944,-0.000944,-0.000944,-0.000944,-0.000621,-0.000944,0.161274,-0.06654,-0.073493,-0.069962,-0.073493,-0.06654,0.154517,0.072192,0.18213,-0.072182,-0.073493,-0.067234,-0.05269,-0.108441,-0.108261,-0.108261,-0.108261,0.501021,-0.060542,2.8e-05,0.007459,0.000535,-0.00501,-0.247444,0.007459,0.007459
is_curfew,-0.158908,0.000128,0.034658,0.061895,0.430734,0.458121,1.0,0.001863,-0.001574,0.001863,0.001863,-0.001574,-0.000868,-0.001574,0.097839,-0.003983,-0.030403,-0.030399,-0.030403,-0.003983,-0.003913,0.049286,0.026878,-0.029861,-0.030403,-0.011252,-0.115013,-0.236708,-0.236315,-0.236315,-0.236315,0.687933,0.505546,0.004,0.010492,-0.004806,-0.003536,-0.162378,0.010492,0.010492
day_of_week_Friday,0.061382,0.004268,-0.001027,0.011449,0.001665,0.005339,0.001863,1.0,-0.166761,-0.166761,-0.166761,-0.166761,-0.16643,-0.166761,0.001343,-0.000808,-0.00283,-0.001501,0.002013,-0.000808,0.002997,0.002013,-0.002981,0.001343,-0.00283,0.002178,-0.00189,0.002502,-0.000612,-0.000612,-0.000612,-0.00113,0.002395,-0.165767,0.049087,0.224823,-0.164438,-0.198385,0.006977,-0.035132
day_of_week_Monday,-0.229944,0.001329,0.002885,0.014452,-0.004332,-0.000944,-0.001574,-0.166761,1.0,-0.166761,-0.166761,-0.166761,-0.16643,-0.166761,0.001343,-0.000808,-0.00283,0.003544,-0.00283,0.004454,-0.002345,-0.00283,0.002133,0.001343,0.002013,-0.003038,-0.00189,-0.00113,-0.000612,0.003024,-0.000612,-0.00113,0.002395,-0.165767,0.056105,0.22185,-0.164438,0.243768,-0.063206,-0.014078
day_of_week_Saturday,0.552109,-0.001832,0.02851,-0.000562,0.001665,-0.000944,0.001863,-0.166761,-0.166761,1.0,-0.166761,-0.166761,-0.16643,-0.166761,0.001343,-0.000808,0.002013,0.003544,-0.00283,-0.000808,0.002997,-0.00283,-0.002981,-0.003573,0.002013,0.002178,-0.00189,0.002502,-0.000612,-0.000612,-0.000612,-0.00113,0.002395,-0.165767,-0.035132,-0.592796,0.986068,0.06145,0.049087,-0.063206


In [31]:
df4.to_excel("data/test.xlsx", index=True)

In [181]:


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis

from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV, ElasticNet,SGDRegressor
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor 
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import  Normalizer , scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold , cross_val_score,RandomizedSearchCV


from sklearn.preprocessing import MinMaxScaler , StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

#### 

In [182]:
X = df4.drop(['total_sales'], axis=1)
y = df4['total_sales']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)

In [183]:
models = { "ridge": Ridge(),
          "lasso": Lasso(),
          "sgd": SGDRegressor(),
          "knn": KNeighborsRegressor(),
          "gradient": GradientBoostingRegressor()
}

In [184]:
for name, model in models.items():
    print(f"Entrenando modelo ---> {name}")
    model.fit(X_train,y_train)
    print(f"He acabado :)")

Entrenando modelo ---> ridge
He acabado :)
Entrenando modelo ---> lasso
He acabado :)
Entrenando modelo ---> sgd
He acabado :)
Entrenando modelo ---> knn
He acabado :)
Entrenando modelo ---> gradient
He acabado :)


In [185]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"--------{name}--------")
    print("MAE: ", mean_absolute_error(y_test, y_pred))
    print("MSE: ", mean_squared_error(y_test,y_pred))
    print("RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))
    print("R2: ", r2_score(y_test,y_pred))
    print("\n")

--------ridge--------
MAE:  875.9570617419739
MSE:  1501667.4290011108
RMSE:  1225.425407359057
R2:  0.6994623099574828


--------lasso--------
MAE:  876.5911619449099
MSE:  1512529.1351417925
RMSE:  1229.8492326874025
R2:  0.6972884917002589


--------sgd--------
MAE:  3591728060115478.5
MSE:  2.4412952134399505e+31
RMSE:  4940946481636844.0
R2:  -4.885910222126512e+24


--------knn--------
MAE:  1289.7386067961165
MSE:  3298764.7291740873
RMSE:  1816.2501835303658
R2:  0.33979847165015886


--------gradient--------
MAE:  616.7830689515188
MSE:  959398.898252148
RMSE:  979.4891006295823
R2:  0.8079897564923324




In [192]:
model = RandomForestRegressor()

params = {'n_estimators': [10,30,40,50,100],
          'max_features': ["sqrt"],
          'max_depth': [15,20,25],
          'min_samples_leaf': [1,2,4,6,8,10]}

grid_search = GridSearchCV(model, param_grid=params, verbose=1, n_jobs=-1,cv=5)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [15, 20, 25], 'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 2, 4, 6, 8, 10],
                         'n_estimators': [10, 30, 40, 50, 100]},
             verbose=1)

In [193]:
bestscore = grid_search.best_score_
print("Best GridSearch Score: ", bestscore)
best_rf = grid_search.best_estimator_
print("Best Estimator: ", best_rf)
print("Best RF SCORE: ", best_rf.score(X, y))


Best GridSearch Score:  0.8299862860879792
Best Estimator:  RandomForestRegressor(max_depth=20, max_features='sqrt')
Best RF SCORE:  0.9436785571679428


In [194]:
X["predicted_sales"] = best_rf.predict(X)


In [195]:
X["actual_total_sales"] = y

In [197]:
X

Unnamed: 0_level_0,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday,predicted_sales,actual_total_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
2015-09-23,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0,309.949916,156.50
2015-09-24,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0,326.518685,141.80
2015-09-25,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0,1698.295528,1095.15
2015-09-26,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0,3042.714300,2588.05
2015-09-27,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0,1868.112000,1316.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0,771.206853,649.40
2021-05-07,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0,2026.929072,2154.00
2021-05-08,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0,4458.060000,6241.70
2021-05-09,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0,5827.272300,6611.10


In [198]:
import pickle

In [199]:
# save the model to disk
pickle.dump(best_rf, open("models/best_rf.pkl", 'wb'))

'''
# load the model from disk
loaded_model = pickle.load(open("mi_mejor_modelo", 'rb'))
loaded_model.predict(X_test)
'''

'\n# load the model from disk\nloaded_model = pickle.load(open("mi_mejor_modelo", \'rb\'))\nloaded_model.predict(X_test)\n'

In [200]:
import joblib

#autoMachineLearning
import h2o
from h2o.automl import H2OAutoML

In [201]:
h2o.init() #To start h2o


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 36 mins
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,18 days
H2O_cluster_name:,H2O_from_python_fran_jlddwv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.961 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [202]:
h2train = h2o.H2OFrame(df4)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [203]:
x = list(df4.columns)
x.remove('total_sales')

y = "total_sales"

print("X:", x)
print("y:", y)

X: ['average_temp', 'total_precip_mm', 'did_rain', 'is_closed', 'is_lockdown', 'is_curfew', 'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday', 'month_name_April', 'month_name_August', 'month_name_December', 'month_name_February', 'month_name_January', 'month_name_July', 'month_name_June', 'month_name_March', 'month_name_May', 'month_name_November', 'month_name_October', 'month_name_September', 'year_2015.0', 'year_2016.0', 'year_2017.0', 'year_2018.0', 'year_2019.0', 'year_2020.0', 'year_2021.0', 'day_type_domingo', 'day_type_festivo', 'day_type_laborable', 'day_type_sábado', 'prev_sales', 'is_post_holiday', 'is_pre_holiday']
y: total_sales


In [204]:
#TRAINING all the h20 models

automl = H2OAutoML(max_models=40, max_runtime_secs=10000, sort_metric='RMSE')
automl.train(x=x, y=y, training_frame=h2train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [205]:
#Showing the best performers

leader_board = automl.leaderboard
leader_board.head()

model_id,rmse,mean_residual_deviance,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20210517_174810,849.769,722107,722107,535.825,
StackedEnsemble_BestOfFamily_AutoML_20210517_174810,865.707,749449,749449,548.032,
GBM_grid__1_AutoML_20210517_174810_model_5,889.834,791804,791804,562.688,
GBM_2_AutoML_20210517_174810,897.192,804954,804954,570.036,
GBM_grid__1_AutoML_20210517_174810_model_1,897.644,805765,805765,571.272,
DRF_1_AutoML_20210517_174810,898.268,806885,806885,568.687,0.783744
GBM_3_AutoML_20210517_174810,899.32,808776,808776,571.594,
GBM_4_AutoML_20210517_174810,902.584,814658,814658,569.032,
GBM_1_AutoML_20210517_174810,906.704,822112,822112,575.627,
GBM_grid__1_AutoML_20210517_174810_model_7,908.27,824954,824954,583.767,




In [206]:
# save the model to disk
model_path = h2o.save_model(model=automl.leader, path="models/autostacked", force=True)
print (model_path)

/mnt/c/Users/lesto/Desktop/Ironhack/CityPlayForecast/models/autostacked/StackedEnsemble_AllModels_AutoML_20210517_174810


In [207]:
#Loading the TEST dataset

stacked_test = X
h2test_stacked = h2o.H2OFrame(stacked_test) #Conversion into a H20 frame to train
h2test_stacked.head() #preview

Parse progress: |█████████████████████████████████████████████████████████| 100%


average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday,predicted_sales,actual_total_sales
19.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.8,0,0,309.95,156.5
19.9,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.5,0,0,326.519,141.8
21.4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.8,0,0,1698.3,1095.15
20.8,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0,0,3042.71,2588.05
21.2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0,0,1868.11,1316.9
18.4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1316.9,0,0,1443.63,1929.0
18.1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1929.0,0,0,723.578,578.0
18.4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,578.0,0,0,597.219,552.2
18.4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,552.2,0,0,494.339,429.3
19.6,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,429.3,0,0,2015.19,1955.4




In [208]:
predicted_price_h2_stacked = automl.leader.predict(h2test_stacked).as_data_frame() #PREDICTING the Sales on the TEST dataset
predicted_price_h2_stacked #Result

stackedensemble prediction progress: |████████████████████████████████████| 100%


Unnamed: 0,predict
0,646.195920
1,560.715196
2,1238.750973
3,2888.874782
4,1892.792346
...,...
2052,943.759990
2053,2049.660548
2054,5586.464017
2055,5814.359039


In [409]:
pred = X
pred


Unnamed: 0_level_0,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
2015-09-23,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0
2015-09-24,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0
2015-09-25,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0
2015-09-26,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0
2015-09-27,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0
2021-05-07,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0
2021-05-08,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0
2021-05-09,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0


In [410]:
pred["total_sales"] = y

In [411]:
pred

Unnamed: 0_level_0,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday,total_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
2015-09-23,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0,156.50
2015-09-24,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0,141.80
2015-09-25,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0,1095.15
2015-09-26,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0,2588.05
2015-09-27,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0,1316.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0,649.40
2021-05-07,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0,2154.00
2021-05-08,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0,6241.70
2021-05-09,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0,6611.10


In [412]:
pred = pred.set_index(predicted_price_h2_stacked.index)

In [413]:
pred['predict'] = predicted_price_h2_stacked['predict']


In [415]:
pred.to_csv("data/h20_stacked_pred.csv", index=True)

In [209]:
single_model = h2o.get_model(automl.leaderboard.as_data_frame()['model_id'][2]) #Saving the best NON-STACKED model


In [210]:
#Another way to save it:
model_path = h2o.save_model(model=single_model, path="models/deeplearning", force=True)
saved_model = h2o.load_model(model_path)
print (saved_model)

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_grid__1_AutoML_20210517_174810_model_5


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,37101.0,9.0,9.0,9.0,22.0,81.0,53.9




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 415580.8150768924
RMSE: 644.655578644048
MAE: 418.53620537218677
RMSLE: NaN
Mean Residual Deviance: 415580.8150768924

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 791804.0381193821
RMSE: 889.8337137462157
MAE: 562.6879311816449
RMSLE: NaN
Mean Residual Deviance: 791804.0381193821

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,562.7028,25.393003,573.5618,521.2765,567.4025,589.57404,561.69904
1,mean_residual_deviance,791856.9,157603.0,916993.8,558003.0,742078.94,954680.0,787528.75
2,mse,791856.9,157603.0,916993.8,558003.0,742078.94,954680.0,787528.75
3,r2,0.8348656,0.030011341,0.80312043,0.877945,0.84295034,0.8095802,0.8407321
4,residual_deviance,791856.9,157603.0,916993.8,558003.0,742078.94,954680.0,787528.75
5,rmse,886.10785,91.30802,957.5979,746.996,861.44,977.0773,887.42816
6,rmsle,,0.0,,,,,



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-05-17 17:48:43,3.757 sec,0.0,2186.984391,1752.607145,4782901.0
1,,2021-05-17 17:48:43,3.791 sec,5.0,1483.340086,1157.615382,2200298.0
2,,2021-05-17 17:48:43,3.819 sec,10.0,1104.611294,828.43988,1220166.0
3,,2021-05-17 17:48:43,3.848 sec,15.0,910.441849,651.94728,828904.4
4,,2021-05-17 17:48:43,3.881 sec,20.0,808.264926,555.485957,653292.2
5,,2021-05-17 17:48:44,3.909 sec,25.0,751.856177,501.978985,565287.7
6,,2021-05-17 17:48:44,3.937 sec,30.0,717.578685,471.346501,514919.2
7,,2021-05-17 17:48:44,3.964 sec,35.0,692.344698,449.878358,479341.2
8,,2021-05-17 17:48:44,3.989 sec,40.0,672.30159,436.403824,451989.4
9,,2021-05-17 17:48:44,4.016 sec,45.0,659.668378,427.362011,435162.4



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,day_type_laborable,12531640000.0,1.0,0.265033
1,prev_sales,9786065000.0,0.780909,0.206967
2,day_of_week_Saturday,7496922000.0,0.59824,0.158553
3,average_temp,4476040000.0,0.357179,0.094664
4,day_of_week_Friday,2299688000.0,0.183511,0.048636
5,day_type_sábado,1928685000.0,0.153905,0.04079
6,is_closed,1504496000.0,0.120056,0.031819
7,day_of_week_Sunday,957622100.0,0.076416,0.020253
8,did_rain,715605000.0,0.057104,0.015134
9,month_name_December,612839800.0,0.048903,0.012961



See the whole table with table.as_data_frame()



In [413]:
df2.to_csv("data/final.csv", index=True)

In [94]:
X.to_csv("data/model_outputs/rndForrest.csv", index=True)