In [10]:
import pandas as pd
import numpy as np
from functools import reduce

import datetime
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt
# figure size in inches
rcParams['figure.figsize'] = 30,15
pd.set_option('display.max_columns', None)


In [11]:
df_sales = pd.read_csv ('data/sales_clean.csv')
df_climate = pd.read_csv('data/climate_clean.csv')
df_holidays = pd.read_csv('data/holidays_clean.csv')

In [12]:
df_climate

Unnamed: 0,date,average_temp,total_precip_mm,did_rain,did_snow
0,2015-09-21,20.9,0.0,0,0
1,2015-09-22,20.4,0.0,0,0
2,2015-09-23,19.0,0.0,0,0
3,2015-09-24,19.9,0.0,0,0
4,2015-09-25,21.4,0.0,0,0
...,...,...,...,...,...
2055,2021-05-07,28.0,0.0,0,0
2056,2021-05-08,29.0,0.0,0,0
2057,2021-05-09,22.0,1.5,1,0
2058,2021-05-10,18.0,0.0,0,0


In [13]:
df_holidays

Unnamed: 0,date,day_type,holiday_type,holiday_name
0,2015-09-21,laborable,,
1,2015-09-22,laborable,,
2,2015-09-23,laborable,,
3,2015-09-24,laborable,,
4,2015-09-25,laborable,,
...,...,...,...,...
2055,2021-05-07,laborable,,
2056,2021-05-08,sábado,,
2057,2021-05-09,domingo,,
2058,2021-05-10,laborable,,


In [14]:
df1= pd.merge(df_sales, df_climate, how="right", on="date")

In [15]:
df1

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow
0,2015-09-21,233.00,Monday,September,21,2015,20.9,0.0,0,0
1,2015-09-22,95.80,Tuesday,September,22,2015,20.4,0.0,0,0
2,2015-09-23,156.50,Wednesday,September,23,2015,19.0,0.0,0,0
3,2015-09-24,141.80,Thursday,September,24,2015,19.9,0.0,0,0
4,2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...
2055,2021-05-07,2154.00,Friday,May,7,2021,28.0,0.0,0,0
2056,2021-05-08,6241.70,Saturday,May,8,2021,29.0,0.0,0,0
2057,2021-05-09,6611.10,Sunday,May,9,2021,22.0,1.5,1,0
2058,2021-05-10,946.40,Monday,May,10,2021,18.0,0.0,0,0


In [16]:
df2 = pd.merge(df1, df_holidays, how="right", on="date")

In [17]:
df2

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name
0,2015-09-21,233.00,Monday,September,21,2015,20.9,0.0,0,0,laborable,,
1,2015-09-22,95.80,Tuesday,September,22,2015,20.4,0.0,0,0,laborable,,
2,2015-09-23,156.50,Wednesday,September,23,2015,19.0,0.0,0,0,laborable,,
3,2015-09-24,141.80,Thursday,September,24,2015,19.9,0.0,0,0,laborable,,
4,2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0,laborable,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2055,2021-05-07,2154.00,Friday,May,7,2021,28.0,0.0,0,0,laborable,,
2056,2021-05-08,6241.70,Saturday,May,8,2021,29.0,0.0,0,0,sábado,,
2057,2021-05-09,6611.10,Sunday,May,9,2021,22.0,1.5,1,0,domingo,,
2058,2021-05-10,946.40,Monday,May,10,2021,18.0,0.0,0,0,laborable,,


In [18]:
df2['holiday_type'] = df2['holiday_type'].str.upper()
df2['holiday_name'] = df2['holiday_name'].str.upper()

In [24]:
df2.sample(40)

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name
1621,2020-02-28,4323.1,Friday,February,28,2020,10.5,0.0,0,0,laborable,,
337,2016-08-23,756.2,Tuesday,August,23,2016,28.8,0.0,0,0,laborable,,
646,2017-06-28,1277.5,Wednesday,June,28,2017,21.8,0.0,0,0,laborable,,
1365,2019-06-17,921.65,Monday,June,17,2019,24.6,0.0,0,0,laborable,,
151,2016-02-19,2887.25,Friday,February,19,2016,5.2,0.0,0,0,laborable,,
759,2017-10-19,1085.1,Thursday,October,19,2017,12.8,5.84,1,0,laborable,,
1811,2020-09-05,2968.3,Saturday,September,5,2020,24.8,0.0,0,0,sábado,,
369,2016-09-24,4847.35,Saturday,September,24,2016,20.1,0.0,0,0,sábado,,
2039,2021-04-21,1359.9,Wednesday,April,21,2021,13.2,0.25,1,0,laborable,,
257,2016-06-04,4267.45,Saturday,June,4,2016,21.3,0.0,0,0,sábado,,


In [25]:
df2['is_closed'] = df2['total_sales'].apply(lambda x: 1 if x == 0 else 0)


In [26]:
df2['is_lockdown'] = df2['date'].apply(lambda x: 1 if x >= '2020-03-13' and x <= '2020-06-26' else 0)


In [27]:
df2['is_curfew'] = df2['date'].apply(lambda x: 1 if x >= '2020-03-13' and x <= '2021-05-09' else 0)


In [28]:
df2.head()

Unnamed: 0,date,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name,is_closed,is_lockdown,is_curfew
0,2015-09-21,233.0,Monday,September,21,2015,20.9,0.0,0,0,laborable,,,0,0,0
1,2015-09-22,95.8,Tuesday,September,22,2015,20.4,0.0,0,0,laborable,,,0,0,0
2,2015-09-23,156.5,Wednesday,September,23,2015,19.0,0.0,0,0,laborable,,,0,0,0
3,2015-09-24,141.8,Thursday,September,24,2015,19.9,0.0,0,0,laborable,,,0,0,0
4,2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0,laborable,,,0,0,0


In [29]:
df2.corr()

Unnamed: 0,total_sales,day,year,average_temp,total_precip_mm,did_rain,did_snow,is_closed,is_lockdown,is_curfew
total_sales,1.0,-0.029586,-0.013776,-0.264064,0.070638,0.107799,0.002931,-0.258799,-0.245639,-0.157188
day,-0.029586,1.0,-0.023629,0.012277,-0.021252,-0.007165,0.021833,0.034367,0.0158,0.002079
year,-0.013776,-0.023629,1.0,-0.046058,0.007636,0.044646,0.006112,0.226014,0.272899,0.68838
average_temp,-0.264064,0.012277,-0.046058,1.0,-0.106723,-0.205768,-0.095801,0.040597,0.0492,-0.000332
total_precip_mm,0.070638,-0.021252,0.007636,-0.106723,1.0,0.457043,0.093648,0.054079,0.049235,0.034876
did_rain,0.107799,-0.007165,0.044646,-0.205768,0.457043,1.0,0.141058,0.094187,0.095855,0.062359
did_snow,0.002931,0.021833,0.006112,-0.095801,0.093648,0.141058,1.0,0.058615,0.041986,0.020193
is_closed,-0.258799,0.034367,0.226014,0.040597,0.054079,0.094187,0.058615,1.0,0.949148,0.430816
is_lockdown,-0.245639,0.0158,0.272899,0.0492,0.049235,0.095855,0.041986,0.949148,1.0,0.458189
is_curfew,-0.157188,0.002079,0.68838,-0.000332,0.034876,0.062359,0.020193,0.430816,0.458189,1.0


In [30]:
df2.to_csv("data/db_load_files/clean_data.csv", index=False)

In [19]:
df3 = df2.set_index("date")


In [20]:
df3

Unnamed: 0_level_0,total_sales,day_of_week,month_name,day,year,average_temp,total_precip_mm,did_rain,did_snow,day_type,holiday_type,holiday_name,is_closed,is_lockdown,is_curfew
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-09-21,233.00,Monday,September,21,2015,20.9,0.0,0,0,laborable,,,0,0,0
2015-09-22,95.80,Tuesday,September,22,2015,20.4,0.0,0,0,laborable,,,0,0,0
2015-09-23,156.50,Wednesday,September,23,2015,19.0,0.0,0,0,laborable,,,0,0,0
2015-09-24,141.80,Thursday,September,24,2015,19.9,0.0,0,0,laborable,,,0,0,0
2015-09-25,1095.15,Friday,September,25,2015,21.4,0.0,0,0,laborable,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07,2154.00,Friday,May,7,2021,28.0,0.0,0,0,laborable,,,0,0,1
2021-05-08,6241.70,Saturday,May,8,2021,29.0,0.0,0,0,sábado,,,0,0,1
2021-05-09,6611.10,Sunday,May,9,2021,22.0,1.5,1,0,domingo,,,0,0,1
2021-05-10,946.40,Monday,May,10,2021,18.0,0.0,0,0,laborable,,,0,0,0


In [377]:
df3['year'] = df3.year.astype('category')

In [378]:
del df3['day']
del df3['holiday_type']
del df3['holiday_name' ]

In [379]:
del df3['did_snow']


In [380]:
df4 = pd.get_dummies(df3 ,dummy_na=True)

In [381]:
df4

Unnamed: 0_level_0,total_sales,average_temp,total_precip_mm,did_rain,bolera_trends,bowling_trends,minigolf_trends,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_nan,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,month_name_nan,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,year_nan,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,day_type_nan
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
2015-09-21,233.00,20.9,0.0,0,50,50,34,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-22,95.80,20.4,0.0,0,50,50,34,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-23,156.50,19.0,0.0,0,50,50,34,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-24,141.80,19.9,0.0,0,50,50,34,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2015-09-25,1095.15,21.4,0.0,0,50,50,34,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07,2154.00,28.0,0.0,0,38,79,44,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2021-05-08,6241.70,29.0,0.0,0,38,79,44,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2021-05-09,6611.10,22.0,1.5,1,38,79,44,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2021-05-10,946.40,18.0,0.0,0,38,79,44,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [382]:
del df4['day_type_nan']
del df4['month_name_nan']
del df4['day_of_week_nan']
del df4['year_nan']

In [383]:
df4

Unnamed: 0_level_0,total_sales,average_temp,total_precip_mm,did_rain,bolera_trends,bowling_trends,minigolf_trends,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
2015-09-21,233.00,20.9,0.0,0,50,50,34,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-22,95.80,20.4,0.0,0,50,50,34,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-23,156.50,19.0,0.0,0,50,50,34,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-24,141.80,19.9,0.0,0,50,50,34,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2015-09-25,1095.15,21.4,0.0,0,50,50,34,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-07,2154.00,28.0,0.0,0,38,79,44,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0
2021-05-08,6241.70,29.0,0.0,0,38,79,44,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2021-05-09,6611.10,22.0,1.5,1,38,79,44,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0
2021-05-10,946.40,18.0,0.0,0,38,79,44,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [384]:
#add total sales lag 
number_lags=1
for lag in range(1, number_lags + 1):
    df4['prev_sales'] = df4.total_sales.shift(lag)

df4.dropna(subset = ["prev_sales"], inplace=True)

In [385]:
number_lags=1
for lag in range(1, number_lags + 1):
    df4['is_post_holiday'] = df4.day_type_festivo.shift(lag)

df4.dropna(subset = ["is_post_holiday"], inplace=True)

In [386]:
df4['is_pre_holiday'] = df4.day_type_festivo.shift(-1)
df4.dropna(subset = ["is_pre_holiday"], inplace=True)

In [387]:
df4

Unnamed: 0_level_0,total_sales,average_temp,total_precip_mm,did_rain,bolera_trends,bowling_trends,minigolf_trends,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
2015-09-23,156.50,19.0,0.0,0,50,50,34,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0
2015-09-24,141.80,19.9,0.0,0,50,50,34,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0
2015-09-25,1095.15,21.4,0.0,0,50,50,34,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0
2015-09-26,2588.05,20.8,0.0,0,50,50,34,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0
2015-09-27,1316.90,21.2,0.0,1,50,50,34,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,649.40,27.0,0.0,0,38,79,44,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0
2021-05-07,2154.00,28.0,0.0,0,38,79,44,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0
2021-05-08,6241.70,29.0,0.0,0,38,79,44,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0
2021-05-09,6611.10,22.0,1.5,1,38,79,44,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0


In [388]:
#df4 = df4[df4.total_sales != 0]
del df4['bolera_trends']
del df4['bowling_trends']
del df4['minigolf_trends']

In [389]:
df4

Unnamed: 0_level_0,total_sales,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
2015-09-23,156.50,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0
2015-09-24,141.80,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0
2015-09-25,1095.15,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0
2015-09-26,2588.05,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0
2015-09-27,1316.90,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,649.40,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0
2021-05-07,2154.00,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0
2021-05-08,6241.70,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0
2021-05-09,6611.10,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0


In [390]:


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis , QuadraticDiscriminantAnalysis

from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV, ElasticNet,SGDRegressor
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor 
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import  Normalizer , scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score

from sklearn.preprocessing import MinMaxScaler , StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

#### 

In [408]:
X = df4.drop(['total_sales'], axis=1)
y = df4['total_sales']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [392]:
models = { "ridge": Ridge(),
          "lasso": Lasso(),
          "sgd": SGDRegressor(),
          "knn": KNeighborsRegressor(),
          "gradient": GradientBoostingRegressor()
}

In [393]:
for name, model in models.items():
    print(f"Entrenando modelo ---> {name}")
    model.fit(X_train,y_train)
    print(f"He acabado :)")

Entrenando modelo ---> ridge
He acabado :)
Entrenando modelo ---> lasso
He acabado :)
Entrenando modelo ---> sgd
He acabado :)
Entrenando modelo ---> knn
He acabado :)
Entrenando modelo ---> gradient
He acabado :)


In [394]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"--------{name}--------")
    print("MAE: ", mean_absolute_error(y_test, y_pred))
    print("MSE: ", mean_squared_error(y_test,y_pred))
    print("RMSE: ", np.sqrt(mean_squared_error(y_test,y_pred)))
    print("R2: ", r2_score(y_test,y_pred))
    print("\n")

--------ridge--------
MAE:  794.0367139211663
MSE:  1160202.8631309173
RMSE:  1077.1271341540503
R2:  0.7220704745496984


--------lasso--------
MAE:  790.4457593724027
MSE:  1153392.6847701278
RMSE:  1073.9612119486103
R2:  0.7237018699721665


--------sgd--------
MAE:  3623572168304555.0
MSE:  2.4947244925451394e+31
RMSE:  4994721706506920.0
R2:  -5.976175515299316e+24


--------knn--------
MAE:  1252.2628349514562
MSE:  3101137.816091845
RMSE:  1761.0047745795139
R2:  0.2571146055815803


--------gradient--------
MAE:  547.9655821151299
MSE:  684907.8558047389
RMSE:  827.5915996460687
R2:  0.8359285937053278




In [395]:
model = RandomForestRegressor()

params = {'n_estimators': [10,30,40,50,100],
          'max_features': ["sqrt", 0.5],
          'max_depth': [15,20,25],
          'min_samples_leaf': [1,2,4,6,8,10]}

grid_search = GridSearchCV(model, param_grid=params, verbose=1, n_jobs=-1,cv=5)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [15, 20, 25],
                         'max_features': ['sqrt', 0.5],
                         'min_samples_leaf': [1, 2, 4, 6, 8, 10],
                         'n_estimators': [10, 30, 40, 50, 100]},
             verbose=1)

In [396]:
bestscore = grid_search.best_score_
print("Best GridSearch Score: ", bestscore)
best_rf = grid_search.best_estimator_
print("Best Estimator: ", best_rf)
print("Best RF SCORE: ", best_rf.score(X, y))


Best GridSearch Score:  0.8065805947520704
Best Estimator:  RandomForestRegressor(max_depth=15, max_features=0.5)
Best RF SCORE:  0.9470792929008124


In [397]:
X["predicted_sales"] = best_rf.predict(X)


In [398]:
X["actual_total_sales"] = y

In [399]:
X

Unnamed: 0_level_0,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday,predicted_sales,actual_total_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
2015-09-23,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0,373.636333,156.50
2015-09-24,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0,368.407578,141.80
2015-09-25,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0,1270.084111,1095.15
2015-09-26,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0,2879.928900,2588.05
2015-09-27,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0,2028.182982,1316.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0,785.018468,649.40
2021-05-07,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0,1983.912768,2154.00
2021-05-08,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0,3913.015389,6241.70
2021-05-09,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0,6004.893562,6611.10


In [273]:
import joblib

#autoMachineLearning
import h2o
from h2o.automl import H2OAutoML

In [336]:
h2o.init() #To start h2o


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 11 mins
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,"14 days, 15 hours and 24 minutes"
H2O_cluster_name:,H2O_from_python_fran_k55b44
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.896 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [401]:
h2train = h2o.H2OFrame(df4)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [402]:
x = list(df4.columns)
x.remove('total_sales')

y = "total_sales"

print("X:", x)
print("y:", y)

X: ['average_temp', 'total_precip_mm', 'did_rain', 'is_closed', 'is_lockdown', 'is_curfew', 'day_of_week_Friday', 'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday', 'month_name_April', 'month_name_August', 'month_name_December', 'month_name_February', 'month_name_January', 'month_name_July', 'month_name_June', 'month_name_March', 'month_name_May', 'month_name_November', 'month_name_October', 'month_name_September', 'year_2015.0', 'year_2016.0', 'year_2017.0', 'year_2018.0', 'year_2019.0', 'year_2020.0', 'year_2021.0', 'day_type_domingo', 'day_type_festivo', 'day_type_laborable', 'day_type_sábado', 'prev_sales', 'is_post_holiday', 'is_pre_holiday']
y: total_sales


In [404]:
#TRAINING all the h20 models

automl = H2OAutoML(max_models=40, max_runtime_secs=3600, sort_metric='RMSE')
automl.train(x=x, y=y, training_frame=h2train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [405]:
#Showing the best performers

leader_board = automl.leaderboard
leader_board.head()

model_id,rmse,mean_residual_deviance,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20210514_085030,891.485,794745,794745,544.785,
StackedEnsemble_BestOfFamily_AutoML_20210514_085030,902.258,814070,814070,547.353,
DeepLearning_grid__1_AutoML_20210514_085030_model_3,922.788,851538,851538,566.367,
DRF_1_AutoML_20210514_085030,935.071,874358,874358,573.63,0.776425
GBM_1_AutoML_20210514_085030,938.386,880568,880568,576.276,
GBM_2_AutoML_20210514_085030,940.211,883996,883996,579.173,
GBM_grid__1_AutoML_20210514_085030_model_1,943.466,890129,890129,585.612,
GBM_grid__1_AutoML_20210514_085030_model_3,943.482,890157,890157,579.35,
GBM_4_AutoML_20210514_085030,943.806,890770,890770,577.804,
GBM_3_AutoML_20210514_085030,944.913,892861,892861,573.935,




In [406]:
#Loading the TEST dataset

stacked_test = X
h2test_stacked = h2o.H2OFrame(stacked_test) #Conversion into a H20 frame to train
h2test_stacked.head() #preview

Parse progress: |█████████████████████████████████████████████████████████| 100%


average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday,predicted_sales,actual_total_sales
19.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.8,0,0,373.636,156.5
19.9,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.5,0,0,368.408,141.8
21.4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.8,0,0,1270.08,1095.15
20.8,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0,0,2879.93,2588.05
21.2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0,0,2028.18,1316.9
18.4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1316.9,0,0,1384.4,1929.0
18.1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1929.0,0,0,845.52,578.0
18.4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,578.0,0,0,770.641,552.2
18.4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,552.2,0,0,583.481,429.3
19.6,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,429.3,0,0,1834.36,1955.4




In [407]:
predicted_price_h2_stacked = automl.leader.predict(h2test_stacked).as_data_frame() #PREDICTING the Sales on the TEST dataset
predicted_price_h2_stacked #Result

stackedensemble prediction progress: |████████████████████████████████████| 100%


Unnamed: 0,predict
0,526.422368
1,443.888060
2,1403.973130
3,3138.984834
4,2116.813308
...,...
2052,880.873009
2053,1996.472619
2054,5402.138060
2055,5719.005857


In [409]:
pred = X
pred


Unnamed: 0_level_0,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
2015-09-23,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0
2015-09-24,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0
2015-09-25,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0
2015-09-26,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0
2015-09-27,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0
2021-05-07,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0
2021-05-08,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0
2021-05-09,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0


In [410]:
pred["total_sales"] = y

In [411]:
pred

Unnamed: 0_level_0,average_temp,total_precip_mm,did_rain,is_closed,is_lockdown,is_curfew,day_of_week_Friday,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,month_name_April,month_name_August,month_name_December,month_name_February,month_name_January,month_name_July,month_name_June,month_name_March,month_name_May,month_name_November,month_name_October,month_name_September,year_2015.0,year_2016.0,year_2017.0,year_2018.0,year_2019.0,year_2020.0,year_2021.0,day_type_domingo,day_type_festivo,day_type_laborable,day_type_sábado,prev_sales,is_post_holiday,is_pre_holiday,total_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
2015-09-23,19.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,95.80,0.0,0.0,156.50
2015-09-24,19.9,0.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,156.50,0.0,0.0,141.80
2015-09-25,21.4,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,141.80,0.0,0.0,1095.15
2015-09-26,20.8,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1095.15,0.0,0.0,2588.05
2015-09-27,21.2,0.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,2588.05,0.0,0.0,1316.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-06,27.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,751.70,0.0,0.0,649.40
2021-05-07,28.0,0.0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,649.40,0.0,0.0,2154.00
2021-05-08,29.0,0.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2154.00,0.0,0.0,6241.70
2021-05-09,22.0,1.5,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,6241.70,0.0,0.0,6611.10


In [412]:
pred = pred.set_index(predicted_price_h2_stacked.index)

In [413]:
pred['predict'] = predicted_price_h2_stacked['predict']


In [414]:
pred.dtypes

average_temp             float64
total_precip_mm          float64
did_rain                   int64
is_closed                  int64
is_lockdown                int64
is_curfew                  int64
day_of_week_Friday         uint8
day_of_week_Monday         uint8
day_of_week_Saturday       uint8
day_of_week_Sunday         uint8
day_of_week_Thursday       uint8
day_of_week_Tuesday        uint8
day_of_week_Wednesday      uint8
month_name_April           uint8
month_name_August          uint8
month_name_December        uint8
month_name_February        uint8
month_name_January         uint8
month_name_July            uint8
month_name_June            uint8
month_name_March           uint8
month_name_May             uint8
month_name_November        uint8
month_name_October         uint8
month_name_September       uint8
year_2015.0                uint8
year_2016.0                uint8
year_2017.0                uint8
year_2018.0                uint8
year_2019.0                uint8
year_2020.

In [415]:
pred.to_csv("data/h20_stacked_pred.csv", index=True)

In [413]:
df2.to_csv("data/final.csv", index=True)

In [416]:
X.to_csv("data/rndForrest.csv", index=True)