# Libraries

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Notebook-wide runtime settings.
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below - consider narrowing it to specific categories.
warnings.simplefilter("ignore")
# Render up to 200 columns when displaying wide DataFrames.
pd.options.display.max_columns = 200

from sklearn.metrics import mean_squared_log_error

from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge

# Read Data

In [28]:
# Load the CSV; the two date columns are parsed into datetimes at read time.
date_columns = ['Analysis_Date', 'Last_Day_of_Analyses_of_Week']
data = pd.read_csv('../01-Data/DataGas.csv', parse_dates=date_columns)

In [29]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,Unnamed:_0,Analysis_Date,Last_Day_of_Analyses_of_Week,Macroregion,State,Product,No_of_Gas_Stations_Analyzed,Measurement_Unit,Mean_Price,Std_Dev,Min_Price,Max_Price,Mean_Price_Margin,Coefficient_of_variation,Mean_Dist_Price,Distribution_Std_Dev,Distribution_Min_Price,Distribution_Max_Price,Distribution_Coefficient_of_Variation,Month,Year
0,12064,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,128,R$/l,2.029,0.007,1.99,2.07,0.318,0.003,1.711,0.02,1.651,1.7427,0.012,5,2004
1,12065,2004-05-09,2004-05-15,CENTRO OESTE,GOIAS,GASOLINA COMUM,395,R$/l,2.025,0.062,1.85,2.22,0.296,0.031,1.729,0.036,1.6643,1.915,0.021,5,2004
2,12066,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO,GASOLINA COMUM,194,R$/l,2.358,0.066,2.0,2.54,0.472,0.028,1.886,0.068,1.75,2.0713,0.036,5,2004
3,12067,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO DO SUL,GASOLINA COMUM,166,R$/l,2.12,0.075,1.97,2.44,0.325,0.035,1.795,0.033,1.70701,1.9703,0.018,5,2004
4,12068,2004-05-09,2004-05-15,NORDESTE,ALAGOAS,GASOLINA COMUM,106,R$/l,2.09,0.034,2.0,2.159,0.35,0.016,1.74,0.042,1.6789,1.918,0.024,5,2004


In [30]:
# List the column labels available in the loaded table.
data.columns

Index(['Unnamed:_0', 'Analysis_Date', 'Last_Day_of_Analyses_of_Week',
       'Macroregion', 'State', 'Product', 'No_of_Gas_Stations_Analyzed',
       'Measurement_Unit', 'Mean_Price', 'Std_Dev', 'Min_Price', 'Max_Price',
       'Mean_Price_Margin', 'Coefficient_of_variation', 'Mean_Dist_Price',
       'Distribution_Std_Dev', 'Distribution_Min_Price',
       'Distribution_Max_Price', 'Distribution_Coefficient_of_Variation',
       'Month', 'Year'],
      dtype='object')

# Train and Validation Split (Simple Holdout)

In [31]:
# Simple temporal holdout: rows whose week ends before the split date go to
# training, rows on or after it go to validation.
week_end = data['Last_Day_of_Analyses_of_Week']
split_date = '2011-01-01'

data_train = data.loc[week_end < split_date]
data_valid = data.loc[week_end >= split_date]

# Show both shapes as a quick check of the split sizes.
data_train.shape, data_valid.shape

((9233, 21), (11961, 21))

# New DataFrame for Train and Validation (Index: original Data)

In [32]:
# Start each feature frame empty but with the row labels of its split, so
# columns assigned from `data_train` / `data_valid` align by index.
df_train, df_valid = (
    pd.DataFrame(index=split.index) for split in (data_train, data_valid)
)

# Target

## First Difference of Average Resale Price

In [33]:
def _next_step_change(frame, key='State', value='Mean_Price'):
    """Return, for every row, the change in `value` to the next row of the same `key` group.

    Computed as ``next_value - current_value`` using the existing row order
    within each group, which equals ``diff().shift(-1)`` applied per group.
    The last row of each group has no successor and therefore gets NaN.

    The result keeps `frame`'s original index, so it can be assigned directly
    as a column of a frame that shares that index. This avoids
    ``groupby(...).apply(...)``, whose result index gains the group keys on
    recent pandas versions and then no longer aligns on assignment.

    NOTE(review): rows are assumed to be in chronological order within each
    group - confirm, or sort by date before calling.
    """
    following = frame.groupby(key)[value].shift(-1)
    return following - frame[value]


# Target: change in mean price from the current row to the next one, per state.
df_train['diff_Mean_Price'] = _next_step_change(data_train)
df_valid['diff_Mean_Price'] = _next_step_change(data_valid)

# Features

## Current Mean Price

In [34]:
# Carry the current mean price over as a feature for both splits; it is also
# the base used later to turn predicted changes back into price levels.
for _features, _raw in ((df_train, data_train), (df_valid, data_valid)):
    _features['Current_Mean_Price'] = _raw['Mean_Price']

## Seasonality

In [35]:
# Calendar features taken from the week-ending date, built identically for
# the training and validation frames.
# NOTE(review): if the week-ending date always falls on the same weekday,
# the `weekday` column is constant - verify.
for _features, _raw in ((df_train, data_train), (df_valid, data_valid)):
    _week_end = _raw['Last_Day_of_Analyses_of_Week'].dt
    for _part in ('month', 'day', 'weekday', 'dayofyear'):
        _features[_part] = getattr(_week_end, _part)

## Moving Average

In [36]:
# Exploratory look at the 4-row rolling mean of the price within each state.
by_state_price = data_train.groupby(['State'])['Mean_Price']
roll = by_state_price.rolling(4).mean()

# Dropping every index level leaves a plain positional index, so the labels
# shown in this preview are positions, not the original row labels.
roll.reset_index(drop=True).head()

0        NaN
1        NaN
2        NaN
3    2.42475
4    2.43700
Name: Mean_Price, dtype: float64

In [37]:
# 4-row rolling mean of the price within each state, for both splits.
# The grouped rolling result is indexed by (State, original row label);
# dropping the State level restores the original labels so the assignment
# aligns with the feature frame.
for _features, _raw in ((df_train, data_train), (df_valid, data_valid)):
    _rolling_mean = _raw.groupby(['State'])['Mean_Price'].rolling(4).mean()
    _features['Movel_Average_Mean_Price_4_weeks'] = _rolling_mean.droplevel(0)

In [38]:
# Count missing values per column in the training features.
df_train.isna().sum()

diff_Mean_Price                     27
Current_Mean_Price                   0
month                                0
day                                  0
weekday                              0
dayofyear                            0
Movel_Average_Mean_Price_4_weeks    81
dtype: int64

In [39]:
# Keep only fully populated rows in both frames (any row containing a
# missing value is removed).
df_train, df_valid = (
    frame.dropna(axis=0, how='any') for frame in (df_train, df_valid)
)

## X, y Train and Validation Split

In [40]:
# Separate the target column from the feature columns for each split.
target_col = 'diff_Mean_Price'

Xtr, Xval = df_train.drop(columns=target_col), df_valid.drop(columns=target_col)
ytr, yval = df_train[target_col], df_valid[target_col]

# Model

## RandomForestRegressor

In [41]:
# Random forest with 500 trees and a fixed seed; `fit` returns the estimator,
# so construction and training are chained.
mdl = RandomForestRegressor(
    n_estimators=500,
    random_state=0,
    n_jobs=-1,
).fit(Xtr, ytr)

# Predicted price change for every validation row.
p = mdl.predict(Xval)

#### Metric

In [42]:
# Turn predicted and actual price changes back into price levels by adding
# the current price, then report the RMSLE multiplied by 100.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

100 * np.sqrt(mean_squared_log_error(y_true=yval_final, y_pred=p_final))

1.0911770508576566

## LGBMRegressor

In [43]:
# Gradient boosting with 500 two-leaf trees, a 250-row leaf minimum and a
# fixed seed.
lgbm_params = dict(
    n_estimators=500,
    num_leaves=2,
    min_data_in_leaf=250,
    random_state=0,
    n_jobs=-1,
)
mdl = LGBMRegressor(**lgbm_params).fit(Xtr, ytr)

# Predicted price change for every validation row.
p = mdl.predict(Xval)



#### Metric

In [44]:
# Turn predicted and actual price changes back into price levels by adding
# the current price, then report the RMSLE multiplied by 100.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

100 * np.sqrt(mean_squared_log_error(y_true=yval_final, y_pred=p_final))

0.992484189867061

## LinearRegression

In [45]:
# Ordinary least squares with default settings.
mdl = LinearRegression().fit(Xtr, ytr)

# Predicted price change for every validation row.
p = mdl.predict(Xval)

#### Metric

In [46]:
# Turn predicted and actual price changes back into price levels by adding
# the current price, then report the RMSLE multiplied by 100.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

100 * np.sqrt(mean_squared_log_error(y_true=yval_final, y_pred=p_final))

1.1032292901762215

## Lasso

In [47]:
# L1-regularised linear regression with default settings.
# NOTE(review): the default penalty strength on unscaled features may shrink
# every coefficient to zero, leaving an intercept-only model - inspect
# `mdl.coef_` to confirm before comparing against the other models.
mdl = Lasso().fit(Xtr, ytr)

# Predicted price change for every validation row.
p = mdl.predict(Xval)

#### Metric

In [48]:
# Turn predicted and actual price changes back into price levels by adding
# the current price, then report the RMSLE multiplied by 100.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

100 * np.sqrt(mean_squared_log_error(y_true=yval_final, y_pred=p_final))

0.9857612576590822

## Ridge

In [49]:
# L2-regularised linear regression with default settings.
mdl = Ridge().fit(Xtr, ytr)

# Predicted price change for every validation row.
p = mdl.predict(Xval)

#### Metric

In [50]:
# Turn predicted and actual price changes back into price levels by adding
# the current price, then report the RMSLE multiplied by 100.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

100 * np.sqrt(mean_squared_log_error(y_true=yval_final, y_pred=p_final))

1.0995291008925445

## BayesianRidge

In [51]:
# Bayesian ridge regression with default settings.
mdl = BayesianRidge().fit(Xtr, ytr)

# Predicted price change for every validation row.
p = mdl.predict(Xval)

#### Metric

In [52]:
# Turn predicted and actual price changes back into price levels by adding
# the current price, then report the RMSLE multiplied by 100.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

100 * np.sqrt(mean_squared_log_error(y_true=yval_final, y_pred=p_final))

1.1023114612438818