# Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# NOTE(review): this silences every warning for the whole session, so pandas
# chained-assignment warnings and library deprecation notices raised by later
# cells will not be shown.
warnings.filterwarnings("ignore")
# Allow up to 200 columns to be shown when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_squared_log_error

from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

# Read Data

### Gas price data

In [2]:
# Weekly gas price survey; both date columns are parsed as datetimes at load time.
data = pd.read_csv(
    '../01-Data/DataGas.csv',
    parse_dates=['Analysis_Date', 'Last_Day_of_Analyses_of_Week'],
)

In [3]:
# Preview the first rows of the gas price data.
data.head()

Unnamed: 0,Unnamed:_0,Analysis_Date,Last_Day_of_Analyses_of_Week,Macroregion,State,Product,No_of_Gas_Stations_Analyzed,Measurement_Unit,Mean_Price,Std_Dev,Min_Price,Max_Price,Mean_Price_Margin,Coefficient_of_variation,Mean_Dist_Price,Distribution_Std_Dev,Distribution_Min_Price,Distribution_Max_Price,Distribution_Coefficient_of_Variation,Month,Year
0,12064,2004-05-09,2004-05-15,CENTRO OESTE,DISTRITO FEDERAL,GASOLINA COMUM,128,R$/l,2.029,0.007,1.99,2.07,0.318,0.003,1.711,0.02,1.651,1.7427,0.012,5,2004
1,12065,2004-05-09,2004-05-15,CENTRO OESTE,GOIAS,GASOLINA COMUM,395,R$/l,2.025,0.062,1.85,2.22,0.296,0.031,1.729,0.036,1.6643,1.915,0.021,5,2004
2,12066,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO,GASOLINA COMUM,194,R$/l,2.358,0.066,2.0,2.54,0.472,0.028,1.886,0.068,1.75,2.0713,0.036,5,2004
3,12067,2004-05-09,2004-05-15,CENTRO OESTE,MATO GROSSO DO SUL,GASOLINA COMUM,166,R$/l,2.12,0.075,1.97,2.44,0.325,0.035,1.795,0.033,1.70701,1.9703,0.018,5,2004
4,12068,2004-05-09,2004-05-15,NORDESTE,ALAGOAS,GASOLINA COMUM,106,R$/l,2.09,0.034,2.0,2.159,0.35,0.016,1.74,0.042,1.6789,1.918,0.024,5,2004


In [4]:
# List the columns available in the gas price data.
data.columns

Index(['Unnamed:_0', 'Analysis_Date', 'Last_Day_of_Analyses_of_Week',
       'Macroregion', 'State', 'Product', 'No_of_Gas_Stations_Analyzed',
       'Measurement_Unit', 'Mean_Price', 'Std_Dev', 'Min_Price', 'Max_Price',
       'Mean_Price_Margin', 'Coefficient_of_variation', 'Mean_Dist_Price',
       'Distribution_Std_Dev', 'Distribution_Min_Price',
       'Distribution_Max_Price', 'Distribution_Coefficient_of_Variation',
       'Month', 'Year'],
      dtype='object')

### Economic Data

In [5]:
# Economic indicators keyed by `Date`, which is parsed as a datetime at load time.
economic_data = pd.read_csv(
    '../01-Data/economic_data.csv',
    parse_dates=['Date'],
)

In [6]:
# Preview the first rows of the economic indicators.
economic_data.head()

Unnamed: 0,Date,Oil_mean,Selic_mean,Dollar_mean,Gold_mean,Oil_median,Selic_median,Dollar_median,Gold_median,Oil_std,Selic_std,Dollar_std,Gold_std
0,2004-05-09,39.297143,16.0,3.014014,37.064286,39.41,16.0,2.9891,36.9,0.578179,0.0,0.059211,0.414151
1,2004-05-16,40.687143,16.0,3.113114,37.492857,40.94,16.0,3.113267,37.45,0.943112,0.0,0.011204,0.332757
2,2004-05-23,40.867143,16.0,3.158643,38.29,40.92,16.0,3.1805,38.3,0.664057,0.0,0.042535,0.569077
3,2004-05-30,40.693214,16.0,3.1375,38.778571,40.6,16.0,3.1516,38.7,0.939225,0.0,0.031106,0.305007
4,2004-06-06,39.843214,16.0,3.133371,38.828571,39.29,16.0,3.1294,38.7,1.59166,0.0,0.012524,0.541322


In [7]:
# List the columns available in the economic indicators.
economic_data.columns

Index(['Date', 'Oil_mean', 'Selic_mean', 'Dollar_mean', 'Gold_mean',
       'Oil_median', 'Selic_median', 'Dollar_median', 'Gold_median', 'Oil_std',
       'Selic_std', 'Dollar_std', 'Gold_std'],
      dtype='object')

# Join the Economic Data with the Gas Price Data

In [8]:
# Inner join on the analysis date: gas rows whose `Analysis_Date` has no matching
# `Date` in the economic data are dropped.
# NOTE(review): the cell rebinds `data`, so running it twice merges the economic
# columns a second time.
data = pd.merge(
    data,
    economic_data,
    how='inner',
    left_on='Analysis_Date',
    right_on='Date',
)

# Train and Validation Split (Simple Holdout)

In [9]:
# Temporal holdout: weeks ending before 2011-01-01 are used for training, weeks
# ending on or after that date for validation.
split_date = '2011-01-01'
week_end = data['Last_Day_of_Analyses_of_Week']

# Take explicit copies: the feature-engineering cell writes columns back into these
# frames, which on a plain boolean-mask slice is a chained assignment on a slice of
# `data` (pandas' SettingWithCopyWarning case).
data_train = data[week_end < split_date].copy()
data_valid = data[week_end >= split_date].copy()

# New Feature DataFrames for Train and Validation (Sharing the Index of the Original Data)

In [10]:
# Empty feature frames that reuse the row index of their source split, so columns
# computed from the source can be assigned by index alignment.
df_train, df_valid = (
    pd.DataFrame(index=source.index) for source in (data_train, data_valid)
)

# Feature Engineering

In [12]:
# Categorical columns that get one indicator column per label.
var_one_hot = ['Macroregion', 'State']
# Suffix patterns used to pick the economic indicator columns.
regex = ['_mean', '_median', '_std']

# (feature frame, source frame) pairs; the training pair comes first.
dataset_list = [
    (df_train, data_train),
    (df_valid, data_valid),
]

# Transformers applied to every economic column.
MM_scaler, SS_scaler, pow_trans = MinMaxScaler(), StandardScaler(), PowerTransformer()

def one_hot_encoding(model_dataset, sample_dataset, var):
    """Add one 0/1 indicator column per distinct label of ``sample_dataset[var]``.

    Columns are written into ``model_dataset`` in place and are named
    ``<var>_<label>``, in order of first appearance of each label. Values are
    assigned positionally, so both frames are expected to have the same rows in
    the same order. Nothing is returned.
    """
    values = sample_dataset[var]
    for label in values.unique():
        is_label = (values == label).to_numpy()
        model_dataset['_'.join((var, label))] = is_label.astype(int)


# Build the target and the features of each (feature frame, source frame) pair.
for model_dataset, sample_dataset in dataset_list:

    # NOTE(review): diff/rolling below follow the row order inside each State, so
    # they assume the rows are already chronological per State - confirm.
    mean_price_by_state = sample_dataset.groupby('State')['Mean_Price']

    # Target: first difference of the mean resale price, shifted so that every row
    # holds the change from its own price to the next row of the same State.
    # `transform` returns a result indexed like the input, whereas `apply` may
    # prepend the group key to the index and break the column assignment.
    model_dataset['diff_Mean_Price'] = mean_price_by_state.transform(
        lambda prices: prices.diff().shift(-1)
    )

    ## Features
    # Current mean price
    model_dataset['Current_Mean_Price'] = sample_dataset['Mean_Price']

    # Mean price margin: a bare '-' entry is treated as 0 before the float cast.
    # Computed on a local Series so the source frame is left untouched.
    mean_price_margin = sample_dataset['Mean_Price_Margin'].astype(str).replace('-', '0')
    model_dataset['Mean_Price_Margin'] = mean_price_margin.astype(float)

    # Seasonality, taken from the last day of the analysed week
    week_end = sample_dataset['Last_Day_of_Analyses_of_Week'].dt
    model_dataset['month'] = week_end.month
    model_dataset['day'] = week_end.day
    model_dataset['dayofyear'] = week_end.dayofyear
    model_dataset['year'] = week_end.year

    # Moving average of the mean price over the last 4 rows of each State
    model_dataset['Movel_Average_Mean_Price_4_weeks'] = (
        mean_price_by_state.rolling(4).mean().reset_index(level=0, drop=True)
    )

    # Economic columns: raw value plus min-max, standardised and power-transformed
    # versions. Every transformer is fitted on the training split only and then
    # applied to the frame being processed; fitting on `data_train` (rather than on
    # the feature frame) removes the dependency on the training pair being handled
    # first.
    for reg in regex:
        for col in economic_data.filter(regex=reg, axis=1).columns:
            model_dataset[col] = sample_dataset[col]
            train_values = data_train[[col]]
            for suffix, transformer in (('_MM', MM_scaler), ('_SS', SS_scaler), ('_PW', pow_trans)):
                transformer.fit(train_values)
                model_dataset[col + suffix] = transformer.transform(model_dataset[[col]])

    # One-hot encoding
    # NOTE(review): labels are taken from each split separately, so train and
    # validation only get the same indicator columns if both contain every label.
    for var in var_one_hot:
        one_hot_encoding(model_dataset, sample_dataset, var)

# Drop Missing Values

In [13]:
# Keep only the rows where the target and every feature are present.
df_train, df_valid = (
    df_train.dropna(axis=0, how='any'),
    df_valid.dropna(axis=0, how='any'),
)

# X, y Train and Validation Split

In [14]:
# Separate the target (next change of the mean price) from the feature columns.
ytr = df_train['diff_Mean_Price']
Xtr = df_train.drop(columns=['diff_Mean_Price'])

yval = df_valid['diff_Mean_Price']
Xval = df_valid.drop(columns=['diff_Mean_Price'])

# Features Evaluation

### With One Hot Encoding

In [15]:
# Score every single feature together with one set of one-hot columns
# (State_* or Macroregion_*) and record the validation error of each combination.

# Best variable / error seen across both encodings.
var_menor_erro = None
valor_menor_erro = 1000.

results = {'State_':{}, 'Macroregion_':{}}
one_hot_encode_params = ['State_', 'Macroregion_']

for param in one_hot_encode_params:
    # Indicator columns of the encoding under test, and those of the other encoding,
    # which are left out of this run entirely.
    param_exclude = next(other for other in one_hot_encode_params if other != param)
    one_hot_exclude = list(Xtr.filter(regex=param_exclude).columns)
    encode_columns = list(Xtr.filter(regex=param).columns)
    to_drop = encode_columns + one_hot_exclude
    feature_columns = Xtr.drop(to_drop, axis=1).columns

    # Best variable / error for THIS encoding only. Without a per-encoding tracker
    # the summary line of the second encoding could report a winner that was
    # actually measured with the first encoding.
    melhor_var_param = None
    menor_erro_param = 1000.

    print(f'One Hot Encoding Param: {param}')

    for var in feature_columns:
        column_list = list(encode_columns)
        column_list.append(var)
        mdl = LGBMRegressor(num_leaves=2, min_data_in_leaf=250, n_jobs=-1, random_state=0, n_estimators=500)
        mdl.fit(Xtr[column_list], ytr)
        p = mdl.predict(Xval[column_list])

        # The model predicts a price change; add the current price back so the
        # error is measured on price levels.
        p_final = Xval['Current_Mean_Price'] + p
        yval_final = Xval['Current_Mean_Price'] + yval

        # Root mean squared log error, in percent.
        erro = np.sqrt(mean_squared_log_error(yval_final, p_final)) * 100

        print("Variável: {} - Erro: {:.4f}\n".format(var, erro))

        if erro < menor_erro_param:
            melhor_var_param = var
            menor_erro_param = erro

        if erro < valor_menor_erro:
            var_menor_erro = var
            valor_menor_erro = erro

        results[param][var] = erro

    print("Melhor Variável: {} - Erro: {:.4f} | One Hot Encoding {}\n\n".format(melhor_var_param, menor_erro_param, param))

# One row per feature, one column per encoding ('State_result', 'Macroregion_result').
df_results = pd.DataFrame.from_dict(results, orient='index').T.rename(columns={param: param + 'result' for param in one_hot_encode_params})

One Hot Encoding Param: State_
Variável: Current_Mean_Price - Erro: 1.0055

Variável: Mean_Price_Margin - Erro: 0.9919

Variável: month - Erro: 0.9854

Variável: day - Erro: 0.9866

Variável: dayofyear - Erro: 0.9864

Variável: year - Erro: 0.9873

Variável: Movel_Average_Mean_Price_4_weeks - Erro: 1.0021

Variável: Oil_mean - Erro: 0.9866

Variável: Oil_mean_MM - Erro: 0.9864

Variável: Oil_mean_SS - Erro: 0.9866

Variável: Oil_mean_PW - Erro: 0.9865

Variável: Selic_mean - Erro: 0.9881

Variável: Selic_mean_MM - Erro: 0.9881

Variável: Selic_mean_SS - Erro: 0.9881

Variável: Selic_mean_PW - Erro: 0.9880

Variável: Dollar_mean - Erro: 0.9875

Variável: Dollar_mean_MM - Erro: 0.9875

Variável: Dollar_mean_SS - Erro: 0.9874

Variável: Dollar_mean_PW - Erro: 0.9874

Variável: Gold_mean - Erro: 0.9847

Variável: Gold_mean_MM - Erro: 0.9848

Variável: Gold_mean_SS - Erro: 0.9848

Variável: Gold_mean_PW - Erro: 0.9847

Variável: Oil_median - Erro: 0.9865

Variável: Oil_median_MM - Erro: 0.9

### State Encode

In [16]:
# Features whose error with the State encoding (first results column) is below the baseline.
baseline = 0.98587
state_result = df_results.iloc[:, 0]
state_result[state_result < baseline].sort_values()

Gold_mean         0.984656
Gold_mean_PW      0.984688
Gold_mean_MM      0.984820
Gold_mean_SS      0.984843
Gold_median       0.985058
Gold_median_MM    0.985058
Gold_median_SS    0.985058
Gold_median_PW    0.985058
month             0.985378
Selic_std         0.985865
Name: State_result, dtype: float64

### Macroregion Encode

In [17]:
# Features whose error with the Macroregion encoding (second results column) is below the baseline.
baseline = 0.98587
macroregion_result = df_results.iloc[:, 1]
macroregion_result[macroregion_result < baseline].sort_values()

Gold_mean         0.984656
Gold_mean_PW      0.984688
Gold_mean_MM      0.984820
Gold_mean_SS      0.984843
Gold_median       0.985058
Gold_median_MM    0.985058
Gold_median_SS    0.985058
Gold_median_PW    0.985058
month             0.985389
Selic_std         0.985830
Name: Macroregion_result, dtype: float64

### Without One-Hot Encoding

In [18]:
# Score every non-indicator feature on its own and record its validation error.
var_menor_erro = None
valor_menor_erro = 1000.

results = {}

# Derive the candidate features here instead of reusing the loop variable left
# behind by the previous evaluation cell: every column except the one-hot indicators.
one_hot_columns = list(Xtr.filter(regex='State_|Macroregion_').columns)
feature_columns = Xtr.drop(one_hot_columns, axis=1).columns

for var in feature_columns:
    mdl = LGBMRegressor(num_leaves=2, min_data_in_leaf=250, n_jobs=-1, random_state=0, n_estimators=500)
    mdl.fit(Xtr[[var]], ytr)
    p = mdl.predict(Xval[[var]])

    # The model predicts a price change; add the current price back so the error
    # is measured on price levels.
    p_final = Xval['Current_Mean_Price'] + p
    yval_final = Xval['Current_Mean_Price'] + yval

    # Root mean squared log error, in percent.
    erro = np.sqrt(mean_squared_log_error(yval_final, p_final)) * 100

    print("Variável: {} - Erro: {:.4f}\n".format(var, erro))

    if erro < valor_menor_erro:
        var_menor_erro = var
        valor_menor_erro = erro

    results[var] = erro

print("Melhor Variável: {} - Erro: {:.4f}\n".format(var_menor_erro, valor_menor_erro))

Variável: Current_Mean_Price - Erro: 0.9918

Variável: Mean_Price_Margin - Erro: 0.9872

Variável: month - Erro: 0.9854

Variável: day - Erro: 0.9866

Variável: dayofyear - Erro: 0.9864

Variável: year - Erro: 0.9872

Variável: Movel_Average_Mean_Price_4_weeks - Erro: 0.9913

Variável: Oil_mean - Erro: 0.9866

Variável: Oil_mean_MM - Erro: 0.9864

Variável: Oil_mean_SS - Erro: 0.9866

Variável: Oil_mean_PW - Erro: 0.9865

Variável: Selic_mean - Erro: 0.9881

Variável: Selic_mean_MM - Erro: 0.9881

Variável: Selic_mean_SS - Erro: 0.9881

Variável: Selic_mean_PW - Erro: 0.9880

Variável: Dollar_mean - Erro: 0.9875

Variável: Dollar_mean_MM - Erro: 0.9875

Variável: Dollar_mean_SS - Erro: 0.9874

Variável: Dollar_mean_PW - Erro: 0.9874

Variável: Gold_mean - Erro: 0.9847

Variável: Gold_mean_MM - Erro: 0.9848

Variável: Gold_mean_SS - Erro: 0.9848

Variável: Gold_mean_PW - Erro: 0.9847

Variável: Oil_median - Erro: 0.9865

Variável: Oil_median_MM - Erro: 0.9864

Variável: Oil_median_SS - 

In [19]:
# One row per feature with its validation error; show those below the baseline, best first.
baseline = 0.98587
df_results = pd.DataFrame.from_dict(results, orient='index', columns=['result'])
df_results.loc[lambda frame: frame['result'] < baseline].sort_values('result')

Unnamed: 0,result
Gold_mean,0.984656
Gold_mean_PW,0.984688
Gold_mean_MM,0.98482
Gold_mean_SS,0.984843
Gold_median,0.985058
Gold_median_MM,0.985058
Gold_median_SS,0.985058
Gold_median_PW,0.985058
month,0.985392
Selic_std,0.985834


### Model With Some Features and the State Encode

In [21]:
# Selected features plus every State indicator column.
encode_columns = Xtr.filter(regex='State_').columns
enc_columns = encode_columns.tolist()
feat_columns = ['Gold_mean', 'Selic_std', 'year', 'Mean_Price_Margin'] + enc_columns

lgbm_params = dict(num_leaves=2, min_data_in_leaf=250, n_jobs=-1, random_state=0, n_estimators=500)
mdl = LGBMRegressor(**lgbm_params)
mdl.fit(Xtr[feat_columns], ytr)
p = mdl.predict(Xval[feat_columns])

# The model predicts a price change; add the current price back to compare price levels.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

# Root mean squared log error, in percent.
np.sqrt(mean_squared_log_error(yval_final, p_final)) * 100



0.9863951184591437

### Model With Some Features and the Macroregion Encode

In [22]:
# Selected features plus every Macroregion indicator column.
encode_columns = Xtr.filter(regex='Macroregion_').columns
enc_columns = encode_columns.tolist()
feat_columns = ['Gold_mean', 'Selic_std', 'year', 'Mean_Price_Margin'] + enc_columns

lgbm_params = dict(num_leaves=2, min_data_in_leaf=250, n_jobs=-1, random_state=0, n_estimators=500)
mdl = LGBMRegressor(**lgbm_params)
mdl.fit(Xtr[feat_columns], ytr)
p = mdl.predict(Xval[feat_columns])

# The model predicts a price change; add the current price back to compare price levels.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

# Root mean squared log error, in percent.
np.sqrt(mean_squared_log_error(yval_final, p_final)) * 100



0.9844301222960156

### Model With Some Features and without Encode

In [23]:
# Selected features only, without any indicator columns.
feat_columns = ['Gold_mean', 'Selic_std', 'year', 'Mean_Price_Margin']

lgbm_params = dict(num_leaves=2, min_data_in_leaf=250, n_jobs=-1, random_state=0, n_estimators=500)
mdl = LGBMRegressor(**lgbm_params)
mdl.fit(Xtr[feat_columns], ytr)
p = mdl.predict(Xval[feat_columns])

# The model predicts a price change; add the current price back to compare price levels.
current_price = Xval['Current_Mean_Price']
p_final = current_price + p
yval_final = current_price + yval

# Root mean squared log error, in percent.
np.sqrt(mean_squared_log_error(yval_final, p_final)) * 100



0.9841642364455628