# Business Case - 4Intelligence

## Model to Predict ....

In [1]:
# installing libs and packages

!pip install sklearn -q

You should consider upgrading via the 'c:\users\gabriel\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [26]:
# importing libs

import pandas as pd
import numpy as np
from numpy import mean, median
from datetime import datetime , date
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression ,ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load dataset

data_path = 'E:\\Projects\\4intelligence_case\\data\\Bases_Final_ADS_Jun2021.xlsx'
df = pd.read_excel(data_path, sheet_name='dados')

## Feature Engineering

In [4]:
# change datetime to date
df['data_tidy'] = df['data_tidy'].apply(lambda x: x.date())

In [5]:
# replace nan values to zero
df = df.fillna(0)

In [6]:
# creating year, month features

df['ano'] = df['data_tidy'].apply(lambda x: x.year)
df['mes'] = df['data_tidy'].apply(lambda x: x.month)

In [7]:
# creating new features

# total energy consumed by business category
df['com_total'] = df['com_co'] + df['com_n'] + df['com_ne'] + df['com_s'] + df['com_se']

# total energy consumed by industrial category
df['ind_total'] = df['ind_co'] + df['ind_n'] + df['ind_ne'] + df['ind_s'] + df['ind_se']

# total energy consumed by residential category
df['res_total'] = df['res_co'] + df['res_n'] + df['res_ne'] + df['res_s'] + df['res_se']

# total energy consumed
df['total'] = df['res_total'] + df['ind_total'] + df['com_total']

In [8]:
# creating new features about each region
# consumption total by region

# SE
df['se_total'] = df['com_se'] + df['ind_se'] + df['com_se']
# S
df['s_total'] = df['com_s'] + df['ind_s'] + df['com_s']
# NE
df['ne_total'] = df['com_ne'] + df['ind_ne'] + df['com_ne']
# N
df['n_total'] = df['com_n'] + df['ind_n'] + df['com_n']
# CO
df['co_total'] = df['com_co'] + df['ind_co'] + df['com_co']

In [9]:
# new features about temperature

# max tempearature in Brazil
df['max_temp'] = df[['temp_max_n','temp_max_ne','temp_max_co','temp_max_se','temp_max_s']].max(axis =1)
# min tempearature in Brazil
df['min_temp'] = df[['temp_max_n','temp_max_ne','temp_max_co','temp_max_se','temp_max_s']].min(axis =1)
# delta - the difference
df['delta'] = df['max_temp'] - df['min_temp']
# the mean -min-max
df['media_temp'] = (df['max_temp'] + df['min_temp'])/2

In [10]:
# split train dataset

df_train = df.loc[df['data_tidy'] <= date(2021,2,1)]

In [11]:
# creating correlation matrix
mc = df_train.corr(method = 'pearson')

In [12]:
# strongests correlations between industrial energy consumption SE
mc['ind_se'].sort_values(ascending=False)

ind_se         1.000000
ind_total      0.885493
pim_se         0.793250
ind_ne         0.754299
pim_s          0.683477
pim_ne         0.450059
pim_n          0.316463
se_total       0.313763
pmc_a_co       0.301078
ne_total       0.287529
mes            0.281097
pmc_a_ne       0.229778
pmc_a_se       0.218574
pmc_r_co       0.216244
ind_s          0.201316
delta          0.192139
max_temp       0.181328
temp_max_n     0.180960
pmc_a_n        0.156524
pmc_r_ne       0.130908
temp_max_co    0.116144
ind_n          0.104751
pmc_a_s        0.102949
pmc_r_n        0.067781
total          0.060419
pim_co         0.047890
pmc_r_se       0.046026
pop_ocup_br    0.044074
du             0.036097
temp_max_ne   -0.011721
s_total       -0.019891
n_total       -0.036934
pmc_r_s       -0.059158
temp_max_se   -0.076352
com_se        -0.088309
ind_co        -0.091900
co_total      -0.108182
media_temp    -0.110655
com_co        -0.115085
com_total     -0.123896
res_se        -0.127810
temp_min_ne   -0

In [13]:
# selecting features and target

features = df_train[['pim_se']]

label = df_train['ind_se'].values

In [14]:
# scalling features

scaler = StandardScaler()
features = scaler.fit_transform(features)

In [19]:
# splitting the features

train_features, test_features, train_labels, test_labels = train_test_split(features , label, 
                                                                            test_size = 0.25, 
                                                                            random_state = 0)

In [20]:
# model 1: Linear Regression

model1 = LinearRegression()
model1.fit(train_features, train_labels)

LinearRegression()

In [23]:
# predictions

predictions = model1.predict(test_features)

In [25]:
# evaluate the model 1

mae = mean_absolute_error(test_labels , predictions)
mse = mean_squared_error(test_labels , predictions)
print('Linear Regression MAE: ', mae)
print('Linear Regression MSE: ', mse)


Linear Regression MAE:  240.4548413577102
Linear Regression MSE:  84187.4931919511


In [None]:
# model 2: ElasticNet

# test all hyperparams below
param_grid = [{'l1_ratio':[0.25, 0.5, 0.75, 1],
               'max_iter':[500, 750, 1000, 2000, 5000]}]

GS = GridSearchCV(reg , param_grid , scoring= 'neg_mean_absolute_error' , error_score= 'raise', cv=5)
GS.fit(train_features, train_labels)
print(GS.best_params_)