In [79]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.model_selection import train_test_split
from darts.models import AutoARIMA
import darts
from sklearn.metrics import mean_absolute_error as mae, mean_absolute_percentage_error as mape
warnings.filterwarnings('ignore')

In [2]:
def load_data_comp_train():
    train = pd.read_csv('../data/train.csv')
    client = pd.read_csv('../data/client.csv')
    electricity_prices = pd.read_csv('../data/electricity_prices.csv')
    forecast_weather = pd.read_csv('../data/forecast_weather.csv')
    gas_prices = pd.read_csv('../data/gas_prices.csv')
    historical_weather = pd.read_csv('../data/historical_weather.csv')

    return (train, client, electricity_prices, forecast_weather,
            gas_prices, historical_weather)


def load_data_comp_test():
    test = pd.read_csv('../data/example_test_files/test.csv')
    client_test = pd.read_csv('../data/example_test_files/client.csv')
    electricity_prices_test = pd.read_csv('../data/example_test_files/electricity_prices.csv')
    forecast_weather_test = pd.read_csv('../data/example_test_files/forecast_weather.csv')
    gas_prices_test = pd.read_csv('../data/example_test_files/gas_prices.csv')
    historical_weather_test = pd.read_csv('../data/historical_weather.csv')

    return (test, client_test, electricity_prices_test, forecast_weather_test,
            gas_prices_test, historical_weather_test)

In [3]:
train, client, electricity_prices, forecast_weather, gas_prices, historical_weather = load_data_comp_train()
test, client_test, electricity_prices_test, forecast_weather_test, gas_prices_test, historical_weather_test = load_data_comp_test()

# Autorregresive Models

In [4]:
train.head()


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
1,0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
2,0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [5]:
# Realizamos el análisis para los segmentos siempre activos
activos = ['31', '51', '45', '8', '15', '35', '56', '20', '60', '50', '32', '6', '18', '55', '57', '4', '54', '42', '28', '33', '14', '1', '34', '39', '48', '3', '53', '29', '37', '16', '43', '5', '46', '58', '23', '24', '10', '7', '2', '36', '49', '27', '12', '30', '59', '19', '22', '52', '0', '11', '25', '9', '13', '38', '17', '40']

In [6]:
# Para cada county, is_business, product_type, is_consumption, datetime se hará un modelo de ts pata la variable target
cols_to_model = ['county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'target']
cols_to_group = ['county', 'is_business', 'product_type', 'is_consumption']
data_model = train.copy().loc[train.prediction_unit_id.astype(str).isin(activos)][cols_to_model]

In [115]:
unique_combinations = data_model[cols_to_group].drop_duplicates()
unique_combinations

Unnamed: 0,county,is_business,product_type,is_consumption
0,0,0,1,0
1,0,0,1,1
2,0,0,2,0
3,0,0,2,1
4,0,0,3,0
...,...,...,...,...
117,15,0,3,1
118,15,1,1,0
119,15,1,1,1
120,15,1,3,0


# Primer modelo

In [117]:
df_exp = data_model.copy().loc[data_model[cols_to_group].eq(unique_combinations.iloc[0]).all(axis=1)]

In [116]:
# df_exp.reset_index(inplace=True)
# df_exp.set_index('datetime', inplace=True)

In [104]:
results_ = pd.DataFrame(columns=['county', 'is_business', 'product_type', 'is_consumption', 'Model' , 'MAPE', 'MAE'])

In [38]:
df_exp.head()

Unnamed: 0,county,is_business,product_type,is_consumption,datetime,target
0,0,0,1,0,2021-09-01 00:00:00,0.713
122,0,0,1,0,2021-09-01 01:00:00,1.132
244,0,0,1,0,2021-09-01 02:00:00,0.49
366,0,0,1,0,2021-09-01 03:00:00,0.496
488,0,0,1,0,2021-09-01 04:00:00,0.149


In [55]:
df_exp['target'].interpolate(method='linear', inplace=True)

In [56]:
train_size = int(0.75 * df_exp.shape[0])
train_data_ext, test_data_ext = df_exp[:train_size], df_exp[train_size:]

In [57]:
ts_exp=darts.TimeSeries.from_dataframe(df=train_data_ext, value_cols='target' , time_col='datetime')

In [59]:
model = AutoARIMA()

In [60]:
model.fit(ts_exp)

AutoARIMA(add_encoders=None)

In [118]:
model.model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,11484.0
Model:,"SARIMAX(5, 1, 0)",Log Likelihood,-65925.679
Date:,"Sun, 28 Apr 2024",AIC,131863.359
Time:,22:36:58,BIC,131907.45
Sample:,0,HQIC,131878.181
,- 11484,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.8527,0.004,236.286,0.000,0.846,0.860
ar.L2,-0.1495,0.004,-34.583,0.000,-0.158,-0.141
ar.L3,-0.0064,0.005,-1.306,0.192,-0.016,0.003
ar.L4,-0.0233,0.006,-3.821,0.000,-0.035,-0.011
ar.L5,-0.1203,0.005,-22.652,0.000,-0.131,-0.110
sigma2,5680.3936,27.607,205.762,0.000,5626.286,5734.501

0,1,2,3
Ljung-Box (L1) (Q):,1.13,Jarque-Bera (JB):,130966.44
Prob(Q):,0.29,Prob(JB):,0.0
Heteroskedasticity (H):,26.32,Skew:,-0.13
Prob(H) (two-sided):,0.0,Kurtosis:,19.54


In [61]:
forecast = model.predict(len(test_data_ext))

In [105]:
# metricas, mape, mae
mape_value = mape(forecast.pd_dataframe()['target'].values, test_data_ext['target'].values)
mae_value = mae(forecast.pd_dataframe()['target'].values, test_data_ext['target'].values)
print(str(model.model.summary().tables[0][1][1]))
print(f'MAPE: {mape_value}, MAE: {mae_value}')

SARIMAX(5, 1, 0)
MAPE: 30.497431306091933, MAE: 365.99685838409374


In [None]:
str(model.model.summary().tables[0][1][1])

In [112]:
pd_tm = pd.DataFrame({'county': unique_combinations.iloc[0]['county'],
                      'is_business': unique_combinations.iloc[0]['is_business'], 
                      'product_type': unique_combinations.iloc[0]['product_type'], 
                      'is_consumption': unique_combinations.iloc[0]['is_consumption'], 
                      'Model': str(model.model.summary().tables[0][1][1]), 
                      'MAPE': mape_value, 'MAE': mae_value}, index=[0])

In [113]:
results_ = pd.concat([results_, pd_tm], ignore_index=False)

In [114]:
results_

Unnamed: 0,county,is_business,product_type,is_consumption,Model,MAPE,MAE
0,0,0,1,0,"SARIMAX(5, 1, 0)",30.497431,365.996858


# Iteración para todas las combinaciones

In [119]:
results_ = pd.DataFrame(columns=['county', 'is_business', 'product_type', 'is_consumption', 'Model' , 'MAPE', 'MAE'])

In [120]:
for row in range(unique_combinations.shape[0]):
    df_exp = data_model.copy().loc[data_model[cols_to_group].eq(unique_combinations.iloc[row]).all(axis=1)]
    df_exp['target'].interpolate(method='linear', inplace=True)
    train_size = int(0.75 * df_exp.shape[0])
    train_data_ext, test_data_ext = df_exp[:train_size], df_exp[train_size:]
    ts_exp=darts.TimeSeries.from_dataframe(df=train_data_ext, value_cols='target' , time_col='datetime')
    model = AutoARIMA()
    model.fit(ts_exp)
    forecast = model.predict(len(test_data_ext))
    mape_value = mape(forecast.pd_dataframe()['target'].values, test_data_ext['target'].values)
    mae_value = mae(forecast.pd_dataframe()['target'].values, test_data_ext['target'].values)
    pd_tm = pd.DataFrame({'county': unique_combinations.iloc[row]['county'],
                          'is_business': unique_combinations.iloc[row]['is_business'], 
                          'product_type': unique_combinations.iloc[row]['product_type'], 
                          'is_consumption': unique_combinations.iloc[row]['is_consumption'], 
                          'Model': str(model.model.summary().tables[0][1][1]), 
                          'MAPE': mape_value, 'MAE': mae_value}, index=[row])
    results_ = pd.concat([results_, pd_tm], ignore_index=False)

In [132]:
results_

Unnamed: 0,county,is_business,product_type,is_consumption,Model,MAPE,MAE
0,0,0,1,0,"SARIMAX(5, 1, 0)",30.4974,365.9969
1,0,0,1,1,"SARIMAX(3, 1, 2)",0.3299,263.7835
2,0,0,2,0,"SARIMAX(5, 1, 0)",7.4103,4.6738
3,0,0,2,1,"SARIMAX(5, 1, 3)",0.4758,16.0097
4,0,0,3,0,"SARIMAX(5, 1, 0)",17.9910,1169.6682
...,...,...,...,...,...,...,...
107,15,0,3,1,"SARIMAX(5, 1, 5)",0.3254,51.4981
108,15,1,1,0,"SARIMAX(1, 1, 0)",9.2357,50.2766
109,15,1,1,1,"SARIMAX(4, 1, 1)",0.9012,84.8166
110,15,1,3,0,"SARIMAX(5, 1, 0)",21.5064,113.8193


In [122]:
results_.to_csv('results_arima.csv', index=False)

#### MAPE & MAE By County

In [130]:
# suppress scientific notation by setting float_format
pd.options.display.float_format = '{:.4f}'.format

In [147]:
mp_cn = results_.groupby('county').agg({'MAPE': 'median', 'MAE': 'median'}).sort_values(by='MAPE', ascending=False)
mp_cn

Unnamed: 0_level_0,MAPE,MAE
county,Unnamed: 1_level_1,Unnamed: 2_level_1
12,3.31014784379739e+16,115.2858
6,46.975,67.7329
9,17.5159,66.9053
7,9.6499,149.8007
1,8.8309,29.128
10,6.2832,78.9838
13,5.4847,65.9131
15,5.0685,68.1574
2,4.4575,59.5641
0,3.943,441.3207


#### MAPE & MAE By is_business

In [143]:
results_.groupby('is_business').agg({'MAPE': 'mean', 'MAE': 'mean'}).sort_values(by='MAPE', ascending=False)

Unnamed: 0_level_0,MAPE,MAE
is_business,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1182195658499123.2,286.8944
0,10.1848,125.8681


#### MAPE & MAE By product_type

In [144]:
results_.groupby('product_type').agg({'MAPE': 'mean', 'MAE': 'mean'}).sort_values(by='MAPE', ascending=False)

Unnamed: 0_level_0,MAPE,MAE
product_type,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1103382614599167.6,305.9824
0,86.8778,201.7814
1,19.8005,80.0995
2,3.943,10.3418


#### MAPE & MAE By is_consumption

In [145]:
results_.groupby('is_consumption').agg({'MAPE': 'mean', 'MAE': 'mean'}).sort_values(by='MAPE', ascending=False)

Unnamed: 0_level_0,MAPE,MAE
is_consumption,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1182195658499132.8,149.425
1,0.6268,263.3375
