In [3]:
import os, pandas as pd
import numpy as np
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [4]:
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
main_directory = os.path.dirname(parent_directory)
phase_1 =  pd.read_csv(main_directory + '/data/phase_1_data.csv')
phase_1.head()

Unnamed: 0,ID,From,Date,Hour,Participation_Phase,Demand_kWh,Temperature,Temperature24,Temperature48,Temperature72
0,Exp_553,2020-02-01 00:00:00+00:00,2020-02-01,1,Phase_1,1.174,7.6,6.1,5.0,4.4
1,Exp_553,2020-02-01 01:00:00+00:00,2020-02-01,2,Phase_1,1.172,8.2,6.2,5.1,4.4
2,Exp_553,2020-02-01 02:00:00+00:00,2020-02-01,3,Phase_1,1.207,8.4,6.3,5.3,4.5
3,Exp_553,2020-02-01 03:00:00+00:00,2020-02-01,4,Phase_1,1.613,8.4,6.4,5.4,4.5
4,Exp_553,2020-02-01 04:00:00+00:00,2020-02-01,5,Phase_1,1.101,8.2,6.5,5.5,4.6


75 / 25

In [5]:
y=phase_1['Demand_kWh']
X= phase_1[["From","Temperature"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-02-01 00:00:00+00:00,7.6
1,2020-02-01 01:00:00+00:00,8.2
2,2020-02-01 02:00:00+00:00,8.4
3,2020-02-01 03:00:00+00:00,8.4
4,2020-02-01 04:00:00+00:00,8.2


In [6]:

model = auto_arima(y_train,
                   exogenous=X_train, 
                   suppress_warnings=True, 
                   error_action="ignore")

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                72900
Model:               SARIMAX(2, 1, 5)   Log Likelihood              -74928.868
Date:                Mon, 08 Jul 2024   AIC                         149873.736
Time:                        00:11:52   BIC                         149947.311
Sample:                             0   HQIC                        149896.386
                              - 72900                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.6520      0.002    680.611      0.000       1.647       1.657
ar.L2         -0.9238      0.002   -414.000      0.000      -0.928      -0.919
ma.L1         -2.0395      0.003   -614.922      0.0

In [7]:
n_periods = len(y_test)

forecast = model.predict(n_periods=n_periods, exogenous=X_test)
forecast.name = "forecast_data"
forecast_df = phase_1.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 72900    0.397018
72901    0.483619
72902    0.480745
72903    0.510343
72904    0.579505
           ...   
97195    0.664205
97196    0.664205
97197    0.664205
97198    0.664205
97199    0.664205
Name: forecast_data, Length: 24300, dtype: float64
Mean Squared Error: 6.425413042085854
Mean Absolute Error: 1.8916285050061323
Root Mean Squared Error: 2.5348398454509615


In [16]:
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])

df.to_csv(parent_directory+"/Arima/predictions/Phase_1/arima_phase_1_method_2_75_25.csv",index=True)

80 / 20 Ratio

In [3]:
# phase_1.set_index('From',inplace=True)

y=phase_1['Demand_kWh']
X= phase_1[["From","Temperature"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-02-01 00:00:00+00:00,7.6
1,2020-02-01 01:00:00+00:00,8.2
2,2020-02-01 02:00:00+00:00,8.4
3,2020-02-01 03:00:00+00:00,8.4
4,2020-02-01 04:00:00+00:00,8.2


In [4]:

model = auto_arima(y_train,
                   exogenous=X_train, 
                   suppress_warnings=True, 
                   error_action="ignore")

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                77760
Model:               SARIMAX(4, 1, 5)   Log Likelihood              -81841.470
Date:                Mon, 17 Jun 2024   AIC                         163702.940
Time:                        13:51:20   BIC                         163795.554
Sample:                             0   HQIC                        163731.368
                              - 77760                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.3394      0.007    187.423      0.000       1.325       1.353
ar.L2          0.2672      0.013     20.767      0.000       0.242       0.292
ar.L3         -1.4056      0.011   -129.204      0.0

In [5]:
n_periods = len(y_test)

forecast = model.predict(n_periods=n_periods, exogenous=X_test)
forecast.name = "forecast_data"
forecast_df = phase_1.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 77760    2.216165
77761    2.511278
77762    2.799564
77763    3.080638
77764    3.345477
           ...   
97195    3.314866
97196    3.314866
97197    3.314866
97198    3.314866
97199    3.314866
Name: forecast_data, Length: 19440, dtype: float64
Mean Squared Error: 4.856287261524148
Mean Absolute Error: 1.7729937633484438
Root Mean Squared Error: 2.203698541435318


In [6]:
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])

df.to_csv(parent_directory+"/Arima/predictions/Phase_1/arima_phase_1_method_2_80_20.csv",index=True)

60 / 40 Ratio

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, shuffle=False)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-02-01 00:00:00+00:00,7.6
1,2020-02-01 01:00:00+00:00,8.2
2,2020-02-01 02:00:00+00:00,8.4
3,2020-02-01 03:00:00+00:00,8.4
4,2020-02-01 04:00:00+00:00,8.2


In [8]:

model = auto_arima(y_train,
                   exogenous=X_train, 
                   suppress_warnings=True, 
                   error_action="ignore")

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                58320
Model:               SARIMAX(4, 1, 5)   Log Likelihood              -62499.353
Date:                Mon, 17 Jun 2024   AIC                         125018.705
Time:                        14:06:09   BIC                         125108.442
Sample:                             0   HQIC                        125046.615
                              - 58320                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.3079      0.051     25.748      0.000       1.208       1.407
ar.L2          0.2925      0.117      2.507      0.012       0.064       0.521
ar.L3         -1.3901      0.101    -13.765      0.0

In [9]:
n_periods = len(y_test)

forecast = model.predict(n_periods=n_periods, exogenous=X_test)
forecast.name = "forecast_data"
forecast_df = phase_1.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 58320    0.224423
58321    0.196018
58322    0.181762
58323    0.177129
58324    0.178932
           ...   
97195    0.177535
97196    0.177535
97197    0.177535
97198    0.177535
97199    0.177535
Name: forecast_data, Length: 38880, dtype: float64
Mean Squared Error: 6.1265473704289
Mean Absolute Error: 1.7807436310424951
Root Mean Squared Error: 2.475186330446437


In [10]:
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])

df.to_csv(parent_directory+"/Arima/predictions/Phase_1/arima_phase_1_method_2_60_40.csv",index=True)

50 / 50 Ratio

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=False)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-02-01 00:00:00+00:00,7.6
1,2020-02-01 01:00:00+00:00,8.2
2,2020-02-01 02:00:00+00:00,8.4
3,2020-02-01 03:00:00+00:00,8.4
4,2020-02-01 04:00:00+00:00,8.2


In [12]:

model = auto_arima(y_train,
                   exogenous=X_train, 
                   suppress_warnings=True, 
                   error_action="ignore")

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                48600
Model:               SARIMAX(2, 1, 5)   Log Likelihood              -56438.795
Date:                Mon, 17 Jun 2024   AIC                         112893.590
Time:                        14:13:24   BIC                         112963.921
Sample:                             0   HQIC                        112915.650
                              - 48600                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.6087      0.004    360.327      0.000       1.600       1.617
ar.L2         -0.8912      0.004   -223.611      0.000      -0.899      -0.883
ma.L1         -2.0211      0.005   -380.778      0.0

In [13]:
n_periods = len(y_test)

forecast = model.predict(n_periods=n_periods, exogenous=X_test)
forecast.name = "forecast_data"
forecast_df = phase_1.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 48600    1.296063
48601    1.242808
48602    1.158470
48603    1.111588
48604    1.102231
           ...   
97195    1.226365
97196    1.226365
97197    1.226365
97198    1.226365
97199    1.226365
Name: forecast_data, Length: 48600, dtype: float64
Mean Squared Error: 2.9743993294978144
Mean Absolute Error: 1.2755285037520012
Root Mean Squared Error: 1.7246446965963205


In [14]:
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])

df.to_csv(parent_directory+"/Arima/predictions/Phase_1/arima_phase_1_method_2_50_50.csv",index=True)