In [1]:
import os, pandas as pd
import numpy as np
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [2]:
# Resolve the folder one level above the notebook's working directory
# (parent_directory) and the folder above that (main_directory), then load
# the Phase 2 dataset from main_directory's data/ folder.
parent_directory = os.path.dirname(os.path.abspath(os.getcwd()))
main_directory = os.path.dirname(parent_directory)
phase_2 = pd.read_csv(main_directory + '/data/phase_2_data.csv')

# Preview the first rows as the cell output.
phase_2.head()

Unnamed: 0,ID,From,Date,Hour,Participation_Phase,Demand_kWh,Temperature,Temperature24,Temperature48,Temperature72
0,Exp_3928,2020-12-01 00:00:00+00:00,2020-12-01,1,Phase_2,4.43,4.2,0.7,-0.8,-1.3
1,Exp_3928,2020-12-01 01:00:00+00:00,2020-12-01,2,Phase_2,4.5,4.2,1.0,-0.7,-1.2
2,Exp_3928,2020-12-01 02:00:00+00:00,2020-12-01,3,Phase_2,5.44,3.9,1.3,-0.6,-1.0
3,Exp_3928,2020-12-01 03:00:00+00:00,2020-12-01,4,Phase_2,4.5,3.1,1.6,-0.5,-0.9
4,Exp_3928,2020-12-01 04:00:00+00:00,2020-12-01,5,Phase_2,4.48,2.4,1.8,-0.4,-0.8


Training on the first 100 households out of 3,000

In [3]:
# Keep only the rows belonging to the first 100 household IDs, in order of
# first appearance. Note that this overwrites `phase_2` with the subset.
unique_ids = pd.unique(phase_2["ID"])[:100]
phase_2 = phase_2.loc[phase_2["ID"].isin(unique_ids)]

75 / 25 Ratio

In [4]:
# Target: hourly demand. Features: the timestamp string and the temperature.
y = phase_2.loc[:, "Demand_kWh"]
X = phase_2.loc[:, ["From", "Temperature"]]

# shuffle=False keeps row order: the first 75% of rows train the model and the
# last 25% are held out.
# NOTE(review): the frame holds several households, so a positional split holds
# out the last rows of the frame, not the latest timestamps -- confirm that
# this is the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False,
)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-12-01 00:00:00+00:00,4.2
1,2020-12-01 01:00:00+00:00,4.2
2,2020-12-01 02:00:00+00:00,3.9
3,2020-12-01 03:00:00+00:00,3.1
4,2020-12-01 04:00:00+00:00,2.4


In [5]:

# Let auto_arima search for the ARIMA order on the training demand series.
# Warnings are silenced and candidate models that fail to fit are skipped.
# NOTE(review): the printed summary lists no regressor coefficients ahead of
# ar.L1, which suggests `exogenous=` is not consumed as regressors by the
# installed pmdarima (current releases name the parameter `X`) -- confirm.
# Moving to `X=X_train[["Temperature"]]` must be done together with passing the
# matching `X=` rows in the prediction cell.
model = auto_arima(
    y_train,
    error_action="ignore",
    suppress_warnings=True,
    exogenous=X_train,
)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               208800
Model:               SARIMAX(4, 1, 2)   Log Likelihood             -268609.654
Date:                Thu, 04 Jul 2024   AIC                         537233.308
Time:                        23:07:02   BIC                         537305.051
Sample:                             0   HQIC                        537254.384
                             - 208800                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.3895      0.004    328.412      0.000       1.381       1.398
ar.L2         -0.3949      0.003   -132.852      0.000      -0.401      -0.389
ar.L3         -0.0263      0.002    -11.499      0.0

In [6]:
# Forecast one step per held-out row and score the forecast against y_test.
n_periods = len(y_test)

# Regressors go through `X=` (the keyword pmdarima's predict reads them from)
# and only the numeric Temperature column is sent, because `From` holds
# timestamp strings. pmdarima should only consume them when the model itself
# was fitted with regressors -- presumably ignored otherwise; confirm.
forecast = model.predict(n_periods=n_periods, X=X_test[["Temperature"]])

# Re-index the forecast onto the hold-out rows explicitly. The join below
# matches on index labels, so relying on pmdarima's auto-generated positions
# would misalign whenever phase_2's labels are not 0..N-1.
forecast = pd.Series(np.asarray(forecast), index=y_test.index, name="forecast_data")
forecast_df = phase_2.join(forecast, how="left", rsuffix="_forecast")

print("Predicted values:", forecast)

# Error metrics are computed position-wise on the hold-out rows.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 208800    0.680754
208801    0.695628
208802    0.708027
208803    0.726779
208804    0.747507
            ...   
278395    0.845464
278396    0.845464
278397    0.845464
278398    0.845464
278399    0.845464
Name: forecast_data, Length: 69600, dtype: float64
Mean Squared Error: 2.1219139813589734
Mean Absolute Error: 0.9699728267024537
Root Mean Squared Error: 1.4566790934721943


In [8]:
# Keep only the rows that received a forecast (the hold-out rows of the left
# join) and drop the weather / bookkeeping columns before exporting.
df = (
    forecast_df
    .dropna(subset=["forecast_data"])
    .drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])
)

# NOTE(review): this cell writes under predictions/phase_2/, while the other
# export cells in this notebook write under predictions/Phase 2/ -- confirm
# which folder is intended.
df.to_csv(
    parent_directory+"/Arima/predictions/phase_2/arima_phase_2_method_2_75_25.csv",
    index=True,
)


80 / 20 Ratio

In [4]:
# The frame keeps its default integer index; `From` stays a regular column.

# Target: hourly demand. Features: the timestamp string and the temperature.
y = phase_2.loc[:, "Demand_kWh"]
X = phase_2.loc[:, ["From", "Temperature"]]

# shuffle=False keeps row order: the first 80% of rows train the model and the
# last 20% are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-12-01 00:00:00+00:00,4.2
1,2020-12-01 01:00:00+00:00,4.2
2,2020-12-01 02:00:00+00:00,3.9
3,2020-12-01 03:00:00+00:00,3.1
4,2020-12-01 04:00:00+00:00,2.4


In [5]:

# Let auto_arima search for the ARIMA order on the training demand series.
# Warnings are silenced and candidate models that fail to fit are skipped.
# NOTE(review): the printed summary lists no regressor coefficients ahead of
# ar.L1, which suggests `exogenous=` is not consumed as regressors by the
# installed pmdarima (current releases name the parameter `X`) -- confirm.
# Moving to `X=X_train[["Temperature"]]` must be done together with passing the
# matching `X=` rows in the prediction cell.
model = auto_arima(
    y_train,
    error_action="ignore",
    suppress_warnings=True,
    exogenous=X_train,
)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               222720
Model:               SARIMAX(3, 1, 4)   Log Likelihood             -284699.333
Date:                Mon, 17 Jun 2024   AIC                         569414.667
Time:                        21:41:27   BIC                         569497.176
Sample:                             0   HQIC                        569438.838
                             - 222720                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.6617      0.017     39.503      0.000       0.629       0.695
ar.L2          0.8282      0.026     31.483      0.000       0.777       0.880
ar.L3         -0.6160      0.011    -56.545      0.0

In [6]:
# Forecast one step per held-out row and score the forecast against y_test.
n_periods = len(y_test)

# Regressors go through `X=` (the keyword pmdarima's predict reads them from)
# and only the numeric Temperature column is sent, because `From` holds
# timestamp strings. pmdarima should only consume them when the model itself
# was fitted with regressors -- presumably ignored otherwise; confirm.
forecast = model.predict(n_periods=n_periods, X=X_test[["Temperature"]])

# Re-index the forecast onto the hold-out rows explicitly. The join below
# matches on index labels, so relying on pmdarima's auto-generated positions
# would misalign whenever phase_2's labels are not 0..N-1.
forecast = pd.Series(np.asarray(forecast), index=y_test.index, name="forecast_data")
forecast_df = phase_2.join(forecast, how="left", rsuffix="_forecast")

print("Predicted values:", forecast)

# Error metrics are computed position-wise on the hold-out rows.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 222720    0.555255
222721    0.651862
222722    0.725227
222723    0.798299
222724    0.847893
            ...   
278395    0.955983
278396    0.955983
278397    0.955983
278398    0.955983
278399    0.955983
Name: forecast_data, Length: 55680, dtype: float64
Mean Squared Error: 2.114964409531244
Mean Absolute Error: 1.0086965239750714
Root Mean Squared Error: 1.4542917209182085


In [7]:
# Keep only the rows that received a forecast (the hold-out rows of the left
# join) and drop the weather / bookkeeping columns before exporting.
df = (
    forecast_df
    .dropna(subset=["forecast_data"])
    .drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])
)

df.to_csv(
    parent_directory+"/Arima/predictions/Phase 2/arima_phase_2_method_2_80_20.csv",
    index=True,
)


50 / 50 Ratio

In [8]:
# Re-split the X / y defined in an earlier cell: first half of the rows for
# training, second half held out (row order preserved).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=False,
)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-12-01 00:00:00+00:00,4.2
1,2020-12-01 01:00:00+00:00,4.2
2,2020-12-01 02:00:00+00:00,3.9
3,2020-12-01 03:00:00+00:00,3.1
4,2020-12-01 04:00:00+00:00,2.4


In [9]:
# Let auto_arima search for the ARIMA order on the training demand series.
# Warnings are silenced and candidate models that fail to fit are skipped.
# NOTE(review): the printed summary lists no regressor coefficients ahead of
# ar.L1, which suggests `exogenous=` is not consumed as regressors by the
# installed pmdarima (current releases name the parameter `X`) -- confirm.
# Moving to `X=X_train[["Temperature"]]` must be done together with passing the
# matching `X=` rows in the prediction cell.
model = auto_arima(
    y_train,
    error_action="ignore",
    suppress_warnings=True,
    exogenous=X_train,
)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               139200
Model:               SARIMAX(5, 1, 5)   Log Likelihood             -182489.968
Date:                Mon, 17 Jun 2024   AIC                         365001.937
Time:                        22:23:34   BIC                         365110.217
Sample:                             0   HQIC                        365034.316
                             - 139200                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.7680      0.016     48.907      0.000       0.737       0.799
ar.L2         -0.3694      0.012    -30.219      0.000      -0.393      -0.345
ar.L3          0.8496      0.005    181.185      0.0

In [10]:
# Forecast one step per held-out row and score the forecast against y_test.
n_periods = len(y_test)

# Regressors go through `X=` (the keyword pmdarima's predict reads them from)
# and only the numeric Temperature column is sent, because `From` holds
# timestamp strings. pmdarima should only consume them when the model itself
# was fitted with regressors -- presumably ignored otherwise; confirm.
forecast = model.predict(n_periods=n_periods, X=X_test[["Temperature"]])

# Re-index the forecast onto the hold-out rows explicitly. The join below
# matches on index labels, so relying on pmdarima's auto-generated positions
# would misalign whenever phase_2's labels are not 0..N-1.
forecast = pd.Series(np.asarray(forecast), index=y_test.index, name="forecast_data")
forecast_df = phase_2.join(forecast, how="left", rsuffix="_forecast")

print("Predicted values:", forecast)

# Error metrics are computed position-wise on the hold-out rows.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 139200    6.117855
139201    6.195192
139202    6.316122
139203    6.477774
139204    6.735295
            ...   
278395    6.761223
278396    6.761223
278397    6.761223
278398    6.761223
278399    6.761223
Name: forecast_data, Length: 139200, dtype: float64
Mean Squared Error: 23.828075304198723
Mean Absolute Error: 4.641642955230534
Root Mean Squared Error: 4.8814009571227315


In [11]:
# Keep only the rows that received a forecast (the hold-out rows of the left
# join) and drop the weather / bookkeeping columns before exporting.
df = (
    forecast_df
    .dropna(subset=["forecast_data"])
    .drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])
)

df.to_csv(
    parent_directory+"/Arima/predictions/Phase 2/arima_phase_2_method_2_50_50.csv",
    index=True,
)


60 / 40 Ratio

In [12]:
# Re-split the X / y defined in an earlier cell: first 60% of the rows for
# training, last 40% held out (row order preserved).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=False,
)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-12-01 00:00:00+00:00,4.2
1,2020-12-01 01:00:00+00:00,4.2
2,2020-12-01 02:00:00+00:00,3.9
3,2020-12-01 03:00:00+00:00,3.1
4,2020-12-01 04:00:00+00:00,2.4


In [13]:
# Let auto_arima search for the ARIMA order on the training demand series.
# Warnings are silenced and candidate models that fail to fit are skipped.
# NOTE(review): the printed summary lists no regressor coefficients ahead of
# ar.L1, which suggests `exogenous=` is not consumed as regressors by the
# installed pmdarima (current releases name the parameter `X`) -- confirm.
# Moving to `X=X_train[["Temperature"]]` must be done together with passing the
# matching `X=` rows in the prediction cell.
model = auto_arima(
    y_train,
    error_action="ignore",
    suppress_warnings=True,
    exogenous=X_train,
)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               167040
Model:               SARIMAX(5, 1, 3)   Log Likelihood             -213302.691
Date:                Mon, 17 Jun 2024   AIC                         426623.382
Time:                        23:24:33   BIC                         426713.616
Sample:                             0   HQIC                        426650.149
                             - 167040                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.3879      0.006     61.007      0.000       0.375       0.400
ar.L2          0.9722      0.005    203.425      0.000       0.963       0.982
ar.L3         -0.3983      0.004    -95.107      0.0

In [14]:
# Forecast one step per held-out row and score the forecast against y_test.
n_periods = len(y_test)

# Regressors go through `X=` (the keyword pmdarima's predict reads them from)
# and only the numeric Temperature column is sent, because `From` holds
# timestamp strings. pmdarima should only consume them when the model itself
# was fitted with regressors -- presumably ignored otherwise; confirm.
forecast = model.predict(n_periods=n_periods, X=X_test[["Temperature"]])

# Re-index the forecast onto the hold-out rows explicitly. The join below
# matches on index labels, so relying on pmdarima's auto-generated positions
# would misalign whenever phase_2's labels are not 0..N-1.
forecast = pd.Series(np.asarray(forecast), index=y_test.index, name="forecast_data")
forecast_df = phase_2.join(forecast, how="left", rsuffix="_forecast")

print("Predicted values:", forecast)

# Error metrics are computed position-wise on the hold-out rows.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 167040    1.210371
167041    1.265970
167042    1.337352
167043    1.377518
167044    1.448159
            ...   
278395    1.570846
278396    1.570846
278397    1.570846
278398    1.570846
278399    1.570846
Name: forecast_data, Length: 111360, dtype: float64
Mean Squared Error: 4.501408903529178
Mean Absolute Error: 1.3549857637082092
Root Mean Squared Error: 2.121652399317376


In [15]:
# Keep only the rows that received a forecast (the hold-out rows of the left
# join) and drop the weather / bookkeeping columns before exporting.
df = (
    forecast_df
    .dropna(subset=["forecast_data"])
    .drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])
)

df.to_csv(
    parent_directory+"/Arima/predictions/Phase 2/arima_phase_2_method_2_60_40.csv",
    index=True,
)
