In [1]:
import os, pandas as pd
import numpy as np
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [2]:
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
main_directory = os.path.dirname(parent_directory)
common_records =  pd.read_csv(main_directory + '/data/common_records.csv')
common_records.head()

Unnamed: 0,ID,From,Date,Hour,Participation_Phase,Demand_kWh,Temperature,Temperature24,Temperature48,Temperature72
0,Exp_1,2020-02-01 00:00:00+00:00,2020-02-01,1,Phase_1,2.393,7.6,6.1,5.0,4.4
1,Exp_1,2020-02-01 01:00:00+00:00,2020-02-01,2,Phase_1,2.056,8.2,6.2,5.1,4.4
2,Exp_1,2020-02-01 02:00:00+00:00,2020-02-01,3,Phase_1,2.258,8.4,6.3,5.3,4.5
3,Exp_1,2020-02-01 03:00:00+00:00,2020-02-01,4,Phase_1,2.535,8.4,6.4,5.4,4.5
4,Exp_1,2020-02-01 04:00:00+00:00,2020-02-01,5,Phase_1,2.042,8.2,6.5,5.5,4.6


In [3]:
unique_ids = common_records["ID"].unique()[:80]
common_records = common_records[common_records["ID"].isin(unique_ids)]

75/25

In [4]:
y=common_records['Demand_kWh']
X= common_records[["From","Temperature"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)
X_train.head()
y.isna()

0         False
1         False
2         False
3         False
4         False
          ...  
366715    False
366716    False
366717    False
366718    False
366719    False
Name: Demand_kWh, Length: 366720, dtype: bool

In [5]:
model = auto_arima(y_train,
                   exogenous=X_train, 
                   suppress_warnings=True, 
                   error_action="ignore")

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               275040
Model:               SARIMAX(4, 1, 2)   Log Likelihood             -327539.723
Date:                Thu, 04 Jul 2024   AIC                         655093.445
Time:                        11:15:41   BIC                         655167.118
Sample:                             0   HQIC                        655114.833
                             - 275040                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.4417      0.004    387.911      0.000       1.434       1.449
ar.L2         -0.5067      0.003   -171.245      0.000      -0.512      -0.501
ar.L3          0.0579      0.002     25.982      0.0

In [6]:
n_periods = len(y_test)

forecast = model.predict(n_periods=n_periods, exogenous=X_test)
forecast.name = "forecast_data"
forecast_df = common_records.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 275040    6.690575
275041    6.493700
275042    6.314800
275043    6.159677
275044    6.089883
            ...   
366715    6.534441
366716    6.534441
366717    6.534441
366718    6.534441
366719    6.534441
Name: forecast_data, Length: 91680, dtype: float64
Mean Squared Error: 19.98446470131853
Mean Absolute Error: 4.1409866289410315
Root Mean Squared Error: 4.4703987183827945


In [7]:
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])

df.to_csv(parent_directory+"/Arima/predictions/common_records/arima_common_records_method_2_75_25.csv",index=True)

80 / 20 Ratio

In [4]:
# common_records.set_index('From',inplace=True)

y=common_records['Demand_kWh']
X= common_records[["From","Temperature"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train.head()
y.isna()

0         False
1         False
2         False
3         False
4         False
          ...  
366715    False
366716    False
366717    False
366718    False
366719    False
Name: Demand_kWh, Length: 366720, dtype: bool

In [5]:
model = auto_arima(y_train,
                   exogenous=X_train, 
                   suppress_warnings=True, 
                   error_action="ignore")

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               293376
Model:               SARIMAX(4, 1, 2)   Log Likelihood             -346864.983
Date:                Sun, 16 Jun 2024   AIC                         693743.965
Time:                        10:50:59   BIC                         693818.090
Sample:                             0   HQIC                        693765.425
                             - 293376                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.4361      0.004    386.991      0.000       1.429       1.443
ar.L2         -0.4980      0.003   -171.393      0.000      -0.504      -0.492
ar.L3          0.0547      0.002     25.298      0.0

In [6]:
n_periods = len(y_test)

forecast = model.predict(n_periods=n_periods, exogenous=X_test)
forecast.name = "forecast_data"
forecast_df = common_records.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 293376    1.971861
293377    2.534848
293378    2.858343
293379    3.123538
293380    3.296563
            ...   
366715    2.863245
366716    2.863245
366717    2.863245
366718    2.863245
366719    2.863245
Name: forecast_data, Length: 73344, dtype: float64
Mean Squared Error: 4.623496524018266
Mean Absolute Error: 1.720002282257764
Root Mean Squared Error: 2.1502317372828137


In [7]:
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])

df.to_csv(parent_directory+"/Arima/predictions/Common Records/arima_common_records_method_2_80_20.csv",index=True)

50 / 50 Ratio

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=False)
X_train.head()

Unnamed: 0,From,Temperature
0,2020-02-01 00:00:00+00:00,7.6
1,2020-02-01 01:00:00+00:00,8.2
2,2020-02-01 02:00:00+00:00,8.4
3,2020-02-01 03:00:00+00:00,8.4
4,2020-02-01 04:00:00+00:00,8.2


In [13]:

model = auto_arima(y_train,
                   exogenous=X_train, 
                   suppress_warnings=True, 
                   error_action="ignore")

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               183360
Model:               SARIMAX(1, 1, 1)   Log Likelihood             -198054.227
Date:                Sun, 16 Jun 2024   AIC                         396114.454
Time:                        11:34:10   BIC                         396144.812
Sample:                             0   HQIC                        396123.423
                             - 183360                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.6200      0.001    506.584      0.000       0.618       0.622
ma.L1         -0.9784      0.000  -2419.621      0.000      -0.979      -0.978
sigma2         0.5078      0.001    678.915      0.0

In [14]:
n_periods = len(y_test)

forecast = model.predict(n_periods=n_periods, exogenous=X_test)
forecast.name = "forecast_data"
forecast_df = common_records.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 183360    1.331058
183361    1.628704
183362    1.813250
183363    1.927673
183364    1.998617
            ...   
366715    2.114379
366716    2.114379
366717    2.114379
366718    2.114379
366719    2.114379
Name: forecast_data, Length: 183360, dtype: float64
Mean Squared Error: 5.310102859644252
Mean Absolute Error: 1.714165343842552
Root Mean Squared Error: 2.3043660428942823


In [15]:
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Temperature24","Temperature48","Temperature72","Hour","Participation_Phase"])

df.to_csv(parent_directory+"/Arima/predictions/Common Records/arima_common_records_method_2_50_50.csv",index=True)