In [3]:
import os, pandas as pd
import numpy as np
from pmdarima import auto_arima
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [4]:
# Locate the project folders relative to where the notebook is running:
# `parent_directory` is the folder above the working directory and
# `main_directory` is the folder above that one.
working_directory = os.getcwd()
parent_directory = os.path.abspath(os.path.join(working_directory, os.pardir))
main_directory = os.path.dirname(parent_directory)

# Read the phase-2 table from the data folder under `main_directory`.
phase_2_csv = main_directory + '/data/phase_2_data_with_survey.csv'
phase_2 = pd.read_csv(phase_2_csv)
phase_2.head()

Unnamed: 0,ID,From,Date,Hour,Participation_Phase,Demand_kWh,Temperature,home_size,electric_car,no_of_people,electrically_heated
0,Exp_3928,2020-12-01 00:00:00+00:00,2020-12-01,1,Phase_2,4.43,4.2,120-159 m2,No,4,True
1,Exp_3928,2020-12-01 01:00:00+00:00,2020-12-01,2,Phase_2,4.5,4.2,120-159 m2,No,4,True
2,Exp_3928,2020-12-01 02:00:00+00:00,2020-12-01,3,Phase_2,5.44,3.9,120-159 m2,No,4,True
3,Exp_3928,2020-12-01 03:00:00+00:00,2020-12-01,4,Phase_2,4.5,3.1,120-159 m2,No,4,True
4,Exp_3928,2020-12-01 04:00:00+00:00,2020-12-01,5,Phase_2,4.48,2.4,120-159 m2,No,4,True


Training on the first 100 households out of 3000

In [5]:
# Keep only the rows belonging to the first 100 distinct household IDs
# (in order of first appearance in the frame).
unique_ids = phase_2["ID"].unique()[:100]
in_selected_households = phase_2["ID"].isin(unique_ids)
phase_2 = phase_2.loc[in_selected_households]

75 / 25 Ratio

In [6]:
# Target series and the candidate regressor columns.
feature_columns = ["From","Temperature","home_size","electric_car","electrically_heated","no_of_people"]
y = phase_2['Demand_kWh']
X = phase_2[feature_columns]

# shuffle=False makes this a positional split: the leading 75% of rows are
# used for training and the trailing 25% are held out.
# NOTE(review): the preview shows consecutive hours for a single ID, so the rows
# look ordered household-by-household; a positional split would then hold out
# the trailing households rather than a trailing time window — confirm this is
# the intended evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False,
)
X_train.head()

Unnamed: 0,From,Temperature,home_size,electric_car,electrically_heated,no_of_people
0,2020-12-01 00:00:00+00:00,4.2,120-159 m2,No,True,4
1,2020-12-01 01:00:00+00:00,4.2,120-159 m2,No,True,4
2,2020-12-01 02:00:00+00:00,3.9,120-159 m2,No,True,4
3,2020-12-01 03:00:00+00:00,3.1,120-159 m2,No,True,4
4,2020-12-01 04:00:00+00:00,2.4,120-159 m2,No,True,4


In [7]:

# Let auto_arima search for an ARIMA order on the training target.
# NOTE(review): the summary saved with this cell reports a plain
# SARIMAX(3, 1, 5) whose coefficient table starts at ar.L1 — no seasonal order
# and no regressor terms. It looks like neither `seasonal=True` (no period `m`
# is supplied) nor `exogenous=` is taking effect; newer pmdarima releases expect
# regressors via `X=`, and the string columns in X_train would need numeric
# encoding first — confirm against the installed version.
search_options = dict(
    seasonal=True,
    exogenous=X_train,
    suppress_warnings=True,
    error_action="ignore",
)
model = auto_arima(y_train, **search_options)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               208800
Model:               SARIMAX(3, 1, 5)   Log Likelihood             -247633.313
Date:                Fri, 05 Jul 2024   AIC                         495284.625
Time:                        00:54:54   BIC                         495376.867
Sample:                             0   HQIC                        495311.723
                             - 208800                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.6998      0.035    -19.869      0.000      -0.769      -0.631
ar.L2          0.2406      0.014     16.633      0.000       0.212       0.269
ar.L3          0.3261      0.014     23.689      0.0

In [8]:
# Forecast one step for every held-out row.
n_periods = len(y_test)

# NOTE(review): `exogenous=` may not be applied by the installed pmdarima
# (newer releases take regressors via `X=`) — confirm. The saved output also
# settles on a single constant value by the end of the horizon.
forecast = model.predict(n_periods=n_periods, exogenous=X_test).rename("forecast_data")

# Left join on the index: rows outside the forecast range get NaN in
# "forecast_data".
forecast_df = phase_2.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

# Error metrics of the forecast against the held-out demand.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

# RMSE is derived from the MSE computed above.
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 208800    2.108125
208801    2.198558
208802    2.233912
208803    2.311270
208804    2.345244
            ...   
278395    2.390125
278396    2.390125
278397    2.390125
278398    2.390125
278399    2.390125
Name: forecast_data, Length: 69600, dtype: float64
Mean Squared Error: 7.052879475606611
Mean Absolute Error: 1.8607240726918906
Root Mean Squared Error: 2.6557257907409437


In [9]:
# Keep only the rows that received a forecast value, then drop the columns that
# are not wanted in the exported predictions.
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Hour","Participation_Phase"])

# NOTE(review): the other ratio cells in this notebook write under
# "Phase 2 with survey data" while this one uses "phase_2_with_survey_data" —
# confirm which folder is intended.
output_path = parent_directory+"/Sarima/predictions/phase_2_with_survey_data/sarima_phase_2_survey_method_2_75_25.csv"

# to_csv does not create missing folders; make sure the target folder exists so
# a long model fit is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path,index=True)


80 / 20 Ratio

In [4]:
# Target series and the candidate regressor columns.
feature_columns = ["From","Temperature","home_size","electric_car","electrically_heated","no_of_people"]
y = phase_2['Demand_kWh']
X = phase_2[feature_columns]

# shuffle=False makes this a positional split: the leading 80% of rows are
# used for training and the trailing 20% are held out.
# NOTE(review): the preview shows consecutive hours for one household at the
# top of the frame; if rows are grouped by household, this holds out the
# trailing households rather than a trailing time window — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)
X_train.head()

Unnamed: 0,From,Temperature,home_size,electric_car,electrically_heated,no_of_people
0,2020-12-01 00:00:00+00:00,4.2,120-159 m2,No,True,4
1,2020-12-01 01:00:00+00:00,4.2,120-159 m2,No,True,4
2,2020-12-01 02:00:00+00:00,3.9,120-159 m2,No,True,4
3,2020-12-01 03:00:00+00:00,3.1,120-159 m2,No,True,4
4,2020-12-01 04:00:00+00:00,2.4,120-159 m2,No,True,4


In [5]:

# Order search on the 80% training target.
# NOTE(review): the summary saved with this cell shows SARIMAX(3, 1, 5) with a
# coefficient table that begins at ar.L1, i.e. no seasonal order and no
# regressor terms. `seasonal=True` is given without a period `m`, and
# `exogenous=` may not be honoured by the installed pmdarima (newer releases
# use `X=`) — confirm both before relying on these results.
fit_kwargs = {
    "seasonal": True,
    "exogenous": X_train,
    "suppress_warnings": True,
    "error_action": "ignore",
}
model = auto_arima(y_train, **fit_kwargs)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               222720
Model:               SARIMAX(3, 1, 5)   Log Likelihood             -261881.044
Date:                Tue, 25 Jun 2024   AIC                         523780.087
Time:                        23:23:19   BIC                         523872.910
Sample:                             0   HQIC                        523807.280
                             - 222720                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.6859      0.039    -17.537      0.000      -0.763      -0.609
ar.L2          0.2375      0.016     15.295      0.000       0.207       0.268
ar.L3          0.3272      0.015     22.364      0.0

In [6]:
# One forecast step per held-out row.
n_periods = len(y_test)

# NOTE(review): confirm that `exogenous=` is actually consumed by the installed
# pmdarima (newer releases take regressors via `X=`); the saved forecast
# flattens to one constant value towards the end of the horizon.
forecast = model.predict(n_periods=n_periods, exogenous=X_test).rename("forecast_data")

# Attach the forecast to the full frame by index; rows without a forecast keep
# NaN in "forecast_data".
forecast_df = phase_2.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

# Accuracy of the forecast on the held-out demand.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

# Square root of the MSE above.
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 222720    2.464516
222721    2.475891
222722    2.463556
222723    2.460278
222724    2.453913
            ...   
278395    2.449351
278396    2.449351
278397    2.449351
278398    2.449351
278399    2.449351
Name: forecast_data, Length: 55680, dtype: float64
Mean Squared Error: 8.20668477386816
Mean Absolute Error: 2.0344689645357534
Root Mean Squared Error: 2.8647311870170578


In [8]:
# Keep only the rows that received a forecast value, then drop the columns that
# are not wanted in the exported predictions.
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Hour","Participation_Phase"])

# NOTE(review): the 75/25 cell in this notebook writes under
# "phase_2_with_survey_data" while this one uses "Phase 2 with survey data" —
# confirm which folder is intended.
output_path = parent_directory+"/Sarima/predictions/Phase 2 with survey data/sarima_phase_2_survey_method_2_80_20.csv"

# to_csv does not create missing folders; make sure the target folder exists so
# a long model fit is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path,index=True)


50 / 50 Ratio

In [9]:
# 50/50 positional split (no shuffling) of the X / y built in an earlier cell:
# the first half of the rows trains the model, the second half is held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.5,
)
X_train.head()

Unnamed: 0,From,Temperature,home_size,electric_car,electrically_heated,no_of_people
0,2020-12-01 00:00:00+00:00,4.2,120-159 m2,No,True,4
1,2020-12-01 01:00:00+00:00,4.2,120-159 m2,No,True,4
2,2020-12-01 02:00:00+00:00,3.9,120-159 m2,No,True,4
3,2020-12-01 03:00:00+00:00,3.1,120-159 m2,No,True,4
4,2020-12-01 04:00:00+00:00,2.4,120-159 m2,No,True,4


In [10]:
# Order search on the 50% training target.
# NOTE(review): the summary saved with this cell shows SARIMAX(4, 1, 2) with a
# coefficient table that begins at ar.L1, i.e. no regressor terms; `exogenous=`
# may not be honoured by the installed pmdarima (newer releases use `X=`) —
# confirm.
search_options = dict(
    exogenous=X_train,
    suppress_warnings=True,
    error_action="ignore",
)
model = auto_arima(y_train, **search_options)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               139200
Model:               SARIMAX(4, 1, 2)   Log Likelihood             -175835.567
Date:                Tue, 25 Jun 2024   AIC                         351685.135
Time:                        23:50:29   BIC                         351754.041
Sample:                             0   HQIC                        351705.740
                             - 139200                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.3844      0.004    325.032      0.000       1.376       1.393
ar.L2         -0.3728      0.003   -116.713      0.000      -0.379      -0.367
ar.L3         -0.0324      0.003    -11.877      0.0

In [11]:
# Forecast as many steps as there are held-out rows.
n_periods = len(y_test)

# NOTE(review): confirm that `exogenous=` is actually consumed by the installed
# pmdarima (newer releases take regressors via `X=`); the saved forecast ends on
# a single repeated value.
forecast = model.predict(n_periods=n_periods, exogenous=X_test).rename("forecast_data")

# Index-based left join; rows outside the forecast range get NaN in
# "forecast_data".
forecast_df = phase_2.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

# Forecast error against the held-out demand.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

# RMSE follows from the MSE computed above.
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 139200    1.104996
139201    1.168486
139202    1.232835
139203    1.294695
139204    1.341625
            ...   
278395    1.392323
278396    1.392323
278397    1.392323
278398    1.392323
278399    1.392323
Name: forecast_data, Length: 139200, dtype: float64
Mean Squared Error: 7.1379852026050035
Mean Absolute Error: 1.8324377122602546
Root Mean Squared Error: 2.671700807089934


In [12]:
# Keep only the rows that received a forecast value, then drop the columns that
# are not wanted in the exported predictions.
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Hour","Participation_Phase"])

# NOTE(review): the 75/25 cell in this notebook writes under
# "phase_2_with_survey_data" while this one uses "Phase 2 with survey data" —
# confirm which folder is intended.
output_path = parent_directory+"/Sarima/predictions/Phase 2 with survey data/sarima_phase_2_survey_method_2_50_50.csv"

# to_csv does not create missing folders; make sure the target folder exists so
# a long model fit is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path,index=True)


60 / 40 Ratio

In [13]:
# 60/40 positional split (no shuffling) of the X / y built in an earlier cell:
# the leading 60% of rows train the model, the trailing 40% are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.4,
)
X_train.head()

Unnamed: 0,From,Temperature,home_size,electric_car,electrically_heated,no_of_people
0,2020-12-01 00:00:00+00:00,4.2,120-159 m2,No,True,4
1,2020-12-01 01:00:00+00:00,4.2,120-159 m2,No,True,4
2,2020-12-01 02:00:00+00:00,3.9,120-159 m2,No,True,4
3,2020-12-01 03:00:00+00:00,3.1,120-159 m2,No,True,4
4,2020-12-01 04:00:00+00:00,2.4,120-159 m2,No,True,4


In [14]:
# Order search on the 60% training target.
# NOTE(review): the summary saved with this cell shows SARIMAX(4, 1, 2) with a
# coefficient table that begins at ar.L1, i.e. no regressor terms; `exogenous=`
# may not be honoured by the installed pmdarima (newer releases use `X=`) —
# confirm.
fit_kwargs = {
    "exogenous": X_train,
    "suppress_warnings": True,
    "error_action": "ignore",
}
model = auto_arima(y_train, **fit_kwargs)

print(model.summary())

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:               167040
Model:               SARIMAX(4, 1, 2)   Log Likelihood             -201207.819
Date:                Wed, 26 Jun 2024   AIC                         402429.638
Time:                        00:27:40   BIC                         402499.820
Sample:                             0   HQIC                        402450.457
                             - 167040                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          1.3920      0.004    389.216      0.000       1.385       1.399
ar.L2         -0.3780      0.003   -136.786      0.000      -0.383      -0.373
ar.L3         -0.0339      0.002    -14.077      0.0

In [15]:
# Forecast horizon equals the number of held-out rows.
n_periods = len(y_test)

# NOTE(review): confirm that `exogenous=` is actually consumed by the installed
# pmdarima (newer releases take regressors via `X=`); the saved forecast ends on
# a single repeated value.
forecast = model.predict(n_periods=n_periods, exogenous=X_test).rename("forecast_data")

# Join the forecast onto the full frame by index; rows without a forecast keep
# NaN in "forecast_data".
forecast_df = phase_2.join(forecast,how="left",rsuffix="_forecast")

print("Predicted values:", forecast)

# Forecast error against the held-out demand.
mse = mean_squared_error(y_test, forecast)
print("Mean Squared Error:", mse)

mae = mean_absolute_error(y_test, forecast)
print("Mean Absolute Error:", mae)

# RMSE follows from the MSE computed above.
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

Predicted values: 167040    0.115313
167041    0.111761
167042    0.108451
167043    0.104201
167044    0.101156
            ...   
278395    0.097910
278396    0.097910
278397    0.097910
278398    0.097910
278399    0.097910
Name: forecast_data, Length: 111360, dtype: float64
Mean Squared Error: 14.912876278324369
Mean Absolute Error: 3.0748246694242396
Root Mean Squared Error: 3.8617193422521487


In [16]:
# Keep only the rows that received a forecast value, then drop the columns that
# are not wanted in the exported predictions.
df = forecast_df[forecast_df["forecast_data"].notna()]
df = df.drop(columns=["Temperature","Hour","Participation_Phase"])

# NOTE(review): the 75/25 cell in this notebook writes under
# "phase_2_with_survey_data" while this one uses "Phase 2 with survey data" —
# confirm which folder is intended.
output_path = parent_directory+"/Sarima/predictions/Phase 2 with survey data/sarima_phase_2_survey_method_2_60_40.csv"

# to_csv does not create missing folders; make sure the target folder exists so
# a long model fit is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path,index=True)
