# Imports

In [2]:
!pip install darts --quiet

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest-astropy 0.8.0 requires pytest-cov>=2.0, which is not installed.
pytest-astropy 0.8.0 requires pytest-filter-subpackage>=0.1, which is not installed.
sagemaker 2.145.0 requires importlib-metadata<5.0,>=1.4.0, but you have importlib-metadata 6.1.0 which is incompatible.
sagemaker 2.145.0 requires PyYAML==5.4.1, but you have pyyaml 6.0 which is incompatible.
docker-compose 1.29.2 requires PyYAML<6,>=3.10, but you have pyyaml 6.0 which is incompatible.[0m[31m
[0m

In [3]:
import os
import itertools
import boto3
import pickle

import pandas as pd
import numpy as np

from sagemaker import get_execution_role

import darts 
from darts import TimeSeries
from darts.metrics import mape, mae
from darts.models import LightGBMModel



# Global Variables

In [4]:
# --- Split points -----------------------------------------------------------
# First timestamp of the per-source net generation historical forecasts.
SRC_FCST_SPLIT_TIMESTAMP = pd.Timestamp("2019-01-01 00:00:00")
# Test set split is Jan 1, 2022.
TRAIN_TEST_SPLIT_TIMESTAMP = pd.Timestamp("2022-01-01 00:00:00")

# --- Lag / historical-forecast settings (series are built with freq='H') ------
LAGS = 14 * 24   # number of target lags handed to the model (two weeks of hours)
STRIDE = 24      # step between consecutive historical forecasts
LAGS_PAST_COVARIATES = [-1, -2, -3, -24, -48, -LAGS]

# --- Miscellaneous ------------------------------------------------------------
RANDOM_STATE = 88
LAST_POINTS_ONLY = False
VERBOSE = True

# Functions

In [5]:
def make_fit_hist_fcst_lgbm_model(
    series,
    lag_params,
    optional_model_params,
    optional_covariates,
    optional_hist_fcst_params,
    train_series=None,
):
    """
    Create a LightGBM time series model, fit it, and generate historical
    forecasts over ``series`` with the fitted model.

    The data used for the initial fit is taken from the arguments only; the
    function no longer reads a notebook-level ``train`` variable, so it gives
    the same result regardless of which cells were run before it.

    Args:
        series (Darts TimeSeries): full target series passed to
            ``historical_forecasts``.
        lag_params (dict): Parameters for the lagged features used by the model.
        optional_model_params (dict): Optional parameters for the LightGBM model.
        optional_covariates (dict): Optional covariates, forwarded to both
            ``fit`` and ``historical_forecasts``.
        optional_hist_fcst_params (dict): Optional parameters for generating
            historical forecasts.
        train_series (Darts TimeSeries, optional): data for the initial fit.
            When omitted, the part of ``series`` strictly before
            ``optional_hist_fcst_params['start']`` is used; if no ``start`` is
            given, the model is fitted on the whole ``series``.
    Returns:
        historical_fcsts: The output of ``model.historical_forecasts``.
        model (Darts LightGBMModel): The trained LightGBM model.
    """
    if train_series is None:
        start = optional_hist_fcst_params.get('start')
        if start is not None:
            # Fit only on data preceding the first forecast so the initial
            # model never sees the period it is evaluated on.
            train_series, _ = series.split_before(start)
        else:
            train_series = series

    # Create and fit model
    model = LightGBMModel(**lag_params, **optional_model_params)
    model.fit(train_series, **optional_covariates)

    # Create historical forecasts
    historical_fcsts = model.historical_forecasts(
        series, **optional_covariates, **optional_hist_fcst_params
    )

    return historical_fcsts, model

In [6]:
def calc_mape_from_hist_fcsts(val, historical_fcsts):
    """
    Average the mean absolute percentage error (MAPE) over a collection of
    Darts historical forecasts.

    Args:
        val (Darts TimeSeries): series of actual values; each forecast is
        scored against it with the Darts ``mape`` metric.
        historical_fcsts (list of Darts TimeSeries): output from .historical_forecasts()

    Returns:
        Average MAPE (float): the mean of the per-forecast MAPE values.
    """
    per_forecast_mapes = []
    for forecast in historical_fcsts:
        per_forecast_mapes.append(darts.metrics.metrics.mape(val, forecast))
    return np.array(per_forecast_mapes).mean()

In [7]:
def convert_hist_fcsts_to_df(historical_fcsts, forecast_horizon):
    """
    Reshape a list of historical forecasts into a date-by-hour pandas DataFrame.

    Args:
         historical_fcsts (list of Darts TimeSeries): output from .historical_forecasts()
         forecast_horizon: number of hours used to make the historical forecasts

    Returns:
        preds_df (pandas DataFrame): one row per day between the start date of
        the first forecast and the start date of the last forecast (index named
        'utc_date'), and one column per forecast step (0 .. forecast_horizon-1).
        Example: index value of 2020-02-02 and column of 2 would be the
        prediction for 2020-02-02 02:00:00.
    """
    # Empty frame: daily rows spanning the forecasts, one column per horizon step
    first_day = historical_fcsts[0].start_time().date()
    last_day = historical_fcsts[-1].start_time().date()
    daily_index = pd.date_range(start=first_day, end=last_day, freq='D')
    step_columns = np.arange(0, forecast_horizon)

    preds_df = pd.DataFrame(index=daily_index, columns=step_columns)
    preds_df.index.name = 'utc_date'

    # The i-th forecast fills the row labelled with that forecast's first timestamp
    for position in range(len(daily_index)):
        forecast = historical_fcsts[position]
        row_label = forecast.time_index[0]
        preds_df.loc[row_label] = np.ravel(forecast.pd_series().values)

    return preds_df

In [8]:
def calc_mape_from_df(val, preds_df, forecast_horizon):
    """
    Average the mean absolute percentage error (MAPE) over the rows of the
    DataFrame returned by convert_hist_fcsts_to_df().

    Args:
        val (Darts TimeSeries): series of actual values each row is scored against.
        preds_df (pandas DataFrame): a pd df of historical forecasts, one row
        per forecast start and one column per forecast step.
        forecast_horizon: number of hours used to make the historical forecasts

    Returns:
        Average MAPE (float): the mean of the per-row MAPE values.
    """
    hour_offsets = [pd.Timedelta(hours=step) for step in range(forecast_horizon)]
    mapes = np.zeros(preds_df.shape[0])

    for row_pos, date in enumerate(preds_df.index):
        # Rebuild an hourly TimeSeries for this row so it can be scored by mape()
        row_times = pd.DatetimeIndex([date + offset for offset in hour_offsets])
        row_forecast = TimeSeries.from_times_and_values(
            times=row_times,
            values=preds_df.loc[date].values,
        )
        mapes[row_pos] = mape(val, row_forecast)

    return mapes.mean()

# Load EIA and NCAR combined dataset (preprocessed)

In [9]:
# S3 role
# Bucket that holds the input datasets and receives the forecast files
BUCKET = 'ucb-mids-capstone'
# SageMaker execution role for this notebook session
role = get_execution_role()

In [10]:
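# ORIGINAL loading cell, superseded by the separate train/validation and test loads in the next cells; kept commented out for reference only.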
# # Read in preprocessed dataset
# data_key = 'Data/Aggregated_Data/train_eia_radiation_clean.csv'
# data_location = f's3://{BUCKET}/{data_key}'
# df = pd.read_csv(data_location,
#                              parse_dates=True,
#                               index_col='utc_time')
# after_start_time = df.index >= pd.Timestamp('2018-07-01 08:00:00') # start of target variable
# before_end_time = df.index <= pd.Timestamp('2021-12-31 23:00:00')
# df = df[after_start_time & before_end_time].copy()
# print("Shape is: {}".format(df.shape))

In [11]:
# Preprocessed train + validation data, indexed by UTC timestamp
data_key = 'Data/Aggregated_Data/train_eia_radiation_clean.csv'
data_location = f's3://{BUCKET}/{data_key}'
df_train_val = pd.read_csv(data_location, index_col='utc_time', parse_dates=True)

# Keep rows from 2018-07-01 08:00:00 onwards
df_train_val = df_train_val.loc[
    df_train_val.index >= pd.Timestamp('2018-07-01 08:00:00')
].copy()
print("Shape is: {}".format(df_train_val.shape))
df_train_val

Shape is: (30720, 65)


Unnamed: 0_level_0,utc_time_with_offset,local_time,local_year,local_month,local_week_of_year,local_day,local_day_of_week,local_day_of_year,local_is_weekend,utc_hour,...,co2_emissions_consumed,positive_generation,consumed_electricity,co2_emissions_intensity_for_generated_electricity,co2_emissions_intensity_for_consumed_electricity,total_weighted_radiation_now,total_weighted_radiation_1_day_ahead,total_weighted_radiation_2_days_ahead,total_weighted_radiation_3_days_ahead,total_weighted_radiation_4_days_ahead
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-01 08:00:00,2018-07-01 08:00:00+00:00,2018-07-01 01:00:00,2018,7,26,1,6,182,True,8,...,3724.863908,17070.0,21845.0,0.435656,0.375917,0.0,0.0,0.0,0.0,0.0
2018-07-01 09:00:00,2018-07-01 09:00:00+00:00,2018-07-01 02:00:00,2018,7,26,1,6,182,True,9,...,3021.831173,15447.0,20493.0,0.374319,0.325086,0.0,0.0,0.0,0.0,0.0
2018-07-01 10:00:00,2018-07-01 10:00:00+00:00,2018-07-01 03:00:00,2018,7,26,1,6,182,True,10,...,2929.503549,14727.0,19854.0,0.375942,0.325297,0.0,0.0,0.0,0.0,0.0
2018-07-01 11:00:00,2018-07-01 11:00:00+00:00,2018-07-01 04:00:00,2018,7,26,1,6,182,True,11,...,2842.007242,14399.0,19521.0,0.370308,0.320964,0.0,0.0,0.0,0.0,0.0
2018-07-01 12:00:00,2018-07-01 12:00:00+00:00,2018-07-01 05:00:00,2018,7,26,1,6,182,True,12,...,2944.499874,14281.0,19434.0,0.385631,0.334028,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-01 03:00:00,2022-01-01 03:00:00+00:00,2021-12-31 19:00:00,2021,12,52,31,4,365,False,3,...,7773.260494,22780.0,31870.0,0.508546,0.537718,0.0,0.0,0.0,0.0,0.0
2022-01-01 04:00:00,2022-01-01 04:00:00+00:00,2021-12-31 20:00:00,2021,12,52,31,4,365,False,4,...,7687.742276,22045.0,30984.0,0.520778,0.547010,0.0,0.0,0.0,0.0,0.0
2022-01-01 05:00:00,2022-01-01 05:00:00+00:00,2021-12-31 21:00:00,2021,12,52,31,4,365,False,5,...,7721.388254,21060.0,30381.0,0.534327,0.560308,0.0,0.0,0.0,0.0,0.0
2022-01-01 06:00:00,2022-01-01 06:00:00+00:00,2021-12-31 22:00:00,2021,12,52,31,4,365,False,6,...,7397.395181,19771.0,28915.0,0.543871,0.564013,0.0,0.0,0.0,0.0,0.0


In [12]:
# Test data, indexed by UTC timestamp
data_key = 'Data/Aggregated_Data/test_eia_radiation_clean.csv'
data_location = f's3://{BUCKET}/{data_key}'
df_test = pd.read_csv(data_location, index_col='utc_time', parse_dates=True)

# Add extra days to ensure prediction through end of 2022
df_test = df_test.loc[
    df_test.index <= pd.Timestamp('2023-01-07 23:00:00')
].copy()
print("Shape is: {}".format(df_test.shape))
df_test

Shape is: (8920, 65)


Unnamed: 0_level_0,utc_time_with_offset,local_time,local_year,local_month,local_week_of_year,local_day,local_day_of_week,local_day_of_year,local_is_weekend,utc_hour,...,co2_emissions_consumed,positive_generation,consumed_electricity,co2_emissions_intensity_for_generated_electricity,co2_emissions_intensity_for_consumed_electricity,total_weighted_radiation_now,total_weighted_radiation_1_day_ahead,total_weighted_radiation_2_days_ahead,total_weighted_radiation_3_days_ahead,total_weighted_radiation_4_days_ahead
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01 08:00:00,2022-01-01 08:00:00+00:00,2022-01-01 00:00:00,2022,1,52,1,5,1,True,8,...,6627.800100,17730.0,27462.0,0.520779,0.532073,0.0,0.0,0.0,0.0,0.0
2022-01-01 09:00:00,2022-01-01 09:00:00+00:00,2022-01-01 01:00:00,2022,1,52,1,5,1,True,9,...,6305.323745,17176.0,26120.0,0.522752,0.532192,0.0,0.0,0.0,0.0,0.0
2022-01-01 10:00:00,2022-01-01 10:00:00+00:00,2022-01-01 02:00:00,2022,1,52,1,5,1,True,10,...,6318.595580,16496.0,25307.0,0.544756,0.550445,0.0,0.0,0.0,0.0,0.0
2022-01-01 11:00:00,2022-01-01 11:00:00+00:00,2022-01-01 03:00:00,2022,1,52,1,5,1,True,11,...,6426.838676,15859.0,24645.0,0.573055,0.574913,0.0,0.0,0.0,0.0,0.0
2022-01-01 12:00:00,2022-01-01 12:00:00+00:00,2022-01-01 04:00:00,2022,1,52,1,5,1,True,12,...,6406.014087,15464.0,24567.0,0.578170,0.574870,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-07 19:00:00,2023-01-07 19:00:00+00:00,2023-01-07 11:00:00,2023,1,1,7,5,7,True,19,...,6219.481696,24106.0,28555.0,0.450529,0.480182,0.0,0.0,0.0,0.0,0.0
2023-01-07 20:00:00,2023-01-07 20:00:00+00:00,2023-01-07 12:00:00,2023,1,1,7,5,7,True,20,...,5941.132045,23656.0,27510.0,0.445531,0.476116,0.0,0.0,0.0,0.0,0.0
2023-01-07 21:00:00,2023-01-07 21:00:00+00:00,2023-01-07 13:00:00,2023,1,1,7,5,7,True,21,...,5969.629308,24054.0,27591.0,0.447390,0.476995,0.0,0.0,0.0,0.0,0.0
2023-01-07 22:00:00,2023-01-07 22:00:00+00:00,2023-01-07 14:00:00,2023,1,1,7,5,7,True,22,...,5836.626171,23293.0,27028.0,0.446943,0.476082,0.0,0.0,0.0,0.0,0.0


In [13]:
# Stack train/validation and test rows into one frame
df = pd.concat(objs=[df_train_val, df_test], axis=0)
df

Unnamed: 0_level_0,utc_time_with_offset,local_time,local_year,local_month,local_week_of_year,local_day,local_day_of_week,local_day_of_year,local_is_weekend,utc_hour,...,co2_emissions_consumed,positive_generation,consumed_electricity,co2_emissions_intensity_for_generated_electricity,co2_emissions_intensity_for_consumed_electricity,total_weighted_radiation_now,total_weighted_radiation_1_day_ahead,total_weighted_radiation_2_days_ahead,total_weighted_radiation_3_days_ahead,total_weighted_radiation_4_days_ahead
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-01 08:00:00,2018-07-01 08:00:00+00:00,2018-07-01 01:00:00,2018,7,26,1,6,182,True,8,...,3724.863908,17070.0,21845.0,0.435656,0.375917,0.0,0.0,0.0,0.0,0.0
2018-07-01 09:00:00,2018-07-01 09:00:00+00:00,2018-07-01 02:00:00,2018,7,26,1,6,182,True,9,...,3021.831173,15447.0,20493.0,0.374319,0.325086,0.0,0.0,0.0,0.0,0.0
2018-07-01 10:00:00,2018-07-01 10:00:00+00:00,2018-07-01 03:00:00,2018,7,26,1,6,182,True,10,...,2929.503549,14727.0,19854.0,0.375942,0.325297,0.0,0.0,0.0,0.0,0.0
2018-07-01 11:00:00,2018-07-01 11:00:00+00:00,2018-07-01 04:00:00,2018,7,26,1,6,182,True,11,...,2842.007242,14399.0,19521.0,0.370308,0.320964,0.0,0.0,0.0,0.0,0.0
2018-07-01 12:00:00,2018-07-01 12:00:00+00:00,2018-07-01 05:00:00,2018,7,26,1,6,182,True,12,...,2944.499874,14281.0,19434.0,0.385631,0.334028,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-07 19:00:00,2023-01-07 19:00:00+00:00,2023-01-07 11:00:00,2023,1,1,7,5,7,True,19,...,6219.481696,24106.0,28555.0,0.450529,0.480182,0.0,0.0,0.0,0.0,0.0
2023-01-07 20:00:00,2023-01-07 20:00:00+00:00,2023-01-07 12:00:00,2023,1,1,7,5,7,True,20,...,5941.132045,23656.0,27510.0,0.445531,0.476116,0.0,0.0,0.0,0.0,0.0
2023-01-07 21:00:00,2023-01-07 21:00:00+00:00,2023-01-07 13:00:00,2023,1,1,7,5,7,True,21,...,5969.629308,24054.0,27591.0,0.447390,0.476995,0.0,0.0,0.0,0.0,0.0
2023-01-07 22:00:00,2023-01-07 22:00:00+00:00,2023-01-07 14:00:00,2023,1,1,7,5,7,True,22,...,5836.626171,23293.0,27028.0,0.446943,0.476082,0.0,0.0,0.0,0.0,0.0


# Part 1: Make 24 hr net generation forecasts

## Make 24 hr solar and natural gas forecasts

* `retrain=True`
* `'lags_future_covariates': [1, 2, 3, 4, 5, 6, 12, 18, 24]`

In [14]:
# Targets for the first model: the 'ng_ng' and 'ng_sun' columns
target_cols = ['ng_ng', 'ng_sun']
series = TimeSeries.from_dataframe(df, value_cols=target_cols)

# Everything strictly before the source-forecast split point is training data
train, _ = series.split_before(SRC_FCST_SPLIT_TIMESTAMP)

In [None]:
%%time
past_covariates_cols = [
    'utc_month', 'utc_hour', 'utc_day_of_week', 'utc_week_of_year', 'is_daytime'
]
future_covariates_cols = [
    'utc_month', 'utc_hour', 'utc_day_of_week', 'utc_week_of_year', 'is_daytime',
    'total_weighted_radiation_1_day_ahead', 'total_weighted_radiation_2_days_ahead',
    'total_weighted_radiation_3_days_ahead', 'total_weighted_radiation_4_days_ahead',
]

if past_covariates_cols:
    past_covariates = TimeSeries.from_dataframe(df=df,
                                                                                  value_cols=past_covariates_cols,
                                                                                  freq='H')
if future_covariates_cols:
    future_covariates = TimeSeries.from_dataframe(df=df,
                                                                                  value_cols=future_covariates_cols,
                                                                                  freq='H')
    
lag_params = {
    'lags': LAGS,
    'lags_past_covariates': LAGS_PAST_COVARIATES,
    'lags_future_covariates': [1, 2, 3, 4, 5, 6, 12, 18, 24]
}

optional_covariates = {
    'past_covariates': past_covariates,
    'future_covariates': future_covariates,
}

optional_model_params = {
    'random_state': RANDOM_STATE
}

optional_samples = {
    'num_samples': None # Set this to 'None' for deterministic, 100 or other for probabalistic
}

optional_hist_fcst_params = {
    'start': SRC_FCST_SPLIT_TIMESTAMP,
    'forecast_horizon': 24, 
    'stride': STRIDE,
    'retrain': True,
    'last_points_only': LAST_POINTS_ONLY,
    'verbose': VERBOSE
}

# drop keys which are None
lag_params = {k: v for k, v in lag_params.items() if v is not None}
optional_covariates = {k: v for k, v in optional_covariates.items() if v is not None}
optional_model_params = {k: v for k, v in optional_model_params.items() if v is not None}
optional_samples = {k: v for k, v in optional_samples.items() if v is not None}
optional_hist_fcst_params = {k: v for k, v in optional_hist_fcst_params.items() if v is not None}

ng_ng_sun_fcsts, ng_ng_sun_model = make_fit_hist_fcst_lgbm_model(
                                                                                    series=series,
                                                                                    lag_params=lag_params,
                                                                                    optional_model_params=optional_model_params,
                                                                                    optional_covariates=optional_covariates,
                                                                                    optional_hist_fcst_params=optional_hist_fcst_params
                                                                            )

  0%|          | 0/1467 [00:00<?, ?it/s]

CPU times: user 15h 24min 14s, sys: 10min 35s, total: 15h 34min 49s
Wall time: 8h 6min 9s


In [None]:
# Mapping from target column names to forecast column names
columns_to_rename = {
    'ng_ng': 'ng_ng_fcst',
    'ng_sun': 'ng_sun_fcst',
}

# Stitch the list of TimeSeries forecasts into one pandas df and rename columns
ng_ng_sun_fcsts_df = (
    darts.timeseries.concatenate(ng_ng_sun_fcsts)
    .pd_dataframe()
    .rename(columns=columns_to_rename)
)

# Bring 'is_daytime' back in, zero the solar forecast wherever it is not
# daytime, then remove the helper column again.
ng_ng_sun_fcsts_df = pd.merge(
    left=ng_ng_sun_fcsts_df,
    right=df['is_daytime'],
    how='left',
    left_index=True,
    right_index=True,
).copy()
ng_ng_sun_fcsts_df.loc[~ng_ng_sun_fcsts_df['is_daytime'], 'ng_sun_fcst'] = 0
ng_ng_sun_fcsts_df = ng_ng_sun_fcsts_df.drop(columns='is_daytime')
ng_ng_sun_fcsts_df

Unnamed: 0_level_0,ng_ng_fcst,ng_sun_fcst
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:00,7805.148451,1600.952689
2019-01-01 01:00:00,9267.077871,1392.607499
2019-01-01 02:00:00,9678.001598,0.000000
2019-01-01 03:00:00,10025.806901,0.000000
2019-01-01 04:00:00,10045.981002,0.000000
...,...,...
2023-01-06 19:00:00,15109.376034,0.000000
2023-01-06 20:00:00,15586.853770,0.000000
2023-01-06 21:00:00,16027.585341,0.000000
2023-01-06 22:00:00,16118.546034,0.000000


In [None]:
# Mean absolute error of the natural gas forecast against the actual 'ng_ng' values
mean_abs_error = mae(
    TimeSeries.from_series(df['ng_ng']),
    TimeSeries.from_series(ng_ng_sun_fcsts_df['ng_ng_fcst']),
)

print(f'MAE for natural gas from ng and sun only forecast: {round(mean_abs_error, 2)}')

MAE for natural gas from ng and sun only forecast: 1088.41


In [None]:
# Mean absolute error of the solar forecast against the actual 'ng_sun' values
mean_abs_error = mae(
    TimeSeries.from_series(df['ng_sun']),
    TimeSeries.from_series(ng_ng_sun_fcsts_df['ng_sun_fcst']),
)

print(f'MAE for solar from ng and sun only forecast: {round(mean_abs_error, 2)}')

MAE for solar from ng and sun only forecast: 435.84


In [None]:
# Save the natural gas and solar forecasts locally as CSV ...
file_name = "net_generation_ng_sun_source_forecasts_final.csv"
ng_ng_sun_fcsts_df.to_csv(file_name)

# ... then upload that file to the project bucket under Data/EIA/
s3 = boto3.resource('s3')
s3.meta.client.upload_file(
    Filename=file_name,
    Bucket=BUCKET,
    Key=f'Data/EIA/{file_name}',
)

## Make 24 hr wind, hydro, coal forecasts

* `retrain=730`
* `'lags_future_covariates': [1, 2, 3, 24]`

In [None]:
# Natural gas and solar stay in the target set because the generation from
# those sources will impact hydro and coal
target_cols = ['ng_ng', 'ng_sun', 'ng_wnd', 'ng_wat', 'ng_col']
series = TimeSeries.from_dataframe(df, value_cols=target_cols)

# Everything strictly before the source-forecast split point is training data
train, _ = series.split_before(SRC_FCST_SPLIT_TIMESTAMP)

In [None]:
%%time

past_covariates_cols = [
    'utc_month', 'utc_hour', 'utc_day_of_week', 'utc_week_of_year'
]
future_covariates_cols = [
    'utc_month', 'utc_hour', 'utc_day_of_week', 'utc_week_of_year',
    'total_weighted_radiation_1_day_ahead', 'total_weighted_radiation_2_days_ahead',
    'total_weighted_radiation_3_days_ahead', 'total_weighted_radiation_4_days_ahead',
]

if past_covariates_cols:
    past_covariates = TimeSeries.from_dataframe(df=df,
                                                                                      value_cols=past_covariates_cols,
                                                                                      freq='H')
if future_covariates_cols:
    future_covariates = TimeSeries.from_dataframe(df=df,
                                                                                          value_cols=future_covariates_cols,
                                                                                          freq='H')
    
lag_params = {
    'lags': LAGS,
    'lags_past_covariates': LAGS_PAST_COVARIATES,
    'lags_future_covariates': [1, 2, 3, 24]
}

optional_covariates = {
    'past_covariates': past_covariates,
    'future_covariates': future_covariates,
}

optional_model_params = {
    'random_state': RANDOM_STATE
}

optional_samples = {
    'num_samples': None # Set this to 'None' for deterministic, 100 or other for probabalistic
}

optional_hist_fcst_params = {
    'start': SRC_FCST_SPLIT_TIMESTAMP,
    'forecast_horizon': 24, 
    'stride': STRIDE,
    'retrain': 730,
    'last_points_only': LAST_POINTS_ONLY,
    'verbose': VERBOSE
}

# drop keys which are None
lag_params = {k: v for k, v in lag_params.items() if v is not None}
optional_covariates = {k: v for k, v in optional_covariates.items() if v is not None}
optional_model_params = {k: v for k, v in optional_model_params.items() if v is not None}
optional_samples = {k: v for k, v in optional_samples.items() if v is not None}
optional_hist_fcst_params = {k: v for k, v in optional_hist_fcst_params.items() if v is not None}

ng_all_fcsts, ng_all_model = make_fit_hist_fcst_lgbm_model(
                                                                                    series=series,
                                                                                    lag_params=lag_params,
                                                                                    optional_model_params=optional_model_params,
                                                                                    optional_covariates=optional_covariates,
                                                                                    optional_hist_fcst_params=optional_hist_fcst_params
                                                                            )

  0%|          | 0/1467 [00:00<?, ?it/s]

CPU times: user 16min 25s, sys: 10.6 s, total: 16min 36s
Wall time: 8min 28s


In [None]:
# Stitch the list of TimeSeries forecasts into one pandas df
ng_all_fcsts_df = darts.timeseries.concatenate(ng_all_fcsts).pd_dataframe()

# Mapping from target column names to forecast column names
columns_to_rename = {
    'ng_wnd': 'ng_wnd_fcst',
    'ng_wat': 'ng_wat_fcst',
    'ng_col': 'ng_col_fcst'
}

# Natural gas and solar are dropped since we previously made more accurate
# forecasts for them; the remaining columns get the forecast names.
ng_wnd_wat_col_fcsts_df = (
    ng_all_fcsts_df
    .drop(columns=['ng_ng', 'ng_sun'])
    .rename(columns=columns_to_rename)
)
ng_wnd_wat_col_fcsts_df

component,ng_wnd_fcst,ng_wat_fcst,ng_col_fcst
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 00:00:00,2797.653043,2613.926821,1071.357559
2019-01-01 01:00:00,2927.866663,3630.647469,1351.668993
2019-01-01 02:00:00,2819.551921,4274.134951,1592.408450
2019-01-01 03:00:00,2740.122328,4428.710828,1606.862852
2019-01-01 04:00:00,2642.982183,4095.323760,1567.627902
...,...,...,...
2023-01-06 19:00:00,2206.872662,2380.906786,794.948483
2023-01-06 20:00:00,2388.413643,2282.858456,756.450547
2023-01-06 21:00:00,2532.651530,2269.071368,759.110024
2023-01-06 22:00:00,2684.738015,2327.390682,815.476841


In [None]:
# Mean absolute error of the wind forecast against the actual 'ng_wnd' values
mean_abs_error = mae(
    TimeSeries.from_series(df['ng_wnd']),
    TimeSeries.from_series(ng_wnd_wat_col_fcsts_df['ng_wnd_fcst']),
)
print(f'MAE for wind: {round(mean_abs_error, 2)}')

MAE for wind: 826.11


In [None]:
# Mean absolute error of the hydro forecast against the actual 'ng_wat' values
mean_abs_error = mae(
    TimeSeries.from_series(df['ng_wat']),
    TimeSeries.from_series(ng_wnd_wat_col_fcsts_df['ng_wat_fcst']),
)
print(f'MAE for hydro: {round(mean_abs_error, 2)}')

MAE for hydro: 417.42


In [None]:
# Mean absolute error of the coal forecast against the actual 'ng_col' values
mean_abs_error = mae(
    TimeSeries.from_series(df['ng_col']),
    TimeSeries.from_series(ng_wnd_wat_col_fcsts_df['ng_col_fcst']),
)
print(f'MAE for coal: {round(mean_abs_error, 2)}')

MAE for coal: 317.1


In [None]:
# Save the wind, hydro and coal forecasts locally as CSV ...
file_name = "net_generation_wnd_wat_col_source_forecasts_final.csv"
ng_wnd_wat_col_fcsts_df.to_csv(file_name)

# ... then upload that file to the project bucket under Data/EIA/
s3 = boto3.resource('s3')
s3.meta.client.upload_file(
    Filename=file_name,
    Bucket=BUCKET,
    Key=f'Data/EIA/{file_name}',
)

# Part 2: Make 24 hour CO2 intensity forecasts

In [None]:
# Net generation natural gas and solar forecasts (written in Part 1)
data_key = 'Data/EIA/net_generation_ng_sun_source_forecasts_final.csv'
data_location = f's3://{BUCKET}/{data_key}'
ng_sun_fcsts_df = pd.read_csv(data_location, index_col='utc_time', parse_dates=True)

# Net generation wind, hydro, and coal forecasts (written in Part 1)
data_key = 'Data/EIA/net_generation_wnd_wat_col_source_forecasts_final.csv'
data_location = f's3://{BUCKET}/{data_key}'
wnd_wat_coal_fcsts_df = pd.read_csv(data_location, index_col='utc_time', parse_dates=True)

# Inner-join the original dataset with both sets of forecasts on 'utc_time',
# keeping only timestamps present in all three frames.
df2 = df.merge(right=ng_sun_fcsts_df, on='utc_time', how='inner')
df_merged = df2.merge(right=wnd_wat_coal_fcsts_df, on='utc_time', how='inner')
df_merged

Unnamed: 0_level_0,utc_time_with_offset,local_time,local_year,local_month,local_week_of_year,local_day,local_day_of_week,local_day_of_year,local_is_weekend,utc_hour,...,total_weighted_radiation_now,total_weighted_radiation_1_day_ahead,total_weighted_radiation_2_days_ahead,total_weighted_radiation_3_days_ahead,total_weighted_radiation_4_days_ahead,ng_ng_fcst,ng_sun_fcst,ng_wnd_fcst,ng_wat_fcst,ng_col_fcst
utc_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:00,2019-01-01 00:00:00+00:00,2018-12-31 16:00:00,2018,12,1,31,0,365,False,0,...,268.174038,72.126662,117.757422,120.941605,116.544292,7805.148451,1600.952689,2797.653043,2613.926821,1071.357559
2019-01-01 01:00:00,2019-01-01 01:00:00+00:00,2018-12-31 17:00:00,2018,12,1,31,0,365,False,1,...,0.283649,0.181061,0.256713,0.325910,0.383436,9267.077871,1392.607499,2927.866663,3630.647469,1351.668993
2019-01-01 02:00:00,2019-01-01 02:00:00+00:00,2018-12-31 18:00:00,2018,12,1,31,0,365,False,2,...,0.000000,0.000000,0.000000,0.000000,0.000000,9678.001598,0.000000,2819.551921,4274.134951,1592.408450
2019-01-01 03:00:00,2019-01-01 03:00:00+00:00,2018-12-31 19:00:00,2018,12,1,31,0,365,False,3,...,0.000000,0.000000,0.000000,0.000000,0.000000,10025.806901,0.000000,2740.122328,4428.710828,1606.862852
2019-01-01 04:00:00,2019-01-01 04:00:00+00:00,2018-12-31 20:00:00,2018,12,1,31,0,365,False,4,...,0.000000,0.000000,0.000000,0.000000,0.000000,10045.981002,0.000000,2642.982183,4095.323760,1567.627902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-06 19:00:00,2023-01-06 19:00:00+00:00,2023-01-06 11:00:00,2023,1,1,6,4,6,False,19,...,0.000000,0.000000,0.000000,0.000000,0.000000,15109.376034,0.000000,2206.872662,2380.906786,794.948483
2023-01-06 20:00:00,2023-01-06 20:00:00+00:00,2023-01-06 12:00:00,2023,1,1,6,4,6,False,20,...,0.000000,0.000000,0.000000,0.000000,0.000000,15586.853770,0.000000,2388.413643,2282.858456,756.450547
2023-01-06 21:00:00,2023-01-06 21:00:00+00:00,2023-01-06 13:00:00,2023,1,1,6,4,6,False,21,...,0.000000,0.000000,0.000000,0.000000,0.000000,16027.585341,0.000000,2532.651530,2269.071368,759.110024
2023-01-06 22:00:00,2023-01-06 22:00:00+00:00,2023-01-06 14:00:00,2023,1,1,6,4,6,False,22,...,0.000000,0.000000,0.000000,0.000000,0.000000,16118.546034,0.000000,2684.738015,2327.390682,815.476841


In [None]:
# Target series: CO2 emissions intensity for consumed electricity, over the
# whole merged dataset
series = TimeSeries.from_dataframe(
    df_merged,
    value_cols='co2_emissions_intensity_for_consumed_electricity',
)

# Split into the part before the test timestamp and the part from it onwards
train, test = series.split_before(TRAIN_TEST_SPLIT_TIMESTAMP)

In [None]:
%%time
# Columns of df_merged used as past covariates
past_covariates_cols = ['co2_emissions_consumed', 'ng_ng', 'ng_wnd', 'ng_sun', 'ng_wat', 'ng_col', 'ng_nuc', 'd',
                        'utc_month', 'utc_week_of_year', 'utc_day_of_week', 'utc_hour']

# Columns of df_merged used as future covariates: net generation forecasts
# (the *_fcst columns) plus calendar features
future_covariates_cols = ['ng_ng_fcst', 'ng_col_fcst', 'ng_sun_fcst', 'ng_wat_fcst', 'ng_wnd_fcst',
                          'utc_month', 'utc_week_of_year', 'utc_day_of_week', 'utc_hour']

# Always bind both names. With an empty column list the covariate becomes None
# (and is dropped by the None filter below) instead of raising NameError or
# silently reusing a series left over from a previously executed cell.
past_covariates = (
    TimeSeries.from_dataframe(df=df_merged, value_cols=past_covariates_cols, freq='H')
    if past_covariates_cols else None
)
future_covariates = (
    TimeSeries.from_dataframe(df=df_merged, value_cols=future_covariates_cols, freq='H')
    if future_covariates_cols else None
)

lag_params = {
    'lags': LAGS,
    'lags_past_covariates': LAGS_PAST_COVARIATES,
    'lags_future_covariates': [1, 2, 3, 4, 5, 6, 12, 18, 24]
}

optional_covariates = {
    'past_covariates': past_covariates,
    'future_covariates': future_covariates,
}

optional_model_params = {
    'random_state': RANDOM_STATE,
    # Add these two entries for a probabilistic model:
    # 'likelihood': 'quantile',
    # 'quantiles': [0.05, 0.5, 0.95],  # middle value must be 0.5
}

# NOTE(review): optional_samples is built here but is not passed to
# make_fit_hist_fcst_lgbm_model below - confirm whether it is still needed.
optional_samples = {
    'num_samples': None  # Set this to 'None' for deterministic, 100 or other for probabilistic
}

# Presumably forwarded to darts' historical_forecasts by the helper - TODO confirm
optional_hist_fcst_params = {
    'start': TRAIN_TEST_SPLIT_TIMESTAMP,
    'forecast_horizon': 24,  # only a 24 hour forecast horizon
    'stride': STRIDE,
    'retrain': True,
    'last_points_only': LAST_POINTS_ONLY,
    'verbose': VERBOSE
}

# drop keys which are None
lag_params = {k: v for k, v in lag_params.items() if v is not None}
optional_covariates = {k: v for k, v in optional_covariates.items() if v is not None}
optional_model_params = {k: v for k, v in optional_model_params.items() if v is not None}
optional_samples = {k: v for k, v in optional_samples.items() if v is not None}
optional_hist_fcst_params = {k: v for k, v in optional_hist_fcst_params.items() if v is not None}

# Fit the model and generate the 24 hour historical forecasts
co2_24_historical_fcsts, co2_24_hr_model = make_fit_hist_fcst_lgbm_model(
    series=series,
    lag_params=lag_params,
    optional_model_params=optional_model_params,
    optional_covariates=optional_covariates,
    optional_hist_fcst_params=optional_hist_fcst_params,
)

  0%|          | 0/370 [00:00<?, ?it/s]

CPU times: user 1h 35min 50s, sys: 10.9 s, total: 1h 36min 1s
Wall time: 50min 44s


In [None]:
# MAPE of the 24 hour historical forecasts against the test series
co2_24_hist_fcst_mape = calc_mape_from_hist_fcsts(
    val=test,
    historical_fcsts=co2_24_historical_fcsts,
)
print(f'CO2 intensity 24 hr forecast MAPE: {round(co2_24_hist_fcst_mape, 2)}%')

CO2 intensity 24 hr forecast MAPE: 5.74%


In [None]:
# Reshape the 24 hour historical forecasts into a DataFrame
co2_24_preds_df = convert_hist_fcsts_to_df(
    historical_fcsts=co2_24_historical_fcsts,
    forecast_horizon=24,
)

# Recompute MAPE from the reshaped frame; expected to match the value
# computed directly from the historical forecasts
mape_from_df = calc_mape_from_df(
    val=test,
    preds_df=co2_24_preds_df,
    forecast_horizon=24,
)
print(f'MAPE from reshaped df: {round(mape_from_df, 2)}%')

MAPE from reshaped df: 5.74%


In [None]:
# Preview the first five rows of the 24 hour forecast frame
co2_24_preds_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
utc_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01,0.470492,0.491854,0.522422,0.527823,0.530451,0.537607,0.54225,0.546962,0.550348,0.550521,...,0.546268,0.534321,0.475186,0.391036,0.35679,0.345945,0.342705,0.343362,0.357498,0.39811
2022-01-02,0.477705,0.53787,0.563954,0.572643,0.577694,0.584193,0.586832,0.589021,0.591147,0.589424,...,0.594979,0.575699,0.481161,0.405224,0.378039,0.364194,0.362021,0.360214,0.374256,0.40711
2022-01-03,0.477013,0.542351,0.576605,0.572354,0.574267,0.580528,0.586977,0.589649,0.591309,0.590583,...,0.58955,0.575116,0.497875,0.432526,0.416213,0.406443,0.404854,0.404842,0.411918,0.433963
2022-01-04,0.500176,0.543726,0.573262,0.569135,0.568175,0.566668,0.565902,0.56139,0.55664,0.558539,...,0.555384,0.544826,0.491297,0.422671,0.39634,0.389892,0.382567,0.380784,0.385243,0.414455
2022-01-05,0.463086,0.522199,0.565619,0.566704,0.568747,0.573697,0.573939,0.574462,0.574837,0.574203,...,0.570252,0.562317,0.504621,0.434352,0.418303,0.407515,0.40145,0.398262,0.405711,0.431936


In [None]:
# Preview the last nine rows of the 24 hour forecast frame
co2_24_preds_df.tail(n=9)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
utc_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-28,0.565477,0.565661,0.56658,0.559041,0.560466,0.560109,0.559592,0.556818,0.557277,0.556927,...,0.55742,0.552522,0.539655,0.498541,0.466565,0.454251,0.447427,0.44335,0.45246,0.468549
2022-12-29,0.431034,0.517039,0.540298,0.559652,0.572439,0.585651,0.600079,0.608849,0.614148,0.622391,...,0.632592,0.623269,0.610557,0.59261,0.531425,0.520906,0.522144,0.52635,0.536685,0.556243
2022-12-30,0.598505,0.616626,0.613444,0.611373,0.616589,0.62558,0.635785,0.638209,0.638335,0.633402,...,0.625702,0.613193,0.600562,0.595378,0.585275,0.578848,0.596326,0.608608,0.617873,0.633201
2022-12-31,0.502901,0.564009,0.570601,0.568537,0.572453,0.577383,0.588672,0.591207,0.586877,0.583128,...,0.573314,0.563232,0.546527,0.511356,0.504923,0.505447,0.507869,0.508511,0.522342,0.543023
2023-01-01,0.534506,0.545318,0.560763,0.561396,0.559934,0.56066,0.562707,0.561904,0.555143,0.550578,...,0.520847,0.498769,0.48211,0.455079,0.456155,0.459633,0.460839,0.477167,0.495347,0.520681
2023-01-02,0.444036,0.515242,0.541933,0.552292,0.561306,0.566924,0.577277,0.587284,0.592936,0.598819,...,0.614636,0.613637,0.598883,0.57946,0.563736,0.565398,0.573609,0.571563,0.585734,0.597645
2023-01-03,0.600135,0.612579,0.617915,0.619023,0.629755,0.636835,0.644808,0.654905,0.661151,0.662487,...,0.655523,0.645871,0.629344,0.617954,0.605496,0.609572,0.61313,0.617783,0.630841,0.65727
2023-01-04,0.58035,0.616199,0.618423,0.623512,0.629224,0.643439,0.657288,0.664213,0.674185,0.677786,...,0.669179,0.654581,0.627954,0.606023,0.603464,0.60716,0.610919,0.609188,0.611283,0.625435
2023-01-05,0.586561,0.609664,0.604197,0.610484,0.62107,0.629628,0.641767,0.647941,0.656439,0.655503,...,0.646842,0.634489,0.619385,0.595574,0.592418,0.592733,0.588958,0.593903,0.605093,0.625637


In [None]:
# Write 24 hour forecasts to a local CSV file, then upload that file to S3
# (this cell handles the 24 hour forecasts, not the 96 hour ones)
file_name = "co2_intensity_24_hr_fcsts_final.csv"
co2_24_preds_df.to_csv(file_name)

s3 = boto3.resource('s3')
s3.meta.client.upload_file(
    Filename=file_name,
    Bucket=BUCKET,
    Key=f'Data/EIA/{file_name}',
)

# Part 3: Make 96 hour CO2 intensity forecasts

(without natural gas, solar, etc. net generation forecasts as future covariates)

In [None]:
# Target series built from the whole of `df`
series = TimeSeries.from_dataframe(
    df=df,
    value_cols='co2_emissions_intensity_for_consumed_electricity',
)

# Split the target series at TRAIN_TEST_SPLIT_TIMESTAMP into train and test
train, test = series.split_before(TRAIN_TEST_SPLIT_TIMESTAMP)

In [None]:
%%time

# Columns of df used as past covariates
past_covariates_cols = ['co2_emissions_consumed', 'ng_ng', 'ng_wnd', 'ng_sun', 'ng_wat', 'ng_col', 'ng_nuc', 'd',
                        'utc_month', 'utc_week_of_year', 'utc_day_of_week', 'utc_hour']

# Only calendar features are used as future covariates here
# (no net generation forecasts)
future_covariates_cols = ['utc_month', 'utc_week_of_year', 'utc_day_of_week', 'utc_hour']

# Always bind both names. With an empty column list the covariate becomes None
# (and is dropped by the None filter below) instead of raising NameError or
# silently reusing a series left over from a previously executed cell.
past_covariates = (
    TimeSeries.from_dataframe(df=df, value_cols=past_covariates_cols, freq='H')
    if past_covariates_cols else None
)
future_covariates = (
    TimeSeries.from_dataframe(df=df, value_cols=future_covariates_cols, freq='H')
    if future_covariates_cols else None
)

lag_params = {
    'lags': LAGS,
    'lags_past_covariates': LAGS_PAST_COVARIATES,
    'lags_future_covariates': (0, 24)
}

optional_covariates = {
    'past_covariates': past_covariates,
    'future_covariates': future_covariates,
}

optional_model_params = {
    'random_state': RANDOM_STATE,
    # Add these two entries for a probabilistic model:
    # 'likelihood': 'quantile',
    # 'quantiles': [0.05, 0.5, 0.95],  # middle value must be 0.5
}

# NOTE(review): optional_samples is built here but is not passed to
# make_fit_hist_fcst_lgbm_model below - confirm whether it is still needed.
optional_samples = {
    'num_samples': None  # Set this to 'None' for deterministic, 100 or other for probabilistic
}

# Presumably forwarded to darts' historical_forecasts by the helper - TODO confirm
optional_hist_fcst_params = {
    'start': TRAIN_TEST_SPLIT_TIMESTAMP,
    'forecast_horizon': 96,
    'stride': STRIDE,
    'retrain': True,
    'last_points_only': LAST_POINTS_ONLY,
    'verbose': VERBOSE
}

# drop keys which are None
lag_params = {k: v for k, v in lag_params.items() if v is not None}
optional_covariates = {k: v for k, v in optional_covariates.items() if v is not None}
optional_model_params = {k: v for k, v in optional_model_params.items() if v is not None}
optional_samples = {k: v for k, v in optional_samples.items() if v is not None}
optional_hist_fcst_params = {k: v for k, v in optional_hist_fcst_params.items() if v is not None}

# Fit the model and generate the 96 hour historical forecasts
co2_96_historical_fcsts, co2_96_hr_model = make_fit_hist_fcst_lgbm_model(
    series=series,
    lag_params=lag_params,
    optional_model_params=optional_model_params,
    optional_covariates=optional_covariates,
    optional_hist_fcst_params=optional_hist_fcst_params,
)

  0%|          | 0/368 [00:00<?, ?it/s]

CPU times: user 1h 45min 6s, sys: 14 s, total: 1h 45min 20s
Wall time: 55min 52s


In [None]:
# MAPE of the 96 hour historical forecasts against the test series
co2_96_hist_fcst_mape = calc_mape_from_hist_fcsts(val=test, historical_fcsts=co2_96_historical_fcsts)
# Label corrected: this value belongs to the 96 hour forecast, not the 24 hour one
print(f'CO2 intensity 96 hr forecast MAPE: {round(co2_96_hist_fcst_mape, 2)}%')

CO2 intensity 24 hr forecast MAPE: 8.14%


In [None]:
# Reshape the 96 hour historical forecasts into a DataFrame
co2_96_preds_df = convert_hist_fcsts_to_df(
    historical_fcsts=co2_96_historical_fcsts,
    forecast_horizon=96,
)

# Recompute MAPE from the reshaped frame; expected to match the value
# computed directly from the historical forecasts
mape_from_df = calc_mape_from_df(
    val=test,
    preds_df=co2_96_preds_df,
    forecast_horizon=96,
)

print(f'MAPE from reshaped df: {round(mape_from_df, 2)}%')

MAPE from reshaped df: 8.14%


In [None]:
# Preview the first five rows of the 96 hour forecast frame
co2_96_preds_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
utc_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01,0.473932,0.501588,0.526609,0.532363,0.53665,0.539908,0.546714,0.545907,0.544502,0.542087,...,0.644654,0.641758,0.582585,0.519683,0.483774,0.470625,0.466023,0.468389,0.477112,0.502266
2022-01-02,0.49562,0.550609,0.578425,0.577622,0.582951,0.592235,0.597869,0.603242,0.605145,0.607262,...,0.648259,0.64665,0.586942,0.517407,0.477475,0.467447,0.466135,0.467828,0.475757,0.507616
2022-01-03,0.477254,0.552509,0.58182,0.582863,0.589781,0.595147,0.603109,0.607514,0.608739,0.610452,...,0.645253,0.637235,0.575053,0.510993,0.477432,0.461118,0.455679,0.460788,0.467711,0.508481
2022-01-04,0.498237,0.544692,0.571445,0.565459,0.568161,0.574844,0.580269,0.575306,0.57304,0.567358,...,0.60718,0.593095,0.533867,0.454161,0.420935,0.402075,0.394283,0.392358,0.393509,0.407838
2022-01-05,0.460962,0.526408,0.559084,0.561983,0.566522,0.566808,0.565961,0.564898,0.563462,0.565039,...,0.525286,0.50961,0.428085,0.345803,0.327854,0.320065,0.324047,0.328244,0.347742,0.385012


In [None]:
# Preview the last five rows of the 96 hour forecast frame
co2_96_preds_df.tail(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
utc_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-30,0.611199,0.625873,0.628747,0.628277,0.629494,0.635719,0.645875,0.654404,0.656845,0.655527,...,0.64517,0.639268,0.6047,0.539749,0.500746,0.491201,0.483235,0.489373,0.504306,0.526884
2022-12-31,0.499564,0.551748,0.567859,0.567254,0.571098,0.576333,0.585715,0.589334,0.589209,0.586438,...,0.645046,0.634787,0.616655,0.549358,0.514713,0.506005,0.502318,0.503818,0.505252,0.518133
2023-01-01,0.537987,0.551503,0.568364,0.569654,0.577435,0.587124,0.597345,0.600583,0.602314,0.602936,...,0.681799,0.659599,0.634785,0.590385,0.560466,0.555265,0.550635,0.551032,0.551484,0.563568
2023-01-02,0.437805,0.520531,0.539984,0.552553,0.557593,0.567658,0.580394,0.583375,0.58295,0.586806,...,0.64537,0.641721,0.627435,0.606302,0.577339,0.573298,0.570443,0.571546,0.571034,0.577301
2023-01-03,0.595851,0.609428,0.614811,0.618805,0.630473,0.643622,0.654198,0.658613,0.665864,0.671015,...,0.67203,0.648809,0.63197,0.575232,0.550686,0.535461,0.527984,0.528561,0.533535,0.552915


In [None]:
# Save the 96 hour forecasts to a local CSV file, then upload that file to S3
file_name = "co2_intensity_96_hr_fcsts_final.csv"
co2_96_preds_df.to_csv(file_name)

s3 = boto3.resource('s3')
s3.meta.client.upload_file(
    Filename=file_name,
    Bucket=BUCKET,
    Key=f'Data/EIA/{file_name}',
)

# Part 4: Combine 24 and 96 hour CO2 intensity forecasts

Overwrite the first 24 hourly predictions of each 96 hour forecast with the corresponding 24 hour CO2 intensity forecast.

In [None]:
# Load the 24 hour CO2 intensity forecasts back from S3, indexed by utc_date
data_key = 'Data/EIA/co2_intensity_24_hr_fcsts_final.csv'
data_location = f's3://{BUCKET}/{data_key}'
co2_intensity_24_hr_df = pd.read_csv(
    data_location,
    index_col='utc_date',
    parse_dates=True,
)

In [None]:
# Load the 96 hour CO2 intensity forecasts back from S3, indexed by utc_date
data_key = 'Data/EIA/co2_intensity_96_hr_fcsts_final.csv'
data_location = f's3://{BUCKET}/{data_key}'
co2_intensity_96_hr_df = pd.read_csv(
    data_location,
    index_col='utc_date',
    parse_dates=True,
)

In [None]:
# Overwrite first 24 hours of every 96 hour forecast with more accurate 24 hour
# forecast
first = co2_intensity_24_hr_df.index[0]
last = co2_intensity_96_hr_df.index[-1]

# Copy BEFORE overwriting so the frame loaded from S3 stays untouched and can
# still be compared against the combined result.
co2_intensity_combined_96_hr_df = co2_intensity_96_hr_df.copy()
co2_intensity_combined_96_hr_df.loc[first:last, '0':'23'] = co2_intensity_24_hr_df.loc[first:last, '0':'23']
co2_intensity_combined_96_hr_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
utc_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01,0.470492,0.491854,0.522422,0.527823,0.530451,0.537607,0.542250,0.546962,0.550348,0.550521,...,0.644654,0.641758,0.582585,0.519683,0.483774,0.470625,0.466023,0.468389,0.477112,0.502266
2022-01-02,0.477705,0.537870,0.563954,0.572643,0.577694,0.584193,0.586832,0.589021,0.591147,0.589424,...,0.648259,0.646650,0.586942,0.517407,0.477475,0.467447,0.466135,0.467828,0.475757,0.507616
2022-01-03,0.477013,0.542351,0.576605,0.572354,0.574267,0.580528,0.586977,0.589649,0.591309,0.590583,...,0.645253,0.637235,0.575053,0.510993,0.477432,0.461118,0.455679,0.460788,0.467711,0.508481
2022-01-04,0.500176,0.543726,0.573262,0.569135,0.568175,0.566668,0.565902,0.561390,0.556640,0.558539,...,0.607180,0.593095,0.533867,0.454161,0.420935,0.402075,0.394283,0.392358,0.393509,0.407838
2022-01-05,0.463086,0.522199,0.565619,0.566704,0.568747,0.573697,0.573939,0.574462,0.574837,0.574203,...,0.525286,0.509610,0.428085,0.345803,0.327854,0.320065,0.324047,0.328244,0.347742,0.385012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-30,0.598505,0.616626,0.613444,0.611373,0.616589,0.625580,0.635785,0.638209,0.638335,0.633402,...,0.645170,0.639268,0.604700,0.539749,0.500746,0.491201,0.483235,0.489373,0.504306,0.526884
2022-12-31,0.502901,0.564009,0.570601,0.568537,0.572453,0.577383,0.588672,0.591207,0.586877,0.583128,...,0.645046,0.634787,0.616655,0.549358,0.514713,0.506005,0.502318,0.503818,0.505252,0.518133
2023-01-01,0.534506,0.545318,0.560763,0.561396,0.559934,0.560660,0.562707,0.561904,0.555143,0.550578,...,0.681799,0.659599,0.634785,0.590385,0.560466,0.555265,0.550635,0.551032,0.551484,0.563568
2023-01-02,0.444036,0.515242,0.541933,0.552292,0.561306,0.566924,0.577277,0.587284,0.592936,0.598819,...,0.645370,0.641721,0.627435,0.606302,0.577339,0.573298,0.570443,0.571546,0.571034,0.577301


In [None]:
# Keep only the rows whose index is on or before the last hour of 2022
final_cutoff_ts = pd.Timestamp('2022-12-31 23:00:00')
final_rows_mask = co2_intensity_combined_96_hr_df.index <= final_cutoff_ts
df_final = co2_intensity_combined_96_hr_df[final_rows_mask]
df_final

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
utc_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-01,0.470492,0.491854,0.522422,0.527823,0.530451,0.537607,0.542250,0.546962,0.550348,0.550521,...,0.644654,0.641758,0.582585,0.519683,0.483774,0.470625,0.466023,0.468389,0.477112,0.502266
2022-01-02,0.477705,0.537870,0.563954,0.572643,0.577694,0.584193,0.586832,0.589021,0.591147,0.589424,...,0.648259,0.646650,0.586942,0.517407,0.477475,0.467447,0.466135,0.467828,0.475757,0.507616
2022-01-03,0.477013,0.542351,0.576605,0.572354,0.574267,0.580528,0.586977,0.589649,0.591309,0.590583,...,0.645253,0.637235,0.575053,0.510993,0.477432,0.461118,0.455679,0.460788,0.467711,0.508481
2022-01-04,0.500176,0.543726,0.573262,0.569135,0.568175,0.566668,0.565902,0.561390,0.556640,0.558539,...,0.607180,0.593095,0.533867,0.454161,0.420935,0.402075,0.394283,0.392358,0.393509,0.407838
2022-01-05,0.463086,0.522199,0.565619,0.566704,0.568747,0.573697,0.573939,0.574462,0.574837,0.574203,...,0.525286,0.509610,0.428085,0.345803,0.327854,0.320065,0.324047,0.328244,0.347742,0.385012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-27,0.628688,0.653858,0.653038,0.650209,0.654006,0.659589,0.662665,0.667334,0.663145,0.657335,...,0.657670,0.641683,0.613931,0.564681,0.527258,0.512631,0.506401,0.501162,0.502057,0.535347
2022-12-28,0.565477,0.565661,0.566580,0.559041,0.560466,0.560109,0.559592,0.556818,0.557277,0.556927,...,0.655575,0.645370,0.625490,0.545905,0.501468,0.477549,0.467542,0.462824,0.464558,0.483404
2022-12-29,0.431034,0.517039,0.540298,0.559652,0.572439,0.585651,0.600079,0.608849,0.614148,0.622391,...,0.616109,0.598318,0.570703,0.492857,0.440844,0.417223,0.399269,0.396837,0.402298,0.428609
2022-12-30,0.598505,0.616626,0.613444,0.611373,0.616589,0.625580,0.635785,0.638209,0.638335,0.633402,...,0.645170,0.639268,0.604700,0.539749,0.500746,0.491201,0.483235,0.489373,0.504306,0.526884


In [None]:
# MAPE of the combined 96 hour forecasts against the test series
co2_combined_96_mape = calc_mape_from_df(
    val=test,
    preds_df=df_final,
    forecast_horizon=96,
)
print(f'CO2 intensity combined 96 hour forecast MAPE: {round(co2_combined_96_mape, 2)}%')

CO2 intensity combined 96 hour forecast MAPE: 7.97%


In [None]:
# Save the combined 96 hour forecasts to a local CSV file, then upload that file to S3
file_name = "co2_intensity_combined_fcsts_final.csv"
df_final.to_csv(file_name)

s3 = boto3.resource('s3')
s3.meta.client.upload_file(
    Filename=file_name,
    Bucket=BUCKET,
    Key=f'Data/EIA/{file_name}',
)