In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import itertools
import holidays

# Loading data into a dataframe -- energy prices for the Houston, North, South, and West load zones, predicted and actual energy load for all four load zones, and oil prices

In [20]:
# Read the combined 2023 CSV into the working frame `df`.
# NOTE(review): this is an absolute path on one user's machine, so the notebook cannot be
# re-run elsewhere as-is — consider a path relative to the repository (TODO confirm layout).
df = pd.read_csv(r"C:\Users\achra\OneDrive\Documents\GitHub\ERCOT_Price_Prediction_Stat_413_Final_Project\Data\CombinedData2023.csv")

In [21]:
# Fill gaps in the oil price with the most recent earlier value (forward fill).
# Missing values that occur before the first observed price stay NaN.
# NOTE(review): presumably the gaps come from a coarser-than-hourly oil series — confirm.
df['Oil Price'] = df['Oil Price'].ffill()

# Feature Preprocessing before training LSTM Model:

    1) Engineer lags for all energy price columns and all actual load columns. Lags are 1 hour, 24 hours, and 168 hours for price columns, and 24 hours for load columns.
    2) Create exponential moving average (EMA) mean and EMA standard deviation variables for the North, West, South, and Houston prices.
    3) Generate pairwise differences between the North, West, South, and Houston prices, keeping only the version lagged by 168 hours as the feature.
    4) Remove the first week of data, because the lag features are null there.
    5) Replace the actual load in the last week (the test set) with the predicted load values, then delete the predicted load columns.
    6) Encode month cyclically as sin and cos values.
    7) Encode day of the week cyclically as sin and cos values.
    8) Encode hour cyclically as sin and cos values.
    9) Add a binary holiday column to indicate whether a price observation was during a holiday or not.
    10) Use a standard scaler to normalize all numerical features.

In [22]:
# Engineering Lagged Features

def create_lagged_features(df, price_cols_and_lags, load_cols_and_lags):
    """
    Return a copy of ``df`` with shifted ("lagged") versions of the requested columns.

    Each (column, lag) pair adds a column named ``<column>_lag<lag>`` holding
    ``df[column].shift(lag)``; for a positive lag its first ``lag`` rows are NaN.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    price_cols_and_lags (dict): Price column name -> list of lags (row offsets).
    load_cols_and_lags (dict): Load column name -> list of lags (row offsets).

    Returns:
    pandas.DataFrame: Copy of ``df`` with the lag columns appended, price lags first.
    """
    lagged = df.copy()
    # Both specs are processed the same way; prices come first so the appended
    # columns keep their order.
    for cols_and_lags in (price_cols_and_lags, load_cols_and_lags):
        for source_col, lags in cols_and_lags.items():
            for lag in lags:
                lagged[f'{source_col}_lag{lag}'] = lagged[source_col].shift(lag)
    return lagged

# Shift sizes per column: every price column gets lags 1, 24 and 168; every load column gets 24.
# The notebook's notes describe these as hours, i.e. one row per hour is assumed.
price_cols_and_lags = {
    price_col: [1, 24, 168]
    for price_col in ('North Price', 'Houston Price', 'South Price', 'West Price')
}
load_cols_and_lags = {
    load_col: [24]
    for load_col in ('North Load', 'Houston Load', 'South Load', 'West Load')
}


In [23]:
def create_ema_mean_and_sd(df, ema_cols, span = 168):
    """
        Append exponentially weighted mean and standard deviation columns.

        For each name in ``ema_cols`` two columns are added: ``<col>_ema_mean`` and
        ``<col>_ema_std``, both computed with ``Series.ewm(span=span)`` on that column.

        Parameters:
        df (pandas.DataFrame): The input DataFrame; it is not modified.
        ema_cols (list of str): Columns to compute the statistics for.
        span (int, optional): Span passed to ``ewm``. Default is 168.

        Returns:
        pandas.DataFrame: Copy of ``df`` with the two extra columns per entry in ``ema_cols``.
    """
    enriched = df.copy()
    for source_col in ema_cols:
        # One exponentially weighted window serves both statistics.
        weighted = enriched[source_col].ewm(span=span)
        enriched[f'{source_col}_ema_mean'] = weighted.mean()
        enriched[f'{source_col}_ema_std'] = weighted.std()

    return enriched

# Price columns that get EMA mean / EMA standard deviation features (passed to create_ema_mean_and_sd).
ema_cols = ['North Price', 'Houston Price', 'South Price', 'West Price']

In [24]:
def create_pairwise_differences_and_lags(df, price_cols, lag = 168):
    """
    Append lagged pairwise differences between price columns.

    For every unordered pair ``(a, b)`` drawn from ``price_cols`` in list order, a column
    ``<a>_<b>_diff_lag<lag>`` is added, equal to ``(df[a] - df[b]).shift(lag)``.
    Only the lagged difference is stored; the unlagged difference is not kept.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    price_cols (list of str): Price columns to compare pairwise.
    lag (int, optional): Number of rows to shift each difference by. Default is 168.

    Returns:
    pandas.DataFrame: Copy of ``df`` with one lagged-difference column per pair.
    """
    new_df = df.copy()
    for price1, price2 in itertools.combinations(price_cols, 2):
        # Hold the spread in a standalone Series rather than a temporary
        # "<a>_<b>_diff" column, so an existing column with that name is never
        # overwritten and then dropped.
        spread = new_df[price1] - new_df[price2]
        new_df["{}_{}_diff_lag{}".format(price1, price2, lag)] = spread.shift(lag)
    return new_df


In [25]:
def delete_rows_with_missing_values(df, rows = 168):
    """
    Drop the leading rows that the lagged features left incomplete.

    Rows are removed purely by position (the first ``rows`` of them); no check for
    missing values is made. Remaining rows keep their original index labels.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    rows (int, optional): How many leading rows to discard. Default is 168.

    Returns:
    pandas.DataFrame: A new DataFrame without the first ``rows`` rows.
    """
    return df.iloc[rows:].copy()

In [26]:
def delete_predicted_load(df, load_cols, predicted_load_cols, test_data_size):
    """
    Substitute predicted load for actual load in the trailing rows, then drop the predicted columns.

    ``load_cols`` and ``predicted_load_cols`` are paired by position. For each pair, the last
    ``test_data_size`` rows of the actual-load column are overwritten with the values of the
    predicted-load column from the same rows. All ``predicted_load_cols`` are then removed.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    load_cols (list of str): Actual-load column names.
    predicted_load_cols (list of str): Predicted-load column names, in the same order.
    test_data_size (int): Number of trailing rows to overwrite. 0 overwrites nothing.

    Returns:
    pandas.DataFrame: Copy of ``df`` with the substitution applied and the predicted columns dropped.

    Raises:
    ValueError: If ``test_data_size`` is negative.
    """
    if test_data_size < 0:
        raise ValueError(f"test_data_size must be non-negative, got {test_data_size}")

    new_df = df.copy()
    # Guard the zero case explicitly: `-0` is just 0, so `iloc[-0:]` would select
    # every row and overwrite the whole actual-load column.
    if test_data_size > 0:
        for load_col, predicted_load_col in zip(load_cols, predicted_load_cols):
            # Plain array on the right-hand side makes the assignment strictly positional.
            tail_values = new_df[predicted_load_col].iloc[-test_data_size:].to_numpy()
            new_df.iloc[-test_data_size:, new_df.columns.get_loc(load_col)] = tail_values
    return new_df.drop(columns=list(predicted_load_cols))



In [27]:
def encode_month(df):
    """
    Encode the month as a cyclic feature

    The month number (1-12) is parsed from the ``Date`` column and mapped to
    ``month_sin = sin(2*pi*month/12)`` and ``month_cos = cos(2*pi*month/12)``.

    Parameters:
    df (pandas.DataFrame): Input frame with a ``Date`` column that ``pd.to_datetime`` can parse;
        it is not modified.

    Returns:
    pandas.DataFrame: Copy of ``df`` with ``month_sin`` and ``month_cos`` added.
    """
    new_df = df.copy()
    # Keep the month number in a local Series rather than a temporary 'month'
    # column, so an existing 'month' column in the input is left untouched.
    month = pd.to_datetime(new_df['Date']).dt.month
    new_df['month_sin'] = np.sin(2 * np.pi * month / 12)
    new_df['month_cos'] = np.cos(2 * np.pi * month / 12)
    return new_df

In [28]:
def encode_day_of_week(df):
    """
    Encode the day of the week as a cyclic feature

    The weekday number from ``Series.dt.dayofweek`` (Monday is 0, Sunday is 6) is parsed from the
    ``Date`` column and mapped to ``day_of_week_sin = sin(2*pi*d/7)`` and
    ``day_of_week_cos = cos(2*pi*d/7)``.

    Parameters:
    df (pandas.DataFrame): Input frame with a ``Date`` column that ``pd.to_datetime`` can parse;
        it is not modified.

    Returns:
    pandas.DataFrame: Copy of ``df`` with ``day_of_week_sin`` and ``day_of_week_cos`` added.
    """
    new_df = df.copy()
    # Keep the weekday number in a local Series rather than a temporary 'day_of_week'
    # column, so an existing 'day_of_week' column in the input is left untouched.
    day_of_week = pd.to_datetime(new_df['Date']).dt.dayofweek
    new_df["day_of_week_sin"] = np.sin(2 * np.pi * day_of_week / 7)
    new_df["day_of_week_cos"] = np.cos(2 * np.pi * day_of_week / 7)
    return new_df


In [29]:
def encode_holidays(df):
    """
    Add a binary ``Is_Holiday`` column (1 on a holiday, 0 otherwise).

    Each value of the ``Date`` column is tested for membership in the ``holidays``
    package's US calendar created with ``state = 'TX'``.

    Parameters:
    df (pandas.DataFrame): Input frame with a ``Date`` column; it is not modified.
        NOTE(review): the values are passed to the calendar as-is, so this assumes the
        ``holidays`` package can interpret them (e.g. date strings) — confirm.

    Returns:
    pandas.DataFrame: Copy of ``df`` with the ``Is_Holiday`` column added.
    """
    us_tx_calendar = holidays.US(state = 'TX')

    def holiday_flag(day):
        # Membership test yields a bool; int() maps it to 1/0.
        return int(day in us_tx_calendar)

    flagged = df.copy()
    flagged["Is_Holiday"] = flagged["Date"].apply(holiday_flag)
    return flagged


In [30]:
def encode_hour(df):
    """
    Encode the hour as a cyclic feature

    The hour is the integer before the first ':' in the ``Time`` column and is mapped to
    ``hour_sin = sin(2*pi*hour/24)`` and ``hour_cos = cos(2*pi*hour/24)``.

    Parameters:
    df (pandas.DataFrame): Input frame with a string ``Time`` column; it is not modified.

    Returns:
    pandas.DataFrame: Copy of ``df`` with ``hour_sin`` and ``hour_cos`` added.
    """
    encoded = df.copy()
    # Text before the first ':' is the hour, e.g. '13:00' -> 13.
    hour_of_day = encoded['Time'].str.split(':').str.get(0).astype(int)
    angle = 2 * np.pi * hour_of_day / 24
    encoded['hour_sin'] = np.sin(angle)
    encoded['hour_cos'] = np.cos(angle)
    return encoded


In [31]:
# Actual-load columns and their predicted-load counterparts, paired by position.
load_cols = ['North Load', 'Houston Load', 'South Load', 'West Load']
predicted_load_cols = ['North Predicted Load', 'Houston Predicted Load', 'South Predicted Load', 'West Predicted Load']
# Feature-engineering pipeline. Every step copies its input and returns a new frame,
# so df and each df_N stay available for inspection.
# Lags of the price and actual-load columns.
df_1 = create_lagged_features(df, price_cols_and_lags, load_cols_and_lags)
# EMA mean / standard deviation of the four price columns (default span 168).
df_2 = create_ema_mean_and_sd(df_1, ema_cols)
# Pairwise price differences, kept only in lagged form (default lag 168).
df_3 = create_pairwise_differences_and_lags(df_2, ['North Price', 'Houston Price', 'South Price', 'West Price'])
# Drop the first 168 rows (default), where the 168-row lag features are NaN.
df_4 = delete_rows_with_missing_values(df_3)
# Cyclic calendar encodings and the holiday flag.
df_5 = encode_month(df_4)
df_6 = encode_day_of_week(df_5)
df_7 = encode_holidays(df_6)
df_8 = encode_hour(df_7)
# Overwrite actual load with predicted load in the last 168 rows, then drop the predicted columns.
# NOTE(review): this 168 has to match the test_size used by split_and_scale_data (default 168)
# for the overwritten rows to coincide with the test split.
# NOTE(review): the load lag columns were built in df_1 from the actual load and are not
# touched here, so in the last 168 rows they still derive from actual load — confirm intended.
df_final = delete_predicted_load(df_8, load_cols, predicted_load_cols, 168)

In [32]:
# Every column of df_final is scaled except the date/time columns, the cyclic
# sin/cos encodings and the binary holiday flag.
columns_to_scale = [
    col
    for col in df_final.columns
    if col not in {
        'Date', 'Time',
        'month_sin', 'month_cos',
        'day_of_week_sin', 'day_of_week_cos',
        'Is_Holiday',
        'hour_sin', 'hour_cos',
    }
]
def scale_data(df, columns_to_scale):
    """
    Standardize the chosen columns with a freshly fitted StandardScaler.

    Parameters:
    df (pandas.DataFrame): The input DataFrame; it is not modified.
    columns_to_scale (list): Column names to fit the scaler on and transform.

    Returns:
    tuple: (copy of ``df`` with ``columns_to_scale`` replaced by their scaled values,
    the fitted scaler).
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[columns_to_scale])
    scaled_df = df.copy()
    scaled_df[columns_to_scale] = scaled_values
    return (scaled_df, scaler)

In [33]:
def split_and_scale_data(df, columns_to_scale, test_size=168):
    """
    Split the data into training and test sets, and scale the numerical features.

    The last ``test_size`` rows form the test set and all earlier rows the training set.
    The scaler is fitted on the training rows only (via ``scale_data``) and then applied
    to the test rows, so no test-set statistics enter the scaling.

    Parameters:
    df (pandas.DataFrame): The input DataFrame containing the data; it is not modified.
    columns_to_scale (list): List of column names to be scaled.
    test_size (int, optional): The number of rows to be used as the test set. Default is 168.

    Returns:
    tuple: A tuple containing the scaled training DataFrame, scaled test DataFrame, and the scaler object.

    Raises:
    ValueError: If ``test_size`` is not strictly between 0 and ``len(df)``.
    """
    n_rows = len(df)
    # `-0` is 0, so test_size=0 would give an empty training slice and put every row in
    # the test slice; test_size >= len(df) also leaves nothing to fit the scaler on.
    if not 0 < test_size < n_rows:
        raise ValueError(
            f"test_size must satisfy 0 < test_size < len(df); "
            f"got test_size={test_size}, len(df)={n_rows}"
        )

    df_train = df.iloc[:-test_size]
    df_test = df.iloc[-test_size:]

    df_train_scaled, scaler = scale_data(df_train, columns_to_scale)
    df_test_scaled = df_test.copy()
    # Reuse the scaler fitted on the training rows.
    df_test_scaled[columns_to_scale] = scaler.transform(df_test[columns_to_scale])

    return df_train_scaled, df_test_scaled, scaler

# Split off the last 168 rows (default test_size) as the test set; the scaler is fitted on the training rows only.
df_train_scaled, df_test_scaled, scaler = split_and_scale_data(df_final, columns_to_scale)

In [34]:
def separate_datetime_features(df_train_scaled, df_test_scaled):
    """
    Pull the Date and Time columns out of the scaled frames and forward-fill what remains.

    Both input frames are modified IN PLACE: their 'Date' and 'Time' columns are dropped
    and remaining missing values are forward-filled. Each frame is filled independently,
    so leading NaNs in a frame are left as they are.

    Parameters:
    df_train_scaled (pd.DataFrame): Scaled training dataframe with Date and Time columns
    df_test_scaled (pd.DataFrame): Scaled test dataframe with Date and Time columns

    Returns:
    tuple: (date_train, date_test, time_train, time_test, df_train_scaled, df_test_scaled),
    where the last two are the same (now mutated) objects that were passed in.
    """
    frames = (df_train_scaled, df_test_scaled)

    # Grab the date/time Series before the columns are removed.
    date_train, date_test = (frame['Date'] for frame in frames)
    time_train, time_test = (frame['Time'] for frame in frames)

    # Remove the date/time columns from both frames, then forward-fill both.
    for frame in frames:
        frame.drop(['Date', 'Time'], axis=1, inplace=True)
    for frame in frames:
        frame.ffill(inplace=True)

    return date_train, date_test, time_train, time_test, df_train_scaled, df_test_scaled

# Split Date/Time off the scaled frames. The call drops those columns from
# df_train_scaled / df_test_scaled in place, so re-running this cell without first
# re-running the split cell raises KeyError on 'Date'.
date_train, date_test, time_train, time_test, df_train_scaled, df_test_scaled = separate_datetime_features(df_train_scaled, df_test_scaled)

In [35]:
import pickle

# Save the Date/Time Series, the scaled train/test frames and the fitted scaler as one
# dictionary in 'train_test_data.pkl' (relative path, i.e. the current working directory).
# NOTE(review): pickle files should only ever be loaded from a trusted source.
with open('train_test_data.pkl', 'wb') as f:
    pickle.dump({
        'date_train': date_train,
        'date_test': date_test,
        'time_train': time_train,
        'time_test': time_test,
        'df_train_scaled': df_train_scaled,
        'df_test_scaled': df_test_scaled,
        'scaler': scaler
    }, f)