In [1]:
# Step 1. CEEMDAN decomposition (CD)

In [2]:
import pandas as pd
from PyEMD import CEEMDAN

# Read the raw price table from the Excel workbook into a DataFrame
original_data_path = './Datasets/Original Data.xlsx'
data = pd.read_excel(original_data_path)

In [None]:
# Keep only the training period (rows whose 'time_idx' is at or below the cut-off)
TRAIN_END_TIME_IDX = 712
train_data = data[data['time_idx'] <= TRAIN_END_TIME_IDX]
wti_train = train_data['WTI']
brent_train = train_data['Brent']

# Configure CEEMDAN with 100 trials. The decomposition relies on randomly generated
# noise, so the noise generator is seeded to make the IMFs (and therefore the
# downstream training targets) reproducible from run to run.
CEEMDAN_SEED = 42
ceemdan = CEEMDAN(trials=100)
ceemdan.noise_seed(CEEMDAN_SEED)

# Split a series into its leading CEEMDAN components plus the remainder
def decompose_data(series):
    """Decompose ``series`` with the module-level ``ceemdan`` object.

    Parameters:
        series: the values to decompose; passed straight to ``ceemdan`` and
            used in array arithmetic, so an array-like of numbers is expected.

    Returns:
        A ``(leading, remainder)`` tuple where ``leading`` is the first two
        IMFs returned by ``ceemdan`` (fewer if fewer were produced) and
        ``remainder`` is ``series`` minus the sum of those IMFs.
    """
    components = ceemdan(series)
    leading = components[:2]
    # Slicing to two rows already covers the "two or fewer IMFs" case, because
    # the slice then equals the full set of components.
    remainder = series - sum(leading)
    return leading, remainder

# Decompose the WTI and Brent training series into (first two IMFs, residual)
wti_imfs, wti_residual = decompose_data(wti_train.values)
brent_imfs, brent_residual = decompose_data(brent_train.values)

# Each stored IMF is shifted by a constant +10 (the author's stated motivation is
# that IMFs oscillate around zero and the shift helps training stability).
# The residual is stored unshifted.
IMF_OFFSET = 10

# Attach the components to the full table. The Series carry the training index,
# so rows outside the training period receive NaN in the new columns.
decomposed = {
    'WTI': (wti_imfs, wti_residual),
    'Brent': (brent_imfs, brent_residual),
}
for name, (imfs, residual) in decomposed.items():
    for i, imf in enumerate(imfs, start=1):
        data[f'{name}_IMF{i}'] = pd.Series(imf + IMF_OFFSET, index=train_data.index)
    data[f'{name}_Res'] = pd.Series(residual, index=train_data.index)

# Write the enriched table to its own workbook: this is the file the later
# steps load, and the raw input workbook is left untouched so this cell can be
# re-run from the original data.
data.to_excel('./Datasets/CEEMDAN Decomposed Data.xlsx', index=False)

In [None]:
# step 2 Extended Window Application

In [3]:
# Load the table that already contains the decomposed Brent components
data = pd.read_excel('./Datasets/CEEMDAN Decomposed Data.xlsx')

# Restrict to the training period based on 'time_idx'
train_data = data[data['time_idx'] <= 712]
N1 = 30  # Main window size for feature extraction
N2 = 5   # Extension window size (how far the target window is shifted forward)

# Build (feature window, target window) pairs. t is 1-based: the feature window
# covers rows t-1 .. t+N1-2 and the target window is the same-length slice
# shifted N2 rows later. The last valid t is len(train_data) - N1 - N2 + 1
# (its target window ends on the final training row), hence the "+ 2" in the
# exclusive range bound.
mapping_pairs = []
for t in range(1, len(train_data) - N1 - N2 + 2):
    W_t = train_data.iloc[t-1:t+N1-1]
    D_k_t_plus_N2 = train_data.iloc[t+N2-1:t+N1+N2-1]

    W_t_data = W_t[['Brent', 'time_idx']]
    D_k_data = D_k_t_plus_N2[['Brent_IMF1', 'Brent_IMF2', 'Brent_Res']]

    # Collect the pair of feature matrix and target matrix for training
    mapping_pairs.append((W_t_data, D_k_data))

In [None]:
# step 3 Training the Enhancing Mapping Neural Network (EMNN)

In [4]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Factory for the fully connected regression network used for each component
def build_mnn_model(input_dim):
    """Create and compile a dense regression network.

    Parameters:
        input_dim: number of input features fed to the first layer.

    Returns:
        A compiled ``Sequential`` model with four 1024-unit ReLU hidden layers
        and a single linear output unit, using Adam (learning rate 0.001) and
        mean-squared-error loss.
    """
    hidden_units = 1024
    # First hidden layer declares the input size; the remaining three infer it
    layers = [Dense(hidden_units, input_dim=input_dim, activation='relu')]
    layers.extend(Dense(hidden_units, activation='relu') for _ in range(3))
    layers.append(Dense(1))

    network = Sequential(layers)
    network.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return network

# Flatten each feature window into one row; the target for each pair is the
# LAST row of its target window, one value per component.
train_features = np.array([pair[0].values.flatten() for pair in mapping_pairs])
train_targets_imf1 = np.array([pair[1]['Brent_IMF1'].values[-1] for pair in mapping_pairs]).reshape(-1, 1)
train_targets_imf2 = np.array([pair[1]['Brent_IMF2'].values[-1] for pair in mapping_pairs]).reshape(-1, 1)
train_targets_residual = np.array([pair[1]['Brent_Res'].values[-1] for pair in mapping_pairs]).reshape(-1, 1)

# Derive the input width from the feature matrix itself rather than from a loop
# variable left over from the windowing cell, so this cell does not depend on
# hidden kernel state.
if train_features.ndim != 2:
    raise ValueError(
        "train_features must be a 2-D array; mapping_pairs is empty or its windows differ in size"
    )
input_dim = train_features.shape[1]

# Build one model per component (two IMFs and the residual)
imf1_model = build_mnn_model(input_dim)
imf2_model = build_mnn_model(input_dim)
residual_model = build_mnn_model(input_dim)

# Configure callbacks to save the best model based on the lowest training loss
checkpoint_imf1 = ModelCheckpoint('./EEMD_saved_models/Brent_imf1_best_model.h5', monitor='loss', save_best_only=True, verbose=1)
checkpoint_imf2 = ModelCheckpoint('./EEMD_saved_models/Brent_imf2_best_model.h5', monitor='loss', save_best_only=True, verbose=1)
checkpoint_residual = ModelCheckpoint('./EEMD_saved_models/Brent_residual_best_model.h5', monitor='loss', save_best_only=True, verbose=1)

# Train the three models in the same order with identical settings, but only
# when every target array has as many rows as the feature matrix.
training_plan = [
    (imf1_model, train_targets_imf1, checkpoint_imf1),
    (imf2_model, train_targets_imf2, checkpoint_imf2),
    (residual_model, train_targets_residual, checkpoint_residual),
]
if all(len(targets) == len(train_features) for _, targets, _ in training_plan):
    for model, targets, checkpoint in training_plan:
        model.fit(train_features, targets, epochs=50, batch_size=32, verbose=1, callbacks=[checkpoint])
else:
    print("Error: Data cardinality mismatch!")

In [None]:
# step 4 Prediction/Decomposition

In [7]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model

# Load the trained IMF models (the residual is derived by subtraction below)
imf1_model = load_model('./EEMD_saved_models/Brent_imf1_best_model.h5')
imf2_model = load_model('./EEMD_saved_models/Brent_imf2_best_model.h5')

# Load data in chronological order. The index is rebuilt after sorting so that
# positional slicing (.iloc) and label-based writes (.at) refer to the same rows
# even if the workbook was not already sorted.
data = pd.read_excel('./Datasets/CEEMDAN Decomposed Data.xlsx')
data = data.sort_values('time_idx').reset_index(drop=True)

# Define prediction range
start_time_idx = 713
end_time_idx = data['time_idx'].max()

WINDOW_SIZE = 30  # number of rows fed to the models per prediction
# The stored training IMFs were each shifted by +10, so the models output shifted
# values; adding 20 to the residual cancels the two shifts and yields
# Brent - IMF1 - IMF2 on the original scale.
RESIDUAL_SHIFT_CORRECTION = 20

# Predict and update DataFrame.
# NOTE(review): row lookup assumes 'time_idx' is 1-based and contiguous, so the
# row for a given time_idx sits at position time_idx - 1 — confirm against the data.
for time_idx in range(start_time_idx, end_time_idx + 1):
    target_idx = time_idx - 1

    # Feature vector: the WINDOW_SIZE rows ending at the target row, flattened
    window_data = data.iloc[time_idx - WINDOW_SIZE:time_idx][['Brent', 'time_idx']].values.flatten()
    window_data = np.expand_dims(window_data, axis=0)

    # Predict (verbose=0 avoids one progress bar per row)
    predicted_imf1 = imf1_model.predict(window_data, verbose=0).flatten()[0]
    predicted_imf2 = imf2_model.predict(window_data, verbose=0).flatten()[0]

    # Update DataFrame
    data.at[target_idx, 'Brent_IMF1'] = predicted_imf1
    data.at[target_idx, 'Brent_IMF2'] = predicted_imf2

    # Residual is whatever the two predicted IMFs do not explain
    predicted_residual = data.at[target_idx, 'Brent'] - (predicted_imf1 + predicted_imf2)
    data.at[target_idx, 'Brent_Res'] = predicted_residual + RESIDUAL_SHIFT_CORRECTION

# Save the updated DataFrame back to Excel
data.to_excel('./Datasets/CEEMDAN Decomposed Data.xlsx', index=False)

print("Predictions integrated and Excel file updated.")


Predictions integrated and Excel file updated.
