# Define the level 1 models

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import data_prep

# Hyperparameters
num_features = 9  # Number of features
use_time_horizon = False

# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self):
        super(LSTMModel, self).__init__()
        self.lstm1 = nn.LSTM(num_features, 50, batch_first=True)
        self.dropout1 = nn.Dropout(0.4)
        self.lstm2 = nn.LSTM(50, 50, batch_first=True)
        self.dropout2 = nn.Dropout(0.4)
        self.lstm3 = nn.LSTM(50, 50, batch_first=True)
        self.dropout3 = nn.Dropout(0.4)
        self.lstm4 = nn.LSTM(50, 50, batch_first=True)  
        self.dropout4 = nn.Dropout(0.4)
        self.fc = nn.Linear(50, 1) 
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        if use_time_horizon:
            outputs = []
            # For each sequence in the batch
            for i in range(x.shape[0]):
                x_ele = x[i]
                # remove padding
                x_ele = x_ele[x_ele[:, 0] != -1]
                # print("before", x_ele.shape)
                x_ele = x_ele.unsqueeze(0) # Add a batch dimension
                # print("after", x_ele.shape)

                out, _ = self.lstm1(x_ele)  # process single sequence
                out = self.dropout1(out)
                out, _ = self.lstm2(out)
                out = self.dropout2(out)
                out, _ = self.lstm3(out)
                out = self.dropout3(out)
                out, _ = self.lstm4(out)
                out = out[:, -1, :]  # take the last output from the last LSTM layer
                out = self.dropout4(out)
                out = self.fc(out)
                out = self.sigmoid(out)
                outputs.append(out)

            outputs = torch.cat(outputs, dim=0)  # recombine into a single batch tensor
            return outputs
        else:
            # print("x shape", x.shape)
            x, _ = self.lstm1(x)
            x = self.dropout1(x)
            x, _ = self.lstm2(x)
            x = self.dropout2(x)
            x, _ = self.lstm3(x)
            x = self.dropout3(x)
            x, _ = self.lstm4(x)
            x = x[:, -1, :]  # Take the last output from the last LSTM layer
            x = self.dropout4(x)
            x = self.fc(x)   # Linear layer to map to 1 output
            x = self.sigmoid(x)
            return x
    # def forward(self, x):
    #     x, _ = self.lstm1(x)
    #     x = self.dropout1(x)
    #     x, _ = self.lstm2(x)
    #     x = self.dropout2(x)
    #     x, _ = self.lstm3(x)
    #     x = self.dropout3(x)
    #     x, _ = self.lstm4(x)
    #     x = x[:, -1, :]  # Take the last output from the last LSTM layer
    #     x = self.dropout4(x)
    #     x = self.fc(x)   # Linear layer to map to 1 output
    #     x = self.sigmoid(x)
    #     return x

# GRU Model
class GRUModel(nn.Module):
    def __init__(self):
        super(GRUModel, self).__init__()
        self.gru1 = nn.GRU(num_features, 50, batch_first=True)
        self.dropout1 = nn.Dropout(0.4)
        self.gru2 = nn.GRU(50, 50, batch_first=True)
        self.dropout2 = nn.Dropout(0.4)
        self.gru3 = nn.GRU(50, 50, batch_first=True)
        self.dropout3 = nn.Dropout(0.4)
        self.gru4 = nn.GRU(50, 50, batch_first=True)
        self.dropout4 = nn.Dropout(0.4)
        self.fc = nn.Linear(50, 1)  # Ensures the output is of size [batch_size, 1]
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        if use_time_horizon:

            outputs = []
            # For each sequence in the batch
            for i in range(x.shape[0]):
                x_ele = x[i]
                # remove padding
                x_ele = x_ele[x_ele[:, 0] != -1]
                x_ele = x_ele.unsqueeze(0)
                # print(x_ele.shape)

                # Pass through the GRU layers
                x_ele, _ = self.gru1(x_ele)
                x_ele = self.dropout1(x_ele)
                x_ele, _ = self.gru2(x_ele)
                x_ele = self.dropout2(x_ele)
                x_ele, _ = self.gru3(x_ele)
                x_ele = self.dropout3(x_ele)
                x_ele, _ = self.gru4(x_ele)
                x_ele = x_ele[:, -1, :]  # Take the last output
                x_ele = self.dropout4(x_ele)
                x_ele = self.fc(x_ele)   # Linear layer to map to 1 output
                x_ele = self.sigmoid(x_ele)
                outputs.append(x_ele)

            outputs = torch.cat(outputs, dim=0)  # recombine into a single batch tensor
            return outputs
        else:
            x, _ = self.gru1(x)
            x = self.dropout1(x)
            x, _ = self.gru2(x)
            x = self.dropout2(x)
            x, _ = self.gru3(x)
            x = self.dropout3(x)
            x, _ = self.gru4(x)
            x = x[:, -1, :]  # Take the last output
            x = self.dropout4(x)
            x = self.fc(x)   # Linear layer to map to 1 output
            x = self.sigmoid(x)
            return x
    # def forward(self, x):
    #     x, _ = self.gru1(x)
    #     x = self.dropout1(x)
    #     x, _ = self.gru2(x)
    #     x = self.dropout2(x)
    #     x, _ = self.gru3(x)
    #     x = self.dropout3(x)
    #     x, _ = self.gru4(x)
    #     x = x[:, -1, :]  # Take the last output
    #     x = self.dropout4(x)
    #     x = self.fc(x)   # Linear layer to map to 1 output
    #     x = self.sigmoid(x)
    #     return x

# Define the meta learner

In [30]:
# it's a fully-connect neuralnetwork with three layers; the activation function for this model is the Rectified Linear Unit (ReLu).
# NOTE: The paper doesn't specify the number of neurons in the hidden layers, so I'm basing on the stanford paper
class MetaLearner(nn.Module):
    def __init__(self):
        super(MetaLearner, self).__init__()
        self.fc1 = nn.Linear(2, 2)
        self.fc12 = nn.Linear(2, 1, bias=False)
        # self.fc2 = nn.Linear(30, 25)
        # self.fc3 = nn.Linear(25, 20)
        # self.fc4 = nn.Linear(20, 1)
        self.sigmoid = nn.Sigmoid() 
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc12(x)
        # x = self.relu(self.fc2(x))
        # x = self.relu(self.fc3(x))
        # x = self.fc4(x)
        x = self.sigmoid(x) 
        return x

# Load in data

In [31]:
import pandas as pd
import numpy as np
from os.path import join
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, TensorDataset

def get_data_stanford():
    DATA_PATH = join('data', 'original_dataset', 'Finalised_datasets', 'amzn_all_sources_WITH_TH_2017-2020.csv') #'Finalised_datasets',

    HORIZON = 10
    DAYS_FORWARD = 1
    END_SPLIT = 30

    # To drop columns, change the data_prep.py file

    # split_y: tuple
    #     (train_y, validate_y, test_y)
    # split_X: tuple
    #     (train_X, validate_X, test_X)
    split_y, split_X = data_prep.data_prep(DATA_PATH, HORIZON, DAYS_FORWARD, END_SPLIT, use_time_horizon)


    return split_X, split_y



(x_train, x_val, x_test), (y_train, y_val, y_test) = get_data_stanford()

print(x_train[0])

print('x_train.shape', x_train.shape)
print('y_train.shape', y_train.shape)
print('x_val.shape', x_val.shape)
print('y_val.shape', y_val.shape)
print('x_test.shape', x_test.shape)
print('y_test.shape', y_test.shape)




504
[[0.42741352 0.46966103 0.47956216 0.40526962 0.16566661 0.
  0.00377905 0.00222886 0.00677276]
 [0.42741352 0.46966103 0.47956216 0.6621269  0.27674776 0.01752055
  0.00928211 0.01029968 0.01237452]
 [0.42741352 0.31750959 0.47956216 0.38826329 0.14672822 0.0270586
  0.02301681 0.02388418 0.0255065 ]
 [0.42741352 0.46966103 0.47956216 0.38826329 0.12117877 0.02909422
  0.02095032 0.02180779 0.0218333 ]
 [0.42741352 0.46966103 0.47956216 0.08273885 0.10579798 0.03063822
  0.0187012  0.01812422 0.01062965]
 [0.42741352 0.46966103 0.47956216 0.44420338 0.08818037 0.01394093
  0.00519478 0.00831723 0.00734663]
 [0.42741352 0.46966103 0.47956216 0.28764027 0.04997692 0.00957227
  0.         0.00854003 0.        ]
 [0.42741352 0.46966103 0.47956216 0.38826329 0.07980827 0.00531793
  0.0042243  0.         0.00964248]
 [0.42741352 0.46966103 0.47956216 0.38826329 0.07000888 0.01852691
  0.01446545 0.0176667  0.01595592]
 [0.42741352 0.46966103 0.47956216 0.44420338 0.06819812 0.02892268
 

# Instatiate the models

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Instantiate models
lstm_model = LSTMModel()
gru_model = GRUModel()
meta_model = MetaLearner()

# Define loss and optimizer
criterion = nn.BCELoss()
lstm_optimizer = optim.RMSprop(lstm_model.parameters(), lr=0.0016, weight_decay=1e-5) # 16 batch size, 150 epochs
gru_optimizer = optim.RMSprop(gru_model.parameters(), lr=0.0008, weight_decay=1e-5) # 16 batch size, 200 epochs
base_models_batch_size = 16
# meta_optimizer = optim.Adam(meta_model.parameters(), lr=0.001) # 100 epochs, 8 batch size
meta_learner_batch_size = 8

# Train the base models

In [33]:
import copy

# Convert data to PyTorch tensors and create DataLoader
X_train_tensor = torch.tensor(x_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

X_val_tensor = torch.tensor(x_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

print(X_train_tensor.shape)
print(y_train_tensor.shape)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=base_models_batch_size, shuffle=True) #Stanford had shuffle true

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=base_models_batch_size) 

import torch

def train_model(model, optimizer, criterion, train_loader, n_epochs=150, val_loader=None, return_lowest_val_loss=False):
    model.train()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_val_loss = float('inf')
    best_val_accuracy = 0

    for epoch in range(n_epochs):
        epoch_loss = 0
        correct_train = 0
        total_train = 0
        
        # Training phase
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch.view(-1, 1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

            # Calculate training accuracy
            predicted_train = output.round()
            total_train += y_batch.size(0)
            correct_train += (predicted_train == y_batch.view(-1, 1)).sum().item()

        # Output training loss and accuracy
        train_loss_avg = epoch_loss / len(train_loader)
        train_accuracy = correct_train / total_train
        print(f'Epoch {epoch+1}/{n_epochs}, Training Loss: {train_loss_avg:.4f}, Training Accuracy: {train_accuracy:.2f}', end='')

        # Validation phase (if val_loader is provided)
        if val_loader:
            model.eval()
            val_loss = 0
            correct_val = 0
            total_val = 0
            with torch.no_grad():
                for X_val, y_val in val_loader:
                    output_val = model(X_val)
                    val_loss += criterion(output_val, y_val.view(-1, 1)).item()

                    predicted_val = output_val.round()
                    total_val += y_val.size(0)
                    correct_val += (predicted_val == y_val.view(-1, 1)).sum().item()

            val_loss_avg = val_loss / len(val_loader)
            val_accuracy = correct_val / total_val
            print(f', Validation Loss: {val_loss_avg:.4f}, Validation Accuracy: {val_accuracy:.2f}')
            
            if val_accuracy > best_val_accuracy - 1e-2:
                best_val_loss = val_loss_avg
                best_val_accuracy = val_accuracy
                best_model_wts = copy.deepcopy(model.state_dict())
            # # Check if this is the best model so far
            # if val_loss_avg < best_val_loss + 1e-2 : # Add 1e-2 as a "buffer" to favor the latest model
            #     best_val_loss = val_loss_avg
            #     best_val_accuracy = val_accuracy
            #     best_model_wts = copy.deepcopy(model.state_dict())

            model.train()

        else:
            print()  # Just move to the next line

    # Load the best model weights
    if return_lowest_val_loss and val_loader != None:
        print(f'Lowest validation loss: {best_val_loss:.4f}')
        print(f'Best validation accuracy: {best_val_accuracy:.2f}')
        model.load_state_dict(best_model_wts)
    return model



# Train the LSTM model
print("Training LSTM Model")
train_model(lstm_model, lstm_optimizer, criterion, train_loader, 150, val_loader, False)

# Train the GRU model
print("Training GRU Model")
train_model(gru_model, gru_optimizer, criterion, train_loader, 100, val_loader, False) # 200

torch.Size([444, 10, 9])
torch.Size([444])
Training LSTM Model
Epoch 1/150, Training Loss: 0.6903, Training Accuracy: 0.57, Validation Loss: 0.7044, Validation Accuracy: 0.47
Epoch 2/150, Training Loss: 0.6869, Training Accuracy: 0.57, Validation Loss: 0.7059, Validation Accuracy: 0.47
Epoch 3/150, Training Loss: 0.6872, Training Accuracy: 0.57, Validation Loss: 0.7074, Validation Accuracy: 0.47
Epoch 4/150, Training Loss: 0.6865, Training Accuracy: 0.57, Validation Loss: 0.7059, Validation Accuracy: 0.47
Epoch 5/150, Training Loss: 0.6874, Training Accuracy: 0.57, Validation Loss: 0.7016, Validation Accuracy: 0.47
Epoch 6/150, Training Loss: 0.6840, Training Accuracy: 0.57, Validation Loss: 0.7023, Validation Accuracy: 0.47
Epoch 7/150, Training Loss: 0.6835, Training Accuracy: 0.57, Validation Loss: 0.7074, Validation Accuracy: 0.47
Epoch 8/150, Training Loss: 0.6850, Training Accuracy: 0.57, Validation Loss: 0.7062, Validation Accuracy: 0.47
Epoch 9/150, Training Loss: 0.6831, Train

GRUModel(
  (gru1): GRU(9, 50, batch_first=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (gru2): GRU(50, 50, batch_first=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (gru3): GRU(50, 50, batch_first=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (gru4): GRU(50, 50, batch_first=True)
  (dropout4): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=50, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

# Use base models to predict the validation data, this will be used as input to the Meta Learner

In [34]:
lstm_val_predictions = lstm_model(torch.tensor(x_val, dtype=torch.float32)).detach().numpy().reshape(-1,1)
gru_val_predictions = gru_model(torch.tensor(x_val, dtype=torch.float32)).detach().numpy().reshape(-1,1)

# lstm_pred = lstm_model.predict(X).reshape(-1, 1)
# gru_pred = gru_model.predict(X).reshape(-1, 1)

# Form and return new data set
# new_X = np.hstack((lstm_pred, gru_pred))


# Combine predictions to form new training data for the meta-learner
meta_X_train = np.concatenate((lstm_val_predictions, gru_val_predictions), axis=1)#meta_X_train = np.hstack((lstm_val_predictions, gru_val_predictions))#

print(meta_X_train.shape)



(30, 2)


# Train meta learner

In [77]:
meta_model = MetaLearner()
meta_criterion = nn.BCELoss()
meta_optimizer = optim.Adam(meta_model.parameters(), eps=1e-7) #default keras LR

meta_X_train_tensor = torch.tensor(meta_X_train, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

meta_train_dataset = TensorDataset(meta_X_train_tensor, y_val_tensor)
meta_train_loader = DataLoader(meta_train_dataset, batch_size=8, shuffle=True)



train_model(meta_model, meta_optimizer, meta_criterion, meta_train_loader, 100, return_lowest_val_loss=False)

Epoch 1/100, Training Loss: 0.7002, Training Accuracy: 0.47
Epoch 2/100, Training Loss: 0.7000, Training Accuracy: 0.47
Epoch 3/100, Training Loss: 0.7026, Training Accuracy: 0.47
Epoch 4/100, Training Loss: 0.7006, Training Accuracy: 0.47
Epoch 5/100, Training Loss: 0.7012, Training Accuracy: 0.47
Epoch 6/100, Training Loss: 0.7027, Training Accuracy: 0.47
Epoch 7/100, Training Loss: 0.7013, Training Accuracy: 0.47
Epoch 8/100, Training Loss: 0.6981, Training Accuracy: 0.47
Epoch 9/100, Training Loss: 0.7023, Training Accuracy: 0.47
Epoch 10/100, Training Loss: 0.6990, Training Accuracy: 0.47
Epoch 11/100, Training Loss: 0.7025, Training Accuracy: 0.47
Epoch 12/100, Training Loss: 0.7013, Training Accuracy: 0.47
Epoch 13/100, Training Loss: 0.7023, Training Accuracy: 0.47
Epoch 14/100, Training Loss: 0.7008, Training Accuracy: 0.47
Epoch 15/100, Training Loss: 0.7002, Training Accuracy: 0.47
Epoch 16/100, Training Loss: 0.6987, Training Accuracy: 0.47
Epoch 17/100, Training Loss: 0.70

MetaLearner(
  (fc1): Linear(in_features=2, out_features=2, bias=True)
  (fc12): Linear(in_features=2, out_features=1, bias=False)
  (sigmoid): Sigmoid()
  (relu): ReLU()
)

In [78]:
from sklearn.metrics import precision_recall_fscore_support
#  the test dataset will be input into the sub-models again to produce intermediate test data for the meta-learner. Afterward, the meta-learner will use the intermediate test predictions from the sub-models to make the final predictions.
lstm_test_predictions = lstm_model(torch.tensor(x_test, dtype=torch.float32)).detach().numpy()
gru_test_predictions = gru_model(torch.tensor(x_test, dtype=torch.float32)).detach().numpy()

meta_X_test = np.concatenate((lstm_test_predictions, gru_test_predictions), axis=1)
meta_X_test_tensor = torch.tensor(meta_X_test, dtype=torch.float32)

meta_test_predictions = meta_model(meta_X_test_tensor).detach().numpy()
print("meta predictions", meta_test_predictions)
# Evaluation metrics
meta_test_predictions = np.round(meta_test_predictions+0.00001)
accuracy = np.mean(meta_test_predictions == y_test)
print(f'Accuracy: {accuracy}')

precision, recall, f1, _ = precision_recall_fscore_support(y_test, meta_test_predictions, average='binary')
print(f'Precision: {precision}, Recall: {recall}, F1 Score: {f1}')



print("lstm predictions", lstm_test_predictions)
lstm_test_predictions = np.round(lstm_test_predictions)
accuracy = np.mean(lstm_test_predictions == y_test)
print(f'Accuracy for lstm: {accuracy}')

precision, recall, f1, _ = precision_recall_fscore_support(y_test, lstm_test_predictions, average='binary')
print(f'Precision for lstm: {precision}, Recall for lstm: {recall}, F1 Score for lstm: {f1}')

print("gru predictions", gru_test_predictions)
gru_test_predictions = np.round(gru_test_predictions)
accuracy = np.mean(gru_test_predictions == y_test)
print(f'Accuracy for gru: {accuracy}')

precision, recall, f1, _ = precision_recall_fscore_support(y_test, gru_test_predictions, average='binary')
print(f'Precision for gru: {precision}, Recall for gru: {recall}, F1 Score for gru: {f1}')


meta predictions [[0.5       ]
 [0.50222516]
 [0.5066623 ]
 [0.51174223]
 [0.5126165 ]
 [0.51459974]
 [0.51609904]
 [0.515377  ]
 [0.51449925]
 [0.512618  ]
 [0.5080938 ]
 [0.50604945]
 [0.50675666]
 [0.5068851 ]
 [0.5100317 ]
 [0.5137889 ]
 [0.5161456 ]
 [0.51576746]
 [0.51385343]
 [0.5143592 ]
 [0.5140845 ]
 [0.5062542 ]
 [0.5075975 ]
 [0.50048846]
 [0.5       ]
 [0.5065622 ]
 [0.5126778 ]
 [0.51524544]
 [0.5152753 ]
 [0.5114242 ]]
Accuracy: 0.5666666666666667
Precision: 0.5666666666666667, Recall: 1.0, F1 Score: 0.723404255319149
lstm predictions [[0.10591886]
 [0.2633614 ]
 [0.41097564]
 [0.5718318 ]
 [0.604662  ]
 [0.6839272 ]
 [0.73475   ]
 [0.70819247]
 [0.6700898 ]
 [0.6075641 ]
 [0.4571608 ]
 [0.40764925]
 [0.44668877]
 [0.4488821 ]
 [0.5353973 ]
 [0.6561181 ]
 [0.7224857 ]
 [0.7057829 ]
 [0.6355047 ]
 [0.68173873]
 [0.68072987]
 [0.41635218]
 [0.47411394]
 [0.23846626]
 [0.15427297]
 [0.41769272]
 [0.61523044]
 [0.7032321 ]
 [0.70514786]
 [0.5975514 ]]
Accuracy for lstm: 0.51

In [38]:
#Save the models
save_path = join('models/')
# torch.save(lstm_model.state_dict(), save_path + 'lstm_model_amzn2.pth')
# torch.save(gru_model.state_dict(), save_path + 'gru_model_amzn2.pth')
# torch.save(meta_model.state_dict(), save_path + 'meta_model_amzn2.pth')