In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Tuple, Any
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score as accuracy, f1_score
from sklearn.preprocessing import scale
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from pathlib2 import Path
from os.path import join
import os
import re
import pickle
import yaml

**Pre-Processing Functions**

In [2]:
from sklearn.preprocessing import scale
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

#IMPORT FROM dataset.py CNNpred-pytorch
class WholeDataset(Dataset):

    def __init__(self, data, target):
        self.data = data
        self.target = target


    def __len__(self):
        return self.target.shape[0]

    def __getitem__(self, idx):
        #print("Accessing data at index", idx, "with shape:", self.data[idx].shape)
        return self.data[idx], self.target[idx]

#IMPORT FROM dataset.py CNNpred-pytorch
def generate_batches(dataset, #ONLY GENERATES TWO BATCHES
                     batch_size,
                     shuffle=True,
                     drop_last=False,
                     device="cpu",
                     n_workers=0):
    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=shuffle,
                            drop_last=drop_last,
                            num_workers=n_workers,
                            pin_memory=False)

    for data, labels in dataloader:
        #data = torch.unsqueeze(data, 1).float()
        data = data.float()
        labels = labels.float()
        yield data.to(device, non_blocking=True), labels.to(device, non_blocking=True)


#Personally I think this should be applied to the data before it is split into
#train, test, and val, but following CNNpred code, the test set losses seq_len
#points with this method
def create_windows(X, y, seq_len):
  series = []
  target = []
  for i in range(len(X) - seq_len + 1):
    series.append(X[i: i + seq_len])
    target.append(y[i + seq_len - 1])

  return np.array(series), np.array(target)


def scale_X(X_train, X_val, X_test):
  # Shapes of the arrays
  num_samples_train, seq_length, num_stocks, num_features = X_train.shape
  num_samples_val = X_val.shape[0]
  num_samples_test = X_test.shape[0]

  # Initialize a scaler for each feature
  scalers = [StandardScaler() for _ in range(num_features)]

  # Scale training data and prepare to scale validation and test data
  X_train_scaled = np.empty_like(X_train)
  X_val_scaled = np.empty_like(X_val)
  X_test_scaled = np.empty_like(X_test)

  # Scale each feature
  for i in range(num_features):
      # Extract the feature across all training data
      feature_data_train = X_train[:, :, :, i].reshape(num_samples_train, seq_length * num_stocks)
      
      # Fit and transform training data
      scalers[i].fit(feature_data_train)  # Fit only on training data
      scaled_feature_data_train = scalers[i].transform(feature_data_train)
      
      # Transform validation data
      feature_data_val = X_val[:, :, :, i].reshape(num_samples_val, seq_length * num_stocks)
      scaled_feature_data_val = scalers[i].transform(feature_data_val)
      
      # Transform test data
      feature_data_test = X_test[:, :, :, i].reshape(num_samples_test, seq_length * num_stocks)
      scaled_feature_data_test = scalers[i].transform(feature_data_test)
      
      # Reshape scaled data back and store it in the correct position
      X_train_scaled[:, :, :, i] = scaled_feature_data_train.reshape(num_samples_train, seq_length, num_stocks)
      X_val_scaled[:, :, :, i] = scaled_feature_data_val.reshape(num_samples_val, seq_length, num_stocks)
      X_test_scaled[:, :, :, i] = scaled_feature_data_test.reshape(num_samples_test, seq_length, num_stocks)

  return X_train_scaled, X_val_scaled, X_test_scaled
  # Now X_train_scaled, X_val_scaled, and X_test_scaled contain the appropriately scaled data


def preprocess(data: dict,
               seq_len: int,
               Val_first: bool,
               Trim_end: str,
               Split_Date1: str,
               Split_Date2: str,
               n_day_predict: int = 1,) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:


  X_all = []
  y_all = {}

  for index, df in data.items():

    #Ensure it is sorted
    df = df.sort_index()

    #Get y
    y_i = (df['close'][n_day_predict:] / df['close'][:-n_day_predict].values).astype(int)
    y_i = y_i[:-n_day_predict]


    #touch up data
    X_i = df.fillna(0)
    X_i = X_i[:-n_day_predict]
    X_i = X_i[n_day_predict:]

    X_all.append(X_i) 
    y_all[index] = y_i

    #Get Trim index
    Trim_index   = df.index.get_loc(Trim_end)
    Split_index1 = df.index.get_loc(Split_Date1)
    Split_index2 = df.index.get_loc(Split_Date2)
    



    
  y = {}
    
  for index, df in data.items():
    X, y[index] = create_windows(np.transpose(np.array(X_all), (1, 0, 2)), np.array(y_all[index]).flatten(), seq_len)
  
  y_train = {}

  y_test = {}

  y_val = {}

  TESTAMOUNT = 365 #THIS IS HARD CODED AS ****, the 30 is HARD CODED
  if Val_first: #Puts the validation first
     X_val = X[Trim_index:Split_index1]
     X_train = X[Split_index1:Split_index2]
     X_test = X[Split_index2:Split_index2+TESTAMOUNT]
     for index, _ in data.items():
        y_val[index]   = y[index][Trim_index:Split_index1]
        y_train[index] = y[index][Split_index1:Split_index2]
        y_test[index]  = y[index][Split_index2:Split_index2+TESTAMOUNT] 
  else:
    X_train = X[Trim_index:Split_index1]
    X_val = X[Split_index1:Split_index2]
    X_test = X[Split_index2:Split_index2+TESTAMOUNT]
    for index, _ in data.items():
      y_train[index]   = y[index][Trim_index:Split_index1]
      y_val[index] = y[index][Split_index1:Split_index2]
      y_test[index]  = y[index][Split_index2:Split_index2+TESTAMOUNT]
  

  #Scaling
  X_train, X_val, X_test = scale_X(X_train, X_val, X_test)
  

  return X_train, y_train, X_test, y_test, X_val, y_val


**Utility Functions** Might combine with preprocessing

In [3]:
#Use for deriving next model filename
def next_file(file, path):
  #Ensure dicectory exists
  os.makedirs(path, exist_ok=True)

  #Get root and extension
  filename, ext = os.path.splitext(file)

  #Get list of files
  files = os.listdir(path)
  num = []
  for file in files:
    if file.startswith(filename):
      num.append(int(re.findall(r'\d+', file.split('-')[-1])[0]))

  #Find next iteration number
  file_iteration_number = max(num) + 1 if len(num) else 1
  return join(path, filename + "-" + str(file_iteration_number) + ext)

**Engine Code**

In [4]:
#Main.py script
#Define Model and preprocessing params
config_path = "./Configs/year5_CNNpred.yaml" #TODO: Define config file, this will be done in command line
with open(config_path, "r") as file:
  config = yaml.safe_load(file)

with open(config['preprocess']['dataset_path'], 'rb') as file:
  data = pickle.load(file)


preparams = {
    "data": data,
    "seq_len": config['preprocess']['seq_len'],
    "Val_first": bool(config['preprocess']['Val_first']),
    "Trim_end": str(config['preprocess']['Trim_end']),
    "Split_Date1": str(config['preprocess']['Split_Date1']),
    "Split_Date2": str(config['preprocess']['Split_Date2']),
    "n_day_predict": config['preprocess']['n_day_predict'],
}


In [5]:
data["^DJI"].index.get_loc("2022-01-03")

5536

In [6]:
#Main.py script
#preprocess data

X_train, y_train, X_test, y_test, X_val, y_val = preprocess(**preparams)


#Dict of Datasets
train_data = {}
val_data = {}
test_data = {}

#Dict of batch generaters
train_dataloader = {}

#Create wholedatasets in dict for each stock
#Create batch generator for training


X_val = X_val.transpose(0, 3, 1, 2) #TEMP
X_test = X_test.transpose(0, 3, 1, 2) #TEMP

X_train = X_train.transpose(0, 3, 1, 2) #TEMP
for index, y_train_i in y_train.items():
  train_data[index] = WholeDataset(X_train, y_train_i)
  # train_dataloader[index] = generate_batches(train_data[index], config['train']['batch_size'], config['train']['num_workers']) #TODO: Replace 128 and 0 with .yaml settings
  #Train_dataloader moved to training loop, as it needs to be re init every time

for index, y_val_i in y_val.items():

  val_data[index] = WholeDataset(X_val, y_val_i)

for index, y_test_i in y_test.items():
  test_data[index] = WholeDataset(X_test, y_test_i)

In [7]:
class CNNModelOne(nn.Module):
    def __init__(self, number_filter, number_of_stocks, seq_len, number_feature, drop, calculated_fc_layer_size):
        super(CNNModelOne, self).__init__()

        # Layer 1
        #self.conv1 = nn.Conv2d(in_channels=number_feature, out_channels=number_filter[0], kernel_size=(1, 1))
        self.conv1 = nn.Conv2d(in_channels=82, out_channels=8, kernel_size=(1, 1))
        # Layer 2
        #self.conv2 = nn.Conv2d(in_channels=number_filter[0], out_channels=number_filter[1], kernel_size=(1, 3))

        self.conv2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=(3, 5))
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 1))

        # Layer 3
        self.conv3 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=(3, 1))
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 1))

        # Flatten and Dropout
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(drop)

        # Output layer
        self.fc = nn.Linear(calculated_fc_layer_size, 1)  # Calculate based on output size after last pooling

    def forward(self, x):
        """
        torch.Size([128, 82, 60, 5])
        torch.Size([128, 8, 60, 5])
        torch.Size([128, 8, 58, 1])
        torch.Size([128, 8, 29, 1])
        torch.Size([128, 8, 27, 1])
        torch.Size([128, 8, 13, 1])
        torch.Size([128, 104])
        torch.Size([128, 104])
        """
        #print("-----------------x-x-xxx--------------")
        #print(x.shape) # torch.Size([128, 82, 60, 5])
        x = F.relu(self.conv1(x))
        #print(x.shape) # torch.Size([128, 8, 60, 5])
        x = F.relu(self.conv2(x))
        #print(x.shape) # torch.Size([128, 8, 58, 1])
        x = self.pool1(x)
        #print(x.shape) # torch.Size([128, 8, 29, 1])
        x = F.relu(self.conv3(x))
        #print(x.shape) # torch.Size([128, 8, 27, 1])
        x = self.pool2(x)
        #print(x.shape) # torch.Size([128, 8, 13, 1])
        x = self.flatten(x)
        #print(x.shape) # torch.Size([128, 104])
        x = self.dropout(x)
        #print(x.shape) # torch.Size([128, 104])
        x = torch.sigmoid(self.fc(x))
        return x

In [8]:
class CNNModelTwo(nn.Module):
    def __init__(self, number_filter, number_of_stocks, seq_len, number_feature, drop, calculated_fc_layer_size):
        super(CNNModelTwo, self).__init__()

        # Layer 1
        #self.conv1 = nn.Conv2d(in_channels=number_feature, out_channels=number_filter[0], kernel_size=(1, 1))
        self.conv1 = nn.Conv2d(in_channels=82, out_channels=8, kernel_size=(1, 1))
        # Layer 2
        #self.conv2 = nn.Conv2d(in_channels=number_filter[0], out_channels=number_filter[1], kernel_size=(1, 3))

        self.conv2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=(3, 5))
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 1))

        # Layer 3
        self.conv3 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=(3, 1))
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 1))

        # Flatten and Dropout
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(drop)

         # Additional Fully Connected Layer
        self.fc1 = nn.Linear(calculated_fc_layer_size, 52)  # New intermediate layer with 52 neurons

        # Output layer
        self.fc2 = nn.Linear(52, 1)  # Output layer after the new FC layer

    def forward(self, x):
        """
        torch.Size([128, 82, 60, 5])
        torch.Size([128, 8, 60, 5])
        torch.Size([128, 8, 58, 1])
        torch.Size([128, 8, 29, 1])
        torch.Size([128, 8, 27, 1])
        torch.Size([128, 8, 13, 1])
        torch.Size([128, 104])
        torch.Size([128, 104])
        """
        #print("-----------------x-x-xxx--------------")
        #print(x.shape) # torch.Size([128, 82, 60, 5])
        x = F.relu(self.conv1(x))
        #print(x.shape) # torch.Size([128, 8, 60, 5])
        x = F.relu(self.conv2(x))
        #print(x.shape) # torch.Size([128, 8, 58, 1])
        x = self.pool1(x)
        #print(x.shape) # torch.Size([128, 8, 29, 1])
        x = F.relu(self.conv3(x))
        #print(x.shape) # torch.Size([128, 8, 27, 1])
        x = self.pool2(x)
        #print(x.shape) # torch.Size([128, 8, 13, 1])
        x = self.flatten(x)
        #print(x.shape) # torch.Size([128, 104])
        x = self.dropout(x)
        #print(x.shape) # torch.Size([128, 104])
        x = F.relu(self.fc1(x))  # Activation for intermediate layer
        x = torch.sigmoid(self.fc2(x))  # Final output with sigmoid activation
        return x

In [9]:
from torch.optim.lr_scheduler import ReduceLROnPlateau




#Train.py script
def validate(config, model, dataset):
    model.eval()
    loss_fcn = torch.nn.BCELoss().to(device)

    data_dataloader = generate_batches(dataset, config['train']['batch_size'], config['train']['num_workers'])

    loss_list = []
    pred_list = []
    label_list = []
    with torch.no_grad():
        for batch_data, batch_label in data_dataloader:

            batch_data = batch_data.to(device)
            batch_label = batch_label.to(device)

            
            batch_logit = model(batch_data).view(-1)

            loss = loss_fcn(batch_logit, batch_label)

            pred = (batch_logit > 0.5).int()

            pred_list.extend(pred.cpu().numpy())
            label_list.extend(batch_label.cpu().numpy())

            loss_list.append(loss.item())

        loss_data = np.array(loss_list).mean()
        acc = accuracy(pred_list, label_list)
        f1 = f1_score(pred_list, label_list, average='macro')

    return loss_data, acc, f1,
def train(config, train_data, val_dataset, test_dataset, filepath): #Test_dataset SHOULD NOT BE A PARAMETER!!!!!
  #Init Model
  #model_class = getattr(models, config['model']['type'])
  #model = model_class(**config['model']['params'])

  ####DEBUG DEBUG DEBUG
  model = CNNModelOne(number_filter=[8,8,8], number_of_stocks=5, seq_len=60, number_feature=82, drop=0.1, calculated_fc_layer_size =104)
  model = model.to(device)
  # print("-------Model----------")
  # print(model)
  # print(model.conv3.kernel_size)
  # print(model.conv3.padding)
  # print("-------Model----------")

  #Init loss function
  #loss_fcn = torch.nn.BCELoss()
  #loss_class = getattr(nn, config['loss_function']['type'])
  #loss_fcn = loss_class(**config['loss_function']['params']).to(device)
  loss_fcn = nn.BCELoss().to(device)


  #Init Optimizer
  #optimizer_class = getattr(optim, config['optimizer']['type'])
  #optimizer = optimizer_class(model.parameters(), **{k: v for k, v in config['optimizer'].items() if k != 'type'})


  ###DEBUG DEBUG DEBUG
  optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)




  #Init Scheduler
  #scheduler_class = getattr(optim.lr_scheduler, config['scheduler']['type'])
  #scheduler = scheduler_class(optimizer, **{k: v for k, v in config['scheduler'].items() if k != 'type'})

  ###DEBUG DEBUG DEBUG
  #scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20, verbose=True, eps=1e-8)
  for epoch in range(config['train']['epoch']):
    model.train()
    loss_list = []
    pred_list = []
    label_list = []
    #train_dataloader
    train_dataloader = generate_batches(train_data, config['train']['batch_size'], config['train']['num_workers'])

    for batch_data, batch_label in train_dataloader:
      # print("-------bd----------")
      # print(batch_data.shape)
      # print("-------------------")
      batch_data = batch_data.to(device) #
      batch_label = batch_label.to(device) #

      optimizer.zero_grad()
      batch_logit = model(batch_data).view(-1)
      loss = loss_fcn(batch_logit, batch_label)
      loss.backward()
      optimizer.step()
      # print("Batch_logit")
      # print(batch_logit)

      #Convert to CPU for processing
      batch_data = batch_data.cpu()
      batch_label = batch_label.cpu()


      pred = (batch_logit > 0.5).int()
      # print("Pred")
      # print(pred)

      pred_list.extend(pred.cpu().numpy())
      label_list.extend(batch_label.cpu().numpy())

      
      parameters = list(model.parameters())
      loss_list.append(loss.item())

    loss_data = np.array(loss_list).mean()

    train_acc = accuracy(pred_list, label_list)
    # print("-------acc----------")
    # print(train_acc)
    # print(pred_list)
    # print(label_list)
    # print("-------acc----------")

    train_f1 = f1_score(pred_list, label_list, average='macro')

    print("Epoch {:05d}\n"
          "Train: loss: {:.4f} | accuracy: {:.4f} | f-acore: {:.4f}"
          .format(epoch + 1, loss_data, train_acc, train_f1))
    #loss_data = float(loss_data.item())

    #loss_data = torch.tensor(loss_data)
    
    #scheduler.step(loss_data) SCHEDULER REMOVE FOR NOW

    ###TEMP TEMP
    test_loss, test_acc, test_f1 = validate(config, model, test_dataset)
    print("Test:  loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
          .format(test_loss, test_acc, test_f1))

    ###TEMP TEMP
    
    val_loss, val_acc, val_f1 = validate(config, model, val_dataset)
    print("Validation:  loss: {:.4f} | accuracy: {:.4f} | f1: {:.4f}"
          .format(val_loss, val_acc, val_f1))
      # choosing best model according to best validation accuracy
    best_f1 = 0
    if best_f1 < val_f1:
        best_f1 = val_f1
        torch.save(model, filepath)

      # else:
      #     cur_step += 1
      #     if cur_step == config['train']['patience']:
      #         break

In [10]:
#Main.py script
#Training loop
import models #TODO: Put this at top of imports

#Set up GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU instead.")

gen_num = 1 #TODO: Define this in the Config file
for iteration in range(gen_num):


  for index, _ in y_train.items():
    #Get Model output path
    print("-----------------------------------------------------------------------------------------")
    print(index)
    print("-----------------------------------------------------------------------------------------")
    model_path = next_file(config['model']['type'] + index[1:], join("./Results/", os.path.basename(config_path)))
    # for b, a, in train_dataloader[index]:
    #   print("opppppppppppppppppppppppppppppppppppppppppppppppppppppppp")
    train(config, train_data[index], val_data[index], test_data[index], model_path)


Using GPU: NVIDIA GeForce RTX 3070
-----------------------------------------------------------------------------------------
^DJI
-----------------------------------------------------------------------------------------
Epoch 00001
Train: loss: 0.6919 | accuracy: 0.5369 | f-acore: 0.3649
Test:  loss: 0.6913 | accuracy: 0.5315 | f1: 0.3470
Validation:  loss: 0.6951 | accuracy: 0.4881 | f1: 0.3280
Epoch 00002
Train: loss: 0.6902 | accuracy: 0.5392 | f-acore: 0.3503
Test:  loss: 0.6908 | accuracy: 0.5315 | f1: 0.3470
Validation:  loss: 0.6952 | accuracy: 0.4881 | f1: 0.3280
Epoch 00003
Train: loss: 0.6926 | accuracy: 0.5392 | f-acore: 0.3503
Test:  loss: 0.6906 | accuracy: 0.5315 | f1: 0.3470
Validation:  loss: 0.6951 | accuracy: 0.4881 | f1: 0.3280
Epoch 00004
Train: loss: 0.6872 | accuracy: 0.5374 | f-acore: 0.3504
Test:  loss: 0.6911 | accuracy: 0.5315 | f1: 0.3470
Validation:  loss: 0.6957 | accuracy: 0.4881 | f1: 0.3280
Epoch 00005
Train: loss: 0.6880 | accuracy: 0.5387 | f-acore: 0.