In [2]:
# ! pip install pandas joblib tomli torch torchvision torchaudio tqdm

In [46]:
import torch
import tomli
import joblib
import logging
import pandas as pd

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [38]:
# Load Config
CONFIG_FILE_PATH = "../config.tomli"

with open(CONFIG_FILE_PATH, 'rb') as config_file:
    config = tomli.load(config_file)

ETF_DATA_DRIVE_PATH = f"../{config['data']['etfs']}"
STOCK_DATA_DRIVE_PATH = f"../{config['data']['stocks']}"
PROCESSED_DATA_DRIVE_PATH = f"../{config['data']['processed']}"
ENGINEERED_DATA_DRIVE_PATH = f"../{config['data']['engineered']}"
SYMBOLS_FILE_PATH = f"../{config['data']['symbols']}"

DATASET_PATH = f"{PROCESSED_DATA_DRIVE_PATH}/dataset.parquet"
ENG_DATASET_PATH = f"{ENGINEERED_DATA_DRIVE_PATH}/dataset.parquet"

MODEL_DIR = f"../{config['model']['model_dir']}"

data_dtypes = config['etf_stock_data_type']
symbols_dtype = config['symbols_data_types']

date_format = config['format']['date_format']

rf_model_path = f"{MODEL_DIR}/{config['model']['rf_model']}"
dl_model_path = f"{MODEL_DIR}/{config['model']['dl_model']}"
dl_dataset_path = f"{MODEL_DIR}/{config['model']['dl_dataset_stats']}"

N_JOBS = config['random_forest']['n_jobs']
TEST_SIZE = config['random_forest']['test_size']
MAX_DEPTH = config['random_forest']['max_depth']
N_ESTIMATORS = config['random_forest']['n_estimators']
RANDOM_STATE = config['random_forest']['random_state']

EPOCHS = config['deep_learning']['epochs']
BATCH_SIZE = config['deep_learning']['batch_size']
TEST_SPLIT = config['deep_learning']['test_split']
RANDOM_STATE = config['deep_learning']['random_state']
LEARNING_RATE = config['deep_learning']['learning_rate']
NEGATIVE_SLOPE = config['deep_learning']['negative_slope']
HIDDEN_LAYER_SIZE = config['deep_learning']['hidden_layer_size']

In [12]:
# Create a logger object
logger = logging.getLogger('deep_learning')
logger.setLevel(logging.DEBUG)

# Create a file handler and set the logging level
file_handler = logging.FileHandler('../logs/deep_learning.log')
file_handler.setLevel(logging.DEBUG)

# Create a formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

# Add the file handler to the logger
logger.addHandler(file_handler)

In [39]:
# load dataset
data = pd.read_parquet(ENG_DATASET_PATH)
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

In [40]:
# Select features and target
features = ['vol_moving_avg', 'adj_close_rolling_med']
target = 'Volume'

# build x, y
X = data[features]
y = data[target]

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_STATE)

del data

In [41]:
# Creating a datasets
class DLDataset(Dataset):
    def __init__(self, X:pd.DataFrame, y:pd.DataFrame)->None:
        # normalize the data  
        X = (X - X.mean())/X.std()
        y = (y - y.mean())/y.std()

        # saving stats
        self.X_mean = X.mean()
        self.y_mean = y.mean()
        self.X_std = X.std()
        self.y_std = y.std()

        # convert to torch
        self.X = torch.tensor(X.to_numpy(), dtype=torch.float32)
        self.y = torch.tensor(y.to_numpy(), dtype=torch.float32)
        self.length = len(self.y)
    
    def __len__(self)->int:
        return self.length
    
    def __getitem__(self, index) -> tuple:
        return self.X[index], self.y[index]
    
train_dataset = DLDataset(X=X_train, y=y_train)
dl_dataset_stats = {
    'X_mean': train_dataset.X_mean,
    'y_mean': train_dataset.y_mean,
    'X_std': train_dataset.X_std,
    'y_std': train_dataset.y_std,
}
joblib.dump(dl_dataset_stats, dl_dataset_path)
test_dataset = DLDataset(X=X_test, y=y_test)

# Creating DataLoader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [42]:
# Create a Deep Learning Model
class DLModel(torch.nn.Module):
    
    def __init__(self, stats:dict=dl_dataset_stats):
        super(DLModel, self).__init__()
        self.hidden_layer_size = HIDDEN_LAYER_SIZE
        self.negative_slope = NEGATIVE_SLOPE
        self.stats = stats

        self.layers = torch.nn.ModuleList()

        self.layers.append(torch.nn.Linear(in_features=2, out_features=self.hidden_layer_size))
        self.layers.append(torch.nn.LeakyReLU(negative_slope=self.negative_slope))
        # self.layers.append(torch.nn.Linear(in_features=(self.hidden_layer_size * 1), out_features=(self.hidden_layer_size * 2)))
        # self.layers.append(torch.nn.LeakyReLU(negative_slope=self.negative_slope))
        # self.layers.append(torch.nn.Linear(in_features=(self.hidden_layer_size * 2), out_features=(self.hidden_layer_size * 1)))
        # self.layers.append(torch.nn.LeakyReLU(negative_slope=self.negative_slope))
        self.layers.append(torch.nn.Linear(in_features=self.hidden_layer_size, out_features=1))
        self.layers.append(torch.nn.LeakyReLU(negative_slope=self.negative_slope))
    
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
    
    def predict(self, vol_moving_avg:float, adj_close_rolling_med:float):
        self.eval()
        # standardizing the data
        vol_moving_avg = (vol_moving_avg - self.stats['X_mean']['vol_moving_avg'])/self.stats['X_std']['vol_moving_avg']
        adj_close_rolling_med = (adj_close_rolling_med - self.stats['X_mean']['adj_close_rolling_med'])/self.stats['X_std']['adj_close_rolling_med']
        # converting to tensor
        x = [vol_moving_avg, adj_close_rolling_med]
        x = torch.tensor(x, dtype=torch.float32)
        # make predictions
        y_pred = self.forward(x)
        # reverse the y standardization
        y_pred = (y_pred * self.stats['y_std']) + self.stats['y_mean']
        return y_pred

model = DLModel()
print(model)

# define your loss function
loss_fn = torch.nn.MSELoss()

# define your optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

DLModel(
  (layers): ModuleList(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
  )
)


In [43]:
# Train the model

logger.info(f"********************* Started training DL Model *********************************")
logger.info(model)
for epoch in range(EPOCHS):
    total_loss = 0.0
    for X, y in tqdm(train_loader, unit="batch"):
        optimizer.zero_grad()
        y_hat = model(X)
        loss = loss_fn(y_hat, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    epoch_loss = total_loss/len(train_loader)
    logger.info(f"Epoch {epoch} loss: {epoch_loss}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 375/375 [36:57<00:00,  5.91s/batch]
100%|██████████| 375/375 [30:56<00:00,  4.95s/batch]
100%|██████████| 375/375 [34:59<00:00,  5.60s/batch]
100%|██████████| 375/375 [47:07<00:00,  7.54s/batch]
100%|██████████| 375/375 [44:37<00:00,  7.14s/batch] 


In [50]:
# Make predictions on test data
for X, y in test_loader:
    y_pred = model.forward(X).detach().numpy()
    logger.debug(f"Deep Learning Predictions: {y_pred}")
    break

In [51]:
# Calculate the Mean Absolute Error and Mean Squared Error
r2 = r2_score(y, y_pred)
evs = explained_variance_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)

logger.info(f"Deep Learning Config: {config['deep_learning']}")
logger.info(f"Deep Learning MAE: {mae}")
logger.info(f"Deep Learning MSE: {mse}")
logger.info(f"Deep Learning EVS: {evs}")
logger.info(f"Deep Learning R^2: {r2}")
logger.info("-------------------------------------------------------------")

In [52]:
# Save the model to disk
# a better approach would be MLflow
joblib.dump(model, dl_model_path)

['.././models/model_deep_learning.joblib']

In [9]:
# ! pip freeze > ../requirements.txt