In [None]:
# Imports

import gym
import gymnasium as gym
import torch as T
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym_trading_env
import gym_trading_env.environments
import pandas as pd
from datetime import datetime, timedelta
import dill
import matplotlib.pyplot as plt
import shutil
import os
import time
import glob
from tqdm.auto import tqdm

# Passed as `windows=` when the trading environments are created in make_env().
ENV_WINDOWS = 10
# An evaluation episode is run whenever `episode % EVAL_PERIOD == 0` in the main loop.
EVAL_PERIOD = 1
# Per-episode (portfolio return - market return), appended by the main loop for
# training and evaluation episodes respectively.
RETURN_DIFFERENCES_TRAIN = []
RETURN_DIFFERENCES_EVAL = []

# Use the first CUDA device when available, otherwise fall back to the CPU.
DEVICE = T.device("cuda:0" if T.cuda.is_available() else "cpu")
print(DEVICE)
# `first` selects the "Starting..." vs "Continuing..." banner in the main-loop cell.
first = True

# NVML handle for GPU index 0; the main loop uses it to query free GPU memory.
# NOTE(review): these calls run even when DEVICE is the CPU — presumably they
# fail on a machine without an NVIDIA driver; confirm before running there.
import nvidia_smi
nvidia_smi.nvmlInit()
DEVICE_HANDLE = nvidia_smi.nvmlDeviceGetHandleByIndex(0)

# Run only if loading: restores the module state previously written by
# dill.dump_module() in the main loop and marks this run as a continuation.
# NOTE(review): as written these two lines execute unconditionally, so a fresh
# "Run All" needs ./ddqn_vars/ddqn_best.dill to exist. The setup cell further
# down deletes ./ddqn_vars/*.dill and re-initialises the training variables, so
# it presumably has to be skipped when resuming — confirm the intended workflow.
# SECURITY: dill files are pickle-based; only load files you created yourself.
dill.load_module("./ddqn_vars/ddqn_best.dill")
first = False

In [None]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions stored as tensors on ``DEVICE``.

    All storage is pre-allocated in ``__init__``. New transitions overwrite the
    oldest ones once ``mem_size`` entries have been written.

    Attributes:
        max_size: Capacity requested at construction time.
        mem_size: Current capacity; equals ``max_size`` until ``resize()``
            shrinks it.
        mem_ctr: Total number of transitions appended so far.
        n: Number of valid transitions that ``sample()`` may draw from.
        calibrated: ``True`` once ``resize()`` has shrunk the buffer.
    """

    def __init__(self, max_size, state_dim):
        """Allocate storage for ``max_size`` transitions.

        Args:
            max_size: Maximum number of transitions to keep.
            state_dim: Length of a flattened observation.
        """
        self.max_size = max_size
        self.mem_size = max_size

        self.mem_ctr = 0
        self.n = 0
        self.calibrated = False

        self.states = T.zeros(
            (self.mem_size, state_dim), dtype=T.float32, device=DEVICE
        )
        self.actions = T.zeros(self.mem_size, dtype=T.int64, device=DEVICE)
        self.next_states = T.zeros(
            (self.mem_size, state_dim), dtype=T.float32, device=DEVICE
        )
        self.rewards = T.zeros(self.mem_size, dtype=T.float32, device=DEVICE)
        self.dones = T.zeros(self.mem_size, dtype=T.bool, device=DEVICE)

    @staticmethod
    def _flatten(observation):
        """Return ``observation`` as a 1-D float32 tensor on ``DEVICE``.

        ``reshape(-1)`` works for observations of any rank, unlike
        ``np.multiply(*shape)`` which only accepts exactly two dimensions.
        """
        flat = np.asarray(observation, dtype=np.float32).reshape(-1)
        return T.as_tensor(flat, dtype=T.float32, device=DEVICE)

    def append(
        self,
        states: np.ndarray,
        action: int,
        next_states: np.ndarray,
        reward: float,
        done: bool,
    ):
        """Store one transition, overwriting the oldest slot when full.

        Args:
            states: Observation before the action (any shape; it is flattened
                and must contain ``state_dim`` elements).
            action: Index of the action taken.
            next_states: Observation after the action (same size as ``states``).
            reward: Reward received for the transition.
            done: Whether the episode terminated on this transition.
        """
        idx = self.mem_ctr % self.mem_size

        self.states[idx] = self._flatten(states)
        self.actions[idx] = action
        self.next_states[idx] = self._flatten(next_states)
        self.rewards[idx] = reward
        self.dones[idx] = done

        self.mem_ctr += 1

        # `n` grows until the buffer is full and then stays at capacity.
        if self.mem_ctr <= self.mem_size:
            self.n = self.mem_ctr

    def resize(self):
        """Shrink the capacity to the number of transitions stored so far.

        Only the first effective call has any effect. Calling it on an empty
        buffer is a no-op, because a capacity of zero would make ``append()``
        fail on its modulo-by-zero.

        NOTE: basic slicing yields views that share the original storage, so
        this caps the logical capacity; it does not by itself release the
        memory that was allocated in ``__init__``.
        """
        if self.calibrated or self.n == 0:
            return

        self.calibrated = True
        self.mem_size = self.n

        self.states = self.states[: self.mem_size]
        self.actions = self.actions[: self.mem_size]
        self.next_states = self.next_states[: self.mem_size]
        self.rewards = self.rewards[: self.mem_size]
        self.dones = self.dones[: self.mem_size]
        print(f"Resized buffer to {self.mem_size}")

    def sample(self, batch_size: int):
        """Draw ``batch_size`` stored transitions uniformly at random.

        Indices are drawn independently, so a transition may appear more than
        once in a batch. This is the same distribution as drawing one
        multinomial sample per row of a uniform ``(batch_size, n)`` weight
        matrix, without having to build that matrix.

        Args:
            batch_size: Number of transitions to return.

        Returns:
            Tuple ``(states, actions, next_states, rewards, dones)`` of tensors
            whose first dimension is ``batch_size``.

        Raises:
            ValueError: If the buffer holds no transitions yet.
        """
        if self.n == 0:
            raise ValueError("Cannot sample from an empty replay buffer")

        batch = T.randint(0, self.n, (batch_size,), device=DEVICE)

        return (
            self.states[batch],
            self.actions[batch],
            self.next_states[batch],
            self.rewards[batch],
            self.dones[batch],
        )

In [None]:
class DeepQNetwork(nn.Module):
    """Three-layer fully connected network mapping a state to per-action values.

    The two hidden layers use leaky-ReLU activations; the output layer is
    linear. The module is moved to ``DEVICE`` on construction.
    """

    def __init__(
        self, state_dims: int, fc1_dims: int, fc2_dims: int, action_dims: int, name: str
    ):
        """Build the layers.

        Args:
            state_dims: Number of input features.
            fc1_dims: Width of the first hidden layer.
            fc2_dims: Width of the second hidden layer.
            action_dims: Number of outputs (one value per action).
            name: Label stored on the instance as ``self.name``.
        """
        super().__init__()
        self.name = name

        # Consecutive pairs of widths give the (in, out) sizes of each layer.
        widths = (state_dims, fc1_dims, fc2_dims, action_dims)
        self._fc1, self._fc2, self._fc3 = (
            nn.Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])
        )

        self.to(DEVICE)

    def forward(self, states: np.ndarray):
        """Return the action values for ``states``.

        Although annotated as ``np.ndarray``, the input is fed straight into
        ``nn.Linear`` and therefore has to be a tensor whose last dimension is
        ``state_dims``.
        """
        hidden = states
        for layer in (self._fc1, self._fc2):
            hidden = F.leaky_relu(layer(hidden))
        return self._fc3(hidden)

In [None]:
class DDQNAgent:
    class ThompsonSampling:
        pass

    class EpsilonGreedy:
        pass

    class Greedy:
        pass

    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        fc1_dim=64,
        fc2_dim=64,
        batch_size=64,
        max_mem_size=100_000,
        learning_rate=1e-3,
        discount_factor=0.99,
        train_every=1,
        sync_every=5_000,
        tau=0.005,
        epsilon=1,
        epsilon_end=0.01,
        epsilon_decay=0.99,
        decay_episodes: int = None,
        Q1_name="QNetwork1",
        Q2_name="QNetwork2",
    ):
        self._action_dim = action_dim
        self._batch_size = batch_size
        self._max_memory_size = max_mem_size
        self._gamma = discount_factor
        self._train_every = train_every
        self._sync_every = sync_every
        self._tau = tau
        self._epsilon = epsilon
        self._epsilon_end = epsilon_end
        if decay_episodes and type(decay_episodes == int):
            self._epsilon_decay = np.power(epsilon_end, 1 / decay_episodes)
        else:
            self._epsilon_decay = epsilon_decay

        self._memory_size = 0
        self._train_count = 0

        self._QNetwork1 = DeepQNetwork(
            state_dim, fc1_dim, fc2_dim, action_dim, name=Q1_name
        )
        self._QNetwork2 = DeepQNetwork(
            state_dim, fc1_dim, fc2_dim, action_dim, name=Q2_name
        )

        self.update_networks(tau=1)
        for param in self._QNetwork2.parameters():
            param.requires_grad = False

        self._optim = optim.Adam(self._QNetwork1.parameters(), lr=learning_rate)

        self._memory = ReplayBuffer(max_mem_size, state_dim)

    def choose_action(self, states: np.ndarray, algorithm=EpsilonGreedy):
        states = T.Tensor(np.reshape(states, (np.multiply(*states.shape),))).to(DEVICE)
        with T.no_grad():
            values = self._QNetwork2(states)

        match algorithm:
            case DDQNAgent.Greedy:
                action = np.argmax(values.cpu().numpy())
            case DDQNAgent.EpsilonGreedy:
                if np.random.random() < self._epsilon:  # epsilon greedy
                    action = np.random.randint(0, self._action_dim)
                else:
                    action = np.argmax(values.cpu().numpy())
            case DDQNAgent.ThompsonSampling:
                action_probs = T.softmax(values, dim=0)
                action = int(T.multinomial(action_probs, 1).cpu().detach())
            case _:
                raise NotImplementedError(
                    f"{algorithm} is not a valid action selection algorithm"
                )
        return action

    def remember(
        self,
        states: np.ndarray,
        action: int,
        next_states: np.ndarray,
        reward: float,
        done: bool,
    ):
        self._memory.append(states, action, next_states, reward, done)

    def resize_buffer(self):
        self._memory.resize()

    def train(self):
        self._train_count += 1
        if (self._memory.n <= self._batch_size) or (
            self._train_count % self._train_every != 0
        ):
            return

        states, actions, next_states, rewards, dones = self._memory.sample(
            self._batch_size
        )

        q_current_values = self._QNetwork1(states)

        q_next_values = self._QNetwork1(next_states)
        q_target_values = self._QNetwork2(next_states)

        q_current = q_current_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        q_target = q_target_values.gather(
            1, T.max(q_next_values, 1)[1].unsqueeze(1)
        ).squeeze(1)

        q_expected = rewards + self._gamma * q_target * (T.ones_like(dones) ^ dones)

        loss = (q_current - q_expected.detach()).pow(2).mean()

        self._optim.zero_grad()
        loss.backward()
        self._optim.step()

        self.update_networks()
        # if self._train_count % self._sync_every == 0:
        #     self.sync_networks()

    def update_epsilon(self):
        self._epsilon = max(self._epsilon_end, self._epsilon * self._epsilon_decay)

    def update_networks(self, tau: float = None):
        # self._QNetwork2.load_state_dict(self._QNetwork1.state_dict())

        if tau == None:
            tau = self._tau

        target_value_params = self._QNetwork2.named_parameters()
        value_params = self._QNetwork1.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = (
                tau * value_state_dict[name].clone()
                + (1 - tau) * target_value_state_dict[name].clone()
            )

        self._QNetwork2.load_state_dict(value_state_dict)

    def save_models(self):
        print("... Saving Models ...")
        T.save(self._QNetwork1.state_dict(), f"DDQN_{self._QNetwork1.name}.model")
        T.save(self._QNetwork2.state_dict(), f"DDQN_{self._QNetwork2.name}.model")

    def load_models(self):
        print("... Loading Models ...")
        self._QNetwork1.load_state_dict(T.load(f"DDQN_{self._QNetwork1.name}.model"))
        self._QNetwork2.load_state_dict(T.load(f"DDQN_{self._QNetwork2.name}.model"))

In [None]:
# Utilities
def preprocess(df: pd.DataFrame):
    """Add the ``feature_*`` columns to an OHLCV frame and drop incomplete rows.

    The frame is modified in place and the same object is returned.

    Added columns:
        feature_close: Relative change of ``close`` versus the previous row.
        feature_open / feature_high / feature_low: The respective price
            divided by ``close`` of the same row.
        feature_volume: ``volume`` divided by its rolling maximum over the
            last ``7 * 24`` rows.

    Rows containing any NaN (e.g. before the rolling window is full) are
    removed.
    """
    close = df["close"]
    df["feature_close"] = close.pct_change()

    # Express the remaining prices relative to the close of the same row.
    price_features = (
        ("open", "feature_open"),
        ("high", "feature_high"),
        ("low", "feature_low"),
    )
    for source_column, feature_column in price_features:
        df[feature_column] = df[source_column] / close

    volume = df["volume"]
    df["feature_volume"] = volume / volume.rolling(7 * 24).max()

    df.dropna(inplace=True)
    return df


def reward_function(history):
    """Return 1000x the last-step log return of the portfolio minus the market.

    ``history`` is indexed as ``history[column, position]``; the last two
    entries of ``"portfolio_valuation"`` and ``"data_close"`` are read.
    """

    def last_log_return(column):
        # Log of the ratio between the latest and the previous value.
        return np.log(history[column, -1] / history[column, -2])

    excess_log_return = last_log_return("portfolio_valuation") - last_log_return(
        "data_close"
    )
    return 1000 * excess_log_return


def make_env(data_dir, positions, verbose=True):
    """Create a ``MultiDatasetTradingEnv`` with three extra result metrics.

    Args:
        data_dir: Passed to the environment as ``dataset_dir``.
        positions: Passed to the environment as ``positions``.
        verbose: Passed to the environment as ``verbose``.

    Returns:
        The environment returned by ``gym.make`` with the metrics
        ``market_return_raw``, ``portfolio_return_raw`` and
        ``position_changes`` registered on its unwrapped instance.
    """

    def total_return_pct(history, column):
        # Percentage change between the first and last entry, 2 decimals.
        return round(100 * (history[column, -1] / history[column, 0] - 1), 2)

    def position_change_pct(history):
        # Share of steps (in percent) at which the position changed.
        position_log = history["position"]
        return 100 * np.sum(np.diff(position_log) != 0) / len(position_log)

    env_options = dict(
        dataset_dir=data_dir,
        positions=positions,
        preprocess=preprocess,
        reward_function=reward_function,
        windows=ENV_WINDOWS,
        initial_position=0.0,
        trading_fees=0.18 / 100,  # 0.18% per stock buy / sell (BTCTurk fees)
        portfolio_initial_value=1_000,  # in USD
        verbose=verbose,
    )
    env = gym.make("MultiDatasetTradingEnv", **env_options)

    extra_metrics = (
        ("market_return_raw", lambda history: total_return_pct(history, "data_close")),
        (
            "portfolio_return_raw",
            lambda history: total_return_pct(history, "portfolio_valuation"),
        ),
        ("position_changes", position_change_pct),
    )
    for metric_name, metric_fn in extra_metrics:
        env.unwrapped.add_metric(metric_name, metric_fn)

    return env


def plot_scores(
    values: list,
    y_axis_label: str,
    label: str,
    title: str,
    file: str,
    values2: list = None,
    label2: str = None,
    hlines: list = None,
    markers: list = None,
):
    """Plot one or two per-episode series and save the figure to ``file``.

    Args:
        values: Series plotted at x = 1, 2, ..., ``len(values)``.
        y_axis_label: Label of the y axis.
        label: Legend entry for ``values``.
        title: Figure title.
        file: Path passed to ``savefig``.
        values2: Optional second series, plotted at multiples of the
            module-level ``EVAL_PERIOD``.
        label2: Legend entry for ``values2``.
        hlines: y positions of thin horizontal guide lines. ``None`` (the
            default) draws a single line at 0; pass ``[]`` for none.
        markers: Iterable of ``(x, y, text)`` annotations. ``None`` (the
            default) draws none.

    The y ticks span from the smallest to the largest plotted value (0 is
    always included in that range) in 11 equal steps. When all values are
    equal the tick positions are left to matplotlib.
    """
    # None sentinels replace the former mutable default arguments.
    if hlines is None:
        hlines = [0]
    if markers is None:
        markers = []
    # `is not None` also works when a numpy array is passed.
    has_second_series = values2 is not None

    # 0 is always part of the range; this also keeps min()/max() defined when
    # a series is still empty.
    y_candidates = [0.0, *values]
    if has_second_series:
        y_candidates.extend(values2)
    y_min, y_max = min(y_candidates), max(y_candidates)

    fig, ax = plt.subplots(figsize=(15, 9))
    try:
        ax.plot(np.arange(len(values)) + 1, values, color="C0", label=label)

        if has_second_series:
            x_eval = (np.arange(len(values2)) + 1) * EVAL_PERIOD
            ax.plot(x_eval, values2, color="C1", label=label2)

        for level in hlines:
            ax.axhline(y=level, color="white", lw=0.3)
        for marker_x, marker_y, text in markers:
            ax.text(marker_x, marker_y, text)

        # linspace gives a fixed number of ticks including both ends; a zero
        # span would make the former arange() step 0 and raise.
        if y_max > y_min:
            ax.set_yticks(np.linspace(y_min, y_max, 12))

        ax.set_xlabel("Episodes")
        ax.set_ylabel(y_axis_label)
        ax.legend()
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(file)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)

In [None]:
# Number of training episodes executed by one run of the main-loop cell; the
# setup cell also derives the agent's `decay_episodes` from it.
EPISODES = 50000

In [None]:
# Environment and Agent Setup

# Start from a clean slate: make sure the output directories exist and remove
# checkpoints / render logs left over from earlier runs.
# WARNING: this also deletes ./ddqn_vars/ddqn_best.dill, the file the first
# cell restores from.
# Without the directories the first dill.dump_module() in the main loop fails,
# which permanently disables checkpointing for the run.
os.makedirs("./ddqn_vars", exist_ok=True)
os.makedirs("./ddqn_render", exist_ok=True)
for file in glob.glob("./ddqn_vars/*.dill"):
    os.remove(file)
for file in glob.glob("./ddqn_render/*.pkl"):
    os.remove(file)


# 100 target positions: 0.00, 0.01, ..., 0.99 (np.arange excludes the stop).
# NOTE(review): a position of 1.0 is therefore not available — confirm this is
# intended.
positions = list(np.arange(0, 1, 0.01))

# First episode number of the next main-loop run; the loop advances it so that
# a re-run continues counting where the previous one stopped.
episode_ = 1

training_env = make_env("./data/training/*.pkl", positions, False)
testing_env = make_env("./data/testing/*.pkl", positions, False)

agent = DDQNAgent(
    # Flattened observation size; np.prod handles observation spaces of any rank.
    int(np.prod(training_env.observation_space.shape)),
    training_env.action_space.n,
    decay_episodes=int(EPISODES * 0.75),
    max_mem_size=400_000,
    tau=0.00001,
    batch_size=256,
    fc1_dim=256,
    fc2_dim=256,
    learning_rate=0.00005,
)

# Training history, filled by the main loop.
best_avg_score = -np.inf
best_avg_score_eval = -np.inf
scores = []
scores_avg = []
scores_eval = []
scores_eval_avg = []
epsilons = []
times_taken = []
best_episode_markers = []  # [(episode, score, text)]

# Window length of the running averages and minimum number of evaluations
# before a "best" checkpoint may be written.
SAVE_MIN_LENGTH = 100
# The replay buffer is shrunk to its current fill once `episode` exceeds this
# value (np.inf: never triggered by episode count).
BUFFER_EPISODES = np.inf
# Set to False by the main loop when a dill dump fails.
SAVE_VARIABLES = True

In [None]:
# Main loop

if first:
    print(f"Starting... Time: {datetime.now()}\n")
else:
    print(f"Continuing... Time: {datetime.now()}\n")
    # Replay the per-episode log of the restored run.
    n = len(scores)
    for i in range(n):
        # Adjacent f-strings keep the source indentation out of the output.
        print(
            f"Episode: {i+1:4d}, Score: {scores[i]:.3f}, "
            f"Average Score: {scores_avg[i]:.3f}, Epsilon: {epsilons[i]:.3f}"
        )

# Becomes True once the replay buffer has been shrunk (done at most once).
calibrated = False
for episode in tqdm(range(episode_, episode_ + EPISODES)):
    # Queried every episode so the evaluation log below never shows a stale value.
    gpu_memory_free = nvidia_smi.nvmlDeviceGetMemoryInfo(DEVICE_HANDLE).free / (1024**3)
    if not calibrated:
        if gpu_memory_free < 4:
            print(f"GPU memory limit reached at episode: {episode}.")
            agent.resize_buffer()
            calibrated = True
        elif episode > BUFFER_EPISODES:
            print("Requested buffer size reached.")
            agent.resize_buffer()
            calibrated = True

    time_start = time.perf_counter()

    # ---- Training episode -------------------------------------------------
    state, info = training_env.reset()

    score, length = 0, 0
    done, truncated = False, False
    while not (done or truncated):
        action = agent.choose_action(state, algorithm=DDQNAgent.ThompsonSampling)
        next_state, reward, done, truncated, info = training_env.step(action)

        score += reward

        # Only `done` is stored: a truncated episode is not marked terminal,
        # so the target still bootstraps from its last next-state.
        agent.remember(state, action, next_state, reward, done)
        agent.train()

        state = next_state

        length += 1
    scores.append(score)
    epsilons.append(agent._epsilon)
    scores_avg.append(np.mean(scores[-SAVE_MIN_LENGTH:]))

    metrics = training_env.unwrapped.results_metrics
    market_return = metrics["market_return_raw"]
    portfolio_return = metrics["portfolio_return_raw"]
    position_changes = metrics["position_changes"]
    RETURN_DIFFERENCES_TRAIN.append(portfolio_return - market_return)

    time_taken = time.perf_counter() - time_start
    times_taken.append(time_taken)
    print(
        f"Episode: {episode:>4}, Score: {score:>9.3f}, Market Return: {market_return:>7.2f}%, Portfolio Return: {portfolio_return:>8.2f}%, Position Changes: {position_changes:>6.2f}%, Time Taken: {time_taken:>5.2f}s, Length: {length:>4d}"
    )

    # ---- Periodic greedy evaluation on the testing environment ------------
    if episode % EVAL_PERIOD == 0:
        time_eval_start = time.perf_counter()
        print("Evaluating...  ", end="")

        state_eval, info_eval = testing_env.reset()
        score_eval = 0
        done_eval, truncated_eval = False, False
        while not (done_eval or truncated_eval):
            action_ = agent.choose_action(state_eval, algorithm=DDQNAgent.Greedy)
            (
                state_eval,
                reward_eval,
                done_eval,
                truncated_eval,
                info_eval,
            ) = testing_env.step(action_)
            score_eval += reward_eval
        scores_eval.append(score_eval)
        scores_eval_avg.append(np.mean(scores_eval[-SAVE_MIN_LENGTH:]))

        metrics_eval = testing_env.unwrapped.results_metrics
        market_return_eval = metrics_eval["market_return_raw"]
        portfolio_return_eval = metrics_eval["portfolio_return_raw"]
        position_changes_eval = metrics_eval["position_changes"]
        RETURN_DIFFERENCES_EVAL.append(portfolio_return_eval - market_return_eval)

        time_eval_taken = time.perf_counter() - time_eval_start
        print(
            f"Score: {score_eval:>9.3f}, Market Return: {market_return_eval:>7.2f}%, Portfolio Return: {portfolio_return_eval:>8.2f}%, Position Changes: {position_changes_eval:>6.2f}%, Time Taken: {time_eval_taken:>5.2f}s, Free GPU Memory: {gpu_memory_free:>5.2f}GB"
        )

        # Save the render log and rename the newest file in the directory so
        # its name carries the episode number and evaluation score.
        testing_env.unwrapped.save_for_render(dir="./ddqn_render")
        render_saves = glob.glob("./ddqn_render/*.pkl")
        latest_render_save = max(render_saves, key=os.path.getctime)
        os.rename(
            latest_render_save,
            f"./ddqn_render/ddqn_render_E{episode:04}_[{score_eval:+08.2f}].pkl",
        )

        # New best running average (only once enough evaluations exist):
        # keep a copy of the render log and checkpoint the notebook state.
        if (
            len(scores_eval_avg) >= SAVE_MIN_LENGTH
            and scores_eval_avg[-1] > best_avg_score_eval
        ):
            best_avg_score_eval = scores_eval_avg[-1]
            best_episode_markers.append(
                (episode, score_eval, f"{best_avg_score_eval:.1f}")
            )
            print("Saving Variables...")
            shutil.copy(
                f"./ddqn_render/ddqn_render_E{episode:04}_[{score_eval:+08.2f}].pkl",
                f"./ddqn_render/ddqn_best_E{episode:04}_[{best_avg_score_eval:+08.2f}].pkl",
            )

            if SAVE_VARIABLES:
                try:
                    dill.dump_module(
                        f"./ddqn_vars/ddqn_E{episode:04}_[{best_avg_score_eval:+08.2f}].dill"
                    )
                    shutil.copy(
                        f"./ddqn_vars/ddqn_E{episode:04}_[{best_avg_score_eval:+08.2f}].dill",
                        "./ddqn_vars/ddqn_best.dill",
                    )
                except Exception as exc:
                    # Best effort: keep training, but report why the dump
                    # failed. KeyboardInterrupt is deliberately not caught, so
                    # interrupting the kernel still stops the run.
                    print(
                        f"Failed to save variables ({exc!r}). Disabling dill saves."
                    )
                    SAVE_VARIABLES = False

        plot_scores(
            values=scores,
            values2=scores_eval,
            y_axis_label="Score",
            label="Training Score",
            label2="Evaluation Score",
            title="Scores vs Episodes",
            file="DDQN_Scores.png",
        )
        plot_scores(
            values=RETURN_DIFFERENCES_TRAIN,
            values2=RETURN_DIFFERENCES_EVAL,
            y_axis_label="Return Difference (%)",
            label="Training Return Difference",
            label2="Evaluation Return Difference",
            title="Return Differences vs Episodes",
            file="DDQN_Returns.png",
        )
        plot_scores(
            values=times_taken,
            y_axis_label="Training Time (s)",
            label="Training Time",
            title="Training Time vs Episodes",
            file="DDQN_Times.png",
            hlines=[],
        )

    agent.update_epsilon()
    # Remember where to resume if this cell is run again.
    episode_ = episode + 1

    total_time_taken = time.perf_counter() - time_start
    print(f"Total Time Taken: {total_time_taken}s\n")

In [None]:
from gym_trading_env.renderer import Renderer

# Launch the gym_trading_env renderer on the logs found in "render_logs".
# NOTE(review): the main loop writes its render logs to "./ddqn_render", not
# "render_logs" — confirm that the files of interest are copied there first,
# or that this directory is the intended one.
renderer = Renderer(render_logs_dir="render_logs")
renderer.run()