In [1]:
import sys
import json
import numpy as np

from src.env import StockTradingEnv
from src.agent import DDPG_Hedger
from src.network import MLP

In [7]:
with open("model/hypparams.json", "r") as file:
    hyp_params = json.load(file)


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

from copy import deepcopy
from src.buffer import ExpReplay
from collections import namedtuple
from torch.distributions import Normal


class DDPG_Hedger:
    def __init__(
        self,
        Actor: nn.Module,
        Critic_1: nn.Module,
        Critic_2: nn.Module,
        actor_lr: float,
        critic_lr: float,
        disc_rate: float = 1,
        batch_size: int = 32,
    ):

        # params
        self.gamma = disc_rate
        self.tau = 0.01
        self.batch_size = batch_size

        # experience replay related
        self.transition = namedtuple(
            "Transition",
            ("state", "action", "reward", "next_state", "done"),
        )
        self.buffer = ExpReplay(10000, self.transition)

        # define actor and critic ANN.
        self.actor = Actor
        self.critic_1 = Critic_1  # mean(cost)
        self.critic_2 = Critic_2  # std(cost)

        # loss function for critic
        self.critic_loss = nn.MSELoss()

        # define optimizer for Actor and Critic network
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=critic_lr)
        self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=critic_lr)

        # define target network needed for DDPG optimization
        self.actor_target = deepcopy(self.actor)
        self.critic_1_target = deepcopy(self.critic_1)
        self.critic_2_target = deepcopy(self.critic_2)

    def reset(self):
        self.buffer.clear()

    def store(self, *args):
        self.buffer.store(*args)

    def act(self, state: list, sigma: float = 0.2):
        """
        We use policy function to find the deterministic action instead of distributions
        which is parametrized by distribution parameters learned from the policy.

        Here, state input prompts policy network to output a single or multiple-dim
        actions.
        :param state:
        :return:
        """
        x = torch.tensor(state).to(torch.float64)
        action = self.actor.forward(x)
        noise = torch.normal(mean=torch.Tensor([0]),std=torch.Tensor([sigma]))
        print(action, noise)
        return (
            torch.clip((action - 0.5) * 2 + noise, -state[0], 1.0 - state[0])
            .detach()
            .numpy()
        )

    def update(self, output=False):
        # calculate return of all times in the episode
        if self.buffer.len() < self.batch_size:
            return

        transitions = self.buffer.sample(self.batch_size)
        batch = self.transition(*zip(*transitions))

        # extract variables from sampled batch.
        states = torch.tensor(batch.state)
        actions = torch.tensor(batch.action)
        rewards = torch.tensor(batch.reward)
        dones = torch.tensor(batch.done).float()
        next_states = torch.tensor(batch.next_state)

        # compute Q_1 loss
        Q_1 = self.critic_1(torch.hstack([states, actions]))
        y_1 = rewards + self.gamma * (1 - dones) * self.critic_1_target(
            torch.hstack([next_states, self.actor_target(next_states)]).detach()
        )

        critic_loss_1 = self.critic_loss(Q_1, y_1)
        
        # Optimize the critic Q_1
        self.critic_1_optimizer.zero_grad()
        critic_loss_1.backward()
        self.critic_1_optimizer.step()

        # compute Q_2 loss
        Q_2 = self.critic_2(torch.hstack([states, actions]))
        y_2 = (
            rewards**2
            + (self.gamma**2)
            * (1 - dones)
            * self.critic_2_target(
                torch.hstack([next_states, self.actor_target(next_states)]).detach()
            )
            + 2
            * self.gamma
            * rewards
            * self.critic_1_target(
                torch.hstack([next_states, self.actor_target(next_states)]).detach()
            )
        )

        critic_loss_2 = self.critic_loss(Q_2, y_2)
        
        # Optimize the critic Q_2
        self.critic_2_optimizer.zero_grad()
        critic_loss_2.backward()
        self.critic_2_optimizer.step()


        # Get actor loss
        state_action = torch.hstack([states, self.actor(states)])
        cost_variance = (
            self.critic_2(state_action)
            - self.critic_1(state_action) ** 2
        )
        #print(self.critic_1(state_action)[:3], self.critic_2(state_action)[:3])
        actor_loss = (
            self.critic_1(state_action)
            + 1.5 * torch.sqrt(torch.where(cost_variance < 0, 0, cost_variance))
        ).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        print(critic_loss_1, critic_loss_2, actor_loss)

        if output:
            return actor_loss.detach().item()

    def polyak_update(self):
        # Update the frozen target models
        for trg_param, src_param in zip(
            list(self.critic_1_target.parameters()), list(self.critic_1.parameters())
        ):
            trg_param = trg_param * (1.0 - self.tau) + src_param * self.tau

        # Update the frozen target models
        for trg_param, src_param in zip(
            list(self.critic_2_target.parameters()), list(self.critic_2.parameters())
        ):
            trg_param = trg_param * (1.0 - self.tau) + src_param * self.tau


        for trg_param, src_param in zip(
            list(self.actor_target.parameters()), list(self.actor.parameters())
        ):
            trg_param = trg_param * (1.0 - self.tau) + src_param * self.tau

    def save(self, filename):
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_hyp_params")

        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_hyp_params")

    def load(self, filename):
        self.critic.load_state_dict(torch.load(filename + "_critic"))
        self.critic_optimizer.load_state_dict(
            torch.load(filename + "_critic_hyp_params")
        )

        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_hyp_params"))

        # define target network needed for DDPG optimization
        self.actor_target = deepcopy(self.actor)
        self.critic_target = deepcopy(self.critic)

In [17]:
BATCH_SIZE = 16
N_EPISODE = 20

env = StockTradingEnv(reset_path=True)

#nHidden = hyp_params["hidden_dim"]
nHidden = 32
actor_lr = 10 ** hyp_params["actor_lr"]
critic_lr = 10 ** hyp_params["critic_lr"]
#trg_update = hyp_params["polyak_update_freq"]
trg_update = 2
nState, nAction = env.observation_space.shape[0], env.action_space.shape[0]  # 3, 1

actor_lr

actor = MLP(nState, nHidden, nAction, "Sigmoid")
qnet_1 = MLP(nState + nAction, nHidden, nAction, "")
qnet_2 = MLP(nState + nAction, nHidden, nAction, "")
agent = DDPG_Hedger(actor, qnet_1, qnet_2, actor_lr, critic_lr, 1, BATCH_SIZE)

In [20]:
target_rewards = []
noise_std = 1

for episode in range(N_EPISODE):
    # reset state
    state = env.reset()  # s_0
    ep_tot_reward = 0

    if episode > N_EPISODE - 30:
        noise_std = 0.0001

    i = 0
    while True:
        # take action given state
        print(noise_std)
        print(f'------- step {i+1} action:')
        action = agent.act(state, noise_std)
        
        # take next step of the environment
        next_state, reward, done = env.step(action)

        # record interaction between environment and the agent
        agent.store(state, action, reward, next_state, done)

        ep_tot_reward -= reward
        state = next_state
        
        agent.update()

        i +=1 
        if done:
            break
        
    print("------------------")
    print(f"Episode {episode} Reward: {ep_tot_reward}")
    print("------------------")
    # store total rewards after some training is done
    # we only consider alst 10 total rewards as a quantity to minimize
    if episode > N_EPISODE - 30:
        target_rewards.append(ep_tot_reward)

    if episode % trg_update == 0:  # update target network
        agent.polyak_update()

#print(np.mean(target_rewards))


0.0001
------- step 1 action:
tensor([0.6219], grad_fn=<SigmoidBackward0>) tensor([-7.8656e-05])
tensor(30912.7578, grad_fn=<MseLossBackward0>) tensor(4.8087e+09, grad_fn=<MseLossBackward0>) tensor(180.2722, grad_fn=<MeanBackward0>)
0.0001
------- step 2 action:
tensor([0.6219], grad_fn=<SigmoidBackward0>) tensor([0.0001])
tensor(18730.8602, grad_fn=<MseLossBackward0>) tensor(5.8680e+09, grad_fn=<MseLossBackward0>) tensor(180.1828, grad_fn=<MeanBackward0>)
0.0001
------- step 3 action:
tensor([0.6219], grad_fn=<SigmoidBackward0>) tensor([9.6789e-05])
tensor(28858.6220, grad_fn=<MseLossBackward0>) tensor(7.4012e+09, grad_fn=<MseLossBackward0>) tensor(180.0780, grad_fn=<MeanBackward0>)
0.0001
------- step 4 action:
tensor([0.6219], grad_fn=<SigmoidBackward0>) tensor([-5.5443e-05])
tensor(29998.1354, grad_fn=<MseLossBackward0>) tensor(1.1396e+09, grad_fn=<MseLossBackward0>) tensor(179.8687, grad_fn=<MeanBackward0>)
0.0001
------- step 5 action:
tensor([0.6219], grad_fn=<SigmoidBackward0>)

In [26]:
torch.normal(mean=torch.Tensor([0]),std=torch.Tensor([1]))

tensor([-0.6170])

In [45]:
sigma = 1
torch.normal(mean=torch.Tensor([0]),std=torch.Tensor([sigma]))

tensor([0.5899])

In [19]:

# check the init params of actor and two critics
print('Actor')
for param in agent.actor.parameters():
    print(param)
    
print('Critic_1')
for param in agent.critic_1.parameters():
    print(param)
    
print('Critic_2')
for param in agent.critic_2.parameters():
    print(param)

Actor
Parameter containing:
tensor([1.0000, 1.0000, 1.0000], requires_grad=True)
Parameter containing:
tensor([ 6.1786e-06, -1.4953e-05,  4.5926e-06], requires_grad=True)
Parameter containing:
tensor([[-0.2280,  0.5527, -0.1695],
        [ 0.4217, -0.0847, -0.3503]], requires_grad=True)
Parameter containing:
tensor([-0.4920,  0.1864], requires_grad=True)
Parameter containing:
tensor([0.9958, 0.9948], requires_grad=True)
Parameter containing:
tensor([-0.0042,  0.0052], requires_grad=True)
Parameter containing:
tensor([[-0.6245, -0.1175],
        [-0.1013,  0.2619],
        [ 0.0765, -0.2712],
        [ 0.3649,  0.2824]], requires_grad=True)
Parameter containing:
tensor([ 0.0930, -0.3979,  0.1604, -0.6195], requires_grad=True)
Parameter containing:
tensor([1.0059, 0.9941, 0.9941, 0.9941], requires_grad=True)
Parameter containing:
tensor([-0.0059,  0.0059, -0.0059,  0.0059], requires_grad=True)
Parameter containing:
tensor([[ 0.0919, -0.3225,  0.1068, -0.3532]], requires_grad=True)
Parame