In [68]:
import torch
import torch.nn as nn
import torch.optim as optim

from copy import deepcopy
from src.buffer import ExpReplay
from collections import namedtuple
import gym
from gym import spaces
import pandas as pd
import numpy as np

#from src.env import StockTradingEnv
#from src.agent import DDPG_Hedger
#from src.network import MLP

In [66]:
with open("model/hypparams.json", "r") as file:
    hyp_params = json.load(file)


In [88]:

class StockTradingEnv(gym.Env):
    """Environment for agent, consists of __init__, step, and reset functions"""

    def __init__(self, reset_path=False, data_type="mixed"):
        self.asset_price = pd.read_csv(
            f"data/Daily/asset_price_{data_type}_1_sim.csv"
        ).values
        self.option_price = pd.read_csv(
            f"data/Daily/option_price_{data_type}_1_sim.csv"
        ).values
        self.nPaths = self.option_price.shape[0]
        self.nSteps = self.option_price.shape[1]

        # user-defined options (path)
        self.reset_path = reset_path
        self.path_choice = int(random.uniform(0, self.nPaths))
        self.path_idx = self.path_choice

        # user-defined options (rewards)
        self.kappa = 0.0001

        # initializing underlying amount
        self.holdings = 0
        self.curr_step = 0

        # Actions of the format hold amount [0,1]
        self.action_space = spaces.Box(low=-1, high=100, dtype=np.float16)

        # agent is given previous action + current asset price and time to maturity (H_i-1, S_i, tau_i)
        self.observation_space = spaces.Box(
            low=np.array([-1, 0, 0]),
            high=np.array([1, np.inf, self.nSteps]),
            shape=(3,),
            dtype=np.float16,
        )

    def step(self, action: float):
        # Execute one time step within the environment
        self.curr_step += 1

        # next call price, call price now, next asset price, asset price now.
        c_next, c_now, s_next, s_now = (
            self.option_price[self.path_idx, self.curr_step],
            self.option_price[self.path_idx, self.curr_step - 1],
            self.asset_price[self.path_idx, self.curr_step],
            self.asset_price[self.path_idx, self.curr_step - 1],
        )

        # R_{t} is Acc PnL
        reward = self.holdings * (s_next - s_now) - self.kappa * np.abs(
            s_next * (action - self.holdings)
        )

        # A_{t}: update the holding info.
        self.holdings = action

        # S_{t+1}: previous action, current asset price and time to maturity (H_i-1, S_i, tau_i)
        next_state = [
            self.holdings.item(),
            s_next,
            self.nSteps - self.curr_step,
        ]
        # done: whether the episode is ended or not
        done = True if self.curr_step + 1 >= self.nSteps else False

        # if terminal subtract option price difference, assumed next option price is just a call payoff and cost for exiting delta hedge position
        if done:
            reward = (
                reward
                - (max(s_next - 100, 0) - c_now)
                - self.kappa * s_next * self.holdings
            )
        else:  # if not terminal, substract option price difference.
            reward = reward - (c_next - c_now)
        return next_state, reward, done

    def reset(self):
        if self.reset_path:  # if user chose True, sets to his choice, else, random
            self.path_idx = self.path_choice

        # when resetting the env, set current_step and previous holdings equal to 0.
        self.curr_step = 0
        self.holdings = 0
        return [
            self.holdings,
            self.asset_price[self.path_idx, self.curr_step],
            self.nSteps,
        ]  # state0 of new path


In [70]:
torch.set_default_dtype(torch.float64)


class MLP(nn.Module):
    def __init__(self, dim_in, dim_hidden, dim_out, activation_name="ReLU"):
        super(MLP, self).__init__()
        self.dim_in = dim_in
        self.dim_hidden = dim_hidden
        self.dim_out = dim_out

        self.model = nn.Sequential(
            nn.LayerNorm(self.dim_in, elementwise_affine=True),
            nn.Linear(self.dim_in, self.dim_hidden),
            nn.ReLU(),
            nn.LayerNorm(self.dim_hidden, elementwise_affine=True),
            nn.Linear(self.dim_hidden, self.dim_hidden * 2),
            nn.ReLU(),
            nn.LayerNorm(self.dim_hidden * 2, elementwise_affine=True),
            nn.Linear(self.dim_hidden * 2, self.dim_out),
        )

        self.activ_layers = {"ReLU": nn.ReLU, "Sigmoid": nn.Sigmoid, "Tanh": nn.Tanh}
        self.activation_name = activation_name

    def forward(self, x):
        output = self.model(x)
        if self.activation_name not in self.activ_layers.keys():
            pass
        else:
            final_activ_func = self.activ_layers[self.activation_name]()
            output = final_activ_func(output)
        return output

In [90]:

class DDPG_Hedger:
    def __init__(
        self,
        Actor: nn.Module,
        Critic_1: nn.Module,
        Critic_2: nn.Module,
        actor_lr: float,
        critic_lr: float,
        disc_rate: float = 1,
        batch_size: int = 32,
    ):

        # params
        self.gamma = disc_rate
        self.tau = 0.01
        self.batch_size = batch_size

        # experience replay related
        self.transition = namedtuple(
            "Transition",
            ("state", "action", "reward", "next_state", "done"),
        )
        self.buffer = ExpReplay(10000, self.transition)

        # define actor and critic ANN.
        self.actor = Actor
        self.critic_1 = Critic_1  # mean(cost)
        self.critic_2 = Critic_2  # std(cost)

        # loss function for critic
        self.critic_loss = nn.MSELoss()

        # define optimizer for Actor and Critic network
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=critic_lr)
        self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=critic_lr)

        # define target network needed for DDPG optimization
        self.actor_target = deepcopy(self.actor)
        self.critic_1_target = deepcopy(self.critic_1)
        self.critic_2_target = deepcopy(self.critic_2)

    def reset(self):
        self.buffer.clear()

    def store(self, *args):
        self.buffer.store(*args)

    def act(self, state: list, epsilon: float = 0.05):
        """
        We use policy function to find the deterministic action instead of distributions
        which is parametrized by distribution parameters learned from the policy.

        Here, state input prompts policy network to output a single or multiple-dim
        actions.
        :param state:
        :return:
        """
        x = torch.tensor(state).to(torch.float64)
        if np.random.rand() <= epsilon:
            action = np.random.uniform(0, 1) * 100
        else:
            action = (
                self.actor.forward(x).detach().item() * 100
            )  # output from sigmoid layer
        # noise = torch.normal(mean=torch.Tensor([0]), std=torch.Tensor([sigma]))
        return np.clip(action, -state[0], 100.0 - state[0])

    def update(self, output=False):
        # calculate return of all times in the episode
        if self.buffer.len() < self.batch_size:
            return

        transitions = self.buffer.sample(self.batch_size)
        batch = self.transition(*zip(*transitions))

        # extract variables from sampled batch.
        states = torch.tensor(batch.state)
        actions = torch.tensor(batch.action)
        rewards = torch.tensor(batch.reward)
        dones = torch.tensor(batch.done).float()

        # define stateactions
        next_states = torch.tensor(batch.next_state)
        next_stateaction = torch.hstack(
            [next_states, self.actor_target(next_states)]
        ).detach()

        stateaction = torch.hstack([states, actions])

        # compute Q_1 loss
        Q_1 = self.critic_1(stateaction)
        y_1 = rewards + self.gamma * (1 - dones) * self.critic_1_target(
            next_stateaction
        )

        critic_loss_1 = self.critic_loss(Q_1, y_1)

        # Optimize the critic Q_1
        self.critic_1_optimizer.zero_grad()
        critic_loss_1.backward()
        self.critic_1_optimizer.step()

        # compute Q_2 loss
        Q_2 = self.critic_2(stateaction)
        y_2 = (
            rewards**2
            + (self.gamma**2) * (1 - dones) * self.critic_2_target(next_stateaction)
            + 2 * rewards * self.gamma * self.critic_1_target(next_stateaction)
        )

        critic_loss_2 = self.critic_loss(Q_2, y_2)

        # Optimize the critic Q_2
        self.critic_2_optimizer.zero_grad()
        critic_loss_2.backward()
        self.critic_2_optimizer.step()

        # Get actor loss
        state_action = torch.hstack([states, self.actor(states)])
        cost_variance = self.critic_2(state_action) - self.critic_1(state_action) ** 2

        actor_loss = -(
            self.critic_1(state_action)
            + 1.5 * torch.sqrt(torch.where(cost_variance < 0, 0, cost_variance))
        ).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if output:
            return critic_loss_1, critic_loss_2, actor_loss.detach().item()

    def polyak_update(self):
        # Update the frozen target models
        for trg_param, src_param in zip(
            list(self.critic_1_target.parameters()), list(self.critic_1.parameters())
        ):
            trg_param = trg_param * (1.0 - self.tau) + src_param * self.tau

        # Update the frozen target models
        for trg_param, src_param in zip(
            list(self.critic_2_target.parameters()), list(self.critic_2.parameters())
        ):
            trg_param = trg_param * (1.0 - self.tau) + src_param * self.tau

        for trg_param, src_param in zip(
            list(self.actor_target.parameters()), list(self.actor.parameters())
        ):
            trg_param = trg_param * (1.0 - self.tau) + src_param * self.tau

    def save(self, name):
        torch.save(self.critic_1.state_dict(), f"model/{name}/critic_1_weight.pt")
        torch.save(self.critic_2.state_dict(), f"model/{name}/critic_2_weight.pt")
        torch.save(self.actor.state_dict(), f"model/{name}/actor_weight.pt")

    def load(self, name):
        # load trained weights to Q_1, Q_2, Actor
        self.critic_1.load_state_dict(torch.load(f"model/{name}/critic_1_weight.pt"))
        self.critic_2.load_state_dict(torch.load(f"model/{name}/critic_2_weight.pt"))
        self.actor.load_state_dict(torch.load(f"model/{name}/actor_weight.pt"))

        # Copy above 3 to target networks.
        self.critic_1_target = deepcopy(self.critic_1)
        self.critic_2_target = deepcopy(self.critic_2)
        self.actor_target = deepcopy(self.actor)

In [91]:
BATCH_SIZE = 16
N_EPISODE = 20

env = StockTradingEnv(reset_path=True)

#nHidden = hyp_params["hidden_dim"]
nHidden = 32
actor_lr = 10 ** hyp_params["actor_lr"]
critic_lr = 10 ** hyp_params["critic_lr"]
#trg_update = hyp_params["polyak_update_freq"]
#trg_update = 2
nState, nAction = env.observation_space.shape[0], env.action_space.shape[0]  # 3, 1

actor_lr

actor = MLP(nState, nHidden, nAction, "Sigmoid")
qnet_1 = MLP(nState + nAction, nHidden, nAction, "")
qnet_2 = MLP(nState + nAction, nHidden, nAction, "")
agent = DDPG_Hedger(actor, qnet_1, qnet_2, actor_lr, critic_lr, 1, BATCH_SIZE)

NameError: name 'random' is not defined

In [89]:
target_rewards = []
noise_std = 1

for episode in range(N_EPISODE):
    # reset state
    state = env.reset()  # s_0
    ep_tot_reward = 0

    if episode > N_EPISODE - 30:
        noise_std = 0.0001

    i = 0
    while True:
        # take action given state
        print(f'------- step {i+1} action:')
        action = agent.act(state, noise_std)
        
        # take next step of the environment
        next_state, reward, done = env.step(action)
        # record interaction between environment and the agent
        agent.store(state, action,  reward, next_state, done)

        ep_tot_reward -= reward
        state = next_state
        
        agent.update()

        i +=1 
        if done:
            break
        
    print("------------------")
    print(f"Episode {episode} Reward: {ep_tot_reward}")
    print("------------------")
    # store total rewards after some training is done
    # we only consider alst 10 total rewards as a quantity to minimize
    if episode > N_EPISODE - 30:
        target_rewards.append(ep_tot_reward)

    if episode % trg_update == 0:  # update target network
        agent.polyak_update()

#print(np.mean(target_rewards))


------- step 1 action:
tensor(24425.6837, grad_fn=<MseLossBackward0>) tensor(9.3400e+09, grad_fn=<MseLossBackward0>) tensor(391.6843, grad_fn=<MeanBackward0>)
------- step 2 action:
tensor(25649.7714, grad_fn=<MseLossBackward0>) tensor(7.5602e+09, grad_fn=<MseLossBackward0>) tensor(350.1599, grad_fn=<MeanBackward0>)
------- step 3 action:
tensor(37517.7547, grad_fn=<MseLossBackward0>) tensor(5.0974e+09, grad_fn=<MseLossBackward0>) tensor(383.8546, grad_fn=<MeanBackward0>)
------- step 4 action:
tensor(18920.1108, grad_fn=<MseLossBackward0>) tensor(1.5343e+10, grad_fn=<MseLossBackward0>) tensor(362.7693, grad_fn=<MeanBackward0>)
------- step 5 action:
tensor(21967.4064, grad_fn=<MseLossBackward0>) tensor(3.4593e+09, grad_fn=<MseLossBackward0>) tensor(377.5827, grad_fn=<MeanBackward0>)
------- step 6 action:
tensor(14796.8949, grad_fn=<MseLossBackward0>) tensor(4.3672e+09, grad_fn=<MseLossBackward0>) tensor(410.2914, grad_fn=<MeanBackward0>)
------- step 7 action:
tensor(26006.0787, grad

In [19]:

# check the init params of actor and two critics
print('Actor')
for param in agent.actor.parameters():
    print(param)
    
print('Critic_1')
for param in agent.critic_1.parameters():
    print(param)
    
print('Critic_2')
for param in agent.critic_2.parameters():
    print(param)

Actor
Parameter containing:
tensor([1.0000, 1.0000, 1.0000], requires_grad=True)
Parameter containing:
tensor([ 6.1786e-06, -1.4953e-05,  4.5926e-06], requires_grad=True)
Parameter containing:
tensor([[-0.2280,  0.5527, -0.1695],
        [ 0.4217, -0.0847, -0.3503]], requires_grad=True)
Parameter containing:
tensor([-0.4920,  0.1864], requires_grad=True)
Parameter containing:
tensor([0.9958, 0.9948], requires_grad=True)
Parameter containing:
tensor([-0.0042,  0.0052], requires_grad=True)
Parameter containing:
tensor([[-0.6245, -0.1175],
        [-0.1013,  0.2619],
        [ 0.0765, -0.2712],
        [ 0.3649,  0.2824]], requires_grad=True)
Parameter containing:
tensor([ 0.0930, -0.3979,  0.1604, -0.6195], requires_grad=True)
Parameter containing:
tensor([1.0059, 0.9941, 0.9941, 0.9941], requires_grad=True)
Parameter containing:
tensor([-0.0059,  0.0059, -0.0059,  0.0059], requires_grad=True)
Parameter containing:
tensor([[ 0.0919, -0.3225,  0.1068, -0.3532]], requires_grad=True)
Parame