In [None]:
'''
@Author: Yitao Qiu
'''
from typing import cast, List
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import math
import json
import yaml
from api_types import GlobalConfig, AgentProps
from torch.autograd import Variable
from torch.distributions import Categorical
from utils.qf_data import normalize,load_observations
from environment.QF_env_1 import envs
from tools.ddpg.replay_buffer import ReplayBuffer
from tools.ddpg.ornstein_uhlenbeck import OrnsteinUhlenbeckActionNoise
from tensorboardX import SummaryWriter

In [None]:
# Load the global experiment configuration from the YAML file.
with open('stable_config.yml', mode='r', encoding='utf-8') as config_stream:
    config = GlobalConfig(yaml.safe_load(config_stream))

# This notebook trains exactly one agent, selected by an integer index.
assert type(config.use_agents) is int, 'You must specify one agent for training!'
agent_index = cast(int, config.use_agents)
product_list = AgentProps(config.agent_list[agent_index]).product_list
product_num = len(product_list)
window_size = config.window_size

# One action entry per product plus one extra slot
# (presumably the cash position -- TODO confirm against the environment).
action_dim = [product_num + 1]
actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

# Seed every random number generator used in this notebook with the
# configured seed so that runs are repeatable.
seed = config.random_seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

In [None]:
def obs_normalizer(observation):
    """Turn a raw observation into normalized close/open ratios.

    If ``observation`` is a tuple, only its first element is used. The
    ratio divides feature index 3 by feature index 0 along the last axis
    (slicing keeps that axis, with length 1), and the result is passed
    through ``normalize`` before being returned.
    """
    raw = observation[0] if isinstance(observation, tuple) else observation
    close_open_ratio = raw[:, :, 3:4] / raw[:, :, 0:1]
    return normalize(close_open_ratio)

def hidden_init(layer):
    """Return the uniform-initialisation bounds ``(-lim, lim)`` for a layer.

    ``lim`` is ``1 / sqrt(fan_in)``, where ``fan_in`` is the number of
    inputs feeding each output unit of ``layer``.

    Args:
        layer: A module exposing a ``weight`` tensor whose first dimension
            indexes output units (e.g. ``nn.Linear`` or ``nn.Conv2d``).

    Returns:
        A ``(low, high)`` tuple suitable for ``tensor.uniform_(low, high)``.
    """
    weight = layer.weight.data
    # nn.Linear stores its weight as (out_features, in_features), so the
    # first dimension is the fan-OUT. The fan-in is the number of elements
    # connected to a single output unit, i.e. the size of one weight row.
    fan_in = weight[0].numel()
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)

## GPU Setting

In [None]:
import os
# Restrict this process to GPU 0. The variable only takes effect if it is
# set before CUDA is first initialised, so keep this line ahead of any
# CUDA call.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Global flag read by the rest of the notebook to decide whether modules and
# tensors are moved to the GPU.
C_CUDA = torch.cuda.is_available()

## Model Setting

In [None]:
# Define actor network--CNN
class Actor(nn.Module):
    """Convolutional actor mapping a state to a softmax action vector.

    Two ``Conv2d`` layers with kernel height 1 are followed by three fully
    connected layers. The first linear layer expects
    ``(product_num + 1) * 32`` flattened features, so the input is expected
    to be shaped ``(batch, 1, product_num + 1, win_size)``.

    Args:
        product_num: Number of products; the output has ``product_num + 1``
            entries.
        win_size: Width of the observation window.
    """

    def __init__(self,product_num, win_size):
        super().__init__()
        n_outputs = product_num + 1
        n_filters = 32
        n_hidden = 64

        # Kernel height 1: each row of the input is convolved independently.
        self.conv1 = nn.Conv2d(1, n_filters, kernel_size=(1, 3))
        # The second kernel spans the remaining width (win_size - 2).
        self.conv2 = nn.Conv2d(n_filters, n_filters, kernel_size=(1, win_size - 2))

        self.linear1 = nn.Linear(n_outputs * 1 * n_filters, n_hidden)
        self.linear2 = nn.Linear(n_hidden, n_hidden)
        self.linear3 = nn.Linear(n_hidden, n_outputs)

    def reset_parameters(self):
        """Re-draw the linear weights from uniform distributions.

        The two hidden layers use the bounds given by ``hidden_init``; the
        output layer uses the fixed range ``[-3e-3, 3e-3]``.
        """
        for hidden_layer in (self.linear1, self.linear2):
            low, high = hidden_init(hidden_layer)
            hidden_layer.weight.data.uniform_(low, high)
        self.linear3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return softmax-normalised action values for a batch of states."""
        features = F.relu(self.conv1(state))
        features = F.relu(self.conv2(features))
        # Flatten everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.linear1(flat))
        hidden = F.relu(self.linear2(hidden))
        logits = self.linear3(hidden)
        return F.softmax(logits, dim=1)

In [None]:
# Define policy gradient actor network--LSTM
class Policy(nn.Module):
    """LSTM policy network producing a softmax over discrete actions.

    Args:
        product_num: Number of products; the state is expected to contain
            ``product_num + 1`` rows per sample.
        win_size: Length of each row, used as the LSTM input size.
        action_size: Number of discrete actions in the output distribution.

    Attributes:
        saved_log_probs: Log-probabilities of the sampled actions, appended
            by the caller during an episode.
        rewards: Rewards collected during an episode, appended by the caller.
    """

    def __init__(self,product_num, win_size,action_size):
        super(Policy, self).__init__()

        # Keep the geometry on the instance so that forward() does not rely
        # on module-level globals or a hard-coded window length.
        self.product_num = product_num
        self.win_size = win_size

        self.lstm = nn.LSTM(win_size,32,1)

        self.linear1 = nn.Linear((product_num+1)*1*32, 64)
        self.linear2 = nn.Linear(64, 64)
        self.linear3 = nn.Linear(64,action_size)

        # Per-episode buffers for the log-probabilities and rewards.
        self.saved_log_probs = []
        self.rewards = []

    def reset_parameters(self):
        """Re-draw the linear weights from uniform distributions.

        The two hidden layers use the bounds given by ``hidden_init``; the
        output layer uses the fixed range ``[-3e-3, 3e-3]``.
        """
        self.linear1.weight.data.uniform_(*hidden_init(self.linear1))
        self.linear2.weight.data.uniform_(*hidden_init(self.linear2))
        self.linear3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """Return action probabilities of shape ``(batch, action_size)``.

        ``state`` must hold ``batch * (product_num + 1) * win_size`` values.
        """
        # nn.LSTM defaults to (seq_len, batch, input_size): every row of
        # length win_size becomes one step of a single sequence.
        # NOTE(review): rows from different samples therefore share one
        # sequence -- confirm this is intended.
        state = torch.reshape(state, (-1, 1, self.win_size))
        lstm_out, _ = self.lstm(state)
        # Regroup the per-row hidden states into one feature vector per sample.
        lstm_out = lstm_out.reshape(-1, (self.product_num + 1) * 32)
        # The linear layers are applied without non-linearities in between.
        fc1_out = self.linear1(lstm_out)
        fc2_out = self.linear2(fc1_out)
        fc3_out = self.linear3(fc2_out)
        fc3_out = F.softmax(fc3_out,dim=1)

        return fc3_out

In [None]:
# Define Critic network--CNN
class Critic(nn.Module):
    """Convolutional critic estimating a scalar value for (state, action).

    The state goes through two ``Conv2d`` layers with kernel height 1 and a
    linear layer; the action goes through its own linear layer. Both
    64-dimensional embeddings are summed, passed through ReLU and mapped to
    a single output.

    Args:
        product_num: Number of products; actions have ``product_num + 1``
            entries.
        win_size: Width of the observation window.
    """

    def __init__(self, product_num, win_size):
        super().__init__()
        n_actions = product_num + 1
        n_filters = 32
        n_hidden = 64

        self.conv1 = nn.Conv2d(1, n_filters, kernel_size=(1, 3))
        # The second kernel spans the remaining width (win_size - 2).
        self.conv2 = nn.Conv2d(n_filters, n_filters, kernel_size=(1, win_size - 2))

        self.linear1 = nn.Linear(n_actions * 1 * n_filters, n_hidden)  # state branch
        self.linear2 = nn.Linear(n_actions, n_hidden)                  # action branch
        self.linear3 = nn.Linear(n_hidden, 1)

    def reset_parameters(self):
        """Re-draw the linear weights from uniform distributions.

        The two branch layers use the bounds given by ``hidden_init``; the
        output layer uses the fixed range ``[-3e-3, 3e-3]``.
        """
        for branch_layer in (self.linear1, self.linear2):
            low, high = hidden_init(branch_layer)
            branch_layer.weight.data.uniform_(low, high)
        self.linear3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """Return a ``(batch, 1)`` tensor of value estimates."""
        # Observation branch
        features = F.relu(self.conv1(state))
        features = F.relu(self.conv2(features))
        flat = features.view(features.size(0), -1)
        state_embedding = self.linear1(flat)
        # Action branch
        action_embedding = self.linear2(action)
        # Merge both branches by element-wise addition.
        merged = F.relu(state_embedding + action_embedding)
        return self.linear3(merged)

## Agent Setting

In [None]:
class QFPIS(object):
    """Trading agent combining a DDPG actor-critic with a policy-gradient net.

    The DDPG part (``Actor``/``Critic`` plus target copies) produces the
    continuous action; the ``Policy`` network produces an additional
    discrete action. Both actions are passed to ``env.step`` together.

    Args:
        env: Environment exposing ``reset()`` and
            ``step(action, action_policy)``.
        product_num: Number of products handled by the networks.
        win_size: Observation window length given to the networks.
        action_size: Number of discrete actions of the policy network.
        actor_noise: Callable returning exploration noise that is added to
            the actor output.
        config_file: Path of the JSON file with the hyper-parameters
            (``'episode'``, ``'batch size'``, ``'gamma'``, ``'tau'``,
            ``'buffer size'``, ``'max step'`` and the two learning rates).
    """

    def __init__(self, env, product_num, win_size, action_size, actor_noise, config_file = 'config/config.json'):
        with open(config_file) as f:
            self.config = json.load(f)
        assert self.config is not None, "Can't load config file"

        self.env = env
        self.actor_noise = actor_noise
        self.summary_path ='results/ddpg/'
        # Single source of truth for tensor/module placement.
        self.device = torch.device('cuda' if C_CUDA else 'cpu')

        self.actor = Actor(product_num,win_size).to(self.device)
        self.actor_target = Actor(product_num,win_size).to(self.device)
        self.critic = Critic(product_num,win_size).to(self.device)
        self.critic_target = Critic(product_num,win_size).to(self.device)

        # Initialise BOTH online networks. The target networks are not reset
        # separately because they are overwritten with the online weights
        # a few lines below.
        self.actor.reset_parameters()
        self.critic.reset_parameters()

        self.actor_optim = optim.Adam(self.actor.parameters(), lr = self.config['actor learning rate'])
        self.critic_optim = optim.Adam(self.critic.parameters(), lr = self.config['critic learning rate'])

        # Targets start as exact copies of the online networks.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Policy-gradient network and its optimizer.
        self.policy = Policy(product_num, win_size, action_size).to(self.device)
        self.policy_optim = optim.Adam(self.policy.parameters(), lr=1e-4)

    def _to_tensor(self, array):
        """Copy ``array`` into a float32 tensor on the agent's device."""
        return torch.tensor(array, dtype=torch.float, device=self.device)

    def act(self, state):
        """Return the actor output for one state plus exploration noise.

        Args:
            state: A single un-batched state; a batch axis is added here.

        Returns:
            A numpy array: actor output with ``self.actor_noise()`` added.
        """
        state = self._to_tensor(state).unsqueeze(0)
        action = self.actor(state).squeeze(0).cpu().detach().numpy()+ self.actor_noise()
        return action

    def critic_learn(self, state, action, predicted_q_value):
        """Do one critic update towards the given target values.

        Args:
            state: Batch of states (tensor on the agent's device).
            action: Batch of actions (tensor on the agent's device).
            predicted_q_value: Target values, shaped like the critic output.

        Returns:
            ``(predicted_q_value, td_error)``: the targets unchanged and the
            MSE loss tensor between critic output and targets.
        """
        actual_q = self.critic(state, action)
        # The target is a constant of the regression: it must not carry
        # gradients.
        target_Q = torch.as_tensor(predicted_q_value, dtype=torch.float, device=self.device)

        td_error  = F.mse_loss(actual_q, target_Q)

        self.critic_optim.zero_grad()
        td_error.backward()
        self.critic_optim.step()
        return predicted_q_value,td_error

    def actor_learn(self, state):
        """Do one actor update maximising the critic value; return the loss."""
        loss = -self.critic(state, self.actor(state)).mean()

        self.actor_optim.zero_grad()
        loss.backward()
        self.actor_optim.step()
        return loss

    def soft_update(self, net_target, net, tau):
        """Blend ``net`` into ``net_target``: ``target = (1-tau)*target + tau*net``."""
        for target_param, param  in zip(net_target.parameters(), net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def select_action(self, state):
        """Sample a discrete action from the policy network.

        The log-probability of the sampled action is appended to
        ``self.policy.saved_log_probs`` for the later ``policy_learn`` call.

        Returns:
            The sampled action as a Python int.
        """
        state = self._to_tensor(state).unsqueeze(0)
        # Probability distribution over the discrete actions.
        probs = self.policy(state)
        m = Categorical(probs)
        action = m.sample()
        self.policy.saved_log_probs.append(m.log_prob(action))
        return action.item()

    def policy_learn(self, eps, discount=0.95):
        """Update the policy network from the rewards of one episode.

        Args:
            eps: Small constant added to the standard deviation when the
                returns are normalised.
            discount: Discount factor used to accumulate the returns.

        Returns:
            The scalar policy loss tensor (zero if nothing was recorded).
        """
        if not self.policy.saved_log_probs:
            # Nothing was sampled this episode; there is nothing to learn.
            del self.policy.rewards[:]
            return torch.zeros((), device=self.device)

        # Walk the rewards backwards to accumulate the discounted return
        # from each step to the end of the episode.
        returns = []
        R = 0
        for r in reversed(self.policy.rewards):
            R = r + discount * R
            returns.append(R)
        returns.reverse()  # restore chronological order
        returns = torch.tensor(returns, device=self.device)

        # Normalise the returns. The standard deviation of a single value is
        # NaN, so a one-step episode keeps its raw return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = [-log_prob * R
                       for log_prob, R in zip(self.policy.saved_log_probs, returns)]
        self.policy_optim.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.policy_optim.step()

        # Clear the per-episode buffers.
        del self.policy.rewards[:]
        del self.policy.saved_log_probs[:]

        return policy_loss

    def train(self):
        """Run the training loop and save the actor and policy weights.

        The number of episodes is read from ``self.config['episode']``.
        Metrics are written to a ``SummaryWriter`` under
        ``self.summary_path``. The weights are saved to
        ``model_add + model_name`` and ``model_add + pg_model_name``; these
        three names are module-level globals that must exist before this
        method is called.
        """
        num_episode = self.config['episode']
        batch_size = self.config['batch size']
        gamma = self.config['gamma']
        tau = self.config['tau']
        max_step = self.config['max step']
        eps = np.finfo(np.float32).eps.item()
        self.buffer = ReplayBuffer(self.config['buffer size'])
        total_step = 0
        moving_average_reward = 0
        writer = SummaryWriter(self.summary_path)

        # Main training loop
        for i in range(num_episode):
            previous_observation = obs_normalizer(self.env.reset())
            previous_observation = previous_observation.transpose(2, 0, 1)
            ep_reward = 0
            ep_ave_max_q = 0

            # Keep sampling until the episode ends.
            for j in range(max_step):
                # 1. Given state s_t, pick both actions.
                action = self.act(previous_observation)
                action_policy = self.select_action(previous_observation)

                # 2. Obtain the rewards and the next state s_{t+1}.
                observation_origin, reward, policy_reward, done, _ = self.env.step(action,action_policy)

                # Record the reward for the policy-gradient update.
                self.policy.rewards.append(policy_reward)

                observation = obs_normalizer(observation_origin)
                observation = observation.transpose(2, 0, 1)

                # 3. Store (s_t, a_t, r_t, done, s_{t+1}).
                self.buffer.add(previous_observation, action, reward, done, observation)
                if self.buffer.size() >= batch_size:
                    # 4. Sample a mini-batch from the buffer.
                    s_batch, a_batch, r_batch, t_batch, s2_batch = self.buffer.sample_batch(batch_size)
                    s_batch = self._to_tensor(s_batch)
                    a_batch = self._to_tensor(a_batch)
                    r_batch = self._to_tensor(r_batch)
                    t_batch = self._to_tensor(t_batch)
                    s2_batch = self._to_tensor(s2_batch)

                    # 5. TD target: y = r for terminal transitions,
                    #    y = r + gamma * Q'(s', actor'(s')) otherwise.
                    with torch.no_grad():
                        target_q = self.critic_target(s2_batch, self.actor_target(s2_batch))
                    reward_col = r_batch.reshape(batch_size, 1)
                    terminal_col = t_batch.reshape(batch_size, 1) != 0
                    y_i = torch.where(terminal_col, reward_col, reward_col + gamma * target_q)

                    # 6. Update the critic so that Q(s_i, a_i) moves towards y.
                    predicted_q_value,td_error = self.critic_learn(s_batch, a_batch, y_i.cpu().numpy())
                    writer.add_scalar('TD error', td_error.item(), global_step=total_step)
                    ep_ave_max_q += np.amax(predicted_q_value)

                    # 7. Update the actor to maximise Q(s_i, actor(s_i)).
                    actor_loss = self.actor_learn(s_batch)
                    writer.add_scalar('Actor loss', actor_loss.item(), global_step=total_step)

                    # 8. Softly track the online networks with the targets.
                    self.soft_update(self.critic_target, self.critic, tau)
                    self.soft_update(self.actor_target, self.actor, tau)

                ep_reward += reward

                previous_observation =  observation
                total_step = total_step+1
                if done or j == max_step - 1:
                    # j is zero-based, so j + 1 steps were taken; this also
                    # avoids dividing by zero when the episode ends at j == 0.
                    q_max_avg = ep_ave_max_q / float(j + 1)
                    writer.add_scalar('Q-max', q_max_avg, global_step=i)
                    writer.add_scalar('Reward', ep_reward, global_step=i)

                    print('Episode: {:d}, Reward: {:.2f}, Qmax: {:.4f}, Average reward: {:.8f}'.format(i, ep_reward, q_max_avg,moving_average_reward))
                    break
            moving_average_reward = 0.05 * ep_reward + (1 - 0.05) * moving_average_reward
            writer.add_scalar('Moving average reward', moving_average_reward, global_step=i)
            # The policy network is updated once per episode.
            policy_loss = self.policy_learn(eps)
            writer.add_scalar('Policy Loss', policy_loss.item(), global_step=i)
        print('Finish.')
        writer.close()
        torch.save(self.actor.state_dict(), model_add+model_name)
        torch.save(self.policy.state_dict(), model_add+pg_model_name)

## Train

In [None]:
if __name__ == '__main__':
    # Locations of the saved weights; QFPIS.train() reads these globals.
    model_add = 'models/'
    model_name = 'QFPIS_DDPG_1'
    pg_model_name = 'QFPIS_PG_1'

    # Market description
    market_feature = ['Open','High','Low','Close','QPL1','QPL-1']
    feature_num = len(market_feature)
    product_list = ["AUDCAD","AUDUSD","EURAUD","EURCAD","EURUSD","GBPUSD","NZDCHF","NZDUSD","USDCHF"]
    product_num = len(product_list)

    # Environment and network dimensions
    mode = "Train"
    steps = 1000
    window_length = 3
    window_size = 1
    train_ratio = 0.8
    action_size = 2
    # One noise component per product plus one extra entry.
    action_dim = [product_num + 1]

    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

    env = envs(product_list, market_feature, feature_num, steps, window_length, mode)
    qf_system = QFPIS(
        env,
        product_num,
        window_length,
        action_size,
        actor_noise,
        config_file='config/config.json',
    )

    # Start training
    qf_system.train()

## Test

In [None]:
def load_actor():
    """Build an ``Actor`` and load its weights from ``model_add + model_name``.

    The network is placed on the GPU only when ``C_CUDA`` is true, and the
    checkpoint is mapped onto the same device, so loading also works on
    CPU-only machines.

    Returns:
        The ``Actor`` with the loaded weights.
    """
    device = torch.device('cuda' if C_CUDA else 'cpu')
    actor = Actor(product_num =9,win_size = 3)
    actor.load_state_dict(torch.load(model_add+model_name, map_location=device))
    return actor.to(device)

def load_policy():
    """Build a ``Policy`` and load its weights from ``model_add + pg_model_name``.

    The network is placed on the GPU only when ``C_CUDA`` is true, and the
    checkpoint is mapped onto the same device, so loading also works on
    CPU-only machines.

    Returns:
        The ``Policy`` with the loaded weights.
    """
    device = torch.device('cuda' if C_CUDA else 'cpu')
    test = Policy(product_num = 9, win_size = 3, action_size = 2)
    test.load_state_dict(torch.load(model_add+pg_model_name, map_location=device))
    return test.to(device)
    
def test_model(env, actor, policy):
    """Run one episode with trained networks and record the decisions.

    Args:
        env: Environment exposing ``reset()``, ``step(action, action_policy)``
            and ``render()``.
        actor: Trained actor network producing the continuous action.
        policy: Trained policy network producing probabilities over the
            discrete actions.

    Returns:
        ``(actions, weights)``: the sampled discrete actions (numpy arrays)
        and, per step, the actor output clipped to ``[0, 1]`` and rescaled
        to sum to one.
    """
    eps = 1e-8
    actions = []
    weights = []
    # Feed the networks on whatever device they live on instead of assuming
    # a GPU is present.
    device = next(actor.parameters()).device

    # obs_normalizer accepts both a bare observation and a tuple whose first
    # element is the observation, so no unpacking is needed here.
    observation = obs_normalizer(env.reset())
    observation = observation.transpose(2, 0, 1)
    done = False
    ep_reward = 0
    while not done:
        state = torch.tensor(observation, dtype=torch.float, device=device).unsqueeze(0)
        # Pure inference: no gradients are needed.
        with torch.no_grad():
            action = actor(state).squeeze(0).cpu().numpy()
            actions_prob = policy(state)
        # Sample the discrete action from the policy distribution.
        action_policy = Categorical(actions_prob).sample()
        actions.append(action_policy.cpu().numpy())
        # Recorded weights: clip to [0, 1] and rescale to sum to one. The
        # environment still receives the raw actor output.
        w1 = np.clip(action, 0, 1)
        w1 /= (w1.sum() + eps)
        weights.append(w1)
        # Pass the discrete action as a Python int, as the training loop does.
        observation, reward,policy_reward, done, _ = env.step(action,action_policy.item())
        ep_reward += reward
        observation = obs_normalizer(observation)
        observation = observation.transpose(2, 0, 1)
    env.render()
    return actions, weights

In [None]:
# Testing

# Settings for the evaluation run
data_add ='Data/'
train_ratio = 0.8
window_size = 1
window_length = 3
market_feature = ['Open','High','Low','Close','QPL1','QPL-1']
feature_num = len(market_feature)
product_list = ["AUDCAD","AUDUSD","EURAUD","EURCAD","EURUSD","GBPUSD","NZDCHF","NZDUSD","USDCHF"]

observations, ts_d_len = load_observations(window_size, market_feature, feature_num, product_list)
train_size = int(train_ratio * ts_d_len)

# Keep the observations after the training split, drop singleton axes and
# move the last axis to the front.
test_observations = np.squeeze(
    observations[int(train_ratio * observations.shape[0]):]
).transpose(2, 0, 1)

mode = "Test"
steps = 405
# NOTE(review): the offset 282 and the start date are hard-coded; confirm
# that they refer to the same point in the data.
env = envs(
    product_list,
    market_feature,
    feature_num,
    steps,
    window_length,
    mode,
    start_index=train_size + 282,
    start_date='2019-6-25',
)

# load_actor()/load_policy() read the model path globals defined in the
# training cell, so that cell must have been run first.
actor = load_actor()
policy = load_policy()
test_actions, test_weight = test_model(env, actor, policy)