# Tasks
1. Collect Training Data
2. Compute Expected Returns
3. Actor-Critic Loss Function
4. Updating Parameters
5. Run Training in a Loop
6. Validate
7. Test

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras import layers
from trd_env import trading_env
import tqdm

In [43]:
df = pd.read_csv("btc_6H_(2016-2018).csv")
df.head()
env = trading_env(df)
env.reset()
env.render()
eps = np.finfo(np.float32).eps.item()

Current Porfolio Value:10000.000696952811; Available Capital: 10000; Current Stocks Held: 1e-06
No. Stocks Bought:1e-08; No. Stocks Sold:1e-07; Average Cost:0 
Return:100.00000696952812%; 0.0006969528112676926
Termination date: 2016-10-31 00:00:00


In [3]:
class SAC(tf.keras.Model):
    
    def __init__(self,n_actions,n_hl): #number of actions, number of hidden layers employed
        super().__init__()
        
        self.ac = layers.Dense(n_hl, activation = "relu")
        self.actor = layers.Dense(n_actions) # Number of outputs
        self.critic = layers.Dense(1) #Number of outputs
        
    def call(self,ins):
        x = self.ac(ins)
        return self.actor(x), self.critic(x)

In [4]:
n_actions = 3
n_hl = 50

model = SAC(n_actions,n_hl)

## 1. Data Collection
* During forward pass, use environment state as inputs to generate action probabilities
* Critic Value based on the current policy parametized by model weight
* Next action sampled using action probabilites generated by the model - applied to the environment generating the next state and reward

In [28]:
def walk_env(action):
    state,reward,done = env.step(action)
    return(np.array(state,np.float32), #might be an issue since our inputs is an array of 6
          np.array(reward,np.float32),
          np.array(done,np.int32))

def tf_env_walk(action):
    return tf.numpy_function(walk_env,[action],[tf.float32,tf.float32,tf.float32])

def collect(initial_state,model,max_steps):
    action_probabilities = tf.TensorArray(dtype = tf.float32, size=0, dynamic_size=True) #all action_probabilites
    c_values = tf.TensorArray(dtype = tf.float32, size=0, dynamic_size=True) # all critic_value
    rewards = tf.TensorArray(dtype = tf.float32, size=0, dynamic_size=True)
    
    init_state_shape = initial_state.shape
    state = init_state_shape
    
    for t in tf.range(max_steps):
        state = tf.expand_dims(state,0)
        action_proba, c_value = model(state)
        
        #sample next action from the probability
        action = tf.random.categorical(action_proba,1)[0,0] #draws sample from categorical distribution (chooses action)
        soft_action_proba = tf.nn.softmax(action_proba) # calculates softmax probabilities for each action (calculates the probability of said chosen action)
        #store log probability of chosen action
        action_probabilities = action_probabilities.write(t,soft_action_proba[0,action])
        
        #store critic value
        c_values = c_values.write(t,tf.squeeze(c_value))
        
        #Apply action to environment
        state,reward,done = tf_env_walk(action)
        state.set_shape(init_state_shape)
        
        #Store reward
        rewards = rewards.write(t, reward)
        
        if tf.cast(done,tf.bool):
            break
        
        action_probabilities = action_probabilities.stack()
        c_values = c_values.stack()
        rewards = rewards.stack()
        
        return action_probabilities, c_values, rewards
        

## 2. Calculate expcted return
* Convert reward from each episode into a sequence of expected returns
* Sum of rewards taken from t to T and each reward is multiplied by the factor gamma
* To stabalise training, returns are standardized using mean and std

In [41]:
def exp_return(rewards,gamma,standardize):
    n = tf.shape(rewards)[0]
    returns = tf.TensorArray(size = n, dtype=tf.float32)
    
    #Latest Reward -> First Reward
    rewards = tf.cast(rewards[::-1],dtype =tf.float32)
    discounted_sum = tf.constant(0.0)
    discounted_sum_shape = discounted_sum.shape
    for i in tf.range(n):
        reward = rewards[i]
        discounted_sum = reward * gamma * discounted_sum #Bellman Equation
        discounted_sum.set_shape(discounted_sum_shape)
        returns = returns.write(i,discounted_sum)
    returns = returns.stack()[::-1]
    
    if standardize:
        returns = ((returns-tf.math.reduce_mean(returns))/(tf.math.reduce_std(returns)+eps))
        
    return returns
    

## 3. Defining Loss Function
* Actor Loss = Bellman Equation*[G-V]
* Advantage indicates how much better an action is given a particular state over random actions selected by a policy
* Critic Loss: Can be treated as a regression problem using Hubers Loss Function(Less sensetive to outliers relative to squared error loss)

In [56]:
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

def compute_loss(action_probabilities, values,returns):
    advantage = returns-values
    action_lg_prob = tf.math.log(action_probabilities)
    actor_loss = -tf.math.reduce_sum(action_lg_prob * advantage)
    
    critic_loss = huber_loss(values,returns)
    return actor_loss + critic_loss

## 4. Training and Updating
* Runned every episode
* Use Adam Optimizer
* tf.GradienTape to enable autodiff
* Compute sum of undiscounted rewards to meet success criterion
* Use tf.function for the training step to improve training speed


In [61]:
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)

@tf.function
def training(init_state,model,optimizer,gamma,max_step_per_episode)-> tf.Tensor:
    
    with tf.GradientTape() as tape:
        # Run model for one episode to collect data
        action_probabilities,values,rewards = collect(init_state,model,max_step_per_episode) # Works
        #Calculate expected Returns
        returns = exp_return(rewards,gamma) # works
        # Convert training data to fit TF shapes
        action_probabilities,values,returns = [tf.expand_dims(x,1) for x in [action_probabilities,values,returns]] # Works
        #Calculate loss values for the network
        loss = compute_loss(action_probabilities, values,returns)#works
        
    #Compute Loss gradient
    grads = tape.gradient(loss,model.trainable_variables)
    
    #Apply gradient to model params
    optimizer.apply_gradients(zip(grads,model.trainable_variables))
    
    episode_reward = tf.math.reduce_sum(rewards)
    
    return episode_reward    

## 5. Training Execution

In [59]:
max_eps = 100
max_step_per_episode = len(df)

running_rewarsds = 0
reward_threshold = 120000 # Success Criterion > 1000% (return for holding from the start of the market)

gamma = 0.99 # Want to maximise future rewards as much as possible

with tqdm.trange(max_eps) as t:
    for i in t:
        initial_state = tf.constant(env.reset(),dtype=tf.float32)
        episode_reward = int(training(initial_state,model,optimizer,gamma,max_step_per_episode))
        
        running_reward = episode_reward * 0.01 + running_reward*0.99
        
        t.set_description(f"Epsiode{i}")
        t.set_postfix(episode_reward=episode_reward, running_reward=running_reward)
        
        #Show average reward every 10 epsiodes
        if i % 10 == 0:
            print(f" Episode {i}: Average Reward: {running_reward/i}")
        
        if running_reward > reward_threshold:
            break
    print(f"Completed: Episode {i}, average_reward:{running_reward/i}")

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]


NotImplementedError: in user code:

    <ipython-input-8-326441991a1a>:8 training  *
        action_probabilites,values,rewards = collect(init_state,model,max_step_per_episode)
    <ipython-input-28-297fec778ebd>:18 collect  *
        for t in tf.range(max_steps):
    C:\Users\teybo\Anaconda3\lib\site-packages\tensorflow\python\autograph\operators\control_flow.py:414 for_stmt
        symbol_names, opts)
    C:\Users\teybo\Anaconda3\lib\site-packages\tensorflow\python\autograph\operators\control_flow.py:629 _tf_range_for_stmt
        opts)
    C:\Users\teybo\Anaconda3\lib\site-packages\tensorflow\python\autograph\operators\control_flow.py:1059 _tf_while_stmt
        body, get_state, set_state, init_vars, nulls, symbol_names)
    C:\Users\teybo\Anaconda3\lib\site-packages\tensorflow\python\autograph\operators\control_flow.py:1032 _try_handling_undefineds
        _verify_loop_init_vars(init_vars, symbol_names, first_iter_vars)
    C:\Users\teybo\Anaconda3\lib\site-packages\tensorflow\python\autograph\operators\control_flow.py:172 _verify_loop_init_vars
        'a return statement cannot be placed inside this TensorFlow loop;'

    NotImplementedError: a return statement cannot be placed inside this TensorFlow loop; this may happen if a return statement depends on a static Python condition such as a hyperparameter


In [9]:
initial_state = tf.constant(env.reset(),dtype=tf.float32)

In [47]:
action_probabilities,values,rewards = collect(initial_state,model,max_step_per_episode)


In [48]:
returns = exp_return(rewards,gamma,True)
returns

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>

In [49]:
action_probabilities,values,returns = [tf.expand_dims(x,1) for x in [action_probabilites,values,returns]]

In [57]:
loss = compute_loss(action_probabilities, values,returns)

In [None]:
#Current Issue is with the training function