In [1]:
import tensorflow as tf
from tf_agents.environments import TFPyEnvironment, validate_py_environment

from tf_agents.trajectories import StepType, Trajectory, PolicyInfo
from tf_agents.networks import  Network
from tf_agents.policies import random_tf_policy
from tf_agents.drivers import dynamic_step_driver
from tf_agents.utils import common
from ddqn_agent import DdqnAgent

from keras.optimizers.schedules import PolynomialDecay, PiecewiseConstantDecay
import keras
from game_environment import cache_directory_files, USStockEnv, StockState, USStockEnv
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
# from sumTreeBatched import PrioritizedExperienceReplayBuffer  # python buffer
from per_buffer import PrioritizedExperienceReplayBuffer    # c++ buffer
from pprint import pprint
from typing import Tuple, NamedTuple, List

In [2]:
# # Data Collection
tickers = ["AAPL", "TSLA", "MSFT", "META", "GOOG", "AMZN", "NVDA","AVGO","JPM","WMT", "LLY","NFLX","V","ORCL","MA","XOM","JNJ","BAC","PLTR","ABBV"]
check_ticker = "^GSPC"

# from USstockHelper import create_previous_data_file, printStaticGraph
# # date_str = "2025-09-02" # Sep 2 was labour day
# create_previous_data_file([check_ticker]+tickers, date_str=date_str)
# printStaticGraph(date_str, check_ticker)
    

In [3]:
# Training Loop & Evaluation
num_iterations           = 120000           # Total number of training iterations
log_interval             = 500              # Steps between scalar/log prints
num_eval_episodes        = 2                # Episodes to run per evaluation
eval_interval            = 1500              # Steps between evaluations

# Time-Step & Sequence Length
N_step                      = 5             # Number of steps used in n-step return
T                           = N_step + 1    # Total frames in each training sequence
L                           = 10            # total number of observations in each state  

# Environment & Episode Configuration
collect_steps_per_iteration = 128          # New env steps per iteration (T defined above)
regularization              = 0.0            # L2 regularization strength

# Return & Reward Scaling
gamma                    = 0.999           # Discount factor for future rewards
reward_scalar            = 600               # Factor by which raw rewards are scaled

# Target Network Updates
target_update_tau        = 0.05              # Soft-update interpolation factor
target_update_period     = 100              # Steps between hard-copy target updates

# Learning Rate Schedule
init_fc_lr                  = 1e-3                 # Starting learning rate
final_lr                    = 1e-5                            # Minimum learning rate for both subnetworks
num_decay_steps          = int(0.8 * num_iterations)  # Steps over which LR decays from init → final

# Exploration (ε-Greedy)
epsilon_start            = 0.5             # Starting ε (fully random)
epsilon_end              = 0.05              # Final ε after decay

#Experience Replay (PER)
per_collect_steps           = 2**15            # Initial warm-up steps before any training
per_replay_capacity         = per_collect_steps            # Maximum transitions stored in buffer
per_replay_alpha            = 0.5              # Prioritization exponent (0 = uniform, 1 = full)
per_replay_beta             = 0.6              # Initial importance-sampling correction factor
per_delta_beta              = (1 - per_replay_beta) /num_decay_steps # β increment per training iteration
num_per_samples             = 128              # Samples drawn from PER per training step
max_window_size             = 2 * 390          # Max trajectories per add_batch_experience call
num_prev_file               = 10                # number of previous to include in the calculation of histoircal previous average

# Cross Valiation Parameters
num_folds                   =   4       # K in K fold cross validation
num_fold_update_interval    = (eval_interval/num_folds)   # every num_fold_update steps, the next fold is used. 
                                                    # Use eval_interval_num_folds, so that every eval 
                                                    # interval all folds have been used for validation once.

In [4]:
# Creating the train, validation and test data_caches and environments
class Fold(NamedTuple):
    train: dict
    files_train:list
    validation: dict
    files_validation:list


train_dir = "../US_Market_Data"         # for training and validation data
eval_dir = "../US_Market_Data/^GSPC"    # for test data

train_cache = cache_directory_files(directory=train_dir, exclude_sub_dir=eval_dir, num_prev_file=num_prev_file) # eval_dir is excluded from training data
test_cache = cache_directory_files(directory=eval_dir, num_prev_file=num_prev_file)

""" Creating the validation and training folds"""
folds_list = [{} for _ in range(num_folds)]
fold_index = 0

# round-robin partioning
for k,v in train_cache.items():
    folds_list[fold_index][k] = v
    fold_index = (fold_index + 1) % num_folds

folds:List[Fold] = []
for index in range(num_folds):
    v = folds_list[index]
    t = {}
    for i, cache in enumerate(folds_list):
        if i == index: continue
        t.update(cache)
    folds.append(Fold(train=t, files_train= list(t.keys()), validation=v, files_validation= list(v.keys())))
    
del folds_list, train_dir, eval_dir

""" Creating the ennvironments """
# training environment, just for training, no evaluations
train_py_env = USStockEnv( folds[fold_index].train, reward_scalar, L)
train_tf_env = TFPyEnvironment(train_py_env)

# validation environments for both literal money and average return evaluations
validation_py_env = USStockEnv( folds[fold_index].validation, reward_scalar, L)
validation_tf_env = TFPyEnvironment(validation_py_env)

# test environments for both literal money and average return 
test_py_env = USStockEnv(test_cache, reward_scalar, L)
test_tf_env = TFPyEnvironment(test_py_env)

validate_py_environment(train_py_env, episodes=10)
validate_py_environment(validation_py_env, episodes=10)
validate_py_environment(test_py_env, episodes=10)

print( train_tf_env.action_spec())
print( train_tf_env.observation_spec())
print( train_tf_env.reward_spec())
print(f"{len(folds[fold_index].train)} training files, {len(folds[fold_index].validation)} validation files, {len(test_cache)} test files")

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(2, dtype=int32))
{'independent': TensorSpec(shape=(10, 6), dtype=tf.float32, name='ind_obs'), 'dependent': TensorSpec(shape=(10, 4), dtype=tf.float32, name='dep_obs'), 'compound': TensorSpec(shape=(6,), dtype=tf.float32, name='comp_obs')}
{'training': TensorSpec(shape=(), dtype=tf.float32, name='reward'), 'evaluation': TensorSpec(shape=(), dtype=tf.float32, name='evaluation')}
616 training files, 205 validation files, 94 test files


In [5]:
class CustomNetwork(Network):
    def __init__(self, observation_spec, action_spec, regularization, name="CustomNetwork"):
        super(CustomNetwork, self).__init__(input_tensor_spec=observation_spec, state_spec=(), name=name)
        
        # Save parameters
        self._observation_spec = observation_spec
        self._action_spec = action_spec
        self._regularization = regularization
        self._regularizer = keras.regularizers.l2(self._regularization)
        self._num_actions = action_spec.maximum - action_spec.minimum + 1
        self._name = name
      
        self.independent_lstm = keras.layers.LSTM(units=85, name="INDEPENDENT_LSTM", kernel_regularizer=self._regularizer)
        self.dependent_lstm = keras.layers.LSTM(units=85, name="DEPENDENT_LSTM",kernel_regularizer=self._regularizer)
        
        self.compound_embedding_mlp = keras.Sequential([
            keras.layers.Dense(86, activation=None, kernel_regularizer=self._regularizer),
            keras.layers.LayerNormalization(),
            keras.layers.LeakyReLU(),
            keras.layers.Dense(86, activation=None, kernel_regularizer=self._regularizer),
            keras.layers.LayerNormalization(),
            keras.layers.LeakyReLU()
        ], name="COMPOUND_EMBEDING_MLP")
        
        # Create residual blocks (each with 2 layers)
        self.residual_blocks = []
        for i in range(7):
            block = keras.Sequential([
                keras.layers.Dense(256, activation=None, kernel_regularizer=self._regularizer),
                keras.layers.LayerNormalization(),
                keras.layers.LeakyReLU(),
                keras.layers.Dense(256, activation=None, kernel_regularizer=self._regularizer),
                keras.layers.LayerNormalization()
            ], name=f"RESIDUAL_BLOCK_{i}")
            self.residual_blocks.append(block)
        
        self.value_head = keras.layers.Dense(units=1, activation=None, name="VALUE_HEAD", kernel_regularizer=self._regularizer)
        self.advantage_head = keras.layers.Dense(units=self._num_actions, name="ADVANTAGE_HEAD", kernel_regularizer=self._regularizer)
    
    def call(self, inputs, network_state=(), training=False):
        ind = tf.cast(inputs['independent'], tf.float32)
        dep = tf.cast(inputs['dependent'], tf.float32)
        comp = tf.cast(inputs['compound'], tf.float32)
    
        # Run the network
        ind_emb = self.independent_lstm(ind, training=training)
        dep_emb = self.dependent_lstm(dep, training=training)
        comp_emb = self.compound_embedding_mlp(comp, training=training)
        
        x = tf.concat([ind_emb, dep_emb, comp_emb], axis=-1)
        
        # Apply residual blocks
        for block in self.residual_blocks:
            residual = x
            x = block(x, training=training)
            x = tf.nn.leaky_relu(x + residual)  # Skip connection with activation
        
        # Produce dueling outputs
        V = self.value_head(x, training=training) 
        A = self.advantage_head(x, training=training)    
        q = V + (A - tf.reduce_mean(A, axis=-1, keepdims=True))  
        return q, network_state

    def copy(self, **kwargs):
        return type(self)(
            self._observation_spec,
            self._action_spec,
            self._regularization,
            self._name,
            **kwargs
        )

class EpsilonDecay():
    def __init__(self, schedule, step_counter):
        self.schedule = schedule  # A Keras-style schedule (e.g., PolynomialDecay)
        self.step_counter = step_counter  # tf.Variable, shared with agent

    def __call__(self):
        # Read current training step and call the inner schedule
        step = tf.cast(self.step_counter, tf.int32)
        return self.schedule(step)

train_step_counter = tf.Variable(0, dtype=tf.int32, trainable=False)

epsilon_schedule_fn = EpsilonDecay(
    schedule = PolynomialDecay(
        initial_learning_rate=epsilon_start,
        decay_steps=num_decay_steps,
        end_learning_rate=epsilon_end,
        power=2.0,
        cycle=False,
    ),
    step_counter=train_step_counter
)  

network = CustomNetwork( train_tf_env.observation_spec(), train_tf_env.action_spec(), regularization)
fc_schedule = PolynomialDecay(init_fc_lr, num_decay_steps, final_lr, power=3, cycle=False)
optimizer = keras.optimizers.legacy.Adam(fc_schedule)

# # """ For Leslie Smith LR Test"""
# S = 3000              # steps per piece
# P = int(num_iterations/S)                # number of pieces (segments)
# boundaries = [i * S for i in range(1, P)]
# #    - values spaced geometrically from eta_start→eta_end
# eta_start = 1e-7
# eta_end   = 1e-0
# values = np.geomspace(eta_start, eta_end, num=P).tolist()

# # 3) PIECEWISE SCHEDULE & OPTIMIZER
# schedule = PiecewiseConstantDecay(boundaries, values)
# optimizer = keras.optimizers.legacy.Adam(schedule)
# """ Test Over"""

agent = DdqnAgent(
    train_tf_env.time_step_spec(),
    train_tf_env.action_spec(),
    q_network = network,
    optimizer = optimizer,
    epsilon_greedy = epsilon_schedule_fn,
    n_step_update = N_step,
    target_update_period = target_update_period,
    target_update_tau = target_update_tau,
    td_errors_loss_fn = common.element_wise_huber_loss,
    gamma = gamma,
    train_step_counter = train_step_counter,
    )

agent.initialize()
agent._q_network.summary()

#Defining Policies
random_policy = random_tf_policy.RandomTFPolicy(agent.time_step_spec, agent.action_spec, emit_log_probability = True)


Model: "CustomNetwork"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 INDEPENDENT_LSTM (LSTM)     multiple                  31280     
                                                                 
 DEPENDENT_LSTM (LSTM)       multiple                  30600     
                                                                 
 COMPOUND_EMBEDING_MLP (Seq  (1, 86)                   8428      
 uential)                                                        
                                                                 
 RESIDUAL_BLOCK_0 (Sequenti  (1, 256)                  132608    
 al)                                                             
                                                                 
 RESIDUAL_BLOCK_1 (Sequenti  (1, 256)                  132608    
 al)                                                             
                                                     

In [6]:
# eta_data = {}

# a = S/eval_interval

# if not a.is_integer(): ValueError()
# else: a = int(a)

# for ind,eta in enumerate(values):
#     eta_data[eta] = [loss_log[a*ind + a_i] for a_i in range(a)]    
    
# pprint(eta_data)


In [7]:
# Helper functions for setting up replay buffer and key positions buffer

OBSERVATION_SPEC = train_tf_env.observation_spec()

standard_padding_traj = Trajectory(
    step_type = tf.fill(dims=(1,), value=StepType.LAST, name="step_type"),
    observation = tf.nest.map_structure(
                    lambda s: tf.fill(dims = (1,*s.shape), value=0.0),
                    OBSERVATION_SPEC
                    ),
    action = tf.fill(dims=(1,), value=0, name="action"),
    policy_info = tf.fill(dims=(1,), value=0.0, name="policy_info"),
    next_step_type = tf.fill(dims=(1,), value=StepType.FIRST, name="next_step_type"),
    reward = tf.fill(dims=(1,), value=0.0, name="reward"),
    discount = tf.fill(dims=(1,), value=1.0, name="discount"),
)

combined_traj = Trajectory(
    *(
        tf.nest.map_structure(
            lambda x: tf.Variable(tf.zeros((T, *x.shape.as_list()[1:]), dtype=x.dtype), trainable=False),
            traj_arg
        )
        for traj_arg in standard_padding_traj
    )
)

combined_traj_indx = tf.Variable(0, dtype=tf.int32, trainable=False)

# combined traj functions
@tf.function
def reset_combined_traj():
    global combined_traj, standard_padding_traj, T
    
    def _reset_leaf(c_leaf, p_leaf):
        c_leaf.assign(tf.zeros(shape=(T, *p_leaf.shape[1:]), dtype=p_leaf.dtype))
        
    """Reset the global combined_traj to zeros."""
    tf.nest.map_structure(_reset_leaf, combined_traj, standard_padding_traj)

@tf.function
def combined_traj_write(write_traj: Trajectory, index: tf.Tensor):
    """
    Writes a single trajectory into combined_traj at given index,
    handling mismatched policy_info structure.
    """
    global combined_traj
    def _assign_field(c_field, w_field):
        c_field.assign(
            tf.tensor_scatter_nd_update(c_field, [[index]], w_field)
        )

    # Make a mutable copy of write_traj
    w_traj = write_traj

    # Special-case policy_info to match combined_traj
    if isinstance(w_traj.policy_info, tuple) and not hasattr(w_traj.policy_info, '_fields'):
        # Empty tuple case → use zeros of the same shape as combined_traj.policy_info
        w_traj = w_traj._replace(
            policy_info=tf.zeros(shape=(1,),dtype=tf.float32)
        )
    elif hasattr(w_traj.policy_info, '_fields'):  
        # Namedtuple case → pick the first/primary field (here log_probability)
        w_traj = w_traj._replace(
            policy_info=w_traj.policy_info.log_probability
        )

    tf.nest.map_structure(_assign_field, combined_traj, w_traj)

@tf.function
def fifo_shift_combined_traj():
    global combined_traj, standard_padding_traj

    def _shift_assign(c_leaf, p_leaf):
        # c_leaf: (T, ...) trajectory tensor (tf.Variable)
        # p_leaf: (1, ...) padding tensor
        new_val = tf.concat([c_leaf[1:], p_leaf], axis=0)
        c_leaf.assign(new_val)  # In-place update

    # Applies to all leaves in nested structure:
    tf.nest.map_structure(_shift_assign, combined_traj, standard_padding_traj)
        
batched_traj = Trajectory(
    *(
        tf.nest.map_structure(
            lambda x: tf.Variable(tf.zeros( shape = (max_window_size, *x.shape), dtype=x.dtype),
                                  name=x.name,
                                  trainable = False),
            traj_arg
        )
      
        for traj_arg in combined_traj
    )
)
batched_traj_indx = tf.Variable(0, dtype=tf.int32, trainable=False)

@tf.function
def batched_traj_write(write_traj: Trajectory, index: tf.Tensor):
    """
    Update batched_traj at position `index` from write_traj, for all fields.
    Works with nested structures inside the trajectory.
    """
    def _assign_field(batched_field, write_field):
        update = tf.expand_dims(write_field, axis=0)  # add batch dim
        batched_field.assign(
            tf.tensor_scatter_nd_update(batched_field, [[index]], update)
        )

    tf.nest.map_structure(_assign_field, batched_traj, write_traj)

def add_to_per_buffer(*flat_list: list):
    global per_replay_buffer
    # Number of trajectories in this batch
    td_err = flat_list[-1]
    batch_size = td_err.size  # same as td_err.shape[0]
    
    batched_traj = tf.nest.pack_sequence_as(standard_padding_traj, flat_list[:-1])
    tuple_list = []
    for i in range(batch_size):
        # extract i-th example from each leaf (works for nested obs)
        single_traj = tf.nest.map_structure(lambda x: x[i], batched_traj)
        tuple_list.append((single_traj, float(td_err[i])))
        
    per_replay_buffer.add_batch_experience(tuple_list)
    return None

# Tests for combined traj
# test_traj = Trajectory(*[
#     tf.ones_like(field) * (1 if field.dtype.is_integer else 5.0)
#     for field in standard_padding_traj
# ])
# combined_traj_indx.assign(tf.constant(1))
# print(f"Test Traj {test_traj}")
# print(f"\n\n\n\nOriginal:\n{combined_traj}")
# combined_traj_write(test_traj,combined_traj_indx)
# print(f"\n\n\n\nAfter write:\n{combined_traj}")
# fifo_shift_combined_traj()
# print(f"\n\n\n\nAfter fifo shift:\n{combined_traj}")
# reset_combined_traj()
# print(f"\n\n\n\nAfter reset:\n{combined_traj}")


In [8]:
"""
Create the PER buffer and allocate (trajectory,td_err) to it. 
"""

# Create and fill out PER buffers 
per_buffer_list = [PrioritizedExperienceReplayBuffer(per_replay_capacity, per_replay_alpha, per_replay_beta) for _ in range(num_folds)]

# IMPORTANT: the observer and consequently the drivers act on this buffe, so always assign it
per_replay_buffer: PrioritizedExperienceReplayBuffer

# Observer to add collected trajectories to the replay buffer
reset_combined_traj() # all boundary trajectories
combined_traj_indx.assign(0) # reset index to 0
batched_traj_indx.assign(0) # reset batch traj

@tf.function
def per_buffer_observer(traj: Trajectory):
    global agent, gamma, combined_traj, combined_traj_indx, T, batched_traj_indx, batched_traj, td_error_tnsr
    
    traj = traj.replace(reward = traj.reward["training"]) # for training purposes, use only the training reward
    combined_traj_write(traj, combined_traj_indx)
    # post increment means that the index is now equal to the size of the valid entries in combined trajectories
    combined_traj_indx.assign_add(1) 
    
    if traj.is_boundary():
        # Process ALL possible trajectories in the buffer
        for _ in tf.range( combined_traj_indx - 1):
            # 2. Add to batch BEFORE checking boundary
            
            batched_traj_write(combined_traj, batched_traj_indx)
            batched_traj_indx.assign_add(1)
            
            # 3. Check if FIRST element is LAST AFTER processing
            if tf.equal(combined_traj.step_type[0], StepType.LAST):
                tf.print("The first trajectory is of step type: ",combined_traj.next_step_type[0],"\ncombined_traj: ",combined_traj)

            fifo_shift_combined_traj()
                    
        # compute td_errs 
        # reward_scale_factor is at environment level
        # loss function does not matter, since only td_errors are needed
        
        sample_traj = tf.nest.map_structure(lambda x, ind = batched_traj_indx: x[0:ind], batched_traj) # from 0 to ind-1, becasue of slicing
        
        loss_info = agent._loss(sample_traj, gamma=gamma, td_errors_loss_fn=common.element_wise_huber_loss,
                                reward_scale_factor=1.0, weights=None, training=True)
        td_err = loss_info.extra.td_error
        
        # add batch to buffer and reset batch
        tf.numpy_function(
            add_to_per_buffer,
            inp=[ *tf.nest.flatten(sample_traj) , td_err],
            Tout=[]  # Match empty list if no returns needed
        )
        batched_traj_indx.assign(0)
        
        # reset combined traj and states
        reset_combined_traj()
        combined_traj_indx.assign(0)

    # Handle regular sliding window case
    if combined_traj_indx == T:
        
        batched_traj_write(combined_traj, batched_traj_indx)
        batched_traj_indx.assign_add(1)
        
        fifo_shift_combined_traj()
        combined_traj_indx.assign_sub(1)

# Create the dynamic step driver
initial_driver = dynamic_step_driver.DynamicStepDriver(
    env=train_tf_env,                         # The environment to collect data from
    policy=random_policy,                     # Policy used to collect data (e.g., random_policy)
    observers=[per_buffer_observer],# Observer that adds to the replay buffer
    num_steps=per_collect_steps           # Number of steps to collect
)

# Run the driver
initial_driver.run = common.function(initial_driver.run)

# Populate all buffers
for index in range(num_folds):
    fold_index = index
    train_py_env.update_data_cache(folds[fold_index].train, folds[fold_index].files_train)
    train_py_env.reset()
    per_replay_buffer = per_buffer_list[fold_index]
    
    initial_driver.run(train_tf_env.reset())

"""
Print the number of actual elements added to the buffers
"""
for index in range(num_folds):
    print(f"Fold {index}: PER Buffer Capacity:{per_buffer_list[index].capacity}, PER Actual Trajectories:{per_buffer_list[index].length}, Batch Size: {num_per_samples}")

  retval_ = ag__.and_(lambda : ag__.ld(state) is not None, lambda : ag__.and_(lambda : ag__.ld(state) is not (), lambda : ag__.ld(state) is not []))


Fold 0: PER Buffer Capacity:32768, PER Actual Trajectories:32680, Batch Size: 128
Fold 1: PER Buffer Capacity:32768, PER Actual Trajectories:32768, Batch Size: 128
Fold 2: PER Buffer Capacity:32768, PER Actual Trajectories:32768, Batch Size: 128
Fold 3: PER Buffer Capacity:32768, PER Actual Trajectories:32768, Batch Size: 128


In [9]:
# Helper functions for interacting with python native per buffer and evalution of key metrics
@tf.function
def evaluate_episode_metrics(env, num_episodes, policy):
    """
    Runs `num_episodes` episodes and returns:
      avg_return:        average training return per episode (divided by reward_scalar)
      avg_money_pct:     average evaluation money per episode as a percentage (money *100 / reward_scalar)
      avg_trades:        average trades per episode (buy+sell counted as one)
    """
    # capture reward scalar as a TF constant
    reward_sf = tf.constant(reward_scalar, dtype=tf.float32)

    # outer loop body
    def _episode_body(i, total_train, total_eval, total_trades):
        # reset env & policy state
        ts = env.reset()
        policy_state = policy.get_initial_state(batch_size=1)

        # per-episode accumulators
        ep_train = tf.constant(0.0, dtype=tf.float32)
        ep_eval = tf.constant(0.0, dtype=tf.float32)
        ep_trades = tf.constant(0, dtype=tf.int32)

        # inner loop condition / body
        def _step_cond(ts, policy_state, ep_train, ep_eval, ep_trades):
            return tf.logical_not(ts.is_last())

        def _step_body(ts, policy_state, ep_train, ep_eval, ep_trades):
            action_step = policy.action(ts, policy_state)
            ts_next = env.step(action_step.action)

            # extract training / evaluation components from the (structured) reward
            # reduce_sum collapses potential (1,) batch dimension to scalar
            train_r = tf.cast(tf.reduce_sum(ts_next.reward["training"]), tf.float32)
            eval_r = tf.cast(tf.reduce_sum(ts_next.reward["evaluation"]), tf.float32)

            ep_train = ep_train + train_r
            ep_eval = ep_eval + eval_r

            # increment trade count when evaluation (money) is non-zero
            delta = tf.cond(tf.not_equal(eval_r, 0.0),
                            lambda: tf.constant(1, dtype=tf.int32),
                            lambda: tf.constant(0, dtype=tf.int32))
            ep_trades = ep_trades + delta

            return ts_next, action_step.state, ep_train, ep_eval, ep_trades

        # run episode
        _, _, ep_train, ep_eval, ep_trades = tf.while_loop(
            _step_cond,
            _step_body,
            [ts, policy_state, ep_train, ep_eval, ep_trades],
            # optional shape_invariants could be added if needed
        )

        # accumulate totals and increment episode index
        return i + 1, total_train + ep_train, total_eval + ep_eval, total_trades + ep_trades

    # initialize outer loop vars
    i0 = tf.constant(0, dtype=tf.int32)
    total_train0 = tf.constant(0.0, dtype=tf.float32)
    total_eval0 = tf.constant(0.0, dtype=tf.float32)
    total_trades0 = tf.constant(0, dtype=tf.int32)

    # outer loop
    _, total_train, total_eval, total_trades = tf.while_loop(
        lambda i, *_: tf.less(i, num_episodes),
        _episode_body,
        [i0, total_train0, total_eval0, total_trades0],
    )

    # compute averages:
    n_eps_f = tf.cast(num_episodes, tf.float32)
    avg_return = total_train / (n_eps_f * reward_sf)                    # training return normalized by reward scalar
    avg_money_pct = (total_eval * 100.0) / (n_eps_f * reward_sf)         # evaluation money -> percent
    avg_trades = tf.cast(total_trades, tf.float32) / (2.0 * n_eps_f)    # buy+sell counted as one

    return avg_return, avg_money_pct, avg_trades

""" Helper functions dor interacting with native per buffer"""
def sample_helper(batch_size):
    trajs, weights, idxs = per_replay_buffer.sample(batch_size)
    
        # Create batched trajectory by stacking each component
    batched_traj = tf.nest.map_structure(lambda *x: np.stack(x, axis = 0), *trajs)
    flat_traj = tf.nest.flatten(batched_traj)
    return flat_traj + [np.array(weights, dtype=np.float32), np.array(idxs,dtype = np.int32)]
    
tout = []
tf.nest.map_structure(lambda x: tout.append(x.dtype), standard_padding_traj) # dtypes for trajectory
tout.append(tf.float32) # importance_weights dtype
tout.append(tf.int32)      # index dtype

@tf.function
def get_per_batch(batch_size) -> Tuple[Trajectory, tf.Tensor, tf.Tensor]:
    """
    Fetch a batch from the native PER helper and return fully-shaped tensors.

    Args:
        batch_size: scalar Python int or 0-D `tf.Tensor` (int32/int64) giving the requested batch size.

    Returns:
        all_trajs: `Trajectory`-shaped nested structure whose leaves are `tf.Tensor`s with runtime
                   shape `[batch_size, ...]` (tail dims match `combined_traj` template).
        weights:   `tf.Tensor` of shape `[batch_size]`, dtype `tf.float32` (importance weights).
        idxs:      `tf.Tensor` of shape `[batch_size]`, dtype `tf.int32` (buffer indices).

    Notes:
        - This function calls a Python-side sampling helper via `tf.numpy_function` and repacks
          the flattened results into the nested `combined_traj` structure, then reshapes leaves
          so the leading dimension equals `batch_size`.
    """
    global tout, combined_traj
    
    flat_list = tf.numpy_function( func=sample_helper, inp=[batch_size], Tout=tout)
    idxs = flat_list[-1]
    weights = flat_list[-2]
    
    all_trajs = tf.nest.pack_sequence_as(combined_traj, flat_list[:-2])
    all_trajs = tf.nest.map_structure(lambda x, c: tf.reshape(x , [batch_size, *c.shape]) ,all_trajs, combined_traj)
    
    weights = tf.reshape(weights, [batch_size])
    idxs = tf.reshape(idxs, [batch_size])
    
    return all_trajs, weights, idxs

def per_set_beta(beta: float |np.ndarray): 
    global per_replay_buffer
    per_replay_buffer.set_beta(float(beta))
    return None

def per_update_priorities(per_indexes, td_errors):
    global per_replay_buffer
    per_replay_buffer.update_leaf_priorities(per_indexes, td_errors)
    return None

In [10]:
# Preparation for training.
agent.train = common.function(agent._train)
agent.train_step_counter.assign(0)

# Collection happens with the collect_policy
collect_driver = dynamic_step_driver.DynamicStepDriver(
  env = train_tf_env,                 
  policy = agent.collect_policy,                
  observers =[per_buffer_observer],
  num_steps = collect_steps_per_iteration    
)
collect_driver.run = common.function(collect_driver.run)

# Evaluate the agent's policy for plotting purposes.
training_loss = [] # Store total loss values
training_q_values = [] # Storage for Q-values at evaluation intervals

validation_return = []
validation_money = []
validation_count = []

test_return = []
test_money = [] # stores the literal money 
test_count = [] # stores the number of trades an agent makes

per_replay_beta_tensor = tf.Variable(per_replay_beta, dtype=tf.float32)

@tf.function
def train_step(step, curr_step, policy_state):
    global per_replay_beta_tensor, num_per_samples
    
    # Collect experience.
    curr_step, policy_state = collect_driver.run(curr_step, policy_state)

    # Sample from PER buffer
    all_trajs, all_is_weights, per_indexes = get_per_batch(num_per_samples)
    
    # Train agent
    loss_info = agent.train(all_trajs, all_is_weights)
    
    total_loss = loss_info.loss
    td_errors = loss_info.extra.td_error
    
    # Update PER priorities and beta
    tf.numpy_function(func = per_update_priorities,inp=[per_indexes, td_errors],Tout=[])
    
    per_replay_beta_tensor.assign(tf.minimum(per_replay_beta_tensor + per_delta_beta, 1.0))
    
    tf.numpy_function(func = per_set_beta,inp=[per_replay_beta_tensor],Tout=[])
        
    if step % log_interval == 0:
        tf.print("step =", step, ": loss =", total_loss)
    
    return total_loss, curr_step, policy_state
    
def evaluation_step(step):
    """ Validation Metrics """
    avg_return_t, avg_money_pct_t, avg_count_t = evaluate_episode_metrics(validation_tf_env, num_eval_episodes, agent.policy)

    # convert to python floats and append (avg_money_pct_t is already in percent)
    avg_return = float(avg_return_t.numpy())
    validation_return.append(avg_return)

    avg_money_pct = float(avg_money_pct_t.numpy())   # already *100 inside evaluate_episode_metrics
    validation_money.append(avg_money_pct)

    avg_count = float(avg_count_t.numpy())
    validation_count.append(avg_count)

    print(f"step = {step}: Validation: Count = {avg_count:.5f}, Money = {avg_money_pct:.5f}, Reward = {avg_return:.5f}")

    """ Test Metrics """
    avg_return_t, avg_money_pct_t, avg_count_t = evaluate_episode_metrics(test_tf_env, num_eval_episodes, agent.policy)

    avg_return = float(avg_return_t.numpy())
    test_return.append(avg_return)

    avg_money_pct = float(avg_money_pct_t.numpy())   # already *100 inside evaluate_episode_metrics
    test_money.append(avg_money_pct)

    avg_count = float(avg_count_t.numpy())
    test_count.append(avg_count)

    print(f"step = {step}: Test: Count = {avg_count:.5f}, Money = {avg_money_pct:.5f}, Reward = {avg_return:.5f}")

    """ Training Metrics """
    # === Collect Q-values for randomly selected states ===
    num_q_samples = min(num_per_samples, 10)
    trajs, _, _= get_per_batch(num_q_samples)

    # slice out the first time‐step before calling the network
    q_values, _ = agent._q_network(
        tf.nest.map_structure(lambda x: x[:,0], trajs.observation) , #remove T (N step update part)
        tf.nest.map_structure(lambda x: x[:,0], trajs.step_type),   # input T = 0
        network_state=agent.policy.get_initial_state(num_q_samples),
        training=False
    ) # Out [B, A]
    # average over batch
    q_values = tf.reduce_mean(q_values, axis=0)  # → [A,]
    training_q_values.append(q_values.numpy())
    
def training_and_evalution():
    
    global agent, eval_interval, num_fold_update_interval, fold_index, folds, num_folds
    global combined_traj_indx, batched_traj_indx, per_replay_buffer 
    
    reset_combined_traj() # all boundary trajectories
    combined_traj_indx.assign(0) # reset index to 0
    batched_traj_indx.assign(0) # reset batch

    curr_step = train_tf_env.reset()# Reset the environment.
    policy_state = None
    
    for _ in range(num_iterations):
        step = agent.train_step_counter
        loss, curr_step, policy_state = train_step(step, curr_step, policy_state)
        step = int(step.numpy())
        if step % num_fold_update_interval == 0: 
            
            # update training and validation environments
            train_py_env.update_data_cache( folds[fold_index].train, folds[fold_index].files_train )
            validation_py_env.update_data_cache(folds[fold_index].validation, folds[fold_index].files_validation)
            
            # reset states
            train_py_env.reset()
            curr_step = train_tf_env.reset()
            policy_state = None
            
            # reset buffer parameters
            reset_combined_traj()
            combined_traj_indx.assign(0)
            batched_traj_indx.assign(0)
            
            # use another buffer
            per_replay_buffer = per_buffer_list[fold_index]
            fold_index = (fold_index + 1) % num_folds
            
        if step % eval_interval == 0:
            training_loss.append(loss.numpy())
            evaluation_step(step)

training_and_evalution()

step = 500 : loss = 1.40867877
step = 1000 : loss = 0.493961513
step = 1500 : loss = 0.48890686


  ag__.if_stmt(ag__.ld(policy_state) is (), if_body_2, else_body_2, get_state_2, set_state_2, ('do_return', 'retval_'), 2)


step = 1500: Validation: Count = 4.00000, Money = 0.09643, Reward = -0.00035
step = 1500: Test: Count = 2.00000, Money = 0.16132, Reward = 0.00021
step = 2000 : loss = 0.217109442
step = 2500 : loss = 0.596340358
step = 3000 : loss = 0.463815629
step = 3000: Validation: Count = 0.00000, Money = 0.00000, Reward = 0.00000
step = 3000: Test: Count = 0.00000, Money = 0.00000, Reward = 0.00000
step = 3500 : loss = 0.595168114
step = 4000 : loss = 0.456965148
step = 4500 : loss = 0.622768402
step = 4500: Validation: Count = 1.50000, Money = -0.13857, Reward = -0.00104
step = 4500: Test: Count = 1.50000, Money = -0.12454, Reward = -0.00015
step = 5000 : loss = 0.291768432
step = 5500 : loss = 0.280453622
step = 6000 : loss = 0.490948588
step = 6000: Validation: Count = 2.00000, Money = 3.54627, Reward = 0.03439
step = 6000: Test: Count = 2.00000, Money = -0.13064, Reward = -0.00037
step = 6500 : loss = 0.335695982
step = 7000 : loss = 0.503024101
step = 7500 : loss = 0.364633381
step = 7500: 

KeyboardInterrupt: 

In [None]:

# assemble arrays
q_vals = np.array(training_q_values)
q_mean = np.mean(q_vals, axis=1)
q_max  = np.max(q_vals, axis=1)

data = {
    'loss': np.array(training_loss),
    'q_mean': q_mean,
    'q_max': q_max,
    'val_return': np.array(validation_return),
    'val_money': np.array(validation_money),
    'val_count': np.array(validation_count),
    'test_return': np.array(test_return),
    'test_money': np.array(test_money),
    'test_count': np.array(test_count),
}

df = pd.DataFrame(data)
# df = pd.read_csv("../temp.csv", index_col=False, header=0)
# Pearson correlation
corr = df.corr(method='pearson').abs()

# # Heatmap
plt.figure(figsize=(9,7))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()


In [None]:
# --- plotting (reorganized middle & bottom plots) ---
# prepare arrays (unchanged)
q_values_array = np.array(training_q_values)            # [num_eval_points, num_actions]
val_return_arr   = np.array(validation_return)
val_money_arr    = np.array(validation_money)
val_count_arr    = np.array(validation_count)

test_return_arr  = np.array(test_return)
test_money_arr   = np.array(test_money)
test_count_arr   = np.array(test_count)

steps = list(range(0, num_iterations, eval_interval))

# Compute mean and max Q over actions
q_mean = np.mean(q_values_array, axis=1)
q_max  = np.max(q_values_array, axis=1)

# Create figure with three subplots stacked vertically
fig, (ax_top, ax_mid, ax_bottom) = plt.subplots(nrows=3, figsize=(14, 14), sharex=True)

# --- Top Plot: Q-values and Loss (unchanged) ---
ax_top.set_ylabel('Loss', color='black')
line_loss = ax_top.plot(steps, training_loss, label='Loss', color='black', linewidth=2, linestyle='dashed')[0]
ax_top.tick_params(axis='y', labelcolor='black')

ax2_top = ax_top.twinx()
ax2_top.set_ylabel('Q Values', color='tab:red')

q_line_short     = ax2_top.plot(steps, q_values_array[:, 0], color='tab:cyan', linestyle='dotted', linewidth=1.2, label='Short')[0]
q_line_no_trade  = ax2_top.plot(steps, q_values_array[:, 1], color='tab:red',  linestyle='dotted', linewidth=1.2, label='NoTrade')[0]
q_line_long      = ax2_top.plot(steps, q_values_array[:, 2], color='tab:green',linestyle='dotted', linewidth=1.2, label='Long')[0]

q_line_mean = ax2_top.plot(steps, q_mean, label='Mean Q', color='#6a3d9a', linewidth=2)[0]
q_line_max  = ax2_top.plot(steps, q_max,  label='Max Q',  color='#ff7f00', linewidth=2)[0]

ax2_top.tick_params(axis='y', labelcolor='tab:red')

handles_top = [line_loss, q_line_mean, q_line_max, q_line_short, q_line_long, q_line_no_trade]
labels_top = [h.get_label() for h in handles_top]
ax_top.legend(handles_top, labels_top, loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=4, fontsize=9)
ax_top.set_title('Q-Values and Loss Over Time')

# --- Middle Plot: Average Return (Validation + Test) ---
ax_mid.set_xlabel('Step')
ax_mid.set_ylabel('Average Return', color='tab:blue')
# Both lines blue but different structured lines
val_line_ret = ax_mid.plot(steps, val_return_arr, label='Val Avg Return', color='tab:blue', linewidth=2, linestyle='solid')[0]
test_line_ret = ax_mid.plot(steps, test_return_arr, label='Test Avg Return', color='tab:blue', linewidth=2, linestyle='dashed')[0]
ax_mid.tick_params(axis='y', labelcolor='tab:blue')
ax_mid.axhline(0, color='skyblue', linestyle='dashed', linewidth=1.2, label='Return=0')

handles_mid = [val_line_ret, test_line_ret]
labels_mid = [h.get_label() for h in handles_mid]
ax_mid.legend(handles_mid, labels_mid, loc='upper center', bbox_to_anchor=(0.5, -0.18), ncol=2, fontsize=9)
ax_mid.set_title('Average Return: Validation vs Test')

# --- Bottom Plot: Money Made (as percentage label) and Counts (Both Validation + Test) ---
ax_bottom.set_xlabel('Step')
ax_bottom.set_ylabel(f'Percentage Money Made for {check_ticker}', color='tab:green')
# Money lines share the same money axis (green) — legend labels mention Val/Test only
val_line_money = ax_bottom.plot(steps, val_money_arr, label='Val Money (%)', color='tab:green', linewidth=2, linestyle='solid')[0]
test_line_money = ax_bottom.plot(steps, test_money_arr, label='Test Money (%)', color='tab:green', linewidth=2, linestyle='dashed')[0]
ax_bottom.tick_params(axis='y', labelcolor='tab:green')
ax_bottom.axhline(0, color='lightgreen', linestyle='dashed', linewidth=1.2, label='Money=0')

# Counts on a separate right axis
ax2_bottom = ax_bottom.twinx()
ax2_bottom.spines['right'].set_position(('outward', 60))
ax2_bottom.set_ylabel('Counts', color='tab:orange')
val_line_counts = ax2_bottom.plot(steps, val_count_arr, label='Val Counts', color='tab:orange', linewidth=2, linestyle='solid')[0]
test_line_counts = ax2_bottom.plot(steps, test_count_arr, label='Test Counts', color='tab:orange', linewidth=2, linestyle='dotted')[0]
ax2_bottom.tick_params(axis='y', labelcolor='tab:orange')

# Combined legend (money & counts). Money legend entries do not include $check_ticker text.
handles_bottom = [val_line_money, test_line_money, val_line_counts, test_line_counts]
labels_bottom = [h.get_label() for h in handles_bottom]
ax_bottom.legend(handles_bottom, labels_bottom, loc='upper center', bbox_to_anchor=(0.5, -0.25), ncol=4, fontsize=9)
ax_bottom.set_title('Money Made (Val/Test) and Counts Over Time')

plt.tight_layout()
plt.show()


In [None]:
agent_dir = "../Agent"

# === Save Agent ===
checkpoint = tf.train.Checkpoint(agent=agent, optimizer=agent._optimizer)
checkpoint_manager = tf.train.CheckpointManager(checkpoint, agent_dir, max_to_keep=1)
checkpoint_manager.save()

# === Load Agent ===
# checkpoint = tf.train.Checkpoint(agent=agent, optimizer=agent._optimizer)
# checkpoint.restore(tf.train.latest_checkpoint(agent_dir))

# # save policy
# tf_policy_saver = policy_saver.PolicySaver(agent.policy)
# tf_policy_saver.save(agent_dir+" Policy")

In [None]:
""" Look at the Q values alongside the data to see agent decisions """

""" Data Collection """
test_py_env.logger = []
test_py_env.logging_step = test_py_env._L - 1
q_values_over_time = []
file_name = test_py_env._FILE_NAME

time_step = test_py_env.reset()
policy_state = agent.policy.get_initial_state(1)

while not bool(time_step.is_last()):
    test_py_env.logging_step += 1
    
    observation = tf.nest.map_structure(lambda x: np.expand_dims(x,axis= 0), time_step.observation)
    step_type = tf.nest.map_structure(lambda x: np.expand_dims(x, axis = 0), time_step.step_type)
    time_step = time_step._replace(observation = observation, step_type = step_type)
    
    # compute q_values pre action
    q_values,_ = agent._q_network(
        observation,  # add the batch dim
        step_type,   # input T = 0
        training=False
    ) # Out [B, A]
    q_values_over_time.append(tf.squeeze(q_values).numpy())
    
    # take the action
    action_step = agent.policy.action(time_step, policy_state)
    policy_state = action_step.state
    
    # update the time step
    time_step = test_py_env.step(action_step.action)
    
df = pd.DataFrame.from_records(test_py_env.logger, index="step")

# --- Prepare df columns for plotting --

# Prepopulate with holds
df['plot_category'] = 'hold'

# When action changes, show the *new* action
change_mask = df['action'] != df['prev_action']
df.loc[change_mask, 'plot_category'] = df.loc[change_mask, 'action']

# mapping -> color & size for plotting
plot_styles = {
    'short': {'color': 'orange', 'size': 50},
    'noTrade': {'color': 'purple', 'size': 50},
    'long': {'color': 'green', 'size': 50},
    'hold': {'color': 'cyan', 'size': 10},
}

# Total money made
total_reward = df['eval_reward'].sum()

# Convert q_values to numpy array and steps
q_values_array = np.array(q_values_over_time)  # shape: [T, num_actions]
steps = df.index.to_numpy()

# --- Plotting ---
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(14, 10), sharex=True)

# Plot price with colored markers by category (only at relevant steps)
for cat, style in plot_styles.items():
    mask = df['plot_category'] == cat
    ax1.scatter(df.index[mask], df['data'][mask],
                color=style['color'], s=style['size'], label=cat)

ax1.set_ylabel("Price")
ax1.set_title(f"Agent Decisions on {file_name} | Money Made: {total_reward:.5f}")
ax1.legend()

# --- Second plot: Q-values ---
n_actions = q_values_array.shape[1]
for i in range(n_actions):
    color = list(plot_styles.values())[i]['color']  # same color order
    ax2.plot(steps, q_values_array[:, i],
                label=f"Q({i})", alpha=0.7, color=color)

ax2.set_ylabel("Q-Values")
ax2.set_xlabel("Step")
ax2.set_title("Q-Values Over Time")
ax2.legend(loc="upper left")

plt.tight_layout()
plt.show()


In [None]:
# import itertools
# from collections import Counter

# # params
# K = 3
# S = 1
# dilations = (1,2,2,3)

# # build the flat list of dilations, two layers per dilation per stack
# layers = []
# for _ in range(S):
#     for d in dilations:
#         layers.append(d)   # first conv in residual block
#         layers.append(d)   # second conv in residual block

# # for each layer, the kernel can “pick” any of the K positions {0, 1, …, K-1},
# # which corresponds to an offset of (position * dilation)
# offsets_per_layer = [
#     [pos * d for pos in range(K)]
#     for d in layers
# ]

# # compute all path‐delays (sum of one offset per layer)
# all_delays = (
#     sum(path)
#     for path in itertools.product(*offsets_per_layer)
# )

# # tally them up
# hist = Counter(all_delays)

# # convert Counter to a plain dict
# delay_to_paths = dict(hist)

# pprint(delay_to_paths)

# total_count = 0
# max_path_len = max(list(delay_to_paths.keys()))
# heuristic = int(0.8*max_path_len)
# heuristic_count = 0


# for p_len, count in delay_to_paths.items():
#     total_count += count
#     if p_len < heuristic:
#         heuristic_count += count
    
# print(f"Total count: {total_count}")
# print(f"Heuristic count: {heuristic_count}")
# print(f"Percent of all paths close by heuristic look back period of {heuristic}:  {heuristic_count/total_count*100}")
# print(f"Max Path length {max_path_len}")
# print(f"K:{K}, nb_stacks:{S}, dilations:{dilations}")


In [None]:
# import tensorflow as tf
# import numpy as np
# from tensorflow.keras.optimizers.legacy import Adam
# from tensorflow.keras.optimizers.schedules import PolynomialDecay
# from tf_agents.networks.network import Network
# # Assume CustomNetwork is already defined/imported in your environment

# # --- 1. Generate fake data ---
# # State dimension: Look-back L = 28, feature dim F = 8
# # Action dimension A = 4
# X = np.random.randn(1000, 22, 8).astype(np.float32)
# y = np.random.randn(1000, 4).astype(np.float32)

# # --- 2. Build the network ---
# # Example hyperparameters
# fc_units = [256, 128]       # will be ignored since we freeze heads only
# regularization = 1e-4

# network = CustomNetwork(
#     observation_spec= train_py_env.observation_spec(),  # Replace with your actual spec if needed
#     action_spec=train_py_env.action_spec(),       # Replace with your actual spec if needed
#     fc_layer_params=fc_units,
#     regularization=regularization
# )

# # Unfreeze all layers so gradients flow through both TCN and heads
# for layer in network.layers:
#     layer.trainable = True
#     if layer.name in {"Value_Head", "Advantage_Head"}: layer.trainable =False

# # --- 3. Prepare dataset ---
# dataset = tf.data.Dataset.from_tensor_slices((X, y)).batch(32)

# # --- 4. Optimizer and loss ---
# # Learning rate schedule (example)
# init_lr = 1e-4
# num_decay_steps = 1000
# final_lr = 1e-5
# lr_schedule = PolynomialDecay(init_lr, num_decay_steps, final_lr, power=1, cycle=False)
# optimizer = Adam(learning_rate=lr_schedule)
# mse = tf.keras.losses.MeanSquaredError()

# # --- 5. Training step function ---
# @tf.function
# def train_step(x_batch, y_batch):
#     with tf.GradientTape() as tape:
#         q_pred, _ = network(x_batch, training=True)  # [B, A]
#         loss = mse(y_batch, q_pred)
#     grads = tape.gradient(loss, network.trainable_variables)
#     optimizer.apply_gradients(zip(grads, network.trainable_variables))
#     return loss

# # --- 6. Training loop ---
# for epoch in range(100):
#     epoch_loss = 0.0
#     for xb, yb in dataset:
#         batch_loss = train_step(xb, yb)
#         epoch_loss += batch_loss
#     print(f"Epoch {epoch+1}, Loss: {epoch_loss.numpy()/len(dataset):.4f}")


In [None]:
import per_buffer

print(per_buffer.test())  # Should print: "Hello from C++"
