In [None]:
import os
import sys
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to all subsequent plots.
sns.set_theme()

In [None]:
# Put the parent of the current working directory on the import path so the
# `src.*` packages resolve when the notebook is run from a subdirectory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
from src.strategy.model import Model
from src.strategy.environment import Environment
from src.strategy.agent import Agent
from src.strategy.buffer import Buffer
from src.utils import get_config, read_file
# Load the project configuration. Later cells index it as a nested mapping
# (e.g. config['paths'], config['hyperparameters']).
config = get_config.read_yaml()

In [None]:
print("Starting Training...")
# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# --- Locations, capital and traded symbols, read from the project config ---
MODEL_PATH = config['paths']['model_directory']
CAPITAL = config['strategy']['capital']
SYMBOLS = config['data']['symbols']

# --- Model / PPO hyperparameters, all read from config['hyperparameters'] ---
hp = config['hyperparameters']
NUM_ASSETS = hp['num_assets']
INPUT_DIM = hp['input_dim']
ACTION_DIM = hp['action_dim']
NUM_LSTM_LAYERS = hp['num_lstm_layers']
HIDDEN_STATE_DIM = hp['hidden_state_dim']
ACTOR_HIDDEN_DIM = hp['actor_hidden_dim']
CRITIC_HIDDEN_DIM = hp['critic_hidden_dim']
GAMMA = hp['gamma']
GAE_LAMBDA = hp['gae_lambda']
CLIP_EPSILON = hp['clip_epsilon']
VALUE_LOSS_COEF = hp['value_loss_coef']
ENTROPY_LOSS_COEF = hp['entropy_loss_coef']
LEARNING_RATE = hp['learning_rate']
NUM_EPOCHS = hp['num_epochs']
ROLLOUT_SIZE = hp['rollout_size']
MINI_BATCH_SIZE = hp['mini_batch_size']
SEQUENCE_LENGTH = hp['seq_len']

# --- Training data ---
# Two variants of the merged training data are loaded. The second call passes
# False; judging by the variable names this presumably disables normalization
# -- confirm against read_file.read_merged_training_data.
train_data_norm = read_file.read_merged_training_data()
train_data_unnorm = read_file.read_merged_training_data(False)

# Echo every resolved setting. Labels match the constant names exactly so the
# printed log can be grepped back to this cell (the mini-batch size used to be
# printed under the misleading label "BATCH_SIZE").
_settings = {
    'MODEL_PATH': MODEL_PATH,
    'CAPITAL': CAPITAL,
    'SYMBOLS': SYMBOLS,
    'NUM_ASSETS': NUM_ASSETS,
    'INPUT_DIM': INPUT_DIM,
    'ACTION_DIM': ACTION_DIM,
    'NUM_LSTM_LAYERS': NUM_LSTM_LAYERS,
    'HIDDEN_STATE_DIM': HIDDEN_STATE_DIM,
    'ACTOR_HIDDEN_DIM': ACTOR_HIDDEN_DIM,
    'CRITIC_HIDDEN_DIM': CRITIC_HIDDEN_DIM,
    'GAMMA': GAMMA,
    'GAE_LAMBDA': GAE_LAMBDA,
    'CLIP_EPSILON': CLIP_EPSILON,
    'VALUE_LOSS_COEF': VALUE_LOSS_COEF,
    'ENTROPY_LOSS_COEF': ENTROPY_LOSS_COEF,
    'LEARNING_RATE': LEARNING_RATE,
    'NUM_EPOCHS': NUM_EPOCHS,
    'ROLLOUT_SIZE': ROLLOUT_SIZE,
    'MINI_BATCH_SIZE': MINI_BATCH_SIZE,
    'SEQUENCE_LENGTH': SEQUENCE_LENGTH,
}
for _name, _value in _settings.items():
    print(f'{_name}: {_value}')

# Preview only the first rows instead of dumping the whole frames into the
# notebook output. The data is indexed with .iloc elsewhere in this notebook,
# so it is assumed to be a pandas object that supports .head().
print(train_data_norm.head())
print(train_data_unnorm.head())

In [None]:
# Build the network from the configured dimensions. The arguments are
# positional, so their order must match Model's constructor in
# src.strategy.model (not visible here -- verify before reordering).
model = Model(INPUT_DIM,
              HIDDEN_STATE_DIM,
              NUM_ASSETS,
              NUM_LSTM_LAYERS,
              ACTOR_HIDDEN_DIM,
              CRITIC_HIDDEN_DIM,)
# Bare expression: shows the model's repr as the cell output.
model

In [None]:
# Wrap the model in the training agent, passing the PPO/GAE hyperparameters,
# the target device and the model directory. Arguments are positional, so
# their order must match Agent's constructor in src.strategy.agent (not
# visible here -- verify before reordering).
agent = Agent(model,
              GAMMA,
              GAE_LAMBDA,
              CLIP_EPSILON,
              VALUE_LOSS_COEF,
              ENTROPY_LOSS_COEF,
              LEARNING_RATE,
              MINI_BATCH_SIZE,
              device,
              MODEL_PATH)
# Bare expression: shows the agent's repr as the cell output.
agent

In [None]:
# Create the environment. Note that it receives train_data_unnorm, whereas the
# buffer state stored further down is taken from train_data_norm.
# Arguments are positional; their order must match Environment's constructor
# in src.strategy.environment (not visible here -- verify before reordering).
env = Environment(train_data_unnorm,
                  SEQUENCE_LENGTH,
                  NUM_ASSETS,
                  SYMBOLS,
                  CAPITAL)
# Bare expression: shows the environment's repr as the cell output.
env

In [None]:
# Reset the environment before the first step; whatever reset() returns is
# shown as the cell output.
env.reset()

In [None]:
# Allocate the rollout buffer from the configured sizes and the target device.
# Arguments are positional; their order must match Buffer's constructor in
# src.strategy.buffer (not visible here -- verify before reordering).
buffer = Buffer(ROLLOUT_SIZE,
                SEQUENCE_LENGTH,
                INPUT_DIM,
                ACTION_DIM,
                device)
# Inspect the freshly created buffer via its own display() helper.
buffer.display()

In [None]:
# Preview the first SEQUENCE_LENGTH rows of the normalized data -- the same
# slice that is stored into the buffer in the next cell.
train_data_norm.iloc[0:SEQUENCE_LENGTH]

In [None]:
# Store the first SEQUENCE_LENGTH rows of normalized data (as a raw array via
# .values) in the buffer as a state.
buffer.store_state(train_data_norm.iloc[0:SEQUENCE_LENGTH].values)
# Show the buffer's stored states to confirm the write.
buffer.states

In [None]:
# Smoke-test the rollout path for five steps.
for step in tqdm(range(5)):
    # The agent receives the buffer and returns it; presumably it has recorded
    # an action and a value estimate for the current step -- confirm in
    # Agent.get_action_and_value.
    buffer = agent.get_action_and_value(buffer)
    # Step the environment with the action at index current_step_action - 1,
    # presumably the action written just above -- TODO confirm the counter
    # semantics in Buffer.
    buffer = env.step(buffer.actions[buffer.current_step_action - 1], buffer)
# Debugging aids: uncomment to inspect the shapes of the collected tensors.
# print(buffer.rewards.shape)
# print(buffer.values.shape)
# print(buffer.dones.shape)
# Compute advantages, bootstrapping from a next value of 0 with next_done = 0.
# NOTE(review): both tensors are created on the CPU while `device` may be CUDA
# -- confirm that Agent.compute_gae moves them to the buffer's device.
buffer = agent.compute_gae(buffer=buffer, next_value=torch.tensor([0.0]), next_done=torch.tensor([0.0]))
# Show the actions collected during the rollout.
buffer.actions

In [None]:
# Show the advantages held by the buffer after the compute_gae call above.
buffer.advantages

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# ============================================================
# 1. Simple 1D Line World Environment with n "rows"
# ============================================================

class LineWorldEnv:
    """
    A 1D world with positions 0, 1, ..., n_rows-1.

    The agent starts at position n_rows // 2. Actions: 0=left, 1=right.
    Reward: +1 on reaching the right end, -1 on reaching the left end,
    0 otherwise. An episode ends when an end is reached or when max_steps
    steps have been taken.

    Args:
        n_rows: number of positions on the line; must be at least 2.
        max_steps: maximum number of steps per episode.

    Raises:
        ValueError: if n_rows is smaller than 2.
    """
    def __init__(self, n_rows=7, max_steps=20):
        # The observation scaling in _get_obs divides by (n_rows - 1); reject a
        # single-cell world here instead of failing later with ZeroDivisionError.
        if n_rows < 2:
            raise ValueError(f"n_rows must be at least 2, got {n_rows}")
        self.n_rows = n_rows
        self.max_steps = max_steps
        self.pos = None    # current position; set by reset()
        self.steps = None  # steps taken in the current episode; set by reset()

    def reset(self):
        """Start a new episode in the middle of the line and return the first observation."""
        self.pos = self.n_rows // 2
        self.steps = 0
        return self._get_obs()

    def _get_obs(self):
        """Encode the current position as a float32 array of shape (1,) scaled to [-1, 1]."""
        # position / (n_rows-1) -> [0,1]; then scale to [-1,1]
        x = self.pos / (self.n_rows - 1)
        x = 2.0 * x - 1.0
        return np.array([x], dtype=np.float32)

    def step(self, action):
        """
        Move one cell and return (obs, reward, done, info).

        Args:
            action: 0 = left, 1 = right.

        Returns:
            obs: observation after the move (see _get_obs).
            reward: -1.0 at the left end, +1.0 at the right end, else 0.0.
            done: True when an end was reached or max_steps was hit.
            info: always an empty dict.

        Raises:
            RuntimeError: if called before reset().
            ValueError: if action is neither 0 nor 1.
        """
        if self.pos is None:
            raise RuntimeError("Call reset() before step()")

        if action == 0:
            self.pos -= 1
        elif action == 1:
            self.pos += 1
        else:
            raise ValueError("Invalid action")

        self.steps += 1

        done = False
        reward = 0.0

        # Reaching either end terminates the episode; the position is clamped
        # so the observation stays inside [-1, 1]. Running out of steps also
        # ends the episode, with zero reward.
        if self.pos <= 0:
            self.pos = 0
            reward = -1.0
            done = True
        elif self.pos >= self.n_rows - 1:
            self.pos = self.n_rows - 1
            reward = 1.0
            done = True
        elif self.steps >= self.max_steps:
            done = True

        return self._get_obs(), reward, done, {}

    @property
    def obs_dim(self):
        """Size of the observation vector (a single scalar)."""
        return 1

    @property
    def act_dim(self):
        """Number of discrete actions (left or right)."""
        return 2


# ============================================================
# 2. Actor-Critic Network (Policy πθ and Value Vφ)
# ============================================================

class ActorCritic(nn.Module):
    """
    Policy and value network with a common trunk.

    A stack of Linear+Tanh layers feeds two linear heads: one emitting
    action logits for a Categorical policy, one emitting a scalar state value.
    """
    def __init__(self, obs_dim, act_dim, hidden_sizes=(64, 64)):
        super().__init__()

        # Layer widths from input through the last hidden layer.
        widths = [obs_dim, *hidden_sizes]
        trunk = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            trunk += [nn.Linear(fan_in, fan_out), nn.Tanh()]
        self.shared = nn.Sequential(*trunk)

        feature_dim = widths[-1]
        # Heads are created policy-first, then value.
        self.policy_head = nn.Linear(feature_dim, act_dim)
        self.value_head = nn.Linear(feature_dim, 1)

    def forward(self, obs):
        """
        obs: tensor [batch_size, obs_dim]
        Returns:
            logits: [batch_size, act_dim], unnormalized action scores
            value:  [batch_size], state-value estimates
        """
        features = self.shared(obs)
        return self.policy_head(features), self.value_head(features).squeeze(-1)

    def get_action_and_value(self, obs, action=None):
        """
        Evaluate the policy and the value function on a batch of observations.

        obs: tensor [batch_size, obs_dim]
        action (optional): tensor [batch_size]; when omitted an action is
            sampled from the policy.
        Returns:
            action: the given action, or the sampled one
            logprob: log-probability of that action under the policy
            entropy: entropy of the policy distribution
            value: state-value estimate
        """
        logits, value = self.forward(obs)
        policy = torch.distributions.Categorical(logits=logits)
        chosen = policy.sample() if action is None else action
        return chosen, policy.log_prob(chosen), policy.entropy(), value


# ============================================================
# 3. GAE-Lambda Advantage and Return Computation
# ============================================================

def compute_gae(rewards, values, dones, gamma, lam):
    """
    Compute GAE(lambda) advantages and the matching value targets.

    Args:
        rewards: [T] tensor of per-step rewards.
        values:  [T+1] tensor of value estimates (the extra last entry is the
                 bootstrap value for the state after the final step).
        dones:   [T] tensor of 0/1 flags; 1 means the episode ended at step t.
        gamma:   discount factor.
        lam:     GAE lambda.

    Returns:
        advantages [T]
        returns    [T] = advantages + values[:-1]
        Both are float32 tensors allocated on the same device as `values`.

    Raises:
        ValueError: if `values` does not have exactly one more entry than
            `rewards`.
    """
    T = len(rewards)
    if len(values) != T + 1:
        raise ValueError(
            f"values must have length T+1 = {T + 1}, got {len(values)}"
        )

    # Allocate next to the inputs. A CPU buffer here would make the final
    # `advantages + values[:-1]` fail whenever the rollout tensors live on
    # another device (e.g. CUDA).
    advantages = torch.zeros(T, dtype=torch.float32, device=values.device)
    gae = 0.0

    # Work backwards from last step to first
    for t in reversed(range(T)):
        # A done flag cuts both the bootstrap and the accumulated advantage.
        not_done = 1 - dones[t]
        # δ_t = r_t + γ (1-d_t) V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * not_done * values[t + 1] - values[t]
        # A_t = δ_t + γ λ (1-d_t) A_{t+1}
        gae = delta + gamma * lam * not_done * gae
        advantages[t] = gae

    returns = advantages + values[:-1]
    return advantages, returns


# ============================================================
# 4. Main PPO Training Loop
# ============================================================

def _greedy_episode_return(model, env, device):
    """Run one episode picking the argmax action at every step; return its total reward."""
    with torch.no_grad():
        obs = torch.tensor(env.reset(), dtype=torch.float32, device=device)
        done = False
        ep_return = 0.0
        while not done:
            logits, _ = model.forward(obs.unsqueeze(0))
            action = torch.argmax(logits, dim=-1).item()
            next_obs, reward, done, _ = env.step(action)
            ep_return += reward
            obs = torch.tensor(next_obs, dtype=torch.float32, device=device)
    return ep_return


def ppo_train(
    n_rows=7,
    total_iterations=500,
    steps_per_iter=256,
    gamma=0.99,
    lam=0.95,
    clip_eps=0.2,
    lr=3e-4,
    train_epochs=4,
    minibatch_size=64,
    value_coef=0.5,
    entropy_coef=0.01,
    device="cpu"
):
    """
    Train an ActorCritic on LineWorldEnv with clipped-surrogate PPO.

    Each iteration collects `steps_per_iter` transitions with the current
    policy (resetting the environment whenever an episode ends), computes
    GAE advantages, and then runs `train_epochs` passes of shuffled
    minibatch updates. Every 50 iterations one greedy episode is run and its
    return printed.

    Args:
        n_rows: number of positions in the line world.
        total_iterations: number of collect-then-update iterations.
        steps_per_iter: transitions collected per iteration.
        gamma: discount factor.
        lam: GAE lambda.
        clip_eps: PPO ratio clipping range.
        lr: Adam learning rate.
        train_epochs: passes over each collected batch.
        minibatch_size: samples per gradient step (the last minibatch of an
            epoch is smaller when steps_per_iter is not a multiple of it).
        value_coef: weight of the value loss.
        entropy_coef: weight of the entropy bonus.
        device: torch device for the model and batch tensors.

    Returns:
        The trained ActorCritic model, so callers can evaluate or save the
        policy instead of losing it when the function exits.
    """
    env = LineWorldEnv(n_rows=n_rows)
    obs_dim = env.obs_dim
    act_dim = env.act_dim

    model = ActorCritic(obs_dim, act_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for it in range(total_iterations):
        # ------------------------------------------
        # 4.1 Collect a batch of experience
        # ------------------------------------------
        obs_buf = []
        act_buf = []
        rew_buf = []
        done_buf = []
        logprob_buf = []
        val_buf = []

        obs = torch.tensor(env.reset(), dtype=torch.float32, device=device)

        for step in range(steps_per_iter):
            with torch.no_grad():
                # π_{θ_old}(·|s_t) and Vφ(s_t)
                action, logprob, _, value = model.get_action_and_value(obs.unsqueeze(0))
            action = action.item()
            logprob = logprob.item()
            value = value.item()

            next_obs, reward, done, _ = env.step(action)

            # Store transition
            obs_buf.append(obs.cpu().numpy())
            act_buf.append(action)
            rew_buf.append(reward)
            done_buf.append(float(done))
            logprob_buf.append(logprob)
            val_buf.append(value)

            # Continue from the next state, or from a fresh episode if this
            # one just ended.
            if done:
                next_obs = env.reset()
            obs = torch.tensor(next_obs, dtype=torch.float32, device=device)

        # We need one extra value V(s_T) for bootstrapping. If the final step
        # ended an episode, its done flag masks this value inside compute_gae.
        with torch.no_grad():
            _, _, _, last_value = model.get_action_and_value(obs.unsqueeze(0))
        last_value = last_value.item()

        # Convert buffers to tensors
        obs_tensor = torch.tensor(np.array(obs_buf), dtype=torch.float32, device=device)
        act_tensor = torch.tensor(act_buf, dtype=torch.long, device=device)
        rew_tensor = torch.tensor(rew_buf, dtype=torch.float32, device=device)
        done_tensor = torch.tensor(done_buf, dtype=torch.float32, device=device)
        val_tensor = torch.tensor(val_buf + [last_value], dtype=torch.float32, device=device)
        old_logprob_tensor = torch.tensor(logprob_buf, dtype=torch.float32, device=device)

        # ------------------------------------------
        # 4.2 Compute advantages and returns (GAE)
        # ------------------------------------------
        advantages, returns = compute_gae(
            rewards=rew_tensor,
            values=val_tensor,
            dones=done_tensor,
            gamma=gamma,
            lam=lam,
        )

        # Normalize advantages over the whole batch (returns stay unnormalized
        # because they are the regression target for the value head).
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # ------------------------------------------
        # 4.3 PPO Updates (multiple epochs over batch)
        # ------------------------------------------
        batch_size = steps_per_iter
        indices = np.arange(batch_size)

        for epoch in range(train_epochs):
            np.random.shuffle(indices)

            for start in range(0, batch_size, minibatch_size):
                mb_inds = indices[start:start + minibatch_size]

                mb_obs = obs_tensor[mb_inds]
                mb_act = act_tensor[mb_inds]
                mb_adv = advantages[mb_inds]
                mb_ret = returns[mb_inds]
                mb_logprob_old = old_logprob_tensor[mb_inds]

                # Forward pass under current θ,ϕ
                _, mb_logprob, mb_entropy, mb_value = model.get_action_and_value(mb_obs, mb_act)

                # r_t(θ) = exp(logπθ - logπθ_old)
                ratio = torch.exp(mb_logprob - mb_logprob_old)

                # Clipped surrogate objective:
                # L^{CLIP}(θ) = E[min( r_t A_t , clip(r_t,1-ε,1+ε) A_t )]
                unclipped = ratio * mb_adv
                clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * mb_adv
                policy_loss = -torch.mean(torch.min(unclipped, clipped))

                # Value function loss: (Vφ(s) - Ĝ_t)^2
                value_loss = torch.mean((mb_value - mb_ret) ** 2)

                # Entropy bonus (negated so that minimizing the loss
                # increases entropy => encourages exploration)
                entropy_loss = -torch.mean(mb_entropy)

                # Total loss
                loss = policy_loss + value_coef * value_loss + entropy_coef * entropy_loss

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # ------------------------------------------
        # 4.4 Logging (optional, simple print)
        # ------------------------------------------
        # Rough evaluation: one greedy episode on the training env. The next
        # iteration starts with env.reset(), so reusing the env is safe.
        if (it + 1) % 50 == 0:
            ep_return = _greedy_episode_return(model, env, device)
            print(f"Iteration {it+1}: example episode return = {ep_return:.2f}")

    return model


# Run a full training session with the default hyperparameters when this code
# is executed as the main module.
if __name__ == "__main__":
    ppo_train()

Iteration 50: example episode return = 1.00
Iteration 100: example episode return = 1.00
Iteration 150: example episode return = 1.00
Iteration 200: example episode return = 1.00
Iteration 250: example episode return = 1.00
Iteration 300: example episode return = 1.00
Iteration 350: example episode return = 1.00
Iteration 400: example episode return = 1.00
Iteration 450: example episode return = 1.00
Iteration 500: example episode return = 1.00
