In [1]:
from anyio import sleep
%load_ext autoreload
%autoreload 2

# Lander

Initially I had multiple test runs and notebooks that were used to understand the domain of the RL problem. After the first run I found that the agent for PushT doesn't train, so I decided to test my algorithms on an easier environment from Gymnasium: LunarLander with a continuous action space.

Luckily, the training algorithm I have implemented (DDPG) doesn't require any adjustments to be applied to other environments.

In [32]:
from AIA.rl.lander.models import Agent
import numpy as np

import gymnasium as gym

# Seed NumPy's global RNG before anything stochastic is constructed, so that
# NumPy-based randomness used while building the agent is reproducible from a
# fresh kernel.
# NOTE(review): this does not seed torch or the environment — confirm whether
# Agent relies on either and seed them too if full reproducibility is needed.
np.random.seed(0)

# Initialise the environment (continuous-action LunarLander).
env = gym.make("LunarLander-v3", render_mode="rgb_array", continuous=True)

# Agent sized for this environment: 8 observation values in, 2 actions out.
agent = Agent(alpha=1e-3, beta=1e-3, input_dims=8, tau=0.001,
              batch_size=64, n_actions=2, noise=0.15, expert_data=None)

#agent.load_models()

# Per-episode scores, appended to by the training loop.
score_history = []

In [33]:
# Train on LunarLander for 1000 episodes, recording each episode's total reward.
for i in range(1000):
    obs, _ = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        new_state, reward, terminated, truncated, _ = env.step(act)
        # The episode is over on either signal; without checking `truncated`
        # the loop would keep stepping an environment that needs a reset.
        done = terminated or truncated
        # Only a true terminal state is stored as the terminal flag, so a
        # time-limit truncation is not treated as a terminal state.
        agent.remember(obs, act, reward, new_state, int(terminated))
        agent.learn()
        score += reward
        obs = new_state
        #env.render()
    score_history.append(score)

    print('episode ', i, 'score %.2f' % score,
          'trailing 100 games avg %.3f' % np.mean(score_history[-100:]))

episode  0 score -1646.10 trailing 100 games avg -1646.104
episode  1 score -391.73 trailing 100 games avg -1018.918
episode  2 score -511.62 trailing 100 games avg -849.819
episode  3 score -447.63 trailing 100 games avg -749.272
episode  4 score -296.15 trailing 100 games avg -658.648
episode  5 score -197.84 trailing 100 games avg -581.847
episode  6 score -140.84 trailing 100 games avg -518.846
episode  7 score -228.67 trailing 100 games avg -482.574
episode  8 score -279.17 trailing 100 games avg -459.973
episode  9 score -616.69 trailing 100 games avg -475.645
episode  10 score -402.03 trailing 100 games avg -468.952
episode  11 score -733.63 trailing 100 games avg -491.009
episode  12 score -642.54 trailing 100 games avg -502.665
episode  13 score -372.00 trailing 100 games avg -493.332
episode  14 score -564.37 trailing 100 games avg -498.067
episode  15 score -504.14 trailing 100 games avg -498.447
episode  16 score -474.05 trailing 100 games avg -497.012
episode  17 score -47

KeyboardInterrupt: 

In [None]:
from AIA.rl.lander.plot import plotLearning

# Plot the per-episode scores collected by the training loop.
# NOTE(review): `window=100` is presumably the running-average width — confirm
# against plotLearning.
plotLearning(score_history, window=100)

# Push T test 1

In [None]:
from AIA.rl.lander.plot import plotLearning
from AIA.rl.lander.models import Agent
import numpy as np

import gymnasium as gym
from AIA.rl.lander.envs.pusht import PushTEnv


# Build the Push-T environment and an agent sized for it
# (5 observation values in, 2 actions out).
env = PushTEnv(render_mode="rgb_array")

agent = Agent(
    alpha=1e-3,
    beta=1e-3,
    noise=25,
    input_dims=5,
    tau=0.001,
    batch_size=64,
    n_actions=2,
)

In [None]:
#agent.load_models()
# Seed NumPy's global RNG for the training run below.
# NOTE(review): the agent is constructed in the previous cell, before this
# seed is applied — confirm whether its construction draws from np.random.
np.random.seed(0)

# Per-episode scores, appended to by the training loop.
score_history = []

In [None]:
# Train on Push-T for 10000 episodes of at most 400 steps each, using a shaped
# reward: the change in environment reward since the previous step plus a
# small bonus for moving the T block.
for i in range(10000):
    obs, _ = env.reset()
    score = 0
    prev_reward = 0
    for t in range(400):
        act = agent.choose_action(obs)
        # Gymnasium-style step(): (obs, reward, terminated, truncated, info),
        # matching how PushTEnv.step is unpacked elsewhere in this notebook.
        new_state, reward, terminated, truncated, _ = env.step(act)
        done = terminated or truncated

        # Distance the T block moved during this step, using both coordinates.
        # Assumes the state is [agent_x, agent_y, block_x, block_y, block_angle]
        # — TODO confirm against PushTEnv.
        prev_t_pos = obs[2:4]
        new_t_pos = new_state[2:4]
        d_move = np.sqrt(np.sum((prev_t_pos - new_t_pos)**2))
        d_move_reward = d_move / 1000

        # Keep the raw environment reward so the next step can difference it.
        buffer_reward = reward

        reward -= prev_reward
        reward += d_move_reward

        prev_reward = buffer_reward

        # Store only true termination as the terminal flag.
        agent.remember(obs, act, reward, new_state, int(terminated))
        agent.learn()
        score += reward
        obs = new_state
        #env.render()

        # Stop stepping once the episode is over; the environment must be
        # reset before it is stepped again.
        if done:
            break
    score_history.append(score)

    #if i % 25 == 0:
    #    agent.save_models()

    print(f'episode , {i} score {(score):3f} trailing 100 games avg {np.mean(score_history[-100:]):3f}' )


In [None]:
# Close the Push-T environment used by the training run above.
env.close()

# Push T test 2

In [None]:
import torch
from torch import optim, nn
import pickle
from AIA.rl.lander.plot import plotLearning
from AIA.rl.lander.models import Agent
import numpy as np

import gymnasium as gym


In [None]:
# PushTEnv is the class imported in the "Push T test 1" section of this notebook.
env = PushTEnv(obs_type="state", render_mode="rgb_array")

# Network sizes are read from the environment's spaces rather than hard-coded.
input_dim, n_actions = env.observation_space.shape[0], env.action_space.shape[0]

# Expert demonstrations, saved as a list of (s, a, r, s2, d) transitions.
# CAUTION: pickle.load can execute arbitrary code — only open files you trust.
with open("demonstrations.pkl", "rb") as f:
    expert_transitions = pickle.load(f)

# Agent that is also handed the expert transitions (expert_ratio=0.25).
agent = Agent(
    alpha=2e-4,
    beta=2e-4,
    input_dims=input_dim,
    n_actions=n_actions,
    tau=0.001,
    gamma=0.99,
    max_size=1_000_000,
    batch_size=256,
    noise=0.2,
    expert_data=expert_transitions,
    expert_ratio=0.25,
)

## Pre-Train from expert

In [None]:
def behaviour_clone(actor, demo, epochs=10, lr=1e-3, batch_size=256):
    """
    Fit the actor to expert behaviour by regressing its output onto the
    demonstrated actions (MSE loss, shuffled mini-batches, Adam optimiser).

    Args:
        actor: network called as ``actor(states)``; must expose ``.device``
            and ``.parameters()``.
        demo: sequence of transitions whose first two entries are the state
            and the expert action; any further entries are ignored here.
        epochs: number of full passes over ``demo``.
        lr: learning rate handed to Adam.
        batch_size: number of transitions per gradient step.

    Returns:
        None. The actor is updated in place and left in eval mode.
    """
    dev = actor.device
    optimizer = optim.Adam(actor.parameters(), lr=lr)
    criterion = nn.MSELoss()
    n_samples = len(demo)

    # Build the full state / action tensors once, up front.
    expert_states = torch.tensor([step[0] for step in demo],
                                 dtype=torch.float32, device=dev)
    expert_actions = torch.tensor([step[1] for step in demo],
                                  dtype=torch.float32, device=dev)

    # Progress is reported roughly ten times over the whole run.
    report_every = max(1, epochs // 10)

    actor.train()
    for epoch in range(1, epochs + 1):
        # Fresh shuffle of the sample indices for every epoch.
        order = torch.randperm(n_samples, device=dev)
        weighted_loss = 0.0

        for start in range(0, n_samples, batch_size):
            batch_idx = order[start:start + batch_size]

            predicted = actor(expert_states[batch_idx])
            batch_loss = criterion(predicted, expert_actions[batch_idx])

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch figure is a per-sample mean.
            weighted_loss += batch_loss.item() * batch_idx.shape[0]

        avg_loss = weighted_loss / n_samples
        if epoch % report_every == 0:
            print(f"[BC] Epoch {epoch}/{epochs}, avg loss={avg_loss:.6f}")

    actor.eval()


def pre_train_critic(critic, demo, gamma=0.99, epochs=10, batch_size=256, lr=1e-3):
    """
    Monte-Carlo pre-training of the critic on expert episodes.

    Each transition's regression target is the discounted return from that
    step to the end of its episode; the critic is fitted to those targets
    with an MSE loss over shuffled mini-batches.

    Args:
        critic: network called as ``critic(states, actions)``; must expose
            ``.device`` and ``.parameters()``.
            NOTE(review): its output is assumed to have shape (batch, 1) to
            match the targets — confirm against the critic definition.
        demo: list of (state, action, reward, next_state, done) tuples in
            sequence order; consecutive episodes are separated by ``done``.
        gamma: discount factor used for the returns.
        epochs: number of full passes over ``demo``.
        batch_size: number of transitions per gradient step.
        lr: learning rate handed to Adam.

    Returns:
        None. The critic is updated in place and left in eval mode.

    Raises:
        ValueError: if ``demo`` is empty.
    """
    N = len(demo)
    if N == 0:
        raise ValueError("pre_train_critic needs at least one demonstration transition")

    device = critic.device
    opt    = optim.Adam(critic.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    # 1) compute returns G_t by walking the demonstrations backwards.
    #    The tuple layout is (state, action, reward, next_state, done): the
    #    done flag is the LAST field. Walking backwards, a done flag marks the
    #    final step of an episode, so the running return restarts there.
    returns = []
    G = 0.0
    for (_s, _a, r, _s2, d) in reversed(demo):
        if d:
            G = 0.0
        G = r + gamma * G
        returns.append(G)
    returns = returns[::-1]

    # 2) pre-stack tensors once
    states  = torch.tensor([t[0] for t in demo],
                           dtype=torch.float32, device=device)
    actions = torch.tensor([t[1] for t in demo],
                           dtype=torch.float32, device=device)
    # Targets get a trailing axis of size 1: shape (N, 1).
    targets = torch.tensor(returns,
                           dtype=torch.float32, device=device).unsqueeze(1)

    critic.train()
    for ep in range(1, epochs+1):
        # Fresh shuffle of the sample indices for every epoch.
        perm = torch.randperm(N, device=device)
        epoch_loss = 0.0

        for i in range(0, N, batch_size):
            idx = perm[i:i+batch_size]
            s_batch = states[idx]
            a_batch = actions[idx]
            y_batch = targets[idx]

            q_pred = critic(s_batch, a_batch)
            loss   = loss_fn(q_pred, y_batch)

            opt.zero_grad()
            loss.backward()
            opt.step()

            # Weight by batch size so the epoch figure is a per-sample mean.
            epoch_loss += loss.item() * idx.size(0)

        avg_loss = epoch_loss / N
        print(f"[Critic‑MC] Epoch {ep}/{epochs}, avg loss={avg_loss:.6f}")

    critic.eval()


In [None]:
# 1) Pre-train the critic on the expert transitions: 200 epochs over
#    mini-batches of 256 with a learning rate of 1e-4.
pre_train_critic(agent.critic,
                 demo=expert_transitions,
                 gamma=0.99,
                 epochs=200,
                 batch_size=256,
                 lr=1e-4)

In [None]:
# 2) Pre‑train actor
behaviour_clone(agent.actor,
                demo=expert_transitions,
                epochs=2000,
                lr=1e-3)

## Test Pre-train results

In [None]:
import time

# Roll the pre-trained policy out for one episode (at most 200 steps) in a
# "human"-mode environment so the result can be watched.
env =  PushTEnv(obs_type="state", render_mode="human")

# try/finally guarantees the environment is closed even if the rollout raises
# or the cell is interrupted part-way through.
try:
    obs, _ = env.reset()

    for i in range(200):
        # eval=True is passed through to the agent for this evaluation rollout.
        action = agent.choose_action(obs, eval=True)

        nxt, reward, terminated, truncated, _ = env.step(action)

        env.render()

        obs = nxt

        if terminated or truncated:
            break
finally:
    env.close()

## Main Train

In [None]:
# Fresh list of per-episode scores; the main training loop appends to it.
score_history = []

In [None]:
# Environment for the main training run. The active line uses
# render_mode="human"; the commented line is the "rgb_array" alternative.
# env = PushTEnv(obs_type="state", render_mode="rgb_array")
env = PushTEnv(obs_type="state", render_mode="human")

In [None]:
# Main training run: 10000 episodes of at most 200 steps each, using a shaped
# reward: the change in environment reward since the previous step plus a
# small bonus for moving the T block.
for ep in range(1, 10001):
    obs, _ = env.reset()
    score = 0.0
    prev_reward = 0.0
    for i in range(200):
        env.render()
        action = agent.choose_action(obs)
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Distance the T block moved during this step, using both coordinates.
        # Assumes the state is [agent_x, agent_y, block_x, block_y, block_angle]
        # — TODO confirm against PushTEnv.
        prev_t_pos = obs[2:4]
        new_t_pos = new_state[2:4]
        d_move = np.sqrt(np.sum((prev_t_pos - new_t_pos)**2))
        d_move_reward = d_move / 1000

        # Keep the raw environment reward so the next step can difference it.
        buffer_reward = reward

        reward -= prev_reward
        reward += d_move_reward

        prev_reward = buffer_reward

        # Store only true termination as the terminal flag: a truncated
        # episode is cut short rather than finished, so it should not be
        # recorded as a terminal state.
        agent.memory.store_transition(obs, action, reward, new_state, terminated)
        agent.learn()

        obs = new_state
        score += reward

        # Either signal ends the episode and requires a reset.
        if done:
            break

    score_history.append(score)
    avg = np.mean(score_history[-100:])
    print(f"Episode {ep:5d}  Score: {score:.2f}  100‑ep avg: {avg:.2f}")

In [None]:
# Close the environment used by the main training run.
env.close()