In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from AIA.rl.lander.models import Agent
import numpy as np

import gymnasium as gym

# Initialise the environment
env = gym.make("LunarLander-v3", render_mode="rgb_array", continuous=True)

agent = Agent(alpha=0.000025, beta=0.00025, input_dims=8, tau=0.001,
              batch_size=64, n_actions=2)

#agent.load_models()
np.random.seed(0)

score_history = []

In [None]:
for i in range(1000):
    obs, _ = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info, _ = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        #env.render()
    score_history.append(score)

    #if i % 25 == 0:
    #    agent.save_models()

    print('episode ', i, 'score %.2f' % score,
          'trailing 100 games avg %.3f' % np.mean(score_history[-100:]))

In [None]:
from AIA.rl.lander.plot import plotLearning

plotLearning(score_history, window=100)

In [None]:
from AIA.rl.lander.plot import plotLearning
from AIA.rl.lander.models import Agent
import numpy as np

import gymnasium as gym
import gym_pusht


# Initialise the environment
env = gym.make("gym_pusht/PushT-v0", render_mode="rgb_array")

agent = Agent(alpha=1e-3, beta=1e-3, noise=25, input_dims=5, tau=0.001,
              batch_size=64, n_actions=2)

In [None]:
#agent.load_models()
np.random.seed(0)

score_history = []

In [None]:
for i in range(10000):
    obs, _ = env.reset()
    done = False
    score = 0
    prev_reward = 0
    for t in range(400):
        act = agent.choose_action(obs)
        new_state, reward, done, info, _ = env.step(act)

        prev_t_pos = obs[2:3]
        new_t_pos = new_state[2:3]

        d_move = np.sqrt(np.sum((prev_t_pos - new_t_pos)**2))

        d_move_reward = d_move / 1000

        buffer_reward = reward

        reward -= prev_reward
        reward += d_move_reward

        prev_reward = buffer_reward

        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        #env.render()
    score_history.append(score)

    #if i % 25 == 0:
    #    agent.save_models()

    print(f'episode , {i} score {(score):3f} trailing 100 games avg {np.mean(score_history[-100:]):3f}' )


In [None]:
env.close()

In [2]:
import torch
from torch import optim, nn
import pickle
from AIA.rl.lander.plot import plotLearning
from AIA.rl.lander.models import Agent
import numpy as np

import gymnasium as gym
import gym_pusht


def behaviour_clone(actor, demo, epochs=10, lr=1e-3):
    opt = optim.Adam(actor.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    actor.train()
    for _ in range(epochs):
        s = torch.tensor([t[0] for t in demo], dtype=torch.float32, device=actor.device)
        a = torch.tensor([t[1] for t in demo], dtype=torch.float32, device=actor.device)
        pred = actor(s)
        loss = loss_fn(pred, a)
        opt.zero_grad()
        loss.backward()
        opt.step()
        print(f"Epoch {_}, loss {loss}")

env = gym.make("gym_pusht/PushT-v0", obs_type="state", render_mode="rgb_array")
input_dim = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

    # load expert demonstrations saved as list of (s,a,r,s2,d)
with open("../gym-pusht/demonstrations.pkl", "rb") as f:
    expert_transitions = pickle.load(f)

agent = Agent(
        alpha=2e-4, beta=2e-3,
        input_dims=input_dim, n_actions=n_actions,
        tau=0.001, gamma=0.99,
        max_size=1_000_000, batch_size=256, noise=0.2,
        expert_data=expert_transitions,
        expert_ratio=0.25
)

# optional warm‑start actor via BC
behaviour_clone(agent.actor, expert_transitions, epochs=50, lr=1e-3)

  s = torch.tensor([t[0] for t in demo], dtype=torch.float32, device=actor.device)


Epoch 0, loss 4717.61962890625
Epoch 1, loss 37249.19921875
Epoch 2, loss 25676.693359375
Epoch 3, loss 5045.05419921875
Epoch 4, loss 9538.10546875
Epoch 5, loss 13679.365234375
Epoch 6, loss 11374.0322265625
Epoch 7, loss 5395.44677734375
Epoch 8, loss 1612.08154296875
Epoch 9, loss 2969.68798828125
Epoch 10, loss 5416.2666015625
Epoch 11, loss 5601.7490234375
Epoch 12, loss 3847.099609375
Epoch 13, loss 1959.3515625
Epoch 14, loss 1429.8736572265625
Epoch 15, loss 2159.477783203125
Epoch 16, loss 3001.964599609375
Epoch 17, loss 3190.636962890625
Epoch 18, loss 2713.78466796875
Epoch 19, loss 1979.6429443359375
Epoch 20, loss 1455.57470703125
Epoch 21, loss 1416.6175537109375
Epoch 22, loss 1750.7119140625
Epoch 23, loss 2079.15283203125
Epoch 24, loss 2118.32373046875
Epoch 25, loss 1865.2940673828125
Epoch 26, loss 1531.704345703125
Epoch 27, loss 1338.4381103515625
Epoch 28, loss 1385.2415771484375
Epoch 29, loss 1549.8477783203125
Epoch 30, loss 1643.4124755859375
Epoch 31, loss

In [None]:
score_history = []
for ep in range(1, 10001):
    obs, _ = env.reset()
    done = False
    score = 0.0
    while not done:
        action = agent.choose_action(obs)
        nxt, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # store live experience
        agent.memory.store_transition(obs, action, reward, nxt, done)
        # train with expert+live mixture
        agent.learn()

        obs = nxt
        score += reward

    score_history.append(score)
    if ep % 1 == 0:
        avg = np.mean(score_history[-100:])
        print(f"Episode {ep:5d}  Score: {score:.2f}  100‑ep avg: {avg:.2f}")

Episode     1  Score: 0.00  100‑ep avg: 0.00
Episode     2  Score: 0.00  100‑ep avg: 0.00
Episode     3  Score: 0.00  100‑ep avg: 0.00
Episode     4  Score: 0.00  100‑ep avg: 0.00
Episode     5  Score: 96.37  100‑ep avg: 19.27
Episode     6  Score: 0.00  100‑ep avg: 16.06
Episode     7  Score: 0.00  100‑ep avg: 13.77
Episode     8  Score: 0.57  100‑ep avg: 12.12
Episode     9  Score: 0.00  100‑ep avg: 10.77
Episode    10  Score: 2.86  100‑ep avg: 9.98
Episode    11  Score: 0.00  100‑ep avg: 9.07
Episode    12  Score: 0.00  100‑ep avg: 8.32
Episode    13  Score: 54.22  100‑ep avg: 11.85
Episode    14  Score: 0.00  100‑ep avg: 11.00
Episode    15  Score: 0.00  100‑ep avg: 10.27
Episode    16  Score: 0.00  100‑ep avg: 9.63
Episode    17  Score: 0.00  100‑ep avg: 9.06
Episode    18  Score: 0.00  100‑ep avg: 8.56
Episode    19  Score: 0.00  100‑ep avg: 8.11
Episode    20  Score: 0.00  100‑ep avg: 7.70
Episode    21  Score: 0.00  100‑ep avg: 7.33
Episode    22  Score: 0.00  100‑ep avg: 7.00
