In [1]:
from math import gamma
from time import sleep

import torch

compute_device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
cpu_device = torch.device('cpu')

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
torch.cuda.is_available()

True

# Env

In [4]:
def make_env(render_mode="rgb_array"):
    env = gym.make("gym_pusht/PushT-v0", render_mode=render_mode)
    return env

In [5]:
import gymnasium as gym
import gym_pusht
from time import sleep

env = make_env()

In [6]:
test_env = make_env(render_mode="human")

observation, info = test_env.reset()

for _ in range(50):
    sleep(0.2)
    action = test_env.action_space.sample()
    observation, reward, terminated, truncated, info = test_env.step(action)
    image = test_env.render()

    if terminated or truncated:
        observation, info = test_env.reset()

test_env.close()

Implement the deepq Learning to compete the task

# Part 1: Numerical PushT state

## Observation Space

If obs_type is set to state, the observation space is a 5-dimensional vector representing the state of the environment: [agent_x, agent_y, block_x, block_y, block_angle]. The values are in the range [0, 512] for the agent and block positions and [0, 2*pi] for the block angle.

If obs_type is set to environment_state_agent_pos the observation space is a dictionary with: - environment_state: 16-dimensional vector representing the keypoint locations of the T (in [x0, y0, x1, y1, ...] format). The values are in the range [0, 512]. - agent_pos: A 2-dimensional vector representing the position of the robot end-effector.

If obs_type is set to pixels, the observation space is a 96x96 RGB image of the environment.

In [7]:
env.observation_space

Box(0.0, [512.         512.         512.         512.           6.28318531], (5,), float64)

The Policy model is designed to give the action $a$ given the state $s$.

input:

$$
[agent_x, agent_y, t_x, t_y, t_{angle}]
$$

output

$$
[move_x, move_y]
$$


so the model tells

$$
a = P(s)
$$

In [8]:
# from torch import nn
# import torch.nn.functional as F
#
#
# class Policy(nn.Module):
#     def __init__(self):
#         super(Policy, self).__init__()
#         self.fc1 = nn.Linear(env.observation_space.shape[0], 64)
#         self.fc2 = nn.Linear(64, 64)
#         self.fc3 = nn.Linear(64, 2)
#
#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
#
#         return x
# model = Policy()
# input = torch.rand((4, 5))
#
# input
# output = model(input)
#
# print(output.shape)
# output.detach().numpy()

In [9]:
# class Critic(nn.Module):
#     def __init__(self):
#         super(Critic, self).__init__()
#         self.fc1 = nn.Linear(env.observation_space.shape[0] + env.action_space.shape[0], 64)
#         self.fc2 = nn.Linear(64, 64)
#         self.fc3 = nn.Linear(64, 1)
#
#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = F.relu(self.fc3(x))
#
#         return x
# model = Critic()
# input = torch.rand((4, 7))
#
# input
# output = model(input)
#
# print(output.shape)
# output.detach().numpy()

Critic model is aimed to estimate

$$
Q(s,a)
$$

it takes concat input $[s,a]$, more detailed

input

$$
[agent_x, agent_y, t_x, t_y, t_{angle}, move_x, move_y]
$$

output

$$
score
$$


so it give single estimate of $Q$ value

$$
score = Q(s,a)
$$

The Policy model is trained using Critic model

Since we cant directly access the action value $Q(s,a)$ we use the critic model that gives estimate $Q'(s,a)$ to understand the value of this state action pair. Then we could use this estimate to compute the loss


$$
loss = -Q'(s,a)
$$

\- is used because optimization tasks aim to minimize function, hence minimizing -f is equivalent to maximizing f

In [10]:
# def train_policy(policy_model, critic_model, input, optimizer):
#     """
#
#     Args:
#         policy_model: Model to update
#         critic_model: Critic model to compute the value of the proposed action
#         input: the input of shape [batch_size, 5] = (batch_size, observation_space.shape)
#         optimizer: optimizer for the policy model
#
#     Returns:
#
#     """
#     optimizer.zero_grad()
#
#     input = input.to(compute_device)
#
#     output = policy_model(input)
#
#     critic_input = torch.cat((input, output), dim=1)
#
#     score = critic_model(critic_input)
#
#     loss = -score
#
#     loss.backward()
#     optimizer.step()
#
#     return loss.item()

For training critic, we use the actual reward we received from the env, bootstrap with critic model and train it with basic MSE loss

In [11]:
# def train_critic(critic_model, input, target, optimizer):
#     """
#     Train function that run one update on the critic network using batch of inputs.
#
#     Args:
#         critic_model: Model to update
#         input: the input of shape [batch_size, 7] = (batch_size, observation_space.shape + action_space.shape)
#         target: Is the target reward we received from env and critic_estimate model
#         optimizer: optimizer for the critic model
#
#     Returns:
#
#     """
#     optimizer.zero_grad()
#     criterion = nn.MSELoss()
#
#     critic_model.train()
#
#     input = input.to(compute_device)
#
#     output = critic_model(input)
#
#     loss = criterion(output, target)
#
#     loss.backward()
#     optimizer.step()
#
#     return loss.item()

so in my approach the policy actually changes, but I could ignore the fact that actions were recorded under another distribution because I am using Q, which gives some level of abstraction which is stable for the environment and any optimal policy will converge to identical function. Then the agent here is just the another function that is trained on this level of the representation of the environment. If I had the training process without Q where the policy is responsible to somehow incorporate the knowledge of the environment inside itself then I need to also think of sampling, because on the interpretation level the actions were done on that perception of env, which changed with policy.

this is idea of the model-based and model-free and particaul;larty off-policy and on-policy in the fact that env is not encoded in the model

Here comes the problem since the update rule is based on MSE of the value
$$
(Q(s,a), R(s,a))
$$

where target $R(s,a)$ is calculated

$$
R(s,a) = r + \gamma \cdot Q(s', \pi(s'))
$$

then the update rule:

$$
Q(s,a) \leftarrow \alpha \cdot (Q(s,a) - (r + \gamma \cdot Q(s', \pi(s'))))^2
$$

To solve this problem, I use the target models, this is the same duplicate of the Critic Model that is training with delay from the actual model, this will ensure more stable training process and prevent explode, when update depends on itself.

In [12]:
import random


class Memory:
    def __init__(self, batch_size=128, max_size=1024, min_sample=4096):
        self.batch_size = batch_size
        self.max_size = max_size
        self.min_sample = min_sample

        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []

    def add(self, state, action, reward, next_state):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)

        if len(self.states) > self.max_size:
            self._pop_oldest()

    def _pop_oldest(self):
        self.states.pop(0)
        self.actions.pop(0)
        self.rewards.pop(0)
        self.next_states.pop(0)

    def enough_sample(self):
        return len(self.states) >= self.min_sample

    def sample(self):
        idxs = random.sample(range(len(self.states)), self.batch_size)
        batch = dict(
            states=torch.stack([self.states[i] for i in idxs]),
            actions=torch.stack([self.actions[i] for i in idxs]),
            rewards=torch.stack([self.rewards[i] for i in idxs]),
            next_states=torch.stack([self.next_states[i] for i in idxs]),
        )
        return batch


In [28]:
import numpy as np
from AIA.rl.pushT.Nets import Policy, Critic
from AIA.rl.pushT.trainer import train_critic, train_policy, soft_update


def train(policy,
          critic,
          policy_target,
          critic_target,
          optimizer_policy,
          optimizer_critic,
          memory,
          episodes = 4000,
          max_steps = 1000,
          gamma = 0.99,
          tau = 0.005):
    env = make_env()

    for episode in range(episodes):

        state, _ = env.reset()
        prev_reward = 0

        losses = ([], [])

        for t in range(max_steps):
            #  Выбор действия по текущей policy

            cur_state_policy_input = torch.tensor(state, dtype=torch.float).to(compute_device)
            action = policy(cur_state_policy_input).detach().to(cpu_device)

            #  Выполнение действия в среде
            next_state, reward, terminated, truncated, _ = env.step(action)

            d_reward = reward - prev_reward

            prev_reward = reward

            reward = d_reward

            if reward > 0:
                print(33)

            #  Сохранение перехода в память
            memory.add(torch.tensor(state), action, torch.tensor(reward, dtype=torch.float), torch.tensor(next_state))

            #  Обновление состояния
            state = next_state

            #  Если накопилось достаточно данных → обучаемся
            if memory.enough_sample():
                ##  Выборка batch
                batch = memory.sample()

                states = batch['states'].to(torch.float).to(compute_device)
                actions = batch['actions'].to(torch.float).to(compute_device)
                rewards = batch['rewards'].to(torch.float).to(compute_device)
                next_states = batch['next_states'].to(torch.float).to(compute_device)

                # Вычисление target: r + γ * Q_target(s', π_target(s'))
                with torch.no_grad():
                    target_policy_action = policy_target(next_states)
                    critic_input_target = torch.cat((next_states, target_policy_action), dim=1)
                    q_target_value = critic_target(critic_input_target)
                    targets = (rewards + gamma * q_target_value)

                critic_input = torch.cat((states, actions), dim=1)

                #  Обновление критика по MSE
                critic_loss = train_critic(critic_model, critic_input, targets, optimizer_critic, compute_device)

                ## === ОБНОВЛЕНИЕ POLICY === ##
                # Генерация действий по policy
                policy_input = states
                policy_loss = train_policy(policy, critic, policy_input, optimizer_policy, compute_device)

                losses[0].append(critic_loss)
                losses[1].append(policy_loss)

                ## === ОБНОВЛЕНИЕ TARGET-СЕТЕЙ === ##
                soft_update(policy_target, policy, tau)
                soft_update(critic_target, critic, tau)

        print(f"Episode {episode}, Critic loss: {np.mean(losses[0])}, Policy loss: {np.mean(losses[1])}")




policy_model = Policy(env).to(compute_device)
critic_model = Critic(env).to(compute_device)
policy_target = Policy(env).to(compute_device)
critic_target = Critic(env).to(compute_device)
policy_target.load_state_dict(policy_model.state_dict())
critic_target.load_state_dict(critic_model.state_dict())
optimizer_policy = torch.optim.Adam(policy_model.parameters(), lr=1e-3)
optimizer_critic = torch.optim.Adam(critic_model.parameters(), lr=1e-3)
memory = Memory(batch_size=256, max_size=10000, min_sample = 1024)



train(
    policy_model,
    critic_model,
    policy_target,
    critic_target,
    optimizer_policy,
    optimizer_critic,
    memory,
    episodes= 4000,
    max_steps = 1000,
    gamma = 0.1,
    tau = 0.005)

Episode 0, Critic loss: nan, Policy loss: nan
33


KeyboardInterrupt: 