In [41]:
from math import gamma
from time import sleep

import torch

compute_device = torch.device('cuda' if torch.cuda.is_available() else 'mps')
cpu_device = torch.device('cpu')

In [5]:
torch.cuda.is_available()

False

# Env

In [23]:
def make_env(render_mode="rgb_array"):
    env = gym.make("gym_pusht/PushT-v0", render_mode=render_mode)
    return env

In [24]:
import gymnasium as gym
import gym_pusht
from time import sleep

env = make_env()

In [61]:
observation, info = env.reset()

for _ in range(1000):
    sleep(0.2)
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)
    image = env.render()

    if terminated or truncated:
        observation, info = env.reset()

env.close()

KeyboardInterrupt: 

Implement the deepq Learning to compete the task

# Part 1: Numerical PushT state

## Observation Space

If obs_type is set to state, the observation space is a 5-dimensional vector representing the state of the environment: [agent_x, agent_y, block_x, block_y, block_angle]. The values are in the range [0, 512] for the agent and block positions and [0, 2*pi] for the block angle.

If obs_type is set to environment_state_agent_pos the observation space is a dictionary with: - environment_state: 16-dimensional vector representing the keypoint locations of the T (in [x0, y0, x1, y1, ...] format). The values are in the range [0, 512]. - agent_pos: A 2-dimensional vector representing the position of the robot end-effector.

If obs_type is set to pixels, the observation space is a 96x96 RGB image of the environment.

In [11]:
env.observation_space

Box(0.0, [512.         512.         512.         512.           6.28318531], (5,), float64)

The Policy model is designed to give the action $a$ given the state $s$.

input:

$$
[agent_x, agent_y, t_x, t_y, t_{angle}]
$$

output

$$
[move_x, move_y]
$$


so the model tells

$$
a = P(s)
$$

In [12]:
from torch import nn
import torch.nn.functional as F


class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(env.observation_space.shape[0], 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x

In [13]:
model = Policy()

In [14]:
input = torch.rand((4, 5))

input

tensor([[7.0812e-01, 2.9347e-01, 6.3819e-01, 1.2315e-01, 8.5370e-01],
        [2.5635e-01, 7.0662e-01, 9.4525e-02, 5.9024e-01, 1.4743e-01],
        [9.6483e-01, 8.8398e-02, 6.5643e-04, 2.8631e-01, 3.9741e-01],
        [3.8779e-01, 1.1301e-01, 2.2382e-01, 6.4386e-01, 7.5785e-01]])

In [15]:
output = model(input)

print(output.shape)
output.detach().numpy()

torch.Size([4, 2])


array([[ 0.05687009, -0.01775344],
       [ 0.10334413, -0.02803724],
       [ 0.09294878, -0.03567781],
       [ 0.04785617, -0.03231449]], dtype=float32)

In [16]:
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(env.observation_space.shape[0] + env.action_space.shape[0], 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        return x

In [17]:
model = Critic()
input = torch.rand((4, 7))

input

tensor([[0.6828, 0.1745, 0.8521, 0.2143, 0.3034, 0.7537, 0.4590],
        [0.3627, 0.1563, 0.4809, 0.8539, 0.2025, 0.7409, 0.7964],
        [0.2526, 0.9531, 0.3883, 0.9095, 0.8693, 0.7960, 0.4940],
        [0.2837, 0.2805, 0.2066, 0.1380, 0.0584, 0.0595, 0.1713]])

In [18]:
output = model(input)

print(output.shape)
output.detach().numpy()

torch.Size([4, 1])


array([[0.],
       [0.],
       [0.],
       [0.]], dtype=float32)

Critic model is aimed to estimate

$$
Q(s,a)
$$

it takes concat input $[s,a]$, more detailed

input

$$
[agent_x, agent_y, t_x, t_y, t_{angle}, move_x, move_y]
$$

output

$$
score
$$


so it give single estimate of $Q$ value

$$
score = Q(s,a)
$$

The Policy model is trained using Critic model

Since we cant directly access the action value $Q(s,a)$ we use the critic model that gives estimate $Q'(s,a)$ to understand the value of this state action pair. Then we could use this estimate to compute the loss


$$
loss = -Q'(s,a)
$$

\- is used because optimization tasks aim to minimize function, hence minimizing -f is equivalent to maximizing f

In [19]:
def train_policy(policy_model, critic_model, input, optimizer):
    optimizer.zero_grad()

    input = input.to(compute_device)

    output = policy_model(input)

    critic_input = torch.cat((input, output), dim=1)

    score = critic_model(critic_input)

    loss = -score

    loss.backward()
    optimizer.step()

    return loss.item()

For training critic, we use the actual reward we received from the env, bootstrap with critic model and train it with basic MSE loss

In [20]:
def train_critic(model, input, target, optimizer):
    optimizer.zero_grad()
    criterion = nn.MSELoss()

    model.train()

    input = input.to(compute_device)

    output = model(input)

    loss = criterion(output, target)

    loss.backward()
    optimizer.step()

    return loss.item()

so in my approach the policy actually changes, but I could ignore the fact that actions were recorded under another distribution because I am using Q, which gives some level of abstraction which is stable for the environment and any optimal policy will converge to identical function. Then the agent here is just the another function that is trained on this level of the representation of the environment. If I had the training process without Q where the policy is responsible to somehow incorporate the knowledge of the environment inside itself then I need to also think of sampling, because on the interpretation level the actions were done on that perception of env, which changed with policy.

this is idea of the model-based and model-free and particaul;larty off-policy and on-policy in the fact that env is not encoded in the model

Here comes the problem since the update rule is based on MSE of the value
$$
(Q(s,a), R(s,a))
$$

where target $R(s,a)$ is calculated

$$
R(s,a) = r + \gamma \cdot Q(s', \pi(s'))
$$

then the update rule:

$$
Q(s,a) \leftarrow \alpha \cdot (Q(s,a) - (r + \gamma \cdot Q(s', \pi(s'))))^2
$$

To solve this problem, I use the target models, this is the same duplicate of the Critic Model that is training with delay from the actual model, this will ensure more stable training process and prevent explode, when update depends on itself.

In [43]:
import random


class Memory:
    def __init__(self, batch_size=128, max_size=1024):
        self.batch_size = batch_size
        self.max_size = max_size

        self.states = []
        self.actions = []
        self.targets = []

    def add(self, state, action, target):
        self.states.append(state)
        self.actions.append(action)
        self.targets.append(target)

        if len(self.states) > self.max_size:
            self._pop_oldest()

    def _pop_oldest(self):
        self.states.pop(0)
        self.actions.pop(0)
        self.targets.pop(0)

    def enough_sample(self):
        return len(self.states) >= self.batch_size

    def sample(self):
        idxs = random.sample(range(len(self.states)), self.batch_size)
        batch = dict(
            states=torch.stack([self.states[i] for i in idxs]),
            actions=torch.stack([self.actions[i] for i in idxs]),
            rewards=torch.stack([self.targets[i] for i in idxs]),
        )
        return batch


In [44]:
def train(policy,
          critic,
          policy_target,
          critic_target,
          optimizer_policy,
          optimizer_critic,
          memory, episodes,
          max_steps,
          gamma,
          tau):
    env = make_env()

    for episode in range(episodes):

        state, _ = env.reset()

        for t in range(max_steps):
            #  Выбор действия по текущей policy

            cur_state_policy_input = torch.tensor(state, dtype=torch.float).to(compute_device)
            action = policy(cur_state_policy_input).detach().to(cpu_device)

            #  Выполнение действия в среде
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Вычисление target: r + γ * Q_target(s', π_target(s'))
            with torch.no_grad():
                next_state_tensor = torch.tensor(next_state, dtype=torch.float).to(compute_device)
                target_policy_action = policy_target(next_state_tensor)
                critic_input_target = torch.cat((next_state_tensor, target_policy_action))
                q_target_value = critic_target(critic_input_target)
                target = (reward + gamma * q_target_value).to(cpu_device)

            del cur_state_policy_input, next_state_tensor, target_policy_action, q_target_value

            #  Сохранение перехода в память
            memory.add((torch.tensor(state), action, target))

            #  Обновление состояния
            state = next_state



             #  Выборка batch
            batch = memory.sample()
            input = torch.tensor(batch).to(device)
            # [(s, a, target)]

            #  Вычисление Q(s,a) текущим критиком
            # critic_input = concat(s, a)
            # predicted_q = critic(critic_input)


            print(33)

        #  Если накопилось достаточно данных → обучаемся
        if memory.enough_sample():
            #  Выборка batch
            batch = memory.sample()
            input = torch.tensor(batch).to(device)
            # [(s, a, target)]

            #  Вычисление Q(s,a) текущим критиком
            # critic_input = concat(s, a)
            # predicted_q = critic(critic_input)

            # 9. Обновление критика по MSE
            # critic_loss = MSE(predicted_q, target)
            # optimizer_critic.zero_grad()
            # critic_loss.backward()
            # optimizer_critic.step()

            ## === ОБНОВЛЕНИЕ POLICY === ##
            # 10. Генерация действий по policy
            # policy_action = policy(s)
            # critic_input_for_policy = concat(s, policy_action)

            # 11. Подсчёт -Q(s, π(s)) (градиент идёт через π)
            # policy_loss = -critic(critic_input_for_policy).mean()
            # optimizer_policy.zero_grad()
            # policy_loss.backward()
            # optimizer_policy.step()

            ## === ОБНОВЛЕНИЕ TARGET-СЕТЕЙ === ##
            # soft_update(policy_target, policy, tau)
            # soft_update(critic_target, critic, tau)


policy_model = Policy().to(compute_device)
critic_model = Critic().to(compute_device)
policy_target = Policy().to(compute_device)
critic_target = Critic().to(compute_device)
optimizer_policy = torch.optim.Adam(policy_model.parameters(), lr=1e-3)
optimizer_critic = torch.optim.Adam(critic_model.parameters(), lr=1e-3)
memory = Memory()



train(
    policy_model,
    critic_model,
    policy_target,
    critic_target,
    optimizer_policy,
    optimizer_critic,
    memory,
    10000,
    100,
    0.9,
    100)

TypeError: Memory.add() missing 2 required positional arguments: 'action' and 'target'