<a href="https://colab.research.google.com/github/EugIva/ProzorovEI209M_RL/blob/main/Prozorov_HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Выполнил Прозоров Евгений 209М

Реализуйте алгоритм SAC (Soft Actor-Critic) для среды LunarLanderContinuous-v3.

In [7]:
# Colab setup: swig is installed first, then gymnasium with the Box2D extra
# (the output below shows the box2d-py wheel being built from source in the second step).
!pip install swig
!pip install "gymnasium[box2d]"

Collecting swig
  Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379366 sha256=91f

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
from torch.distributions import Normal

In [2]:
# --- SAC hyperparameters -----------------------------------------------------
GAMMA = 0.99                    # discount applied to the bootstrapped next-state value
TAU = 5e-3                      # interpolation weight of the soft target-critic update
ALPHA = 0.2                     # fixed weight of the log-probability (entropy) term
ACTOR_LR = CRITIC_LR = 3e-4     # Adam learning rates for the actor and the critic

# --- Replay buffer and schedule ----------------------------------------------
REPLAY_SIZE = 100_000           # maximum number of stored transitions
BATCH_SIZE = 256                # transitions sampled per gradient update
START_STEPS = 10_000            # environment steps taken with random actions first
TOTAL_STEPS = 200_000           # total number of environment steps
UPDATE_AFTER = 1_000            # no gradient updates before this step
UPDATE_EVERY = 50               # update period; also the number of updates per period

In [10]:
class Actor(nn.Module):
    """Tanh-squashed Gaussian policy network for SAC.

    Args:
        obs_dim: size of the observation vector.
        act_dim: size of the action vector.
        act_limit: factor the tanh output is multiplied by, so every action
            component lies in [-act_limit, act_limit].
    """

    # Bounds applied to the predicted log standard deviation before exp().
    # Both forward() and get_action() use them, so the policy that acts in the
    # environment is the same distribution the training step evaluates.
    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    def __init__(self, obs_dim, act_dim, act_limit):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
        )
        self.mu_layer = nn.Linear(256, act_dim)
        self.log_std_layer = nn.Linear(256, act_dim)
        self.act_limit = act_limit

    def _distribution_params(self, obs):
        """Return ``(mean, std)`` of the pre-squash Gaussian for ``obs``."""
        # self.net already ends with a ReLU, so no extra activation is applied.
        hidden = self.net(obs)
        mean = self.mu_layer(hidden)
        log_std = torch.clamp(
            self.log_std_layer(hidden), self.LOG_STD_MIN, self.LOG_STD_MAX
        )
        return mean, log_std.exp()

    def forward(self, obs):
        """Sample a reparameterised action and its log-probability.

        Args:
            obs: observation tensor whose last dimension is ``obs_dim``.

        Returns:
            ``(action, log_prob)``: the scaled tanh-squashed sample and its
            log-probability summed over the action dimension (kept as a
            trailing dimension of size 1).
        """
        mean, std = self._distribution_params(obs)
        normal = Normal(mean, std)

        x_t = normal.rsample()
        y_t = torch.tanh(x_t)
        action = y_t * self.act_limit

        # Change-of-variables correction for the tanh squashing; 1e-6 keeps the
        # logarithm finite when |tanh| saturates at 1.  The constant
        # log(act_limit) term of the scaling is not included (it is zero when
        # act_limit == 1).
        log_prob = normal.log_prob(x_t) - torch.log(1 - y_t.pow(2) + 1e-6)
        log_prob = log_prob.sum(-1, keepdim=True)

        return action, log_prob

    def get_action(self, obs, deterministic=False):
        """Return an action for ``obs`` without tracking gradients.

        Args:
            obs: observation tensor whose last dimension is ``obs_dim``.
            deterministic: if True the action is the squashed mean of the
                distribution; otherwise it is a squashed sample.

        Returns:
            Action tensor scaled to [-act_limit, act_limit].
        """
        with torch.no_grad():
            mean, std = self._distribution_params(obs)

            if deterministic:
                squashed = torch.tanh(mean)
            else:
                squashed = torch.tanh(Normal(mean, std).rsample())

            return squashed * self.act_limit

In [11]:
class Critic(nn.Module):
    """Pair of independent Q-networks that score (observation, action) inputs.

    Args:
        obs_dim: size of the observation vector.
        act_dim: size of the action vector.
    """

    def __init__(self, obs_dim, act_dim):
        super().__init__()
        in_features = obs_dim + act_dim
        # q1 is built before q2 so parameters are initialised in the same order.
        self.q1 = self._build_q_net(in_features)
        self.q2 = self._build_q_net(in_features)

    @staticmethod
    def _build_q_net(in_features, hidden=256):
        """Create one MLP with two hidden ReLU layers and a scalar output."""
        layers = [
            nn.Linear(in_features, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        return nn.Sequential(*layers)

    def forward(self, obs, act):
        """Return ``(q1, q2)`` for ``obs`` and ``act`` concatenated on the last dimension."""
        joint = torch.cat((obs, act), dim=-1)
        q1_value = self.q1(joint)
        q2_value = self.q2(joint)
        return q1_value, q2_value

In [12]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Args:
        size: maximum number of transitions kept; once the buffer is full the
            oldest entries are dropped as new ones are appended.
    """

    def __init__(self, size):
        self.buffer = deque(maxlen=size)

    def add(self, *args):
        """Append one transition, stored as a tuple of the given fields."""
        self.buffer.append(tuple(args))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and stack them into tensors.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` as float32
            tensors; ``rewards`` and ``dones`` get a trailing dimension of
            size 1.
        """
        transitions = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one array per field.
        columns = [np.array(field) for field in zip(*transitions)]
        states, actions, rewards, next_states, dones = (
            torch.tensor(column, dtype=torch.float32) for column in columns
        )
        return states, actions, rewards.unsqueeze(1), next_states, dones.unsqueeze(1)

In [21]:
# Environment and the dimensions derived from its spaces.
env = gym.make("LunarLanderContinuous-v3")
obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]
action_low = float(env.action_space.low[0])
action_high = float(env.action_space.high[0])
# The actor scales its tanh output by the upper action bound.
act_limit = action_high

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Networks are created in the order actor -> critic -> target critic;
# the target critic starts as an exact copy of the critic.
actor = Actor(obs_dim, act_dim, act_limit).to(device)
critic = Critic(obs_dim, act_dim).to(device)
critic_target = Critic(obs_dim, act_dim).to(device)
critic_target.load_state_dict(critic.state_dict())

actor_opt = optim.Adam(actor.parameters(), lr=ACTOR_LR)
critic_opt = optim.Adam(critic.parameters(), lr=CRITIC_LR)

replay = ReplayBuffer(REPLAY_SIZE)

# Initial observation and per-episode counters used by the training loop.
obs, _ = env.reset()
episode_return = episode_len = 0

В базовой реализации в этом месте возникала ошибка: переменная `act_limit` не была определена.
```
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-961d9f7e8089> in <cell line: 0>()
      4 action_low, action_high = float(env.action_space.low[0]), float(env.action_space.high[0])
      5
----> 6 actor = Actor(obs_dim, act_dim, act_limit)
      7 critic = Critic(obs_dim, act_dim)
      8 critic_target = Critic(obs_dim, act_dim)

NameError: name 'act_limit' is not defined
```

Пришлось внести правки и в других местах: добавить параметр `act_limit` в `Actor` и задать его перед созданием сети (возможно, ТЗ было некорректным, или я неправильно его понял).



In [22]:
def update():
    """Run one SAC gradient step on a batch sampled from the replay buffer.

    Does nothing while the buffer holds fewer than ``BATCH_SIZE`` transitions.
    Otherwise it updates the critic, then the actor, then moves the target
    critic towards the critic by a factor of ``TAU``.
    """
    if len(replay.buffer) < BATCH_SIZE:
        return

    batch = replay.sample(BATCH_SIZE)
    obs_b, act_b, rew_b, next_obs_b, done_b = (t.to(device) for t in batch)

    # Critic step: regress both Q-heads onto the entropy-regularised target.
    with torch.no_grad():
        next_act, next_logp = actor(next_obs_b)
        target_q1, target_q2 = critic_target(next_obs_b, next_act)
        soft_value = torch.min(target_q1, target_q2) - ALPHA * next_logp
        backup = rew_b + (1 - done_b) * GAMMA * soft_value

    q1_est, q2_est = critic(obs_b, act_b)
    critic_loss = F.mse_loss(q1_est, backup) + F.mse_loss(q2_est, backup)
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # Actor step: minimise ALPHA * log_prob - min(Q1, Q2) for freshly sampled actions.
    new_act, logp = actor(obs_b)
    q_new = torch.min(*critic(obs_b, new_act))
    actor_loss = (ALPHA * logp - q_new).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()

    # Soft update of the target critic parameters.
    with torch.no_grad():
        for online, target in zip(critic.parameters(), critic_target.parameters()):
            target.copy_(TAU * online + (1 - TAU) * target)


In [23]:
# Main SAC training loop. `device`, `env`, `actor`, `replay`, `obs`,
# `episode_return` and `episode_len` are created in the setup cell.
for step in range(TOTAL_STEPS):
    if step < START_STEPS:
        # Random actions from the action space for the first START_STEPS steps.
        act = env.action_space.sample()
    else:
        # get_action() already runs under torch.no_grad().
        obs_t = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
        act = actor.get_action(obs_t).cpu().numpy()[0]

    next_obs, rew, terminated, truncated, _ = env.step(act)
    done = terminated or truncated
    # Store only `terminated` as the done flag: a time-limit truncation is not
    # a terminal state, so the critic target must still bootstrap from
    # next_obs. `done` is used solely to decide when to reset the environment.
    replay.add(obs, act, rew, next_obs, terminated)

    obs = next_obs
    episode_return += rew
    episode_len += 1

    if done:
        obs, _ = env.reset()
        print(f"Step: {step}, Return: {episode_return:.2f}, Len: {episode_len}")
        episode_return, episode_len = 0, 0

    # Every UPDATE_EVERY steps, run UPDATE_EVERY gradient updates in a row.
    if step >= UPDATE_AFTER and step % UPDATE_EVERY == 0:
        for _ in range(UPDATE_EVERY):
            update()

Step: 96, Return: -176.49, Len: 97
Step: 194, Return: -499.83, Len: 98
Step: 281, Return: -178.87, Len: 87
Step: 383, Return: -302.28, Len: 102
Step: 465, Return: 4.09, Len: 82
Step: 545, Return: -32.39, Len: 80
Step: 655, Return: -401.75, Len: 110
Step: 736, Return: -75.06, Len: 81
Step: 888, Return: -156.39, Len: 152
Step: 963, Return: -308.58, Len: 75
Step: 1052, Return: -391.04, Len: 89
Step: 1217, Return: -29.32, Len: 165
Step: 1359, Return: -413.78, Len: 142
Step: 1498, Return: -186.85, Len: 139
Step: 1650, Return: -86.49, Len: 152
Step: 1724, Return: -30.84, Len: 74
Step: 1809, Return: -58.19, Len: 85
Step: 2809, Return: 63.31, Len: 1000
Step: 2910, Return: -57.12, Len: 101
Step: 2987, Return: -70.56, Len: 77
Step: 3126, Return: -456.99, Len: 139
Step: 3246, Return: -129.30, Len: 120
Step: 3323, Return: -68.21, Len: 77
Step: 3506, Return: -208.90, Len: 183
Step: 3630, Return: -179.90, Len: 124
Step: 3748, Return: -467.92, Len: 118
Step: 3856, Return: -66.26, Len: 108
Step: 3933,