<a href="https://colab.research.google.com/github/EugIva/ProzorovEI209M_RL/blob/main/Prozorov_HW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Выполнил Прозоров Евгений 209М

Реализуйте алгоритм GAIL на среде Mountain Car. Перед этим сгенерируйте экспертные данные (из детерминированной стратегии с первой практики). Хорошей идеей будет добавить в state (observation) синус и косинус от временной метки t для лучшего обучения.

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
import torch.optim as optim
import torch.nn.functional as F

from collections import deque
import random

In [2]:
env_expert = gym.make("MountainCar-v0")
# Генерация экспертных данных:
states = []
actions = []

for _ in range(1000):
    obs, _ = env_expert.reset()
    done = False
    t = 0  # Инициализация t для каждого эпизода

    while not done:
        position, velocity = obs
        if velocity < 0 or position < -0.5:
            action = 0
        else:
            action = 2
        states.append(np.concatenate([obs, [np.sin(t), np.cos(t)]]))
        actions.append(action)

        next_obs, _, terminated, truncated, _ = env_expert.step(action)
        done = terminated or truncated
        obs = next_obs
        t += 1
        # Детерминированная стратегия: выбрать действие с наименьшим ускорением (0 - слева, 1 - нейтрально, 2 - справа)
        # Для MountainCar оптимально выбрать 2 (вправо) при положительной скорости и 0 (влево) при отрицательной

expert_obs = np.array(states)
expert_acts = np.array(actions)

obs_dim = 4  # 2 исходных признака + sin(t) + cos(t)
act_dim = 3  # Действий 3 (лево, нейтрально, право)

In [3]:
#expert_obs = np.copy(states)
#expert_acts = np.copy(actions)

In [4]:
class Policy(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.ReLU(),
            nn.Linear(64, act_dim)
        )

    def forward(self, obs):
        logits = self.net(obs)
        return Categorical(logits=logits)

    def get_action(self, obs):
        dist = self.forward(obs)
        return dist.sample().item()

In [5]:
class Discriminator(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, 64), nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, obs, act):
        act_onehot = F.one_hot(act, num_classes=3).float()  # исправлено на 3
        x = torch.cat([obs, act_onehot], dim=1)
        return self.net(x)

In [6]:
class TrajectoryBuffer:
    def __init__(self):
        self.obs, self.acts, self.rews = [], [], []

    def store(self, o, a, r):
        self.obs.append(o)
        self.acts.append(a)
        self.rews.append(r)

    def get(self):
        return (
            torch.tensor(np.array(self.obs), dtype=torch.float32),
            torch.tensor(np.array(self.acts), dtype=torch.long),
            torch.tensor(np.array(self.rews), dtype=torch.float32)
        )

In [7]:
env = gym.make("MountainCar-v0")
policy = Policy(obs_dim, act_dim)
discrim = Discriminator(obs_dim, act_dim)

policy_opt = optim.Adam(policy.parameters(), lr=1e-3)
discrim_opt = optim.Adam(discrim.parameters(), lr=1e-3)

In [8]:
for epoch in range(3000):
    buf = TrajectoryBuffer()
    obs, _ = env.reset()
    done = False
    t = 0

    while not done:
        current_obs = obs
        extended_obs = np.concatenate([current_obs, [np.sin(t), np.cos(t)]])
        obs_tensor = torch.tensor(extended_obs, dtype=torch.float32).unsqueeze(0)
        action = policy.get_action(obs_tensor)
        next_obs, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        buf.store(extended_obs, action, 0)
        obs = next_obs
        t += 1

    agent_obs, agent_acts, _ = buf.get()

    idxs = np.random.choice(len(expert_obs), len(agent_obs), replace=False)
    exp_obs = torch.tensor(expert_obs[idxs], dtype=torch.float32)
    exp_acts = torch.tensor(expert_acts[idxs], dtype=torch.long)

    # Обучение дискриминатора
    for _ in range(2):
        discrim_opt.zero_grad()
        disc_agent = discrim(agent_obs, agent_acts)
        disc_expert = discrim(exp_obs, exp_acts)
        disc_loss = F.binary_cross_entropy(disc_agent, torch.ones_like(disc_agent)) + \
                    F.binary_cross_entropy(disc_expert, torch.zeros_like(disc_expert))
        disc_loss.backward()
        discrim_opt.step()

#В GAIL награда для политики должна быть log(1 - D(s,a)), а не -log(D(s,a))
    # Вычисление награды
    with torch.no_grad():
        disc_agent = discrim(agent_obs, agent_acts)
        rewards = (torch.log(1 - disc_agent + 1e-8)).cpu().numpy().flatten()

    # Обновление политики
    policy_opt.zero_grad()
    dist = policy(agent_obs)
    log_probs = dist.log_prob(agent_acts)
    policy_loss = - (log_probs * torch.from_numpy(rewards).float()).mean()
    policy_loss.backward()
    policy_opt.step()



    if epoch % 10 == 0:
        print(f"Epoch {epoch}: GAIL Loss {policy_loss.item():.3f}, Disc Loss {disc_loss.item():.3f}")



Epoch 0: GAIL Loss -0.745, Disc Loss 1.406
Epoch 10: GAIL Loss -0.763, Disc Loss 1.353
Epoch 20: GAIL Loss -0.830, Disc Loss 1.302
Epoch 30: GAIL Loss -0.862, Disc Loss 1.266
Epoch 40: GAIL Loss -0.816, Disc Loss 1.290
Epoch 50: GAIL Loss -0.944, Disc Loss 1.287
Epoch 60: GAIL Loss -0.847, Disc Loss 1.298
Epoch 70: GAIL Loss -0.703, Disc Loss 1.346
Epoch 80: GAIL Loss -0.694, Disc Loss 1.330
Epoch 90: GAIL Loss -0.549, Disc Loss 1.346
Epoch 100: GAIL Loss -0.419, Disc Loss 1.337
Epoch 110: GAIL Loss -0.378, Disc Loss 1.326
Epoch 120: GAIL Loss -0.319, Disc Loss 1.336
Epoch 130: GAIL Loss -0.314, Disc Loss 1.306
Epoch 140: GAIL Loss -0.205, Disc Loss 1.338
Epoch 150: GAIL Loss -0.231, Disc Loss 1.325
Epoch 160: GAIL Loss -0.346, Disc Loss 1.313
Epoch 170: GAIL Loss -0.345, Disc Loss 1.325
Epoch 180: GAIL Loss -0.283, Disc Loss 1.314
Epoch 190: GAIL Loss -0.384, Disc Loss 1.302
Epoch 200: GAIL Loss -0.389, Disc Loss 1.298
Epoch 210: GAIL Loss -0.402, Disc Loss 1.277
Epoch 220: GAIL Loss 

In [9]:
# Тестирование
for episode in range(10):
    obs, _ = env.reset()
    done = False
    total_reward = 0
    t = 0

    while not done:
        state = np.concatenate([obs, [np.sin(t), np.cos(t)]])
        obs_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        action = policy.get_action(obs_tensor)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        obs = next_obs
        total_reward += reward
        t += 1
    print(f"Episode {episode}: Total reward {total_reward}")
env.close()

Episode 0: Total reward -200.0
Episode 1: Total reward -200.0
Episode 2: Total reward -200.0
Episode 3: Total reward -200.0
Episode 4: Total reward -200.0
Episode 5: Total reward -200.0
Episode 6: Total reward -200.0
Episode 7: Total reward -200.0
Episode 8: Total reward -200.0
Episode 9: Total reward -200.0


что-то не получилось :(