# Домашнее задание №1: Реализация DQN

В данном задании вам будет необходимо реализовать алгоритм DQN или одну из его модификаций. Реализованный алгоритм необходимо использовать для того, чтобы научиться решать задачу LunarLander.

**Можно запустить `python train.py` в командной строке или выполнить данный файл.**

# Установка необходимых модулей:
* !pip install gym
* !pip install box2d
* !pip install torch
* !pip install pyvirtualdisplay (только при запуске на Linux)

In [1]:
import gym
import copy
import random
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from gym import make
from random import sample
from collections import deque
from torch import nn
from torch.nn import functional as F
from torch.optim import Adam
import matplotlib.pyplot as plt
%matplotlib inline

from agent import Agent

# если запустить на линукс pyvirtualdisplay отлично выводит процесс обученной модели
# from pyvirtualdisplay import Display
# display = Display(visible=0, size=(1400, 900))
# display.start()

# is_ipython = 'inline' in plt.get_backend()
# if is_ipython:
#     from IPython import display

# plt.ion()

In [2]:
# Hyper-parameters and other constants shared by the cells below.
LEARNING_RATE = 5e-4  # learning rate passed to the Adam optimizer
CONST_VAL = 0  # initial value of the DQN step counter
GAMMA = 0.99  # discount factor applied to the target network's Q-value
STEPS_PER_UPDATE = 4  # a gradient step is run whenever steps % STEPS_PER_UPDATE == 0
BATCH_SIZE = 128  # number of transitions sampled from the replay buffer per step
STATE_UNITS = 256  # width of every hidden Linear layer of the Q-network
INITIAL_STEPS = 1024  # random-action steps used to pre-fill the replay buffer
TRANSITIONS = 500000  # number of iterations of the main training loop
# The target network is refreshed whenever steps % STEPS_PER_TARGET_UPDATE == 0.
STEPS_PER_TARGET_UPDATE = STEPS_PER_UPDATE * 1000

In [3]:
# Replay buffer: a bounded FIFO store of transitions.
class Buffer:
    """Fixed-capacity replay buffer backed by a ``deque``.

    Once ``capacity`` items are stored, every new item evicts the oldest one.
    """

    def __init__(self, capacity: int = 10000):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept. Defaults to 10000,
                the size that used to be hard-coded.
        """
        self.buffer = deque(maxlen=capacity)

    def __len__(self) -> int:
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def add(self, transition):
        """Append ``transition``; the oldest entry is dropped when full."""
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return sample(self.buffer, batch_size)

In [4]:
# DQN agent: online Q-network, periodically synchronised target network, replay buffer.
class DQN:
    """Deep Q-Network agent.

    Holds an online Q-network (``model``), a target copy of it (``target``),
    a replay buffer and an Adam optimizer. Both networks and every tensor fed
    to them live on ``self.device`` (CUDA when available, otherwise CPU).
    """

    def __init__(self, state_dim, action_dim):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            state_dim: Size of the observation vector (input of the network).
            action_dim: Number of discrete actions (output of the network).
        """
        self.steps = CONST_VAL  # Do not change
        self.gamma = GAMMA
        self.batch_size = BATCH_SIZE
        self.buffer = Buffer()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # The online network must be on the same device as the target network
        # and the input tensors; otherwise the forward pass fails on CUDA hosts.
        self.model = nn.Sequential(
            nn.Linear(state_dim, STATE_UNITS),
            nn.ReLU(),
            nn.Linear(STATE_UNITS, STATE_UNITS),
            nn.ReLU(),
            nn.Linear(STATE_UNITS, STATE_UNITS),
            nn.ReLU(),
            nn.Linear(STATE_UNITS, action_dim),
        ).to(self.device)
        # The deep copy inherits the device of the online network.
        self.target = copy.deepcopy(self.model)
        self.optimizer = Adam(self.model.parameters(), lr=LEARNING_RATE)
        self.loss = nn.MSELoss()

    def _tensor(self, values, dtype):
        """Convert a sequence/array to a tensor of ``dtype`` on ``self.device``."""
        # Going through one numpy array is faster than converting a list of arrays.
        return torch.tensor(np.array(values), dtype=dtype, device=self.device)

    def consume_transition(self, transition):
        """Store one ``(state, action, next_state, reward, done)`` tuple."""
        self.buffer.add(transition)

    def sample_batch(self):
        """Sample ``batch_size`` transitions and transpose them.

        Returns:
            A list of five tuples: states, actions, next states, rewards and
            done flags, in that order.
        """
        batch = self.buffer.sample(self.batch_size)
        return list(zip(*batch))

    def train_step(self, batch):
        """Run one gradient step of the online network on ``batch``.

        Args:
            batch: Five sequences ``(state, action, next_state, reward, done)``
                as returned by ``sample_batch``.
        """
        state, action, next_state, reward, done = batch
        state = self._tensor(state, torch.float32)
        next_state = self._tensor(next_state, torch.float32)
        reward = self._tensor(reward, torch.float32).view(-1)
        done = self._tensor(done, torch.bool)
        action = self._tensor(action, torch.int64).view(-1, 1)

        # TD target: r + gamma * max_a' Q_target(s', a'), with the bootstrap
        # term zeroed for terminal transitions.
        with torch.no_grad():
            next_q = self.target(next_state).max(dim=-1)[0]
            next_q = next_q.masked_fill(done, 0.0)
            q_target = reward + self.gamma * next_q

        # Q-value of the action that was actually taken, shape (batch, 1).
        q_func = self.model(state).gather(1, action)
        loss = self.loss(q_func, q_target.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        # load_state_dict keeps the target on its device and avoids rebuilding it.
        self.target.load_state_dict(self.model.state_dict())

    def act(self, state, target=False):
        """Return the greedy action for ``state``.

        Args:
            state: A single observation (array-like).
            target: Unused; kept for interface compatibility.

        Returns:
            Index of the action with the highest predicted Q-value.
        """
        with torch.no_grad():
            q_values = self.model(self._tensor(state, torch.float32))
        # Move to CPU first: .numpy() is not available for CUDA tensors.
        return np.argmax(q_values.cpu().numpy())

    def update(self, transition):
        """Store ``transition`` and run the scheduled training/target updates."""
        # You don't need to change this
        self.consume_transition(transition)
        if self.steps % STEPS_PER_UPDATE == 0:
            batch = self.sample_batch()
            self.train_step(batch)
        if self.steps % STEPS_PER_TARGET_UPDATE == 0:
            self.update_target_network()
        self.steps += 1

    def save(self):
        """Pickle the online network to ``agent.pkl``."""
        # Save a CPU copy so the file can be loaded on machines without CUDA,
        # exactly as when the model never left the CPU.
        torch.save(copy.deepcopy(self.model).cpu(), "agent.pkl")

In [5]:
# Roll out the agent on fresh LunarLander episodes and collect the returns.
def evaluate_policy(agent, episodes=5):
    """Run ``agent`` for several episodes and return the episode returns.

    Args:
        agent: Object with an ``act(state)`` method returning an action.
        episodes: Number of episodes to play.

    Returns:
        A list with the undiscounted sum of rewards of each episode.
    """
    env = make("LunarLander-v2")
    returns = []
    try:
        for _ in range(episodes):
            done = False
            state = env.reset()
            total_reward = 0.

            while not done:
                state, reward, done, _ = env.step(agent.act(state))
                total_reward += reward
            returns.append(total_reward)
    finally:
        # A new environment is created on every call, so release it here
        # instead of leaving one open per evaluation.
        env.close()
    return returns

## Инициализация среды и запуск обучения

In [6]:
# Create the training environment and an agent sized to its spaces.
env = make("LunarLander-v2")
dqn = DQN(state_dim=env.observation_space.shape[0], action_dim=env.action_space.n)
eps = 0.1  # probability of taking a random action during training
state = env.reset()

# Pre-fill the replay buffer with transitions from uniformly random actions;
# no training happens in this phase.
for _ in range(INITIAL_STEPS):
    action = env.action_space.sample()

    next_state, reward, done, _ = env.step(action)
    dqn.consume_transition((state, action, next_state, reward, done))

    # Start a new episode when the current one has ended.
    state = next_state if not done else env.reset()


# Main training loop: one environment step per iteration.
for i in range(TRANSITIONS):
    # Epsilon-greedy policy: random action with probability eps, greedy otherwise.
    if random.random() < eps:
        action = env.action_space.sample()
    else:
        action = dqn.act(state)

    next_state, reward, done, _ = env.step(action)
    # Stores the transition and runs the scheduled network updates.
    dqn.update((state, action, next_state, reward, done))

    state = next_state if not done else env.reset()

    # Evaluate, report and checkpoint 100 times over the course of training.
    if (i + 1) % (TRANSITIONS//100) == 0:
        rewards = evaluate_policy(dqn, 5)
        print(f"Step: {i+1}, Reward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}")
        dqn.save()

Step: 5000, Reward mean: -124.5487814717917, Reward std: 96.09787632643042
Step: 10000, Reward mean: -172.86712852131637, Reward std: 127.55505976734356
Step: 15000, Reward mean: -118.23815419416333, Reward std: 94.31840788106732
Step: 20000, Reward mean: -94.51355880589674, Reward std: 181.8690414130671
Step: 25000, Reward mean: -140.62212791323154, Reward std: 57.889198802110506
Step: 30000, Reward mean: -131.39738129234698, Reward std: 96.50820092711625
Step: 35000, Reward mean: -118.72250988243937, Reward std: 23.315305513768184
Step: 40000, Reward mean: -124.81965151859636, Reward std: 31.07349985996088
Step: 45000, Reward mean: -120.7967659052812, Reward std: 11.6290736116102
Step: 50000, Reward mean: -113.17326180933378, Reward std: 38.04921890799007
Step: 55000, Reward mean: -92.9214794204257, Reward std: 4.72369479560845
Step: 60000, Reward mean: -167.60146481222384, Reward std: 76.37784041434708
Step: 65000, Reward mean: -147.3683522832108, Reward std: 56.873387840729436
Step

## Загрузка модели и визуализация

In [None]:
# убрать комменты, если под линуксом, под виндой не запустилось(

## загружаем модель
# agent = Agent()
## запуск агента
# state = env.reset()
# img = plt.imshow(env.render(mode='rgb_array'))
# for _ in range(200):
#     action = agent.act(state)
#     img.set_data(env.render(mode='rgb_array')) 
#     plt.axis('off')
    
#     display.display(plt.gcf())
#     display.clear_output(wait=True)
#     state, reward, done, _ = env.step(action)
#     if done:
#         break
        
# env.close()

