# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [1]:
from collections import deque
import gym
import numpy as np
import random
from tqdm import tqdm
from numpy import ndarray

Dołączenie bibliotek ze środowiskami:

In [2]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLakeExtended


Dołączenie bibliotek do obsługi sieci neuronowych

In [3]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
# import tensorflow as tf
# from tensorflow.keras.layers import Dense, ReLU
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.optimizers import Adam

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \text{max}_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [5]:
class ReplayBuffer(object):
    def __init__(self, mem_size, state_shape):
        self.mem_size = mem_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, _state, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = _state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - done
        self.mem_cntr += 1

    @staticmethod
    def _softmax(x: ndarray) -> ndarray:
        return np.exp(x) / np.exp(x).sum()

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)

        probs = self._softmax(np.abs(self.reward_memory[:max_mem]))
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        _states = self.new_state_memory[batch]
        done = self.terminal_memory[batch]

        return states, actions, rewards, _states, done

In [6]:
# class DQNAgent: # Tensorflow
#     def __init__(self, action_size, state_size, learning_rate, model):
#         self.action_size = action_size
#         self.memory = ReplayBuffer(1000000, state_size)
#         self.gamma = 0.95    # discount rate
#         self.epsilon = 1.0  # exploration rate
#         self.epsilon_min = 0.01
#         self.epsilon_decay = 0.001
#         self.learning_rate = learning_rate
#         self.model = model
#         self.evaluate = False
#
#     def remember(self, state, action, reward, _state, done):
#         self.memory.store_transition(state, action, reward, _state, done)
#
#     def get_action(self, state):
#         state = state.reshape(1, -1)
#         if np.random.random() <= self.epsilon and not self.evaluate:
#             action = np.random.choice(self.action_size)
#         else:
#             actions = self.model.predict(state, verbose=0)
#             action = np.argmax(actions)
#
#
#         return action
#
#     def get_best_action(self, state):
#         state = state.reshape(1, -1)
#         actions = self.model.predict(state, verbose=0)
#         action = np.argmax(actions)
#
#         return action
#
#     def learn(self, batch_size):
#         if self.memory.mem_cntr < batch_size:
#             return
#
#         states, actions, rewards, _states, done = self.memory.sample_buffer(batch_size)
#
#         q_pred = self.model.predict(states, verbose=0)
#         q_next = self.model.predict(_states, verbose=0)
#
#         max_actions = np.argmax(q_pred, axis=1)
#
#         batch_index = np.arange(batch_size, dtype=np.int32)
#
#         q_pred[batch_index, actions] = rewards + self.gamma * q_next[batch_index, max_actions.astype(int)] * (1-done)
#
#         self.model.train_on_batch(states, q_pred)
#
#     def update_epsilon_value(self):
#         self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.epsilon_min else self.epsilon_min


In [7]:
# def get_model(input_shape, action_size, fc1, fc2):
#     model = Sequential([
#         Dense(fc1, input_shape=input_shape),
#         ReLU(),
#         Dense(fc2),
#         ReLU(),
#         Dense(action_size, activation=None),
#     ])
#
#     model.compile(loss='mse', optimizer=Adam(learning_rate==0.001))
#
#     return model

In [18]:
class DQNAgent: # Pytorch
    def __init__(self, action_size, state_size, learning_rate, model):
        self.action_size = action_size
        self.memory = ReplayBuffer(1000, state_size)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.05
        self.learning_rate = learning_rate
        self.q = model
        self.evaluate = False

    def remember(self, state, action, reward, _state, done):
        self.memory.store_transition(state, action, reward, _state, done)

    def get_action(self, state):
        if np.random.random() <= self.epsilon and not self.evaluate:
            action = np.random.choice(self.action_size)
        else:
            state = T.tensor(state).to(self.q.device)
            actions = self.q.forward(state)
            action = T.argmax(actions).item()

        return action

    def get_best_action(self, state):
        state = T.tensor(state).to(self.q.device)
        actions = self.q.forward(state)
        action = T.argmax(actions).item()

        return action

    def learn(self, batch_size):
        if self.memory.mem_cntr < batch_size:
            return

        states, actions, rewards, _states, done = self.memory.sample_buffer(batch_size)

        states = T.tensor(states).to(self.q.device)
        _states = T.tensor(_states).to(self.q.device)


        q_next = self.q.forward(_states).cpu().detach().numpy()
        q_pred = self.q.forward(states)
        q_target = q_pred.cpu().detach().numpy().copy()

        max_actions = np.argmax(q_next, axis=1)

        batch_index = np.arange(batch_size, dtype=np.int32)

        q_target[batch_index, actions] = rewards + self.gamma * q_next[batch_index, max_actions] * (1-done)
        q_target = T.tensor(q_target).to(self.q.device)

        # q_pred = T.tensor(q_pred, requires_grad=True).to(self.q.device)

        self.q.optimizer.zero_grad()
        loss = self.q.loss(q_pred, q_target).to(self.q.device)
        loss.backward()
        self.q.optimizer.step()

    def update_epsilon_value(self):
        self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.epsilon_min else self.epsilon_min

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*, warstwa wejściowa powinna mieć tyle neuronów ile jest możlliwych stanów, warstwa wyjściowa tyle neuronów ile jest możliwych akcji do wykonania:

In [19]:
class DQN(nn.Module):
    def __init__(self, lr, state_shape, n_actions, fc1, fc2):
        super(DQN, self).__init__()

        self.fc1 = nn.Linear(state_shape, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.output = nn.Linear(fc2, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0')
        self.to(self.device)

    def forward(self, state):
        state = F.relu(self.fc1(state))
        state = F.relu(self.fc2(state))
        actions = self.output(state)

        return actions

In [20]:
env = frozenLake("8x8")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [21]:
def check_action_values(agent: DQNAgent, state_size: int):
    rets = []
    for i_state in range(state_size):
        state = np.zeros(state_size, dtype=np.float32)
        state[i_state] = 1
        state = T.tensor(state).to(agent.q.device)
        rets.append(agent.q(state).cpu().detach().numpy())
    rets = np.array(rets).reshape(state_size, 4)
    with np.printoptions(precision=4, suppress=True):
        print(rets)

In [22]:
model = DQN(learning_rate, state_size, action_size, 128, 128)

agent = DQNAgent(action_size, state_size, learning_rate, model)
agent.epsilon = 1

done = False
batch_size = 64
EPISODES = 60
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in tqdm(range(100), desc=f'Epoch: {e}'):
        total_reward = 0
        i_state = env.reset()
    
        state = np.zeros(state_size, dtype=np.float32)
        state[i_state] = 1
        
        for time in range(1000):
            action = agent.get_action(state)
            _i_state, reward, done, _ = env.step(action)
            total_reward += reward

            _state = np.zeros(state_size, dtype=np.float32)
            _state[_i_state] = 1

            if np.allclose(state, _state):
                reward = -1

            if done and not reward:
                reward = -1

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            
            state = _state
            if done:
                break

        summary.append(total_reward)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()

    if np.mean(summary) > 0.9:
        print ("You Win!")
        break


Epoch: 0: 100%|██████████| 100/100 [00:08<00:00, 11.93it/s]


epoch #0	mean reward = 0.000	epsilon = 1.000


Epoch: 1: 100%|██████████| 100/100 [00:07<00:00, 12.94it/s]


epoch #1	mean reward = 0.000	epsilon = 0.950


Epoch: 2: 100%|██████████| 100/100 [00:08<00:00, 11.53it/s]


epoch #2	mean reward = 0.000	epsilon = 0.900


Epoch: 3: 100%|██████████| 100/100 [00:08<00:00, 11.17it/s]


epoch #3	mean reward = 0.000	epsilon = 0.850


Epoch: 4: 100%|██████████| 100/100 [00:09<00:00, 10.82it/s]


epoch #4	mean reward = 0.000	epsilon = 0.800


Epoch: 5: 100%|██████████| 100/100 [00:11<00:00,  8.92it/s]


epoch #5	mean reward = 0.000	epsilon = 0.750


Epoch: 6: 100%|██████████| 100/100 [00:08<00:00, 11.52it/s]


epoch #6	mean reward = 0.000	epsilon = 0.700


Epoch: 7: 100%|██████████| 100/100 [00:09<00:00, 10.45it/s]


epoch #7	mean reward = 0.000	epsilon = 0.650


Epoch: 8: 100%|██████████| 100/100 [00:12<00:00,  8.20it/s]


epoch #8	mean reward = 0.020	epsilon = 0.600


Epoch: 9: 100%|██████████| 100/100 [00:10<00:00,  9.21it/s]


epoch #9	mean reward = 0.000	epsilon = 0.550


Epoch: 10: 100%|██████████| 100/100 [00:11<00:00,  8.39it/s]


epoch #10	mean reward = 0.010	epsilon = 0.500


Epoch: 11: 100%|██████████| 100/100 [00:12<00:00,  7.94it/s]


epoch #11	mean reward = 0.000	epsilon = 0.450


Epoch: 12: 100%|██████████| 100/100 [00:12<00:00,  7.80it/s]


epoch #12	mean reward = 0.000	epsilon = 0.400


Epoch: 13: 100%|██████████| 100/100 [00:11<00:00,  8.47it/s]


epoch #13	mean reward = 0.000	epsilon = 0.350


Epoch: 14: 100%|██████████| 100/100 [00:16<00:00,  6.14it/s]


epoch #14	mean reward = 0.010	epsilon = 0.300


Epoch: 15: 100%|██████████| 100/100 [00:14<00:00,  7.13it/s]


epoch #15	mean reward = 0.060	epsilon = 0.250


Epoch: 16: 100%|██████████| 100/100 [00:16<00:00,  6.10it/s]


epoch #16	mean reward = 0.120	epsilon = 0.200


Epoch: 17: 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


epoch #17	mean reward = 0.230	epsilon = 0.150


Epoch: 18: 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


epoch #18	mean reward = 0.410	epsilon = 0.100


Epoch: 19: 100%|██████████| 100/100 [00:33<00:00,  2.99it/s]


epoch #19	mean reward = 0.500	epsilon = 0.050


Epoch: 20: 100%|██████████| 100/100 [00:44<00:00,  2.22it/s]

epoch #20	mean reward = 0.990	epsilon = -0.000
You Win!





Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [60]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

model = DQN(learning_rate, 48, action_size, 128, 64)

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [63]:
agent = DQNAgent(action_size, 48, learning_rate, model)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        state = env.reset()
        state = np.array(state).reshape(-1,).astype(np.float32)

        
        for time in range(1000):
            action = agent.get_action(state)
            _state, reward, done, _ = env.step(action)
            _state = np.array(_state).reshape(-1,).astype(np.float32)
            total_reward += reward

            if np.allclose(state, _state):
                reward = -1

            if done and not reward:
                reward = -1

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            state = _state
            if done:
                break


        summary.append(total_reward)

    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break

epoch #0	mean reward = 0.000	epsilon = 0.750
epoch #1	mean reward = 0.070	epsilon = 0.700
epoch #2	mean reward = 0.090	epsilon = 0.650
epoch #3	mean reward = 0.130	epsilon = 0.600
epoch #4	mean reward = 0.080	epsilon = 0.550
epoch #5	mean reward = 0.190	epsilon = 0.500
epoch #6	mean reward = 0.180	epsilon = 0.450
epoch #7	mean reward = 0.200	epsilon = 0.400
epoch #8	mean reward = 0.230	epsilon = 0.350
epoch #9	mean reward = 0.260	epsilon = 0.300
epoch #10	mean reward = 0.270	epsilon = 0.250
epoch #11	mean reward = 0.390	epsilon = 0.200
epoch #12	mean reward = 0.450	epsilon = 0.150
epoch #13	mean reward = 0.600	epsilon = 0.100
epoch #14	mean reward = 0.790	epsilon = 0.050
epoch #15	mean reward = 1.000	epsilon = -0.000
You Win!


Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [64]:
env = gym.make("CartPole-v0")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.005


Czas nauczyć agenta gry w środowisku *CartPool*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [72]:
model = DQN(learning_rate, state_size, action_size, 32, 32)

agent = DQNAgent(action_size, state_size, learning_rate, model)

agent.epsilon = 0.5

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        state = env.reset().astype(np.float32)
        
        for time in range(300):
            action = agent.get_action(state)
            _state, reward, done, _ = env.step(action)
            total_reward += reward
            _state = _state.astype(np.float32)

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            state = _state
            if done:
                break

        summary.append(total_reward)

    agent.update_epsilon_value()
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    if np.mean(summary) > 195:
        print ("You Win!")
        break


epoch #0	mean reward = 16.670	epsilon = 0.450
epoch #1	mean reward = 17.600	epsilon = 0.400
epoch #2	mean reward = 20.020	epsilon = 0.350
epoch #3	mean reward = 30.380	epsilon = 0.300
epoch #4	mean reward = 12.090	epsilon = 0.250
epoch #5	mean reward = 21.250	epsilon = 0.200
epoch #6	mean reward = 10.890	epsilon = 0.150
epoch #7	mean reward = 13.440	epsilon = 0.100
epoch #8	mean reward = 9.960	epsilon = 0.050
epoch #9	mean reward = 9.840	epsilon = 0.000
epoch #10	mean reward = 9.540	epsilon = 0.010
epoch #11	mean reward = 9.540	epsilon = 0.010
epoch #12	mean reward = 9.510	epsilon = 0.010
epoch #13	mean reward = 9.470	epsilon = 0.010
epoch #14	mean reward = 9.690	epsilon = 0.010
epoch #15	mean reward = 9.650	epsilon = 0.010
epoch #16	mean reward = 9.550	epsilon = 0.010
epoch #17	mean reward = 9.480	epsilon = 0.010
epoch #18	mean reward = 9.500	epsilon = 0.010
epoch #19	mean reward = 9.630	epsilon = 0.010
epoch #20	mean reward = 9.530	epsilon = 0.010
epoch #21	mean reward = 9.510	epsilo

KeyboardInterrupt: 