# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [15]:
from collections import deque
import gym
import numpy as np
import random
from tqdm import tqdm
from numpy import ndarray

Dołączenie bibliotek ze środowiskami:

In [16]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLakeExtended


Dołączenie bibliotek do obsługi sieci neuronowych

In [17]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [18]:
# import tensorflow as tf
# from tensorflow.keras.layers import Dense, ReLU
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.optimizers import Adam

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \text{max}_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [39]:
class ReplayBuffer(object):
    def __init__(self, mem_size, state_shape):
        self.mem_size = mem_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, _state, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = _state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - done
        self.mem_cntr += 1

    @staticmethod
    def _softmax(x: ndarray) -> ndarray:
        return np.exp(x) / np.exp(x).sum()

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)

        probs = self._softmax(np.abs(self.reward_memory[:max_mem]))
        batch = np.random.choice(max_mem, batch_size, p=probs)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        _states = self.new_state_memory[batch]
        done = self.terminal_memory[batch]

        return states, actions, rewards, _states, done

In [40]:
# class DQNAgent: # Tensorflow
#     def __init__(self, action_size, state_size, learning_rate, model):
#         self.action_size = action_size
#         self.memory = ReplayBuffer(1000000, state_size)
#         self.gamma = 0.95    # discount rate
#         self.epsilon = 1.0  # exploration rate
#         self.epsilon_min = 0.01
#         self.epsilon_decay = 0.001
#         self.learning_rate = learning_rate
#         self.model = model
#         self.evaluate = False
#
#     def remember(self, state, action, reward, _state, done):
#         self.memory.store_transition(state, action, reward, _state, done)
#
#     def get_action(self, state):
#         state = state.reshape(1, -1)
#         if np.random.random() <= self.epsilon and not self.evaluate:
#             action = np.random.choice(self.action_size)
#         else:
#             actions = self.model.predict(state, verbose=0)
#             action = np.argmax(actions)
#
#
#         return action
#
#     def get_best_action(self, state):
#         state = state.reshape(1, -1)
#         actions = self.model.predict(state, verbose=0)
#         action = np.argmax(actions)
#
#         return action
#
#     def learn(self, batch_size):
#         if self.memory.mem_cntr < batch_size:
#             return
#
#         states, actions, rewards, _states, done = self.memory.sample_buffer(batch_size)
#
#         q_pred = self.model.predict(states, verbose=0)
#         q_next = self.model.predict(_states, verbose=0)
#
#         max_actions = np.argmax(q_pred, axis=1)
#
#         batch_index = np.arange(batch_size, dtype=np.int32)
#
#         q_pred[batch_index, actions] = rewards + self.gamma * q_next[batch_index, max_actions.astype(int)] * (1-done)
#
#         self.model.train_on_batch(states, q_pred)
#
#     def update_epsilon_value(self):
#         self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.epsilon_min else self.epsilon_min


In [41]:
# def get_model(input_shape, action_size, fc1, fc2):
#     model = Sequential([
#         Dense(fc1, input_shape=input_shape),
#         ReLU(),
#         Dense(fc2),
#         ReLU(),
#         Dense(action_size, activation=None),
#     ])
#
#     model.compile(loss='mse', optimizer=Adam(learning_rate==0.001))
#
#     return model

In [42]:
class DQNAgent: # Pytorch
    def __init__(self, action_size, state_size, learning_rate, model):
        self.action_size = action_size
        self.memory = ReplayBuffer(1000, state_size)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.95
        self.learning_rate = learning_rate
        self.q = model
        self.evaluate = False

    def remember(self, state, action, reward, _state, done):
        self.memory.store_transition(state, action, reward, _state, done)

    def get_action(self, state):
        if np.random.random() <= self.epsilon and not self.evaluate:
            action = np.random.choice(self.action_size)
        else:
            state = T.tensor(state).to(self.q.device)
            actions = self.q.forward(state)
            action = T.argmax(actions).item()

        return action

    def get_best_action(self, state):
        state = T.tensor(state).to(self.q.device)
        actions = self.q.forward(state)
        action = T.argmax(actions).item()

        return action

    def learn(self, batch_size):
        if self.memory.mem_cntr < batch_size:
            return

        states, actions, rewards, _states, done = self.memory.sample_buffer(batch_size)

        states = T.tensor(states).to(self.q.device)
        _states = T.tensor(_states).to(self.q.device)

        q_next = self.q(_states).cpu().detach().numpy()
        q_pred = self.q(states)
        q_target = q_pred.cpu().detach().numpy().copy()

        max_actions = np.argmax(q_target, axis=1)

        batch_index = np.arange(batch_size, dtype=np.int32)

        q_target[batch_index, actions] = rewards + self.gamma * q_next[batch_index, max_actions.astype(int)] * (1-done)
        q_target = T.tensor(q_target).to(self.q.device)
        # q_pred = T.tensor(q_pred, requires_grad=True).to(self.q.device)

        loss = self.q.loss(q_pred, q_target).to(self.q.device)
        loss.backward()
        self.q.optimizer.step()

    def update_epsilon_value(self):
        self.epsilon = self.epsilon * self.epsilon_decay if self.epsilon > self.epsilon_min else self.epsilon_min

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*, warstwa wejściowa powinna mieć tyle neuronów ile jest możlliwych stanów, warstwa wyjściowa tyle neuronów ile jest możliwych akcji do wykonania:

In [74]:
class DQN(nn.Module):
    def __init__(self, lr, state_shape, n_actions, fc1, fc2):
        super(DQN, self).__init__()

        self.fc1 = nn.Linear(state_shape, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.output = nn.Linear(fc2, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0')
        self.to(self.device)

    def forward(self, state):
        state = F.relu(self.fc1(state))
        state = F.relu(self.fc2(state))
        actions = self.output(state)

        return actions

In [88]:
env = frozenLake("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.0005

In [89]:
model = DQN(learning_rate, state_size, action_size, 32, 32)
# model.double()
# model = get_model((state_size,), action_size, 128, 64)

 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [90]:
agent = DQNAgent(action_size, state_size, learning_rate, model)
agent.epsilon = 0.75

In [91]:
def check_action_values(agent: DQNAgent, state_size: int):
    rets = []
    for i_state in range(state_size):
        state = np.zeros(state_size, dtype=np.float32)
        state[i_state] = 1
        state = T.tensor(state).to(agent.q.device)
        rets.append(agent.q(state).cpu().detach().numpy())
    rets = np.array(rets).reshape(state_size, 4)
    with np.printoptions(precision=4, suppress=True):
        print(rets)

In [92]:
check_action_values(agent, state_size)

[[ 0.0243  0.1178 -0.0756 -0.1235]
 [-0.0286  0.1111 -0.0681 -0.1757]
 [ 0.0184  0.1208 -0.0533 -0.154 ]
 [ 0.0361  0.1675 -0.0415 -0.1773]
 [ 0.003   0.1362 -0.0257 -0.1524]
 [ 0.0341  0.1424 -0.0333 -0.1241]
 [-0.0403  0.098  -0.052  -0.1293]
 [ 0.0523  0.1125 -0.0596 -0.1284]
 [ 0.0101  0.1028 -0.0387 -0.1395]
 [ 0.0536  0.086  -0.0528 -0.1125]
 [-0.0066  0.1063 -0.0403 -0.1292]
 [ 0.003   0.0911 -0.0482 -0.1351]
 [ 0.0449  0.1388 -0.0712 -0.1671]
 [ 0.041   0.0864 -0.003  -0.1109]
 [-0.0262  0.105  -0.0238 -0.1478]
 [ 0.0601  0.139  -0.0391 -0.1026]]


In [93]:
done = False
batch_size = 512
EPISODES = 60
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in tqdm(range(100), desc=f'Epoch: {e}'):
        total_reward = 0
        i_state = env.reset()
    
        state = np.zeros(state_size, dtype=np.float32)
        state[i_state] = 1
        
        for time in range(1000):
            action = agent.get_action(state)
            _i_state, reward, done, _ = env.step(action)
            total_reward += reward

            _state = np.zeros(state_size, dtype=np.float32)
            _state[_i_state] = 1

            # if np.allclose(state, _state):
            #     reward = -1
            #
            # if done and not reward:
            #     reward = -1

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            
            state = _state
            if done:
                break

        summary.append(total_reward)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.update_epsilon_value()
    if np.mean(summary) > 0.2:
        break

    if np.mean(summary) > 0.9:
        print ("You Win!")
        break


Epoch: 0: 100%|██████████| 100/100 [00:00<00:00, 238.36it/s]


epoch #0	mean reward = 0.040	epsilon = 0.750


Epoch: 1: 100%|██████████| 100/100 [00:01<00:00, 64.50it/s]


epoch #1	mean reward = 0.010	epsilon = 0.712


Epoch: 2: 100%|██████████| 100/100 [00:01<00:00, 61.75it/s]


epoch #2	mean reward = 0.000	epsilon = 0.677


Epoch: 3: 100%|██████████| 100/100 [00:02<00:00, 48.09it/s]


epoch #3	mean reward = 0.000	epsilon = 0.643


Epoch: 4: 100%|██████████| 100/100 [00:01<00:00, 65.62it/s]


epoch #4	mean reward = 0.020	epsilon = 0.611


Epoch: 5: 100%|██████████| 100/100 [00:02<00:00, 45.84it/s]


epoch #5	mean reward = 0.020	epsilon = 0.580


Epoch: 6: 100%|██████████| 100/100 [00:02<00:00, 44.37it/s]


epoch #6	mean reward = 0.020	epsilon = 0.551


Epoch: 7: 100%|██████████| 100/100 [00:02<00:00, 43.53it/s]


epoch #7	mean reward = 0.020	epsilon = 0.524


Epoch: 8: 100%|██████████| 100/100 [00:01<00:00, 56.88it/s]


epoch #8	mean reward = 0.030	epsilon = 0.498


Epoch: 9: 100%|██████████| 100/100 [00:02<00:00, 48.58it/s]


epoch #9	mean reward = 0.000	epsilon = 0.473


Epoch: 10: 100%|██████████| 100/100 [00:02<00:00, 39.83it/s]


epoch #10	mean reward = 0.020	epsilon = 0.449


Epoch: 11: 100%|██████████| 100/100 [00:02<00:00, 40.96it/s]


epoch #11	mean reward = 0.010	epsilon = 0.427


Epoch: 12: 100%|██████████| 100/100 [00:03<00:00, 30.75it/s]


epoch #12	mean reward = 0.020	epsilon = 0.405


Epoch: 13: 100%|██████████| 100/100 [00:01<00:00, 64.81it/s]


epoch #13	mean reward = 0.010	epsilon = 0.385


Epoch: 14: 100%|██████████| 100/100 [00:02<00:00, 33.96it/s]


epoch #14	mean reward = 0.020	epsilon = 0.366


Epoch: 15: 100%|██████████| 100/100 [00:02<00:00, 44.13it/s]


epoch #15	mean reward = 0.010	epsilon = 0.347


Epoch: 16: 100%|██████████| 100/100 [00:02<00:00, 46.81it/s]


epoch #16	mean reward = 0.030	epsilon = 0.330


Epoch: 17: 100%|██████████| 100/100 [00:03<00:00, 25.81it/s]


epoch #17	mean reward = 0.010	epsilon = 0.314


Epoch: 18: 100%|██████████| 100/100 [00:01<00:00, 75.36it/s]


epoch #18	mean reward = 0.010	epsilon = 0.298


Epoch: 19: 100%|██████████| 100/100 [00:04<00:00, 23.98it/s]


epoch #19	mean reward = 0.000	epsilon = 0.283


Epoch: 20: 100%|██████████| 100/100 [00:03<00:00, 26.96it/s]


epoch #20	mean reward = 0.010	epsilon = 0.269


Epoch: 21: 100%|██████████| 100/100 [00:02<00:00, 43.41it/s]


epoch #21	mean reward = 0.010	epsilon = 0.255


Epoch: 22: 100%|██████████| 100/100 [00:07<00:00, 13.79it/s]


epoch #22	mean reward = 0.020	epsilon = 0.243


Epoch: 23: 100%|██████████| 100/100 [00:01<00:00, 87.37it/s]


epoch #23	mean reward = 0.010	epsilon = 0.231


Epoch: 24: 100%|██████████| 100/100 [00:03<00:00, 30.75it/s]


epoch #24	mean reward = 0.020	epsilon = 0.219


Epoch: 25: 100%|██████████| 100/100 [00:01<00:00, 67.75it/s]


epoch #25	mean reward = 0.020	epsilon = 0.208


Epoch: 26: 100%|██████████| 100/100 [00:04<00:00, 21.62it/s]


epoch #26	mean reward = 0.000	epsilon = 0.198


Epoch: 27: 100%|██████████| 100/100 [00:02<00:00, 40.80it/s]


epoch #27	mean reward = 0.030	epsilon = 0.188


Epoch: 28: 100%|██████████| 100/100 [00:03<00:00, 30.57it/s]


epoch #28	mean reward = 0.000	epsilon = 0.178


Epoch: 29: 100%|██████████| 100/100 [00:01<00:00, 74.54it/s]


epoch #29	mean reward = 0.030	epsilon = 0.169


Epoch: 30: 100%|██████████| 100/100 [00:03<00:00, 27.30it/s]


epoch #30	mean reward = 0.000	epsilon = 0.161


Epoch: 31: 100%|██████████| 100/100 [00:01<00:00, 87.18it/s]


epoch #31	mean reward = 0.010	epsilon = 0.153


Epoch: 32: 100%|██████████| 100/100 [00:07<00:00, 12.94it/s]


epoch #32	mean reward = 0.010	epsilon = 0.145


Epoch: 33: 100%|██████████| 100/100 [00:01<00:00, 86.96it/s]


epoch #33	mean reward = 0.010	epsilon = 0.138


Epoch: 34: 100%|██████████| 100/100 [00:04<00:00, 22.83it/s]


epoch #34	mean reward = 0.030	epsilon = 0.131


Epoch: 35: 100%|██████████| 100/100 [00:00<00:00, 104.88it/s]


epoch #35	mean reward = 0.000	epsilon = 0.125


Epoch: 36: 100%|██████████| 100/100 [00:05<00:00, 18.70it/s]


epoch #36	mean reward = 0.010	epsilon = 0.118


Epoch: 37: 100%|██████████| 100/100 [00:00<00:00, 107.99it/s]


epoch #37	mean reward = 0.000	epsilon = 0.112


Epoch: 38: 100%|██████████| 100/100 [00:01<00:00, 85.43it/s]


epoch #38	mean reward = 0.000	epsilon = 0.107


Epoch: 39: 100%|██████████| 100/100 [00:04<00:00, 20.82it/s]


epoch #39	mean reward = 0.000	epsilon = 0.101


Epoch: 40: 100%|██████████| 100/100 [00:01<00:00, 86.17it/s]


epoch #40	mean reward = 0.010	epsilon = 0.096


Epoch: 41: 100%|██████████| 100/100 [00:05<00:00, 17.78it/s]


epoch #41	mean reward = 0.000	epsilon = 0.092


Epoch: 42: 100%|██████████| 100/100 [00:03<00:00, 31.86it/s]


epoch #42	mean reward = 0.000	epsilon = 0.087


Epoch: 43: 100%|██████████| 100/100 [00:05<00:00, 19.08it/s]


epoch #43	mean reward = 0.000	epsilon = 0.083


Epoch: 44: 100%|██████████| 100/100 [00:03<00:00, 27.80it/s]


epoch #44	mean reward = 0.010	epsilon = 0.079


Epoch: 45: 100%|██████████| 100/100 [00:00<00:00, 105.21it/s]


epoch #45	mean reward = 0.000	epsilon = 0.075


Epoch: 46: 100%|██████████| 100/100 [00:00<00:00, 103.73it/s]


epoch #46	mean reward = 0.000	epsilon = 0.071


Epoch: 47: 100%|██████████| 100/100 [00:03<00:00, 31.59it/s]


epoch #47	mean reward = 0.010	epsilon = 0.067


Epoch: 48: 100%|██████████| 100/100 [00:01<00:00, 64.39it/s]


epoch #48	mean reward = 0.020	epsilon = 0.064


Epoch: 49: 100%|██████████| 100/100 [00:04<00:00, 22.46it/s]


epoch #49	mean reward = 0.000	epsilon = 0.061


Epoch: 50: 100%|██████████| 100/100 [00:01<00:00, 91.16it/s]


epoch #50	mean reward = 0.010	epsilon = 0.058


Epoch: 51: 100%|██████████| 100/100 [00:16<00:00,  5.97it/s]


epoch #51	mean reward = 0.010	epsilon = 0.055


Epoch: 52: 100%|██████████| 100/100 [00:05<00:00, 17.47it/s]


epoch #52	mean reward = 0.000	epsilon = 0.052


Epoch: 53: 100%|██████████| 100/100 [00:01<00:00, 68.82it/s]


epoch #53	mean reward = 0.010	epsilon = 0.049


Epoch: 54: 100%|██████████| 100/100 [00:03<00:00, 27.93it/s]


epoch #54	mean reward = 0.000	epsilon = 0.047


Epoch: 55: 100%|██████████| 100/100 [00:01<00:00, 71.51it/s]


epoch #55	mean reward = 0.010	epsilon = 0.045


Epoch: 56: 100%|██████████| 100/100 [00:02<00:00, 38.57it/s]


epoch #56	mean reward = 0.000	epsilon = 0.042


Epoch: 57: 100%|██████████| 100/100 [00:05<00:00, 18.39it/s]


epoch #57	mean reward = 0.000	epsilon = 0.040


Epoch: 58: 100%|██████████| 100/100 [00:04<00:00, 21.17it/s]


epoch #58	mean reward = 0.010	epsilon = 0.038


Epoch: 59: 100%|██████████| 100/100 [00:04<00:00, 24.84it/s]

epoch #59	mean reward = 0.000	epsilon = 0.036





In [94]:
check_action_values(agent, state_size)

[[ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]]


In [95]:
agent.memory.sample_buffer(1)

(array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       dtype=float32),
 array([0]),
 array([0.], dtype=float32),
 array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
       dtype=float32),
 array([ True]))

In [96]:
check_action_values(agent, state_size)

[[ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]
 [ 0.0363  0.2579 -0.3929  0.184 ]]


In [48]:
def test_model(agent: DQNAgent, env):
    eps = agent.epsilon
    agent.epsilon = 0
    rewards = []
    for _ in range(10):
        total_reward = 0
        i_state = env.reset()

        state = np.zeros(state_size, dtype=np.float32)
        state[i_state] = 1
        actions = []
        for time in range(1000):
            action = agent.get_action(state)
            actions.append(action)
            _i_state, reward, done, _ = env.step(action)
            total_reward += reward

            _state = np.zeros(state_size, dtype=np.float32)
            _state[_i_state] = 1


            state = _state
            if done:
                break
        print(actions)
        rewards.append(total_reward)

    model.epsilon = eps
    return rewards

In [49]:
test_model(agent, env)

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [None]:
def test_model(model: DQNAgent, env):
    eps = model.epsilon
    model.epsilon = 0
    rewards = []
    for _ in range(10):
        total_reward = 0
        i_state = env.reset()

        state = np.zeros(state_size, dtype=np.float32)
        state[i_state] = 1

        for time in range(1000):
            action = agent.get_action(state)
            _i_state, reward, done, _ = env.step(action)
            total_reward += reward

            _state = np.zeros(state_size, dtype=np.float32)
            _state[_i_state] = 1

            state = _state
            if done:
                break

    model.epsilon = eps
    return rewards

In [37]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

state = env.reset()
state = np.array(state)

model = DQN(learning_rate, np.prod(state.shape), action_size, 128, 64)

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [38]:
agent = DQNAgent(action_size, state.shape, learning_rate, model)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        state = env.reset()
        state = np.array(state)

        # state = np.zeros(state_size)
        # state[i_state] = 1
        
        for time in range(1000):
            action = agent.get_action(state)
            _state, reward, done, _ = env.step(action)
            _state = np.array(_state)
            total_reward += reward

            # _state = np.zeros(state_size)
            # _state[_i_state] = 1

            agent.remember(state, action, reward, _state, done)
            agent.learn(batch_size)
            state = _state
            if done:
                break

        agent.update_epsilon_value()
        summary.append(total_reward)

    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))

    if np.mean(summary) > 0.9:
        print ("You Win!")
        break

TypeError: linear(): argument 'input' (position 1) must be Tensor, not Flatten

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [None]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001


Czas nauczyć agenta gry w środowisku *CartPool*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        i_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        
        for time in range(300):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        
        summary.append(total_reward)
    if np.mean(summary) > 195:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))