# Laboratorium 5 (4 pkt)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [1]:
from collections import deque
import gym
import numpy as np
import random
from tqdm import tqdm

Dołączenie bibliotek do obsługi sieci neuronowych

In [2]:
# import tensorflow as tf
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras import Sequential
# import numpy as np

In [3]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Zadanie 1 - Double Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Double Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
       Q^*(s, a) \approx r + \gamma argmax_{a'}Q_\theta'(s', a') 
\end{equation}
a wagi pomiędzy sieciami wymieniane są co dziesięć aktualizacji wag sieci sterującej poczynaniami agenta ($Q$).
</p>

In [4]:
class ReplayBuffer(object):
    def __init__(self, mem_size, state_shape):
        self.mem_size = mem_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, state_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, _state, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = _state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        _states = self.new_state_memory[batch]
        done = self.terminal_memory[batch]

        return states, actions, rewards, _states, done

In [5]:
class DQN(nn.Module):
    def __init__(self, lr, state_shape, n_actions, fc1, fc2):
        super(DQN, self).__init__()

        self.fc1 = nn.Linear(state_shape, fc1)
        self.fc2 = nn.Linear(fc1, fc2)
        self.output = nn.Linear(fc2, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0')
        self.to(self.device)

    def forward(self, state):
        state = F.relu(self.fc1(state))
        state = F.relu(self.fc2(state))
        actions = self.output(state)

        return actions

In [6]:
class DDQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayBuffer(1000000, state_size)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.5  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.0005
        self.alpha = 0.001
        self.q = DQN(learning_rate, state_size, action_size, 64, 64)
        self.q_target = DQN(learning_rate, state_size, action_size, 64, 64)
        self.update_weights()
        self.evaluate=False

    # def _build_model(self):
    #     model = Sequential()
    #     model.add(Dense(128, input_shape=(self.state_size,), activation='relu'))
    #     model.add(Dense(128, activation='relu'))
    #     model.add(Dense(self.action_size, activation=None))
    #     model.compile(optimizer=Adam(learning_rate=self.alpha), loss='mse')
    #
    #     return model

    def remember(self, state, action, reward, _state, done):
        self.memory.store_transition(state, action, reward, _state, done)

    def get_action(self, state):
        if np.random.random() <= self.epsilon and not self.evaluate:
            action = np.random.choice(self.action_size)
        else:
            state = T.tensor(state).to(self.q.device)
            actions = self.q.forward(state)
            action = T.argmax(actions).item()

        return action


    def get_best_action(self, state):
        state = T.tensor(state).to(self.q.device)
        actions = self.q.forward(state)
        action = T.argmax(actions).item()

        return action

    def replay(self, batch_size):
        """
        Function learn network using randomly selected actions from the memory.
        First calculates Q value for the next state and choose action with the biggest value.
        Target value is calculated according to:
                Q(s,a) := (r + gamma * max_a(Q(s', a)))
        except the situation when the next action is the last action, in such case Q(s, a) := r.
        In order to change only those weights responsible for chosing given action, the rest values should be those
        returned by the network for state state.
        The network should be trained on batch_size samples.
        After each 10 Q Network trainings parameters should be copied to the target Q Network
        """
        if self.memory.mem_cntr < batch_size:
            return

        states, actions, rewards, _states, done = self.memory.sample_buffer(batch_size)

        states = T.tensor(states).to(self.q.device)
        _states = T.tensor(_states).to(self.q_target.device)

        q_pred = self.q(states)
        q_next = self.q_target(_states).cpu().detach().numpy()
        q_target = q_pred.cpu().detach().numpy()

        max_actions = np.argmax(q_target, axis=1)

        batch_index = np.arange(batch_size, dtype=np.int32)

        q_target[batch_index, actions] = rewards + self.gamma * q_next[batch_index, max_actions.astype(int)] * (1-done)
        q_target = T.tensor(q_target).to(self.q.device)

        loss = self.q.loss(q_pred, q_target).to(self.q.device)
        loss.backward()
        self.q.optimizer.step()

        if not self.memory.mem_cntr % 10:
            self.update_weights()

    def update_epsilon_value(self):
        self.epsilon = self.epsilon - self.epsilon_decay if self.epsilon > self.epsilon_min else self.epsilon_min

    def update_weights(self):
        self.q_target.load_state_dict(self.q.state_dict())


Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [7]:
env = gym.make("CartPole-v0")
state_size = env.observation_space.sample().shape[0]
action_size = env.action_space.n
learning_rate = 0.001

Czas nauczyć agenta gry w środowisku *CartPool*:

In [9]:
agent = DDQNAgent(state_size, action_size)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in tqdm(range(100), desc=f'Epoch: {e}'):
        total_reward = 0
        state = env.reset()
        state = np.array(state, dtype=np.float32)

        for time in range(500):
            action = agent.get_action(state)
            _state, reward, done, _ = env.step(action)
            _state = np.array(_state, dtype=np.float32)
            total_reward += reward

            agent.remember(state, action, reward, _state, done)
            agent.replay(batch_size)
            state = _state
            if done:
                break

        agent.update_epsilon_value()
        summary.append(total_reward)
        
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))    
    
    if np.mean(summary) > 195:
        print ("You Win!")
        break


Epoch: 0: 100%|██████████| 100/100 [00:05<00:00, 19.08it/s]


epoch #0	mean reward = 19.080	epsilon = 0.700


Epoch: 1: 100%|██████████| 100/100 [00:04<00:00, 21.56it/s]


epoch #1	mean reward = 15.850	epsilon = 0.650


Epoch: 2: 100%|██████████| 100/100 [00:04<00:00, 20.74it/s]


epoch #2	mean reward = 17.410	epsilon = 0.600


Epoch: 3: 100%|██████████| 100/100 [00:04<00:00, 24.28it/s]


epoch #3	mean reward = 14.760	epsilon = 0.550


Epoch: 4: 100%|██████████| 100/100 [00:04<00:00, 24.42it/s]


epoch #4	mean reward = 14.460	epsilon = 0.500


Epoch: 5: 100%|██████████| 100/100 [00:03<00:00, 25.27it/s]


epoch #5	mean reward = 13.940	epsilon = 0.450


Epoch: 6: 100%|██████████| 100/100 [00:03<00:00, 26.19it/s]


epoch #6	mean reward = 12.820	epsilon = 0.400


Epoch: 7: 100%|██████████| 100/100 [00:03<00:00, 27.31it/s]


epoch #7	mean reward = 12.230	epsilon = 0.350


Epoch: 8: 100%|██████████| 100/100 [00:03<00:00, 27.11it/s]


epoch #8	mean reward = 11.740	epsilon = 0.300


Epoch: 9: 100%|██████████| 100/100 [00:03<00:00, 29.14it/s]


epoch #9	mean reward = 11.110	epsilon = 0.250


Epoch: 10: 100%|██████████| 100/100 [00:03<00:00, 31.36it/s]


epoch #10	mean reward = 10.560	epsilon = 0.200


Epoch: 11: 100%|██████████| 100/100 [00:03<00:00, 31.91it/s]


epoch #11	mean reward = 10.120	epsilon = 0.150


Epoch: 12: 100%|██████████| 100/100 [00:03<00:00, 30.02it/s]


epoch #12	mean reward = 10.170	epsilon = 0.100


Epoch: 13: 100%|██████████| 100/100 [00:03<00:00, 32.06it/s]


epoch #13	mean reward = 9.840	epsilon = 0.050


Epoch: 14: 100%|██████████| 100/100 [00:03<00:00, 31.85it/s]


epoch #14	mean reward = 9.610	epsilon = 0.010


Epoch: 15: 100%|██████████| 100/100 [00:03<00:00, 30.72it/s]


epoch #15	mean reward = 9.290	epsilon = 0.010


Epoch: 16: 100%|██████████| 100/100 [00:03<00:00, 27.02it/s]


epoch #16	mean reward = 9.350	epsilon = 0.010


Epoch: 17: 100%|██████████| 100/100 [00:03<00:00, 28.03it/s]


epoch #17	mean reward = 9.410	epsilon = 0.010


Epoch: 18: 100%|██████████| 100/100 [00:03<00:00, 32.86it/s]


epoch #18	mean reward = 9.400	epsilon = 0.010


Epoch: 19: 100%|██████████| 100/100 [00:02<00:00, 33.98it/s]


epoch #19	mean reward = 9.330	epsilon = 0.010


Epoch: 20: 100%|██████████| 100/100 [00:03<00:00, 33.18it/s]


epoch #20	mean reward = 9.340	epsilon = 0.010


Epoch: 21: 100%|██████████| 100/100 [00:02<00:00, 34.72it/s]


epoch #21	mean reward = 9.290	epsilon = 0.010


Epoch: 22: 100%|██████████| 100/100 [00:02<00:00, 34.44it/s]


epoch #22	mean reward = 9.290	epsilon = 0.010


Epoch: 23: 100%|██████████| 100/100 [00:02<00:00, 33.64it/s]


epoch #23	mean reward = 9.440	epsilon = 0.010


Epoch: 24: 100%|██████████| 100/100 [00:02<00:00, 34.10it/s]


epoch #24	mean reward = 9.390	epsilon = 0.010


Epoch: 25: 100%|██████████| 100/100 [00:03<00:00, 33.23it/s]


epoch #25	mean reward = 9.480	epsilon = 0.010


Epoch: 26: 100%|██████████| 100/100 [00:03<00:00, 32.87it/s]


epoch #26	mean reward = 9.310	epsilon = 0.010


Epoch: 27: 100%|██████████| 100/100 [00:03<00:00, 26.97it/s]


epoch #27	mean reward = 9.430	epsilon = 0.010


Epoch: 28: 100%|██████████| 100/100 [00:03<00:00, 31.45it/s]


epoch #28	mean reward = 9.320	epsilon = 0.010


Epoch: 29: 100%|██████████| 100/100 [00:02<00:00, 33.53it/s]


epoch #29	mean reward = 9.340	epsilon = 0.010


Epoch: 30: 100%|██████████| 100/100 [00:03<00:00, 31.75it/s]


epoch #30	mean reward = 9.320	epsilon = 0.010


Epoch: 31: 100%|██████████| 100/100 [00:03<00:00, 33.23it/s]


epoch #31	mean reward = 9.420	epsilon = 0.010


Epoch: 32: 100%|██████████| 100/100 [00:02<00:00, 33.64it/s]


epoch #32	mean reward = 9.400	epsilon = 0.010


Epoch: 33: 100%|██████████| 100/100 [00:02<00:00, 34.73it/s]


epoch #33	mean reward = 9.370	epsilon = 0.010


Epoch: 34: 100%|██████████| 100/100 [00:02<00:00, 33.90it/s]


epoch #34	mean reward = 9.440	epsilon = 0.010


Epoch: 35: 100%|██████████| 100/100 [00:02<00:00, 34.34it/s]


epoch #35	mean reward = 9.520	epsilon = 0.010


Epoch: 36: 100%|██████████| 100/100 [00:02<00:00, 34.17it/s]


epoch #36	mean reward = 9.380	epsilon = 0.010


Epoch: 37: 100%|██████████| 100/100 [00:02<00:00, 34.58it/s]


epoch #37	mean reward = 9.370	epsilon = 0.010


Epoch: 38: 100%|██████████| 100/100 [00:02<00:00, 34.96it/s]


epoch #38	mean reward = 9.310	epsilon = 0.010


Epoch: 39: 100%|██████████| 100/100 [00:02<00:00, 34.51it/s]


epoch #39	mean reward = 9.400	epsilon = 0.010


Epoch: 40: 100%|██████████| 100/100 [00:02<00:00, 34.07it/s]


epoch #40	mean reward = 9.510	epsilon = 0.010


Epoch: 41: 100%|██████████| 100/100 [00:02<00:00, 33.59it/s]


epoch #41	mean reward = 9.430	epsilon = 0.010


Epoch: 42: 100%|██████████| 100/100 [00:03<00:00, 32.57it/s]


epoch #42	mean reward = 9.280	epsilon = 0.010


Epoch: 43: 100%|██████████| 100/100 [00:02<00:00, 33.90it/s]


epoch #43	mean reward = 9.340	epsilon = 0.010


Epoch: 44: 100%|██████████| 100/100 [00:02<00:00, 34.14it/s]


epoch #44	mean reward = 9.460	epsilon = 0.010


Epoch: 45: 100%|██████████| 100/100 [00:02<00:00, 34.47it/s]


epoch #45	mean reward = 9.480	epsilon = 0.010


Epoch: 46: 100%|██████████| 100/100 [00:03<00:00, 33.06it/s]


epoch #46	mean reward = 9.360	epsilon = 0.010


Epoch: 47: 100%|██████████| 100/100 [00:03<00:00, 31.47it/s]


epoch #47	mean reward = 9.440	epsilon = 0.010


Epoch: 48: 100%|██████████| 100/100 [00:02<00:00, 35.57it/s]


epoch #48	mean reward = 9.220	epsilon = 0.010


Epoch: 49: 100%|██████████| 100/100 [00:02<00:00, 35.15it/s]


epoch #49	mean reward = 9.350	epsilon = 0.010


Epoch: 50: 100%|██████████| 100/100 [00:02<00:00, 34.66it/s]


epoch #50	mean reward = 9.500	epsilon = 0.010


Epoch: 51: 100%|██████████| 100/100 [00:02<00:00, 34.40it/s]


epoch #51	mean reward = 9.420	epsilon = 0.010


Epoch: 52: 100%|██████████| 100/100 [00:02<00:00, 35.16it/s]


epoch #52	mean reward = 9.420	epsilon = 0.010


Epoch: 53: 100%|██████████| 100/100 [00:02<00:00, 34.46it/s]


epoch #53	mean reward = 9.490	epsilon = 0.010


Epoch: 54: 100%|██████████| 100/100 [00:02<00:00, 34.85it/s]


epoch #54	mean reward = 9.400	epsilon = 0.010


Epoch: 55: 100%|██████████| 100/100 [00:02<00:00, 34.25it/s]


epoch #55	mean reward = 9.520	epsilon = 0.010


Epoch: 56: 100%|██████████| 100/100 [00:02<00:00, 33.46it/s]


epoch #56	mean reward = 9.460	epsilon = 0.010


Epoch: 57: 100%|██████████| 100/100 [00:03<00:00, 31.32it/s]


epoch #57	mean reward = 9.600	epsilon = 0.010


Epoch: 58: 100%|██████████| 100/100 [00:03<00:00, 33.23it/s]


epoch #58	mean reward = 9.530	epsilon = 0.010


Epoch: 59: 100%|██████████| 100/100 [00:02<00:00, 34.98it/s]


epoch #59	mean reward = 9.400	epsilon = 0.010


Epoch: 60: 100%|██████████| 100/100 [00:02<00:00, 34.76it/s]


epoch #60	mean reward = 9.460	epsilon = 0.010


Epoch: 61: 100%|██████████| 100/100 [00:02<00:00, 35.80it/s]


epoch #61	mean reward = 9.320	epsilon = 0.010


Epoch: 62: 100%|██████████| 100/100 [00:02<00:00, 35.32it/s]


epoch #62	mean reward = 9.320	epsilon = 0.010


Epoch: 63: 100%|██████████| 100/100 [00:02<00:00, 35.09it/s]


epoch #63	mean reward = 9.440	epsilon = 0.010


Epoch: 64: 100%|██████████| 100/100 [00:02<00:00, 34.54it/s]


epoch #64	mean reward = 9.390	epsilon = 0.010


Epoch: 65: 100%|██████████| 100/100 [00:02<00:00, 34.21it/s]


epoch #65	mean reward = 9.500	epsilon = 0.010


Epoch: 66: 100%|██████████| 100/100 [00:02<00:00, 35.84it/s]


epoch #66	mean reward = 9.320	epsilon = 0.010


Epoch: 67: 100%|██████████| 100/100 [00:03<00:00, 32.04it/s]


epoch #67	mean reward = 9.470	epsilon = 0.010


Epoch: 68: 100%|██████████| 100/100 [00:03<00:00, 32.63it/s]


epoch #68	mean reward = 9.480	epsilon = 0.010


Epoch: 69: 100%|██████████| 100/100 [00:02<00:00, 35.20it/s]


epoch #69	mean reward = 9.390	epsilon = 0.010


Epoch: 70: 100%|██████████| 100/100 [00:02<00:00, 33.65it/s]


epoch #70	mean reward = 9.470	epsilon = 0.010


Epoch: 71: 100%|██████████| 100/100 [00:02<00:00, 34.29it/s]


epoch #71	mean reward = 9.430	epsilon = 0.010


Epoch: 72: 100%|██████████| 100/100 [00:02<00:00, 35.19it/s]


epoch #72	mean reward = 9.230	epsilon = 0.010


Epoch: 73: 100%|██████████| 100/100 [00:02<00:00, 34.86it/s]


epoch #73	mean reward = 9.360	epsilon = 0.010


Epoch: 74: 100%|██████████| 100/100 [00:02<00:00, 34.27it/s]


epoch #74	mean reward = 9.440	epsilon = 0.010


Epoch: 75: 100%|██████████| 100/100 [00:02<00:00, 34.04it/s]


epoch #75	mean reward = 9.460	epsilon = 0.010


Epoch: 76: 100%|██████████| 100/100 [00:02<00:00, 34.81it/s]


epoch #76	mean reward = 9.440	epsilon = 0.010


Epoch: 77: 100%|██████████| 100/100 [00:03<00:00, 31.54it/s]


epoch #77	mean reward = 9.390	epsilon = 0.010


Epoch: 78: 100%|██████████| 100/100 [00:03<00:00, 30.53it/s]


epoch #78	mean reward = 9.610	epsilon = 0.010


Epoch: 79: 100%|██████████| 100/100 [00:03<00:00, 31.88it/s]


epoch #79	mean reward = 9.570	epsilon = 0.010


Epoch: 80: 100%|██████████| 100/100 [00:03<00:00, 32.95it/s]


epoch #80	mean reward = 9.420	epsilon = 0.010


Epoch: 81: 100%|██████████| 100/100 [00:02<00:00, 33.53it/s]


epoch #81	mean reward = 9.380	epsilon = 0.010


Epoch: 82: 100%|██████████| 100/100 [00:03<00:00, 31.75it/s]


epoch #82	mean reward = 9.630	epsilon = 0.010


Epoch: 83: 100%|██████████| 100/100 [00:02<00:00, 33.50it/s]


epoch #83	mean reward = 9.260	epsilon = 0.010


Epoch: 84: 100%|██████████| 100/100 [00:03<00:00, 32.18it/s]


epoch #84	mean reward = 9.290	epsilon = 0.010


Epoch: 85: 100%|██████████| 100/100 [00:02<00:00, 33.54it/s]


epoch #85	mean reward = 9.330	epsilon = 0.010


Epoch: 86: 100%|██████████| 100/100 [00:03<00:00, 30.64it/s]


epoch #86	mean reward = 9.510	epsilon = 0.010


Epoch: 87: 100%|██████████| 100/100 [00:03<00:00, 30.69it/s]


epoch #87	mean reward = 9.420	epsilon = 0.010


Epoch: 88: 100%|██████████| 100/100 [00:03<00:00, 32.77it/s]


epoch #88	mean reward = 9.430	epsilon = 0.010


Epoch: 89: 100%|██████████| 100/100 [00:03<00:00, 32.44it/s]


epoch #89	mean reward = 9.470	epsilon = 0.010


Epoch: 90: 100%|██████████| 100/100 [00:03<00:00, 31.71it/s]


epoch #90	mean reward = 9.370	epsilon = 0.010


Epoch: 91: 100%|██████████| 100/100 [00:03<00:00, 32.01it/s]


epoch #91	mean reward = 9.470	epsilon = 0.010


Epoch: 92:  20%|██        | 20/100 [00:00<00:02, 34.16it/s]


KeyboardInterrupt: 