# Laboratorium 4

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [4]:
from collections import deque
import gym
import numpy as np
import random
import time as tm

Dołączenie bibliotek ze środowiskami:

In [5]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLakeExtended
from gym.envs.classic_control import CartPoleEnv


Dołączenie bibliotek do obsługi sieci neuronowych

In [6]:
from keras import Model
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical

Using TensorFlow backend.


## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \text{max}_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [7]:
class DQNAgent:
    def __init__(self, action_size, learning_rate, model: Model, get_legal_actions=None, env=None):
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = learning_rate
        self.model = model
        self.get_legal_actions = get_legal_actions
        self.env = env

    def remember(self, state, action, reward, next_state, done):
        # Function adds information to the memory about last action and its results
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, we should take a random action.
            otherwise - the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probablity, generate uniform number in [0, 1]
              and compare it with your probability
        """

        #
        # INSERT CODE HERE to get action in a given state (according to epsilon greedy algorithm)
        #

        epsilon = self.epsilon

        # Pick Action
        if isinstance(env, CartPoleEnv):
            if np.random.random() < epsilon:
                return self.env.action_space.sample()
            else:
                return np.argmax(self.model.predict(state)[0])
        else:
            possible_actions = self.get_legal_actions(state)
            if len(possible_actions) == 0:
                return None
            best_action = self.get_best_action(state)
            chosen_action = best_action

            if random.uniform(0, 1) < epsilon:
                random_actions = possible_actions.copy()
                random_actions.remove(best_action)
                chosen_action = random.choice(random_actions if random_actions else [best_action])
            return chosen_action

    def get_best_action(self, state):
        """
        Compute the best action to take in a state (using current q-values).
        """
        if isinstance(env, CartPoleEnv):
            possible_actions = self.env.action_space
        else:
            possible_actions = self.get_legal_actions(state)
            if len(possible_actions) == 0:
                return None

        return np.argmax(self.model.predict(state))

    def lower_epsilon(self):
        new_epsilon = self.epsilon * self.epsilon_decay
        if new_epsilon >= self.epsilon_min:
            self.epsilon = new_epsilon

    def replay(self, batch_size):
        """
        Function learn network using randomly selected actions from the memory.
        First calculates Q value for the next state and choose action with the biggest value.
        Target value is calculated according to:
                Q(s,a) := (r + gamma * max_a(Q(s', a)))
        except the situation when the next action is the last action, in such case Q(s, a) := r.
        In order to change only those weights responsible for chosing given action, the rest values should be those
        returned by the network for state state.
        The network should be trained on batch_size samples.
        Also every time the function replay is called self.epsilon value should be updated according to equation:
        self.epsilon *= self.epsilon_decay
        """
        #
        # INSERT CODE HERE to train network
        #

        if len(self.memory) < batch_size:
            return

        info_sets = random.sample(self.memory, batch_size)
        states_list = []
        targets_list = []
        for info_set in info_sets:
            state, action, reward, next_state, done = info_set
            states_list.append(state.flatten())
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                Q_future = max(self.model.predict(next_state)[0])
                target[0][action] = reward + Q_future * self.gamma
            targets_list.append(target.flatten())

        states_array = np.array(states_list)
        targets_array = np.array(targets_list)

        self.model.train_on_batch(states_array, targets_array)
        self.lower_epsilon()


Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*, warstwa wejściowa powinna mieć tyle neuronów ile jest możlliwych stanów, warstwa wyjściowa tyle neuronów ile jest możliwych akcji do wykonania:

In [8]:
env = frozenLake("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

model = Sequential()
model.add(Dense(16, input_dim=state_size, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(action_size))  # wyjście
model.compile(loss="mean_squared_error",
              optimizer=Adam(lr=learning_rate))

 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:

In [9]:
agent = DQNAgent(action_size, learning_rate, model, get_legal_actions=env.get_possible_actions)

done = False
batch_size = 64
EPISODES = 1000
counter = 0

for e in range(EPISODES):
    start = tm.time()
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()

        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([to_categorical(env_state, num_classes=state_size)])

        for time in range(500):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([to_categorical(next_state_env, num_classes=state_size)])

            # add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
            agent.replay(64)

        summary.append(total_reward)

    end = tm.time()
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}\ttime = {}".format(e, np.mean(summary), agent.epsilon,
                                                                                end - start))
    if np.mean(total_reward) > 0.9:
        print("You Win!")
        # Should win after 4 to 8 episodes
        break

epoch #0	mean reward = 0.030	epsilon = 0.913	time = 11.518755674362183
epoch #1	mean reward = 0.000	epsilon = 0.826	time = 10.277201890945435
epoch #2	mean reward = 0.000	epsilon = 0.747	time = 10.0339674949646
epoch #3	mean reward = 0.010	epsilon = 0.676	time = 9.654991626739502
epoch #4	mean reward = 0.040	epsilon = 0.612	time = 9.37389087677002
epoch #5	mean reward = 0.040	epsilon = 0.554	time = 9.524007797241211
epoch #6	mean reward = 0.080	epsilon = 0.501	time = 9.860939979553223
epoch #7	mean reward = 0.090	epsilon = 0.453	time = 9.535884141921997
You Win!


Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [7]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()*3
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

model = Sequential()
model.add(Dense(16, input_dim=state_size, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(action_size))  # wyjście
model.compile(loss="mean_squared_error",
              optimizer=Adam(lr=learning_rate))

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic:

In [8]:
agent = DQNAgent(action_size, learning_rate, model, get_legal_actions=env.get_possible_actions)

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    start = tm.time()
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()

        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([np.array(env_state).flatten()])

        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([np.array(next_state_env).flatten()])

            # add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
            agent.replay(64)

        summary.append(total_reward)

    end = tm.time()
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}\ttime = {}".format(e, np.mean(summary), agent.epsilon,
                                                                                end - start))
    if np.mean(total_reward) > 0.9:
        print("You Win!")
        break

epoch #0	mean reward = 0.010	epsilon = 0.905	time = 13.993134498596191
epoch #1	mean reward = 0.010	epsilon = 0.819	time = 9.429208517074585
epoch #2	mean reward = 0.010	epsilon = 0.741	time = 9.285630464553833
epoch #3	mean reward = 0.020	epsilon = 0.670	time = 10.799579381942749
epoch #4	mean reward = 0.010	epsilon = 0.606	time = 9.596582412719727
epoch #5	mean reward = 0.070	epsilon = 0.549	time = 9.35084867477417
epoch #6	mean reward = 0.190	epsilon = 0.496	time = 9.934166193008423
epoch #7	mean reward = 0.250	epsilon = 0.449	time = 9.886844158172607
epoch #8	mean reward = 0.240	epsilon = 0.406	time = 9.796828746795654
epoch #9	mean reward = 0.230	epsilon = 0.368	time = 9.335887670516968
epoch #10	mean reward = 0.340	epsilon = 0.333	time = 9.331056118011475
epoch #11	mean reward = 0.540	epsilon = 0.301	time = 9.226623058319092
You Win!


Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [9]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

model = Sequential()
model.add(Dense(16, input_dim=state_size, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(action_size))  # wyjście
model.compile(loss="mean_squared_error",
              optimizer=Adam(lr=learning_rate))

Czas nauczyć agenta gry w środowisku *CartPool*:

In [10]:
agent = DQNAgent(action_size, learning_rate, model, env=env)

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    start = tm.time()
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()

        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        state = np.array([np.array(env_state).flatten()])

        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #
            next_state = np.array([np.array(next_state_env).flatten()])

            # add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        if len(agent.memory) > batch_size:
            agent.replay(64)

        summary.append(total_reward)

    end = tm.time()
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}\ttime = {:.3f}".format(e, np.mean(summary), agent.epsilon,
                                                                                    end - start))
    if np.mean(total_reward) > 195:
        print("You Win!")
        break

epoch #0	mean reward = 25.930	epsilon = 0.908	time = 9.370
epoch #1	mean reward = 19.140	epsilon = 0.822	time = 9.508
epoch #2	mean reward = 15.750	epsilon = 0.744	time = 9.413
epoch #3	mean reward = 17.560	epsilon = 0.673	time = 9.604
epoch #4	mean reward = 32.870	epsilon = 0.609	time = 10.047
epoch #5	mean reward = 40.360	epsilon = 0.551	time = 10.731
epoch #6	mean reward = 49.210	epsilon = 0.498	time = 10.990
epoch #7	mean reward = 58.520	epsilon = 0.451	time = 11.813
epoch #8	mean reward = 88.360	epsilon = 0.408	time = 13.184
epoch #9	mean reward = 104.980	epsilon = 0.369	time = 14.313
epoch #10	mean reward = 150.770	epsilon = 0.334	time = 17.213
epoch #11	mean reward = 207.800	epsilon = 0.302	time = 20.636
You Win!
