# Monte Carlo

#### Prof. Armando Alves Neto - Introdução ao Aprendizado por Reforço - PPGEE/UFMG

<img src="cart_pole.gif" width="400">

Características do pêndulo:

Num | Observation           | Min                  | Max
--- | --------------------- | -------------------- | ------------------
0   | Cart Position         | -4.8                 | 4.8
1   | Cart Velocity         | -Inf                 | Inf
2   | Pole Angle            | ~ -0.418 rad (-24°)  | ~ 0.418 rad (24°)
3   | Pole Angular Velocity | -Inf                 | Inf

Importando bibliotecas. A biblioteca gym emula ambientes para aprendizado por reforço (no caso, o pêndulo invertido).

In [1]:
%matplotlib
import numpy as np
import gym
import matplotlib.pyplot as plt
from functools import partial
import running_average as avg
import seaborn as sns
sns.set()

Using matplotlib backend: Qt5Agg


Criando a classe do Monte Carlo tabular.

In [2]:
class TabularMC():
    """Tabular on-policy Monte Carlo control on the Gym 'CartPole-v0' environment.

    Continuous observations are binned into a single discrete state index, and
    the action-value table Q(s, a) is estimated as the average of the sampled
    returns that followed each (state, action) pair.

    The action-selection method ``TabularEpsilonGreedyPolicy(Q, state)`` is
    looked up on the instance by ``curr_policy`` but is not defined in this
    class body; it has to be provided elsewhere before ``runEpisode`` is used.
    """

    ##########################################
    def __init__(self, parameters):
        """Create the environment, the discretization and an empty Q table.

        Args:
            parameters (dict): must contain
                'gamma' (discount factor),
                'eps' (exploration rate handed to the policy) and
                'first-visit' (bool; True for first-visit, False for
                every-visit averaging).
        """
        self.episode = 0

        # create the environment (inverted pendulum)
        self.env = gym.make('CartPole-v0')
        # number of bins and clipping range for each observation component
        num_states = [1, 8, 8, 8]
        lower_bounds = [-4.8, -3.0, -0.418, -2.0]
        upper_bounds = [ 4.8,  3.0,  0.418,  2.0]

        # maps a continuous observation to a discrete state index
        self.get_state = partial(self.obs_to_state, num_states, lower_bounds, upper_bounds)

        # sizes of the state and action spaces
        self.num_states = int(np.prod(num_states))
        self.num_actions = self.env.action_space.n

        # learning parameters
        self.gamma = parameters['gamma']
        self.eps = parameters['eps']

        # start from an all-zero action-value table
        self.reset_policy()

        # first-visit (True) or every-visit (False) Monte Carlo
        self.first_visit = parameters['first-visit']

    ##########################################
    # resets the action-value function
    def reset_policy(self):
        """Zero Q(s, a) and forget every return collected so far."""
        # Q(s,a)
        self.Q = np.zeros((self.num_states, self.num_actions))

        # Returns(s,a): one independent list of sampled returns per pair
        self.returns = [[[] for _ in range(self.num_actions)] for _ in range(self.num_states)]

    ##########################################
    # returns the current policy
    def curr_policy(self, copy=False):
        """Return a callable ``policy(state) -> action`` bound to the Q table.

        Args:
            copy (bool): when True the policy works on a snapshot of Q; when
                False it shares ``self.Q`` and therefore sees later updates.
        """
        q_table = np.copy(self.Q) if copy else self.Q
        return partial(self.TabularEpsilonGreedyPolicy, q_table)

    ##########################################
    # converts continuous states into discrete ones
    def obs_to_state(self, num_states, lower_bounds, upper_bounds, obs):
        """Map an observation vector to a single flat state index.

        Each component is binned independently with ``discretize_val`` and the
        per-component bins are flattened with ``np.ravel_multi_index``.
        """
        state_idx = [
            self.discretize_val(ob, lower, upper, num)
            for ob, lower, upper, num in zip(obs, lower_bounds, upper_bounds, num_states)
        ]
        return np.ravel_multi_index(state_idx, num_states)

    ##########################################
    # discretizes a value
    def discretize_val(self, val, min_val, max_val, num_states):
        """Discretize a single float into ``num_states`` equal-width bins.

        Values below ``min_val`` map to bin 0 and values at or above
        ``max_val`` map to bin ``num_states - 1``.

        Args:
            val (float): value to discretize
            min_val (float): lower bound of discretization
            max_val (float): upper bound of discretization
            num_states (int): number of discrete states

        Returns:
            int: bin index in ``[0, num_states - 1]``
        """
        # Clip first so that +/-inf saturates instead of making int() raise.
        clipped = min(max(val, min_val), max_val)
        state = int(num_states * (clipped - min_val) / (max_val - min_val))
        # clipped == max_val yields num_states, which is one past the last bin
        return max(0, min(state, num_states - 1))

    ##########################################
    # simulates one episode until the end following the current policy
    def rollout(self, max_iter=10000, render=False):
        """Run one episode with ``self.policy`` and record the trajectory.

        ``self.policy`` must already be set (``runEpisode`` does this).

        Args:
            max_iter (int): maximum number of environment steps.
            render (bool): render the environment at every step.

        Returns:
            list: ``[states, actions, rewards]`` where position t holds
            S_t, A_t and the reward received after taking A_t in S_t.
        """
        # start the environment
        obs = self.env.reset()

        rewards = []
        actions = []
        states = []

        for _ in range(max_iter):
            # discrete state for the current observation
            S = self.get_state(obs)
            # \pi(s)
            A = self.policy(S)

            # render the environment
            if render:
                self.env.render()

            # one interaction step with the environment
            [obs, R, done, info] = self.env.step(A)

            # store S_t, A_t, R_{t+1}
            actions.append(A)
            states.append(S)
            rewards.append(R)

            # reached a terminal state?
            if done:
                break

        return [states, actions, rewards]

    ##########################################
    def runEpisode(self):
        """Generate one episode and update Q from its returns.

        The return that follows step t is G_t = R_{t+1} + gamma * G_{t+1}, so
        the trajectory is processed from the last step back to the first.
        With ``first_visit`` enabled only the earliest occurrence of each
        (state, action) pair in the episode contributes a sample.

        Returns:
            Undiscounted sum of the rewards collected in the episode.
        """
        # new episode
        self.episode += 1

        # take the current policy (on-policy)
        self.policy = self.curr_policy()

        # generate an episode with the current policy (rendered every 10th episode)
        states, actions, rewards = self.rollout(render=(self.episode % 10 == 0))

        # time step at which every (state, action) pair first appears
        first_occurrence = {}
        if self.first_visit:
            for t, pair in enumerate(zip(states, actions)):
                first_occurrence.setdefault(pair, t)

        G = 0.0
        for t in reversed(range(len(rewards))):
            state, action = states[t], actions[t]

            # the return is accumulated at every step, including skipped ones
            G = self.gamma * G + rewards[t]

            # first-visit: ignore later repetitions of the same pair
            if self.first_visit and first_occurrence[(state, action)] != t:
                continue

            # update Q(s,a) with the running mean of the stored returns;
            # equivalent to averaging the whole list, without re-reading it
            samples = self.returns[state][action]
            samples.append(G)
            self.Q[state, action] += (G - self.Q[state, action]) / len(samples)

        return np.sum(np.array(rewards))

    ##########################################
    def __del__(self):
        """Close the environment, if it was ever created."""
        # __init__ may have failed before self.env was assigned
        env = getattr(self, 'env', None)
        if env is not None:
            env.close()

Probabilidade de escolha de uma ação $a$ baseada na política $\varepsilon$-soft:
$$
\pi(a|S_t) \gets 
                        \begin{cases}
                            1 - \varepsilon + \varepsilon/|\mathcal{A}|,  & \text{se}~ a = A^*,\\
                            \varepsilon/|\mathcal{A}|, & \text{caso contrário.}
                        \end{cases}
$$

In [3]:
class TabularMC(TabularMC):
    """Re-declares TabularMC on top of the earlier definition to add the policy."""

    ##########################################
    # action selection (epsilon-soft)
    def TabularEpsilonGreedyPolicy(self, Q, state):
        """Sample an action for ``state`` from the epsilon-soft policy over ``Q``.

        The greedy action (first maximum of ``Q[state, :]``) gets probability
        ``1 - eps + eps/|A|``; every other action gets ``eps/|A|``.

        Args:
            Q: action-value table indexed as ``Q[state, action]``.
            state: index of the current discrete state.

        Returns:
            Index of the sampled action.
        """
        action_count = self.num_actions

        # probability mass shared by every action
        explore_share = self.eps / action_count
        weights = np.full(action_count, explore_share)

        # the current greedy action receives the remaining mass
        greedy = Q[state, :].argmax()
        weights[greedy] = 1.0 - self.eps + explore_share

        return np.random.choice(action_count, p=weights)

In [4]:
##########################################
# main
##########################################
if __name__ == '__main__':
    # Trains the tabular Monte Carlo agent and live-plots the values returned
    # by the running-average helper after every episode.

    # parameters
    parameters = {
                'episodes'    : 1000,
                'gamma'       : 1.0,
                'eps'         : 1.0e-1,
                'first-visit' : False
            }

    plt.ion()

    # create the tabular Monte Carlo agent
    mc = TabularMC(parameters)

    rewards = []
    # presumably a 50-episode window; confirm against running_average
    avg_calc = avg.RunningAverage(50)

    try:
        for i in range(parameters['episodes']):
            # run one episode
            total_reward = mc.runEpisode()

            # average reward
            avg_calc.push(total_reward)
            rewards.append(avg_calc.get())
            print("Iteracao %d," % i, "Reforço: %.2f" % total_reward)

            # redraw the curve from scratch on figure 1
            plt.figure(1)
            plt.clf()
            plt.plot(rewards)
            plt.title('Reforço por episódios')
            plt.xlabel('Episódios')
            plt.ylabel('Reforço')

            plt.show()
            plt.pause(.1)
    finally:
        # leave interactive mode even when training is interrupted
        # (e.g. by KeyboardInterrupt) or an episode raises
        plt.ioff()

Iteracao 0, Reforço: 9.00
Iteracao 1, Reforço: 10.00
Iteracao 2, Reforço: 9.00
Iteracao 3, Reforço: 10.00
Iteracao 4, Reforço: 9.00
Iteracao 5, Reforço: 10.00
Iteracao 6, Reforço: 10.00
Iteracao 7, Reforço: 10.00
Iteracao 8, Reforço: 10.00
Iteracao 9, Reforço: 14.00
Iteracao 10, Reforço: 12.00
Iteracao 11, Reforço: 13.00
Iteracao 12, Reforço: 11.00
Iteracao 13, Reforço: 14.00
Iteracao 14, Reforço: 11.00
Iteracao 15, Reforço: 12.00
Iteracao 16, Reforço: 13.00
Iteracao 17, Reforço: 11.00
Iteracao 18, Reforço: 9.00
Iteracao 19, Reforço: 10.00
Iteracao 20, Reforço: 9.00
Iteracao 21, Reforço: 12.00
Iteracao 22, Reforço: 13.00
Iteracao 23, Reforço: 14.00
Iteracao 24, Reforço: 10.00
Iteracao 25, Reforço: 10.00
Iteracao 26, Reforço: 16.00
Iteracao 27, Reforço: 13.00
Iteracao 28, Reforço: 14.00
Iteracao 29, Reforço: 11.00
Iteracao 30, Reforço: 21.00
Iteracao 31, Reforço: 14.00
Iteracao 32, Reforço: 17.00
Iteracao 33, Reforço: 27.00
Iteracao 34, Reforço: 11.00
Iteracao 35, Reforço: 21.00
Iteraca

KeyboardInterrupt: 