# Monte Carlo Control from Scratch in Python: Solving the Frozen Lake Problem

In this notebook you will:  

1. Implement Monte Carlo Control with $\epsilon$-greedy action selection.  
2. Test MC Control on the Frozen Lake problem.

**Monte Carlo Control Pseudocode**:
    
    
Input:  $\epsilon$, $\gamma$, $n\_episodes$


Initialize for all $s\in S$ and $a\in A$:    
>$Q(s, a)$ <- arbitrary  
    $\pi(s)$ <- arbitrary

Repeat for $n\_episodes$:  
>generate an episode by following the current policy $\pi$ with $\epsilon$-greedy action selection  
    $Q(s, a)$ <- evaluate policy using first-visit MC method   
    $\pi$ <- improve policy

 
$Q^*(s, a)$ <- $Q(s, a)$  
$\pi^*$  <- $\pi$ 

# Packages

In [2]:
import gym
import numpy as np
import random
import itertools
import time

# MC Control Implementation

We will create a class called `MCControl`.

In [111]:
class MCControl:
    '''Implements Monte Carlo Control with epsilon-greedy action selection.

    The environment is expected to follow the classic gym interface used in
    this notebook: ``reset()`` returns the initial state and ``step(action)``
    returns a 4-tuple ``(state, reward, terminated, info)``.
    '''
    def __init__(self, env, num_states, num_actions, epsilon, gamma):
        '''Parameters
        ----------
        env:         open gym environment object
        num_states:  integer, number of states in the environment
        num_actions: integer, number of possible actions
        epsilon:     float, the epsilon parameter used for exploration
        gamma:       float, discount factor
        '''
        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.gamma = gamma

    def init_agent(self):
        '''Initializes RL agent components:
        self.policy:      numpy array of integers of length self.num_states,
                          the action to take at a given state (random at start)
        self.Q:           nested dictionary {state: {action: q value}}, action value function
        self.visit_count: nested dictionary {state: {action: count}}, number of episodes in
                          which the state-action pair was visited at least once
        '''
        # Use the instance's own sizes rather than notebook-level globals, so the
        # agent does not depend on which cell was executed last.
        self.policy = np.random.choice(self.num_actions, self.num_states)

        self.Q = {}
        self.visit_count = {}

        for state in range(self.num_states):
            # Both tables start at zero for every (state, action) pair.
            self.Q[state] = {action: 0 for action in range(self.num_actions)}
            self.visit_count[state] = {action: 0 for action in range(self.num_actions)}

    def get_epsilon_greedy_action(self, greedy_action):
        '''Returns action using epsilon greedy approach.

        With probability self.epsilon a uniformly random action is returned
        (which may coincide with the greedy one); otherwise greedy_action.

        greedy_action: integer, greedy action (action with a maximum Q value)
        '''
        prob = np.random.random()

        if prob < self.epsilon:
            return np.random.randint(0, self.num_actions)

        return greedy_action

    def generate_episode(self, policy):
        '''Generates one episode in self.env by following `policy` epsilon-greedily.

        Parameters
        ----------
        policy: sequence of integers of length self.num_states, the action to take at a given state

        Returns
        ----------
        G: float, discounted sum of the rewards collected in the episode. The reward
           stored at index t of the trajectory is weighted by gamma ** t; index 0 holds
           a placeholder reward of 0 for the initial state.
        state_action_reward: list of tuple (state, action, reward), where reward is the
           one received on entering `state`. The terminal entry is excluded from the
           returned list, although its reward is included in G.
        '''
        # Step the agent's own environment, not a global `env`.
        env = self.env

        s = env.reset()
        a = self.get_epsilon_greedy_action(policy[s])

        state_action_reward = [(s, a, 0)]
        while True:
            s, r, terminated, _ = env.step(a)
            if terminated:
                # No action is taken from a terminal state.
                state_action_reward.append((s, None, r))
                break
            a = self.get_epsilon_greedy_action(policy[s])
            state_action_reward.append((s, a, r))

        G = 0
        for t, (_, _, reward) in enumerate(state_action_reward):
            G += self.gamma ** t * reward

        return G, state_action_reward[:-1]

    def argmax(self, Q, policy):
        """
        Makes `policy` greedy with respect to `Q` and returns it.

        Parameters
        ----------
        Q: nested dictionary {state: {action: q value}}, action value function
        policy: mutable sequence of integers of length self.num_states; it is
                updated in place

        Returns
        ----------
        policy: the same object that was passed in, now holding for every state the
                action with the largest Q value (the first such action on ties)

        """
        for state in range(self.num_states):
            action_values = Q[state]
            # max() keeps the first maximal key, matching a strict ">" scan.
            policy[state] = max(action_values, key=action_values.get)

        return policy

    def improve_policy(self):
        '''Improves and updates current policy self.policy.'''
        self.policy = self.argmax(self.Q, self.policy)

    def evaluate_policy(self, G, state_action_reward):
        '''Updates self.Q with an incremental mean of the observed returns.

        NOTE(review): the same episode-level return G is credited to every
        state-action pair on its first visit, rather than the return that
        follows that visit. Confirm this simplification is intended.

        Parameters
        ----------
        G: float, episode return (total discounted reward)
        state_action_reward: list of tuple (state, action, reward)
        '''
        seen_state_action = set()

        for state, action, _ in state_action_reward:
            # Only the first occurrence of a pair within the episode counts.
            if (state, action) in seen_state_action:
                continue
            seen_state_action.add((state, action))

            self.visit_count[state][action] += 1
            # Incremental mean: Q <- Q + (G - Q) / N
            self.Q[state][action] += (G - self.Q[state][action]) / self.visit_count[state][action]

    def run_mc_control(self, num_episodes):
        '''Performs Monte Carlo control task.

        Re-initializes the agent, then for every episode generates a trajectory,
        updates the action values and makes the policy greedy w.r.t. them.

        Parameters
        ----------
        num_episodes: integer, number of episodes to run to train RL agent

        Returns
        ----------
        self.Q:      nested dictionary {state: {action: q value}}, final action value function
        self.policy: numpy array of integers of length self.num_states, final policy
        '''
        self.init_agent()

        for _ in range(num_episodes):
            G, state_action_reward = self.generate_episode(self.policy)
            self.evaluate_policy(G, state_action_reward)
            self.improve_policy()

        print(f"Finished training RL agent for {num_episodes} episodes!")

        return self.Q, self.policy


# Tests

Test init_agent

In [109]:
# Fix the NumPy seed so the randomly initialized policy is reproducible.
np.random.seed(1)

# Hyperparameters; the names are notebook-level and shared with other cells.
epsilon, gamma = 0.4, 0.9
n_episodes = 10000

# init_agent does not touch the environment, so a placeholder is enough here.
env = None
num_states, num_actions = 2, 3

mc_model = MCControl(env, num_states, num_actions, epsilon, gamma)
mc_model.init_agent()

# Both tables must start with a zero for every (state, action) pair.
expected_zero_table = {0: {0: 0, 1: 0, 2: 0}, 1: {0: 0, 1: 0, 2: 0}}

assert (mc_model.policy == np.array([1, 0])).all()
assert mc_model.Q == expected_zero_table
assert mc_model.visit_count == expected_zero_table

Test generate_episode

In [152]:
# Seed NumPy (exploration) before building the agent so the episode is repeatable.
np.random.seed(1)

# Hyperparameters; the names are notebook-level and shared with other cells.
epsilon, gamma = 0.4, 0.9
n_episodes = 10000

# Create and seed the environment, then read the problem size from it.
env = gym.make('FrozenLake-v0')
env.seed(2)
num_states, num_actions = env.observation_space.n, env.action_space.n

mc_model = MCControl(env, num_states, num_actions, epsilon, gamma)

# Hand-written policy: one action per state.
policy = np.array([1, 1, 1, 1, 0, 0, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3])
res = mc_model.generate_episode(policy)

# Expected (return, trajectory) for the seeds above.
expected_trajectory = [(0, 1, 0), (4, 0, 0.0), (4, 3, 0.0), (4, 0, 0.0), (8, 1, 0.0)]
assert res == (0.0, expected_trajectory)

# Initialize environment and Monte Carlo Class

In [112]:
def render_single(env, policy, max_steps=100):
    """
    Renders one episode of `policy` on `env`. Watch your agent play!

    The environment is rendered before every step (with a 0.25 s pause) and
    once more after the episode ends, then a one-line summary is printed.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to play on. Only reset(), render() and step(action) are
      used; step must return a 4-tuple (observation, reward, done, info).
    policy: np.array (or other sequence) indexable by observation
      The action to take at a given state
    max_steps: int
      Maximum number of steps to play before giving up
    """
    episode_reward = 0
    # Defined up front so the summary below also works when max_steps <= 0
    # and the loop body never runs.
    done = False
    ob = env.reset()
    for _step in range(max_steps):
        env.render()
        time.sleep(0.25)
        a = policy[ob]
        ob, reward, done, _ = env.step(a)
        episode_reward += reward
        if done:
            break
    env.render()
    if not done:
        print("The agent didn't reach a terminal state in {} steps.".format(max_steps))
    else:
        print("Episode reward: %f" % episode_reward)

In [129]:
# Hyperparameters for the training run.
epsilon, gamma = 0.4, 0.9
n_episodes = 10000

# Build the Frozen Lake environment and read the problem size from its spaces.
env = gym.make('FrozenLake-v0')
num_states, num_actions = env.observation_space.n, env.action_space.n

# Agent that will be trained on this environment.
mc_model = MCControl(env, num_states, num_actions, epsilon, gamma)

# Take MC Control to a Frozen Lake!

In [114]:
# Train the agent for n_episodes episodes. This rebinds the notebook-level
# `policy` (previously the hand-written test policy) to the policy returned by training.
Q, policy = mc_model.run_mc_control(n_episodes)

Finished training RL agent for 10000 episodes!


In [125]:
# Seed the environment (presumably so the rendered rollout is repeatable),
# then play the current `policy` for at most 100 steps.
env.seed(5)
render_single(env, policy, 100)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Episode reward: 1.000000


In [32]:
# Close the environment now that rendering is finished.
env.close()

# Evaluation