In [3]:
%matplotlib inline

import gymnasium as gym
import matplotlib
import numpy as np
import sys

from collections import defaultdict
# Put the parent directory on the import path (once) so that the project-local
# `lib` package imported just below can be resolved from this notebook.
if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.blackjack import BlackjackEnv
from lib import plotting

# Use the 'ggplot' matplotlib style for every figure drawn in this notebook.
matplotlib.style.use('ggplot')

In [None]:
# Blackjack environment instance shared by the cells below.
env = BlackjackEnv()

In [1]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    Build an epsilon-greedy policy on top of an action-value table.

    The returned policy reads Q every time it is called, so later updates
    to Q are reflected without rebuilding the policy.

    Args:
        Q: A dictionary that maps from state -> action-values.
            Each value is a numpy array of length nA (see below).
        epsilon: Probability of selecting a random action. Float between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that takes an observation and returns a numpy array of
        length nA holding the probability of selecting each action.
    """
    def policy_fn(observation):
        # Every action starts with an equal share of the exploration mass.
        action_probs = np.full(nA, epsilon / nA, dtype=float)

        # The greedy action (first maximum on ties) receives the remaining mass.
        greedy_action = np.argmax(Q[observation])
        action_probs[greedy_action] += 1 - epsilon

        return action_probs

    return policy_fn

# unit test for make_epsilon_greedy_policy:


# TODO: the test itself is not written yet; only the Q-table fixture exists.

# Q-table fixture: an unseen state gets a zero vector with one entry per action.
Q = defaultdict(lambda: np.zeros(env.action_space.n))

Q

In [25]:
# Scratch check: a defaultdict of zero vectors creates the entry for a key on
# first access, so in-place increments on an unseen key accumulate from zero.
Q = defaultdict(lambda: np.zeros(10))

Q['9'][7] +=1
Q['9'][7] +=1
Q['9']

array([0., 0., 0., 0., 0., 0., 0., 2., 0., 0.])

In [34]:
# Scratch check: np.tile stacks the row [0, 1] 1000 times, giving a (1000, 2) array.
np.tile(np.array([0, 1]), (1000, 1))

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]])

In [33]:
# Scratch check: each default entry is a 10x2 zero array, and a whole row can
# be replaced by assigning a length-2 array to it.
Q = defaultdict(lambda: np.zeros([10, 2]))

Q['0'][4] = np.array([5, 1])

Q['0']

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [5., 1.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

## Learning points in the following exercise:

### Outline different ways to step through an episode; the signal to stop is the environment returning `done` as `True`.

### Note that if an episode can go on without ever reaching an end, i.e. when the environment can get trapped in a loop, the algorithm might never terminate.

In [None]:
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.

    This is on-policy first-visit MC control: each episode is generated with
    the current epsilon-greedy policy, then Q(s, a) is set to the average of
    the discounted returns that followed the first visit to (s, a) in every
    episode seen so far. The policy reads Q on each call, so it improves as
    Q is updated.

    An episode only ends when the environment reports that it is done; if the
    environment can loop forever without terminating, this function will not
    return.

    Args:
        env: OpenAI gym environment. Must expose `action_space.n`, `reset()`
            and `step(action)`, and its observations must be hashable because
            they are used as dictionary keys.
            NOTE(review): the concrete environment class is not visible here,
            so both the classic gym shapes (`reset() -> obs`,
            `step() -> (obs, reward, done, info)`) and the gymnasium shapes
            (`reset() -> (obs, info)`,
            `step() -> (obs, reward, terminated, truncated, info)`) are
            accepted -- confirm which one the environment uses.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities
    """
    nA = env.action_space.n

    # Keeps track of sum and count of returns for each (state, action) pair
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # Maps state -> numpy array of action-values; all values start at zero.
    Q = defaultdict(lambda: np.zeros(nA))

    # The policy we're following. It closes over Q, so every update to Q
    # below changes the behaviour of the policy immediately.
    policy = make_epsilon_greedy_policy(Q, epsilon, nA)

    for _ in range(num_episodes):
        # 1. Generate one episode as a list of (state, action, reward) steps.
        reset_result = env.reset()
        # gymnasium returns (observation, info); classic gym returns the bare
        # observation. A 2-tuple whose second item is a dict is taken to be
        # the former.
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            observation = reset_result[0]
        else:
            observation = reset_result

        episode = []
        while True:
            # Sample an action index from the (stochastic) policy.
            action = int(np.random.choice(nA, p=policy(observation)))

            step_result = env.step(action)
            next_observation, reward = step_result[0], step_result[1]
            if len(step_result) == 5:
                # gymnasium: the episode is over when terminated OR truncated.
                done = bool(step_result[2]) or bool(step_result[3])
            else:
                done = bool(step_result[2])

            episode.append((observation, action, reward))
            if done:
                break
            observation = next_observation

        # 2. Walk the episode backwards so the return G_t is accumulated
        #    incrementally. Earlier time steps are processed last, so after
        #    the loop each pair holds the return of its FIRST visit.
        first_visit_return = {}
        G = 0.0
        for state, action, reward in reversed(episode):
            G = reward + discount_factor * G
            first_visit_return[(state, action)] = G

        # 3. Policy evaluation: update the running average for every pair
        #    visited in this episode. Policy improvement is implicit because
        #    `policy` reads Q directly.
        for (state, action), G in first_visit_return.items():
            returns_sum[(state, action)] += G
            returns_count[(state, action)] += 1.0
            Q[state][action] = returns_sum[(state, action)] / returns_count[(state, action)]

    return Q, policy

In [None]:
# Run MC control for 500,000 episodes with epsilon = 0.1. The resulting Q is
# what the plotting cell below turns into a state-value function.
Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

In [None]:
# For plotting: Create value function from action-value function
# by picking the best action at each state, i.e. V[state] = max over actions
# of Q[state][action].
V = defaultdict(float)
for state, actions in Q.items():
    # `actions` is the array of action-values stored for this state.
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")