In [2]:
import gym

# Create the Blackjack environment
env = gym.make('Blackjack-v1')


  deprecation(
  deprecation(


In [3]:
def basic_policy(state):
    """
    A simple policy that hits if the player's sum is less than 20, else holds.
    """
    player_sum, dealer_sum, usable_ace = state
    return 0 if player_sum >= 20 else 1  # 0 = hold, 1 = hit

def play_episode(policy, env):
    """
    Plays an episode of Blackjack using the given policy.

    Args:
        policy: A function that takes a state and returns an action (0=hold, 1=hit).
        env: The Blackjack environment.

    Returns:
        A tuple (states, actions, rewards) for the episode.
    """
    states = []
    actions = []
    rewards = []

    state = env.reset()
    while True:
        states.append(state)
        action = policy(state)
        actions.append(action)
        next_state, reward, done, _ = env.step(action)
        rewards.append(reward)
        state = next_state
        if done:
            break

    return states, actions, rewards


  and should_run_async(code)


In [4]:
from collections import defaultdict

def monte_carlo_policy_evaluation(policy, env, num_episodes, discount_factor=1.0):
    """
    Evaluate a policy using Monte Carlo sampling.

    Args:
        policy: A function that takes a state and returns an action.
        env: The environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Discount factor for future rewards.

    Returns:
        A dictionary mapping state to value.
    """
    # Store returns for each state
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    V = defaultdict(float)

    for _ in range(num_episodes):
        # Generate an episode using the policy
        episode = play_episode(policy, env)
        states, _, rewards = episode

        # Calculate returns
        G = 0
        for t in reversed(range(len(states))):
            G = rewards[t] + discount_factor * G
            state = states[t]
            # First visit Monte Carlo: only consider first time state is visited in episode
            if state not in states[:t]:
                returns_sum[state] += G
                returns_count[state] += 1
                V[state] = returns_sum[state] / returns_count[state]

    return V

# Example usage: Evaluate the initial policy with many episodes
V = monte_carlo_policy_evaluation(basic_policy, env, num_episodes=5000)


  if not isinstance(terminated, (bool, np.bool8)):


In [5]:
V

defaultdict(float,
            {(15, 3, False): -0.5576923076923077,
             (17, 3, True): -0.7272727272727273,
             (14, 5, False): -0.6666666666666666,
             (18, 4, False): -0.711864406779661,
             (12, 4, False): -0.6382978723404256,
             (8, 4, False): -0.4666666666666667,
             (21, 6, False): 0.9574468085106383,
             (15, 6, False): -0.4888888888888889,
             (19, 5, False): -0.7454545454545455,
             (19, 6, False): -0.603448275862069,
             (21, 7, True): 1.0,
             (20, 6, False): 0.7808219178082192,
             (13, 6, False): -0.5476190476190477,
             (19, 10, False): -0.7702702702702703,
             (18, 10, False): -0.7272727272727273,
             (8, 10, False): -0.6206896551724138,
             (20, 9, True): 0.9230769230769231,
             (16, 7, False): -0.7307692307692307,
             (20, 3, False): 0.6865671641791045,
             (16, 3, False): -0.6363636363636364,
     

In [6]:
def greedy_policy_from_value_function(V, env):
    """
    Create a greedy policy based on the given value function.

    Args:
        V: A dictionary mapping state to value.
        env: The environment.

    Returns:
        A policy function that maps state to action.
    """
    def policy(state):
        # Get the player sum from the state
        player_sum, _, _ = state
        # Greedy policy: hit if V(player_sum + card_value) > V(player_sum), hold otherwise
        hit_value = 0
        if player_sum < 21:  # If the player sum is less than 21, consider hitting
            for card_value in range(1, 11):  # Possible values of the next card
                next_state = (player_sum + card_value, state[1], state[2])
                if next_state in V:
                    hit_value += V[next_state]
            hit_value /= 10  # Average over all possible card values

        hold_value = V[state] if state in V else 0

        return 1 if hit_value > hold_value else 0

    return policy

# Improve the policy based on the current value function V
improved_policy = greedy_policy_from_value_function(V, env)


In [10]:
def evaluate_policy(policy, env, num_episodes):
    """
    Evaluate the performance of a policy by running multiple episodes.

    Args:
        policy: A policy function that maps state to action.
        env: The environment.
        num_episodes: Number of episodes to evaluate.

    Returns:
        The average return over the episodes.
    """
    total_return = 0

    for _ in range(num_episodes):
        states, _, rewards = play_episode(policy, env)
        total_return += sum(rewards)

    return total_return / num_episodes

# Evaluate the optimal policy
num_test_episodes = 1000
average_return = evaluate_policy(improved_policy, env, num_test_episodes)

print(f"Average return of the optimal policy over {num_test_episodes} episodes: {average_return:.2f}")


Average return of the optimal policy over 1000 episodes: -0.40


In [None]:
def monte_carlo_policy_iteration(env, num_episodes, discount_factor=0.9, tol=1e-10):
    """
    Perform Monte Carlo Policy Iteration to find an optimal policy.

    Args:
        env: The environment.
        num_episodes: Number of episodes to sample for policy evaluation.
        discount_factor: Discount factor for future rewards.
        tol: Convergence tolerance for policy improvement.

    Returns:
        A tuple (optimal_policy, optimal_value_function).
    """
    # Initialize a random policy
    policy = basic_policy  # initial policy
    V = defaultdict(float)
    counter = 0
    while True:
        counter += 1

        average_return = evaluate_policy(improved_policy, env, num_test_episodes)

        print(f"Iteration number {counter} trained model for {num_episodes} episodes, Tested for {num_test_episodes} has average return {average_return:.2f}")
        if(counter >100):
          break
        # Policy Evaluation
        V_new = monte_carlo_policy_evaluation(policy, env, num_episodes, discount_factor)

        # Policy Improvement
        new_policy = greedy_policy_from_value_function(V_new, env)



        # Check for convergence
        if max(abs(V_new[state] - V[state]) for state in V_new) < tol:
            break


        # Update policy and value function
        policy = new_policy
        V = V_new

    return policy, V

# Perform Monte Carlo Policy Iteration
improved_policy, optimal_value_function = monte_carlo_policy_iteration(env, num_episodes=500000)


Iteration number 1 trained model for 500000 episodes, Tested for 1000 has average return -0.40
Iteration number 2 trained model for 500000 episodes, Tested for 1000 has average return -0.35
Iteration number 3 trained model for 500000 episodes, Tested for 1000 has average return -0.43
Iteration number 4 trained model for 500000 episodes, Tested for 1000 has average return -0.39
Iteration number 5 trained model for 500000 episodes, Tested for 1000 has average return -0.37
Iteration number 6 trained model for 500000 episodes, Tested for 1000 has average return -0.44
Iteration number 7 trained model for 500000 episodes, Tested for 1000 has average return -0.42
Iteration number 8 trained model for 500000 episodes, Tested for 1000 has average return -0.42
Iteration number 9 trained model for 500000 episodes, Tested for 1000 has average return -0.39
Iteration number 10 trained model for 500000 episodes, Tested for 1000 has average return -0.34
Iteration number 11 trained model for 500000 epis