<a href="https://colab.research.google.com/github/2303a51603/Reinforcement-learning-/blob/main/Lab%20Ass%203.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import gymnasium as gym
import numpy as np
from collections import defaultdict
import random


# Monte Carlo Policy Evaluation (First-Visit)
def mc_policy_evaluation(policy, env, num_episodes, gamma=1.0):
    """Estimate the state-value function of ``policy`` by first-visit Monte Carlo.

    Args:
        policy: Callable mapping a state to the action to take in it.
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns ``(next_state, reward, terminated,
            truncated, info)``. States are used as dict keys, so they must be
            hashable.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.

    Returns:
        A ``defaultdict(float)`` mapping every visited state to the mean of
        the returns observed from its first visit in each sampled episode.
    """
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    V = defaultdict(float)

    for _ in range(num_episodes):
        episode = []
        state, _ = env.reset()
        done = False

        # Generate episode
        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            done = terminated or truncated

        # Earliest time step at which each state occurs in this episode.
        # A "seen" set filled while walking backwards would instead select
        # the LAST occurrence, which is not first-visit MC.
        first_visit = {}
        for t, (state_t, _, _) in enumerate(episode):
            first_visit.setdefault(state_t, t)

        # Accumulate the discounted return backwards and record it only at
        # the time step that is the state's first visit.
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state_t, _, reward_t = episode[t]
            G = gamma * G + reward_t
            if first_visit[state_t] == t:
                returns_sum[state_t] += G
                returns_count[state_t] += 1
                V[state_t] = returns_sum[state_t] / returns_count[state_t]

    return V


# ε-Greedy policy generator
def create_epsilon_greedy_policy(Q, epsilon, nA):
    """Build an ε-greedy action sampler backed by the action-value table ``Q``.

    Args:
        Q: Mapping from state to an array of ``nA`` action values. It is
            mutated: a state seen for the first time gets an all-zero row.
        epsilon: Probability mass spread uniformly over all ``nA`` actions.
        nA: Number of available actions.

    Returns:
        A function ``policy_fn(state)`` that samples an action index, giving
        the ``np.argmax`` action of ``Q[state]`` an extra ``1 - epsilon``.
    """
    def policy_fn(state):
        # Unseen states start with zero value for every action.
        if state in Q:
            action_values = Q[state]
        else:
            action_values = Q[state] = np.zeros(nA)

        # Every action receives epsilon / nA ...
        weights = epsilon * np.ones(nA) / nA
        # ... and the currently greedy action takes the remaining mass.
        greedy = np.argmax(action_values)
        weights[greedy] += 1.0 - epsilon
        return np.random.choice(np.arange(nA), p=weights)

    return policy_fn


# Monte Carlo Control with ε-Greedy Policy
def mc_control_epsilon_greedy(env, num_episodes, gamma=1.0, epsilon=0.1):
    """Learn action values with on-policy first-visit Monte Carlo control.

    Episodes are generated by an ε-greedy policy over the current ``Q``; after
    each episode, ``Q`` is updated with the mean return observed from the
    first visit of every (state, action) pair.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` returning
            ``(state, info)`` and ``step(action)`` returning ``(next_state,
            reward, terminated, truncated, info)``. States are used as dict
            keys, so they must be hashable.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Exploration probability of the behaviour policy.

    Returns:
        ``(Q, final_policy)`` where ``Q`` maps each state to an array of
        action values and ``final_policy`` maps each state in ``Q`` to its
        ``np.argmax`` action.
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)

    # The policy closes over Q, so it follows every update made below.
    policy = create_epsilon_greedy_policy(Q, epsilon, nA)

    for _ in range(num_episodes):
        episode = []
        state, _ = env.reset()
        done = False

        while not done:
            action = policy(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            done = terminated or truncated

        # Earliest time step of each (state, action) pair in this episode.
        # A "seen" set filled while walking backwards would instead select
        # the LAST occurrence, which is not first-visit MC.
        first_visit = {}
        for t, (state_t, action_t, _) in enumerate(episode):
            first_visit.setdefault((state_t, action_t), t)

        # Accumulate the discounted return backwards and record it only at
        # the time step that is the pair's first visit.
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state_t, action_t, reward_t = episode[t]
            G = gamma * G + reward_t
            pair = (state_t, action_t)
            if first_visit[pair] == t:
                returns_sum[pair] += G
                returns_count[pair] += 1
                Q[state_t][action_t] = returns_sum[pair] / returns_count[pair]

    # Final greedy policy
    final_policy = {state: np.argmax(actions) for state, actions in Q.items()}
    return Q, final_policy


# ---- Run the code ----
if __name__ == "__main__":
    env = gym.make("Blackjack-v1", sab=True)

    def always_stick(state):
        """Deterministic baseline policy: choose action 0 (stick) in every state."""
        return 0

    # Release the environment's resources even if a run below raises.
    try:
        # ---- Policy Evaluation ----
        print("\nEvaluating a simple policy (always stick)...")
        V = mc_policy_evaluation(always_stick, env, num_episodes=100000)

        test_state = (20, 10, True)
        print(f"Estimated value of state {test_state}: {V.get(test_state, 0.0)}")

        # ---- Policy Control ----
        print("\nRunning Monte Carlo Control with ε-greedy policy...")
        Q, learned_policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

        # Test optimal action at a specific state; "Unknown" means the state
        # was never visited during control.
        best_action = learned_policy.get(test_state, "Unknown")
        print(f"Best action for state {test_state}: {best_action} (0=Stick, 1=Hit)")
    finally:
        env.close()



Evaluating a simple policy (always stick)...
Estimated value of state (20, 10, True): 0.4069767441860465

Running Monte Carlo Control with ε-greedy policy...
Best action for state (20, 10, True): 0 (0=Stick, 1=Hit)
