<a href="https://colab.research.google.com/github/2303a51015/AIML-2025/blob/main/RL_LAB3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium
!pip install numpy



In [2]:

import gymnasium as gym
import numpy as np
from collections import defaultdict
import random
from typing import Dict, Tuple, List
from tqdm import tqdm


def make_epsilon_greedy_policy(Q: Dict, nA: int, epsilon: float):
    """Build an epsilon-greedy action-probability function backed by ``Q``.

    Args:
        Q: Mapping from state to a sequence of ``nA`` action values. It is
            read on every call, so later updates to ``Q`` are reflected.
        nA: Number of available actions.
        epsilon: Probability mass shared uniformly among all actions.

    Returns:
        A callable ``policy_fn(state)`` that returns a length-``nA`` NumPy
        array in which every action has probability ``epsilon / nA`` and the
        greedy action (first argmax of ``Q[state]``) gets an extra
        ``1 - epsilon``.
    """
    def policy_fn(state):
        # Start from the uniform exploration share for every action...
        action_probs = np.full(nA, epsilon / nA, dtype=np.float64)
        # ...then give the remaining mass to the currently greedy action.
        greedy_action = np.argmax(Q[state])
        action_probs[greedy_action] += 1.0 - epsilon
        return action_probs

    return policy_fn

def generate_episode(env, policy):
    """Roll out a single episode by sampling actions from ``policy``.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and
            whose ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Callable mapping a state to a sequence of action
            probabilities.

    Returns:
        A list of ``(state, action, reward)`` tuples, one per step taken,
        where ``state`` is the state in which ``action`` was chosen.
    """
    trajectory = []
    current_state, _info = env.reset()
    while True:
        action_probs = policy(current_state)
        chosen_action = np.random.choice(len(action_probs), p=action_probs)
        following_state, reward, terminated, truncated, _info = env.step(chosen_action)
        trajectory.append((current_state, chosen_action, reward))
        # The episode ends on either a terminal state or a truncation.
        if terminated or truncated:
            return trajectory
        current_state = following_state


def mc_control_epsilon_greedy(env, num_episodes, gamma=1.0, epsilon=0.1, alpha=0.1):
    """Learn action values with first-visit Monte Carlo control.

    Episodes are generated with an epsilon-greedy policy over the current
    ``Q``. After each episode, every (state, action) pair that occurred is
    moved toward the discounted return observed from its first occurrence,
    using a constant step size.

    Args:
        env: Environment with a discrete action space exposing
            ``action_space.n``, plus the ``reset``/``step`` interface used
            by ``generate_episode``. States must be hashable.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied to future rewards.
        epsilon: Exploration rate of the epsilon-greedy behaviour policy.
        alpha: Step size of the incremental update
            ``Q <- Q + alpha * (G - Q)``.

    Returns:
        A ``defaultdict`` mapping each state to a NumPy array of action
        values. States never visited yield a zero array on first access.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))

    # The policy reads Q on every call and Q is only mutated in place, so a
    # single closure stays current; no need to rebuild it per episode.
    policy = make_epsilon_greedy_policy(Q, n_actions, epsilon)

    for _ in tqdm(range(num_episodes)):
        episode = generate_episode(env, policy)

        # Walk the episode backwards, accumulating the discounted return.
        # Later writes overwrite earlier ones, so each pair ends up holding
        # the return from its FIRST occurrence. This is a single O(n) pass
        # instead of rescanning and re-summing the tail for every pair.
        first_visit_returns = {}
        G = 0.0
        for state, action, reward in reversed(episode):
            G = reward + gamma * G
            first_visit_returns[(state, action)] = G

        # Constant-step-size update toward the sampled first-visit return.
        for (state, action), first_return in first_visit_returns.items():
            Q[state][action] += alpha * (first_return - Q[state][action])

    return Q


if __name__ == '__main__':
    # Train on Gymnasium's Blackjack environment and print the greedy action
    # for two sample states.
    env = gym.make('Blackjack-v1', sab=False)

    # try/finally guarantees the environment is closed even if training is
    # interrupted or raises part-way through the long run.
    try:
        print("Training the agent...")
        Q_optimal = mc_control_epsilon_greedy(env, num_episodes=500000)
        print("Training finished.")

        print("\nExample of the learned policy:")

        # Per the Gymnasium Blackjack docs, observations are
        # (player sum, dealer's showing card, usable ace).
        state_1 = (18, 7, False)
        state_2 = (17, 3, True)

        # Greedy action = index of the largest learned action value.
        best_action_1 = np.argmax(Q_optimal[state_1])
        best_action_2 = np.argmax(Q_optimal[state_2])

        action_map = {0: "Stick", 1: "Hit"}

        print(f"For state {state_1}: Optimal action is to {action_map[best_action_1]}")
        print(f"For state {state_2}: Optimal action is to {action_map[best_action_2]}")
    finally:
        env.close()

Training the agent...


100%|██████████| 500000/500000 [02:03<00:00, 4032.28it/s]

Training finished.

Example of the learned policy:
For state (18, 7, False): Optimal action is to Stick
For state (17, 3, True): Optimal action is to Hit



