<a href="https://colab.research.google.com/github/GOPIKA-S-S/RL/blob/main/2348518_lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install numpy




In [4]:
import numpy as np
import matplotlib.pyplot as plt
import random

# Set up a simple grid world
class GridWorld:
    def __init__(self, width, height, start, goal):
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.agent_pos = start

    def reset(self):
        self.agent_pos = self.start
        return self.agent_pos

    def step(self, action):
        if action == 0:  # Up
            self.agent_pos = (max(self.agent_pos[0] - 1, 0), self.agent_pos[1])
        elif action == 1:  # Down
            self.agent_pos = (min(self.agent_pos[0] + 1, self.height - 1), self.agent_pos[1])
        elif action == 2:  # Left
            self.agent_pos = (self.agent_pos[0], max(self.agent_pos[1] - 1, 0))
        elif action == 3:  # Right
            self.agent_pos = (self.agent_pos[0], min(self.agent_pos[1] + 1, self.width - 1))

        reward = 1 if self.agent_pos == self.goal else 0
        return self.agent_pos, reward


class MonteCarloControl:
    def __init__(self, env, n_episodes=10000, discount_factor=0.9, epsilon=0.1):
        self.env = env
        self.n_episodes = n_episodes
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.q_values = np.zeros((env.height, env.width, 4))  # Q-values for each state-action pair
        self.policy = np.zeros((env.height, env.width), dtype=int)  # Best action for each state
        self.policy_stable = False

    def get_action(self, state):
        if random.random() < self.epsilon:
            return random.randint(0, 3)  # Explore: random action
        return self.policy[state]  # Exploit: best action according to policy

    def generate_episode(self):
        state = self.env.reset()
        episode = []
        done = False

        while not done:
            action = self.get_action(state)
            new_state, reward = self.env.step(action)
            episode.append((state, action, reward))
            if new_state == self.env.goal:
                done = True
            state = new_state

        return episode

    def update_q_values(self, episode):
        G = 0
        visited = np.zeros((self.env.height, self.env.width, 4))  # To keep track of visited state-action pairs

        for state, action, reward in reversed(episode):
            G = self.discount_factor * G + reward
            if visited[state[0], state[1], action] == 0:
                self.q_values[state[0], state[1], action] += G
                visited[state[0], state[1], action] += 1

        # Update policy
        for s in range(self.env.height):
            for t in range(self.env.width):
                best_action = np.argmax(self.q_values[s, t])
                self.policy[s, t] = best_action

    def learn(self):
        for episode in range(self.n_episodes):
            episode_data = self.generate_episode()
            self.update_q_values(episode_data)

            if (episode + 1) % 1000 == 0:
                print(f"Episode {episode + 1}: Policy updated.")

# Main Execution
if __name__ == "__main__":
    env = GridWorld(width=4, height=4, start=(0, 0), goal=(3, 3))
    mc_control = MonteCarloControl(env)
    mc_control.learn()

    # Display the learned policy
    print("Learned Policy (0=Up, 1=Down, 2=Left, 3=Right):")
    print(mc_control.policy)


Episode 1000: Policy updated.
Episode 2000: Policy updated.
Episode 3000: Policy updated.
Episode 4000: Policy updated.
Episode 5000: Policy updated.
Episode 6000: Policy updated.
Episode 7000: Policy updated.
Episode 8000: Policy updated.
Episode 9000: Policy updated.
Episode 10000: Policy updated.
Learned Policy (0=Up, 1=Down, 2=Left, 3=Right):
[[3 3 3 1]
 [0 0 3 1]
 [0 0 0 1]
 [0 0 0 0]]
