## Comparison of Q-learning and SARSA

In [None]:
%pip install gym numpy

**Implementation of Q-learning with an epsilon-greedy policy**

In [None]:



import numpy as np
import gym
import random

# Reproducibility: the epsilon-greedy policy draws from NumPy's global RNG, so seed it
# once before training. Without this every Restart & Run All gives a different Q-table.
SEED = 42
np.random.seed(SEED)

# 4x4 FrozenLake without slipping; render_mode="human" asks gym to draw the
# environment while it runs.
env = gym.make("FrozenLake-v1", map_name="4x4", render_mode="human", is_slippery=False)

# NOTE(review): nothing in this notebook reads `env.rewards` back, and the training loop
# uses the reward returned by `env.step`. Presumably these two assignments do not change
# the rewards the environment emits -- confirm whether per-tile reward shaping was intended.
env.reward_range = (-1, 1)
env.rewards = {'S': -0.8, 'F': -0.1, 'H': -1, 'G': 1}

# Q-table initialization: one row per state, one column per action, all zeros
action_size = env.action_space.n
state_size = env.observation_space.n
qtable = np.zeros((state_size, action_size))

# Training parameters
total_episodes = 100        # Total episodes
max_steps = 99              # Max steps per episode
learning_rate = 0.8         # Learning rate (step size of each Q-value update)
gamma = 0.95                # Discounting rate

# Exploration-exploitation parameters
epsilon = 1.0               # Current exploration rate (decayed after every episode)
max_epsilon = 1.0           # Initial exploration probability
min_epsilon = 0.01          # Minimum exploration probability
decay_rate = 0.005          # Exponential decay rate for exploration probability

# List to store the total reward collected in each episode
rewards = []


def epsilon_greedy_policy(state, epsilon):
    """Pick an action for `state` with an epsilon-greedy rule over `qtable`.

    Args:
        state: Row index into the module-level `qtable`.
        epsilon: Exploration probability; compared against one uniform draw
            from [0, 1).

    Returns:
        An action index: a uniformly random one out of `action_size` when
        exploring, otherwise the argmax of the Q-values stored for `state`.
    """
    if np.random.rand() < epsilon:
        # Explore: choose a random action
        return np.random.choice(action_size)
    else:
        # Exploit: choose the action with the highest Q-value.
        # The table is called `qtable` in this notebook; the name `Q` is never
        # defined and raised NameError the first time this branch ran.
        return np.argmax(qtable[state, :])

# Q-learning training loop with an epsilon-greedy behaviour policy
for episode in range(total_episodes):
    # reset() may hand back a bare observation or an (observation, info) tuple
    # depending on the gym version; keep only the observation. Done once per
    # episode instead of being re-checked on every step.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_rewards = 0

    for step in range(max_steps):
        # Exploration-exploitation trade-off
        action = epsilon_greedy_policy(state, epsilon)

        # Take the chosen action and observe the next state and reward.
        # Both end-of-episode flags are kept so a time-limit truncation also
        # stops the episode instead of being silently discarded.
        new_state, reward, terminated, truncated, _ = env.step(action)

        # Q-learning (off-policy) update: bootstrap from the best action value
        # of the next state, regardless of which action will actually be taken.
        qtable[state, action] = qtable[state, action] + learning_rate * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action]
        )

        total_rewards += reward
        state = new_state

        # No explicit env.render() call: with render_mode="human" gym already
        # draws the frame inside step(), so a second call only repeats the
        # per-frame delay.

        if terminated or truncated:
            break

    # Reduce epsilon over time (exponential decay towards min_epsilon)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    # Store the total rewards for analysis
    rewards.append(total_rewards)

# Release the render window / resources held by the environment
env.close()

# Display the average reward over episodes
print("Average reward over episodes:", sum(rewards) / total_episodes)

# Display the learned Q-table
print("Q-table:")
print(qtable)


**Implementation of SARSA**

In [None]:
import numpy as np
import gym

# Reproducibility: the epsilon-greedy policy draws from NumPy's global RNG, so seed it
# once before training. Using a fixed seed here also keeps the SARSA run comparable
# from one execution of the notebook to the next.
SEED = 42
np.random.seed(SEED)

# 4x4 FrozenLake without slipping; render_mode="human" asks gym to draw the
# environment while it runs.
env = gym.make("FrozenLake-v1", map_name="4x4", render_mode="human", is_slippery=False)

# NOTE(review): nothing in this notebook reads `env.rewards` back, and the training loop
# uses the reward returned by `env.step`. Presumably these two assignments do not change
# the rewards the environment emits -- confirm whether per-tile reward shaping was intended.
env.reward_range = (-1, 1)
env.rewards = {'S': -0.8, 'F': -0.1, 'H': -1, 'G': 1}

# Q-table initialization: one row per state, one column per action, all zeros
action_size = env.action_space.n
state_size = env.observation_space.n
qtable = np.zeros((state_size, action_size))

# Training parameters
total_episodes = 100  # Total episodes
max_steps = 99  # Max steps per episode
learning_rate = 0.8  # Learning rate (step size of each Q-value update)
gamma = 0.95  # Discounting rate

# Exploration-exploitation parameters
epsilon = 1.0  # Current exploration rate (decayed after every episode)
max_epsilon = 1.0  # Initial exploration probability
min_epsilon = 0.01  # Minimum exploration probability
decay_rate = 0.005  # Exponential decay rate for exploration probability

# List to store the total reward collected in each episode
rewards = []


def epsilon_greedy_policy(state, epsilon):
    """Select an action for `state`, exploring with probability `epsilon`.

    One uniform sample from [0, 1) is drawn first. When it falls below
    `epsilon` a random index in `range(action_size)` is returned; otherwise
    the index of the largest entry in the `qtable` row for `state`.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Greedy branch: best-known action for this state
        return qtable[state, :].argmax()
    # Exploratory branch: uniformly random action
    return np.random.choice(action_size)


# SARSA training loop (on-policy TD control)
for episode in range(total_episodes):
    # reset() may hand back a bare observation or an (observation, info) tuple
    # depending on the gym version; keep only the observation.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_rewards = 0

    # Choose the first action using the epsilon-greedy policy
    action = epsilon_greedy_policy(state, epsilon)

    for step in range(max_steps):
        # Take the chosen action and observe the next state and reward.
        # Both end-of-episode flags are kept so a time-limit truncation also
        # stops the episode instead of being silently discarded.
        new_state, reward, terminated, truncated, _ = env.step(action)

        # Choose the next action using epsilon-greedy policy for the new state
        new_action = epsilon_greedy_policy(new_state, epsilon)

        # SARSA (on-policy) update: bootstrap from the value of the action that
        # will actually be executed next, not from the greedy maximum.
        qtable[state, action] = qtable[state, action] + learning_rate * (
            reward + gamma * qtable[new_state, new_action] - qtable[state, action]
        )

        total_rewards += reward
        state = new_state
        action = new_action

        # No explicit env.render() call: with render_mode="human" gym already
        # draws the frame inside step(), so a second call only repeats the
        # per-frame delay.

        if terminated or truncated:
            break

    # Reduce epsilon over time (exponential decay towards min_epsilon)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    # Store the total rewards for analysis
    rewards.append(total_rewards)

# Release the render window / resources held by the environment
env.close()

# Display the average reward over episodes
print("Average reward over episodes:", sum(rewards) / total_episodes)

# Display the learned Q-table
print("Q-table:")
print(qtable)
