In [15]:
import numpy as np 
  
class Agent: 

    def choose_action(self, state):  
        action = 0
        if np.random.uniform(0, 1) < self.epsilon:  
            action = self.action_space.sample() 
        else: 
            action = np.argmax(self.Q[state, :])  
        return action 

In [17]:
class SarsaAgent(Agent):
    
    def __init__(self,epislon,alpha,gamma,num_state,num_actions,action_space):
        self.epsilon = epsilon
        self.alpha = alpha 
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions
        
        self.Q = np.zeros((num_state,num_actions))
        self.action_space = action_space
        
    def update(self,prev_state,prev_action,reward,next_state,next_action):
        predict = self.Q[prev_state,prev_action]
        target = reward + self.gamma * self.Q[next_state, next_action] 
        error = target-predict
        self.Q[prev_state,next_state] += alpha*error

In [18]:
class QLearningAgent(Agent):
    def __init__(self,epislon,alpha,gamma,num_state,num_actions,action_space):
        self.epsilon = epsilon
        self.alpha = alpha 
        self.gamma = gamma
        self.num_state = num_state
        self.num_actions = num_actions
        
        self.Q = np.zeros((num_state,num_actions))
        self.action_space = action_space
        
    def update(self,prev_state,prev_action,reward,next_state,next_action):
        predict = self.Q[prev_state,prev_action]
        target = reward + self.gamma * np.max(self.Q[next_state,:])
        error = target-predict
        self.Q[prev_state,next_state] += alpha*error

In [21]:
class ExpectedSarsaAgent(Agent): 
    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space): 
        """ 
        Contructor 
        Args: 
            epsilon: The degree of exploration 
            gamma: The discount factor 
            num_state: The number of states 
            num_actions: The number of actions 
            action_space: To call the random action 
        """
        self.epsilon = epsilon 
        self.alpha = alpha 
        self.gamma = gamma 
        self.num_state = num_state 
        self.num_actions = num_actions 
  
        self.Q = np.zeros((self.num_state, self.num_actions)) 
        self.action_space = action_space 
    def update(self, prev_state, next_state, reward, prev_action, next_action): 
        """ 
        Update the action value function using the Expected SARSA update. 
        Q(S, A) = Q(S, A) + alpha(reward + (pi * Q(S_, A_) - Q(S, A)) 
        Args: 
            prev_state: The previous state 
            next_state: The next state 
            reward: The reward for taking the respective action 
            prev_action: The previous action 
            next_action: The next action 
        Returns: 
            None 
        """
        predict = self.Q[prev_state, prev_action] 
  
        expected_q = 0
        q_max = np.max(self.Q[next_state, :]) 
        greedy_actions = 0
        for i in range(self.num_actions): 
            if self.Q[next_state][i] == q_max: 
                greedy_actions += 1
      
        non_greedy_action_probability = self.epsilon / self.num_actions 
        greedy_action_probability = ((1 - self.epsilon) / greedy_actions) + non_greedy_action_probability 
  
        for i in range(self.num_actions): 
            if self.Q[next_state][i] == q_max: 
                expected_q += self.Q[next_state][i] * greedy_action_probability 
            else: 
                expected_q += self.Q[next_state][i] * non_greedy_action_probability 
  
        target = reward + self.gamma * expected_q 
        self.Q[prev_state, prev_action] += self.alpha * (target - predict) 

In [22]:
import numpy as np 
  
class ExpectedSarsaAgent(Agent): 
    def __init__(self, epsilon, alpha, gamma, num_state, num_actions, action_space): 
        """ 
        Contructor 
        Args: 
            epsilon: The degree of exploration 
            gamma: The discount factor 
            num_state: The number of states 
            num_actions: The number of actions 
            action_space: To call the random action 
        """
        self.epsilon = epsilon 
        self.alpha = alpha 
        self.gamma = gamma 
        self.num_state = num_state 
        self.num_actions = num_actions 
  
        self.Q = np.zeros((self.num_state, self.num_actions)) 
        self.action_space = action_space 
    def update(self, prev_state, next_state, reward, prev_action, next_action): 
        """ 
        Update the action value function using the Expected SARSA update. 
        Q(S, A) = Q(S, A) + alpha(reward + (pi * Q(S_, A_) - Q(S, A)) 
        Args: 
            prev_state: The previous state 
            next_state: The next state 
            reward: The reward for taking the respective action 
            prev_action: The previous action 
            next_action: The next action 
        Returns: 
            None 
        """
        predict = self.Q[prev_state, prev_action] 
  
        expected_q = 0
        q_max = np.max(self.Q[next_state, :]) 
        greedy_actions = 0
        for i in range(self.num_actions): 
            if self.Q[next_state][i] == q_max: 
                greedy_actions += 1
      
        non_greedy_action_probability = self.epsilon / self.num_actions 
        greedy_action_probability = ((1 - self.epsilon) / greedy_actions) + non_greedy_action_probability 
  
        for i in range(self.num_actions): 
            if self.Q[next_state][i] == q_max: 
                expected_q += self.Q[next_state][i] * greedy_action_probability 
            else: 
                expected_q += self.Q[next_state][i] * non_greedy_action_probability 
  
        target = reward + self.gamma * expected_q 
        self.Q[prev_state, prev_action] += self.alpha * (target - predict) 

In [24]:
import gym 
import numpy as np 
  
from matplotlib import pyplot as plt 
  
# Using the gym library to create the environment 
env = gym.make('CliffWalking-v0') 
  
# Defining all the required parameters 
epsilon = 0.1
total_episodes = 500
max_steps = 100
alpha = 0.5
gamma = 1
""" 
    The two parameters below is used to calculate 
    the reward by each algorithm 
"""
episodeReward = 0
totalReward = { 
    'SarsaAgent': [], 
    'QLearningAgent': [], 
    'ExpectedSarsaAgent': [] 
} 
  
# Defining all the three agents 
expectedSarsaAgent = ExpectedSarsaAgent( 
    epsilon, alpha, gamma, env.observation_space.n,  
    env.action_space.n, env.action_space) 
qLearningAgent = QLearningAgent( 
    epsilon, alpha, gamma, env.observation_space.n,  
    env.action_space.n, env.action_space) 
sarsaAgent = SarsaAgent( 
    epsilon, alpha, gamma, env.observation_space.n,  
    env.action_space.n, env.action_space) 
  
# Now we run all the episodes and calculate the reward obtained by 
# each agent at the end of the episode 
  
agents = [expectedSarsaAgent, qLearningAgent, sarsaAgent] 
  
for agent in agents: 
    for _ in range(total_episodes): 
        # Initialize the necesary parameters before  
        # the start of the episode 
        t = 0
        state1 = env.reset()  
        action1 = agent.choose_action(state1)  
        episodeReward = 0
        while t < max_steps: 
  
            # Getting the next state, reward, and other parameters 
            state2, reward, done, info = env.step(action1)  
      
            # Choosing the next action  
            action2 = agent.choose_action(state2)  
              
            # Learning the Q-value  
            agent.update(state1, state2, reward, action1, action2)  
      
            state1 = state2  
            action1 = action2  
              
            # Updating the respective vaLues  
            t += 1
            episodeReward += reward 
              
            # If at the end of learning process  
            if done:  
                break
        # Append the sum of reward at the end of the episode 
        totalReward[type(agent).__name__].append(episodeReward) 
env.close() 
  
# Calculate the mean of sum of returns for each episode 
meanReturn = { 
    'SARSA-Agent': np.mean(totalReward['SarsaAgent']), 
    'Q-Learning-Agent': np.mean(totalReward['QLearningAgent']), 
    'Expected-SARSA-Agent': np.mean(totalReward['ExpectedSarsaAgent']) 
} 
  
# Print the results 
print(f"SARSA Average Sum of Reward: {meanReturn['SARSA-Agent']}") 
print(f"Q-Learning Average Sum of Return: {meanReturn['Q-Learning-Agent']}") 
print(f"Expected Sarsa Average Sum of Return: {meanReturn['Expected-SARSA-Agent']}")

IndexError: index 24 is out of bounds for axis 1 with size 4