# Artificial Intelligence Course - Fall 1402
## Computer Assignment #2 - Reinforcement Learning

# Table of Contents

- [Part 1: Value Iteration & Policy Iteration Algorithms](#1)
    - [َQuestion 1:](#1-0)
    - [َQuestion 2:](#1-1)
    - [َQuestion 3:](#1-12)
    - [َQuestion 4:](#1-2)
    - [َQuestion 5:](#1-3)
        - [Value Iteration](#1-3-1)
        - [Policy Iteration](#1-3-2)
    - [َQuestion 6:](#1-4)
        - [Value Iteration](#1-4-1)
        - [Policy Iteration](#1-4-2)
- [Part 2: Q-Learning Algorithm](#2)
    - [َQuestion 8:](#2-1)
    - [َQuestion 9:](#2-2)
    - [َQuestion 10:](#2-3)

In [1]:
# import
import numpy as np
import gymnasium as gym
from time import sleep
from typing import *

<a name='1'></a>
## Part 1: Value Iteration & Policy Iteration Algorithms

In [2]:
def render(env: gym.Env):
    env.render()
    sleep(0.1)

<a name='1-0'></a>
### Question 1:

<a name='1-1'></a>
### Question 2:

In [3]:
class ValueIteration:
    def __init__(self: Self, env: gym.Env, discount_factor: float, theta:float=1e-8):
        self.env = env
        self.discount_factor = discount_factor
        self.theta = theta
        self.state_values = np.zeros(self.env.observation_space.n)
        self.q_values = np.zeros((self.env.observation_space.n, 
                    self.env.action_space.n))

    def value_estimation(self: Self):
        env, state_values, q_values, discount_factor, theta = \
                self.env, \
                self.state_values, \
                self.q_values, \
                self.discount_factor, \
                self.theta
            
        delta = np.inf

        while(delta > theta):
            delta = 0

            for state in range(env.observation_space.n - 1):
                previous_state_value = state_values[state]

                for action in range(env.action_space.n):
                    action_value = 0
                    for probability, next_state, reward, _ in env.unwrapped.P[state][action]:
                        action_value += probability * \
                                (reward + \
                                discount_factor * state_values[next_state])
                    q_values[state, action] = action_value

                state_values[state] = np.max(q_values[state,:])

                delta = np.max([delta, abs(previous_state_value - state_values[state])])

            if (delta < theta):
                break

    def take_action(self: Self, action: Any):
        next_state, reward, terminated, _, _ = self.env.step(action)
        return next_state, reward, terminated

    def get_optimal_policy(self: Self, state: Any):
        return np.argmax(self.q_values[state,:])

    def get_state_values(self: Self):
        return self.state_values

    def get_q_values(self: Self):
        return self.q_values

    def reset(self: Self, *args, **kwargs):
        initial_state, _ = self.env.reset(*args, **kwargs)
        return initial_state, False

In [4]:
discount_factor = 0.9
env = gym.make('FrozenLake-v1', 
               desc=None, 
               map_name="4x4", 
               is_slippery=True,
               render_mode="human")
value_iteration = ValueIteration(env, discount_factor, 0.001)
value_iteration.value_estimation()

# print state_values and optimal_policy
action_char = ['L', 'B', 'R', 'T']
state_values = value_iteration.get_state_values().reshape(4, 4)
optimal_policy = np.array([action_char[value_iteration.get_optimal_policy(state)] for state in range(16)]).reshape(4, 4)
print("State Utilities:\n", state_values)
print("Optimal Policy:\n", optimal_policy)

state, terminated = value_iteration.reset(seed=573)
total_reward = 0
move_index = 0
try:
    while not terminated:
        render(env)
        action = value_iteration.get_optimal_policy(state)
        state, reward, terminated = value_iteration.take_action(action)
        total_reward = total_reward + (discount_factor ** move_index) * reward
        move_index += 1
    render(env)
finally:
    print(f"Total Reward = {total_reward}")
    env.close()

State Utilities:
 [[0.06428821 0.05807365 0.07231299 0.05356057]
 [0.08830336 0.         0.11127288 0.        ]
 [0.14298808 0.24613328 0.29877497 0.        ]
 [0.         0.37905097 0.63860174 0.        ]]
Optimal Policy:
 [['L' 'T' 'L' 'T']
 ['L' 'L' 'L' 'L']
 ['T' 'B' 'L' 'L']
 ['L' 'R' 'B' 'L']]


Total Reward = 0.05233476330273609


<a name='1-12'></a>
### Question 3:

<a name='1-2'></a>
### Question 4:

In [5]:
class PolicyIteration():
    def __init__(self, env, discount_factor, theta=1e-8):
        self.env = env
        self.discount_factor = discount_factor
        self.theta = theta
        self.reset()
        self.state_values = np.ones((self.env.observation_space.n)) / self.env.action_space.n
        self.q_values = np.ones((self.env.observation_space.n, self.env.action_space.n)) / self.env.action_space.n
        self.state_values[self.env.observation_space.n - 1] = 0
        self.q_values[self.env.observation_space.n - 1] = np.zeros((self.env.action_space.n))
        self.policy = np.random.randint(self.env.action_space.n, size=self.env.observation_space.n) # initial policy
        self.policy_stable = False

    def policy_evaluation(self):
        self.delta = np.inf

        while(self.delta >= self.theta):

            self.delta = 0

            for state in range(self.env.observation_space.n):

                v = self.state_values[state]

                new_state_value = 0
                for probability, next_state, reward, done in self.env.P[state][self.policy[state]]:
                    ### START CODE HERE ###
                    new_state_value += ...
                    ### END CODE HERE ###
                self.state_values[state] = new_state_value

                self.delta = np.max([self.delta, abs(v - self.state_values[state])])

    def policy_improvement(self):
        self.policy_stable = True

        for state in range(self.env.observation_space.n):
            old_policy = self.policy[state]

            for action in range(self.env.action_space.n):

                action_value = 0
                for probability, next_state, reward, done in self.env.P[state][action]:
                    ### START CODE HERE ###
                    action_value += ...
                    ### END CODE HERE ###
                self.q_values[state, action] = action_value

            self.policy[state] = np.argmax(self.q_values[state,:])

            if old_policy != self.policy[state]:
                self.policy_stable = False

    def policy_estimation(self):
        self.policy_stable = False

        while not self.policy_stable:
            self.policy_evaluation()
            self.policy_improvement()

    def take_action(self, action):
        next_state, reward, done, _ = self.env.step(action)
        return next_state, reward, done

    def get_optimal_policy(self, state):
        return self.policy[state]

    def get_state_values(self):
        return self.state_values

    def get_q_values(self):
        return self.q_values

    def reset(self):
        initial_state = self.env.reset()
        return initial_state

<a name='1-3'></a>
### Question 5:

<a name='1-3-1'></a>
#### Value Iteration:

<a name='1-3-2'></a>
#### Policy Iteration:

<a name='1-4'></a>
### Question 6:

<a name='1-4-1'></a>
#### Value Iteration:

<a name='1-4-2'></a>
#### Policy Iteration:

<a name='2'></a>
## Part 2: Q-Learning Algorithm

In [6]:
# hyperparameters
REPS = 20
EPISODES = 2000
EPSILON = 0.1
LEARNING_RATE = 0.1
DISCOUNT = 0.9
STUDENT_NUM = 123

In [7]:
# environment
env = gym.make('Taxi-v3')
env.seed(seed = STUDENT_NUM)
Initial_State = env.reset()
Initial_State

  logger.warn(


AttributeError: 'TaxiEnv' object has no attribute 'seed'

In [None]:
taxi_row, taxi_col, pass_idx, dest_idx = env.decode(Initial_State)
taxi_row, taxi_col, pass_idx, dest_idx

In [None]:
# get familiar with the environment
print("you can see the environment in each step by render command :")
env.render()

In [None]:
# Total no. of states
env.observation_space.n

In [None]:
# Total no. of actions
env.action_space.n

In [None]:
# base code for Q-learning

env = gym.make('Taxi-v3')
env.seed(seed = STUDENT_NUM)


for rep in range(REPS):
    agent = # Agent Object instance from Algorithm_name(e.g Q_learning_agent) class which has inherited from Agentbase.
    for episode in range(EPISODES):
        Initial_state = env.reset()

        for ... :

            bestAction = np.random.choice(ACTIONS)

            next_state,rew,done,_ = environment.step(bestAction)

            if done:
                break

<a name='2-1'></a>
### Question 8:

In [None]:
class QLearningAgent():
    def __init__(self, env, epsilon, learning_rate, discount_factor, seed):
      self.env = env
      self.epsilon = epsilon
      self.learning_rate = learning_rate
      self.olr = learning_rate
      self.discount_factor = discount_factor
      self.q_table = np.zeros((env.observation_space.n, env.action_space.n))
      self.seed = seed

    def choose_action(self, state):
      ### START CODE HERE ###
      # With probability epsilon, choose a random action

      # Otherwise, choose the action with the highest Q-value

      ### END CODE HERE ###
      return action

    def update_q_table(self, state, action, nextState, reward):
      ### START CODE HERE ###
      # Calculate the new Q-value using the Q-learning formula
      self.q_table[state][action] = ...
      ### END CODE HERE ###

    def decay_epsilon(self, episode):
      ### START CODE HERE ###
      self.epsilon = ...
      ### END CODE HERE ###

    def decrease_learning_rate(self, episode):
      ### START CODE HERE ###
      self.learning_rate = ...
      ### END CODE HERE ###

    def take_action(self, action):
      next_state, reward, done, _ = self.env.step(action)
      return next_state, reward, done

    def get_optimal_policy(self, state):
      return np.argmax(self.q_table[state])

    def get_q_values(self):
      return self.q_table

    def reset(self):
      # self.learning_rate = self.olr
      return self.env.reset(seed=self.seed)

<a name='2-2'></a>
### Question 9:

<a name='2-3'></a>
### Question 10: