In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gymnasium as gym
from PIL import Image

In [None]:
class QLearning:
    
    """
    Tabular Q-learning agent for gymnasium environments.

    Q-values and visit counts are stored in dictionaries keyed by the raw observation, so any
    hashable observation (an int, a tuple of ints, ...) can be used as a state. Entries are
    created lazily the first time a state takes part in an update.
    """
    
    def __init__(self, env, gamma=0.9):
        """
        Description
        --------------
        Constructor of class QLearning.
        
        Arguments
        --------------
        env          : gymnasium environment.
        gamma        : Float in [0, 1] generally close to 1, discount factor.

        Attributes
        --------------
        n_states     : Int or None, `env.observation_space.n` when the space exposes it.
        n_actions    : Int or None, `env.action_space.n` when the space exposes it.
        q_values     : Dict mapping a state to an np.array of shape (n_actions,), state-action values.
        visits       : Dict mapping a state to an np.array of shape (n_actions,), update counts.
        """
        
        self.env = env
        self.gamma = gamma
        # Spaces without an `n` attribute (or environments without the space) leave the count at None,
        # in which case a subclass is expected to set it.
        self.n_states = getattr(getattr(env, 'observation_space', None), 'n', None)
        self.n_actions = getattr(getattr(env, 'action_space', None), 'n', None)
        # Create the (empty) tables so the agent is usable right after construction.
        self.reset()

    def reset(self):
        """
        Description
        --------------
        Reinitialize the state-action value table and the visit counts to empty dictionaries.

        Arguments
        --------------

        Returns
        --------------
        """

        self.q_values, self.visits = {}, {}

    def _ensure_state(self, state):
        """
        Description
        --------------
        Create zero-initialised q-values and visit counts for state if it has no entry yet.

        Arguments
        --------------
        state : Hashable, state.

        Returns
        --------------
        """

        if state not in self.q_values:
            self.q_values[state] = np.zeros(self.n_actions)
            self.visits[state] = np.zeros(self.n_actions)
            
    def action_explore(self, state, epsilon):
        """
        Description
        --------------
        Take an action according to an epsilon-greedy policy.
        
        Arguments
        --------------
        state   : Hashable, state.
        epsilon : Float in [0, 1], probability of sampling an action from the action space
                  instead of taking the greedy one.
        
        Returns
        --------------
        Int, action to perform.
        """

        # Bernoulli draw with success probability 1 - epsilon: success means "act greedily".
        if np.random.binomial(1, 1 - epsilon) == 1:
            return self.action(state)
        
        return self.env.action_space.sample()
        
    def action(self, state):
        """
        Description
        --------------
        Take an action according to the estimated optimal policy.
        
        Arguments
        --------------
        state : Hashable, state.
        
        Returns
        --------------
        Int, estimated optimal action, or an action sampled from the action space when the
        state has no q-values yet.
        """

        q_row = self.q_values.get(state)
        if q_row is None:
            return self.env.action_space.sample()

        return q_row.argmax()
        
    def update_q_value(self, state, action, reward, next_state, alpha, terminated=False):
        """
        Description
        --------------
        Perform a q-learning update of experience (state, action, reward, next_state).
        
        Arguments
        --------------
        state      : Hashable, state in which the action was taken.
        action     : Int, action taken.
        reward     : Float, reward received.
        next_state : Hashable, state reached after the action.
        alpha      : Float or None, learning rate. If None, 1/(number of updates of (state, action)) is used.
        terminated : Boolean, True when the transition ended the episode in a terminal state. The target
                     is then the reward alone (no bootstrapping). Defaults to False.
        
        Returns
        --------------
        """

        self._ensure_state(next_state)
        self._ensure_state(state)

        # A terminal state has no future return. This matters when the terminal observation
        # coincides with a regular state (e.g. the observation returned after the last action
        # equals the current one): bootstrapping there would feed the state's own value back
        # into its target.
        q_max = 0.0 if terminated else self.q_values[next_state].max()
        td = reward + self.gamma*q_max - self.q_values[state][action]
            
        self.visits[state][action] += 1
        if alpha is None:
            alpha = 1/(self.visits[state][action])

        self.q_values[state][action] += alpha*td
        
    def unroll(self, alpha, epsilon):
        """
        Description
        --------------
        Run one episode of self.env with the current epsilon-greedy policy and update the
        q-values at each step.
        
        Arguments
        --------------
        alpha   : Float or None, learning rate (see update_q_value).
        epsilon : Float in [0, 1], parameter of the epsilon-greedy policy.
        
        Returns
        --------------
        """

        state, _ = self.env.reset()
        done = False
        while not done:
            action = self.action_explore(state, epsilon)
            next_state, reward, terminated, truncated, _ = self.env.step(action)
            done = (terminated or truncated)
            # Only a true termination cuts the bootstrap; a truncated episode still bootstraps.
            self.update_q_value(state, action, reward, next_state, alpha, terminated=terminated)
            state = next_state
            
    def train(self, alpha=0.1, epsilon_start=1, epsilon_stop=0.1, decay_rate=1e-3, n_train=1000, print_iter=10):
        """
        Description
        --------------
        Train the agent with q-learning for n_train episodes, decaying epsilon exponentially
        from epsilon_start towards epsilon_stop.
        
        Arguments
        --------------
        alpha         : Float or None, learning rate (see update_q_value).
        epsilon_start : Float in [0, 1], initial value of epsilon.
        epsilon_stop  : Float in [0, 1], asymptotic value of epsilon.
        decay_rate    : Float, decay rate of epsilon from epsilon_start to epsilon_stop.
        n_train       : Int, total number of training episodes.
        print_iter    : Int, number of episodes between two successive prints.
        
        Returns
        --------------
        """

        for i in range(n_train):
            epsilon = epsilon_stop + (epsilon_start - epsilon_stop)*np.exp(-decay_rate*i)
            self.unroll(alpha, epsilon)
            if i%print_iter == 0:
                print('Iteration : %d' %i)
                print('Epsilon   : %.5f' %epsilon)
                print('\n')
            
    def test(self, env, n_episodes=1000, verbose=False):
        """
        Description
        --------------
        Run the greedy policy for n_episodes episodes and report the undiscounted returns.
        
        Arguments
        --------------
        env        : gymnasium environment.
        n_episodes : Int, number of test episodes.
        verbose    : Boolean, if True, print the episode index and its corresponding length and return.
        
        Returns
        --------------
        Tuple (return_avg, return_std), mean and standard deviation of the episode returns.
        """
        
        returns = np.empty(n_episodes)
        for episode in range(n_episodes):
            state, _ = env.reset()
            done = False
            R = 0
            n_steps = 0
            while not done:
                action = self.action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = (terminated or truncated)
                state = next_state
                R += reward
                n_steps += 1
                
            returns[episode] = R
            if verbose:
                print('Episode : %d, length : %d, return : %.3F' %(episode, n_steps, R))

        return_avg, return_std = returns.mean(), returns.std()
        print('avg : %.3f, std : %.3f' %(return_avg, return_std))
        return return_avg, return_std
            
    def save_gif(self, env, file_name, n_episodes=1):
        """
        Description
        --------------
        Run the greedy policy and save the rendered frames as an animated gif.
        
        Arguments
        --------------
        env        : gymnasium environment whose render() returns an RGB array
                     (presumably created with render_mode='rgb_array').
        file_name  : String, path to the saved gif.
        n_episodes : Int, number of episodes to record.
        
        Returns
        --------------

        Raises
        --------------
        ValueError if render() returns no frame or if no frame was collected.
        """

        def grab_frame():
            # Fail with an explicit message instead of an opaque PIL error on a None frame.
            frame = env.render()
            if frame is None:
                raise ValueError("env.render() returned no frame; create the environment with render_mode='rgb_array'")
            return Image.fromarray(frame, mode='RGB')
        
        frames = []
        for _ in range(n_episodes):
            state, _ = env.reset()
            done = False
            while not done:
                frames.append(grab_frame())
                action = self.action(state)
                state, _, terminated, truncated, _ = env.step(action)
                done = (terminated or truncated)

            # Capture the final frame of the episode as well.
            frames.append(grab_frame())

        if not frames:
            raise ValueError('n_episodes must be at least 1 to save a gif')
            
        frames[0].save(file_name, save_all=True, append_images=frames[1:], optimize=False, duration=150, loop=0)

In [3]:
class FrozenLakeQLearning(QLearning):
    """
    Description
    --------------
    Q-learning agent operating in the FrozenLake environment.
    """
    
    def __init__(self, env, gamma=0.9):
        """
        Description
        --------------
        Constructor of class FrozenLakeQLearning.
        
        Arguments
        --------------
        env          : FrozenLake environment whose observation and action spaces expose `n`.
        gamma        : Float in [0, 1] generally close to 1, discount factor.

        Attributes
        --------------
        n_states     : Int, the number of states, read from env.observation_space.n.
        n_actions    : Int, the number of actions, read from env.action_space.n.
        q_values     : Dict of state-action values, emptied by reset().
        """
        
        super().__init__(env, gamma)
        self.n_actions = env.action_space.n
        self.n_states = env.observation_space.n
        # Start from empty q-value and visit tables.
        self.reset()

class CliffWalkingQLearning(QLearning):
    """
    Description
    --------------
    Q-learning agent operating in the CliffWalking environment.
    """
    
    def __init__(self, env, gamma=0.9):
        """
        Description
        --------------
        Constructor of class CliffWalkingQLearning.
        
        Arguments
        --------------
        env          : CliffWalking environment whose observation and action spaces expose `n`.
        gamma        : Float in [0, 1] generally close to 1, discount factor.

        Attributes
        --------------
        n_states     : Int, the number of states, read from env.observation_space.n.
        n_actions    : Int, the number of actions, read from env.action_space.n.
        q_values     : Dict of state-action values, emptied by reset().
        """
        
        super().__init__(env, gamma)
        self.n_actions = env.action_space.n
        self.n_states = env.observation_space.n
        # Start from empty q-value and visit tables.
        self.reset()

class TaxiQLearning(QLearning):
    """
    Description
    --------------
    Q-learning agent operating in the Taxi environment.
    """
    
    def __init__(self, env, gamma=0.9):
        """
        Description
        --------------
        Constructor of class TaxiQLearning.
        
        Arguments
        --------------
        env          : Taxi environment whose observation and action spaces expose `n`.
        gamma        : Float in [0, 1] generally close to 1, discount factor.

        Attributes
        --------------
        n_states     : Int, the number of states, read from env.observation_space.n.
        n_actions    : Int, the number of actions, read from env.action_space.n.
        q_values     : Dict of state-action values, emptied by reset().
        """
        
        super().__init__(env, gamma)
        self.n_actions = env.action_space.n
        self.n_states = env.observation_space.n
        # Start from empty q-value and visit tables.
        self.reset()

class BlackJackQLearning(QLearning):
    """
    Description
    --------------
    Q-learning agent operating in the BlackJack environment.
    """
    
    def __init__(self, env, gamma=0.9):
        """
        Description
        --------------
        Constructor of class BlackJackQLearning.
        
        Arguments
        --------------
        env          : BlackJack environment whose action space exposes `n`.
        gamma        : Float in [0, 1] generally close to 1, discount factor.

        Attributes
        --------------
        n_actions    : Int, the number of actions, read from env.action_space.n.
        q_values     : Dict of state-action values, emptied by reset().

        The number of states is not set here; only the action count is read from the environment.
        """
        
        super().__init__(env, gamma)
        self.n_actions = env.action_space.n
        # Start from empty q-value and visit tables.
        self.reset()

## Frozen Lake

In [4]:
# FrozenLake configuration; these two names are reused by the rendering cell further down.
map_name, is_slippery = '8x8', False

# Build the training environment and learn the q-values with a slowly decaying epsilon.
env = gym.make('FrozenLake-v1', map_name=map_name, is_slippery=is_slippery)
agent = FrozenLakeQLearning(env, gamma=0.99)
agent.train(n_train=100000, alpha=0.1, decay_rate=1e-4, print_iter=1000)

Iteration : 0
Epsilon   : 1.00000


Iteration : 1000
Epsilon   : 0.91435


Iteration : 2000
Epsilon   : 0.83686


Iteration : 3000
Epsilon   : 0.76674


Iteration : 4000
Epsilon   : 0.70329


Iteration : 5000
Epsilon   : 0.64588


Iteration : 6000
Epsilon   : 0.59393


Iteration : 7000
Epsilon   : 0.54693


Iteration : 8000
Epsilon   : 0.50440


Iteration : 9000
Epsilon   : 0.46591


Iteration : 10000
Epsilon   : 0.43109


Iteration : 11000
Epsilon   : 0.39958


Iteration : 12000
Epsilon   : 0.37107


Iteration : 13000
Epsilon   : 0.34528


Iteration : 14000
Epsilon   : 0.32194


Iteration : 15000
Epsilon   : 0.30082


Iteration : 16000
Epsilon   : 0.28171


Iteration : 17000
Epsilon   : 0.26442


Iteration : 18000
Epsilon   : 0.24877


Iteration : 19000
Epsilon   : 0.23461


Iteration : 20000
Epsilon   : 0.22180


Iteration : 21000
Epsilon   : 0.21021


Iteration : 22000
Epsilon   : 0.19972


Iteration : 23000
Epsilon   : 0.19023


Iteration : 24000
Epsilon   : 0.18165


Iteration : 2

In [5]:
# Evaluate the greedy policy on the training environment; the returned
# (mean, std) of the episode returns is displayed as the cell output.
agent.test(env, n_episodes=100, verbose=True)

Episode : 0, length : 14, return : 1.000
Episode : 1, length : 14, return : 1.000
Episode : 2, length : 14, return : 1.000
Episode : 3, length : 14, return : 1.000
Episode : 4, length : 14, return : 1.000
Episode : 5, length : 14, return : 1.000
Episode : 6, length : 14, return : 1.000
Episode : 7, length : 14, return : 1.000
Episode : 8, length : 14, return : 1.000
Episode : 9, length : 14, return : 1.000
Episode : 10, length : 14, return : 1.000
Episode : 11, length : 14, return : 1.000
Episode : 12, length : 14, return : 1.000
Episode : 13, length : 14, return : 1.000
Episode : 14, length : 14, return : 1.000
Episode : 15, length : 14, return : 1.000
Episode : 16, length : 14, return : 1.000
Episode : 17, length : 14, return : 1.000
Episode : 18, length : 14, return : 1.000
Episode : 19, length : 14, return : 1.000
Episode : 20, length : 14, return : 1.000
Episode : 21, length : 14, return : 1.000
Episode : 22, length : 14, return : 1.000
Episode : 23, length : 14, return : 1.000
Ep

(1.0, 0.0)

In [6]:
# Replay a few greedy episodes in a window (render_mode='human').
env = gym.make('FrozenLake-v1', is_slippery=is_slippery, map_name=map_name, render_mode='human')
try:
    agent.test(env, n_episodes=5, verbose=True)
finally:
    # Release the rendering window even if the run fails or is interrupted.
    env.close()

2025-03-31 23:05:04.099 python[5297:395631] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-31 23:05:04.099 python[5297:395631] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Episode : 0, length : 14, return : 1.000
Episode : 1, length : 14, return : 1.000
Episode : 2, length : 14, return : 1.000
Episode : 3, length : 14, return : 1.000
Episode : 4, length : 14, return : 1.000
avg : 1.000, std : 0.000


## Cliff Walking

In [5]:
# Episodes are capped at 100 steps so that early, mostly random episodes end.
env = gym.make('CliffWalking-v0', max_episode_steps=100)
agent = CliffWalkingQLearning(env, gamma=0.99)
agent.train(n_train=100000, alpha=0.1, decay_rate=1e-4, print_iter=1000)

Iteration : 0
Epsilon   : 1.00000


Iteration : 1000
Epsilon   : 0.91435


Iteration : 2000
Epsilon   : 0.83686


Iteration : 3000
Epsilon   : 0.76674


Iteration : 4000
Epsilon   : 0.70329


Iteration : 5000
Epsilon   : 0.64588


Iteration : 6000
Epsilon   : 0.59393


Iteration : 7000
Epsilon   : 0.54693


Iteration : 8000
Epsilon   : 0.50440


Iteration : 9000
Epsilon   : 0.46591


Iteration : 10000
Epsilon   : 0.43109


Iteration : 11000
Epsilon   : 0.39958


Iteration : 12000
Epsilon   : 0.37107


Iteration : 13000
Epsilon   : 0.34528


Iteration : 14000
Epsilon   : 0.32194


Iteration : 15000
Epsilon   : 0.30082


Iteration : 16000
Epsilon   : 0.28171


Iteration : 17000
Epsilon   : 0.26442


Iteration : 18000
Epsilon   : 0.24877


Iteration : 19000
Epsilon   : 0.23461


Iteration : 20000
Epsilon   : 0.22180


Iteration : 21000
Epsilon   : 0.21021


Iteration : 22000
Epsilon   : 0.19972


Iteration : 23000
Epsilon   : 0.19023


Iteration : 24000
Epsilon   : 0.18165


Iteration : 2

In [6]:
# Evaluate the greedy policy on the training environment; the returned
# (mean, std) of the episode returns is displayed as the cell output.
agent.test(env, n_episodes=100, verbose=True)

Episode : 0, length : 13, return : -13.000
Episode : 1, length : 13, return : -13.000
Episode : 2, length : 13, return : -13.000
Episode : 3, length : 13, return : -13.000
Episode : 4, length : 13, return : -13.000
Episode : 5, length : 13, return : -13.000
Episode : 6, length : 13, return : -13.000
Episode : 7, length : 13, return : -13.000
Episode : 8, length : 13, return : -13.000
Episode : 9, length : 13, return : -13.000
Episode : 10, length : 13, return : -13.000
Episode : 11, length : 13, return : -13.000
Episode : 12, length : 13, return : -13.000
Episode : 13, length : 13, return : -13.000
Episode : 14, length : 13, return : -13.000
Episode : 15, length : 13, return : -13.000
Episode : 16, length : 13, return : -13.000
Episode : 17, length : 13, return : -13.000
Episode : 18, length : 13, return : -13.000
Episode : 19, length : 13, return : -13.000
Episode : 20, length : 13, return : -13.000
Episode : 21, length : 13, return : -13.000
Episode : 22, length : 13, return : -13.00

(-13.0, 0.0)

In [7]:
# Replay a few greedy episodes in a window (render_mode='human').
env = gym.make('CliffWalking-v0', max_episode_steps=100, render_mode='human')
try:
    agent.test(env, n_episodes=5, verbose=True)
finally:
    # Release the rendering window even if the run fails or is interrupted.
    env.close()

2025-03-31 22:43:08.077 python[4781:376453] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-31 22:43:08.077 python[4781:376453] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Episode : 0, length : 13, return : -13.000
Episode : 1, length : 13, return : -13.000
Episode : 2, length : 13, return : -13.000
Episode : 3, length : 13, return : -13.000
Episode : 4, length : 13, return : -13.000
avg : -13.000, std : 0.000


## BlackJack

In [None]:
# NOTE(review): the saved output of this cell reports progress every 10000 iterations, up to
# iteration 240000, with a slower epsilon decay than the arguments below would produce.
# Presumably it comes from an earlier run with different settings; re-run the cell to refresh it.
env = gym.make('Blackjack-v1', sab=True, natural=False)
agent = BlackJackQLearning(env, gamma=0.99)
agent.train(n_train=100000, alpha=0.1, decay_rate=1e-4, print_iter=1000)

Iteration : 0
Epsilon   : 1.00000


Iteration : 10000
Epsilon   : 0.91435


Iteration : 20000
Epsilon   : 0.83686


Iteration : 30000
Epsilon   : 0.76674


Iteration : 40000
Epsilon   : 0.70329


Iteration : 50000
Epsilon   : 0.64588


Iteration : 60000
Epsilon   : 0.59393


Iteration : 70000
Epsilon   : 0.54693


Iteration : 80000
Epsilon   : 0.50440


Iteration : 90000
Epsilon   : 0.46591


Iteration : 100000
Epsilon   : 0.43109


Iteration : 110000
Epsilon   : 0.39958


Iteration : 120000
Epsilon   : 0.37107


Iteration : 130000
Epsilon   : 0.34528


Iteration : 140000
Epsilon   : 0.32194


Iteration : 150000
Epsilon   : 0.30082


Iteration : 160000
Epsilon   : 0.28171


Iteration : 170000
Epsilon   : 0.26442


Iteration : 180000
Epsilon   : 0.24877


Iteration : 190000
Epsilon   : 0.23461


Iteration : 200000
Epsilon   : 0.22180


Iteration : 210000
Epsilon   : 0.21021


Iteration : 220000
Epsilon   : 0.19972


Iteration : 230000
Epsilon   : 0.19023


Iteration : 240000
Epsilon   :

In [6]:
# Evaluate the greedy policy on the training environment; the returned
# (mean, std) of the episode returns is displayed as the cell output.
# verbose=True prints one line per episode, i.e. 10000 lines here.
agent.test(env, n_episodes=10000, verbose=True)

Episode : 0, length : 1, return : 1.000
Episode : 1, length : 1, return : 0.000
Episode : 2, length : 1, return : 1.000
Episode : 3, length : 2, return : -1.000
Episode : 4, length : 2, return : 1.000
Episode : 5, length : 2, return : 1.000
Episode : 6, length : 1, return : -1.000
Episode : 7, length : 1, return : -1.000
Episode : 8, length : 3, return : 0.000
Episode : 9, length : 1, return : -1.000
Episode : 10, length : 2, return : 1.000
Episode : 11, length : 1, return : -1.000
Episode : 12, length : 1, return : -1.000
Episode : 13, length : 2, return : -1.000
Episode : 14, length : 1, return : -1.000
Episode : 15, length : 2, return : -1.000
Episode : 16, length : 2, return : 1.000
Episode : 17, length : 2, return : 1.000
Episode : 18, length : 1, return : -1.000
Episode : 19, length : 1, return : -1.000
Episode : 20, length : 1, return : -1.000
Episode : 21, length : 1, return : -1.000
Episode : 22, length : 1, return : -1.000
Episode : 23, length : 2, return : -1.000
Episode : 2

(-0.1517, 0.9538800291441268)

In [6]:
# Replay a few greedy episodes in a window (render_mode='human').
# Use the same rule set as the training environment (sab=True) so the policy
# is evaluated on the game it was learned on.
env = gym.make('Blackjack-v1', natural=False, sab=True, render_mode='human')
try:
    agent.test(env, n_episodes=10, verbose=True)
finally:
    # Release the rendering window even if the run fails or is interrupted.
    env.close()

2025-03-31 22:48:47.122 python[4954:383214] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-31 22:48:47.122 python[4954:383214] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Episode : 0, length : 2, return : 1.000
Episode : 1, length : 2, return : 1.000
Episode : 2, length : 1, return : -1.000
Episode : 3, length : 2, return : 1.000
Episode : 4, length : 2, return : 1.000
Episode : 5, length : 2, return : 1.000
Episode : 6, length : 1, return : -1.000
Episode : 7, length : 2, return : -1.000
Episode : 8, length : 1, return : 1.000
Episode : 9, length : 2, return : 1.000
avg : 0.400, std : 0.917


## Taxi

In [4]:
# Build the Taxi training environment and learn the q-values with a slowly decaying epsilon.
env = gym.make('Taxi-v3')
agent = TaxiQLearning(env, gamma=0.99)
agent.train(n_train=100000, alpha=0.1, decay_rate=1e-4, print_iter=1000)

Iteration : 0
Epsilon   : 1.00000


Iteration : 1000
Epsilon   : 0.91435


Iteration : 2000
Epsilon   : 0.83686


Iteration : 3000
Epsilon   : 0.76674


Iteration : 4000
Epsilon   : 0.70329


Iteration : 5000
Epsilon   : 0.64588


Iteration : 6000
Epsilon   : 0.59393


Iteration : 7000
Epsilon   : 0.54693


Iteration : 8000
Epsilon   : 0.50440


Iteration : 9000
Epsilon   : 0.46591


Iteration : 10000
Epsilon   : 0.43109


Iteration : 11000
Epsilon   : 0.39958


Iteration : 12000
Epsilon   : 0.37107


Iteration : 13000
Epsilon   : 0.34528


Iteration : 14000
Epsilon   : 0.32194


Iteration : 15000
Epsilon   : 0.30082


Iteration : 16000
Epsilon   : 0.28171


Iteration : 17000
Epsilon   : 0.26442


Iteration : 18000
Epsilon   : 0.24877


Iteration : 19000
Epsilon   : 0.23461


Iteration : 20000
Epsilon   : 0.22180


Iteration : 21000
Epsilon   : 0.21021


Iteration : 22000
Epsilon   : 0.19972


Iteration : 23000
Epsilon   : 0.19023


Iteration : 24000
Epsilon   : 0.18165


Iteration : 2

In [5]:
# Evaluate the greedy policy on the training environment; the returned
# (mean, std) of the episode returns is displayed as the cell output.
agent.test(env, n_episodes=100, verbose=True)

Episode : 0, length : 8, return : 13.000
Episode : 1, length : 13, return : 8.000
Episode : 2, length : 11, return : 10.000
Episode : 3, length : 12, return : 9.000
Episode : 4, length : 16, return : 5.000
Episode : 5, length : 15, return : 6.000
Episode : 6, length : 8, return : 13.000
Episode : 7, length : 7, return : 14.000
Episode : 8, length : 15, return : 6.000
Episode : 9, length : 12, return : 9.000
Episode : 10, length : 6, return : 15.000
Episode : 11, length : 13, return : 8.000
Episode : 12, length : 12, return : 9.000
Episode : 13, length : 14, return : 7.000
Episode : 14, length : 16, return : 5.000
Episode : 15, length : 9, return : 12.000
Episode : 16, length : 11, return : 10.000
Episode : 17, length : 11, return : 10.000
Episode : 18, length : 10, return : 11.000
Episode : 19, length : 10, return : 11.000
Episode : 20, length : 9, return : 12.000
Episode : 21, length : 15, return : 6.000
Episode : 22, length : 15, return : 6.000
Episode : 23, length : 12, return : 9.0

(8.29, 2.601134367924887)

In [6]:
# Replay a few greedy episodes in a window (render_mode='human').
env = gym.make('Taxi-v3', render_mode='human')
try:
    agent.test(env, n_episodes=10, verbose=True)
finally:
    # Release the rendering window even if the run fails or is interrupted.
    env.close()

Episode : 0, length : 17, return : 4.000
Episode : 1, length : 18, return : 3.000
Episode : 2, length : 14, return : 7.000
Episode : 3, length : 16, return : 5.000
Episode : 4, length : 13, return : 8.000
Episode : 5, length : 13, return : 8.000
Episode : 6, length : 15, return : 6.000
Episode : 7, length : 16, return : 5.000
Episode : 8, length : 14, return : 7.000
Episode : 9, length : 8, return : 13.000
avg : 6.600, std : 2.653
