In [1]:
#pip install pyglet
#pip install PyOpenGL
#pip install ffmpeg
#pip install xvfbwrapper
#pip install PyVirtualDisplay
#pip install pygame
#pip install gym
#pip install imageio imageio-ffmpeg
#conda install -n base -c conda-forge widgetsnbextension

In [2]:
from pyvirtualdisplay import Display
import numpy as np
import gym
import random
import imageio
from tqdm.notebook import trange

In [3]:
# virtual_display = Display(visible=0, size=(1400, 900))
# virtual_display.start()

In [4]:
# Create the FrozenLake-v1 environment (4x4 map, is_slippery=False) with
# render_mode="rgb_array"; env.render() is called later to collect frames.
# NOTE(review): the first label below misspells "Space"; it is runtime output
# text, so it is left unchanged here.
env = gym.make("FrozenLake-v1",render_mode="rgb_array",map_name="4x4", is_slippery=False)
print("Observation Spacce", env.observation_space)
# Display one randomly sampled observation from the observation space
print("Sample Observation", env.observation_space.sample())

Observation Spacce Discrete(16)
Sample Observation 8


In [5]:
# Sizes of the discrete observation and action spaces; they are used below as
# the row and column counts of the Q-table.
state_space = env.observation_space.n
action_space = env.action_space.n
def initialize_q_table(state_space, action_space):
    """Create a Q-table in which every state-action value starts at zero.

    Args:
        state_space: Number of states (rows of the table).
        action_space: Number of actions (columns of the table).

    Returns:
        A NumPy array of shape (state_space, action_space) filled with zeros.
    """
    table_shape = (state_space, action_space)
    return np.zeros(table_shape)

def epsilon_greedy_policy(Qtable, state, epsilon):
    """Choose an action for ``state`` using the epsilon-greedy rule.

    With probability 1 - epsilon we exploit, and with probability epsilon we
    explore:

    1. Generate a random number between 0 and 1.
    2. If the number is greater than epsilon, exploit: take the action with
       the highest Q-value for the given state.
    3. Otherwise explore: take a random action.

    Args:
        Qtable: Table of Q-values indexed as ``Qtable[state][action]``.
        state: Index of the current state.
        epsilon: Exploration probability.

    Returns:
        The index of the selected action.
    """
    if random.uniform(0, 1) > epsilon:
        return np.argmax(Qtable[state])
    # Explore over this state's own action columns instead of a module-level
    # `env`, so the policy depends only on its arguments and always matches
    # the table that train() was given.
    return random.randrange(len(Qtable[state]))
def greedy_policy(Qtable, state):
    """Return the action with the highest Q-value for ``state``.

    Q-learning is an off-policy algorithm: the policy used to take actions
    differs from the policy used in the update. In this notebook the
    epsilon-greedy policy is the acting policy and the greedy policy is the
    updating policy. The greedy policy is also the final policy once the agent
    is trained; it selects the highest state-action value from the Q-table.

    Args:
        Qtable: Table of Q-values indexed as ``Qtable[state][action]``.
        state: Index of the state to act in.

    Returns:
        The index of the highest-valued action for ``state``.
    """
    state_values = Qtable[state]
    return np.argmax(state_values)

# Start from an all-zero Q-table with one row per state and one column per action
Qtable_frozenlake = initialize_q_table(state_space, action_space)

In [6]:
#Training parameters
n_training_episodes = 10000  # number of episodes passed to train()
# The original cell spelled this name "n_traintaing_episodes" while the train()
# call reads "n_training_episodes", which raises NameError on a fresh kernel.
# The misspelled name is kept as an alias for any code that still refers to it.
n_traintaing_episodes = n_training_episodes
learning_rate = 0.7  # step size of the Q-value update in train()
#Evaluation parameters
n_eval_episodes = 100
#Environment parameters
env_id = "FrozenLake-v1"
max_steps = 99  # upper bound on steps per episode
gamma = 0.95  # discount applied to the next state's best Q-value in train()
eval_seed = []  # per-episode seeds for evaluation; empty means unseeded resets
#Exploration parameters
max_epsilon = 1.0  # epsilon at episode 0
min_epsilon = 0.05  # value epsilon decays towards
decay_rate = 0.0005  # exponential decay rate of epsilon per episode

In [7]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """Train ``Qtable`` in place with tabular Q-learning and return it.

    Actions are chosen with ``epsilon_greedy_policy``; the update bootstraps
    from the best Q-value of the next state. The step size and discount are
    read from the module-level ``learning_rate`` and ``gamma``.

    Args:
        n_training_episodes: Number of episodes to run.
        min_epsilon: Value that epsilon decays towards.
        max_epsilon: Epsilon used for the first episode.
        decay_rate: Exponential decay rate of epsilon per episode.
        env: Environment whose ``reset()`` returns a sequence with the
            observation first and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        max_steps: Maximum number of steps per episode.
        Qtable: Q-value table indexed as ``Qtable[state][action]``; it is
            modified in place.

    Returns:
        The same ``Qtable`` object, after training.
    """
    for episode in trange(n_training_episodes):
        # Decay exploration exponentially from max_epsilon towards min_epsilon
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        # Reset the environment; only the observation (first item) is needed
        state = env.reset()[0]

        for _ in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            # The 4th value returned by step() is `truncated`, not `info`;
            # unpack it by name so that a truncated episode also ends here.
            new_state, reward, terminated, truncated, info = env.step(action)
            td_target = reward + gamma * np.max(Qtable[new_state])
            Qtable[state][action] += learning_rate * (td_target - Qtable[state][action])
            # Finish the episode when it terminated or was truncated
            if terminated or truncated:
                break

            # Otherwise continue from the new state
            state = new_state
    return Qtable
    
# Run training; train() updates the table it is given in place and returns that same table
Qtable_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_frozenlake)

  0%|          | 0/10000 [00:00<?, ?it/s]

  if not isinstance(terminated, (bool, np.bool8)):


In [8]:
# Display the trained Q-table (one row per state, one column per action)
Qtable_frozenlake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [9]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """Run the greedy policy from ``Q`` for several episodes and summarise the rewards.

    Args:
        env: Environment whose ``reset()`` returns a sequence with the
            observation first and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        max_steps: Maximum number of steps per episode.
        n_eval_episodes: Number of evaluation episodes.
        Q: Q-value table indexed as ``Q[state][action]``.
        seed: Sequence of per-episode seeds, indexed by episode number. When
            it is empty (falsy) the environment is reset without a seed.

    Returns:
        A ``(mean_reward, std_reward)`` tuple computed over the per-episode
        total rewards.
    """
    episode_rewards = []
    for episode in range(n_eval_episodes):
        if seed:
            state = env.reset(seed=seed[episode])[0]
        else:
            state = env.reset()[0]
        total_rewards_ep = 0

        for _ in range(max_steps):
            # Take the action (index) that has the maximum Q-value in this state
            action = np.argmax(Q[state])
            # The 4th value returned by step() is `truncated`, not `info`;
            # unpack it by name so that a truncated episode also ends here.
            new_state, reward, terminated, truncated, info = env.step(action)
            total_rewards_ep += reward

            if terminated or truncated:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)

    return np.mean(episode_rewards), np.std(episode_rewards)

# Evaluate our agent: run n_eval_episodes episodes acting greedily on the trained
# Q-table; eval_seed supplies optional per-episode seeds (an empty list means unseeded).
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_frozenlake, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Mean_reward=1.00 +/- 0.00


In [10]:
def record_video(env, Qtable, out_directory, fps=1):
    """Roll out one greedy episode and save its rendered frames as an animation.

    Args:
        env: Environment whose ``render()`` returns one image frame per call
            and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        Qtable: Q-value table indexed as ``Qtable[state][action]``.
        out_directory: Path of the output file (despite the name, this is the
            file path handed to ``imageio.mimsave``).
        fps: Frames per second of the saved animation; must be positive.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps!r}")

    state = env.reset(seed=random.randint(0,500))[0]
    images = [env.render()]
    done = False
    while not done:
        # Take the action (index) that has the maximum expected future reward given that state
        action = np.argmax(Qtable[state])
        # The next state is written straight into `state` for the recording loop.
        # The 4th value returned by step() is `truncated`; without it a policy
        # that never reaches a terminal state would keep this loop running forever.
        state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        images.append(env.render())

    # Honour the `fps` argument instead of a hard-coded per-frame duration of 400.
    # NOTE(review): duration is given in milliseconds per frame, matching the
    # original hard-coded value; older imageio releases expect seconds, so
    # confirm against the installed version.
    imageio.mimsave(out_directory, [np.array(img) for img in images], duration=1000 / fps)
    
    
# Output file path and fps argument handed to record_video
video_path="~/content/replay.gif"
video_fps=1
# Record one episode of the trained agent acting greedily on its Q-table
record_video(env, Qtable_frozenlake, video_path, video_fps)

# from IPython.display import Image
# Image('~/content/replay.gif')