In [1]:
import numpy as np
import gym
import random
import time
import IPython.display as display



 ### Creating the environment
 We will using the environment *FrozenLake-v0*

In [2]:
env = gym.make("FrozenLake-v0")


 ### Creating the Q-table
 We are going to construct our Q-table, and initialize all the Q-values.
 number of rows in the table = size of state space in the environemnt

 number of columns is equivalent size of action space

In [3]:
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

q_table = np.zeros((state_space_size, action_space_size))
print(q_table)


[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


 ### Initializing Q-learning parameters
 Now we initalize all the paramenters.
 `num_episodes` defines the total number of episodes.
 `learning_rate` is represented with the symbol $\alpha$.
 `discount_rate` is represented with the symbol $\gama$.
 `exploration_rate` is represented with the symbol $\epsilon$.

In [4]:
num_episodes = 10000
# we define a maximum number of steps that our agent is allowed to take within a single episode
max_steps_per_episode = 100

# Alpha the weighted for sum of old value and learned value in Q
learning_rate = 0.01
discount_rate = 0.99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001


 ###  Implement Reinforcement learning

In [5]:
rewards_all_episodes = []

# Q-learning
for episode in range(num_episodes):
    state = env.reset()
    print("episodes:", episode)

    done = False
    rewards_current_episode = 0  # The start rewards
    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0, 1)
        # this will determine whether our agent will explore or expoit
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # UPdate Q-table for Q(s,a)
        q_table[state, action] = q_table[state, action] * (1-learning_rate) + learning_rate*(
            reward + discount_rate * np.max(q_table[new_state, :]))

        # Transition to the next state
        state = new_state
        rewards_current_episode += reward

        if done == True:
            break

    # Exploration rate decay
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * \
        np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)


episodes: 0
episodes: 1
episodes: 2
episodes: 3
episodes: 4
episodes: 5
episodes: 6
episodes: 7
episodes: 8
episodes: 9
episodes: 10
episodes: 11
episodes: 12
episodes: 13
episodes: 14
episodes: 15
episodes: 16
episodes: 17
episodes: 18
episodes: 19
episodes: 20
episodes: 21
episodes: 22
episodes: 23
episodes: 24
episodes: 25
episodes: 26
episodes: 27
episodes: 28
episodes: 29
episodes: 30
episodes: 31
episodes: 32
episodes: 33
episodes: 34
episodes: 35
episodes: 36
episodes: 37
episodes: 38
episodes: 39
episodes: 40
episodes: 41
episodes: 42
episodes: 43
episodes: 44
episodes: 45
episodes: 46
episodes: 47
episodes: 48
episodes: 49
episodes: 50
episodes: 51
episodes: 52
episodes: 53
episodes: 54
episodes: 55
episodes: 56
episodes: 57
episodes: 58
episodes: 59
episodes: 60
episodes: 61
episodes: 62
episodes: 63
episodes: 64
episodes: 65
episodes: 66
episodes: 67
episodes: 68
episodes: 69
episodes: 70
episodes: 71
episodes: 72
episodes: 73
episodes: 74
episodes: 75
episodes: 76
episodes:

In [6]:
# Calculate and print the a erage reward per thousand episodes
rewards_per_thousand_episodes = np.split(
    np.array(rewards_all_episodes), num_episodes/1000)
count = 1000
print("*********Average reward per thousand episodes *******\n")
for r in rewards_per_thousand_episodes:
    print(count, ":", str(sum(r/1000)))
    count += 1000

print("\n\n**********************Q-table*******************\n")
print(q_table)


*********Average reward per thousand episodes *******

1000 : 0.015000000000000006
2000 : 0.03300000000000002
3000 : 0.03300000000000002
4000 : 0.047000000000000035
5000 : 0.046000000000000034
6000 : 0.04100000000000003
7000 : 0.04500000000000003
8000 : 0.03800000000000003
9000 : 0.04400000000000003
10000 : 0.037000000000000026


**********************Q-table*******************

[[9.69997157e-03 9.80103750e-03 3.46884801e-02 6.47796976e-03]
 [4.41795381e-03 3.61316923e-02 7.29893935e-03 7.08698117e-03]
 [6.93522093e-02 3.32274109e-03 4.64684889e-03 2.70450609e-03]
 [3.04154854e-04 1.01549867e-05 6.60465731e-07 4.86165446e-03]
 [9.36544722e-03 3.48128007e-02 6.96152138e-03 2.24649924e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.71715758e-03 5.59764709e-03 1.05972709e-01 2.54881163e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.97472486e-03 4.87612843e-03 7.22432894e-02 4.99919745e-03]
 [5.89285615e-03 8.03795894e-03 1.86461909e-01 8.12

 ### Watch the agent play the game

In [7]:
for episode in range(3):
    # initialize new episode params
    state = env.reset()
    done = False
    print("*******EPISODE ", episode + 1, "**********\n\n\n\n\n")
    time.sleep(1)
    for step in range(max_steps_per_episode):
        display.clear_output(wait=True)
        env.render()  # render the current state of the environment to display
        time.sleep(0.3)
        # Show the current state of environment on screen
        # Choose action with highest Q-value for current state
        # Take new action
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        state = new_state

        if done:
            display.clear_output(wait=True)
            env.render()
            if reward == 1:
                # Agent reached the goak and won episode
                print("**** You reached the goal****")
                time.sleep(3)
                display.clear_output(wait=True)
            else:
                # Agent stepped in a hole and lost episode
                print("***** You fell into a hole****")
                time.sleep(3)
                display.clear_output(wait=True)
            break
env.close


  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
***** You fell into a hole****


<bound method Wrapper.close of <TimeLimit<FrozenLakeEnv<FrozenLake-v0>>>>