In [1]:
from IPython.display import clear_output
import time

In [2]:
import numpy as np
import gym
import random
import pygame

In [3]:
env = gym.make("FrozenLake-v1",render_mode="ansi",is_slippery=False)
action_size = env.action_space.n
state_size = env.observation_space.n
qtable = np.zeros((state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
total_episodes = 5000        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

In [5]:
# List of rewards
rewards = []

for episode in range(total_episodes):
    # Reset the environment
    env.reset()
    state=0
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        # Choose an action a in the current world state (s)
        ## First we randomize a number
        exp_exp_tradeoff = random.uniform(0, 1)
        
        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info,_ = env.step(action)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        # print(type(state[0]))
        # print(new_state)
        
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        # Our new state is state
        state = new_state
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)
# print(type(total_episode))
rewards_per_thousand_episodes = np.split(np.array(rewards),total_episodes/1000)
cnt=1000
for r in rewards_per_thousand_episodes:
    print(str(cnt)," : " ,sum(r/1000),sep=" ")
    cnt+=1000
print(qtable)

1000  :  0.7500000000000006
2000  :  0.9990000000000008
3000  :  0.9900000000000008
4000  :  0.9920000000000008
5000  :  0.9910000000000008
[[0.73509189 0.77378094 0.6983373  0.73509189]
 [0.73509189 0.         0.6453527  0.69833728]
 [0.69832286 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.90249998 0.         0.65772917]
 [0.         0.         0.         0.        ]
 [0.81450625 0.         0.857375   0.77378094]
 [0.81450625 0.9025     0.9025     0.        ]
 [0.857375   0.95       0.         0.85737484]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.95       0.857375  ]
 [0.9025     0.95       1.         0.9025    ]
 [0.         0.         0.         0.        ]]


In [7]:
env = gym.make("FrozenLake-v1",render_mode="human",is_slippery=False)
for episode in range(3):
    env.reset()
    state=0
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode+1)
    time.sleep(1)
    for step in range(max_steps):
        # clear_output(wait=True)
        env.render()
        time.sleep(.3)
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        new_state, reward, done, info,_ = env.step(action)
        env.render()
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            # clear_output(wait=True)
            if reward == 1:
                print("****You reached the goal!****")
                env.render()
                time.sleep(3)
            else:
                print("****You fell through a hole!****")
                env.render()
                time.sleep(3)
                # clear_output(wait=True)
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  1
****You reached the goal!****
Number of steps 5
****************************************************
EPISODE  2
****You reached the goal!****


KeyboardInterrupt: 