In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from gym.envs.registration import register

try:
    register(
        id='Amitabh-FrozenLakeNotSlippery-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps= 100,
        reward_threshold= 0.78
    )
except:
    print("Environment already created!")

In [3]:
env = gym.make("Amitabh-FrozenLakeNotSlippery-v0", render_mode="rgb_array")

In [4]:
env.action_space

Discrete(4)

In [5]:
env.action_space.n

4

In [6]:
env.observation_space

Discrete(16)

In [7]:
state = env.reset()

In [8]:
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [9]:
q_table.shape

(16, 4)

In [10]:
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [11]:
state

(0, {'prob': 1})

In [12]:
# Below function is a very raw, novice way to randomly select any action and perform on the environment. It has nothing to do with 
# machine learning or any such thing. 
for episode in range(5):
    done = False
    state = env.reset()
    
    while not done:
        env.render()
        action = env.action_space.sample()
        state, reward, done, truncate, info = env.step(action)
        
env.close()

In [13]:
# Now lets implement it via code to see how machine learns to play this game and how can we teach the machine to do it
# Starting with the Q learning

In [14]:
# Things we need
# 1. EPSILON GREEDY METHOD
# 2. Function to compute optimal q value
# 3. Few necessary variables : EPSILON, discount factor GAMMA, lerning rate ALPHA and other for loop control and Epsilon decay

In [15]:
# PARAMETERS:
NUM_EPISODES = 20000
ALPHA = 0.01
GAMMA = 0.99
EPSILON = 1.0
MIN_EPSILON = 0.0
MAX_EPSILON = 1.0
EPSILON_DECAY = 0.001

In [16]:
def epsilon_greedy_action(q_table, state):
    random_val = np.random.random()
    if random_val > EPSILON:
        action = np.argmax(q_table[state, :]) # argmax gets me the action
    
    else:
        action = env.action_space.sample()
    return action

In [17]:
def compute_next_q_val(old_q_val, reward, next_optimal_q_val):
    return (old_q_val + ALPHA * (reward + (GAMMA * next_optimal_q_val) - old_q_val))

In [18]:
def reduce_epsilon(epsilon, episode):
    return (MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON)*np.exp(-EPSILON_DECAY*episode))

In [20]:
log_interval = 1000
rewards = []
for episode in range(NUM_EPISODES):
    done = False
    state = env.reset()
    state = state[0] if isinstance(state, tuple) else state
    total_rewards = 0
   
    
    while not done:
        env.render()
        
        # Choose an action and perform
        action = epsilon_greedy_action(q_table, state)
        next_state, reward, done, truncate, info = env.step(action)
        
        # Get old q val
        old_q_val = q_table[state, action]
        
        # Get next optimal q val
        next_optimal_q_val = np.max(q_table[next_state, :]) # whole row as we dont know what action is to be take.
        
        # Compute next q val
        new_q_val = compute_next_q_val(old_q_val, reward, next_optimal_q_val)
        
        # Update the q table
        q_table[state, action] = new_q_val
        
        
        # Update current state
        state = next_state
        
        # accumulate total reward
        total_rewards += reward

    # Decay EPSILON
    episode += episode
    EPSILON = reduce_epsilon(EPSILON, episode)
    rewards.append(total_rewards)
    
    if episode % log_interval == 0:
        print("EPISODE : ", episode, "  Reward : ", np.sum(rewards))
env.close()

EPISODE :  0   Reward :  0.0
EPISODE :  1000   Reward :  122.0
EPISODE :  2000   Reward :  481.0
EPISODE :  3000   Reward :  923.0
EPISODE :  4000   Reward :  1406.0
EPISODE :  5000   Reward :  1901.0
EPISODE :  6000   Reward :  2398.0
EPISODE :  7000   Reward :  2898.0
EPISODE :  8000   Reward :  3398.0
EPISODE :  9000   Reward :  3898.0
EPISODE :  10000   Reward :  4398.0
EPISODE :  11000   Reward :  4898.0
EPISODE :  12000   Reward :  5398.0
EPISODE :  13000   Reward :  5898.0
EPISODE :  14000   Reward :  6398.0
EPISODE :  15000   Reward :  6898.0
EPISODE :  16000   Reward :  7398.0
EPISODE :  17000   Reward :  7898.0
EPISODE :  18000   Reward :  8398.0
EPISODE :  19000   Reward :  8898.0
EPISODE :  20000   Reward :  9398.0
EPISODE :  21000   Reward :  9898.0
EPISODE :  22000   Reward :  10398.0
EPISODE :  23000   Reward :  10898.0
EPISODE :  24000   Reward :  11398.0
EPISODE :  25000   Reward :  11898.0
EPISODE :  26000   Reward :  12398.0
EPISODE :  27000   Reward :  12898.0
EPISO

In [21]:
q_table

array([[2.18794713e-01, 1.99853974e-03, 9.50990050e-01, 1.53902424e-01],
       [1.65803992e-01, 0.00000000e+00, 9.60596010e-01, 1.60042995e-01],
       [1.96575186e-01, 9.70299000e-01, 4.28346065e-02, 2.34994976e-01],
       [2.49112409e-01, 0.00000000e+00, 1.57019061e-03, 9.48804024e-04],
       [6.42103985e-05, 1.65634695e-02, 0.00000000e+00, 9.13252951e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 9.80100000e-01, 0.00000000e+00, 2.10071280e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.32544791e-03, 0.00000000e+00, 1.08802639e-01, 2.73715046e-05],
       [2.78151587e-03, 9.36215375e-03, 4.80344161e-01, 0.00000000e+00],
       [8.77445892e-02, 9.90000000e-01, 0.00000000e+00, 3.27681439e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.57092082e-03, 3.89843800e

In [22]:
env_human = gym.make("Amitabh-FrozenLakeNotSlippery-v0", render_mode="human")

In [23]:
import time
i = 1
for episode in range(200):
    done = False
    state = env_human.reset()
    state = state[0] if isinstance(state, tuple) else state
    while not done:
        env_human.render()
        action = np.argmax(q_table[state, :])
        state, reward, done, truncate, info = env_human.step(action)
        if i == 1:
            time.sleep(10)
            i += 1
        
        time.sleep(0.5)
        if done:
            print("Woohoo, you won!")
            break
env.close()

Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you won!
Woohoo, you wo

In [None]:
env.close()