In [None]:
!pip install numpy #to install Python libraries using pip
!pip install gym #Library for RL



In [None]:
import numpy as np #data handling
import gym   #simulated environments
import random #randomness in the learning process


In [None]:
# Build the FrozenLake-v1 environment; this `env` object is reused by the
# training and playback cells further down.
env = gym.make("FrozenLake-v1")

In [None]:
# Table dimensions come straight from the environment's discrete spaces:
# 16 grid positions (states) and 4 moves (left, right, up, down) in FrozenLake.
state_size, action_size = env.observation_space.n, env.action_space.n

In [None]:
# Q-table: one row per state and one column per action, every entry starting
# at zero (16x4 for the 16-state, 4-action FrozenLake used here).
qtable = np.zeros(shape=(state_size, action_size), dtype=float)
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## Step 3: Create the hyperparameters ⚙️
- Here, we'll specify the hyperparameters

In [None]:
# Training schedule
total_episodes = 50000        # Number of episodes the Q-learning loop runs
max_steps = 99                # Max steps per episode

# Update rule
learning_rate = 0.01          # Step size of each Q-value update (small = conservative)
gamma = 0.75                  # Discount rate: weight of future vs. immediate rewards

# Exploration parameters (epsilon-greedy with exponential decay per episode)
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability
decay_rate = 0.005            # Exponential decay rate of epsilon per episode
# Current exploration rate. It starts at max_epsilon so that the first episode
# follows the same schedule the decay formula applies from then on.
epsilon = max_epsilon

## The Q learning algorithm
- Now we implement the Q-learning algorithm:
  ![alt text](http://simoninithomas.com/drlc/Qlearning//qtable_algo.png)


In [None]:
# Total reward collected in each episode (used for the average score below)
rewards = []

# Run the Q-learning loop for a fixed number of episodes
for episode in range(total_episodes):
    # Reset the environment. Older gym releases return only the observation,
    # newer ones return an (observation, info) tuple; accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False  # a flag to indicate whether the episode has ended
    total_rewards = 0

    for step in range(max_steps):
        # Epsilon-greedy action selection: draw a number in [0, 1]; above
        # epsilon we exploit (best known action for this state), otherwise we
        # explore with a random action. Epsilon decays after every episode,
        # so the agent explores a lot early on and exploits more later.
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state (s') and reward (r).
        # Newer gym releases return (obs, reward, terminated, truncated, info);
        # older ones return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result

        # Q-learning update (Bellman equation):
        # Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state, :] holds the values of every action available in s'.
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        total_rewards += reward

        # The state we landed in becomes the current state
        state = new_state

        # Stop the episode as soon as the environment reports it is over
        if done:
            break

    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)


print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.09856
[[4.57797095e-03 4.71229766e-03 7.48922452e-03 4.26622151e-03]
 [2.45649282e-03 4.95119282e-03 5.16387234e-03 7.60114965e-03]
 [1.66712484e-02 8.77224279e-03 1.21198608e-02 5.18661952e-03]
 [4.16467568e-09 4.43253726e-03 3.26996627e-05 6.55113317e-10]
 [1.60842746e-02 6.03465195e-03 5.23970669e-03 3.04876369e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.34246920e-02 1.38050682e-02 1.61952162e-02 1.04476941e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.85797172e-03 1.37071439e-02 1.33247590e-02 4.17280915e-02]
 [4.00337652e-02 9.64595267e-02 3.34091534e-02 2.19175448e-02]
 [1.34481382e-01 4.65661464e-02 3.90764012e-02 5.70081076e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.41237264e-02 5.74208686e-02 1.82312129e-01 5.57876103e-02]
 [6.38003997e-02 1.60024578e-01 4.67541514e-01 1.30569167e-01]
 [0.00000000e+00 0.00000000e+0

## Use our Q-table to play FrozenLake
- After 50 000 training episodes, our Q-table can be used as a "cheatsheet" to play FrozenLake.
- By running this cell you can see our agent playing FrozenLake.

In [None]:
!pip install pygame



In [None]:
import pygame

In [None]:
# Play a few episodes greedily with the learned Q-table (no exploration and no
# further learning). The goal tile is the last state of the grid on the
# built-in FrozenLake maps (state 15 on the 16-state map).
goal_state = env.observation_space.n - 1

for episode in range(5):
    # Older gym releases return only the observation from reset(), newer ones
    # return an (observation, info) tuple; accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):

        # Take the action (index) that has the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])

        # Newer gym releases return (obs, reward, terminated, truncated, info);
        # older ones return (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result

        if done:
            # Only report the final state (to see if our agent is on the goal or fell into a hole)
            if new_state == goal_state:
                print("We reached our Goal 🏆")
            else:
                print("We fell into a hole ☠️")

            # `step` is a 0-based index, so the number of steps taken is step + 1.
            print("Number of steps", step + 1)

            break
        state = new_state
    else:
        # The step budget ran out before the environment ended the episode.
        print("Episode not finished after", max_steps, "steps")
env.close()

****************************************************
EPISODE  0
We reached our Goal 🏆
Number of steps 9
****************************************************
EPISODE  1
We reached our Goal 🏆
Number of steps 30
****************************************************
EPISODE  2
We fell into a hole ☠️
Number of steps 10
****************************************************
EPISODE  3
We fell into a hole ☠️
Number of steps 71
****************************************************
EPISODE  4
We reached our Goal 🏆
Number of steps 16
