In [28]:
import gym
import numpy as np
import random
import time
from IPython.display import clear_output

In [29]:
#!conda install -c anaconda ipython

# FrozenLake-v0

### The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile.

Winter is here. You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. At this time, there's an international frisbee shortage, so it's absolutely imperative that you navigate across the lake and retrieve the disc. However, the ice is slippery, so you won't always move in the direction you intend.

### The surface is described using a grid like the following:

- SFFF       (S: starting point, safe)
- FHFH       (F: frozen surface, safe)
- FFFH       (H: hole, fall to your doom)
- HFFG       (G: goal, where the frisbee is located)

**The episode ends when you reach the goal or fall in a hole. You receive a reward of 1 if you reach the goal, and zero otherwise**

# Creating the Environment variable for the game

In [30]:
# Build the 'FrozenLake-v0' environment through gym.make; the cells below read
# its spaces, reset it, step it and render it through this `env` object.
env = gym.make('FrozenLake-v0')

# Creating and Initialising the Q-Table

In [31]:
# The table's dimensions are read from the environment's spaces:
# `.n` of the observation space gives the row count, `.n` of the action space
# gives the column count.
state_space = env.observation_space.n
action_space = env.action_space.n

# One row per state, one column per action; every Q-value starts at zero.
q_table = np.zeros(shape=(state_space, action_space))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# Defining the parameters for our Algorithm's Learning and Execution

In [32]:
# --- Training schedule ---
MAX_ITERATIONS_PER_EPISODE = 100  # step cap for a single episode
EPISODES = 10000                  # number of training episodes

# --- Q-learning update ---
DISCOUNT_RATE_GAMMA = 0.99  # weight on the next state's best Q-value
LEARNING_RATE_ALPHA = 0.1   # how far each update moves Q(s, a) toward its new estimate

# --- Epsilon-greedy exploration schedule ---
MIN_EXPLORATION_RATE = 0.01     # floor that epsilon decays toward
MAX_EXPLORATION_RATE = 1        # ceiling, which is also epsilon's starting value
EXPLORATION_DECAY_RATE = 0.001  # exponential decay constant, applied per episode
exploration_rate = MAX_EXPLORATION_RATE  # current epsilon; recomputed after every episode

# Implementing the Q-Learning Algorithm

In [33]:
# Total reward collected in each training episode, in order, so the learning
# progress can be inspected afterwards.
rewards_from_episodes = []

# Tabular Q-learning with an epsilon-greedy behaviour policy.
for episode in range(EPISODES):

    # Every episode starts from the state handed back by reset().
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(MAX_ITERATIONS_PER_EPISODE):

        # Epsilon-greedy choice: a uniform draw at or below the current
        # exploration rate means "explore" (random action); otherwise exploit
        # the best known action for this state.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold <= exploration_rate:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        # Apply the action to the environment.
        new_state, reward, done, info = env.step(action)

        # Q-learning update:
        #   Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a'))
        best_next_value = np.max(q_table[new_state])
        td_target = reward + DISCOUNT_RATE_GAMMA*best_next_value
        retained_value = q_table[state, action]*(1-LEARNING_RATE_ALPHA)
        q_table[state, action] = retained_value + LEARNING_RATE_ALPHA*td_target

        rewards_current_episode += reward
        state = new_state

        # Stop stepping as soon as the environment reports the episode is over.
        if done:
            break

    rewards_from_episodes.append(rewards_current_episode)

    # Decay epsilon exponentially in the episode index, from the maximum rate
    # towards the minimum rate.
    decay_factor = np.exp(-EXPLORATION_DECAY_RATE*episode)
    exploration_rate = MIN_EXPLORATION_RATE + (MAX_EXPLORATION_RATE - MIN_EXPLORATION_RATE)*decay_factor
        

In [34]:
# Summarise training: average reward for each consecutive block of 1000
# episodes, followed by the learned Q-table.
#
# The blocks are built by slicing rather than np.split with a float section
# count, so a reward history whose length is not an exact multiple of 1000 no
# longer raises; the final block is simply shorter. Each average is a single
# mean over the block rather than a builtin sum of pre-divided elements, which
# avoids accumulated rounding noise such as 0.03800000000000003.
episode_rewards = np.asarray(rewards_from_episodes, dtype=float)
rewards_per_1000_episodes = [
    episode_rewards[start:start + 1000]
    for start in range(0, len(episode_rewards), 1000)
]
count = 0  # number of episodes covered up to and including the current block

print("**** Average Rewards per Thousand Episodes **** \n")
for r in rewards_per_1000_episodes:
    count += len(r)
    print(count , ' : ' , str(r.mean()))
print('\n')

# Printing Updated Q-Value

print('********* Q-TABLE ********** \n')
print(q_table)

    
    
    

**** Average Rewards per Thousand Episodes **** 

1000  :  0.03800000000000003
2000  :  0.21500000000000016
3000  :  0.4200000000000003
4000  :  0.5510000000000004
5000  :  0.6140000000000004
6000  :  0.6810000000000005
7000  :  0.6630000000000005
8000  :  0.6940000000000005
9000  :  0.6870000000000005
10000  :  0.6760000000000005


********* Q-TABLE ********** 

[[0.51907373 0.50036143 0.51077019 0.50802164]
 [0.3547606  0.37401188 0.28186437 0.47418438]
 [0.40725072 0.42233127 0.40519175 0.45799795]
 [0.32003878 0.26844336 0.37940503 0.44630301]
 [0.55217645 0.28647335 0.36115532 0.45101651]
 [0.         0.         0.         0.        ]
 [0.30437663 0.15585007 0.210934   0.10671138]
 [0.         0.         0.         0.        ]
 [0.39321476 0.42068917 0.43279473 0.58359608]
 [0.45777763 0.62387816 0.41361401 0.29190728]
 [0.62844379 0.3945588  0.28023411 0.41651082]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.49097244 0.46564821

# Visualising our Agent playing Frozen Lake:


In [35]:
import matplotlib.pyplot as plt

# Watch the trained agent play three episodes, always taking the greedy action
# (the highest Q-value for the current state) and re-rendering the grid each step.
for episode in range(3):

    state = env.reset()
    done = False

    print("********** EPISODE : ",episode+1 , "************\n\n" )
    time.sleep(2)

    for step in range(MAX_ITERATIONS_PER_EPISODE):

        # Redraw the board in place so the output cell shows one frame at a time.
        clear_output(wait = True)
        env.render()
        time.sleep(1)

        action = np.argmax(q_table[state,:])
        new_state , reward , done , info = env.step(action)

        if done:
            clear_output(wait = True)
            env.render()

            if reward == 1:
                print("**** You've reached the Goal !! ****")
            elif info.get('TimeLimit.truncated', False):
                # The episode was cut off by the environment's step limit, not
                # by a hole; previously this case was reported as a fall.
                # Older gym releases may not set this key, in which case the
                # original message below is kept.
                print("**** Ran out of steps before reaching the Goal !! ****")
            else:
                print("**** You fell through a Hole !! ****")
            time.sleep(2)

            clear_output(wait = True)
            break

        state = new_state


env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
**** You've reached the Goal !! ****
