In [52]:
import numpy as np
import gym # pip intall gym
import random

In [53]:
env = gym.make('FrozenLake-v1', is_slippery = False) # Instantiating gym object

In [54]:
action_space_size = env.action_space.n # Number of possible actions in each of these states in the frozen lake game.
state_space_size = env.observation_space.n # Number of states for the frozen lake game

qtable = np.zeros((state_space_size, action_space_size)) # Creating q tabel from state spaces and action space states.
print("Q Table\n",qtable) 

Q Table
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [55]:
total_episodes = 10000 # Number of times we want our agent to play the game to fill the q-table.
learning_rate = 0.2 # Learning Rate 
max_steps = 100
gamma = 0.99

epsilon = 1
max_epsilon = 1
min_epsilon = 0.01
decay_rate = 0.001

In [56]:
rewards = []

for episode in range(total_episodes):
    state, info = env.reset()
    step = 0
    done = False
    total_rewards = 0
    for step in range(max_steps):
        
        if random.uniform(0,1) > epsilon:
            action = np.argmax(qtable[state,:]) # Exploit
        else:
            action = env.action_space.sample() # Explore

        new_state, reward, done, truncated, info = env.step(action)
        
        max_new_state = np.max(qtable[new_state,:])
        
        qtable[state,action] = qtable[state, action] + learning_rate*(reward+gamma*max_new_state-qtable[state,action])
        total_rewards += reward
        
        state = new_state
        if done:
            break
            
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) # Decreasing the probability of exploring as episodes increase.
    rewards.append(total_rewards) 
    
print("Score:", str(sum(rewards)/total_episodes*100) + "%")
print(qtable)

Score: 87.92%
[[0.94148015 0.95099005 0.93206535 0.94148015]
 [0.94148015 0.         0.92190736 0.93201897]
 [0.93197552 0.29134814 0.6311653  0.8573256 ]
 [0.79744731 0.         0.04112673 0.34098788]
 [0.95099005 0.96059601 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.97102458 0.         0.44659748]
 [0.         0.         0.         0.        ]
 [0.96059601 0.         0.970299   0.95099005]
 [0.96059601 0.9801     0.9801     0.        ]
 [0.96960674 0.99       0.         0.92813135]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.99       0.970299  ]
 [0.9801     0.99       1.         0.9801    ]
 [0.         0.         0.         0.        ]]


In [57]:
env.reset()

for episode in range(1):
    state, info = env.reset()
    step = 0
    done = False
    
    print("Episode:", episode+1)
    
    for step in range(max_steps):
        action = np.argmax(qtable[state,:])
        new_state, reward, done, truncated, info = env.step(action)
        
        if done:
            
            print("Number of Steps:", step)
            break
        state = new_state
        
env.close()

Episode: 1
Number of Steps: 5


# SARSA

SARSA Hyper-Parameters

In [61]:
total_episodes = 100000 # INCREASE NUMBER OF EPISODES FOR SARSA
learning_rate = 0.2 # Learning Rate 
max_steps = 100
gamma = 0.99

epsilon = 1
max_epsilon = 1
min_epsilon = 0.01
decay_rate = 0.001

In [62]:
rewards = []
qtable = np.zeros((state_space_size, action_space_size)) # Reinstantiating the q-table
for episode in range(total_episodes):
    state, info = env.reset()
    step = 0
    done = False
    total_rewards = 0
    for step in range(max_steps):
        
        if random.uniform(0,1) > epsilon:
            action = np.argmax(qtable[state,:]) # Exploit
        else:
            action = env.action_space.sample() # Explore

        new_state, reward, done, truncated, info = env.step(action)

        if random.uniform(0,1) > epsilon:
            new_action = np.argmax(qtable[new_state,:]) # Exploit
        else:
            new_action = env.action_space.sample() # Explore
        
        sarsa_new_state = np.max(qtable[new_state, new_action])
        
        qtable[state,action] = qtable[state, action] + learning_rate*(reward+gamma*sarsa_new_state-qtable[state,action])
        total_rewards += reward
        
        state = new_state
        if done:
            break
            
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) # Decreasing the probability of exploring as episodes increase.
    rewards.append(total_rewards) 
    
print("Score:", str(sum(rewards)/total_episodes*100) + "%")
print(qtable)

Score: 97.03399999999999%
[[0.5886471  0.5720508  0.94832426 0.5889198 ]
 [0.5902236  0.         0.95982367 0.51071018]
 [0.59185449 0.97014088 0.59089818 0.66300633]
 [0.59088956 0.         0.56924608 0.45360084]
 [0.71363063 0.71120136 0.         0.71336351]
 [0.         0.         0.         0.        ]
 [0.         0.98007246 0.         0.77698245]
 [0.         0.         0.         0.        ]
 [0.78623331 0.         0.77520013 0.83373464]
 [0.8424861  0.97957525 0.8546465  0.        ]
 [0.93902983 0.99       0.         0.91809905]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.97995938 0.99       0.97025411]
 [0.98009982 0.99       1.         0.98008605]
 [0.         0.         0.         0.        ]]
