In [5]:
# @title Install Dependencies
!pip install numpy
!pip install gym

Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/c3/44/3a63e8b87f642db49ac81239620e68df8cfae223dcfda4f8508aec88d204/gym-0.10.8.tar.gz (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 6.4MB/s 
Collecting pyglet>=1.2.0 (from gym)
[?25l  Downloading https://files.pythonhosted.org/packages/1c/fc/dad5eaaab68f0c21e2f906a94ddb98175662cc5a654eee404d59554ce0fa/pyglet-1.3.2-py2.py3-none-any.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 8.8MB/s 
Building wheels for collected packages: gym
  Running setup.py bdist_wheel for gym ... [?25l- \ | / done
[?25h  Stored in directory: /root/.cache/pip/wheels/ea/ec/dd/33bcc8801d345f0b640fced8a0864a7c8474828564bc5ccf70
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.10.8 pyglet-1.3.2


In [0]:
# @title Import required libraries
import numpy as np
import gym

In [0]:
# @title TD(0) prediction method
EPSILON = 0.1
GAMMA = 1.0
ALPHA = 0.2
N_EPISODES = 10000

def generate_rand_policy(env):
  policy = []
  for i in range(env.env.nS):
    policy.append(env.action_space.sample())
  return policy

def eps_greedy_policy(state, policy, epsilon=EPSILON):
  prob = np.random.random()
  if prob < (1 - epsilon):
    return policy[state]
  else:
    return env.action_space.sample()

def td_0(env, alpha=ALPHA, epsilon=EPSILON, gamma=GAMMA, num_episodes=N_EPISODES):
  V = np.zeros(env.env.nS)
  wins = 0
  policy = generate_rand_policy(env)
  for i_episode in range(num_episodes):
    state = env.reset()
    timesteps = 0
    while True:
      action = eps_greedy_policy(state, policy)
      next_state, reward, done, _ = env.step(action)
      V[state] = V[state] + alpha*(reward + gamma*V[next_state] - V[state])
      if done:
        if next_state == 15:
          wins += 1
        break
      state = next_state
      timesteps += 1
  return V, wins

def sarsa(env, num_episodes=N_EPISODES, alpha=ALPHA, epsilon=EPSILON, gamma=GAMMA):
  Q = np.zeros((env.env.nS, env.env.nA))
  policy = generate_rand_policy(env)
  wins = 0
  for i_episode in range(num_episodes):
    state = env.reset()
    action = eps_greedy_policy(state, policy)
    while True:
      next_state, reward, done, _ = env.step(action)
      next_action = eps_greedy_policy(next_state, policy)
      Q[state][action] = Q[state][action] + \
        alpha*(reward + gamma*Q[next_state][next_action] - Q[state][action])
      state = next_state
      action = next_action
      policy[state] = np.argmax(Q[state])
      if done:
        if state == 15:
          wins += 1
        break
  return wins
        

In [114]:
# @title Shows TD(0) Agent's success
env = gym.make('FrozenLake-v0')
for _ in range(50):
  _, wins = td_0(env)
  print("Agent wins {} games of {} games".format(wins, N_EPISODES))

Agent wins 69 games of 10000 games
Agent wins 66 games of 10000 games
Agent wins 458 games of 10000 games
Agent wins 121 games of 10000 games
Agent wins 53 games of 10000 games
Agent wins 515 games of 10000 games
Agent wins 26 games of 10000 games
Agent wins 185 games of 10000 games
Agent wins 184 games of 10000 games
Agent wins 37 games of 10000 games
Agent wins 7 games of 10000 games
Agent wins 130 games of 10000 games
Agent wins 77 games of 10000 games
Agent wins 285 games of 10000 games
Agent wins 759 games of 10000 games
Agent wins 34 games of 10000 games
Agent wins 368 games of 10000 games
Agent wins 153 games of 10000 games
Agent wins 94 games of 10000 games
Agent wins 377 games of 10000 games
Agent wins 329 games of 10000 games
Agent wins 64 games of 10000 games
Agent wins 133 games of 10000 games
Agent wins 286 games of 10000 games
Agent wins 69 games of 10000 games
Agent wins 28 games of 10000 games
Agent wins 5 games of 10000 games
Agent wins 139 games of 10000 games
Agent w

In [43]:
# @title Shows SARSA Agent's success
env = gym.make('FrozenLake-v0')
N_SES = 10
sum = 0
success = 0
policy = generate_rand_policy(env)
for _ in range(N_SES):
  wins = sarsa(env)
  if wins > success:
    success = wins
  sum += wins
mean = sum/N_SES
print("The mean wins of the agent are {} of {} games after {} learning sessions".format(mean, N_EPISODES, N_SES))
print("The best result of the agent was {} wins of {} games after {} learning sessions".format(success, N_EPISODES, N_SES))

The mean wins of the agent are 1668.4 of 10000 games after 10 learning sessions
The best result of the agent was 3019 wins of 10000 games after 10 learning sessions
