In [1]:
import gym

In [2]:
# Create the Taxi-v3 environment and keep the object behind `.env` rather than
# what gym.make() returns directly.
# NOTE(review): presumably this bypasses the TimeLimit wrapper so episodes are
# never truncated — confirm for the installed gym version.
env = gym.make("Taxi-v3").env
# Draw the current board to the cell output.
env.render()

+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+



In [3]:
# Start a fresh episode and draw the resulting board.
env.reset()
env.render()

# Report how many discrete actions and states the environment exposes.
print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |[35mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


Cek berapa kali bisa melakukan percobaan

In [4]:
# Turn a hand-picked configuration into its integer state id.
# NOTE(review): arguments are presumably (taxi_row, taxi_col,
# passenger_location, destination) — confirm against the Taxi-v3 `encode` docs.
state = env.encode(1, 2, 3, 2)
print("State : ", state)

# Overwrite the environment's current state with the encoded one, then draw it.
env.s = state
env.render()

State :  154
+---------+
|R: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



Algoritma Q-Learning (diawali dengan baseline aksi acak sebagai pembanding)

In [5]:
# Baseline without learning: start from state 154 and keep sampling random
# actions until the environment reports that the episode is done.
env.s = 154

epochs = 0
penalties = 0
reward = 0

# One record per step; a later cell replays these as an animation.
frames = []

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is what this notebook counts as a penalty.
    penalties += 1 if reward == -10 else 0

    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward,
    })

    epochs += 1

print("Timesteps : {}".format(epochs))
print("Penalties : {}".format(penalties))

Timesteps : 528
Penalties : 175


In [6]:
from IPython.display import  clear_output
from time import sleep

def print_frames(frames, delay=.1):
    """Replay recorded environment steps as an in-place text animation.

    Each frame replaces the previous one in the cell output and is followed
    by its 1-based timestep number and the recorded state, action and reward.

    Args:
        frames: Iterable of dicts with the keys 'frame' (text to print),
            'state', 'action' and 'reward'.
        delay: Seconds to pause after each frame. Defaults to 0.1, the value
            that used to be hard-coded; pass 0 to replay without pausing.
    """
    for step, frame in enumerate(frames, start=1):
        # wait=True is passed so the old frame is replaced rather than the
        # output area being blanked first (see IPython's clear_output docs).
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep : {step}")
        print(f"State : {frame['state']}")
        print(f"Action : {frame['action']}")
        print(f"Reward : {frame['reward']}")
        sleep(delay)

# Animate the steps recorded in `frames`.
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep : 528
State : 410
Action : 5
Reward : 20


In [7]:
import numpy as np
# Q-table with one row per state and one column per action, all zeros to start.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [8]:
%%time

import random
from IPython.display import clear_output

alpha = 0.1
gamma = 0.6
epsilon = 0.1

all_epochs = []
all_penalties = []

for i in range(1, 100001) :
  state = env.reset()

  epochs, penalties, reward, = 0,0,0
  done = False

  while not done :
    if random.uniform(0,1) < epsilon :
      action = env.action_space.sample()
    else :
      action = np.argmax(q_table[state])
    
    next_state, reward, done, info = env.step(action)

    old_value = q_table[state, action]
    next_max = np.max(q_table[next_state])

    new_value = (1-alpha) * old_value + alpha * (reward + gamma * next_max)
    q_table[state, action] = new_value

    if reward == -10 :
      penalties += 1
    
    state = next_state
    epochs += 1

  if i % 100 == 0 :
    clear_output(wait = True)
    print(f"Episode : {i}")


print("Training finished")

Episode : 100000
Training finished
CPU times: user 1min 8s, sys: 15.9 s, total: 1min 24s
Wall time: 1min 9s


In [9]:
# Evaluate the learned Q-table: always take the highest-valued action and
# average the steps and -10 rewards over a fixed number of episodes.
episodes = 100
total_epoch = 0
total_penalties = 0

# Every step of every evaluation episode, kept for replay in a later cell.
frames = []

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0

    done = False

    while not done:
        # Purely greedy: no exploration during evaluation.
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        penalties += 1 if reward == -10 else 0

        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward,
        })

        epochs += 1

    total_epoch += epochs
    total_penalties += penalties

print(f"result after {episodes} episodes : ")
print(f"Average timestep per episode : {total_epoch / episodes}")
print(f"Average penalties per episode : {total_penalties / episodes}")

result after 100 episodes : 
Average timestep per episode : 12.99
Average penalties per episode : 0.0


In [10]:
# Animate the steps currently stored in `frames` (the evaluation cell above
# rebuilds this list, so all evaluation episodes play back to back).
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Timestep : 1299
State : 475
Action : 5
Reward : 20
