In [1]:
!pip install gym



In [2]:
import gym
import random

In [3]:
# Create the Taxi-v3 environment; the later cells read its observation/action
# space sizes and step through it.
env = gym.make("Taxi-v3")

# Display the environment's current state.
env.render()

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+



In [4]:
# Learning rate: fraction of the temporal-difference error added to a Q-value per update.
alpha = 0.4
# Discount factor applied to the best Q-value of the next state.
gamma = 0.999
# Exploration rate: probability of sampling a random action instead of the greedy one.
epsilon = 0.017

In [5]:
# Tabular Q-function: one entry per (state, action) pair, every value starting at 0.
q = {
    (state, action): 0
    for state in range(env.observation_space.n)
    for action in range(env.action_space.n)
}

In [6]:
def update_q_table(prev_state, action, reward, nextstate, alpha, gamma):
    """Apply one Q-learning update to the module-level table ``q``.

    The entry for ``(prev_state, action)`` is moved towards the target
    ``reward + gamma * max_a q[(nextstate, a)]`` by a fraction ``alpha``.

    Args:
        prev_state: State in which ``action`` was taken.
        action: Action that was taken.
        reward: Reward received for the transition.
        nextstate: State reached after the transition.
        alpha: Learning rate.
        gamma: Discount factor.

    Returns:
        None. ``q`` is modified in place.
    """
    # Highest Q-value reachable from the next state over all actions.
    best_next_value = max(
        q[(nextstate, candidate)] for candidate in range(env.action_space.n)
    )

    entry = (prev_state, action)
    td_error = reward + gamma * best_next_value - q[entry]
    q[entry] += alpha * td_error

In [7]:
def epsilon_greedy_policy(state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` an action is sampled from the environment's
    action space; otherwise the action with the highest value in the
    module-level table ``q`` is returned (the lowest-numbered action wins ties).

    Args:
        state: Current state, used as the first component of the ``q`` key.
        epsilon: Exploration probability.

    Returns:
        The selected action.
    """
    explore = random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()

    # Exploit: max() keeps the first maximal element, so ties go to the
    # smallest action index.
    candidates = range(env.action_space.n)
    return max(candidates, key=lambda candidate: q[(state, candidate)])

In [11]:
# Run 10 Q-learning episodes, rendering every step and printing each episode's
# total reward.
for i in range(10):
    print(i + 1, "번 째 시도")

    # Cumulative reward collected during this episode.
    r = 0

    # reset() returns either the observation alone or an (observation, info)
    # tuple depending on the gym release. The Q-table is keyed by integer
    # states, so a tuple here can only be the (observation, info) form.
    reset_result = env.reset()
    prev_state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

    while True:
        env.render()

        # In each state, select the action with the epsilon-greedy policy.
        action = epsilon_greedy_policy(prev_state, epsilon)

        # Perform the action, move to the next state and receive the reward.
        # step() returns 4 values (..., done, info) on older gym releases and
        # 5 values (..., terminated, truncated, info) on newer ones.
        step_result = env.step(action)
        if len(step_result) == 5:
            nextstate, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            nextstate, reward, done, _ = step_result

        # Update the Q-value with the Q-learning update rule.
        update_q_table(prev_state, action, reward, nextstate, alpha, gamma)

        # The next state becomes the current state for the following step.
        prev_state = nextstate

        # Accumulate the reward obtained so far.
        r += reward

        # Leave the loop once the episode has ended.
        if done:
            break

    print("total reward : ", r)

env.close()

1 번 째 시도
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+

!
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (South)
!
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
!
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|B: |
+---------+
  (East)
!
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|B: |
+---------+
  (South)
!
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|B: |
+---------+
  (East)
!
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
!
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |B: |
+---------+
  (North)
!
+---

!
+---------+
|R:[42m_[0m| : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
!
+---------+
|[42mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
!
+---------+
|[42mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
!
+---------+
|[42mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
!
+---------+
|R:[42m_[0m| : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
!
+---------+
|R: | : :[35mG[0m|
| :[42m_[0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)
!
+---------+
|R: | : :[35mG[0m|
| :[42m_[0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
!
+---------+
|R: | : :[35mG[0m|
| :[42m_[0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
!
total reward :  -209
4 번 째 시도
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
|

+---------+
|R: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
!
+---------+
|R: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)
!
+---------+
|R: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
!
+---------+
|R: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)
!
+---------+
|R: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)
!
+---------+
|R: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
!
+---------+
|R: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
!
+---------+
|R: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
!
+--

!
+---------+
|R: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
!
+---------+
|R: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (East)
!
+---------+
|R: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
!
+---------+
|R: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
!
+---------+
|R: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)
!
+---------+
|R: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
!
+---------+
|R: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (East)
!
+---------+
|R: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
!
+-