In [1]:
import gym
import numpy as np

from tqdm import tqdm_notebook

# Q-Learning


## Bellman Equation 

$$ Q(s, a) = learning\ rate \cdot (r + \gamma( max(Q(s^{\prime}, a^{\prime})))) $$

## Q Function

$$ Q(s, a) = Q(s,a) + \text{lr} \left[ R(s, a) + \gamma \max Q^\prime (s^\prime, a^\prime) - Q(s, a) \right] $$

* $ \text{lr} $ : Learning rate
* $ R(s, a) $ : 현재 state, action으로 얻은 reward
* $ Q $ : 현재의 Q value
* $ \max Q^\prime (s^\prime, a^\prime) $ : Maximum future reward
* $ s^\prime $ : step(action)으로 얻은 next_state
* $ \gamma $ : Discount rate

## Build Q Table

In [2]:
env = gym.make('MountainCar-v0')
n_state = (env.observation_space.high - env.observation_space.low) * np.array([10, 100])
n_state = np.round(n_state, 0).astype(int) + 1

Q = np.random.uniform(-1, 1, size=(n_state[0], n_state[1], env.action_space.n))
print('Q shape:', Q.shape)
print('Q Sample:')
print(Q[1:2])

Q shape: (19, 15, 3)
Q Sample:
[[[-0.93550143  0.49348608  0.28042329]
  [-0.51138641  0.60662467  0.20133321]
  [-0.7740239  -0.40789656 -0.25508083]
  [ 0.96154604  0.78241915  0.38204132]
  [-0.49029844 -0.40603148 -0.5912724 ]
  [-0.52286276 -0.78501707 -0.28431108]
  [ 0.22901036  0.37104667  0.50171052]
  [-0.4238752  -0.69301336  0.22947357]
  [-0.30397968 -0.18516846 -0.75730867]
  [ 0.70832296 -0.72325355 -0.15829449]
  [ 0.78304422  0.18435865  0.13484241]
  [-0.07186571  0.40669639  0.57055028]
  [ 0.10579579 -0.42285466  0.98768751]
  [-0.17498335  0.52100423 -0.15624894]
  [-0.80557831 -0.55626878  0.8144684 ]]]


In [None]:
def discretize(env, state):
    state = (state - env.observation_space.low) * np.array([10, 100])
    state = np.round(state, 0).astype(int)
    return state

def train(env, Q, epochs=10000, lr=0.1, gamma=0.9, epsilon=0.9):
    reduction = epsilon/epochs
    action_n = env.action_space.n
    
    rewards = list()
    
    for epoch in tqdm_notebook(range(epochs)):
        state = env.reset()
        state = discretize(env, state)
        
        done = False
        _tot_reward = 0
        _tot_rand_action = 0
        _tot_q_action = 0
        _max_pos = 0
        
        while not done:

            # Calculate next action
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state[0], state[1]])
                _tot_q_action += 1
            else:
                action = np.random.randint(0, action_n)
                _tot_rand_action += 1
                
            # Step!
            next_state, reward, done, info = env.step(action)
            next_state_apx = discretize(env, next_state)

            # Terminal Update
            if done and next_state[0] >= 0.5:
                Q[next_state_apx[0], next_state_apx[1], action] = reward
            else:
                delta = lr * (reward + gamma * np.max(Q[next_state_apx[0], next_state_apx[1]]) - 
                              Q[state[0], state[1], action])
                Q[state[0], state[1], action] += delta
            
            state = next_state_apx
            _tot_reward += reward
            
        # Decay Epsilon
        if epsilon > 0:
            epsilon -= reduction
            epsilon = round(epsilon, 4)
            
        # Track Rewards
        rewards.append(_tot_reward)
        
        # Log
        if epoch%100 == 0:
            print(f'\repoch:{epoch} | tot reward:{_tot_reward} | epsilon:{epsilon} | ' 
                  f'rand action:{_tot_rand_action} | Q action:{_tot_q_action}')

train(env, Q)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

epoch:0 | tot reward:-200.0 | epsilon:0.8999 | rand action:178 | Q action:22
epoch:100 | tot reward:-200.0 | epsilon:0.8899 | rand action:170 | Q action:30
epoch:200 | tot reward:-200.0 | epsilon:0.8799 | rand action:168 | Q action:32
epoch:300 | tot reward:-200.0 | epsilon:0.8699 | rand action:170 | Q action:30
epoch:400 | tot reward:-200.0 | epsilon:0.8599 | rand action:163 | Q action:37
epoch:500 | tot reward:-200.0 | epsilon:0.8499 | rand action:164 | Q action:36
epoch:600 | tot reward:-200.0 | epsilon:0.8399 | rand action:165 | Q action:35
epoch:700 | tot reward:-200.0 | epsilon:0.8299 | rand action:162 | Q action:38
epoch:800 | tot reward:-200.0 | epsilon:0.8199 | rand action:159 | Q action:41
epoch:900 | tot reward:-200.0 | epsilon:0.8099 | rand action:155 | Q action:45
epoch:1000 | tot reward:-200.0 | epsilon:0.7999 | rand action:162 | Q action:38
epoch:1100 | tot reward:-200.0 | epsilon:0.7899 | rand action:163 | Q action:37
epoch:1200 | tot reward:-200.0 | epsilon:0.7799 | ra

## Computer Play

In [None]:
env = gym.make('MountainCar-v0')
state = env.reset()
state = discretize(env, state)

env.render()
input()

while True:
    env.render()
    action = np.argmax(Q[state[0], state[1]])
    state, reward, done, info = env.step(action)
    state = discretize(env, state)
    
    print(f'\rstate:{state} | reward:{reward} | done:{done} | info:{info}')
    
    if done:
        break

## Human Play 

In [13]:
env = gym.make('MountainCar-v0')
env.reset()
while True:
    env.render()
    while True:
        action = int(input())
        if action in [0, 1, 2]:
            break
    state, reward, done, info = env.step(action)
    print(f'\rstate:{state} | reward:{reward} | done:{done} | info:{info}')

0
state:[-5.96023268e-01 -4.64482215e-04] | reward:-1.0 | done:False | info:{}
0
state:[-0.59694883 -0.00092556] | reward:-1.0 | done:False | info:{}
0
state:[-0.5983287  -0.00137987] | reward:-1.0 | done:False | info:{}
0
state:[-0.60015277 -0.00182407] | reward:-1.0 | done:False | info:{}
0
state:[-0.60240772 -0.00225495] | reward:-1.0 | done:False | info:{}
0
state:[-0.6050771  -0.00266938] | reward:-1.0 | done:False | info:{}
0
state:[-0.60814146 -0.00306436] | reward:-1.0 | done:False | info:{}
0
state:[-0.61157852 -0.00343706] | reward:-1.0 | done:False | info:{}
0
state:[-0.61536337 -0.00378485] | reward:-1.0 | done:False | info:{}
0
state:[-0.61946865 -0.00410528] | reward:-1.0 | done:False | info:{}
0
state:[-0.62386477 -0.00439612] | reward:-1.0 | done:False | info:{}
0
state:[-0.62852019 -0.00465542] | reward:-1.0 | done:False | info:{}
0
state:[-0.63340163 -0.00488144] | reward:-1.0 | done:False | info:{}
0
state:[-0.63847435 -0.00507273] | reward:-1.0 | done:False | info:{

2
state:[-0.21406428  0.06130218] | reward:-1.0 | done:False | info:{}
2
state:[-0.15376406  0.06030022] | reward:-1.0 | done:False | info:{}
2
state:[-0.09470254  0.05906152] | reward:-1.0 | done:False | info:{}
2
state:[-0.0370408   0.05766174] | reward:-1.0 | done:False | info:{}
2
state:[0.01913637 0.05617716] | reward:-1.0 | done:False | info:{}
2
state:[0.07381765 0.05468128] | reward:-1.0 | done:False | info:{}
2
state:[0.12705998 0.05324233] | reward:-1.0 | done:False | info:{}
2
state:[0.17898175 0.05192177] | reward:-1.0 | done:False | info:{}
2
state:[0.22975533 0.05077358] | reward:-1.0 | done:False | info:{}
2
state:[0.27959963 0.0498443 ] | reward:-1.0 | done:False | info:{}
2
state:[0.32877303 0.04917341] | reward:-1.0 | done:False | info:{}
2
state:[0.37756703 0.048794  ] | reward:-1.0 | done:False | info:{}
2
state:[0.42630049 0.04873346] | reward:-1.0 | done:False | info:{}
2
state:[0.47531453 0.04901404] | reward:-1.0 | done:False | info:{}
2
state:[0.5249677  0.0496

KeyboardInterrupt: 

## MP4 to GIf

In [18]:
from moviepy.editor import VideoFileClip
from moviepy.video.fx import rotate

clip = VideoFileClip('mountain-car.mp4', audio=False)
clip = clip
clip.write_gif('mountain-car.gif', fps=32)

t:   2%|▏         | 5/280 [00:00<00:05, 46.64it/s, now=None]

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
MoviePy - Building file mountain-car.gif with imageio.


                                                              