In [1]:
import gym
import numpy as np

from tqdm import tqdm_notebook

# Q-Learning


## Bellman Equation 

$$ Q(s, a) = learning\ rate \cdot (r + \gamma( max(Q(s^{\prime}, a^{\prime})))) $$

## Q Function

$$ Q(s, a) = Q(s,a) + \text{lr} \left[ R(s, a) + \gamma \max Q^\prime (s^\prime, a^\prime) - Q(s, a) \right] $$

* $ \text{lr} $ : Learning rate
* $ R(s, a) $ : 현재 state, action으로 얻은 reward
* $ Q $ : 현재의 Q value
* $ \max Q^\prime (s^\prime, a^\prime) $ : Maximum future reward
* $ s^\prime $ : step(action)으로 얻은 next_state
* $ \gamma $ : Discount rate

## Build Q Table

In [2]:
env = gym.make('MountainCar-v0')
n_state = (env.observation_space.high - env.observation_space.low) * np.array([10, 100])
n_state = np.round(n_state, 0).astype(int) + 1

Q = np.random.uniform(-1, 1, size=(n_state[0], n_state[1], env.action_space.n))
print('Q shape:', Q.shape)
print('Q Sample:')
print(Q[1:2])

Q shape: (19, 15, 3)
Q Sample:
[[[ 0.40362861 -0.33400827 -0.44276763]
  [-0.20425145  0.44374132  0.35643621]
  [ 0.6735468  -0.3671098   0.10382042]
  [ 0.07533884 -0.9838463   0.23892995]
  [-0.29104912  0.78122936 -0.84611013]
  [ 0.69975269  0.88235987  0.12485457]
  [ 0.22286157  0.54722626 -0.16735047]
  [ 0.3656084   0.77050298 -0.19274283]
  [-0.87375598  0.35647618  0.48079769]
  [-0.84785423  0.04108182  0.30542148]
  [ 0.05457983 -0.44679372 -0.60764636]
  [ 0.52302156  0.87381095 -0.94724575]
  [-0.52528738  0.77206436  0.68115652]
  [-0.73443984  0.9866018  -0.45258224]
  [ 0.47438185 -0.06215433  0.08366828]]]


In [3]:
def discretize(env, state):
    state = (state - env.observation_space.low) * np.array([10, 100])
    state = np.round(state, 0).astype(int)
    return state

def train(env, Q, epochs=10000, lr=0.1, gamma=0.9, epsilon=0.9):
    np.random.seed(2424)
    reduction = epsilon/epochs
    action_n = env.action_space.n
    
    rewards = list()
    
    for epoch in tqdm_notebook(range(epochs)):
        state = env.reset()
        state = discretize(env, state)
        
        done = False
        _tot_reward = 0
        _tot_rand_action = 0
        _tot_q_action = 0
        _max_pos = 0
        
        while not done:

            # Calculate next action
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state[0], state[1]])
                _tot_q_action += 1
            else:
                action = np.random.randint(0, action_n)
                _tot_rand_action += 1
                
            # Step!
            next_state, reward, done, info = env.step(action)
            next_state_apx = discretize(env, next_state)

            # Terminal Update
            if done and next_state[0] >= 0.5:
                Q[next_state_apx[0], next_state_apx[1], action] = reward
            else:
                delta = lr * (reward + gamma * np.max(Q[next_state_apx[0], next_state_apx[1]]) - 
                              Q[state[0], state[1], action])
                Q[state[0], state[1], action] += delta
            
            state = next_state_apx
            _tot_reward += reward
            
        # Decay Epsilon
        if epsilon > 0:
            epsilon -= reduction
            epsilon = round(epsilon, 4)
            
        # Track Rewards
        rewards.append(_tot_reward)
        
        # Log
        if epoch%100 == 0:
            print(f'\repoch:{epoch} | tot reward:{_tot_reward} | epsilon:{epsilon} | ' 
                  f'rand action:{_tot_rand_action} | Q action:{_tot_q_action}')

train(env, Q)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

epoch:0 | tot reward:-200.0 | epsilon:0.8999 | rand action:177 | Q action:23
epoch:100 | tot reward:-200.0 | epsilon:0.8899 | rand action:179 | Q action:21
epoch:200 | tot reward:-200.0 | epsilon:0.8799 | rand action:174 | Q action:26
epoch:300 | tot reward:-200.0 | epsilon:0.8699 | rand action:183 | Q action:17
epoch:400 | tot reward:-200.0 | epsilon:0.8599 | rand action:170 | Q action:30
epoch:500 | tot reward:-200.0 | epsilon:0.8499 | rand action:163 | Q action:37
epoch:600 | tot reward:-200.0 | epsilon:0.8399 | rand action:166 | Q action:34
epoch:700 | tot reward:-200.0 | epsilon:0.8299 | rand action:163 | Q action:37
epoch:800 | tot reward:-200.0 | epsilon:0.8199 | rand action:163 | Q action:37
epoch:900 | tot reward:-200.0 | epsilon:0.8099 | rand action:167 | Q action:33
epoch:1000 | tot reward:-200.0 | epsilon:0.7999 | rand action:149 | Q action:51
epoch:1100 | tot reward:-200.0 | epsilon:0.7899 | rand action:168 | Q action:32
epoch:1200 | tot reward:-200.0 | epsilon:0.7799 | ra

## Computer Play

In [4]:
env = gym.make('MountainCar-v0')
state = env.reset()
state = discretize(env, state)

env.render()
input()

while True:
    env.render()
    action = np.argmax(Q[state[0], state[1]])
    state, reward, done, info = env.step(action)
    state = discretize(env, state)
    
    print(f'\rstate:{state} | reward:{reward} | done:{done} | info:{info}')
    
    if done:
        break


state:[7 7] | reward:-1.0 | done:False | info:{}
state:[7 7] | reward:-1.0 | done:False | info:{}
state:[7 7] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 5] | reward:-1.0 | done:False | info:{}
state:[5 5] | reward:-1.0 | done:False | info:{}
state:[5 5] | reward:-1.0 | done:False | info:{}
state:[5 5] | reward:-1.0 | done:False | info:{}
state:[5 6] | rewar

## Human Play 

In [6]:
env = gym.make('MountainCar-v0')
env.reset()
while True:
    env.render()
    while True:
        action = int(input())
        if action in [0, 1, 2]:
            break
    state, reward, done, info = env.step(action)
    print(f'\rstate:{state} | reward:{reward} | done:{done} | info:{info}')

0
state:[-0.43620921 -0.00165996] | reward:-1.0 | done:False | info:{}
1
state:[-0.43851711 -0.00230789] | reward:-1.0 | done:False | info:{}
1
state:[-0.44145621 -0.0029391 ] | reward:-1.0 | done:False | info:{}
1
state:[-0.44500516 -0.00354895] | reward:-1.0 | done:False | info:{}
1
state:[-0.44913812 -0.00413296] | reward:-1.0 | done:False | info:{}
1
state:[-0.4538249  -0.00468678] | reward:-1.0 | done:False | info:{}
1
state:[-0.45903118 -0.00520627] | reward:-1.0 | done:False | info:{}
1
state:[-0.46471868 -0.00568751] | reward:-1.0 | done:False | info:{}
1
state:[-0.4708455  -0.00612681] | reward:-1.0 | done:False | info:{}
2
state:[-0.47636631 -0.00552081] | reward:-1.0 | done:False | info:{}
2
state:[-0.48124019 -0.00487387] | reward:-1.0 | done:False | info:{}
2
state:[-0.4854309  -0.00419071] | reward:-1.0 | done:False | info:{}
2
state:[-0.48890724 -0.00347634] | reward:-1.0 | done:False | info:{}
22
2
state:[-0.4916433  -0.00273606] | reward:-1.0 | done:False | info:{}
2
s

KeyboardInterrupt: 

## MP4 to GIf

In [None]:
from moviepy.editor import VideoFileClip
from moviepy.video.fx import rotate

clip = VideoFileClip('mountain-car.mp4', audio=False)
clip = clip
clip.write_gif('mountain-car.gif', fps=32)