In [5]:
import gym
import numpy as np

from tqdm import tqdm_notebook

# Q-Learning


## Bellman Equation 

$$ Q(s, a) = learning\ rate \cdot (r + \gamma( max(Q(s^{\prime}, a^{\prime})))) $$

## Q Function

$$ Q(s, a) = Q(s,a) + \text{lr} \left[ R(s, a) + \gamma \max Q^\prime (s^\prime, a^\prime) - Q(s, a) \right] $$

* $ \text{lr} $ : Learning rate
* $ R(s, a) $ : 현재 state, action으로 얻은 reward
* $ Q $ : 현재의 Q value
* $ \max Q^\prime (s^\prime, a^\prime) $ : Maximum future reward
* $ s^\prime $ : step(action)으로 얻은 next_state
* $ \gamma $ : Discount rate

## Build Q Table

In [10]:
env = gym.make('MountainCar-v0')
n_state = (env.observation_space.high - env.observation_space.low) * np.array([10, 100])
n_state = np.round(n_state, 0).astype(int) + 1

Q = np.random.uniform(-1, 1, size=(n_state[0], n_state[1], env.action_space.n))
print('Q shape:', Q.shape)
print('Q Sample:')
print(Q[1:2])

Q shape: (19, 15, 3)
Q Sample:
[[[ 0.82619025 -0.06216362 -0.68491643]
  [-0.46537736  0.0026488   0.76352481]
  [-0.46084451  0.41834014 -0.70205364]
  [-0.50025345 -0.99267844  0.44491119]
  [ 0.52070402 -0.08365642  0.97717661]
  [-0.46801954  0.92435095  0.63541954]
  [ 0.57558932 -0.9850298   0.42748597]
  [ 0.7221162  -0.21099647  0.41876368]
  [ 0.67178309  0.32167557 -0.48464403]
  [-0.09345879 -0.39326889 -0.69800853]
  [ 0.63033553  0.79487805 -0.88664955]
  [ 0.13433765  0.87601109  0.6994363 ]
  [ 0.33959778  0.29759607 -0.22626105]
  [ 0.36603566  0.11900018 -0.38516679]
  [ 0.5702685   0.66159952  0.04293651]]]


In [22]:
def discretize(env, state):
    state = (state - env.observation_space.low) * np.array([10, 100])
    state = np.round(state, 0).astype(int)
    return state

def train(env, Q, epochs=10000, lr=0.1, gamma=0.9, epsilon=0.9):
    reduction = epsilon/epochs
    action_n = env.action_space.n
    
    rewards = list()
    
    for epoch in tqdm_notebook(range(epochs)):
        state = env.reset()
        state = discretize(env, state)
        
        done = False
        _tot_reward = 0
        _tot_rand_action = 0
        _tot_q_action = 0
        _max_pos = 0
        
        while not done:
#             env.render()

            # Calculate next action
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state[0], state[1]])
                _tot_q_action += 1
            else:
                action = np.random.randint(0, action_n)
                _tot_rand_action += 1
                
            # Step!
            next_state, reward, done, info = env.step(action)
            next_state_apx = discretize(env, next_state)

            # Terminal Update
            if done and next_state[0] >= 0.5:
                reward = 1
                Q[next_state_apx[0], next_state_apx[1], action] = reward
            else:
                delta = lr * (reward + gamma * np.max(Q[next_state_apx[0], next_state_apx[1]]) - 
                              Q[state[0], state[1], action])
                Q[state[0], state[1], action] += delta
            
            state = next_state_apx
            _tot_reward += reward
            
        # Decay Epsilon
        if epsilon > 0:
            epsilon -= reduction
            epsilon = round(epsilon, 4)
            
        # Track Rewards
        rewards.append(_tot_reward)
        
        # Log
        if epoch%100 == 0:
            print(f'\repoch:{epoch} | tot reward:{_tot_reward} | epsilon:{epsilon} | ' 
                  f'rand action:{_tot_rand_action} | Q action:{_tot_q_action}')

train(env, Q)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

epoch:0 | tot reward:-200.0 | epsilon:0.8999 | rand action:183 | Q action:17
epoch:100 | tot reward:-200.0 | epsilon:0.8899 | rand action:184 | Q action:16
epoch:200 | tot reward:-200.0 | epsilon:0.8799 | rand action:177 | Q action:23
epoch:300 | tot reward:-200.0 | epsilon:0.8699 | rand action:174 | Q action:26
epoch:400 | tot reward:-200.0 | epsilon:0.8599 | rand action:166 | Q action:34
epoch:500 | tot reward:-200.0 | epsilon:0.8499 | rand action:163 | Q action:37
epoch:600 | tot reward:-200.0 | epsilon:0.8399 | rand action:172 | Q action:28
epoch:700 | tot reward:-200.0 | epsilon:0.8299 | rand action:167 | Q action:33
epoch:800 | tot reward:-200.0 | epsilon:0.8199 | rand action:159 | Q action:41
epoch:900 | tot reward:-200.0 | epsilon:0.8099 | rand action:158 | Q action:42
epoch:1000 | tot reward:-200.0 | epsilon:0.7999 | rand action:164 | Q action:36
epoch:1100 | tot reward:-200.0 | epsilon:0.7899 | rand action:156 | Q action:44
epoch:1200 | tot reward:-200.0 | epsilon:0.7799 | ra

## Computer Play

In [29]:
env = gym.make('MountainCar-v0')
state = env.reset()
state = discretize(env, state)

env.render()
input()

while True:
    env.render()
    action = np.argmax(Q[state[0], state[1]])
    state, reward, done, info = env.step(action)
    state = discretize(env, state)
    
    print(f'\rstate:{state} | reward:{reward} | done:{done} | info:{info}')
    
    if done:
        break


state:[7 7] | reward:-1.0 | done:False | info:{}
state:[7 7] | reward:-1.0 | done:False | info:{}
state:[7 7] | reward:-1.0 | done:False | info:{}
state:[7 7] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[7 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[6 6] | reward:-1.0 | done:False | info:{}
state:[5 6] | reward:-1.0 | done:False | info:{}
state:[5 6] | reward:-1.0 | done:False | info:{}
state:[5 6] | reward:-1.0 | done:False | info:{}
state:[5 6] | rewar

## Human Play 

In [13]:
env = gym.make('MountainCar-v0')
env.reset()
while True:
    env.render()
    while True:
        action = int(input())
        if action in [0, 1, 2]:
            break
    state, reward, done, info = env.step(action)
    print(f'\rstate:{state} | reward:{reward} | done:{done} | info:{info}')

0
state:[-5.96023268e-01 -4.64482215e-04] | reward:-1.0 | done:False | info:{}
0
state:[-0.59694883 -0.00092556] | reward:-1.0 | done:False | info:{}
0
state:[-0.5983287  -0.00137987] | reward:-1.0 | done:False | info:{}
0
state:[-0.60015277 -0.00182407] | reward:-1.0 | done:False | info:{}
0
state:[-0.60240772 -0.00225495] | reward:-1.0 | done:False | info:{}
0
state:[-0.6050771  -0.00266938] | reward:-1.0 | done:False | info:{}
0
state:[-0.60814146 -0.00306436] | reward:-1.0 | done:False | info:{}
0
state:[-0.61157852 -0.00343706] | reward:-1.0 | done:False | info:{}
0
state:[-0.61536337 -0.00378485] | reward:-1.0 | done:False | info:{}
0
state:[-0.61946865 -0.00410528] | reward:-1.0 | done:False | info:{}
0
state:[-0.62386477 -0.00439612] | reward:-1.0 | done:False | info:{}
0
state:[-0.62852019 -0.00465542] | reward:-1.0 | done:False | info:{}
0
state:[-0.63340163 -0.00488144] | reward:-1.0 | done:False | info:{}
0
state:[-0.63847435 -0.00507273] | reward:-1.0 | done:False | info:{

2
state:[-0.21406428  0.06130218] | reward:-1.0 | done:False | info:{}
2
state:[-0.15376406  0.06030022] | reward:-1.0 | done:False | info:{}
2
state:[-0.09470254  0.05906152] | reward:-1.0 | done:False | info:{}
2
state:[-0.0370408   0.05766174] | reward:-1.0 | done:False | info:{}
2
state:[0.01913637 0.05617716] | reward:-1.0 | done:False | info:{}
2
state:[0.07381765 0.05468128] | reward:-1.0 | done:False | info:{}
2
state:[0.12705998 0.05324233] | reward:-1.0 | done:False | info:{}
2
state:[0.17898175 0.05192177] | reward:-1.0 | done:False | info:{}
2
state:[0.22975533 0.05077358] | reward:-1.0 | done:False | info:{}
2
state:[0.27959963 0.0498443 ] | reward:-1.0 | done:False | info:{}
2
state:[0.32877303 0.04917341] | reward:-1.0 | done:False | info:{}
2
state:[0.37756703 0.048794  ] | reward:-1.0 | done:False | info:{}
2
state:[0.42630049 0.04873346] | reward:-1.0 | done:False | info:{}
2
state:[0.47531453 0.04901404] | reward:-1.0 | done:False | info:{}
2
state:[0.5249677  0.0496

KeyboardInterrupt: 

In [16]:
class QLearning(object):
    
    def __init__(self, game, epoch = 5000, lr=0.9, gamma=0.95):
        self.env = gym.make(game)
        self.epoch = epoch
        self.lr = lr
        self.gamma = gamma
        self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        
    def train(self):
        env, Q, lr, gamma = self.env, self.Q, self.lr, self.gamma
        for i in range(1, self.epoch+1):
            state = self.env.reset()
            
            while True:
                action = np.argmax(Q[state,:] + np.random.randn(1, env.action_space.n)*(1./(i+1)))
                next_state, reward, done, info = env.step(action)
                
                Q[state, action] += lr*(reward + gamma * max(Q[next_state,:]) - Q[state, action])
                print(state)
                if done:
                    break
                    
                state = next_state
            
            if i% 1000 == 0:
                self.render()
#                 print('epoch:', i
#                 print Q
    
    def play(self, render=True):
        env, Q = self.env, self.Q
        score = 0
        count = 0
        state = env.reset()
        while True:
            action = np.argmax(Q[state, :] + np.random.randn(1, env.action_space.n) * 0.01)
            state, reward, done, info = env.step(action)
            
            if render:
                self.render()
                
            count += 1
            score += reward
            if done:
                self.render()
                break
        print('Total Score:', score, 'Move Count:', count)
        return score, count
    
    def render(self):
        clear_output(True)
        self.env.render()
        sleep(0.05)

q = QLearning('MountainCar-v0')
q.train()

AttributeError: 'Box' object has no attribute 'n'