In [1]:
import os

import gym
import numpy as np

from keras.layers import Dense, Softmax
from keras.models import Sequential

# Create the CartPole task and show its observation and action spaces.
env = gym.make('CartPole-v0')
print(env.observation_space, env.action_space, sep='\n')

# Problem dimensions used to size the agent: observation length and action count.
n_features, n_actions = env.observation_space.shape[0], env.action_space.n

In [None]:
class DeepQNetwork:
    """Deep Q-Network agent with a replay buffer and a periodically synced target network.

    Two identically built Keras models are kept: ``eval_model`` is fitted on
    every call to :meth:`model_train`, while ``target_model`` supplies the
    bootstrap values and is overwritten with the ``eval_model`` weights every
    ``update_weights`` training steps.

    Parameters
    ----------
    n_actions : int
        Number of discrete actions (width of the network output).
    n_features : int
        Length of one observation vector (width of the network input).
    e_greedy : float
        Probability of taking the greedy action in :meth:`choose_action`;
        otherwise a uniformly random action is returned.
    reward_decay : float
        Discount factor applied to the bootstrapped next-state value.
    memory_size : int
        Number of rows in the replay buffer; once full, the oldest rows are
        overwritten in ring-buffer order.
    batch_size : int
        Number of transitions sampled (with replacement) per training step.
    update_weights : int
        Number of training steps between target-network synchronisations.
    """

    def __init__(self,
                 n_actions,
                 n_features,
                 e_greedy=0.9,
                 reward_decay=0.9,
                 memory_size=500,
                 batch_size=100,
                 update_weights=300):
        self.n_actions = n_actions
        self.n_features = n_features
        self.e_greedy = e_greedy
        self.reward_decay = reward_decay
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.update_weights = update_weights

        self.eval_model = self._build_model()
        self.target_model = self._build_model()

        # Replay buffer, one transition per row:
        #   [ s (n_features) | action | reward | done | s_ (n_features) ]
        self.memory = np.zeros([self.memory_size, 2 * self.n_features + 3])
        self.memory_counter = 0  # total transitions ever stored (not capped)
        self.learn_counter = 0   # training steps actually performed

    def _build_model(self):
        """Return a compiled one-hidden-layer network mapping a state to one value per action."""
        model = Sequential()
        model.add(Dense(10, input_shape=(self.n_features,), activation='relu'))
        model.add(Dense(self.n_actions))
        model.compile(optimizer='adam', loss='mse')
        return model

    def choose_action(self, obs):
        """Pick an action for a single observation.

        With probability ``e_greedy`` the action with the highest predicted
        value under ``eval_model`` is returned; otherwise a random action.
        """
        if np.random.uniform() < self.e_greedy:
            obs = obs[np.newaxis, :]  # the model expects a batch dimension
            # verbose=0: some Keras versions print a progress bar per predict call.
            action_values = self.eval_model.predict(obs, verbose=0)
            action = np.argmax(action_values)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def store_transition(self, s, a, r, s_, done=False):
        """Write one transition into the replay buffer, overwriting the oldest row when full.

        Parameters
        ----------
        s, s_ : array-like of length ``n_features``
            State before and after the action.
        a : int
            Action taken.
        r : float
            Reward received.
        done : bool, optional
            True when ``s_`` is terminal, so that :meth:`model_train` does not
            bootstrap from it. Defaults to False, which always bootstraps.
        """
        transition = np.hstack((s, [a, r, float(done)], s_))
        self.memory[self.memory_counter % self.memory_size] = transition
        self.memory_counter += 1

    def model_train(self):
        """Run one training step of ``eval_model`` on a random replay batch.

        Does nothing when no transition has been stored yet. Every
        ``update_weights`` steps the target network is first synchronised
        with the online network.
        """
        # Only rows that have actually been written may be sampled.
        stored = min(self.memory_counter, self.memory_size)
        if stored == 0:
            # Sampling from an empty buffer would raise inside np.random.choice.
            return

        self.learn_counter += 1
        if self.learn_counter % self.update_weights == 0:
            print("Update weights!")
            self.target_model.set_weights(self.eval_model.get_weights())

        sample_index = np.random.choice(stored, self.batch_size)
        batch_memory = self.memory[sample_index, :]

        n = self.n_features
        s = batch_memory[:, :n]
        actions = batch_memory[:, n].astype(int)
        rewards = batch_memory[:, n + 1]
        not_done = 1.0 - batch_memory[:, n + 2]
        s_ = batch_memory[:, -n:]

        q_eval = self.eval_model.predict(s, verbose=0)
        q_next = self.target_model.predict(s_, verbose=0)

        # Start from the current predictions so that only the taken action's
        # output contributes to the MSE loss.
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        # Terminal transitions (not_done == 0) use the reward alone as target.
        q_target[batch_index, actions] = (
            rewards + self.reward_decay * not_done * np.max(q_next, axis=1)
        )

        self.eval_model.fit(s, q_target, verbose=0)

    def save(self, model_path):
        """Save the online network (``eval_model``) to ``model_path``."""
        self.eval_model.save(model_path)

In [None]:
# Where the trained network is written; the evaluation cell loads this same path.
MODEL_PATH = 'model/CartPoleModel'

dqn = DeepQNetwork(n_actions, n_features,
                   e_greedy=0.9,
                   reward_decay=0.9,
                   memory_size=500,
                   batch_size=100,
                   update_weights=300)

total_steps = 0
try:
    for i_episode in range(300):
        obs = env.reset()
        ep_r = 0
        while True:
            env.render()
            action = dqn.choose_action(obs)
            obs_, reward, done, info = env.step(action)

            # Replace the environment reward with a shaped one:
            # the smaller theta and closer to center the better
            x, x_dot, theta, theta_dot = obs_
            r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
            reward = r1 + r2

            dqn.store_transition(obs, action, reward, obs_)

            ep_r += reward
            # Let transitions accumulate before the first training step.
            if total_steps > 500:
                dqn.model_train()

            # Stop on environment termination or once the shaped return passes 200.
            if done or ep_r > 200:
                print('episode: ', i_episode,
                      'ep_r: ', round(ep_r, 2))
                break

            obs = obs_
            total_steps += 1
finally:
    # Always close the environment, even if training is interrupted or raises.
    env.close()

# Persist the trained network so a later cell (or a fresh kernel) can load it.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
dqn.save(MODEL_PATH)

Discrete(2)
Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
episode:  0 ep_r:  17.03
episode:  1 ep_r:  14.36
episode:  2 ep_r:  6.98
episode:  3 ep_r:  23.8
episode:  4 ep_r:  7.52
episode:  5 ep_r:  10.27
episode:  6 ep_r:  7.37
episode:  7 ep_r:  21.01
episode:  8 ep_r:  39.54
episode:  9 ep_r:  6.29
episode:  10 ep_r:  20.02
episode:  11 ep_r:  12.83
episode:  12 ep_r:  8.18
episode:  13 ep_r:  6.4
episode:  14 ep_r:  54.66
episode:  15 ep_r:  6.13
Update weights!
episode:  16 ep_r:  24.96
episode:  17 ep_r:  24.35
episode:  18 ep_r:  16.1
episode:  19 ep_r:  9.4
Update weights!
episode:  20 ep_r:  44.28
episode:  21 ep_r:  44.44
Update weights!
episode:  22 ep_r:  92.98
Update weights!
episode:  23 ep_r:  86.2
Update weights!
episode:  24 ep_r:  77.84
episode:  25 ep_r:  62.73
Update weights!
episode:  26 ep_r:  56.4
Update weights!
episode:  

Update weights!
episode:  188 ep_r:  200.37
Update weights!
episode:  189 ep_r:  200.58
Update weights!
episode:  190 ep_r:  200.05
Update weights!
episode:  191 ep_r:  200.35
Update weights!
episode:  192 ep_r:  200.57
Update weights!
episode:  193 ep_r:  200.6
Update weights!
episode:  194 ep_r:  200.0
Update weights!
Update weights!
episode:  195 ep_r:  200.08
Update weights!
episode:  196 ep_r:  200.44
Update weights!
episode:  197 ep_r:  200.6
Update weights!
episode:  198 ep_r:  200.02
Update weights!
episode:  199 ep_r:  200.06
Update weights!
episode:  200 ep_r:  200.19
Update weights!
Update weights!
episode:  201 ep_r:  171.16
Update weights!
episode:  202 ep_r:  200.29
Update weights!
episode:  203 ep_r:  200.08
Update weights!
episode:  204 ep_r:  200.09
Update weights!
episode:  205 ep_r:  160.96
Update weights!
Update weights!
episode:  206 ep_r:  200.19
Update weights!
episode:  207 ep_r:  200.14
Update weights!
episode:  208 ep_r:  200.36
Update weights!
episode:  209 e

In [6]:
# Close the environment. The training cell above also calls env.close() after its
# loop. NOTE(review): presumably kept for manual cleanup after an interrupted run.
env.close()

In [1]:
import numpy as np
import gym
from keras.models import load_model

# Load a previously saved Keras model from a path relative to the working directory.
# TODO confirm this file is the eval network written via DeepQNetwork.save.
model = load_model('model/CartPoleModel')

In [2]:
# Play a single rendered episode, always taking the loaded model's greedy action.
env = gym.make('CartPole-v0')
obs = env.reset()
n_round = 0
done = False
while not done:
    n_round += 1
    env.render()
    # Add a batch axis for predict, then pick the highest-valued action.
    actions_q = model.predict(obs[np.newaxis, :])
    action = np.argmax(actions_q)
    obs_, reward, done, info = env.step(action)
    if not done:
        obs = obs_
print("Game over!(round %d)" % n_round)
env.close()

Game over!(round 200)
