In [1]:
import gym
import numpy as np

from keras.layers import Dense, Softmax
from keras.models import Sequential

env = gym.make('MountainCar-v0')
print(env.observation_space)
print(env.action_space)
n_features = env.observation_space.shape[0]
n_actions = env.action_space.n

In [None]:
class DeepQNetwork:
    def __init__(self, 
                 n_actions, 
                 n_features, 
                 e_greedy=0.9, 
                 reward_decay=0.9,
                 memory_size=500, 
                 batch_size=100, 
                 update_weights=300):
        self.n_actions = n_actions
        self.n_features = n_features
        self.e_greedy = e_greedy
        self.reward_decay = reward_decay
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.update_weights = update_weights
        
        self.eval_model = self._build_model()
        self.target_model = self._build_model()
        
        self.memory = np.zeros([self.memory_size, 2 * self.n_features + 2])
        self.memory_counter = 0
        self.learn_counter = 0
    
    def _build_model(self):
        model = Sequential()
        model.add(Dense(10, input_shape=(self.n_features,), activation='relu'))
        model.add(Dense(self.n_actions))
        model.compile(optimizer='adam', loss='mse')
        return model

    def choose_action(self, obs):
        if np.random.uniform() < self.e_greedy:
            obs = obs[np.newaxis, :]
            action_values = self.eval_model.predict(obs)
            action = np.argmax(action_values)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        self.memory[self.memory_counter % self.memory_size] = transition
        self.memory_counter += 1

    def model_train(self):
        self.learn_counter += 1
        if self.learn_counter % self.update_weights == 0:
            print("Update weights!")
            self.target_model.set_weights(self.eval_model.get_weights())

        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, self.batch_size)
        batch_memory = self.memory[sample_index, :]

        s = batch_memory[:, :self.n_features]
        s_ = batch_memory[:, -self.n_features:]
        actions = batch_memory[:, self.n_features].astype(int)
        rewards = batch_memory[:, self.n_features+1]
        q_eval = self.eval_model.predict(s)
        q_next = self.target_model.predict(s_)

        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = rewards + self.reward_decay * np.max(q_next, axis=1)

        self.eval_model.fit(s, q_target, verbose=0)

    def save(self, model_path):
        self.eval_model.save(model_path)

In [47]:
dqn = DeepQNetwork(n_actions, n_features, batch_size=50, update_weights=200)
total_step = 0
total_round = 10
for i in range(total_round):
    obs = env.reset()
    step = 0
    while(True):
        env.render()
        action = dqn.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        
        pos, vel = obs_
        reward = 1 / (0.5 - pos) + abs(vel)
        
        dqn.store_transition(obs, action, reward, obs_)
        
        if total_step > 200:
            dqn.model_train()
        
        if done:
            print("Game win!")
            break
        
        obs = obs_
        total_step += 1
env.close()

Game win!
Game win!
Update weights!
Game win!
Update weights!
Game win!
Update weights!
Game win!
Game win!
Update weights!
Game win!
Update weights!
Update weights!
Game win!
Game win!
Update weights!
Game win!


In [36]:
env.close()

In [1]:
import numpy as np
import gym
from keras.models import load_model

model = load_model('model/MountainCarModel')

In [2]:
env = gym.make('MountainCar-v0')
n_round = 0
obs = env.reset()
while(True):
    n_round += 1
    env.render()
    actions_q = model.predict(obs[np.newaxis, :])
    action = np.argmax(actions_q)
    obs_, reward, done, info = env.step(action)
    if done:
        print("Game over!(round %d)" % n_round)
        break
    obs = obs_
env.close()

Game over!(round 179)
