In [1]:
import random
import gym
import numpy as np
from collections import deque
import tensorflow as tf
import os

In [2]:
env = gym.make('MountainCar-v0')

In [3]:
state_size= env.observation_space.shape[0]
state_size

2

In [4]:
action_size= env.action_space.n
action_size

3

In [5]:
batch_size = 32


In [6]:
n_episodes= 70000

In [7]:
output_dir= 'model_output/MountainCar'

In [8]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [9]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.env= env
        self.state_size= state_size
        self.action_size= action_size
        
        self.memory= deque(maxlen=200000)
        
        self.gamma= 0.99
        
        self.epsilon = 1.0
        self.epsilon_decay= .85
        self.epsilon_min=0.00001
        
        self.learning_rate= 0.001251
        self.model= self._build_model()
        self.target_model=self._build_model()
        
        self.update_target_model()
        
    def _build_model(self):
        model= tf.keras.models.Sequential()
        state_shape= self.env.observation_space.shape
        model.add(tf.keras.layers.Dense(24, input_shape= state_shape, activation='relu'))
        model.add(tf.keras.layers.Dense(48, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr=self.learning_rate))
        
        return model
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    def act(self, state):
        if np.random.rand(1) <=self.epsilon:
            return random.randrange(self.action_size)
        act_values= self.model.predict(state)
        return np.argmax(act_values[0])
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states=[]
        targets=[]
        
        for state, action, reward, next_state, done in minibatch:
            target=reward
            if not done:
                target = (reward + self.gamma * np.amax(self.target_model.predict(next_state)[0]))
            target_f= self.model.predict(state)
            target_f[0][action]= target
            states.append(state[0])
            targets.append(target_f[0])
            
        self.model.fit(np.array(states), np.array(targets), epochs=1, verbose=0)
            
        
    def load(self, name):
        self.model.load_weights(name)
    def save(self, name):
        self.model.save_weights(name)
            
    
        

In [10]:
agent= DQNAgent(state_size, action_size)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [None]:
done = False
counter=0 
scores_memory= deque(maxlen=100)
for e in range(n_episodes):
    state=env.reset()

    state= np.reshape(state, [1, state_size])
    
    for time in range(7000):
        if e % 50==0:
            env.render()
        action= agent.act(state)
        next_state, reward, done, halp =env.step(action)
        
        next_state = np.reshape(next_state, [1, state_size])

        
        agent.remember(state, action, reward, next_state, done)
            
        if len(agent.memory)>batch_size:
            agent.replay(batch_size)

        
        state = next_state

        if done:
            scores_memory.append(time)
            scores_avg= np.mean(scores_memory)*-1

            
            print('episode: {}/{}, score: {}, e {:.2}, help: {}, reward: {}, 100score avg: {}'.format(e, n_episodes, time, agent.epsilon, state, reward, scores_avg))

            break
    agent.update_target_model()
        
        
    if agent.epsilon > agent.epsilon_min:
            agent.epsilon *= agent.epsilon_decay
        
    if e % 50==0:
        agent.save(output_dir + 'weights_final' + '{:04d}'.format(e) + ".hdf5")

Instructions for updating:
Use tf.cast instead.
episode: 0/70000, score: 199, e 1.0, help: [[-0.53327205 -0.01343278]], reward: -1.0, 100score avg: -199.0
episode: 1/70000, score: 199, e 0.85, help: [[-0.63251497  0.01193769]], reward: -1.0, 100score avg: -199.0
episode: 2/70000, score: 199, e 0.72, help: [[-0.20412662  0.00235219]], reward: -1.0, 100score avg: -199.0
episode: 3/70000, score: 199, e 0.61, help: [[-0.84518602  0.0045429 ]], reward: -1.0, 100score avg: -199.0
episode: 4/70000, score: 199, e 0.52, help: [[-0.61302877 -0.00153696]], reward: -1.0, 100score avg: -199.0
episode: 5/70000, score: 199, e 0.44, help: [[-0.46991779 -0.01549339]], reward: -1.0, 100score avg: -199.0
episode: 6/70000, score: 199, e 0.38, help: [[-0.63693088  0.02154086]], reward: -1.0, 100score avg: -199.0
episode: 7/70000, score: 199, e 0.32, help: [[-0.20043436  0.03310686]], reward: -1.0, 100score avg: -199.0
episode: 8/70000, score: 199, e 0.27, help: [[-0.49212264 -0.01359907]], reward: -1.0, 10

episode: 71/70000, score: 199, e 9.7e-06, help: [[-0.74258636  0.01068138]], reward: -1.0, 100score avg: -196.66666666666666
episode: 72/70000, score: 199, e 9.7e-06, help: [[-0.59745224  0.0393401 ]], reward: -1.0, 100score avg: -196.6986301369863
episode: 73/70000, score: 199, e 9.7e-06, help: [[-0.32718423 -0.05361774]], reward: -1.0, 100score avg: -196.72972972972974
episode: 74/70000, score: 199, e 9.7e-06, help: [[-0.02702674  0.01304403]], reward: -1.0, 100score avg: -196.76
episode: 75/70000, score: 199, e 9.7e-06, help: [[-0.96336355  0.00618102]], reward: -1.0, 100score avg: -196.78947368421052
episode: 76/70000, score: 199, e 9.7e-06, help: [[0.24714812 0.03056357]], reward: -1.0, 100score avg: -196.8181818181818
episode: 77/70000, score: 199, e 9.7e-06, help: [[-0.2956503   0.01402103]], reward: -1.0, 100score avg: -196.84615384615384
episode: 78/70000, score: 199, e 9.7e-06, help: [[-0.27539099  0.02907886]], reward: -1.0, 100score avg: -196.873417721519
episode: 79/70000,

episode: 142/70000, score: 199, e 9.7e-06, help: [[-0.42989512 -0.03545322]], reward: -1.0, 100score avg: -193.09
episode: 143/70000, score: 199, e 9.7e-06, help: [[-0.35451704  0.0601173 ]], reward: -1.0, 100score avg: -193.09
episode: 144/70000, score: 199, e 9.7e-06, help: [[ 0.36308517 -0.00177309]], reward: -1.0, 100score avg: -193.09
episode: 145/70000, score: 199, e 9.7e-06, help: [[-0.70813853 -0.0135698 ]], reward: -1.0, 100score avg: -193.09
episode: 146/70000, score: 191, e 9.7e-06, help: [[0.50128423 0.02320023]], reward: -1.0, 100score avg: -193.01
episode: 147/70000, score: 199, e 9.7e-06, help: [[-0.27969318  0.0162373 ]], reward: -1.0, 100score avg: -193.01
episode: 148/70000, score: 199, e 9.7e-06, help: [[-0.0793114  -0.02657818]], reward: -1.0, 100score avg: -193.01
episode: 149/70000, score: 199, e 9.7e-06, help: [[-0.66586801  0.02013104]], reward: -1.0, 100score avg: -193.01
episode: 150/70000, score: 199, e 9.7e-06, help: [[-0.72310569 -0.00159812]], reward: -1.0

In [None]:
#DQN and building around OpenAIgym reference:https://www.youtube.com/watch?v=OYhFoMySoVs
#Double DQN reference: https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/