In [1]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf

# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))

# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.2.0
Default GPU Device: /gpu:0


In [2]:
import gym
import numpy as np

# Create the Cart-Pole game environment
env = gym.make('MountainCar-v0')

# Number of possible actions
print('Number of possible actions:', env.action_space.n)

[2018-04-13 12:31:07,047] Making new env: MountainCar-v0


('Number of possible actions:', 3)


In [2]:
env.reset()

array([-0.55614474,  0.        ])

In [4]:
import tensorflow as tf

class QNetwork:
    def __init__(self, learning_rate=0.01, state_size=2, 
                 action_size=3, hidden_size=10, 
                 name='QNetwork'):
        # state inputs to the Q-network
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            
            # One hot encode the actions to later choose the Q-value for the action
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            one_hot_actions = tf.one_hot(self.actions_, action_size)
            
            # Target Q values for training
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')
            
            # ReLU hidden layers
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size)
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, hidden_size)

            # Linear output layer
            self.output = tf.contrib.layers.fully_connected(self.fc2, action_size, 
                                                            activation_fn=None)
            
            ### Train with loss (targetQ - Q)^2
            # output has length 2, for two actions. This next line chooses
            # one value from output (per row) according to the one-hot encoded actions.
            self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
            
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

In [5]:
from collections import deque

class Memory():
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

In [6]:
train_episodes = 1000          # max number of episodes to learn from
max_steps = 200                # max steps in an episode
gamma = 0.99                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
hidden_size = 64               # number of units in each Q-network hidden layer
learning_rate = 0.0001         # Q-network learning rate

# Memory parameters
memory_size = 10000            # memory capacity
batch_size = 20                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

In [7]:
tf.reset_default_graph()
mainQN = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate)

In [8]:
# Initialize the simulation
env.reset()
# Take one random step to get the pole and cart moving
state, reward, done, _ = env.step(env.action_space.sample())
round(state[0],3)
round(state[1],3)
memory = Memory(max_size=memory_size)

# Make a bunch of random actions and store the experiences
for ii in range(pretrain_length):

    # Make a random action
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    round(next_state[0], 3)
    round(next_state[1], 3)
    if done:
        # The simulation fails so no next state
        next_state = np.zeros(state.shape)
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        
        # Start new episode
        env.reset()
        # Take one random step to get the pole and cart moving
        state, reward, done, _ = env.step(env.action_space.sample())
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        state = next_state

def reward_function(state, action):
    '''  '''
    #0 push left
    #1 no push
    #2 push right
    # going the right state [0] positive and positiv velocity 
    veloctiy = state[1]
    if state[0] > 0 and veloctiy > 0:
        if action == 0:
            reward = -20
        if action == 1:
            reward = -20
        if action == 2:
            reward = 20
        return reward
    # its on the right going to the left
    if state[0] > 0 and veloctiy <= 0:
        if action == 0:
            reward = 20
        if action == 1:
            reward = -20
        if action == 2:
            reward = -20
        return reward
    # its on the left 
    if state[0] <= 0 and veloctiy > 0:
        if action == 0:
            reward = 20
        if action == 1:
            reward = -20
        if action == 2:
            reward = -20
        return reward
        # its on the left 
    if state[0] <= 0 and veloctiy <= 0:
        if action == 0:
            reward = -20
        if action == 1:
            reward = -20
        if action == 2:
            reward = 20
        return reward

In [27]:
def reward_function(state, action):
    '''  '''
    #0 push left
    #1 no push
    #2 push right
    # going the right state [0] positive and positiv velocity 
    veloctiy = state[1]
    if veloctiy > 0:
        if action == 0:
            reward = -1
        if action == 1:
            reward = -1
        if action == 2:
            reward = 1
        return reward
    # its on the right going to the left
    if  veloctiy <= 0:
        if action == 0:
            reward = 1
        if action == 1:
            reward = -1
        if action == 2:
            reward = -1
    if state[0] > 0.2:
        reward += state[0]
    return np.tanh(reward)
    #return np.tanh(reward * abs(state[0]))

In [23]:
value = abs(state[0])
value

0.41499999999999998

In [26]:
np.tanh()

-0.76159415595576485

In [16]:
from __future__ import print_function

In [17]:
def print_func(state, action):
    '''  '''
    print(state)
    if state[0] >= 0:
        print('right ', end='')
    else:
        print('left ', end='')
    if state[1] >= 0:
        print('positiv ', end='')
    else:
        print('negative ', end='')
    if action == 0:
        print(' push left')
    if action == 1:
        print(' no push ')
    if action == 2:
        print(' push right')

In [30]:
train_episodes = 300

In [31]:
# Now train with experiences
saver = tf.train.Saver()
rewards_list = []
with tf.Session() as sess:
    # Initialize variables
    sess.run(tf.global_variables_initializer())
    max_state = -2.0
    step = 0
    for ep in range(1, train_episodes):
        total_reward = 0
        t = 0
        while t < max_steps:
            step += 1
            # Uncomment this next line to watch the training
            #env.render() 
            
            # Explore or Exploit
            explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step) 
            if explore_p > np.random.rand():
                # Make a random action
                action = env.action_space.sample()
            else:
                # Get action from Q-network
                feed = {mainQN.inputs_: state.reshape(1, *state.shape)}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                action = np.argmax(Qs)
            #print_func(state,action)
            # Take action, get new state and reward
            next_state, reward, done, _ = env.step(action)
            next_state[0] = round(next_state[0], 3)
            next_state[1] = round(next_state[1], 3)
            reward = reward_function(next_state, action)
            #print(reward)
            total_reward += reward
            if next_state[0] > max_state:
                max_state = next_state[0]
            
            if done:
                # the episode ends so no next state
                next_state = np.zeros(state.shape)
                print('end episode', max_state)
                
                if t < 198:
                    print('Done ',t)
                    total_reward += 200 - (t*2)
                    print(state)
                t = max_steps
                max_state = -2.0
                print('Episode: {}'.format(ep),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Explore P: {:.4f}'.format(explore_p))
                rewards_list.append((ep, total_reward))
                
                # Add experience to memory
                memory.add((state, action, reward, next_state))
                
                # Start new episode
                env.reset()
                # Take one random step to get the pole and cart moving
                state, reward, done, _ = env.step(env.action_space.sample())
                state[0] = round(state[0], 3)
                state[1] = round(state[1], 3)
            else:
                # Add experience to memory
                memory.add((state, action, reward, next_state))
                state = next_state
                t += 1
            
            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            states = np.array([each[0] for each in batch])
            actions = np.array([each[1] for each in batch])
            rewards = np.array([each[2] for each in batch])
            next_states = np.array([each[3] for each in batch])
            
            # Train network
            target_Qs = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})
            
            # Set target_Qs to 0 for states where episode ends
            episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
            target_Qs[episode_ends] = (0, 0,0)
            
            targets = rewards + gamma * np.max(target_Qs, axis=1)

            loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                feed_dict={mainQN.inputs_: states,
                                           mainQN.targetQs_: targets,
                                           mainQN.actions_: actions})
            

        saver.save(sess, "checkpoints/mountain.ckpt")

end episode -0.279
Done  171
[-0.32   0.015]
Episode: 1 Total reward: -180.516665587 Training loss: 0.8116 Explore P: 0.9831
end episode -0.319
Episode: 2 Total reward: -26.8014480548 Training loss: 0.7443 Explore P: 0.9639
end episode -0.412
Episode: 3 Total reward: -42.5101428621 Training loss: 0.8024 Explore P: 0.9451
end episode -0.307
Episode: 4 Total reward: -19.9007240274 Training loss: 0.6830 Explore P: 0.9267
end episode -0.298
Episode: 5 Total reward: -45.1855064953 Training loss: 1.8057 Explore P: 0.9087
end episode -0.445
Episode: 6 Total reward: -45.3710129906 Training loss: 5.6177 Explore P: 0.8910
end episode -0.24
Episode: 7 Total reward: -41.0398538989 Training loss: 1.0052 Explore P: 0.8736
end episode -0.414
Episode: 8 Total reward: -31.7152175321 Training loss: 16.4066 Explore P: 0.8566
end episode -0.444
Episode: 9 Total reward: -29.3311590917 Training loss: 79.2301 Explore P: 0.8399
end episode -0.403
Episode: 10 Total reward: -58.2318831191 Training loss: 1.9026 

end episode -0.094
Episode: 84 Total reward: 48.3115909169 Training loss: 1.6921 Explore P: 0.1966
end episode -0.249
Episode: 85 Total reward: -23.7811623307 Training loss: 3.1690 Explore P: 0.1929
end episode 0.082
Episode: 86 Total reward: 27.0797077978 Training loss: 1.5885 Explore P: 0.1893
end episode 0.068
Episode: 87 Total reward: 31.7949253299 Training loss: 3.4172 Explore P: 0.1858
end episode 0.23
Episode: 88 Total reward: 57.8698854872 Training loss: 1.2465 Explore P: 0.1823
end episode 0.154
Episode: 89 Total reward: 35.5565194859 Training loss: 2.8845 Explore P: 0.1789
end episode -0.028
Episode: 90 Total reward: 9.46376623823 Training loss: 2.0999 Explore P: 0.1756
end episode 0.045
Episode: 91 Total reward: 54.8413019537 Training loss: 1.0539 Explore P: 0.1723
end episode 0.214
Episode: 92 Total reward: 48.9869545501 Training loss: 1.0893 Explore P: 0.1691
end episode -0.24
Episode: 93 Total reward: 32.8811558526 Training loss: 1.8581 Explore P: 0.1660
end episode 0.215

end episode -0.159
Episode: 164 Total reward: 74.8673928535 Training loss: 0.8146 Explore P: 0.0484
end episode -0.211
Episode: 165 Total reward: 50.1652207712 Training loss: 0.8981 Explore P: 0.0477
end episode 0.276
Episode: 166 Total reward: 77.7120435224 Training loss: 0.8962 Explore P: 0.0469
end episode -0.34
Episode: 167 Total reward: 36.2181201199 Training loss: 0.8748 Explore P: 0.0462
end episode -0.344
Episode: 168 Total reward: 12.8405844044 Training loss: 0.8353 Explore P: 0.0455
end episode 0.508
Done  186
[ 0.472  0.036]
Episode: 169 Total reward: -68.1391298715 Training loss: 0.8292 Explore P: 0.0448
end episode -0.183
Episode: 170 Total reward: 50.0724675235 Training loss: 0.7997 Explore P: 0.0441
end episode -0.282
Episode: 171 Total reward: 41.0260908997 Training loss: 0.8394 Explore P: 0.0435
end episode 0.522
Done  194
[ 0.475  0.047]
Episode: 172 Total reward: -91.3311590917 Training loss: 0.6189 Explore P: 0.0428
end episode -0.313
Episode: 173 Total reward: 40.5

end episode 0.518
Done  140
[ 0.47   0.048]
Episode: 237 Total reward: 23.026808449 Training loss: 0.9346 Explore P: 0.0210
end episode 0.517
Done  148
[ 0.468  0.048]
Episode: 238 Total reward: 16.788402605 Training loss: 0.7867 Explore P: 0.0208
end episode -0.545
Episode: 239 Total reward: 12.8681104028 Training loss: 0.4998 Explore P: 0.0206
end episode 0.507
Done  151
[ 0.46   0.047]
Episode: 240 Total reward: -18.6884090831 Training loss: 0.7435 Explore P: 0.0205
end episode -0.5
Episode: 241 Total reward: 23.2521688432 Training loss: 0.6241 Explore P: 0.0203
end episode 0.537
Done  144
[ 0.487  0.049]
Episode: 242 Total reward: 8.3115909169 Training loss: 0.5702 Explore P: 0.0201
end episode 0.517
Done  110
[ 0.499  0.019]
Episode: 243 Total reward: 43.2253603942 Training loss: 0.6501 Explore P: 0.0200
end episode 0.537
Done  141
[ 0.487  0.049]
Episode: 244 Total reward: 21.2188376693 Training loss: 0.8303 Explore P: 0.0199
end episode 0.537
Done  152
[ 0.487  0.049]
Episode: 2

In [40]:
from scipy.special import expit
expit(-3)

0.047425873177566788

In [64]:
state[0] =round(state[0],3)

In [65]:
state

array([-0.694     , -0.00251189])

In [18]:
env.reset()

array([-0.58005696,  0.        ])

In [32]:
with tf.Session() as sess:
    
    model = tf.train.import_meta_graph('checkpoints/mountain.ckpt.meta')
    model = model.restore(sess,tf.train.latest_checkpoint('checkpoints/'))
    step=0
    
    for ep in range(1):
        total_reward = 0
        t = 0
        while t < max_steps:
            t +=1
            env.render()
            feed = {mainQN.inputs_: state.reshape(1, *state.shape)}
            Qs = sess.run(mainQN.output, feed_dict=feed)
            action = np.argmax(Qs)
                       
            #print_func(state,action)
            # Take action, get new state and reward
            next_state, reward, done, _ = env.step(action)
            next_state[0] = round(next_state[0], 3)
            next_state[1] = round(next_state[1], 3)
            print(state[0])
            state = next_state
            if done:
                print("Very good")
                
                print('Episode: {}'.format(ep),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Explore P: {:.4f}'.format(explore_p))
                r
               

INFO:tensorflow:Restoring parameters from checkpoints/mountain.ckpt


[2018-04-13 13:22:04,540] Restoring parameters from checkpoints/mountain.ckpt


-0.455
-0.458
-0.462
-0.468
-0.476
-0.485
-0.495
-0.506
-0.518
-0.532
-0.546
-0.561
-0.577
-0.594
-0.611
-0.628
-0.646
-0.663
-0.681
-0.699
-0.716
-0.733
-0.749
-0.765
-0.78
-0.795
-0.808
-0.821
-0.833
-0.844
-0.854
-0.862
-0.87
-0.876
-0.882
-0.886
-0.886
-0.884
-0.878
-0.87
-0.858
-0.843
-0.825
-0.804
-0.78
-0.753
-0.724
-0.692
-0.658
-0.622
-0.585
-0.546
-0.506
-0.465
-0.423
-0.381
-0.339
-0.298
-0.257
-0.217
-0.177
-0.139
-0.105
-0.073
-0.045
-0.021
0.0
0.017
0.031
0.042
0.049
0.052
0.052
0.049
0.042
0.032
0.018
0.0
-0.02
-0.045
-0.073
-0.104
-0.139
-0.176
-0.218
-0.262
-0.309
-0.358
-0.409
-0.463
-0.518
-0.574
-0.63
-0.687
-0.743
-0.799
-0.855
-0.909
-0.961
-1.013
-1.063
-1.111
-1.158
-1.2
-1.197
-1.19
-1.18
-1.167
-1.151
-1.131
-1.108
-1.081
-1.051
-1.018
-0.98
-0.94
-0.896
-0.849
-0.798
-0.745
-0.69
-0.632
-0.572
-0.511
-0.449
-0.387
-0.324
-0.262
-0.201
-0.141
-0.082
-0.025
0.031
0.086
0.139
0.19
0.241
0.289
0.334
0.377
0.417
0.456
0.494
Very good
Episode: 0 Total reward: 0 Tra

NameError: name 'r' is not defined