#### Gradient-descent Q-learning (neural-network Q-function with experience replay) demonstration using the CartPole environment from OpenAI Gym

In [1]:
import gym
import collections
import copy
import numpy as np
import matplotlib.pyplot as plt
import time
import keras
from keras.models import Sequential
from keras.layers import Dense, normalization
from keras.optimizers import SGD,RMSprop,Nadam

Using TensorFlow backend.


In [16]:
class Agent(object):
    """Epsilon-greedy Q-learning agent backed by a small Keras network.

    The network maps a 4-dimensional state to 2 action values. Transitions
    are stored in a bounded replay memory and replayed in random order to
    fit the network towards the one-step Q-learning target
    ``reward + gamma * max_a Q(next_state, a)``.
    """

    def __init__(self, env):
        """Store the environment and set up hyper-parameters, memory and model.

        Args:
            env: environment object; only ``env.action_space.sample()`` is
                used by this class (for random exploration).
        """
        self.env = env
        self.memory = collections.deque(maxlen=5000)  # oldest transitions are dropped once full
        self.epsilon = 0.6                            # probability of taking a random action
        self.epsilonDecayFactor = 0.99                # multiplicative decay applied once per replay()
        self.epsilonMin = 0.1                         # decay never pushes epsilon below this floor
        self.gamma = 0.9                              # discount factor for Q-learning
        self.lr = 0.001                               # learning rate
        self.replayBatchSize = 100                    # max number of memories used per replay()
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network (4 inputs -> 64 -> 128 -> 64 -> 2 outputs).

        NOTE(review): ``init=`` (and ``nb_epoch=`` in ``replay``) are the
        Keras 1.x keyword names; they are kept because the rest of the
        notebook targets that API.

        Returns:
            The compiled Keras ``Sequential`` model (MSE loss, RMSprop).
        """
        model = keras.models.Sequential()
        model.add(Dense(64, input_dim=4, activation='tanh', init='he_uniform'))
        model.add(Dense(128, activation='tanh', init='he_uniform'))
        model.add(Dense(64, activation='tanh', init='he_uniform'))
        model.add(Dense(2, activation='linear', init='he_uniform'))
        model.compile(loss='mse', optimizer=RMSprop(lr=self.lr))
        return model

    def choose_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: the current state, either flat or already shaped
                ``(1, n)``; it is reshaped to a single-row batch here.

        Returns:
            Index of the highest-valued action with probability
            ``1 - epsilon``, otherwise ``env.action_space.sample()``.
        """
        if np.random.rand() > self.epsilon:
            state_batch = np.asarray(state).reshape(1, -1)
            action_values = self.model.predict(state_batch)[0]  # [Q(s,a1), Q(s,a2)]
            return np.argmax(action_values)
        return self.env.action_space.sample()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition ``(s, a, r, s_next, done)`` to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Fit the model on a random sample of remembered transitions, then decay epsilon.

        Up to ``replayBatchSize`` distinct transitions are drawn. Each one is
        fitted individually (one ``fit`` call per transition), so later
        targets are computed with the already-updated network. With an empty
        memory no training happens, but epsilon is still decayed.
        """
        memory_size = len(self.memory)
        if memory_size > 0:
            # Guarded: np.random.choice on an empty population raises on older NumPy.
            sample_indices = np.random.choice(memory_size,
                                              size=min(memory_size, self.replayBatchSize),
                                              replace=False)
            for i in sample_indices:
                state, action, reward, next_state, done = self.memory[i]
                state_batch = np.array(state).reshape(1, -1)
                q_estimate = self.model.predict(state_batch)[0]      # [Q(s,a1), Q(s,a2)]
                if done:
                    # Terminal transition: there is no future value to bootstrap from.
                    target = reward
                else:
                    next_qs = self.model.predict(np.array(next_state).reshape(1, -1))[0]
                    target = reward + self.gamma * np.max(next_qs)
                # Only the taken action's value is moved; the other output keeps
                # its current prediction and therefore contributes zero error.
                q_estimate[action] = target
                self.model.fit(state_batch,
                               np.array(q_estimate).reshape(1, -1),
                               nb_epoch=1,
                               verbose=False)
        # Decay once per call (the training loop calls replay() once per episode).
        # Clamp so a decay step cannot undershoot epsilonMin; an epsilon that was
        # deliberately set at or below the floor (e.g. 0.0 for evaluation) is left alone.
        if self.epsilon > self.epsilonMin:
            self.epsilon = max(self.epsilonMin, self.epsilonDecayFactor * self.epsilon)

In [18]:
#train
# Runs 500 training episodes: act epsilon-greedily, store every transition,
# then call agent.replay() once per episode to fit the Q-network.
env = gym.make('CartPole-v0')
agent = Agent(env=env)
# agent.model.load_weights("./100000_iter_weights.h5")
try:
    for episode in range(500):
        state = env.reset()
        for t in range(5000):
            # env.render()
            action = agent.choose_action(np.array(state).reshape(1,4))  # Choose an action
            next_state, reward, done, info = env.step(action)           # Observe next_state, reward
            # Reward shaping: growing bonus while the episode continues, large
            # penalty on the final step. Float division keeps the penalty the same
            # on Python 2 and 3, and max(t, 1) avoids dividing by zero if an
            # episode ends on its very first step.
            # NOTE(review): `done` is presumably also set when the environment's
            # time limit is reached, so a fully successful episode gets this
            # penalty as well - confirm against the Gym docs.
            reward = -1000.0 / max(t, 1) if done else reward + t
            agent.remember(state, action, reward, next_state, done)     # remember SARS + done
            state = copy.deepcopy(next_state)                           # state = next_state
            if done:
                print("Episode {} finished after {} timesteps. memory size: {}. epsilon: {}".format(episode,t+1, len(agent.memory), agent.epsilon))
                break
        # Checkpoint every 50 episodes (taken before this episode's replay step).
        if(episode%50 == 0):
            print("Saving the model...\n")
            agent.model.save_weights("./model_weights.h5")
        agent.replay()
finally:
    # Run the render cleanup even if training is interrupted or raises.
    env.render(close=True)

[2017-02-15 12:49:47,988] Making new env: CartPole-v0


Episode 0 finished after 9 timesteps. memory size: 9. epsilon: 0.6
Saving the model...

Episode 1 finished after 13 timesteps. memory size: 22. epsilon: 0.594
Episode 2 finished after 16 timesteps. memory size: 38. epsilon: 0.58806
Episode 3 finished after 12 timesteps. memory size: 50. epsilon: 0.5821794
Episode 4 finished after 10 timesteps. memory size: 60. epsilon: 0.576357606
Episode 5 finished after 9 timesteps. memory size: 69. epsilon: 0.57059402994
Episode 6 finished after 21 timesteps. memory size: 90. epsilon: 0.564888089641
Episode 7 finished after 16 timesteps. memory size: 106. epsilon: 0.559239208744
Episode 8 finished after 21 timesteps. memory size: 127. epsilon: 0.553646816657
Episode 9 finished after 13 timesteps. memory size: 140. epsilon: 0.54811034849
Episode 10 finished after 17 timesteps. memory size: 157. epsilon: 0.542629245005
Episode 11 finished after 15 timesteps. memory size: 172. epsilon: 0.537202952555
Episode 12 finished after 14 timesteps. memory size:

In [19]:
#test
# Evaluates a trained agent greedily (no exploration) for 51 rendered episodes.
env = gym.make('CartPole-v0')
try:
    agent = Agent(env=env)
    agent.epsilon = 0.0 # No exploration for testing
    # NOTE(review): the training cell saves to "./model_weights.h5"; this file
    # must already exist on disk - confirm which weights are meant to be evaluated.
    agent.model.load_weights("./300_iter_weights.h5")
    for episode in range(51):
        state = env.reset()
        for t in range(5000):
            env.render()
            action = agent.choose_action(np.array(state).reshape(1,4))  # Choose an action
            next_state, reward, done, info = env.step(action)           # Observe next_state, reward
            state = copy.deepcopy(next_state)                           # state = next_state
            if done:
                # Nothing is remembered during evaluation, so the memory size printed here stays 0.
                print("Episode {} finished after {} timesteps. memory size: {}. epsilon: {}".format(episode,t+1, len(agent.memory), agent.epsilon))
                break
finally:
    # Close the render window even if loading weights fails or the run is interrupted.
    env.render(close=True)

[2017-02-15 12:54:02,929] Making new env: CartPole-v0


Episode 0 finished after 71 timesteps. memory size: 0. epsilon: 0.0
Episode 1 finished after 112 timesteps. memory size: 0. epsilon: 0.0
Episode 2 finished after 143 timesteps. memory size: 0. epsilon: 0.0
Episode 3 finished after 107 timesteps. memory size: 0. epsilon: 0.0
Episode 4 finished after 112 timesteps. memory size: 0. epsilon: 0.0
Episode 5 finished after 58 timesteps. memory size: 0. epsilon: 0.0
Episode 6 finished after 58 timesteps. memory size: 0. epsilon: 0.0
Episode 7 finished after 75 timesteps. memory size: 0. epsilon: 0.0
Episode 8 finished after 57 timesteps. memory size: 0. epsilon: 0.0
Episode 9 finished after 138 timesteps. memory size: 0. epsilon: 0.0
Episode 10 finished after 144 timesteps. memory size: 0. epsilon: 0.0
Episode 11 finished after 124 timesteps. memory size: 0. epsilon: 0.0
Episode 12 finished after 73 timesteps. memory size: 0. epsilon: 0.0
Episode 13 finished after 108 timesteps. memory size: 0. epsilon: 0.0
Episode 14 finished after 50 timeste

In [11]:
# From OpenAI Gym

import numpy as np
import gym

# Hand-written (non-learned) policy: runs one rendered episode of at most 50 steps.
env = gym.make('CartPole-v1')
for episode in range(1):
    observation = env.reset()
    for i in range(50):
        env.render()
        # Two-entry score vector: entry 0 stays 0, entry 1 is tanh of the sum of
        # observation components 2 and 3 (presumably pole angle and pole angular
        # velocity - confirm against the CartPole documentation).
        output_vector = np.zeros((2, 1))
        output_vector[1] = np.tanh(observation[2] + observation[3])
        print(output_vector)
        # argmax yields action 1 when the tanh term is positive, otherwise action 0.
        action = np.argmax(output_vector)
        observation, reward, done, _ = env.step(action)
        if done:
            break
env.render(close=True)

[2017-02-15 12:30:07,233] Making new env: CartPole-v1


[[ 0.        ]
 [ 0.05375614]]
[[ 0.        ]
 [-0.22563813]]
[[ 0.        ]
 [ 0.06651141]]
[[ 0.        ]
 [-0.21454871]]
[[ 0.        ]
 [ 0.07711924]]
[[ 0.        ]
 [-0.20529476]]
[[ 0.        ]
 [ 0.08588051]]
[[ 0.        ]
 [-0.19764454]]
[[ 0.        ]
 [ 0.09304517]]
[[ 0.        ]
 [-0.19140188]]
[[ 0.        ]
 [ 0.09881882]]
[[ 0.        ]
 [-0.18640325]]
[[ 0.        ]
 [ 0.10336839]]
[[ 0.        ]
 [-0.18251488]]
[[ 0.        ]
 [ 0.10682694]]
[[ 0.        ]
 [-0.17963015]]
[[ 0.        ]
 [ 0.10929755]]
[[ 0.        ]
 [-0.17766726]]
[[ 0.        ]
 [ 0.11085639]]
[[ 0.        ]
 [-0.17656736]]
[[ 0.        ]
 [ 0.11155516]]
[[ 0.        ]
 [-0.17629307]]
[[ 0.        ]
 [ 0.11142269]]
[[ 0.        ]
 [-0.17682744]]
[[ 0.        ]
 [ 0.11046605]]
[[ 0.       ]
 [-0.1781733]]
[[ 0.        ]
 [ 0.10867102]]
[[ 0.       ]
 [-0.1803531]]
[[ 0.        ]
 [ 0.10600193]]
[[ 0.        ]
 [-0.18340906]]
[[ 0.        ]
 [ 0.10240106]]
[[ 0.        ]
 [-0.18740378]]
[[ 0.       

In [12]:
# Executes the external script cart_pole.py via the IPython %run magic.
# NOTE(review): the script is not part of this notebook; judging by the output it
# runs a similar training loop - confirm its contents before relying on this cell.
%run cart_pole.py

[2017-02-14 16:48:28,019] Making new env: CartPole-v0


Episode 0 finished after 9 timesteps. memory size: 9. epsilon: 0.1
Saving the model...

Episode 1 finished after 9 timesteps. memory size: 18. epsilon: 0.099
Episode 2 finished after 98 timesteps. memory size: 116. epsilon: 0.09801
Episode 3 finished after 60 timesteps. memory size: 176. epsilon: 0.0970299
Episode 4 finished after 39 timesteps. memory size: 215. epsilon: 0.096059601
Episode 5 finished after 10 timesteps. memory size: 225. epsilon: 0.09509900499
Episode 6 finished after 31 timesteps. memory size: 256. epsilon: 0.0941480149401
Episode 7 finished after 17 timesteps. memory size: 273. epsilon: 0.0932065347907
Episode 8 finished after 29 timesteps. memory size: 302. epsilon: 0.0922744694428
Episode 9 finished after 25 timesteps. memory size: 327. epsilon: 0.0913517247484
Episode 10 finished after 40 timesteps. memory size: 367. epsilon: 0.0904382075009
Episode 11 finished after 34 timesteps. memory size: 401. epsilon: 0.0895338254259
Episode 12 finished after 24 timesteps. 

KeyboardInterrupt: 