In [1]:
#: imports, nothing to see here
import random
from collections import defaultdict, namedtuple, deque
from itertools import product, starmap

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Image
from matplotlib import animation
import copy


from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2

from keras.optimizers import RMSprop


from numpy import sin, cos, pi

%matplotlib inline

random.seed(1)

Using Theano backend.
Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN 5005)


In [2]:
State = namedtuple('State', ['theta', 'theta_d', 'x', 'x_d'])

In [8]:
dim_actions = 2
dim_states = 4

def make_model():
    model = Sequential()
    rms = RMSprop()
    model.add(Dense(32, input_shape=(dim_states,), init='zero', bias=True))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(32, init='zero', bias=True))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(32, init='zero', bias=True))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(dim_actions, init='zero',bias=True))
    model.add(BatchNormalization())
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer=rms)
    return model

In [46]:
class CartPole:
    def __init__(self):
        self.grav = 9.81
        self.mass_cart = 1.0
        self.mass_pole = 0.1
        self.mass_total = self.mass_cart + self.mass_pole
        self.pole_mcenter = 0.5
        self.polemass_mom = self.mass_pole * self.pole_mcenter
        self.force_mag = 10.0
        self.delta_t = 0.02
        self.lim_theta = pi / 15
        self.start_state = self.compute_start_state()
        self.range_theta_rad = 12 * 2 * pi / 360
        self.lim_x = 2.4
        self.n_actions = 2
        self.actions = [-1, 1]

    @property
    def state(self):
        return self._state

    def is_terminal(self, state):
        x, theta = state[:2]
        if (abs(x) >= self.lim_x) or (abs(theta) >= self.lim_theta):
            return True
        else:
            return False

    def reward(self, state):
        if self.is_terminal(state) is True:
            return 0
        else:
            return 1

    def newstate(self, state, action):
        move = self.actions[action]
        x, theta, x_dot, theta_dot = state
        F = self.force_mag * move
        costheta = cos(theta)
        sintheta = sin(theta)
        temp = (F + self.polemass_mom * theta_dot * theta_dot *sintheta)/ self.mass_total

        theta_dot_dot = (self.grav * sintheta - costheta* temp) / (self.pole_mcenter * (4.0/3.0 - self.mass_pole * costheta * costheta / self.mass_total))

        x_dot_dot = temp - self.polemass_mom * theta_dot_dot * costheta /self.mass_total
        x_new = x + self.delta_t * x_dot
        x_dot_new = x_dot + self.delta_t * x_dot_dot
        theta_new = theta + self.delta_t * theta_dot
        theta_dot_new = theta_dot + self.delta_t * theta_dot_dot
        return (x_new, theta_new, x_dot_new, theta_dot_new)

    def compute_start_state(self):
        random = np.random.uniform(low=-0.05, high=0.05, size=(4, ))
        start_state = State(*random)
        return start_state

In [254]:
dim_actions = 2
dim_states = 4


def make_model(hidden_size = 64, dim_states = 4):
    model = Sequential()
    model.add(Dense(hidden_size, input_shape=(dim_states,), activation='relu'))
    model.add(Dense(hidden_size, activation='relu'))
    model.add(Dense(2))
    model.compile(loss='mse', optimizer='sgd')
    return model


class Agent():
    def __init__(self, explore=0.1, discount=0.9, hidden_size=64, memory_limit=8000):
   
        self.Q = make_model()
        self.Q_target = make_model()
        self.batch_size = 32
        self.step_count=0
        self.target_switch = False
        self.target_update = 2000


        # experience replay:
        # remember states to "reflect" on later
        self.memory = deque([], maxlen=memory_limit)

        self.explore = explore
        self.discount = discount

    def act(self, state):
        if np.random.rand() <= self.explore:
            return np.random.randint(0, 2)
        s = np.asarray(state).reshape(1,4)
        q = self.Q.predict(s)
        choice = np.argmax(q[0])
        return choice

    def remember(self, state, action, next_state, reward):
        # the deque object will automatically keep a fixed length
        self.memory.append((state, action, next_state, reward))

    def _prep_batch(self, batch_size):
        self.step_count+=1
        if batch_size > self.memory.maxlen:
            Warning('batch size should not be larger than max memory size. Setting batch size to memory size')
            batch_size = self.memory.maxlen

        batch_size = min(batch_size, len(self.memory))

        inputs = []
        targets = []

        # prep the batch
        # inputs are states, outputs are values over actions
        batch = random.sample(list(self.memory), batch_size)
        random.shuffle(batch)
        for state, action, next_state, reward in batch:
            inputs.append(state)
            s = np.asarray(state).reshape(1,4)
            if self.target_switch:
                target = self.Q_target.predict(s)[0]
            else:
                target = self.Q.predict(s)[0]
            # debug, "this should never happen"
            assert not np.array_equal(state, next_state)

            # non-zero reward indicates terminal state
            if reward == 0:
                target[action] = reward
            else:
                # reward + gamma * max_a' Q(s', a')
                ns = np.asarray(next_state).reshape(1,4)
                if self.target_switch:
                    Q_sa = np.max(self.Q_target.predict(ns)[0])
                else:
                    Q_sa = np.max(self.Q.predict(ns)[0])
                target[action] = reward + self.discount * Q_sa
            targets.append(target)

        # to numpy matrices
        return np.vstack(inputs), np.vstack(targets)

    def flashback(self):
        inputs, targets = self._prep_batch(self.batch_size)
        loss = self.Q.train_on_batch(inputs, targets)
        if self.step_count% self.target_update==0:
            self.update_target_network()
        pass
    
    def update_target_network(self):
        self.target_switch = True
        weights = self.Q.get_weights()
        self.Q_target.set_weights(weights)
        pass

    def save(self, fname):
        self.Q.save_weights(fname)

    def load(self, fname):
        self.Q.load_weights(fname)
        print(self.Q.get_weights())

In [255]:
def run_episode(domain, agent):
    state = domain.compute_start_state()
    step = 0
    while not domain.is_terminal(state):
        action = agent.act(state)    #: Take the current state as input and compute an action.
        newstate = domain.newstate(state, action)   #: Take the action and compute the changed state.
        reward = domain.reward(newstate)
        agent.remember(state, action, newstate, reward)#: Learn.
        agent.flashback()
        state = newstate                            #: Newstate becomes the current state for next iteration.
        step +=1
    return step

In [256]:
def run_experiment(domain, agent, epsilon_decay, n_episodes):
    for i in range(n_episodes):
       # agent.epsilon *= epsilon_decay
        step = run_episode(domain, agent)
        if step > 200:
            print('Episode Number: {0}\n'.format(i))
            print('Number of steps reached: {0}\n'.format(step))
    print('Setting epsilon paramter to zero',
          'to prevent random actions and evaluate learned policy.\n')
    # agent.epsilon = 0
    run_episode(domain, agent)                    
    pass

In [257]:
n_episodes = 1000
epsilon_decay = 0.9
domain = CartPole()
model = make_model()
agent = Agent()

In [258]:
run_experiment(domain, agent, epsilon_decay, n_episodes)

Episode Number: 90

Number of steps reached: 366

Episode Number: 128

Number of steps reached: 232

Episode Number: 131

Number of steps reached: 203

Episode Number: 135

Number of steps reached: 253

Episode Number: 140

Number of steps reached: 212

Episode Number: 157

Number of steps reached: 249

Episode Number: 158

Number of steps reached: 219

Episode Number: 173

Number of steps reached: 275

Episode Number: 213

Number of steps reached: 246

Episode Number: 216

Number of steps reached: 213

Episode Number: 220

Number of steps reached: 259

Episode Number: 237

Number of steps reached: 224

Episode Number: 238

Number of steps reached: 202

Episode Number: 240

Number of steps reached: 243

Episode Number: 248

Number of steps reached: 201

Episode Number: 249

Number of steps reached: 213

Episode Number: 251

Number of steps reached: 339

Episode Number: 252

Number of steps reached: 232

Episode Number: 263

Number of steps reached: 261

Episode Number: 264

Number of s

KeyboardInterrupt: 