In [1]:
# based on Deep Learning Illustrated by Jon Krohn
# https://www.amazon.com/Deep-Learning-Illustrated-Intelligence-Addison-Wesley/dp/0135116694
# in turn based on bit.ly/keonDQN

import os
from collections import deque
import random
import time
import resource
import pickle

import pdb

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

import plotly.express as px
import plotly.graph_objects as go

# requires python 3.6
# conda install -c akode gym
import gym

# set seeds for reproducibility
# the value was drawn once with np.random.uniform(0,10000) -> 4465
# a single named constant keeps the three generators seeded with the same value
SEED = 4465
random.seed(SEED)          # Python stdlib RNG (used by DQNAgent.act for random actions)
np.random.seed(SEED)       # numpy global RNG (used for the epsilon draw and DataFrame.sample)
tf.random.set_seed(SEED)   # TensorFlow RNG

# todo
# results, timestep instance vars
# pickle / load epsilon, results, timestep , model all in one file

In [2]:
class DQNAgent:
    """Deep Q-learning agent backed by a small Keras dense network.

    The agent keeps a replay memory of observed transitions in a DataFrame,
    picks actions epsilon-greedily from the network's predicted action values,
    and periodically refits the network on a random sample of the memory.
    """

    def __init__(self, state_size, action_size):
        """Create an agent for states of length `state_size` and `action_size` discrete actions."""
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.98            # discount applied to the predicted value of the next state
        self.epsilon = 1.0           # probability of taking a random action in act()
        self.epsilon_decay = 0.995   # epsilon is multiplied by this after each train() call
        self.epsilon_min = 0.01      # decay stops once epsilon is no longer above this
        self.learning_rate = 0.001
        self.model = self.build_model()
        self.memory = pd.DataFrame(columns=["state", "action", "next_state", "reward", "done"])
        self.memory_size = 200000    # max number of transitions kept by train()
        self.results = []            # one score (timestep count) per finished episode
        self.train_batch_size = 8
        self.timestep = 0
        self.summary_interval = 10   # window for the running average printed by score_episode()

    def build_model(self,
                    n_hidden_layers=2,
                    hidden_layer_size=16,
                    activation='relu',
                    reg_penalty=0.001,
                    dropout=0.0675,
                    verbose=True
                   ):
        """return keras NN model per inputs
        input is a state - array of size state_size
        output is an array of action values - array of size action_size

        n_hidden_layers / hidden_layer_size: number and width of the hidden Dense layers
        activation: activation of the hidden layers
        reg_penalty: L2 kernel regularization applied to every hidden layer
        dropout: dropout rate inserted between hidden layers (0 disables it)
        verbose: print the layer configuration and the model summary
        """

        model = Sequential()

        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1,
                                                                                hidden_layer_size,
                                                                                activation,
                                                                                reg_penalty,
                                                                                dropout,
                                                                               ))
            # add dropout, but not on inputs, only between hidden layers
            if i and dropout:
                model.add(Dropout(dropout))

            if i == 0:  # first layer, specify input shape
                # use the agent's own state size rather than a notebook-level global
                model.add(Dense(input_shape=(self.state_size,),
                                units=hidden_layer_size,
                                activation=activation,
                                kernel_initializer=keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name="Dense%02d" % i))
            else:  # use implicit input shape
                model.add(Dense(units=hidden_layer_size,
                                activation=activation,
                                kernel_initializer=keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name="Dense%02d" % i))

        # linear output: one unbounded action-value estimate per action
        model.add(Dense(self.action_size, activation='linear'))

        if verbose:
            # summary() prints by itself; wrapping it in print() only adds a stray "None"
            model.summary()

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        `state` and `next_state` are indexed with [0], i.e. they are expected in
        the (1, state_size) batch shape that act() feeds to the model.
        """
        # append in place; relies on the index being 0..len-1, which train() and
        # load() maintain by resetting the index
        self.memory.loc[self.memory.shape[0]] = [state[0], action, next_state[0], reward, done]

    def train(self, sample_size, start_epoch=0):
        """Fit the model for one epoch on a random sample of the replay memory.

        sample_size: number of transitions to sample (clamped to the memory length)
        start_epoch: passed to keras fit() as initial_epoch

        Also decays epsilon by epsilon_decay while it is above epsilon_min.
        """
        # truncate memory to the most recent memory_size rows; the index is reset
        # so that remember() keeps appending instead of overwriting an existing label
        if len(self.memory) > self.memory_size:
            self.memory = self.memory.iloc[-self.memory_size:].reset_index(drop=True)

        # sample sample_size observations from memory
        sample_size = min(sample_size, len(self.memory))
        if sample_size == 0:
            return  # nothing to learn from yet
        minibatch = self.memory.sample(n=sample_size)

        # target is our best estimate of value of each action
        X_fit = np.concatenate(minibatch['state'].values)
        X_fit = X_fit.reshape((sample_size, self.state_size))
        Y_pred = self.model.predict(X_fit, verbose=0)

        # we don't just fit model against model's own prediction, that would get us nowhere
        # we improve the target by what we learned about the action we actually took
        # value is reward obtained + predicted value of the observed next state
        target_observed = minibatch['reward'].to_numpy(dtype=float)
        # if done, target is just the stored reward (the caller may have replaced
        # the env reward with a failure penalty, see fail_penalty in the training loop)
        # if not done, add gamma discount rate * Q-value prediction for the observed next state
        not_done = ~minibatch['done'].to_numpy(dtype=bool)
        if not_done.any():  # a sample made only of terminal transitions has no next states
            X_observed = np.concatenate(minibatch['next_state'].values[not_done])
            X_observed = X_observed.reshape((-1, self.state_size))
            # run all predictions at once
            # iterates faster but does not train after each prediction
            y_observed_pred = np.amax(self.model.predict(X_observed, verbose=0), axis=1)
            target_observed[not_done] += self.gamma * y_observed_pred

        # vectorized vlookup - update y_pred column specified by action using target_observed
        np.put_along_axis(Y_pred,
                          minibatch['action'].to_numpy(dtype=int).reshape(sample_size, 1),
                          target_observed.reshape(sample_size, 1),
                          axis=1)
        # fit model against improved target
        # arbitrary 8 batch size to reduce variance a little and speed up fit
        self.model.fit(X_fit, Y_pred,
                       epochs=1, initial_epoch=start_epoch,
                       batch_size=self.train_batch_size,
                       verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Return an action index: random with probability epsilon, else the model's argmax."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def reset(self):
        """Zero the per-episode timestep counter."""
        self.timestep = 0

    def increment_time(self):
        """Advance the per-episode timestep counter by one."""
        self.timestep += 1

    def score_episode(self, e, n_episodes):
        """Record the current timestep as the episode score and print a progress line.

        e: index of the episode just finished; n_episodes: total shown in the log line.
        """
        self.save_score()
        avglen = min(len(self.results), self.summary_interval)
        print("{} episode: {}/{}, score: {}, {}-episode avg: {:.1f} epsilon: {:.02}"
              .format(time.strftime("%H:%M:%S"), e, n_episodes, self.timestep,
                      avglen, sum(self.results[-avglen:])/avglen, self.epsilon))

    def save_score(self):
        """Append the current timestep count to this agent's results."""
        self.results.append(self.timestep)

    def load(self, filename, memory=True):
        """Restore model, results and epsilon (and optionally memory) saved by save().

        filename: path prefix; reads "<filename>.h5" and "<filename>.p"
        memory: when False, or when the pickle holds no memory, the current
                replay memory is kept as is
        """
        self.model = load_model("%s.h5" % filename)
        # NOTE: pickle.load can execute arbitrary code - only load files you wrote yourself
        with open("%s.p" % filename, "rb") as f:
            pickledict = pickle.load(f)
        if memory and pickledict.get('memory') is not None:
            # contiguous index is required by remember()'s append
            self.memory = pickledict['memory'].reset_index(drop=True)
        self.results = pickledict['results']
        self.epsilon = pickledict['epsilon']
        print("loaded %d results, %d rows of memory, epsilon %.4f" % (len(self.results),
                                                                      len(self.memory),
                                                                      self.epsilon))

    def save(self, pathname, memory=True):
        """Save the model to "<pathname>NNNN.h5" and agent state to "<pathname>NNNN.p".

        NNNN is the zero-padded number of recorded results.
        memory: when False the replay memory is left out of the pickle (stored as None)
        """
        fullname = "%s%04d" % (pathname, len(self.results))
        self.model.save("%s.h5" % fullname)
        pickledict = {
            'memory': self.memory if memory else None,
            'results': self.results,
            'epsilon': self.epsilon,
        }
        with open("%s.p" % fullname, "wb") as f:
            pickle.dump(pickledict, f)
        print("saved model to %s" % fullname)

        

In [3]:
#https://gym.openai.com/envs/CartPole-v0/
# note: the link above describes CartPole-v0; this notebook runs CartPole-v1
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]   # length of one observation vector
action_size = env.action_space.n              # number of discrete actions
sample_size = 128      # minimum number of transitions sampled per training call
n_episodes = 400       # episodes played by the training loop
fail_penalty = -20     # reward substituted on the terminal step of an episode

output_dir = 'model_output/cartpole/'
# exist_ok avoids the check-then-create race and is a no-op when the directory exists
os.makedirs(output_dir, exist_ok=True)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m



Parameters to load are deprecated.  Call .resolve and .require separately.



In [4]:
# build the agent; the constructor calls build_model(), which prints the layer
# configuration and the Keras model summary
agent = DQNAgent(state_size, action_size)

layer 1 size 16, relu, reg_penalty 0.00100000, dropout 0.068
layer 2 size 16, relu, reg_penalty 0.00100000, dropout 0.068
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Dense00 (Dense)              (None, 16)                80        
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
Dense01 (Dense)              (None, 16)                272       
_________________________________________________________________
dense (Dense)                (None, 2)                 34        
Total params: 386
Trainable params: 386
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
# load earlier model
# start_epoch=1000
# loadmodel = '%04d' % start_epoch
# agent.load(output_dir + 'model_' + loadmodel)
# n_episodes = 50
# agent.epsilon = 0.094


In [6]:
# main training loop: play n_episodes episodes, store every transition in the
# agent's replay memory, train once after each episode and checkpoint periodically
for e in range(n_episodes):
    # ru_maxrss is the peak resident set size of this process
    # NOTE(review): its unit is platform-dependent (kilobytes on Linux, bytes on macOS),
    # so the "(kb)" label may be wrong on this machine - confirm
    print ('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    
    state = env.reset()
    # the model expects a batch dimension: (1, state_size)
    state = np.reshape(state, [1, state_size])
    agent.reset()
    done = False
    
    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # on the terminal step, replace the env reward with fail_penalty
        # (not applied during the final episode)
        # NOTE(review): this treats every `done` as a failure; if the env also ends
        # episodes at a step limit those are penalized too - confirm this is intended
        if done and e < (n_episodes-1):
            reward = fail_penalty
        next_state = next_state.reshape([1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            # records agent.timestep as the episode score and prints a progress line
            agent.score_episode(e, n_episodes)
        else:
            agent.increment_time()
    
    # train only once the memory holds more than two minimum-size samples;
    # the sample is the larger of sample_size and 5% of the memory
    if len(agent.memory) > sample_size*2:
        agent.train(max(sample_size, int(agent.memory.shape[0] *0.05)))
        
    # checkpoint model and agent state every summary_interval episodes
    if e and (e+1) % agent.summary_interval == 0:
        agent.save(output_dir + "model_")


Memory usage: 263540736 (kb)
09:44:57 episode: 0/400, score: 15, 1-episode avg: 15.0 epsilon: 1.0
Memory usage: 300068864 (kb)
09:44:58 episode: 1/400, score: 13, 2-episode avg: 14.0 epsilon: 1.0
Memory usage: 300109824 (kb)
09:44:58 episode: 2/400, score: 11, 3-episode avg: 13.0 epsilon: 1.0
Memory usage: 300138496 (kb)
09:44:58 episode: 3/400, score: 32, 4-episode avg: 17.8 epsilon: 1.0
Memory usage: 300240896 (kb)
09:44:59 episode: 4/400, score: 20, 5-episode avg: 18.2 epsilon: 1.0
Memory usage: 300380160 (kb)
09:44:59 episode: 5/400, score: 28, 6-episode avg: 19.8 epsilon: 1.0
Memory usage: 300548096 (kb)
09:44:59 episode: 6/400, score: 32, 7-episode avg: 21.6 epsilon: 1.0
Memory usage: 300810240 (kb)
09:45:00 episode: 7/400, score: 21, 8-episode avg: 21.5 epsilon: 1.0
Memory usage: 300855296 (kb)
09:45:00 episode: 8/400, score: 11, 9-episode avg: 20.3 epsilon: 1.0
Memory usage: 300896256 (kb)
09:45:00 episode: 9/400, score: 36, 10-episode avg: 21.9 epsilon: 1.0
saved model to mode

09:45:35 episode: 78/400, score: 15, 10-episode avg: 14.8 epsilon: 0.71
Memory usage: 551313408 (kb)
09:45:36 episode: 79/400, score: 17, 10-episode avg: 14.5 epsilon: 0.71
saved model to model_output/cartpole/model_0080
Memory usage: 558219264 (kb)
09:45:37 episode: 80/400, score: 19, 10-episode avg: 14.8 epsilon: 0.71
Memory usage: 562237440 (kb)
09:45:38 episode: 81/400, score: 19, 10-episode avg: 15.5 epsilon: 0.7
Memory usage: 566423552 (kb)
09:45:38 episode: 82/400, score: 12, 10-episode avg: 14.8 epsilon: 0.7
Memory usage: 570544128 (kb)
09:45:39 episode: 83/400, score: 17, 10-episode avg: 15.7 epsilon: 0.7
Memory usage: 575193088 (kb)
09:45:40 episode: 84/400, score: 41, 10-episode avg: 18.3 epsilon: 0.69
Memory usage: 587497472 (kb)
09:45:41 episode: 85/400, score: 23, 10-episode avg: 19.7 epsilon: 0.69
Memory usage: 592678912 (kb)
09:45:42 episode: 86/400, score: 21, 10-episode avg: 19.9 epsilon: 0.69
Memory usage: 597733376 (kb)
09:45:42 episode: 87/400, score: 14, 10-episod

Memory usage: 1273663488 (kb)
09:47:27 episode: 155/400, score: 60, 10-episode avg: 46.2 epsilon: 0.49
Memory usage: 1295544320 (kb)
09:47:30 episode: 156/400, score: 39, 10-episode avg: 47.5 epsilon: 0.48
Memory usage: 1304834048 (kb)
09:47:32 episode: 157/400, score: 62, 10-episode avg: 46.7 epsilon: 0.48
Memory usage: 1327128576 (kb)
09:47:35 episode: 158/400, score: 69, 10-episode avg: 47.4 epsilon: 0.48
Memory usage: 1347616768 (kb)
09:47:37 episode: 159/400, score: 36, 10-episode avg: 46.5 epsilon: 0.48
saved model to model_output/cartpole/model_0160
Memory usage: 1363361792 (kb)
09:47:38 episode: 160/400, score: 42, 10-episode avg: 47.0 epsilon: 0.47
Memory usage: 1373122560 (kb)
09:47:40 episode: 161/400, score: 43, 10-episode avg: 47.0 epsilon: 0.47
Memory usage: 1390116864 (kb)
09:47:43 episode: 162/400, score: 57, 10-episode avg: 48.3 epsilon: 0.47
Memory usage: 1413230592 (kb)
09:47:46 episode: 163/400, score: 57, 10-episode avg: 50.2 epsilon: 0.47
Memory usage: 1434267648 

Memory usage: 2439811072 (kb)
09:51:36 episode: 231/400, score: 108, 10-episode avg: 94.4 epsilon: 0.33
Memory usage: 2439811072 (kb)
09:51:40 episode: 232/400, score: 74, 10-episode avg: 95.6 epsilon: 0.33
Memory usage: 2439811072 (kb)
09:51:46 episode: 233/400, score: 114, 10-episode avg: 99.6 epsilon: 0.33
Memory usage: 2439811072 (kb)
09:51:49 episode: 234/400, score: 66, 10-episode avg: 92.6 epsilon: 0.33
Memory usage: 2439811072 (kb)
09:51:54 episode: 235/400, score: 70, 10-episode avg: 91.3 epsilon: 0.33
Memory usage: 2439811072 (kb)
09:52:01 episode: 236/400, score: 123, 10-episode avg: 89.6 epsilon: 0.32
Memory usage: 2439811072 (kb)
09:52:08 episode: 237/400, score: 117, 10-episode avg: 91.7 epsilon: 0.32
Memory usage: 2439811072 (kb)
09:52:15 episode: 238/400, score: 116, 10-episode avg: 97.1 epsilon: 0.32
Memory usage: 2439811072 (kb)
09:52:21 episode: 239/400, score: 108, 10-episode avg: 97.9 epsilon: 0.32
saved model to model_output/cartpole/model_0240
Memory usage: 24398

10:04:32 episode: 306/400, score: 135, 10-episode avg: 181.4 epsilon: 0.23
Memory usage: 3662536704 (kb)
10:04:44 episode: 307/400, score: 146, 10-episode avg: 172.6 epsilon: 0.23
Memory usage: 3662536704 (kb)
10:05:53 episode: 308/400, score: 267, 10-episode avg: 179.9 epsilon: 0.23
Memory usage: 3662536704 (kb)
10:06:22 episode: 309/400, score: 298, 10-episode avg: 184.6 epsilon: 0.22
saved model to model_output/cartpole/model_0310
Memory usage: 3662536704 (kb)
10:06:54 episode: 310/400, score: 347, 10-episode avg: 203.9 epsilon: 0.22
Memory usage: 3662536704 (kb)
10:07:16 episode: 311/400, score: 244, 10-episode avg: 213.9 epsilon: 0.22
Memory usage: 3662536704 (kb)
10:07:45 episode: 312/400, score: 279, 10-episode avg: 212.9 epsilon: 0.22
Memory usage: 3662536704 (kb)
10:08:06 episode: 313/400, score: 175, 10-episode avg: 213.9 epsilon: 0.22
Memory usage: 3662536704 (kb)
10:08:38 episode: 314/400, score: 351, 10-episode avg: 236.6 epsilon: 0.22
Memory usage: 3662536704 (kb)
10:08:5

Memory usage: 4503404544 (kb)
10:33:39 episode: 381/400, score: 499, 10-episode avg: 207.4 epsilon: 0.16
Memory usage: 4503404544 (kb)
10:33:58 episode: 382/400, score: 198, 10-episode avg: 209.7 epsilon: 0.16
Memory usage: 4503404544 (kb)
10:34:18 episode: 383/400, score: 209, 10-episode avg: 196.3 epsilon: 0.15
Memory usage: 4503404544 (kb)
10:35:02 episode: 384/400, score: 477, 10-episode avg: 230.2 epsilon: 0.15
Memory usage: 4503404544 (kb)
10:35:22 episode: 385/400, score: 203, 10-episode avg: 238.9 epsilon: 0.15
Memory usage: 4503404544 (kb)
10:35:37 episode: 386/400, score: 154, 10-episode avg: 241.9 epsilon: 0.15
Memory usage: 4503404544 (kb)
10:36:15 episode: 387/400, score: 378, 10-episode avg: 266.7 epsilon: 0.15
Memory usage: 4503404544 (kb)
10:36:35 episode: 388/400, score: 209, 10-episode avg: 274.2 epsilon: 0.15
Memory usage: 4503404544 (kb)
10:36:48 episode: 389/400, score: 133, 10-episode avg: 270.5 epsilon: 0.15
saved model to model_output/cartpole/model_0390
Memory 

In [7]:
# per-episode scores together with their 10-episode rolling mean
df = pd.DataFrame({'timesteps': agent.results}).assign(
    avg=lambda frame: frame['timesteps'].rolling(10).mean()
)
df


loaded 1000 results, 199698 rows of memory, epsilon 0.0100


Unnamed: 0,timesteps,avg
0,14,
1,10,
2,12,
3,20,
4,10,
...,...,...
995,117,230.8
996,114,192.3
997,113,153.7
998,499,192.2


In [13]:
# chart timesteps vs. episodes
def rlplot(agent, window=10):
    """Plot per-episode scores and their moving average for a trained agent.

    agent: object whose `results` attribute is a sequence of per-episode timestep counts
    window: number of episodes in the rolling mean (default 10, the previous fixed value)

    Shows the figure and returns the result of fig.show().
    """
    df = pd.DataFrame({'timesteps': agent.results})
    # the first window-1 rows of the rolling mean are NaN and are simply not drawn
    df['avg'] = df['timesteps'].rolling(window).mean()

    fig = go.Figure()
    # raw score of every episode as a scatter of small markers
    fig.add_trace(go.Scatter(x=df.index,
                             y=df['timesteps'],
                             mode='markers',
                             name='timesteps',
                             marker=dict(
                                 color='LightSkyBlue',
                                 size=5,
                             ),
                            ))

    # smoothed trend on top of the raw scores
    fig.add_trace(go.Scatter(x=df.index,
                             y=df['avg'],
                             mode='lines',
                             name='moving average'))

    fig.update_layout(
        title=dict(text='Cartpole DQN Agent Training Progress',
                   x=0.5,
                   xanchor='center'),
        xaxis=dict(
            title="Episodes",
            linecolor='black',
            linewidth=1,
            mirror=True
        ),
        yaxis=dict(
            title="Completed Timesteps",
            linecolor='black',
            linewidth=1,
            mirror=True
        ),
        # legend anchored inside the plot area, top-left corner
        legend=go.layout.Legend(
            x=0.01,
            y=0.99,
            traceorder="normal",
            font=dict(
                family="sans-serif",
                size=12,
                color="black"
            ),
            #bgcolor="LightSteelBlue",
            bordercolor="Black",
            borderwidth=1,
        ),
    )

    return fig.show()

# restore the checkpoint saved with 400 recorded episodes and chart its history
start_epoch = 400
loadmodel = '{:04d}'.format(start_epoch)
agent.load('{}model_{}'.format(output_dir, loadmodel))
rlplot(agent)



loaded 400 results, 39436 rows of memory, epsilon 0.1423


In [12]:
# restore the checkpoint saved with 1000 recorded episodes and chart its history
start_epoch = 1000
loadmodel = '{:04d}'.format(start_epoch)
agent.load('{}model_{}'.format(output_dir, loadmodel))
rlplot(agent)

loaded 1000 results, 199698 rows of memory, epsilon 0.0100


In [None]:
# training as above does well up to a point but is not very stable
# sometimes performance goes off a cliff, esp. with more complex NNs like 2x32
# continuing to train sometimes results in forgetting what it learned
# also on my machine tensorflow leaks memory, can't train long without restarting
# trained repeatedly; when it fell off a cliff, restarted using best previous model
# early stopping after achieving a very good model
# saved best model, run it here with exploration held at the minimum (epsilon 0.01)
# and without any training

n_eval_episodes = 10

agent.load('good_new')
agent.epsilon = 0.01
# summary() prints by itself; wrapping it in print() only adds a stray "None"
agent.model.summary()

for e in range(n_eval_episodes):
    print ('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    state = env.reset()
    state = np.reshape(state, [1, state_size])
    agent.reset()
    done = False

    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else fail_penalty
        next_state = next_state.reshape([1, state_size])
        # transitions are still stored, although nothing is trained on them here
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            # report progress against the number of evaluation episodes,
            # not the n_episodes of the training run
            agent.score_episode(e, n_eval_episodes)
        else:
            agent.increment_time()

    # don't train or save after each episode
    


loaded 0 results, 1132 rows of memory, epsilon 0.0100
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Dense00 (Dense)              (None, 32)                160       
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
Dense01 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense (Dense)                (None, 2)                 66        
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________
None
Memory usage: 4503404544 (kb)
10:54:31 episode: 0/400, score: 499, 1-episode avg: 499.0 epsilon: 0.01
Memory usage: 4503404544 (kb)
10:55:26 episode: 1/400, score: 499, 2-episode avg: 499.0 eps