In [1]:
import gym 
import numpy as np
import pandas as pd

# Create the CartPole-v0 environment shared by the cells below.
env = gym.make(id="CartPole-v0")

In [2]:
# Summarise the environment's state and action spaces.
state_space, action_space = env.observation_space, env.action_space
print("States:", state_space)
print("Actions: ", action_space, end="\n\n")

# Lower and upper bound of every state dimension.
print("State Low: ", state_space.low)
print("state High: ", state_space.high, end="\n\n")

# Draw a few random actions first, then a few random states.
action_samples = [action_space.sample() for _ in range(5)]
print("Action space samples: ", action_samples)
state_samples = [list(state_space.sample()) for _ in range(3)]
print("State space samples: ", state_samples)

States: Box(4,)
Actions:  Discrete(2)

State Low:  [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
state High:  [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]

Action space samples:  [1, 0, 0, 1, 0]
State space samples:  [[-2.718252, 1.6423467e+38, -0.2508165, 3.3923539e+38], [-0.02896775, 1.2998261e+38, 0.32401782, -1.5791837e+38], [3.0925324, -2.1742727e+38, -0.37817836, -2.9291582e+38]]


In [3]:
# Random agent interacting with the environment for a single episode.
env.reset()
score = 0
done = False
try:
    while not done:
        env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        score += reward
finally:
    # Always release the render window, even if the rollout raises or is interrupted.
    env.close()
print("Score = ", score)

Score =  27.0


In [4]:
import random
from collections import deque
from collections import namedtuple
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples.

    Experiences are kept in a ``deque`` bounded by ``buffer_size``; once the
    buffer is full, appending a new experience discards the oldest one.
    """

    def __init__(self, buffer_size, batch_size):
        """Initialize a ReplayBuffer object.
        Params
        ======
            buffer_size: maximum size of buffer
            batch_size: size of each training batch
        """
        self.memory = deque(maxlen=buffer_size)  # internal memory (deque)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, prev_state, prev_action, reward, next_state, done):
        """Add a new experience to memory.

        The arguments are stored, in order, as the ``state``, ``action``,
        ``reward``, ``next_state`` and ``done`` fields of an ``Experience``.
        """
        e = self.experience(prev_state, prev_action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory.

        Returns
        =======
            list of ``batch_size`` distinct ``Experience`` tuples.

        Raises
        ======
            ValueError: if fewer than ``batch_size`` experiences are stored.
        """
        if len(self.memory) < self.batch_size:
            # Same exception type random.sample would raise, with an actionable message.
            raise ValueError(
                "Cannot sample {} experiences from a buffer holding {}".format(
                    self.batch_size, len(self.memory)))
        return random.sample(self.memory, k=self.batch_size)

    def size(self):
        """Return the current size of internal memory."""
        return len(self.memory)

    def __len__(self):
        """Return the current size of internal memory (enables ``len(buffer)``)."""
        return len(self.memory)

In [5]:
import sys
import time
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout, regularizers
from keras.optimizers import Adam

class DQNAgent:
    """Deep Q-Network agent with experience replay and a separate target network.

    The agent keeps two identically shaped Keras models: ``model`` is fitted on
    every replay step, while ``target_model`` supplies the bootstrap values and
    is only synchronised with ``model`` periodically.
    """

    def __init__(self, env):
        """Set hyper-parameters and build the online and target networks.

        Params
        ======
            env: environment exposing ``observation_space.shape``,
                 ``action_space.n``, ``reset``, ``step``, ``render`` and ``close``
        """
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.replay_buffer = ReplayBuffer(buffer_size = 2000, batch_size = 32)
        self.target_update_frequency = 10
        self.target_update_counter = 0
        self.gamma = 0.95
        self.initial_epsilon = 1
        self.epsilon = self.initial_epsilon
        self.epsilon_decay_rate = 0.9995
        self.min_epsilon = 0.01
        self.training_scores = []
        # Set to True by run() once at least one training episode has finished.
        self.training_done = False

        # main model  # gets trained every step
        self.model = self.build_model()

        # Target model this is what we .predict against every step
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())


    def build_model(self):
        """Build, compile and print the summary of a Q-network.

        Three hidden layers of 8 ReLU units (L2-regularised on the second and
        third) feed a linear output layer with one unit per action.
        """
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(8, input_dim=self.state_size, activation='relu'))
        model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=0.001), metrics = ["accuracy"])
        model.summary()
        return model


    def reset_episode(self, initial_state):
        """Reset variables for a new episode and return the first action.

        Epsilon is decayed once per call (never below ``min_epsilon``). The
        first action of the episode is always the greedy one under ``model``.
        """
        # Gradually decrease exploration rate
        self.epsilon = max(self.epsilon * self.epsilon_decay_rate, self.min_epsilon)

        self.prev_state = self.preprocess_state(initial_state)
        self.prev_action = np.argmax(self.model.predict(self.prev_state))
        return self.prev_action


    def preprocess_state(self, state):
        """Round every state component to 2 decimals and return a (1, state_size) array."""
        rounded = [round(component, 2) for component in state]
        return np.reshape(rounded, [1, self.state_size])


    def reset_exploration(self, epsilon=None):
        """Reset exploration rate used when training."""
        self.epsilon = epsilon if epsilon is not None else self.initial_epsilon


    def plot_scores(self, scores, rolling_window=100):
        """Plot scores and optional rolling mean using specified window."""
        # Plot through pandas so the method does not depend on a global `plt`,
        # which is not imported anywhere in the visible notebook.
        score_series = pd.Series(scores, dtype="float64")
        ax = score_series.plot(title="Scores")
        score_series.rolling(rolling_window).mean().plot(ax=ax)


    def act(self, next_state, reward, done, mode="train", time_delay=None):
        """Pick next action and update weights of the neural network (when mode != 'test').

        Params
        ======
            next_state: raw observation produced by the previous action
            reward: reward received for the previous action
            done: whether the previous action ended the episode
            mode: "test" acts greedily without learning; anything else trains
            time_delay: optional pause in seconds per step (test mode only)

        Returns
        =======
            the action to take next
        """
        next_state = self.preprocess_state(next_state)
        if mode == "test":
            # Test mode: Simply produce an action
            action = np.argmax(self.model.predict(next_state))
            if time_delay is not None:    # Adding time delay to watch the agent perform at a little slower pace.
                time.sleep(time_delay)
        else:
            # Exploration vs. exploitation
            do_exploration = np.random.uniform(0, 1) < self.epsilon
            if do_exploration:
                # Pick a random action
                action = np.random.randint(0, self.action_size)
            else:
                # Pick the action with the highest predicted Q-value
                action = np.argmax(self.model.predict(next_state))

            # Store the experience in replay memory
            self.replay_buffer.add(self.prev_state, self.prev_action, reward, next_state, done)

            # Learn
            self.replay(done)

        # Roll over current state, action for next step
        self.prev_state = next_state
        self.prev_action = action
        return action


    def replay(self, done):
        """Fit the online model on one sampled minibatch and maybe sync the target model.

        Does nothing until the replay buffer holds at least one full batch.

        Params
        ======
            done: whether the step that triggered this call ended the episode;
                  each such call advances the target-update counter
        """
        if self.replay_buffer.size() < self.replay_buffer.batch_size:
            return

        minibatch = self.replay_buffer.sample()
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Stored states have shape (1, state_size); drop the leading axis to batch them.
        prev_states = np.array([state[0] for state in states])
        q_values = self.model.predict(prev_states)

        next_state_batch = np.array([state[0] for state in next_states])
        next_qs = self.target_model.predict(next_state_batch)

        # Bellman target: bootstrap from the target network unless the transition was terminal.
        rewards = np.asarray(rewards, dtype=float)
        bootstrapped = rewards + self.gamma * np.max(next_qs, axis=1)
        targets = np.where(np.asarray(dones, dtype=bool), rewards, bootstrapped)

        # Only the Q-value of the action actually taken is moved towards its target.
        rows = np.arange(len(minibatch))
        q_values[rows, np.asarray(actions, dtype=int)] = targets

        # Fit on all samples as one batch
        self.model.fit(prev_states, q_values, batch_size=self.replay_buffer.batch_size,
                       verbose=0, shuffle=False)

        if done:
            self.target_update_counter += 1

        # If counter exceeds the set value, update target network with weights of main network
        if self.target_update_counter > self.target_update_frequency:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0


    def run(self, num_episodes=20000, mode="train", time_delay=0.01, score_threshold=None, weights_path=None, 
            scores_path=None, render=None):
        """Run the agent in its environment for up to ``num_episodes`` episodes.

        Nothing is returned. In train mode the list of episode scores is
        appended to ``self.training_scores``; in test mode the mean score is
        printed.

        Params
        ======
            num_episodes: maximum number of episodes to run
            mode: "train" to learn and print running statistics, "test" to act greedily
            time_delay: pause in seconds per step, forwarded to ``act`` (used in test mode)
            score_threshold: stop early once the best 100-episode average exceeds this value
            weights_path: if given, the online model's weights are saved here afterwards
            scores_path: if given, the episode scores are written here as CSV
            render: whether to render each step; defaults to rendering only in test mode
        """
        if render is None:
            render = (mode == "test")

        scores = []
        max_score = -np.inf
        min_score = np.inf
        max_avg_score = -np.inf
        avg_score = -np.inf
        for i_episode in range(1, num_episodes+1):
            # Initialize episode
            state = self.env.reset()
            action = self.reset_episode(state)
            total_reward = 0
            done = False

            # Roll out steps until done
            while not done:
                if render:
                    self.env.render()
                next_state, reward, done, _ = self.env.step(action)
                total_reward += reward
                action = self.act(next_state, reward, done, mode, time_delay)

            # Save final score
            scores.append(total_reward)

            # Print episode stats
            if mode == 'train':
                self.training_done = True
                max_score = max(max_score, total_reward)
                min_score = min(min_score, total_reward)

                # The running average is defined as soon as 100 episodes are available.
                if len(scores) >= 100:
                    avg_score = np.mean(scores[-100:])
                    max_avg_score = max(max_avg_score, avg_score)

                print("\rEpisode {}/{} | Episode Score: {} | Min. Score: {} | Max. Score: {} | Current Avg. Score: {} | Max. Average Score: {} | epsilon: {}"
                      .format(i_episode, num_episodes, total_reward, min_score, max_score, 
                              avg_score, max_avg_score, self.epsilon), end="")
                sys.stdout.flush()

            # Terminating loop if the agent achieves reward threshold
            if score_threshold is not None and max_avg_score > score_threshold:
                print("\nEnvironment solved after {} episodes".format(i_episode))
                break

        # Close rendering
        self.env.close()
        if weights_path is not None:
            # Save the model weights
            self.model.save_weights(weights_path)

        if scores_path is not None:
            logs = {"scores" : scores}
            logs = pd.DataFrame.from_dict(data=logs, orient='index')
            logs.to_csv(scores_path ,index=False)

        if mode == "test":
            print("\nScore: ", np.mean(scores))
        else:
            self.training_scores.append(scores)
                

Using TensorFlow backend.


In [6]:
# Instantiate the agent; this builds the online and target networks and prints their summaries.
agent = DQNAgent(env=env)

W0801 05:57:30.585764 4378183104 deprecation_wrapper.py:119] From /Users/apoorvmalik/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0801 05:57:30.600603 4378183104 deprecation_wrapper.py:119] From /Users/apoorvmalik/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0801 05:57:30.603658 4378183104 deprecation_wrapper.py:119] From /Users/apoorvmalik/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0801 05:57:30.690012 4378183104 deprecation_wrapper.py:119] From /Users/apoorvmalik/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W080

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 18        
Total params: 202
Trainable params: 202
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 8)                 40        
_________________________________________________________________
dense_6 (Den

In [7]:
# Train for at most 20000 episodes, stopping early once the best 100-episode
# average score exceeds 195; afterwards save the weights and the score log.
agent.run(
    num_episodes=20000,
    mode="train",
    score_threshold=195,
    weights_path="./weights/model_weights_1",
    scores_path="./scores/logs1",
)

Episode 20000/20000 | Episode Score: 63.0 | Min. Score: 8.0 | Max. Score: 200.0 | Current Avg. Score: 92.62 | Max. Average Score: 194.0 | epsilon: 0.01110000188023547553