In [72]:
import gymnasium as gym
import numpy as np
import random
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense
# from evaluation import evaluate
from Logger import Logger
import argparse
import atexit
import utils

ModuleNotFoundError: No module named 'Logger'

In [36]:
# Notebook-level hyperparameter constants.
# NOTE(review): none of these four names is read by the other cells visible
# here (the DQN classes take their own keyword arguments) — confirm whether
# this cell is still needed.
EPSILON = 0.1            # presumably the exploration probability — confirm
DISCOUNT_FACTOR = 0.9    # presumably the MDP discount (gamma) — confirm
LEARNING_RATE = 0.1      # presumably a tabular Q-learning step size — confirm
NUM_EPISODES = 1_000     # presumably the number of episodes to run — confirm

In [60]:
# Create the Breakout environment; render_mode='human' is forwarded to gym.make.
env = gym.make("ALE/Breakout-v5", render_mode='human')
# Shape tuple of the observation space and the size of the discrete action space.
state_size = env.observation_space.shape
action_size = env.action_space.n
# Zero-filled array indexed by the first two observation dimensions and the action.
# NOTE(review): no module-level code in the visible cells reads this array (the
# DQN classes obtain Q-values from a network instead) — confirm it is still needed.
q_values = np.zeros((state_size[0], state_size[1], action_size))

In [58]:
# Mean channel value of the RGB colour (210, 164, 74). Greyscale pixels equal to
# this value are blanked out by preprocess_state to improve contrast.
color = np.array([210, 164, 74]).mean()


def preprocess_state(state):
    """
    Crops, downsamples, greyscales and normalises a raw RGB frame.

    :param state: either the frame itself as a 3-D (rows, cols, channels)
        numpy array, or any indexable wrapper whose first element is that
        frame (e.g. an ``(observation, info)`` tuple)
    :return: 2-D float array; for 8-bit input the values lie in [-1, 1)
    """
    # A bare 3-D frame is used directly; anything else is treated as a wrapper
    # holding the frame at index 0 (the only form the original code accepted).
    if isinstance(state, np.ndarray) and state.ndim == 3:
        image_array = state
    else:
        image_array = state[0]

    # Crop rows 1..175 and keep every second row and column
    image = image_array[1:176:2, ::2]

    # Convert to greyscale by averaging the colour channels (result is float)
    image = image.mean(axis=2)

    # Improve contrast: blank out pixels of the reference colour
    image[image == color] = 0

    # Normalise 0..255 to [-1, 1). The previous extra "- 1" shifted the range
    # to [-2, 0), so the network never saw positive inputs.
    image = (image - 128) / 128

    return image

In [42]:
def get_action(self, state):
    """
    Epsilon-greedy action selection.

    With probability ``self.epsilon`` a uniformly random action index is
    returned, otherwise the index of the highest predicted Q-value.

    :param self: object exposing ``epsilon`` (float) and ``DQN`` (an object
        with a ``predict(state)`` method returning the Q-values)
    :param state: a state that can be passed to ``self.DQN.predict``
    :return: the selected action index
    """
    q_values = self.DQN.predict(state)
    # `random` is the stdlib module here, so the draw must be random.random();
    # calling the module itself raises TypeError.
    if random.random() < self.epsilon:
        # argmax below indexes the flattened Q-values, so the random pick
        # covers exactly the same index range.
        return random.randrange(np.size(q_values))
    return np.argmax(q_values)

In [63]:
class DQNetwork:
    """
    Convolutional Q-network: three Conv2D layers, a 512-unit dense layer and an
    output layer with one unit per action, compiled with a mean-squared-error
    loss and the 'rmsprop' optimizer.
    """

    def __init__(self, actions, input_shape,
                 minibatch_size=32,
                 learning_rate=0.00025,
                 discount_factor=0.99,
                 dropout_prob=0.1,
                 load_path=None,
                 logger=None):
        """
        Builds and compiles the model, optionally loading saved weights.
        :param actions: number of output units (one per action)
        :param input_shape: shape of a single network input, channels first
        :param minibatch_size: batch size passed to ``fit`` in ``train``
        :param learning_rate: stored on the instance (see NOTE below)
        :param discount_factor: discount applied to the next state's Q-value
        :param dropout_prob: stored on the instance (see NOTE below)
        :param load_path: optional weights file to load into the model
        :param logger: optional object providing ``log``, ``to_csv`` and
            ``path``; when None nothing is logged
        """
        # Parameters
        self.actions = actions  # Size of the network output
        self.discount_factor = discount_factor  # Discount factor of the MDP
        self.minibatch_size = minibatch_size  # Size of the training batches
        self.learning_rate = learning_rate  # Learning rate
        self.dropout_prob = dropout_prob  # Probability of dropout
        self.logger = logger
        self.training_history_csv = 'training_history.csv'
        # NOTE(review): learning_rate and dropout_prob are stored but never
        # used below: the optimizer is the plain string 'rmsprop' and the
        # model has no Dropout layer. Confirm whether that is intended.

        if self.logger is not None:
            self.logger.to_csv(self.training_history_csv, 'Loss,Accuracy')

        # Deep Q Network as defined in the DeepMind article on Nature
        # Ordering channels first: (samples, channels, rows, cols)
        self.model = Sequential()

        # First convolutional layer (the only one that needs input_shape)
        self.model.add(Conv2D(32, 8, strides=(4, 4),
                              padding='valid',
                              activation='relu',
                              input_shape=input_shape,
                              data_format='channels_first'))

        # Second convolutional layer
        self.model.add(Conv2D(64, 4, strides=(2, 2),
                              padding='valid',
                              activation='relu',
                              data_format='channels_first'))

        # Third convolutional layer
        self.model.add(Conv2D(64, 3, strides=(1, 1),
                              padding='valid',
                              activation='relu',
                              data_format='channels_first'))

        # Flatten the convolution output
        self.model.add(Flatten())

        # First dense layer
        self.model.add(Dense(512, activation='relu'))

        # Output layer (no activation specified)
        self.model.add(Dense(self.actions))

        # Load the network weights from saved model
        if load_path is not None:
            self.load(load_path)

        self.model.compile(loss='mean_squared_error',
                           optimizer='rmsprop',
                           metrics=['accuracy'])

    def train(self, batch, DQN_target):
        """
        Generates inputs and targets from the given batch, trains the model on
        them.
        :param batch: iterable of dictionaries with keys 'source', 'action',
        'dest', 'reward', 'final'; 'source' and 'dest' are arrays that already
        carry a leading batch axis of length 1
        :param DQN_target: a DQNetwork instance to generate targets
        """
        x_train = []
        t_train = []

        # Generate training inputs and targets
        for datapoint in batch:
            # Inputs are the states
            x_train.append(datapoint['source'].astype(np.float64))

            # Bootstrap from the best Q-value the target network predicts
            next_state = datapoint['dest'].astype(np.float64)
            next_state_pred = DQN_target.predict(next_state).ravel()
            next_q_value = np.max(next_state_pred)

            # The error must be 0 on all actions except the one taken
            t = list(self.predict(datapoint['source'])[0])
            if datapoint['final']:
                t[datapoint['action']] = datapoint['reward']
            else:
                t[datapoint['action']] = datapoint['reward'] + \
                                         self.discount_factor * next_q_value
            t_train.append(t)

        # Prepare inputs and targets. Only the per-sample batch axis (axis 1
        # after stacking) is dropped: a blanket squeeze() would also remove the
        # sample axis of a one-element batch and any other length-1 dimension.
        x_train = np.asarray(x_train).squeeze(axis=1)
        t_train = np.asarray(t_train)

        # Train the model for one epoch ('epochs' replaces the removed
        # Keras-1 keyword 'nb_epoch')
        h = self.model.fit(x_train,
                           t_train,
                           batch_size=self.minibatch_size,
                           epochs=1)

        # Log loss and accuracy
        if self.logger is not None:
            history = h.history
            # Current Keras names the metric 'accuracy'; older releases 'acc'
            accuracy = (history['accuracy'] if 'accuracy' in history
                        else history['acc'])
            self.logger.to_csv(self.training_history_csv,
                               [history['loss'][0], accuracy[0]])

    def predict(self, state):
        """
        Feeds state to the model, returns predicted Q-values.
        :param state: a numpy.array with same shape as the network's input
        :return: numpy.array with predicted Q-values
        """
        state = state.astype(np.float64)
        return self.model.predict(state, batch_size=1)

    def save(self, filename=None, append=''):
        """
        Saves the model weights to disk.
        :param filename: file to which save the weights (must end with ".h5")
        :param append: suffix to append after "model" in the default filename
            if no filename is given
        """
        f = ('model%s.h5' % append) if filename is None else filename
        if self.logger is not None:
            self.logger.log('Saving model as %s' % f)
            target = self.logger.path + f
        else:
            # Without a logger there is no output directory to prefix
            target = f
        self.model.save_weights(target)

    def load(self, path):
        """
        Loads the model's weights from path.
        :param path: h5 file from which to load the weights
        """
        if self.logger is not None:
            self.logger.log('Loading weights from file...')
        self.model.load_weights(path)

In [65]:
class DQAgent:
    """
    Epsilon-greedy agent that owns a main DQNetwork, a target DQNetwork and a
    list-based replay memory of transitions.
    """

    def __init__(self,
                 actions,
                 network_input_shape,
                 replay_memory_size=1024,
                 minibatch_size=32,
                 learning_rate=0.00025,
                 discount_factor=0.9,
                 dropout_prob=0.1,
                 epsilon=1,
                 epsilon_decrease_rate=0.99,
                 min_epsilon=0.1,
                 load_path=None,
                 logger=None):
        """
        Stores the hyperparameters and instantiates both networks.
        :param actions: size of the discrete action space
        :param network_input_shape: shape of a single DQN input
        :param replay_memory_size: maximum number of stored transitions
        :param minibatch_size: number of transitions sampled per training step
        :param learning_rate: forwarded to both DQNetwork instances
        :param discount_factor: forwarded to both DQNetwork instances
        :param dropout_prob: forwarded to both DQNetwork instances
        :param epsilon: initial probability of taking a random action
        :param epsilon_decrease_rate: amount subtracted from epsilon by each
            call to update_epsilon
        :param min_epsilon: lower bound for epsilon
        :param load_path: optional weights file loaded into both networks;
            when set, quit() does not save
        :param logger: optional object providing ``log``; None disables logging
        """
        # Parameters
        self.network_input_shape = network_input_shape  # Shape of the DQN input
        self.actions = actions  # Size of the discrete action space
        self.learning_rate = learning_rate  # Learning rate for the DQN
        self.dropout_prob = dropout_prob  # Dropout probability of the DQN
        self.load_path = load_path  # Path from which to load the DQN's weights
        self.replay_memory_size = replay_memory_size  # Size of replay memory
        self.minibatch_size = minibatch_size  # Size of a DQN minibatch
        self.discount_factor = discount_factor  # Discount factor of the MDP
        self.epsilon = epsilon  # Probability of taking a random action
        self.epsilon_decrease_rate = epsilon_decrease_rate  # See update_epsilon
        self.min_epsilon = min_epsilon  # Minimum value for epsilon
        self.logger = logger

        # Replay memory
        self.experiences = []
        self.training_count = 0

        # Instantiate the deep Q-networks
        # Main DQN
        self.DQN = DQNetwork(
            self.actions,
            self.network_input_shape,
            learning_rate=self.learning_rate,
            discount_factor=self.discount_factor,
            minibatch_size=self.minibatch_size,
            dropout_prob=self.dropout_prob,
            load_path=self.load_path,
            logger=self.logger
        )

        # Target DQN used to generate targets
        self.DQN_target = DQNetwork(
            self.actions,
            self.network_input_shape,
            learning_rate=self.learning_rate,
            discount_factor=self.discount_factor,
            minibatch_size=self.minibatch_size,
            dropout_prob=self.dropout_prob,
            load_path=self.load_path,
            logger=self.logger
        )
        # Reset target DQN
        self.DQN_target.model.set_weights(self.DQN.model.get_weights())

    def get_action(self, state, testing=False, force_random=False):
        """
        Polls DQN for Q-values. Returns argmax(Q) with probability 1-epsilon
        during training, 0.95 during testing.
        :param state: a state that can be passed as input to DQN
        :param testing: whether to use the current epsilon or the constant 0.05
        :param force_random: whether to sample a random action regardless of
            parameters
        :return: the index of (action associated to) the highest Q-value
        """
        # `random` is the stdlib module, so its functions are called qualified
        exploration_prob = self.epsilon if not testing else 0.05
        is_random = random.random() < exploration_prob
        if force_random or is_random:
            return random.randint(0, self.actions - 1)
        q_values = self.DQN.predict(state)
        return np.argmax(q_values)

    def get_max_q(self, state):
        """
        Returns the maximum Q value predicted on the given state.
        :param state: a state that can be passed as input to DQN
        :return: the largest Q-value predicted for the given state
        """
        # The previous implementation returned a random element of
        # argwhere(...).ravel(), which for 2-D network output mixes row and
        # column indices; the name, the summary line and the only caller
        # (which averages the results as a mean Q) all expect the value.
        q_values = self.DQN.predict(state)
        return np.max(q_values)

    def get_random_state(self):
        """
        Samples a random state from the replay memory.
        :return: the sampled state
        """
        index = random.randrange(0, len(self.experiences))
        return self.experiences[index]['source']

    def add_experience(self, source, action, reward, dest, final):
        """
        Add a SARS' tuple to the experience replay.
        :param source: source state
        :param action: action index
        :param reward: reward associated to the transition
        :param dest: destination state
        :param final: whether the state is absorbing
        """
        # Remove older transitions if the replay memory is full
        if len(self.experiences) >= self.replay_memory_size:
            self.experiences.pop(0)
        # Add a tuple (source, action, reward, dest, final) to replay memory
        self.experiences.append({'source': source,
                                 'action': action,
                                 'reward': reward,
                                 'dest': dest,
                                 'final': final})
        # Periodically log how many samples we've gathered so far
        stored = len(self.experiences)
        if (stored % 100 == 0) and (stored < self.replay_memory_size) \
                and (self.logger is not None):
            self.logger.log("Gathered %d samples of %d" %
                            (stored, self.replay_memory_size))

    def sample_batch(self):
        """
        Samples self.minibatch_size random transitions from the replay memory
        (with replacement) and returns them as a batch.
        :return: a batch of SARS' tuples
        """
        # range() replaces the Python-2-only xrange()
        batch = [self.experiences[random.randrange(0, len(self.experiences))]
                 for _ in range(self.minibatch_size)]
        return np.asarray(batch)

    def train(self):
        """
        Trains the DQN on a minibatch of transitions.
        """
        self.training_count += 1
        print('Training session #%d - epsilon: %f' %
              (self.training_count, self.epsilon))
        batch = self.sample_batch()
        self.DQN.train(batch, self.DQN_target)  # Train the DQN

    def update_epsilon(self):
        """
        Decreases the probability of picking a random action, to improve
        exploitation. The decrease is linear and clamped at min_epsilon.
        """
        if self.epsilon - self.epsilon_decrease_rate > self.min_epsilon:
            self.epsilon -= self.epsilon_decrease_rate
        else:
            self.epsilon = self.min_epsilon

    def reset_target_network(self):
        """
        Updates the target DQN with the current weights of the main DQN.
        """
        if self.logger is not None:
            self.logger.log('Updating target network...')
        self.DQN_target.model.set_weights(self.DQN.model.get_weights())

    def quit(self):
        """
        Saves the DQN and the target DQN to file, unless the weights were
        loaded from load_path.
        """
        if self.load_path is None:
            if self.logger is not None:
                self.logger.log('Quitting...')
            self.DQN.save(append='_DQN')
            self.DQN_target.save(append='_DQN_target')

In [70]:
def exit_handler():
    """
    atexit hook: asks the module-level agent ``DQA`` to save itself.

    Does nothing when ``DQA`` has not been created, so an interpreter exit
    that happens before the agent exists does not raise NameError here.
    """
    agent = globals().get('DQA')
    if agent is not None:
        agent.quit()

# Image size constant, also written onto the project-local `utils` module by
# overwriting its IMG_SIZE attribute.
# NOTE(review): presumably (width, height), given that the network input shape
# used further down is (4, 110, 84) — confirm against utils.
IMG_SIZE = (84, 110)
utils.IMG_SIZE = IMG_SIZE

# I/O: command-line options for the training / evaluation script.
# Defaults of the integer options are written as ints: argparse applies `type`
# only to values given on the command line (and to string defaults), so a
# default such as 1e6 would otherwise reach the program as a float.
parser = argparse.ArgumentParser()
parser.add_argument('-t', '--train', action='store_true',
                    help='train the agent')
parser.add_argument('-l', '--load', type=str, default=None,
                    help='load the neural network weights from the given path')
parser.add_argument('-v', '--video', action='store_true',
                    help='show video output')
parser.add_argument('-d', '--debug', action='store_true',
                    help='run in debug mode (no output files)')
parser.add_argument('--eval', action='store_true',
                    help='evaluate the agent')
parser.add_argument('-e', '--environment', type=str,
                    help='name of the OpenAI Gym environment to use '
                         '(default: MsPacmanDeterministic-v4)',
                    default='MsPacmanDeterministic-v4')
parser.add_argument('--minibatch-size', type=int, default=32,
                    help='number of sample to train the DQN at each update')
parser.add_argument('--replay-memory-size', type=int, default=1000000,
                    help='number of samples stored in the replay memory')
parser.add_argument('--target-network-update-freq', type=int, default=10000,
                    help='frequency (number of DQN updates) with which the '
                         'target DQN is updated')
parser.add_argument('--avg-val-computation-freq', type=int, default=50000,
                    help='frequency (number of DQN updates) with which the '
                         'average reward and Q value are computed')
parser.add_argument('--discount-factor', type=float, default=0.99,
                    help='discount factor for the environment')
parser.add_argument('--update-freq', type=int, default=4,
                    help='frequency (number of steps) with which to train the '
                         'DQN')
parser.add_argument('--learning-rate', type=float, default=0.00025,
                    help='learning rate for optimizer')
parser.add_argument('--epsilon', type=float, default=1,
                    help='initial exploration rate for the agent')
parser.add_argument('--min-epsilon', type=float, default=0.1,
                    help='final exploration rate for the agent')
parser.add_argument('--epsilon-decrease', type=float, default=9e-7,
                    help='rate at which to linearly decrease epsilon')
parser.add_argument('--replay-start-size', type=int, default=50000,
                    help='minimum number of transitions (with fully random '
                         'policy) to store in the replay memory before '
                         'starting training')
parser.add_argument('--initial-random-actions', type=int, default=30,
                    help='number of random actions to be performed by the agent'
                         ' at the beginning of each episode')
parser.add_argument('--dropout', type=float, default=0.,
                    help='dropout rate for the DQN')
# np.inf has no integer representation, so these two "unlimited" defaults stay
# floats; they are only ever compared against counters.
parser.add_argument('--max-episodes', type=int, default=np.inf,
                    help='maximum number of episodes that the agent can '
                         'experience before quitting')
parser.add_argument('--max-episode-length', type=int, default=np.inf,
                    help='maximum number of steps in an episode')
parser.add_argument('--max-frames-number', type=int, default=50000000,
                    help='maximum number of frames during the whole algorithm')
parser.add_argument('--test-freq', type=int, default=250000,
                    help='frequency (number of frames) with which to test the '
                         'agent\'s performance')
parser.add_argument('--validation-frames', type=int, default=135000,
                    help='number of frames to test the model like in table 3 of'
                         ' the paper')
parser.add_argument('--test-states', type=int, default=30,
                    help='number of states on which to compute the average Q '
                         'value')
# parse_known_args() ignores arguments this parser does not define. Inside a
# notebook kernel sys.argv carries launcher arguments, which would make
# parse_args() abort with "unrecognized arguments".
args = parser.parse_known_args()[0]

# Warn when running in debug mode. The three adjacent string literals are
# concatenated without separators, so the banner is printed on a single line.
if args.debug:
    print ('####################################################' \
          'WARNING: debug flag is set, output will not be saved' \
          '####################################################')

# Project-local logger, configured from the debug flag and the environment name.
logger = Logger(debug=args.debug, append=args.environment)
# NOTE(review): the handler is registered before DQA is created further down.
atexit.register(exit_handler)  # Make sure to always save the model when exiting

# Accumulators filled by the training loop: per-step scores, mean Q-values, and
# the held-out states on which those Q-values are computed.
test_scores = []
test_mean_q = []
test_states = []

# Setup: create the environment and the agent from the parsed options.
env = gym.make(args.environment)
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.action_space.n,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)

# Initial logging: environment spaces, the full option set, and the header row
# of each CSV file written during the run.
logger.log({
    'Action space': env.action_space.n,
    'Observation space': env.observation_space.shape
})
logger.log(vars(args))
training_csv = 'training_info.csv'
eval_csv = 'evaluation_info.csv'
test_csv = 'test_score_mean_q_info.csv'
logger.to_csv(training_csv, 'length,score')
logger.to_csv(eval_csv, 'length,score')
logger.to_csv(test_csv, 'avg_score,avg_Q')

# Set counters
episode = 0
frame_counter = 0

if args.train:
    # Set once the global frame budget is used up so that both loops terminate
    frame_budget_exhausted = False

    # Main loop
    while episode < args.max_episodes and not frame_budget_exhausted:
        # Start episode
        logger.log("Episode %d" % episode)
        score = 0

        # Gymnasium's reset() returns (observation, info); only the frame goes
        # through preprocessing, exactly like the frames returned by step().
        obs, info = env.reset()
        obs = utils.preprocess_observation(obs)

        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])

        # Main episode loop
        t = 0
        frame_counter += 1
        while t < args.max_episode_length:
            # Stop training once the overall frame budget is exceeded.
            # Previously the agent was saved here but the loops kept running,
            # saving the model again on every following step.
            if frame_counter > args.max_frames_number:
                DQA.quit()
                frame_budget_exhausted = True
                break

            # Render the game
            if args.video:
                env.render()

            # Select an action using the DQA
            action = DQA.get_action(np.asarray([current_state]))

            # Observe reward and next state. Gymnasium's step() returns five
            # values; the episode is over when it terminated or was truncated.
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            obs = utils.preprocess_observation(obs)
            next_state = utils.get_next_state(current_state, obs)

            frame_counter += 1

            # Store transition in replay memory
            clipped_reward = np.clip(reward, -1, 1)  # Clip the reward
            DQA.add_experience(np.asarray([current_state]),
                               action,
                               clipped_reward,
                               np.asarray([next_state]),
                               done)

            # Train the agent
            if t % args.update_freq == 0 and len(DQA.experiences) >= args.replay_start_size:
                DQA.train()
                # Every C DQN updates, update DQN_target
                if DQA.training_count % args.target_network_update_freq == 0 and DQA.training_count >= args.target_network_update_freq:
                    DQA.reset_target_network()
                # Log the mean score and mean Q values of test states
                if DQA.training_count % args.avg_val_computation_freq == 0 and DQA.training_count >= args.avg_val_computation_freq:
                    logger.to_csv(test_csv,
                                  [np.mean(test_scores), np.mean(test_mean_q)])
                    del test_scores[:]
                    del test_mean_q[:]

            # Linear epsilon annealing
            if len(DQA.experiences) >= args.replay_start_size:
                DQA.update_epsilon()

            # Update the current state and score
            current_state = next_state
            score += reward

            # Log episode data in the training csv
            if done or t == args.max_episode_length - 1:
                logger.to_csv(training_csv, [t, score])
                logger.log("Length: %d; Score: %d\n" % (t + 1, score))
                break

            t += 1

            # Evaluate the agent's performance
            # NOTE(review): `evaluate` is not imported in the visible cells
            # (its import is commented out), so this call raises NameError
            # unless the name is defined elsewhere.
            if frame_counter % args.test_freq == 0:
                t_evaluation, score_evaluation = evaluate(DQA, args, logger)
                # Log evaluation data
                logger.to_csv(eval_csv, [t_evaluation, score_evaluation])

            # Hold out a set of test states to monitor the mean Q value
            if len(test_states) < args.test_states:
                # Generate test states
                for _ in range(random.randint(1, 5)):
                    test_states.append(DQA.get_random_state())
            else:
                # Update scores and mean Q values
                test_scores.append(score)
                test_q_values = [DQA.get_max_q(state) for state in test_states]
                test_mean_q.append(np.mean(test_q_values))

        episode += 1

if args.eval:
    # NOTE(review): depends on the same un-imported `evaluate` as above.
    logger.log(evaluate(DQA, args, logger))

NameError: name 'utils' is not defined