In [None]:
import numpy as np
import matplotlib.pyplot as plt
import random

import gym

import tensorflow as tf

from tensorflow.keras import Model
from tensorflow.keras import backend as K

from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.losses import CategoricalCrossentropy

import import_ipynb
from helpers import *

%matplotlib inline

class Train:
    '''Runs a Rainbow experiment.

    Builds the networks and the replay memory for one of several DQN
    variants, trains it on the gym 'MountainCar-v0' environment and
    evaluates the trained network.

    Attributes:
    test_results: dict of test scores with keys 'results' and 'filenames'.
    Score: list of scores, one entry per training episode.
    Loss: list of losses, one entry per training episode.
    memory_size: int, size of memory buffer.
    s_shape: tuple of ints, shape of input state.
    num_a: int, number of available actions.
    agent_names: list of string of legal agent names. Those names are:
                : 'dqn',
                : 'double_dqn',
                : 'multi_step_dqn',
                : 'per_dqn',
                : 'dueling_dqn',
                : 'distributional_dqn'
    filename: str with filename.
    save_memory: numpy array or list, agents memory.
    agents: class with various DQN agents and related methods.
    q_net: Keras model object, the q value approximator.
    q_net_policy: Keras model object, advantage stream of q_net that the
                  dueling agent uses to select actions.
    t_net: Keras model object, the target value approximator.
    memory: class, instantiates appropriate replay buffer.
    var_targets: list, per episode variance of targets (diagnostics).
    var_predictions: list, per episode variance of predictions (diagnostics).

    '''

    def __init__(self,
                agent_name='dqn',
                episodes=3000,
                update_rate=500,
                smoothing=60,
                print_rate=3,
                alpha=0.001,
                decay_rate=0.9995,
                gamma=0.99,
                n_steps=False,
                n_horizon=0.1,
                trial=1,
                batchsize=36,
                frame_skipping=4,
                per_alpha=0.6,
                per_beta=0.4,
                v_min=-10.,
                v_max=10.,
                num_atoms=51):
        '''Initializes Train.

        Arguments:
        agent_name: str with the name of the agent to train.
        episodes: int, number of episodes.
        update_rate: int, number of episodes between updates of target net.
        smoothing: int, average plotting over episodes // smoothing points.
        print_rate: int, plot and save scores print_rate times per run
                    (every episodes // print_rate episodes).
        alpha: float, the learning rate.
        decay_rate: float, epsilon rate of decay.
        gamma: float, the discount rate in the TD error.
        n_steps: int, number of n-steps in multi step agent.
        n_horizon: float, fraction of the memory size that is passed to the
                   replay buffer as the n-step sampling window.
        trial: int, addition to filename to mark multiple runs.
        batchsize: int.
        frame_skipping: int, number of frames between choosing an action.
        per_alpha: float, alpha parameter for PER.
        per_beta: float, beta parameter for PER.
        v_min: float, minimum in c51 target support.
        v_max: float, maximum in c51 target support.
        num_atoms: int, number of atoms in c51 support.

        '''
        self.agent_name = agent_name
        self.episodes = episodes
        self.update_rate = update_rate
        # Both values come from an integer division. Clamp them to 1 so a
        # short run cannot produce an empty averaging window (smoothing == 0)
        # or a modulo-by-zero in the train loop (print_rate == 0).
        self.smoothing = max(1, self.episodes // smoothing)
        self.print_rate = max(1, self.episodes // print_rate)
        self.alpha = alpha
        self.decay_rate = decay_rate
        self.gamma = gamma
        self.n_steps = n_steps
        self.trial = trial
        self.batchsize = batchsize
        self.frame_skipping = frame_skipping
        self.per_alpha = per_alpha
        self.per_beta = per_beta
        self.v_min = v_min
        self.v_max = v_max
        self.num_atoms = num_atoms
        self.test_results = None
        self.Score = None
        self.Loss = None
        self.memory_size = int((self.episodes * 200) * 0.5)
        self.n_horizon = int(self.memory_size * n_horizon)
        self.env = gym.make('MountainCar-v0')
        self.s_shape = self.env.observation_space.sample().shape
        self.num_a = self.env.action_space.n
        self.agent_names = [
            'dqn',
            'double_dqn',
            'multi_step_dqn',
            'per_dqn',
            'dueling_dqn',
            'distributional_dqn'
        ]
        # Kept as an assert so callers that expect AssertionError still work.
        assert self.agent_name in self.agent_names, 'Consider entering a legal agent name.'
        self.filename = None
        self.save_memory = None
        self.agents = DqnAgents()
        self.q_net = None
        self.q_net_policy = None
        self.t_net = None
        self.memory = None
        # Diagnostics that the plotting code reads; defined here so the
        # attributes exist before the first call to train().
        self.var_targets = []
        self.var_predictions = []
        self.targs = None

    def _get_epsilon(self, epsilon, episode):
        '''Calculates a decaying epsilon.

        The caller feeds the previous result back in, so the decay factor
        decay_rate**episode is applied on top of the already decayed value.

        Arguments:
        epsilon: float of current value of epsilon.
        episode: int, current episode.

        Returns:
        epsilon: float of a decayed epsilon value, never lower than 0.1.

        '''
        if epsilon > 0.1:
            epsilon = epsilon * self.decay_rate**episode
        # Clamp after decaying, otherwise a single step could undershoot
        # the 0.1 floor for one episode.
        return max(epsilon, 0.1)

    def _get_agent(self):
        '''Instantiates models and memory according to chosen agent.

        Uses the graph builders of the DqnAgents helper class. train()
        currently calls _get_agent2 instead of this method.
        '''
        if self.agent_name in self.agent_names[:4]:
            self.q_net = self.agents.build_dqn_graph(self.s_shape,
                                                     self.num_a,
                                                     self.alpha,
                                                     compiled=True)
            self.t_net = self.agents.build_dqn_graph(self.s_shape,
                                                     self.num_a,
                                                     self.alpha,
                                                     compiled=False)
        elif self.agent_name == 'dueling_dqn':
            self.q_net, self.q_net_policy = self.agents.build_dueling_graph(self.s_shape,
                                                                            self.num_a,
                                                                            self.alpha,
                                                                            self.batchsize)
            self.t_net = self.agents.build_dueling_graph(self.s_shape,
                                                         self.num_a,
                                                         self.alpha,
                                                         self.batchsize,
                                                         compiled=False)
        elif self.agent_name == 'distributional_dqn':
            self.q_net, self.q_net_policy = self.agents.build_c51_graph(self.s_shape,
                                                                        self.num_a,
                                                                        self.num_atoms,
                                                                        self.alpha,
                                                                        self.batchsize)
            # The uncompiled graph is the target net. It used to overwrite
            # q_net, which left t_net as None for this agent.
            # NOTE(review): the two-value unpacking is kept from the original
            # call - confirm what build_c51_graph returns when compiled=False.
            self.t_net, _ = self.agents.build_c51_graph(self.s_shape,
                                                        self.num_a,
                                                        self.num_atoms,
                                                        self.alpha,
                                                        self.batchsize,
                                                        compiled=False)

        # experience replay for dqn, ddqn, multi step and c51.
        if self.agent_name in self.agent_names[:3] or self.agent_name == 'distributional_dqn':
            self.memory = ExperienceReplay(self.memory_size,
                                           self.batchsize)
        else:
            # Pass the configured PER parameters, as _get_agent2 does.
            self.memory = PrioritizedExperienceReplay(self.memory_size,
                                                      self.batchsize,
                                                      self.per_alpha,
                                                      self.per_beta)

    def _subtract_mean(self, args):
        '''Final layer module in the dueling architecture.

        Combines the two streams as V + (A - mean(A)), where the mean is
        taken over the actions of each sample.

        Arguments:
        args: list of Keras tensor objects of layer outputs, ordered as
              [state value, advantages].

        Returns:
        action values: Keras tensor object, the model output.

        '''
        v, A = args
        # Average over the action axis only. A mean over all axes would mix
        # the samples of a batch into each other.
        A_mean = tf.math.reduce_mean(A, axis=-1, keepdims=True)
        A_sub_mean = tf.math.subtract(A, A_mean)
        # v has a trailing axis of size 1 and broadcasts against the
        # advantages, so the layer no longer depends on a fixed batch size.
        return tf.math.add(v, A_sub_mean)

    def _build_dueling_graph(self):
        '''Builds one dueling network from freshly created layers.

        Returns:
        model: Keras model object that outputs the action values.
        policy: Keras model object that outputs the advantage stream of
                the same network (shares its layers with model).

        '''
        inputs = Input(shape=self.s_shape)
        x = Dense(400, activation='relu')(inputs)
        x = Dense(200, activation='relu')(x)
        v = Dense(100, activation='relu')(x)
        v = Dense(1, activation='linear')(v)
        a = Dense(100, activation='relu')(x)
        a = Dense(self.num_a, activation='linear')(a)
        outputs = Lambda(self._subtract_mean)([v, a])
        return Model(inputs, outputs), Model(inputs, a)

    def _build_c51_graph(self):
        '''Builds one c51 network with a num_atoms wide head per action.

        Returns:
        model: Keras model object with num_a outputs.

        '''
        inputs = Input(shape=self.s_shape)
        x = Dense(100, activation='relu')(inputs)
        x = Dense(100, activation='relu')(x)
        outputs = [Dense(self.num_atoms, activation='linear')(x) for _ in range(self.num_a)]
        return Model(inputs, outputs)

    def _build_dense_graph(self):
        '''Builds one plain fully connected q network.

        Returns:
        model: Keras model object that outputs one value per action.

        '''
        inputs = Input(shape=self.s_shape)
        x = Dense(400, activation='relu')(inputs)
        x = Dense(200, activation='relu')(x)
        outputs = Dense(self.num_a, activation='linear')(x)
        return Model(inputs, outputs)

    def _get_agent2(self):
        '''Instantiates various network graphs.

        Temporary method due to some errors experienced when
        instantiating Keras models from the DqnAgents class.

        The q net and the target net are built from separate graphs. Two
        Model objects wrapped around the same layers share their weights,
        which would make the target net follow every update of the q net.
        '''
        if self.agent_name == 'dueling_dqn':
            self.t_net, _ = self._build_dueling_graph()
            self.q_net, self.q_net_policy = self._build_dueling_graph()
            opt = Adam(learning_rate=self.alpha)
            self.q_net.compile(optimizer=opt, loss='mse')
            self.memory = PrioritizedExperienceReplay(self.memory_size,
                                                      self.batchsize,
                                                      self.per_alpha,
                                                      self.per_beta)
        elif self.agent_name == 'distributional_dqn':
            self.t_net = self._build_c51_graph()
            self.q_net = self._build_c51_graph()
            opt = Adam(learning_rate=self.alpha)
            # NOTE(review): a CategoricalCrossentropy loss object used to be
            # created here and never used; the model has always been compiled
            # with 'mse' on linear heads. Confirm which loss is intended.
            self.q_net.compile(optimizer=opt, loss='mse')
            self.memory = ExperienceReplay(self.memory_size,
                                           self.batchsize)
        else:
            self.t_net = self._build_dense_graph()
            self.q_net = self._build_dense_graph()
            if self.agent_name == 'per_dqn':
                opt = Adam(learning_rate=self.alpha)
                self.q_net.compile(optimizer=opt, loss=tf.keras.losses.Huber())
                self.memory = PrioritizedExperienceReplay(self.memory_size,
                                                          self.batchsize,
                                                          self.per_alpha,
                                                          self.per_beta)
            else:
                opt = RMSprop(learning_rate=self.alpha)
                self.q_net.compile(optimizer=opt, loss='mse')
                self.memory = ExperienceReplay(self.memory_size,
                                               self.batchsize)

    def _color_map(self):
        '''Matplotlib color map, applied in a random order.'''
        Blue_1 = '#2CBDFE'
        Green1 = '#47DBCD'
        Pink1 = '#F3A0F2'
        Purple1 = '#9D2EC5'
        Violet1 = '#661D98'
        Amber1 = '#F5B14C'
        color_list = [Blue_1, Green1, Pink1, Purple1, Violet1, Amber1]
        random.shuffle(color_list)
        plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)

    def _train_visualizsation(self, episode, if_loss=False):
        '''Prints and plots results from training.

        Arguments:
        episode: int, episode.
        if_loss: bool, whether to also plot the loss and the variances.

        '''
        def smooth(values):
            # Forward looking moving average over self.smoothing points.
            return [np.average(values[i:i + self.smoothing])
                    for i in range(len(values))]

        self._color_map()
        plt.figure(figsize=(5, 3))
        plt.plot(smooth(self.Score), label='score')
        plt.xticks([])
        plt.xlabel('episode: ' + str(episode + 1))
        plt.ylabel('score')
        if if_loss:
            plt.figure(figsize=(5, 3))
            plt.plot(smooth(self.Loss), label='loss')
            plt.xticks([])
            plt.xlabel('episode: ' + str(episode + 1))
            plt.ylabel('loss')

            # Variance diagnostics; both lists stay empty unless train()
            # fills them.
            plt.figure(figsize=(5, 3))
            plt.plot(smooth(self.var_targets), label='var targs')
            plt.plot(smooth(self.var_predictions), label='var preds')
            plt.xticks([])
            plt.xlabel('episode: ' + str(episode + 1))
            plt.ylabel('variance')
            plt.legend(loc='upper left', frameon=False)
        plt.show()
        print(f'{self.agent_name} *** avg_train_score={np.average(self.Score)}')

    def test_visualization(self, art=False):
        '''Prints and plots results from testing.

        Requires that test() has filled test_results.

        Arguments:
        art: bool, whether to show a plot or not.

        '''
        X = []
        for i, j in zip(self.test_results['results'], self.test_results['filenames']):
            avg_score = np.average(i)
            print(f'{j} *** avg_test_score={avg_score}')
            X.append(avg_score)
        if art:
            self._color_map()
            plt.figure(figsize=(5, 3))
            positions = np.arange(len(X))
            plt.bar(positions, X)
            # One tick per tested model. The tick positions used to be a
            # fixed range(5), which only matched exactly five results.
            plt.xticks(positions, labels=self.test_results['filenames'], rotation=270)
            plt.ylabel('average testscore')
            plt.show()

    def get_variance(self, targets):
        '''Computes the variance of the targets and of self.agents.z.

        Arguments:
        targets: numpy array of targets.

        Returns:
        var_target: float, variance over all entries of targets.
        var_pred: float, variance over all entries of self.agents.z.

        '''
        return np.var(targets), np.var(self.agents.z)

    def results_visualization(self):
        '''Plots the combined results from a series of runs.

        Not implemented yet; calling it does nothing.
        '''

    @staticmethod
    def _write_scores(path, scores):
        '''Writes one score per line to path, replacing an existing file.

        Arguments:
        path: str, file to write.
        scores: iterable of scores.

        '''
        with open(path, 'w') as file:
            file.writelines(f'{item}\n' for item in scores)

    def _select_action(self, model, s, epsilon):
        '''Chooses an action with the policy of the active agent.

        Arguments:
        model: Keras model object to act with.
        s: state as returned by the environment.
        epsilon: float, exploration rate handed to the policy.

        Returns:
        a: the action returned by self.agents.policy.

        '''
        if self.agent_name == 'dueling_dqn':
            # The dueling agent always acts on the advantage stream of q_net.
            return self.agents.policy(self.q_net_policy, s, self.num_a, epsilon=epsilon)
        if self.agent_name == 'distributional_dqn':
            return self.agents.policy(model, s, self.num_a, epsilon=epsilon, c51=True)
        return self.agents.policy(model, s, self.num_a, epsilon=epsilon)

    def _sample_targets(self):
        '''Samples a batch from memory and builds the training targets.

        Returns:
        targets: targets for q_net.train_on_batch.
        S: batch of states for q_net.train_on_batch.
        is_w: sample weights returned by the prioritized memory, None for
              agents that use the plain experience replay.

        '''
        is_w = None
        if self.agent_name == 'dqn':
            transitions = self.memory.sample_transition()
            self.agents.unpack_experience(transitions, self.batchsize)
            targets, S = self.agents.dqn_targets(self.q_net,
                                                 self.t_net,
                                                 self.gamma,
                                                 self.batchsize)
        elif self.agent_name == 'double_dqn':
            transitions = self.memory.sample_transition()
            self.agents.unpack_experience(transitions, self.batchsize)
            targets, _, S = self.agents.ddqn_targets(self.q_net,
                                                     self.t_net,
                                                     self.gamma,
                                                     self.batchsize)
        elif self.agent_name in ['per_dqn', 'dueling_dqn']:
            # NOTE(review): the sampled indices and the returned deltas are
            # not written back to the memory, so priorities are never
            # refreshed after a transition is stored - confirm intent.
            transitions, _, is_w = self.memory.sample_transition()
            self.agents.unpack_experience(transitions, self.batchsize)
            targets, _, S = self.agents.per_targets(self.q_net,
                                                    self.t_net,
                                                    self.gamma,
                                                    self.batchsize,
                                                    is_w)
        elif self.agent_name == 'multi_step_dqn':
            transitions = self.memory.sample_transition(n_step=(self.n_steps,
                                                                self.n_horizon))
            self.agents.unpack_experience(transitions, self.batchsize, n_step=True)
            targets, S = self.agents.multi_step_targets(self.q_net,
                                                        self.t_net,
                                                        self.gamma,
                                                        self.batchsize,
                                                        self.n_steps)
        elif self.agent_name == 'distributional_dqn':
            transitions = self.memory.sample_transition()
            self.agents.unpack_experience(transitions, self.batchsize)
            targets, S = self.agents.c51_targets(self.q_net,
                                                 self.t_net,
                                                 self.num_a,
                                                 self.v_min,
                                                 self.v_max,
                                                 self.num_atoms,
                                                 self.gamma,
                                                 self.batchsize)
        else:
            raise ValueError(f'Unknown agent name: {self.agent_name}')
        return targets, S, is_w

    def test(self, load=False):
        '''Tests a trained agent over 100 episodes per model.

        Fills test_results and writes the scores of every tested model to
        '<filename>_test.txt', one score per line.

        Arguments:
        load: False to test the current q_net, or a list of str with the
              filenames (as used by train(), without suffix) whose saved
              q nets should be loaded and tested.

        '''
        if load:
            # train() saves the q net as '<filename>_qnet.h5'; the '.txt'
            # files only hold scores.
            # NOTE(review): loading a dueling model may need custom_objects
            # for its Lambda layer - confirm before relying on it.
            models = [tf.keras.models.load_model(name + '_qnet' + '.h5') for name in load]
        else:
            models = [self.q_net]
            load = [self.agent_name]
        self.test_results = {'results': [], 'filenames': []}
        for filename, model in zip(load, models):
            Score = []
            for _ in range(100):
                s = self.env.reset()
                t = False
                score = 0
                while not t:
                    # epsilon=0: act greedily during testing.
                    a = self._select_action(model, s, 0)
                    s2, r, t, _ = self.env.step(a)
                    score += r
                    s = s2
                Score.append(score)
            self._write_scores(filename + '_test' + '.txt', Score)
            self.test_results['results'].append(Score)
            self.test_results['filenames'].append(filename)

    def train(self):
        '''Trains an agent by executing a train loop.

        Builds the agent, runs self.episodes episodes, periodically plots
        and saves the scores, then tests the agent and saves both networks.
        '''
        #self._get_agent()
        self._get_agent2()
        self.Score = []
        self.Loss = []
        self.var_targets = []
        self.var_predictions = []
        self.filename = self.agent_name + '_' + str(self.trial)
        epsilon = 1
        # Every transition is stored with the same priority value.
        delta = 1
        # Learning starts once the memory holds more than 10% of its size.
        warmup = 0.1 * self.memory_size
        for episode in range(self.episodes):
            s = self.env.reset()
            score = 0
            loss = []
            epsilon = self._get_epsilon(epsilon, episode)
            if episode % self.update_rate == 0:
                # Sync the target net with the q net.
                self.t_net.set_weights(self.q_net.get_weights())
            for step in range(201):
                # A new action is only chosen every frame_skipping steps;
                # in between the previous action is repeated.
                if step % self.frame_skipping == 0:
                    a = self._select_action(self.q_net, s, epsilon)
                s2, r, t, _ = self.env.step(a)
                if self.memory.len_memory() > warmup:
                    targets, S, is_w = self._sample_targets()
                    self.targs = targets
                    if is_w is not None:
                        loss.append(self.q_net.train_on_batch(S, targets, is_w))
                    else:
                        loss.append(self.q_net.train_on_batch(S, targets))
                score += r
                self.memory.store_transition(delta, (s, a, r, s2, t))
                s = s2
                if t:
                    break
            # No training step happens during warm-up; record nan directly
            # instead of averaging an empty list.
            self.Loss.append(np.average(loss) if loss else np.nan)
            self.Score.append(score)
            if (episode + 1) % self.print_rate == 0:
                self._train_visualizsation(episode, if_loss=True)
                self._write_scores(self.filename + '.txt', self.Score)
        self.test()
        self.test_visualization(art=False)
        self.q_net.save(self.filename + '_qnet' + '.h5')
        self.t_net.save(self.filename + '_tnet' + '.h5')
        K.clear_session()

    def train_multiple(self, runs=()):
        '''Executes a series of train loops over a number of run variables.

        Arguments:
        runs: iterable of indices into agent_names; one train loop is
              executed per index, in order.

        '''
        for run in runs:
            self.agent_name = self.agent_names[run]
            print('\n','\n', '*'*65, '\n')
            self.train()
            
    
if __name__ == '__main__':
    # Experiment entry point: configure one Train instance and run it.
    t = Train(
        # agent and run bookkeeping
        agent_name='dqn',
        trial=5,
        episodes=10000,
        # reporting
        smoothing=200,
        print_rate=20,
        # optimisation
        alpha=0.00025,
        batchsize=32,
        # NOTE(review): update_rate is larger than episodes, so the
        # `episode % update_rate == 0` sync in train() only fires at
        # episode 0 - confirm this is intended.
        update_rate=15000,
        # multi step settings
        n_steps=2,
        n_horizon=0.2,
        # prioritized replay settings
        per_alpha=0.6,
        per_beta=0.5,
        # c51 support
        num_atoms=35,
        v_min=-10,
        v_max=10,
    )

    # train_multiple replaces agent_name with agent_names[index] for every
    # run; index 2 is 'multi_step_dqn'.
    t.train_multiple([2])
    