In [233]:
import numpy as np
import os
import tensorflow as tf
from itertools import cycle
from tensorflow import keras
from collections import deque
from tensorflow.keras import layers
import numpy as np


import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset


# ------------------------------------environment----------------------------
class UAV():
    '''
    <Constants AND Variables>
        H: UAV flying height (m)
        N: number of users
        C: service region radius (m)
        T: operation period of the UAV (time slots)
        ts: time slot duration (seconds)
        Eb: UAV energy budget per slot (J)
        Pf: flying power of the UAV (watts)
        Ph: hovering power of the UAV (watts)
        v: UAV velocity (m/s)

        iota: chip energy consumption coefficient (kappa also same)
        FU: number of UAV CPU cycle per bit (bits)
        FI: number of GUs CPU cycle per bit (bits)
        Fil: computing capacity of GUs
        Fus: computing capacuty of UAV -> 1Ghz
        D: input data size (bits)

        B: system bandwidth (MHz)
        beta: channel power gain at reference distance d0 = 1 m (dB)
        alpha: path-loss exponent
        sigma: noise power spectral density (dBm/Hz)

        Emax: battery capacity of the user, (mJ)
        mu: average harvested energy (mJ)

        eta = energy conversion efficiency 
        ppb = transmit power at the Power Beacon (watts)

        wis: ground user(GU) locate
        tau: harvest time
        pis: GU transmit power (mW)

        w: weight (it will change(just simulate it))
        flis: computing capacity of GUs ()
        O: range of offlaoding ratio 

    '''

    def __init__(self, H=20, N=10, T=40, ts=6, Eb=8, v=15, Pf=2, Ph=1, iota=1e-27, FI=[1, 5], FU=[20, 30], D=[0.4, 1], B=30,
                 beta=30, alpha=2, sigma=-174, Emax=10, eta=1e-28, ppb=4, pis=[1.0, 10.0], w=0.1, O=[0, 1]):
        self.H = H
        self.N = N
        self.T = T
        self.ts = ts
        self.Eb = Eb
        self.Pf = Pf
        self.Ph = Ph
        self.v = v

        self.iota = iota
        self.FI = np.array(FI)*1e5
        self.FU = np.array(FU)*1e5 
        self.D = np.array(D) * (10**6)

        self.B = B*1e6
       #  self.pn = np.array(pn) *1e-3
        self.beta = beta**(-30/10)
        self.alpha = alpha
        self.sigma = 10**(sigma/10)*1e-3

        self.Es = [0.1, Emax]

        self.alpha = 0.1

        self.eta = eta
        self.ppb = ppb
        self.tau = self.T*0.9
        self.pis = np.random.uniform(pis[0], pis[1], self.N) * 1e-3

        self.wis = np.random.rand(self.N, 2)
        self.wo = np.array([0, 0])
        self.wb = np.array([9, 9])

        self.w = w
        self.O = np.array(O)

        self.Eh = self.rf_eh()

    '''
    wo: UAV locate
    wb: power beacon locate
    ois_1: offloading ratio on time slot s-1 for state (uniform dist.)
    dis: inputdata of user on time slot s (uniform dist.)
    eus: UAV energy consumption on time slot s, will change (uniform dist.)
    '''

    def reset(self):
        ois_1 = np.random.uniform(self.O[0], self.O[1], self.N)
        # print(f'in reset ois_1: {ois_1} \n')
        Dis = np.random.uniform(
            self.D[0], self.D[1], self.N)  # change the size
        # print(f'in reset Dis{Dis} \n ')
        Eus = np.random.rand()
        state = [ois_1, Dis, Eus]  # STATE

        return state

    '''
    dis: distance between UAV and GU 
    gis: channel gain between UAV and GU
    ris: data rate between user and UAV
    Fi: CPU cycle about input data
    fli: local computation capa. 
    Elc: energy consumption of local(GU)
    tlc: execution time from GU in time slot 
    Et: energy consumption for transmission
    ttis: local execution time at GU in time slot
    '''

    def local(self, fli, ois_1, Dis):
        Fi = np.random.uniform(self.FI[0], self.FI[1], self.N)
        dis = np.sqrt(self.H**2 + np.sum((self.wo-self.wis)**2))
        gis = self.beta / (dis**self.alpha)
        ris = self.B * np.log2(1 + ((self.pis*gis)/self.sigma**2))
        
        # print(f'in local \n fli shape:{np.shape(fli)} \n ois_1 shape:{np.shape(ois_1)} \n Dis shape:{np.shape(Dis)} \n')

        Elc = self.iota * (fli**2) * Fi * (1-ois_1)*Dis
        tlc = ((1-ois_1)*Dis) / fli

        Et = (self.pis * (1-ois_1)*Dis) / ris
        ttis = ((1-ois_1)*Dis)/ris
        
        Elis = Elc + Et
        tlis = tlc + ttis 
        return Elis, tlis

    '''
    Eexe: energy consumption at the UAV for processing the data 
    texe: execution time at the UAV for processing the offloaded data
    '''

    def computing(self, fus, ois_1, Dis):
        FU = np.random.randint(self.FU[0], self.FU[1],dtype='int64')

        # print(f'in computing \n fus shape: {np.shape(fus)} \n ois_1 shape: {np.shape(ois_1)} \n Dis shape: {np.shape(Dis)}, \n FU: {np.shape(self.FU)} \n')
        # print(f'FU rand int {self.FU[0], self.FU[1]} \n')
        
        Eexe = self.iota*((fus)**2)*FU*(ois_1*Dis)
        texe = (ois_1 * Dis) / fus
        return Eexe, texe
        # Eexe = self.iota*((fus)^2)*self.FU
        # texe = (ois_1 * Dis) / fus
        # return Eexe, texe

    '''
    db2u: distace between the UAV and power beacon
    gpb: channel gain between the UAV and power beacon
    Eh: harvested energy  
    '''

    def rf_eh(self):
        db2u = np.sqrt(self.H**2 + np.sum((self.wo - self.wb)**2))
        gpb = self.beta / (db2u**self.alpha)
        Eh = self.eta * self.ppb * gpb * self.tau
        return Eh

    '''
    State = [ois_1, dis, eus]
    Action = [pis, fus]
    '''

    def step(self, state, action):
        # will check this env.
        # extract information from state
        ois_1 = state[0]  # first state is randomly
        Dis = state[1] * (self.D[1]-self.D[0]) + self.D[0]
        Eus = state[2]

        # extract information from state
        # pis = action[0] * (self.pis[1]-self.pis[0]) + self.pis[0]
        fus = action[0] * (self.FU[1]-self.FU[0]) + self.FU[0]
        fli = action[1] * (self.FI[1]-self.FI[0]) + self.FI[0]
        ois = action[2]

        # need adjust Fi
        # Fi = self.FU
        # fli = self.

        Elis, tlis = self.local(fli, ois_1, Dis)
        Eexe, texe = self.computing(fus, ois_1, Dis)
                
        # --------------- reward ------------------
        alarm = (tlis < self.ts) | (texe < self.ts)
        # print(f'in step alarm {alarm}')
        tlis, texe = tlis[alarm], texe[alarm]

        # print(f'in step raward sum(Elis+Eexe) {np.sum(Elis + Eexe)} \n w*tlis {self.w * abs(tlis-self.ts)} \n w*texe {self.w * abs(texe-self.ts)} \n')
        # print(f'in step shapes about \n Elis {np.shape(Elis)} \n Eexe {np.shape(Eexe)} \n  tlis {np.shape(tlis)}  \ texe {np.shape(texe)}')
        
        reward = np.sum(Elis + (Eexe)) + self.w * np.sum(abs(tlis-self.ts)) + self.w * np.sum(abs(texe-self.ts))

    
        # print(f'in step reward {reward} \n')
        # ---------------  update state ---------------
        ois_nxt = ois
        Dis_nxt = np.random.uniform(self.D[0], self.D[1], self.N)
        # print(f'in step Dis_nxt: {Dis_nxt} \n')
        Eus_nxt = Eus - np.sum(Eexe) + self.Eh
        Eus_nxt = np.clip(Eus_nxt, self.Es[0], self.Es[1])
        Eus_nxt = (Eus_nxt - self.Es[0]) / (self.Es[1]-self.Es[0])
        # Eh

        next_state = [ois_nxt, Dis_nxt, Eus_nxt]
        # update state
        return reward, next_state

# ------------------------------------agent----------------------------


In [234]:
def rand_agent(env, state):
    ois_1 = state[0]
    Dis = state[1]
    Eus = state[2]

    alarm = True
    while alarm:
        FUmax = env.FU[1] * 0.9
        FUmin = env.FU[0] * 0.9
        FImax = env.FI[1] * 0.9
        FImin = env.FI[0] * 0.9

        fus = np.random.rand(env.N) * (FUmax-FUmin) + FUmin
        fli = np.random.rand(env.N) * (FImax-FImin) + FImin
        ois = np.random.rand(env.N)

        tlis = env.local(fli, ois_1, Dis)[1]
        Eexe, texe = env.computing(fus, ois_1, Dis)[0], env.computing(fus, ois_1, Dis)[1]

        alarm = bool((tlis < env.ts) | (texe < env.ts) & (Eexe < Eus))

    action = [fus, fli, ois]
    return action


class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape))
        # Store x into x_prev
        # Makes next noise dependent on current one
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)


class DDPG():
    def __init__(self, env, buffer_capacity=1000, learning_rate=0.01, batch_size=32, discount=0.9):
        self.env = env
        self.state_dim = 2 * env.N + 1  # why add 2? -> because it is vector
        self.action_dim = 2 * env.N + 1
        self.actor_learning_rate = learning_rate
        self.critic_learning_rate = 2*learning_rate
        self.gamma = discount
        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.train_start = 10
        self.xi = 0.001  # for update target networks
        self.kn_init = tf.keras.initializers.RandomUniform(
            minval=-0.1, maxval=0.1)

        # ---- Replay memory --------------------------------------------------
        self.buffer = deque(maxlen=self.buffer_capacity)

        # ---- Creat a noise process ------------------------------------------
        self.mean = 0.0
        self.std = 0.01
        self.epsilon = 1
        self.epsilon_decay = 0.9995
        self.epsilon_min = 0.01
        self.noise = OUActionNoise(mean=self.mean*np.ones(self.action_dim),
                                   std_deviation=self.std*np.ones(self.action_dim))

        # ---- Create actor and critic ----------------------------------------
        self.actor = self.get_actor()
        self.target_actor = self.get_actor()
        self.critic = self.get_critic()
        self.target_critic = self.get_critic()

        # Make the weights equal initially
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        self.actor_optimizer = tf.keras.optimizers.Adam(
            self.actor_learning_rate)
        self.critic_optimizer = tf.keras.optimizers.Adam(
            self.critic_learning_rate)

    def record(self, obs_tuple):
        # Saves experience tuple (s,a,r,s') in the replay memory
        self.buffer.append(obs_tuple)

    def get_actor(self):
        inputs = layers.Input(shape=(self.state_dim,))
        hidden = layers.Dense(128, activation="relu")(inputs)
        hidden = layers.Dense(128, activation="relu")(hidden)

        # action = [ut, fnt, pnt, bnt]
        action = layers.Dense(
            self.action_dim, activation="sigmoid", kernel_initializer=self.kn_init)(hidden)

        # Outputs actions
        model = keras.Model(inputs=inputs, outputs=action)
        return model

    def get_critic(self):
        # State as input
        state_input = layers.Input(shape=(self.state_dim,))
        state_out = layers.Dense(128, activation="relu")(state_input)
        state_out = layers.Dense(128, activation="relu")(state_out)

        # Action as input
        action_input = layers.Input(shape=(self.action_dim,))
        action_out = layers.Dense(128, activation="relu")(action_input)
        action_out = layers.Dense(128, activation="relu")(action_out)

        # Both are passed through seperate layer before concatenating
        concat = layers.Concatenate()([state_out, action_out])

        hidden = layers.Dense(256, activation="relu")(concat)
        hidden = layers.Dense(256, activation="relu")(hidden)
        Qvalue = layers.Dense(1)(hidden)

        # Outputs Q-value for given state-action
        model = keras.Model(inputs=[state_input, action_input], outputs=Qvalue)
        return model

    def convert_to_vector(self, var_in):
        # Convert a state into a vector for Tensorflow process
        out = np.empty(0)
        for var in var_in:
            out = np.append(out, np.reshape(var, (1, -1)))
        return out

    def policy(self, state, scheme='Proposed'):
        # Return an action sampled from the actor DNN plus some noise for exploration
        # Convert the state into a vector, then to a tensor
        state_vector = self.convert_to_vector(state)
        tf_state = tf.expand_dims(tf.convert_to_tensor(state_vector), 0)

        # Sample action from the actor, and add noise to the sampled actions
        sampled_action = tf.squeeze(self.actor(tf_state))
        sampled_action = sampled_action.numpy() + self.epsilon * self.noise()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

         # Make sure actions are within bounds
        action = np.clip(sampled_action, 0, 1)

        # actions = [ut, fnt, pnt, bnt]

        # will change below
        fus = action[0:1]
        fli = action[1:self.env.N+1]
        ois = action[self.env.N+1:]
        # print(f'in policy action shape: {np.shape(action)} \n')
        # print(f'in  policy action {action}\n')
        # print(f'in policy fus: {fus} \n fli: {fli} \n ois: {ois} \n')
        # print(f'in policy \n fus shape: {np.shape(fus)} \n fli shape: {np.shape(fli)} \n ois shape: {np.shape(ois)} \n')

        if scheme == 'Uniform':
            fus = 0.9*np.random.rand(1)
            fli = 0.9*np.random.rand(self.env.N)
            ois = np.random.rand(0, 1, self.env.N)

        if scheme == 'Random':
            fus = 0.9*np.random.rand(1)
            fli = np.random.rand(1, 5, self.env.N)
            ois = np.random.rand(0, 1, self.env.N)

        return [fus, fli, ois]

     # Use tf.function to speed up blocks of code that contain many small TensorFlow operations.
    @tf.function
    def update(self, state_batch, action_batch, reward_batch, state_next_batch):
        # Train the actor and the critic
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(state_next_batch, training=True)
            y = reward_batch + self.gamma * \
                self.target_critic(
                    [state_next_batch, target_actions], training=True)
            critic_value = self.critic(
                [state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))
        critic_grad = tape.gradient(
            critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor(state_batch, training=True)
            critic_value = self.critic([state_batch, actions], training=True)
            # Used `-value` as we want to maximize the value given by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))

    @tf.function
    def update_target(self):
        for (a, b) in zip(self.target_actor.variables, self.actor.variables):
            a.assign(b * self.xi + a * (1 - self.xi))

        for (c, d) in zip(self.target_critic.variables, self.critic.variables):
            c.assign(d * self.xi + c * (1 - self.xi))

    def update_model(self):
        # Select random samples from the buffer to train the actor and the critic
        if len(self.buffer) < self.train_start:
            return

        indices = np.random.choice(len(self.buffer), self.batch_size)
        state_batch, action_batch, reward_batch, state_next_batch = [], [], [], []
        for i in indices:
            state_batch.append(self.convert_to_vector(self.buffer[i][0]))
            action_batch.append(self.convert_to_vector(self.buffer[i][1]))
            reward_batch.append(self.buffer[i][2])
            state_next_batch.append(self.convert_to_vector(self.buffer[i][3]))

        # Convert to tensors
        state_batch = tf.convert_to_tensor(state_batch)
        action_batch = tf.convert_to_tensor(action_batch)
        reward_batch = tf.convert_to_tensor(reward_batch)
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        state_next_batch = tf.convert_to_tensor(state_next_batch)

        # Update parameters
        self.update(state_batch, action_batch, reward_batch, state_next_batch)

In [235]:
# ------------------------------------convergence------------------------------------
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

total_episodes = 450
max_step_per_episode = 450
max_running_times = 2
learning_rates = [0.001] # 0.003,0.01

# def fus_txt(ep,action):
#     fus_li = []
#     ac_fus = np.zeros(total_episodes)
#     if action[0] == 'None':
#         pass
#     else: 
#         fus_li.append(action[0])
    
#     ac_fus[ep] = np.mean(fus_li)
#     return ac_fus


def train(learning_rates=learning_rates, iterations=1):
    "Convergence performance with different learning rates."
    # Create an environment
    env = UAV()
    
    for learning_rate in learning_rates:
        output = './convergence_learning_rate_' + str(learning_rate) + '.txt'
        output_fus = './convergence_fus_' + str(learning_rate) + '.txt'
        
        # Run several times and get the average results
        count = 1
        iteration = 1
        while iteration <= iterations:
            tf.keras.backend.clear_session()
            print("\n====== Learning rate ====== :", learning_rate)
            print("------ Iteration: {}/{}".format(iteration,iterations))
            
            # Employ a new agent
            agent = DDPG(env, learning_rate=learning_rate)
            
            # Train the ddpg agent
            ep_reward_list = []
            fus_li = []
            avg_reward = np.zeros(total_episodes)
            ac_fus = np.zeros(total_episodes)
            # ac_fus = np.zeros(total_episodes)
            fault = 0
            for ep in range(total_episodes):
                state = env.reset()
                episodic_reward = 0
                for _ in range(max_step_per_episode):
                    # print(f'in  train state: {state}')
                    action = agent.policy(state)
                    reward, state_next = env.step(state, action)

                    agent.record((state, action, reward, state_next))
                    agent.update_model()
                    agent.update_target()
                # print(f'in train reward {reward} \n episodic_reward {episodic_reward} \n max_step_per_episode {max_step_per_episode} \n')
                    episodic_reward += reward
                    state = state_next
                ep_reward_list.append(episodic_reward/max_step_per_episode) #
                # print(f'in train mean reward {episodic_reward}') # / max_step_per_episode
               
                nor_ep_reward = (ep_reward_list-min(ep_reward_list)) / (max(ep_reward_list) - min(ep_reward_list))
               
                avg_reward[ep] = np.mean(nor_ep_reward)

                if action[0] == 'None':
                    pass
                else: 
                    fus_li.append(action[0])

                ac_fus[ep] = np.mean(fus_li)

                # fault = fault + 1 if avg_reward[ep] > avg_reward[ep-1]-10 else 0
                print(" Ep. {}  *  Avg Reward => {:.3f}".format(ep, avg_reward[ep]))
                # if fault == 5:
                    # print('=====>>>> Restart the training loop <<<<=====')
                    # break
                # else:
                # if not(os.path.isfile(output)):
                #     np.savetxt(output, avg_reward,  fmt='%.3f', delimiter=',')
                # else:
                #     R = np.loadtxt(output, delimiter=',').reshape((-1,total_episodes))
                #     # R_r = R.reshape(-1,total_episodes)
                #     temp = np.mean(R, axis=0)
                #     if ((learning_rate==0.01) & (avg_reward[-1] > temp[-1])) or ((learning_rate!=0.01) & (avg_reward[-1] < temp[-1])):
                # R = np.append(R,avg_reward.reshape((1,total_episodes)),axis=0)
                np.savetxt(output, avg_reward,  fmt='%.3f', delimiter=',')
                np.savetxt(output_fus, ac_fus, fmt='%.8f', delimiter=',')
                    # else:
                    #     if count < max_running_times:
                    #         count += 1
                    #         # print("Result is not satisfied ==> Run again.")
                    #         continue
                    #     else:
                    #         count = 1
                iteration += 1

def plot(learning_rates=learning_rates):
    # Create a figure and its twin.
    fig, ax = plt.subplots()
    # axins = zoomed_inset_axes(ax, zoom=25, loc='upper right', bbox_to_anchor=([235,215]))
    
    ticks = np.append(np.arange(0,100,20),[99])
    ticklabels = np.append([1],np.arange(20,100+1,20))
    marks = np.concatenate((np.arange(0,100,step=10),[99])).tolist()
    lines = cycle(["o-","s--","d-.","*:"])
    for i in range(len(learning_rates)):
        line_style = next(lines)
        output = './convergence_learning_rate_' + str(learning_rates[i]) + '.txt'

        R = np.loadtxt(output, delimiter=',').reshape((-1,total_episodes))
        R = np.mean(R, axis=0)
        ax.plot(R, line_style, label='Learning rate = {}'.format(learning_rates[i]), markevery=marks)
        # axins.plot(R, line_style)
    
    ax.set_ylim(100,450)
    ax.set_xticks(ticks)
    ax.set_xticklabels(ticklabels)
    ax.legend()
    ax.grid()
    ax.set_xlabel('Episode')
    ax.set_ylabel('Average reward')
    
    # axins.set_xlim(79.4, 80.6) # apply the x-limits
    # axins.set_ylim(-2.21, -1.99)    # apply the y-limits
    mark_inset(ax, loc1=2, loc2=4, fc="none", ec="0.5")
    # axins.set_xticks([])
    # axins.set_yticks([])
    
    plt.savefig('./convergence_learning_rate.pdf', bbox_inches='tight')

import sys
sys.argv=['']
del sys

if __name__ == "__main__":
    import argparse
    # Set the input argument
    parser = argparse.ArgumentParser(description='Convergence analysis')
    parser.add_argument("-lr","--learning_rate", type=float, nargs='+', default=learning_rates, 
                        help="Learning rate of the proposed algorithm")
    parser.add_argument("-it","--iteration", type=int, default=1, help="number of training iteration.")
    
    # Get the input argument
    args = parser.parse_args()
    learning_rates = args.learning_rate
    iterations = args.iteration
    
    # Use the argument in function
    train(learning_rates=learning_rates, iterations=iterations)
    # plot(learning_rates=learning_rates)


------ Iteration: 1/1




 Ep. 0  *  Avg Reward => nan
 Ep. 1  *  Avg Reward => 0.500
 Ep. 2  *  Avg Reward => 0.541
 Ep. 3  *  Avg Reward => 0.512
 Ep. 4  *  Avg Reward => 0.492
 Ep. 5  *  Avg Reward => 0.451
 Ep. 6  *  Avg Reward => 0.414
 Ep. 7  *  Avg Reward => 0.404
 Ep. 8  *  Avg Reward => 0.429
 Ep. 9  *  Avg Reward => 0.505
 Ep. 10  *  Avg Reward => 0.518
 Ep. 11  *  Avg Reward => 0.478
 Ep. 12  *  Avg Reward => 0.473
 Ep. 13  *  Avg Reward => 0.492
 Ep. 14  *  Avg Reward => 0.506
 Ep. 15  *  Avg Reward => 0.506
 Ep. 16  *  Avg Reward => 0.499
 Ep. 17  *  Avg Reward => 0.486
 Ep. 18  *  Avg Reward => 0.491
 Ep. 19  *  Avg Reward => 0.490
 Ep. 20  *  Avg Reward => 0.491
 Ep. 21  *  Avg Reward => 0.483
 Ep. 22  *  Avg Reward => 0.489
 Ep. 23  *  Avg Reward => 0.508
 Ep. 24  *  Avg Reward => 0.465
 Ep. 25  *  Avg Reward => 0.463
 Ep. 26  *  Avg Reward => 0.474
 Ep. 27  *  Avg Reward => 0.485
 Ep. 28  *  Avg Reward => 0.488
 Ep. 29  *  Avg Reward => 0.490
 Ep. 30  *  Avg Reward => 0.497
 Ep. 31  *  Avg Rewa