In [1]:
import pandas as pd
import numpy as np
import random
import sys
import os
import pdb
import tensorflow as tf
import matplotlib.pyplot as plt
from copy import deepcopy
import time

from keras.models import Sequential, Model
import keras.layers as layers
from keras.optimizers import Adam
from keras import backend as K
from keras.layers.merge import _Merge, Multiply
import ast
import gym

# One seed shared by every stochastic component of the notebook.
random_state = 0

# Seed Python's built-in RNG and NumPy's legacy global RNG with the same value.
random.seed(random_state)
np.random.seed(random_state)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# Agent

In [2]:
class QLayer(_Merge):
    """
    Merge layer that combines a value stream and an advantage stream.

    Only used by the dueling DQN architecture.
    """
    def _merge_function(self, inputs):
        """
        Combine the two streams into one tensor.

        `inputs` is expected in the order [value, advantage]. The advantage is
        centred by subtracting its mean along axis 1 (kept as a dimension so it
        broadcasts) before the value is added.
        """
        value_stream = inputs[0]
        advantage_stream = inputs[1]
        centred_advantage = advantage_stream - K.mean(advantage_stream, axis=1, keepdims=True)
        return value_stream + centred_advantage

class DQNAgent:
    """
    DQN agent trained from a fixed DataFrame of recorded transitions.

    Optional extensions, all selected through the constructor:
      * dueling network head (``dueling``),
      * random role swapping between the two networks (``double_param``),
      * priority-weighted sampling of transitions (``priority_alpha``).

    Two networks are kept. ``model_1`` is the one trained by default;
    ``model_2`` is the one queried by ``act`` and is periodically overwritten
    with ``model_1``'s weights.

    The batch DataFrame is read through the columns ``state``, ``action``,
    ``reward``, ``next_state``, ``done`` and ``transition_id``; the agent
    writes ``weight``, ``priority``, ``pred_action`` and ``pred_reward``.
    """

    def __init__(self, df_batch, state_size, action_size, 
                 minibatch_size=64, gamma=.9, lr=0.0001, units=128, hidden_layers=1,
                 dueling=False, double_param=0, priority_alpha=0,
                 copy_online_to_target_ep=100, eval_after=100):
        """
        Parameters
        ----------
        df_batch : pandas.DataFrame
            Transition batch. It is modified in place (columns are added) and
            kept as ``self.batch``.
        state_size : int
            Length of a state vector; used as the network input size.
        action_size : int
            Number of discrete actions; used as the network output size.
        minibatch_size : int
            Number of transitions sampled per training step.
        gamma : float
            Discount factor applied to the bootstrapped next-state value.
        lr : float
            Learning rate passed to the Adam optimizer.
        units : int
            Width of every hidden Dense layer.
        hidden_layers : int
            Number of hidden layers; one hidden layer is always created.
        dueling : bool
            Build the dueling architecture instead of the plain one.
        double_param : float
            Threshold compared against ``random.random()`` for every trained
            transition; draws below it swap the roles of the two networks.
        priority_alpha : float
            Exponent applied to the (noisy) TD error to obtain a priority.
            With 0 every priority is 1, i.e. sampling is uniform.
        copy_online_to_target_ep : int
            Every this many training steps ``model_2`` receives ``model_1``'s weights.
        eval_after : int
            Every this many training steps the agent is evaluated.
        """
        # Initial sampling columns. Column-wise assignment replaces the
        # original `.at[:, ...]` (a scalar accessor that rejects slices in
        # current pandas) and the per-row iterrows loop.
        df_batch['weight'] = 0.0
        # Small uniform noise keeps every priority strictly positive.
        df_batch['priority'] = np.random.uniform(0, 0.001, size=len(df_batch)) ** priority_alpha

        # setting parameters
        self.state_size = state_size
        self.action_size = action_size
        self.batch = df_batch

        self.minibatch_size = minibatch_size
        self.gamma = gamma
        self.learning_rate = lr
        self.units = units
        self.hidden_layers = hidden_layers

        self.dueling = dueling
        self.double_param = double_param
        self.priority_alpha = priority_alpha

        self.copy_online_to_target_ep = copy_online_to_target_ep
        self.eval_after = eval_after

        # setting up the models
        build = self._build_model_dueling if self.dueling else self._build_model
        self.model_1 = build()
        self.model_2 = build()

        # evaluation variables
        self.R = []      # environment returns, one entry per evaluation
        self.ecrs = []   # ECR values, one entry per evaluation

    def _build_model_dueling(self):
        """
        Dueling DQN model: shared hidden layers feeding a 1-unit value head and
        an ``action_size``-unit advantage head, merged by ``QLayer``.
        """
        inputs = layers.Input(shape=(self.state_size,))
        z = layers.Dense(self.units, kernel_initializer='glorot_normal', activation='relu')(inputs)
        for _ in range(self.hidden_layers-1):
            z = layers.Dense(self.units, kernel_initializer='glorot_normal', activation='relu')(z)

        value = layers.Dense(1, kernel_initializer='glorot_normal', activation='linear')(z)
        adv = layers.Dense(self.action_size, kernel_initializer='glorot_normal', activation='linear')(z)

        q = QLayer()([value, adv])

        model = Model(inputs=inputs, outputs=q)
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def _build_model(self):
        """
        Standard DQN model: a stack of ReLU Dense layers with a linear output
        of size ``action_size``.
        """
        model = Sequential()

        model.add(layers.Dense(self.units, input_dim=self.state_size, activation='relu', kernel_initializer='glorot_normal'))
        for _ in range(self.hidden_layers-1):
            model.add(layers.Dense(self.units, activation='relu', kernel_initializer='glorot_normal'))

        model.add(layers.Dense(self.action_size, activation='linear', kernel_initializer='glorot_normal'))
        # NOTE(review): the RMSE metric comes from tf.keras while the model is
        # built with standalone keras, and the dueling model is compiled
        # without metrics -- confirm this mix is intended.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate), metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])
        return model

    def act(self, state):
        """
        Greedy action for one state according to ``model_2``.

        ``state`` may be any array-like of length ``state_size``.
        Returns ``(action_index, q_value)``.
        """
        # asarray lets plain lists/tuples through (the original required an ndarray).
        state_array = np.asarray(state).reshape(1, self.state_size)
        act_values = self.model_2.predict(state_array)
        return np.argmax(act_values[0]), np.max(act_values[0])

    def learn(self, epoch, env=None):
        """
        Run ``epoch`` minibatch training steps, then a final evaluation.

        Every ``copy_online_to_target_ep`` steps ``model_2`` receives
        ``model_1``'s weights; every ``eval_after`` steps the return in ``env``
        and the ECR are recorded and printed. After the loop the weights are
        synchronised once more, predictions are written into the batch and one
        more evaluation is appended (even when the last step was already
        evaluated).
        """
        for i in range(epoch):
            self._learn_minibatch()

            if (i+1)%self.copy_online_to_target_ep==0:
                self.model_2.set_weights(self.model_1.get_weights())

            if (i+1)%self.eval_after==0:
                r = self.run_env(env)
                self.R.append(r)
                ecr = self.ecr_reward()
                print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(i+1, epoch, ecr, r))

        print("--final run--")
        # Copy the weights rather than rebinding the attribute: rebinding would
        # leave both names pointing at one network, so a later learn() call
        # would have no separate second network any more.
        self.model_2.set_weights(self.model_1.get_weights())
        r = self.run_env(env)
        self.R.append(r)
        self.predict()
        ecr = self.ecr_reward()
        # Report `epoch` itself; the loop variable is undefined when epoch == 0.
        print("--epoch: {}/{} | ECR: {:.5f} | R: {:.2f} --".format(epoch, epoch, ecr, r))

    def _learn_row(self, row):
        """
        Train on a single transition.

        ``row`` is one row of the batch; ``row.name`` must be its index label
        in ``self.batch`` because the priority and prediction columns are
        written back under that label.
        """
        i = row.name
        state, action, reward, next_state, done = row['state'], row['action'], row['reward'], row['next_state'], row['done']
        action = int(action)  # used as an array index below

        # One random draw per transition decides which network is trained.
        # `learner` picks the next action and is fitted; `evaluator` supplies
        # the value of that action.
        if random.random() >= self.double_param:
            learner, evaluator = self.model_1, self.model_2
        else:
            learner, evaluator = self.model_2, self.model_1

        state_input = np.asarray(state).reshape(1, self.state_size)

        target_q = reward
        if not done:
            next_input = np.asarray(next_state).reshape(1, self.state_size)
            a_prime = np.argmax(learner.predict(next_input)[0])
            target_ns_q = evaluator.predict(next_input)[0][a_prime]

            target_q = reward + self.gamma*target_ns_q

            self.batch.at[i, 'pred_action'] = a_prime
            self.batch.at[i, 'pred_reward'] = target_q

        target_f = learner.predict(state_input)
        # Prioritized experience replay: noisy absolute TD error raised to alpha.
        self.batch.loc[i, 'priority'] = (abs(target_q - target_f[0][action]) + np.random.uniform(0, 0.001))**self.priority_alpha

        # Only the taken action's output is moved towards the target.
        target_f[0][action] = target_q
        learner.fit(state_input, target_f, epochs=1, verbose=0)

    def _learn_minibatch(self):
        """
        Sample a minibatch with probability proportional to ``priority`` and
        train on every sampled transition.
        """
        # For PER
        priority_sum = self.batch['priority'].sum()
        self.batch['weight'] = self.batch['priority']/priority_sum

        # Sampling is without replacement, so never ask for more rows than exist.
        sample_size = min(self.minibatch_size, len(self.batch))
        minibatch = self.batch.sample(sample_size, weights=self.batch['weight'])
        # Explicit loop instead of DataFrame.apply: _learn_row trains a network
        # and writes to self.batch, and apply gives no guarantee about how
        # often it invokes a function with side effects.
        for _, row in minibatch.iterrows():
            self._learn_row(row)

    def ecr_reward(self):
        """
        Mean greedy Q-value over all rows with ``transition_id == 1``.

        The value is appended to ``self.ecrs`` and returned. When no such row
        exists the result is NaN instead of a division-by-zero error.
        """
        first_states = self.batch.loc[self.batch['transition_id']==1, 'state']

        total = 0.0
        count = 0
        for state in first_states:
            total += float(self.act(state)[1])
            count += 1

        ecr = total/count if count else float('nan')
        self.ecrs.append(ecr)
        return ecr

    def _predict_row(self, row):
        """
        Store the greedy action and its Q-value for one batch row in the
        ``pred_action`` / ``pred_reward`` columns.
        """
        i = row.name
        act, q = self.act(row['state'])
        self.batch.loc[i, 'pred_action'] = act
        self.batch.loc[i, 'pred_reward'] = q

    def predict(self):
        """
        Fill ``pred_action`` / ``pred_reward`` for every row and return the batch.
        """
        # Explicit loop: _predict_row writes into the very frame being
        # traversed, which DataFrame.apply does not support.
        for _, row in self.batch.iterrows():
            self._predict_row(row)

        return self.batch

    def run_env(self, env, np_cast=True):
        """
        Play one greedy episode in ``env`` and return its total reward.

        Returns 0 when ``env`` is None. ``env`` must follow the gym API used
        here: ``reset()`` returns a state and ``step(action)`` returns
        ``(next_state, reward, done, info)``. With ``np_cast`` the state is
        wrapped in ``np.array`` before being passed to ``act``.
        """
        if env is None:
            return 0
        state = env.reset()
        total_reward = 0
        while True:
            if np_cast:
                action = self.act(np.array(state))[0]
            else:
                action = self.act(state)[0]
            next_state, reward, done, info = env.step(action)
            total_reward += reward
            state = next_state
            if done:
                # The environment is reset again before returning.
                env.reset()
                return total_reward

    def get_all_eval_df(self):
        """
        Return the recorded evaluations as a DataFrame with columns ``ECR`` and ``R``.
        """
        return pd.DataFrame({'ECR': self.ecrs, 'R': self.R})


# RUN

In [3]:
# SOME FORMATTING ISSUES WITH CSV
# The CSV stores each state as the text of a Python literal; turn both state
# columns back into NumPy arrays. Column-wise map replaces the row-by-row
# iterrows/.at loop.
df = pd.read_csv('../data/CartPole-v1_10k.csv')
for state_col in ('state', 'next_state'):
    df[state_col] = df[state_col].map(lambda text: np.array(ast.literal_eval(text)))

# Drop terminal transitions whose delayed reward is 0.
# `dlt_lst` keeps holding the dropped index labels, as before.
is_zero_reward_terminal = (df['done'] == True) & (df['delayed_reward'] == 0)
dlt_lst = df.index[is_zero_reward_terminal].tolist()

df = (
    df.drop(dlt_lst)
      .sort_values(by=['episode_id', 'transition_id'])
      .reset_index(drop=True)
)

print("Cartpole", "| Total transitions:", len(df), " | Total episodes:", len(df['episode_id'].unique()))
# NOTE(review): the following cell assigns `org_df` again from a gridworld
# pickle, so only one of the two loading cells should be run per session.
org_df = df.copy()



Cartpole | Total transitions: 222307  | Total episodes: 10000


In [3]:
# Imported only for its side effect -- presumably it registers 'gridworld-v0'
# with gym (TODO confirm); the name itself is never used.
import gym_gridworld
# pd.read_pickle executes whatever the pickle contains: only load trusted files.
org_df = pd.read_pickle('../data/gridworld_ndm_10k_fixed.pkl')

result_dir = '../results/'
# Create the output folder up front so a long run cannot fail at save time.
os.makedirs(result_dir, exist_ok=True)
env = gym.make('gridworld-v0', deterministic=False)
epoch = 10000
action_size = 2
env_name = 'gridworld_ndm_fixed_'
random_state = 0
all_episode_ids = org_df['episode_id'].unique()

for ep_size in [1000, 5000, 10000]:
    df_run = org_df.copy()
    if ep_size < len(all_episode_ids):
        # Draw the episode subset without replacement, otherwise duplicates in
        # the draw leave fewer than `ep_size` distinct episodes. A dedicated
        # seeded generator makes the subset independent of whatever the global
        # RNG was used for by earlier runs.
        subset_rng = np.random.RandomState(random_state)
        eps = subset_rng.choice(all_episode_ids, ep_size, replace=False)
        df_run = df_run.loc[df_run['episode_id'].isin(eps)].reset_index(drop=True)

    for reward_type in ['immediate_reward', 'delayed_reward']:
        # The agent always reads the 'reward' column; point it at the selected signal.
        df_run['reward'] = df_run[reward_type]

        # (dueling, double_param, priority_alpha) configurations to compare.
        for dueling, double_param, priority_alpha in [(False, 0, 0), (False, 0, 0.05), (False, 0.5, 0.05), (True, 0.5, 0.05)]:
            df = df_run.copy()

            prefix = env_name + '_' + 'ep_size_' + str(ep_size) + '_' + reward_type + '_' + \
                'dueling_' + str(dueling) + '_double_' + str(double_param) + '_priority_' + \
                str(priority_alpha) + '_' + 'rs_' + str(random_state) + '_'

            # Release the networks of the previous configuration; without this
            # every run adds its models to the same backend graph/session.
            K.clear_session()

            # Re-seed so every configuration starts from the same RNG state.
            np.random.seed(random_state)
            random.seed(random_state)

            print("==" + prefix + "==")
            agent = DQNAgent(df_batch=df, state_size=len(df.iloc[0]['state']), action_size=action_size, 
                             dueling=dueling, double_param=double_param, priority_alpha=priority_alpha,
                             copy_online_to_target_ep=100, eval_after=100)

            agent.learn(epoch, env)

            # Persist the annotated batch and the evaluation curve in both formats.
            result = agent.batch
            eval_df = agent.get_all_eval_df()
            eval_df.to_pickle(result_dir + prefix +'eval.pkl')
            result.to_pickle(result_dir + prefix +'result.pkl')
            eval_df.to_csv(result_dir + prefix +'eval.csv')
            result.to_csv(result_dir + prefix +'result.csv')

            print('==run ends==')


==gridworld_ndm_fixed__ep_size_1000_immediate_reward_dueling_False_double_0_priority_0_rs_0_==

--epoch: 100/10000 | ECR: 0.00868 | R: -3.00 --
--epoch: 200/10000 | ECR: 0.12828 | R: 10.00 --
--epoch: 300/10000 | ECR: 0.24482 | R: 19.00 --
--epoch: 400/10000 | ECR: 0.36409 | R: -1.00 --
--epoch: 500/10000 | ECR: 0.46902 | R: 27.00 --
--epoch: 600/10000 | ECR: 0.58800 | R: 6.00 --
--epoch: 700/10000 | ECR: 0.71022 | R: 4.00 --
--epoch: 800/10000 | ECR: 0.81364 | R: 0.00 --
--epoch: 900/10000 | ECR: 0.93486 | R: 6.00 --
--epoch: 1000/10000 | ECR: 1.05822 | R: 4.00 --
--epoch: 1100/10000 | ECR: 1.22755 | R: -1.00 --
--epoch: 1200/10000 | ECR: 1.35088 | R: 10.00 --
--epoch: 1300/10000 | ECR: 1.40833 | R: 11.00 --
--epoch: 1400/10000 | ECR: 1.52441 | R: 14.00 --
--epoch: 1500/10000 | ECR: 1.60319 | R: 1.00 --
--epoch: 1600/10000 | ECR: 1.67723 | R: 22.00 --
--epoch: 1700/10000 | ECR: 1.76462 | R: 4.00 --
--epoch: 1800/10000 | ECR: 1.84892 | R: 27.00 --
--epoch: 1900/10000 | ECR: 1.94033 | R

--epoch: 6000/10000 | ECR: 4.55613 | R: 27.00 --
--epoch: 6100/10000 | ECR: 4.74649 | R: -1.00 --
--epoch: 6200/10000 | ECR: 4.82611 | R: -1.00 --
--epoch: 6300/10000 | ECR: 4.95974 | R: 26.00 --
--epoch: 6400/10000 | ECR: 5.06763 | R: -1.00 --
--epoch: 6500/10000 | ECR: 5.15870 | R: 31.00 --
--epoch: 6600/10000 | ECR: 5.18963 | R: 4.00 --
--epoch: 6700/10000 | ECR: 5.22274 | R: 4.00 --
--epoch: 6800/10000 | ECR: 5.15165 | R: 26.00 --
--epoch: 6900/10000 | ECR: 4.95308 | R: 0.00 --
--epoch: 7000/10000 | ECR: 5.01894 | R: 26.00 --
--epoch: 7100/10000 | ECR: 5.06093 | R: 26.00 --
--epoch: 7200/10000 | ECR: 4.99179 | R: 26.00 --
--epoch: 7300/10000 | ECR: 5.00369 | R: 8.00 --
--epoch: 7400/10000 | ECR: 5.03598 | R: 27.00 --
--epoch: 7500/10000 | ECR: 4.98209 | R: -1.00 --
--epoch: 7600/10000 | ECR: 4.92309 | R: -1.00 --
--epoch: 7700/10000 | ECR: 4.92705 | R: 27.00 --
--epoch: 7800/10000 | ECR: 4.95213 | R: 26.00 --
--epoch: 7900/10000 | ECR: 4.96786 | R: 22.00 --
--epoch: 8000/10000 | EC

--epoch: 2200/10000 | ECR: 2.59352 | R: 8.00 --
--epoch: 2300/10000 | ECR: 2.75692 | R: -1.00 --
--epoch: 2400/10000 | ECR: 2.77813 | R: 15.00 --
--epoch: 2500/10000 | ECR: 2.73800 | R: -4.00 --
--epoch: 2600/10000 | ECR: 2.88567 | R: 0.00 --
--epoch: 2700/10000 | ECR: 3.00775 | R: 0.00 --
--epoch: 2800/10000 | ECR: 2.98005 | R: 0.00 --
--epoch: 2900/10000 | ECR: 3.04497 | R: 5.00 --
--epoch: 3000/10000 | ECR: 3.10688 | R: 4.00 --
--epoch: 3100/10000 | ECR: 3.12643 | R: 27.00 --
--epoch: 3200/10000 | ECR: 3.22872 | R: -4.00 --
--epoch: 3300/10000 | ECR: 3.29675 | R: 14.00 --
--epoch: 3400/10000 | ECR: 3.24910 | R: 11.00 --
--epoch: 3500/10000 | ECR: 3.20742 | R: 4.00 --
--epoch: 3600/10000 | ECR: 3.14958 | R: 4.00 --
--epoch: 3700/10000 | ECR: 3.35723 | R: 4.00 --
--epoch: 3800/10000 | ECR: 3.30307 | R: 3.00 --
--epoch: 3900/10000 | ECR: 3.42581 | R: 4.00 --
--epoch: 4000/10000 | ECR: 3.35159 | R: 0.00 --
--epoch: 4100/10000 | ECR: 3.50239 | R: 4.00 --
--epoch: 4200/10000 | ECR: 3.3856

--epoch: 8700/10000 | ECR: 3.40878 | R: 22.00 --
--epoch: 8800/10000 | ECR: 3.40801 | R: 31.00 --
--epoch: 8900/10000 | ECR: 3.41512 | R: -1.00 --
--epoch: 9000/10000 | ECR: 3.47344 | R: -1.00 --
--epoch: 9100/10000 | ECR: 3.50989 | R: -1.00 --
--epoch: 9200/10000 | ECR: 3.48262 | R: -1.00 --
--epoch: 9300/10000 | ECR: 3.39473 | R: 26.00 --
--epoch: 9400/10000 | ECR: 3.26205 | R: -1.00 --
--epoch: 9500/10000 | ECR: 3.20775 | R: 26.00 --
--epoch: 9600/10000 | ECR: 3.15077 | R: -1.00 --
--epoch: 9700/10000 | ECR: 3.11486 | R: 26.00 --
--epoch: 9800/10000 | ECR: 3.08435 | R: 27.00 --
--epoch: 9900/10000 | ECR: 3.06666 | R: -1.00 --
--epoch: 10000/10000 | ECR: 3.15598 | R: 26.00 --
--final run--
--epoch: 10000/10000 | ECR: 3.15598 | R: -1.00 --
==run ends==
==gridworld_ndm_fixed__ep_size_1000_delayed_reward_dueling_False_double_0_priority_0.05_rs_0_==
--epoch: 100/10000 | ECR: -0.18092 | R: -4.00 --
--epoch: 200/10000 | ECR: -0.22841 | R: -4.00 --
--epoch: 300/10000 | ECR: -0.15241 | R: 0.

--epoch: 4800/10000 | ECR: 2.33407 | R: 4.00 --
--epoch: 4900/10000 | ECR: 2.44512 | R: -8.00 --
--epoch: 5000/10000 | ECR: 2.54057 | R: -4.00 --
--epoch: 5100/10000 | ECR: 2.34783 | R: -4.00 --
--epoch: 5200/10000 | ECR: 2.20318 | R: 26.00 --
--epoch: 5300/10000 | ECR: 2.22489 | R: 0.00 --
--epoch: 5400/10000 | ECR: 2.45581 | R: -1.00 --
--epoch: 5500/10000 | ECR: 2.41328 | R: -4.00 --
--epoch: 5600/10000 | ECR: 2.38372 | R: -1.00 --
--epoch: 5700/10000 | ECR: 2.58118 | R: -4.00 --
--epoch: 5800/10000 | ECR: 2.57475 | R: 0.00 --
--epoch: 5900/10000 | ECR: 2.72227 | R: -4.00 --
--epoch: 6000/10000 | ECR: 2.70995 | R: 0.00 --
--epoch: 6100/10000 | ECR: 2.79277 | R: -4.00 --
--epoch: 6200/10000 | ECR: 2.56429 | R: -1.00 --
--epoch: 6300/10000 | ECR: 2.76358 | R: 27.00 --
--epoch: 6400/10000 | ECR: 2.79483 | R: -4.00 --
--epoch: 6500/10000 | ECR: 2.71226 | R: -1.00 --
--epoch: 6600/10000 | ECR: 2.85032 | R: -1.00 --
--epoch: 6700/10000 | ECR: 2.78105 | R: -1.00 --
--epoch: 6800/10000 | EC

--epoch: 900/10000 | ECR: 1.14296 | R: 16.00 --
--epoch: 1000/10000 | ECR: 1.30378 | R: 4.00 --
--epoch: 1100/10000 | ECR: 1.45319 | R: 4.00 --
--epoch: 1200/10000 | ECR: 1.58584 | R: 4.00 --
--epoch: 1300/10000 | ECR: 1.74396 | R: 4.00 --
--epoch: 1400/10000 | ECR: 1.88270 | R: 13.00 --
--epoch: 1500/10000 | ECR: 2.01808 | R: 8.00 --
--epoch: 1600/10000 | ECR: 2.09907 | R: 13.00 --
--epoch: 1700/10000 | ECR: 2.20342 | R: 27.00 --
--epoch: 1800/10000 | ECR: 2.28916 | R: 4.00 --
--epoch: 1900/10000 | ECR: 2.39666 | R: 27.00 --
--epoch: 2000/10000 | ECR: 2.48151 | R: 8.00 --
--epoch: 2100/10000 | ECR: 2.55994 | R: 9.00 --
--epoch: 2200/10000 | ECR: 2.68116 | R: 14.00 --
--epoch: 2300/10000 | ECR: 2.78227 | R: 4.00 --
--epoch: 2400/10000 | ECR: 2.92297 | R: 19.00 --
--epoch: 2500/10000 | ECR: 3.00828 | R: 31.00 --
--epoch: 2600/10000 | ECR: 3.09073 | R: -1.00 --
--epoch: 2700/10000 | ECR: 3.12060 | R: 14.00 --
--epoch: 2800/10000 | ECR: 3.19206 | R: 9.00 --
--epoch: 2900/10000 | ECR: 3.20

--epoch: 7500/10000 | ECR: 4.92276 | R: 19.00 --
--epoch: 7600/10000 | ECR: 5.00950 | R: 0.00 --
--epoch: 7700/10000 | ECR: 5.04926 | R: 13.00 --
--epoch: 7800/10000 | ECR: 5.23350 | R: 13.00 --
--epoch: 7900/10000 | ECR: 5.32191 | R: 31.00 --
--epoch: 8000/10000 | ECR: 5.35211 | R: 16.00 --
--epoch: 8100/10000 | ECR: 5.46809 | R: 4.00 --
--epoch: 8200/10000 | ECR: 5.54229 | R: 20.00 --
--epoch: 8300/10000 | ECR: 5.63201 | R: 14.00 --
--epoch: 8400/10000 | ECR: 5.61391 | R: 13.00 --
--epoch: 8500/10000 | ECR: 5.56604 | R: 8.00 --
--epoch: 8600/10000 | ECR: 5.54294 | R: 8.00 --
--epoch: 8700/10000 | ECR: 5.48690 | R: 14.00 --
--epoch: 8800/10000 | ECR: 5.41591 | R: 4.00 --
--epoch: 8900/10000 | ECR: 5.33996 | R: 13.00 --
--epoch: 9000/10000 | ECR: 5.41972 | R: 4.00 --
--epoch: 9100/10000 | ECR: 5.54044 | R: 13.00 --
--epoch: 9200/10000 | ECR: 5.55515 | R: 13.00 --
--epoch: 9300/10000 | ECR: 5.53011 | R: 14.00 --
--epoch: 9400/10000 | ECR: 5.50891 | R: 8.00 --
--epoch: 9500/10000 | ECR: 

--epoch: 3700/10000 | ECR: 3.64765 | R: 8.00 --
--epoch: 3800/10000 | ECR: 3.55765 | R: 13.00 --
--epoch: 3900/10000 | ECR: 3.72079 | R: 4.00 --
--epoch: 4000/10000 | ECR: 3.81308 | R: 27.00 --
--epoch: 4100/10000 | ECR: 3.78754 | R: 11.00 --
--epoch: 4200/10000 | ECR: 3.91233 | R: 4.00 --
--epoch: 4300/10000 | ECR: 3.82902 | R: 19.00 --
--epoch: 4400/10000 | ECR: 3.88444 | R: 13.00 --
--epoch: 4500/10000 | ECR: 3.87801 | R: 4.00 --
--epoch: 4600/10000 | ECR: 3.96581 | R: 4.00 --
--epoch: 4700/10000 | ECR: 3.91533 | R: 9.00 --
--epoch: 4800/10000 | ECR: 3.88496 | R: -5.00 --
--epoch: 4900/10000 | ECR: 3.84704 | R: 11.00 --
--epoch: 5000/10000 | ECR: 3.85872 | R: 4.00 --
--epoch: 5100/10000 | ECR: 4.03932 | R: 27.00 --
--epoch: 5200/10000 | ECR: 4.08215 | R: 0.00 --
--epoch: 5300/10000 | ECR: 4.09305 | R: 14.00 --
--epoch: 5400/10000 | ECR: 4.12555 | R: 4.00 --
--epoch: 5500/10000 | ECR: 4.17222 | R: 13.00 --
--epoch: 5600/10000 | ECR: 4.02243 | R: 13.00 --
--epoch: 5700/10000 | ECR: 4.

==run ends==
==gridworld_ndm_fixed__ep_size_5000_delayed_reward_dueling_False_double_0_priority_0.05_rs_0_==
--epoch: 100/10000 | ECR: -0.07685 | R: -1.00 --
--epoch: 200/10000 | ECR: -0.03225 | R: 22.00 --
--epoch: 300/10000 | ECR: 0.05753 | R: 0.00 --
--epoch: 400/10000 | ECR: 0.11756 | R: -1.00 --
--epoch: 500/10000 | ECR: 0.20587 | R: -1.00 --
--epoch: 600/10000 | ECR: 0.35764 | R: -4.00 --
--epoch: 700/10000 | ECR: 0.51674 | R: -4.00 --
--epoch: 800/10000 | ECR: 0.60045 | R: -1.00 --
--epoch: 900/10000 | ECR: 0.67507 | R: 23.00 --
--epoch: 1000/10000 | ECR: 0.77127 | R: -1.00 --
--epoch: 1100/10000 | ECR: 0.92412 | R: -4.00 --
--epoch: 1200/10000 | ECR: 1.04963 | R: -4.00 --
--epoch: 1300/10000 | ECR: 1.12933 | R: 27.00 --
--epoch: 1400/10000 | ECR: 1.31848 | R: -1.00 --
--epoch: 1500/10000 | ECR: 1.45461 | R: -1.00 --
--epoch: 1600/10000 | ECR: 1.60899 | R: -4.00 --
--epoch: 1700/10000 | ECR: 1.71140 | R: -1.00 --
--epoch: 1800/10000 | ECR: 1.78741 | R: 0.00 --
--epoch: 1900/1000

--epoch: 6400/10000 | ECR: 2.55426 | R: -4.00 --
--epoch: 6500/10000 | ECR: 2.58007 | R: -1.00 --
--epoch: 6600/10000 | ECR: 2.55887 | R: -1.00 --
--epoch: 6700/10000 | ECR: 2.46444 | R: 26.00 --
--epoch: 6800/10000 | ECR: 2.47081 | R: 0.00 --
--epoch: 6900/10000 | ECR: 2.38216 | R: -1.00 --
--epoch: 7000/10000 | ECR: 2.44039 | R: 4.00 --
--epoch: 7100/10000 | ECR: 2.40774 | R: -4.00 --
--epoch: 7200/10000 | ECR: 2.48514 | R: -1.00 --
--epoch: 7300/10000 | ECR: 2.40974 | R: 31.00 --
--epoch: 7400/10000 | ECR: 2.40397 | R: -1.00 --
--epoch: 7500/10000 | ECR: 2.38843 | R: 22.00 --
--epoch: 7600/10000 | ECR: 2.39794 | R: -4.00 --
--epoch: 7700/10000 | ECR: 2.42306 | R: 23.00 --
--epoch: 7800/10000 | ECR: 2.37116 | R: 31.00 --
--epoch: 7900/10000 | ECR: 2.35593 | R: 0.00 --
--epoch: 8000/10000 | ECR: 2.45956 | R: 0.00 --
--epoch: 8100/10000 | ECR: 2.53719 | R: -4.00 --
--epoch: 8200/10000 | ECR: 2.52116 | R: 4.00 --
--epoch: 8300/10000 | ECR: 2.61404 | R: -4.00 --
--epoch: 8400/10000 | ECR

--epoch: 2600/10000 | ECR: 2.73888 | R: 19.00 --
--epoch: 2700/10000 | ECR: 2.78915 | R: -1.00 --
--epoch: 2800/10000 | ECR: 2.87582 | R: 27.00 --
--epoch: 2900/10000 | ECR: 2.87824 | R: 0.00 --
--epoch: 3000/10000 | ECR: 2.88417 | R: 4.00 --
--epoch: 3100/10000 | ECR: 2.89431 | R: 27.00 --
--epoch: 3200/10000 | ECR: 2.88957 | R: 26.00 --
--epoch: 3300/10000 | ECR: 2.93421 | R: -1.00 --
--epoch: 3400/10000 | ECR: 3.07862 | R: -1.00 --
--epoch: 3500/10000 | ECR: 3.19285 | R: 31.00 --
--epoch: 3600/10000 | ECR: 3.26893 | R: -1.00 --
--epoch: 3700/10000 | ECR: 3.41677 | R: 0.00 --
--epoch: 3800/10000 | ECR: 3.54301 | R: 4.00 --
--epoch: 3900/10000 | ECR: 3.70476 | R: 4.00 --
--epoch: 4000/10000 | ECR: 3.74458 | R: 27.00 --
--epoch: 4100/10000 | ECR: 3.80556 | R: 0.00 --
--epoch: 4200/10000 | ECR: 3.85763 | R: 31.00 --
--epoch: 4300/10000 | ECR: 3.89863 | R: 13.00 --
--epoch: 4400/10000 | ECR: 3.88573 | R: 13.00 --
--epoch: 4500/10000 | ECR: 3.87746 | R: 4.00 --
--epoch: 4600/10000 | ECR: 

--epoch: 9200/10000 | ECR: 4.39274 | R: 14.00 --
--epoch: 9300/10000 | ECR: 4.43836 | R: 31.00 --
--epoch: 9400/10000 | ECR: 4.42756 | R: 27.00 --
--epoch: 9500/10000 | ECR: 4.36701 | R: 13.00 --
--epoch: 9600/10000 | ECR: 4.39384 | R: 4.00 --
--epoch: 9700/10000 | ECR: 4.55263 | R: 27.00 --
--epoch: 9800/10000 | ECR: 4.68587 | R: 4.00 --
--epoch: 9900/10000 | ECR: 4.69046 | R: 4.00 --
--epoch: 10000/10000 | ECR: 4.71793 | R: 4.00 --
--final run--
--epoch: 10000/10000 | ECR: 4.71793 | R: -1.00 --
==run ends==
==gridworld_ndm_fixed__ep_size_10000_immediate_reward_dueling_False_double_0.5_priority_0.05_rs_0_==
--epoch: 100/10000 | ECR: 0.23842 | R: 2.00 --
--epoch: 200/10000 | ECR: 0.34174 | R: 0.00 --
--epoch: 300/10000 | ECR: 0.34042 | R: -3.00 --
--epoch: 400/10000 | ECR: 0.48936 | R: -1.00 --
--epoch: 500/10000 | ECR: 0.58222 | R: 4.00 --
--epoch: 600/10000 | ECR: 0.60966 | R: 10.00 --
--epoch: 700/10000 | ECR: 0.75526 | R: 4.00 --
--epoch: 800/10000 | ECR: 0.89445 | R: 4.00 --
--epo

--epoch: 5400/10000 | ECR: 3.76016 | R: -1.00 --
--epoch: 5500/10000 | ECR: 3.84094 | R: 0.00 --
--epoch: 5600/10000 | ECR: 3.77315 | R: 26.00 --
--epoch: 5700/10000 | ECR: 3.70984 | R: -1.00 --
--epoch: 5800/10000 | ECR: 3.86394 | R: -4.00 --
--epoch: 5900/10000 | ECR: 3.81857 | R: 13.00 --
--epoch: 6000/10000 | ECR: 3.69092 | R: 11.00 --
--epoch: 6100/10000 | ECR: 3.67887 | R: -1.00 --
--epoch: 6200/10000 | ECR: 3.86384 | R: -1.00 --
--epoch: 6300/10000 | ECR: 3.83291 | R: 31.00 --
--epoch: 6400/10000 | ECR: 3.86918 | R: 31.00 --
--epoch: 6500/10000 | ECR: 3.74955 | R: -1.00 --
--epoch: 6600/10000 | ECR: 3.73250 | R: 19.00 --
--epoch: 6700/10000 | ECR: 3.81268 | R: 4.00 --
--epoch: 6800/10000 | ECR: 3.86261 | R: 13.00 --
--epoch: 6900/10000 | ECR: 3.78252 | R: 16.00 --
--epoch: 7000/10000 | ECR: 3.84040 | R: 11.00 --
--epoch: 7100/10000 | ECR: 3.80584 | R: 5.00 --
--epoch: 7200/10000 | ECR: 3.70400 | R: 27.00 --
--epoch: 7300/10000 | ECR: 3.74590 | R: 13.00 --
--epoch: 7400/10000 | E

--epoch: 1600/10000 | ECR: 1.59916 | R: 0.00 --
--epoch: 1700/10000 | ECR: 1.76797 | R: -5.00 --
--epoch: 1800/10000 | ECR: 1.91027 | R: -1.00 --
--epoch: 1900/10000 | ECR: 2.01373 | R: 4.00 --
--epoch: 2000/10000 | ECR: 2.10656 | R: -4.00 --
--epoch: 2100/10000 | ECR: 2.19859 | R: 27.00 --
--epoch: 2200/10000 | ECR: 2.24496 | R: -4.00 --
--epoch: 2300/10000 | ECR: 2.27274 | R: -1.00 --
--epoch: 2400/10000 | ECR: 2.32953 | R: -5.00 --
--epoch: 2500/10000 | ECR: 2.35164 | R: 0.00 --
--epoch: 2600/10000 | ECR: 2.47408 | R: -1.00 --
--epoch: 2700/10000 | ECR: 2.56640 | R: -1.00 --
--epoch: 2800/10000 | ECR: 2.65481 | R: -1.00 --
--epoch: 2900/10000 | ECR: 2.71271 | R: 0.00 --
--epoch: 3000/10000 | ECR: 2.74441 | R: -1.00 --
--epoch: 3100/10000 | ECR: 2.75071 | R: -4.00 --
--epoch: 3200/10000 | ECR: 2.82027 | R: 0.00 --
--epoch: 3300/10000 | ECR: 2.84416 | R: -1.00 --
--epoch: 3400/10000 | ECR: 2.86384 | R: -4.00 --
--epoch: 3500/10000 | ECR: 2.79301 | R: -1.00 --
--epoch: 3600/10000 | ECR

--epoch: 8100/10000 | ECR: 2.45950 | R: -1.00 --
--epoch: 8200/10000 | ECR: 2.51388 | R: 0.00 --
--epoch: 8300/10000 | ECR: 2.53242 | R: 4.00 --
--epoch: 8400/10000 | ECR: 2.53077 | R: -4.00 --
--epoch: 8500/10000 | ECR: 2.53109 | R: 23.00 --
--epoch: 8600/10000 | ECR: 2.53871 | R: 26.00 --
--epoch: 8700/10000 | ECR: 2.48399 | R: 0.00 --
--epoch: 8800/10000 | ECR: 2.44109 | R: -4.00 --
--epoch: 8900/10000 | ECR: 2.56267 | R: 27.00 --
--epoch: 9000/10000 | ECR: 2.62362 | R: 0.00 --
--epoch: 9100/10000 | ECR: 2.57703 | R: 0.00 --
--epoch: 9200/10000 | ECR: 2.63890 | R: -1.00 --
--epoch: 9300/10000 | ECR: 2.40017 | R: -8.00 --
--epoch: 9400/10000 | ECR: 2.54400 | R: -1.00 --
--epoch: 9500/10000 | ECR: 2.51731 | R: 23.00 --
--epoch: 9600/10000 | ECR: 2.55800 | R: 0.00 --
--epoch: 9700/10000 | ECR: 2.59178 | R: 26.00 --
--epoch: 9800/10000 | ECR: 2.56242 | R: -1.00 --
--epoch: 9900/10000 | ECR: 2.52487 | R: -4.00 --
--epoch: 10000/10000 | ECR: 2.44729 | R: 23.00 --
--final run--
--epoch: 10