In [None]:
!pip install tf-agents

!pip install tensorflow



In [None]:
import abc
import copy
import math
import random
import numpy as np
import gym
from gym import spaces

import tensorflow as tf
from tf_agents.agents.dqn.dqn_agent import DqnAgent, DdqnAgent
from tf_agents.networks.q_network import QNetwork
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import utils
from tf_agents.trajectories import trajectory
from tf_agents.environments import wrappers
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.utils import common
from tf_agents.metrics import py_metrics
from tf_agents.metrics import tf_metrics
from tf_agents.drivers import py_driver
from tf_agents.drivers import dynamic_episode_driver
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

import matplotlib
import matplotlib.pyplot as plt
from skimage.draw import line

from google.colab.patches import cv2_imshow

# Opt in to TF2 behaviour through the compat API before anything else runs.
tf.compat.v1.enable_v2_behavior()

# Side length used for the bounds/margin checks and for the rendered image.
IMG_DIM = 512
# Number of iterations of the training loop.
NUMBER_ITERATION = 25000
# Environment steps collected with the agent's collect policy per iteration.
COLLECTION_STEPS_PER_ITERATION = 1
# max_length passed to the replay buffer.
REPLAY_BUFFER_MAX_LENGTH = 100000
# sample_batch_size used when reading the replay buffer as a dataset.
BATCH_SIZE = 128
# Episodes averaged on every evaluation.
EVAL_EPISODES = 20
# Evaluate whenever the train step counter is a multiple of this value.
EVAL_INTERVAL = 1000
# Steps collected with the random policy to seed the replay buffer.
INITIAL_COLLECT_STEPS = 1000
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 3e-4

In [None]:
# Report which GPU (if any) TensorFlow can see. A missing GPU is tolerated
# (no exception is raised), but it must not be reported as "found".
device_name = tf.test.gpu_device_name()
if device_name:
  print('Found GPU at: {}'.format(device_name))
else:
  print('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
def get_orientation(A, B, C):
  """Return the orientation of the ordered point triple (A, B, C).

  Args:
    A, B, C: points given as (x, y) pairs.

  Returns:
    0 when the points are collinear, 1 for a clockwise turn and -1 for a
    counter-clockwise turn.
  """
  val = ((B[1] - A[1]) * (C[0] - B[0])) - ((B[0] - A[0]) * (C[1] - B[1]))
  if val == 0:
    return 0
  return 1 if val > 0 else -1


def _is_within_bounding_box(P, A, B):
  """Tell whether P lies inside the axis-aligned box spanned by A and B.

  For a point P already known to be collinear with A and B this is
  equivalent to P lying on the segment AB.
  """
  return (min(A[0], B[0]) <= P[0] <= max(A[0], B[0]) and
          min(A[1], B[1]) <= P[1] <= max(A[1], B[1]))


def is_segm_intersection(S1, S2):
  """Check whether two segments intersect.

  Segments that merely share an endpoint are NOT considered intersecting, so
  consecutive edges of a path never count as a crossing.

  Args:
    S1, S2: segments given as pairs of (x, y) points.

  Returns:
    True when the segments cross, touch at an interior point, or overlap while
    being collinear; False otherwise.
  """
  (A1, B1) = S1
  (A2, B2) = S2

  # Segments chained through a common endpoint are not a crossing.
  if A1 == A2 or A1 == B2 or B1 == A2 or B1 == B2:
    return False

  # Orientation of each endpoint relative to the other segment.
  o1 = get_orientation(A1, B1, A2)
  o2 = get_orientation(A1, B1, B2)
  o3 = get_orientation(A2, B2, A1)
  o4 = get_orientation(A2, B2, B1)

  # General case: each segment straddles the line through the other one.
  if o1 != o2 and o3 != o4:
    return True

  # Special cases: an endpoint is collinear with the other segment and lies on
  # it. Without these checks overlapping collinear segments go undetected.
  if o1 == 0 and _is_within_bounding_box(A2, A1, B1):
    return True
  if o2 == 0 and _is_within_bounding_box(B2, A1, B1):
    return True
  if o3 == 0 and _is_within_bounding_box(A1, A2, B2):
    return True
  if o4 == 0 and _is_within_bounding_box(B1, A2, B2):
    return True

  return False

def intersects_any(S, segments):
  """Tell whether segment S intersects any of `segments` except the last one.

  The final element of `segments` is deliberately left out of the check
  (presumably because it is the edge S continues from -- confirm with caller).

  Returns:
    True as soon as one intersection is found, False otherwise.
  """
  candidates = segments[:-1]
  return any(is_segm_intersection(S, other) for other in candidates)

def is_out_of_bounds(A, n):
  """Return True when pixel A = (x, y) falls outside an n-by-n image."""
  px, py = A
  inside = (0 <= px < n) and (0 <= py < n)
  return not inside

def is_marginal_node(A, n):
  """Return True when pixel A = (x, y) sits on the border of an n-by-n image."""
  border = (0, n - 1)
  px, py = A
  return px in border or py in border

In [None]:
def apply_translation_to(position, angle, distance):
  """Return the point reached from `position` after moving `distance` units
  in the direction given by `angle` (in degrees).

  The result is a tuple of floats; no rounding is applied.
  """
  theta = math.radians(angle)
  start_x, start_y = position[0], position[1]
  return (start_x + distance * math.cos(theta),
          start_y + distance * math.sin(theta))

def rotate_left(position, degree_angle):
  """Rotate `position` around the origin by `degree_angle` degrees.

  Both coordinates of the result are rounded to the nearest integer.
  """
  theta = math.radians(degree_angle)
  cos_t = math.cos(theta)
  sin_t = math.sin(theta)
  px, py = position
  rotated_x = (px * cos_t) - (py * sin_t)
  rotated_y = (py * cos_t) + (px * sin_t)
  return (round(rotated_x), round(rotated_y))

def get_position_of(last_position, angle, distance, angle_adjustment=0):
  """Return the point at a given angle and distance from `last_position`.

  The position is first rotated by (360 - angle_adjustment) degrees, then
  translated by `distance` along `angle`, and finally rotated back by
  `angle_adjustment` degrees. Both rotations round to integer coordinates.
  """
  unrotated = rotate_left(last_position, 360 - angle_adjustment)
  moved = apply_translation_to(unrotated, angle, distance)
  return rotate_left(moved, angle_adjustment)

In [None]:
# Quick manual check of get_position_of: start at (456, 255), move 155 units
# along angle 0 with an angle adjustment of -64 degrees.
position = (456, 255)
angle = -64
action_value = 155 
get_position_of(position, 0, action_value, angle)

(524, 116)

In [None]:
# Upper bound for most observation entries; also the starting value of the
# running minima (POS_MIN_ANGLE, POS_MIN_DIST) in INITIAL_STATE.
UPP_LIMIT = 2000
# Nodes whose step counter has reached this value are no longer expanded.
MAX_STEPS = 15

# Action types returned by convert_integer_to_action.
ACTION_DIST = 0
ACTION_ANGLE = 1
ACTION_INTER = 2
ACTION_END = 3

# Length of the state / observation vector.
NR_INFO = 23

# Indices into the state vector (counters first, then angle/distance
# statistics, per-arity intersection counters and "last ..." values).
POS_NR_ENDS = 0
POS_NR_ANGLES = 1
POS_NR_INTER = 2
POS_NR_DIST = 3
POS_NR_SELFINTER = 4
POS_NR_OOB = 20
POS_NR_MARGINALS = 21
POS_MIN_ANGLE = 5
POS_MEAN_ANGLE = 6
POS_MAX_ANGLE = 7
POS_MIN_DIST = 8
POS_MEAN_DIST = 9
POS_MAX_DIST = 10
POS_SUM_DIST = 11
POS_INTER3 = 12
POS_INTER4 = 13
POS_INTER5 = 14
POS_INTER6 = 15
POS_INTER7 = 16
POS_LAST_ANGLE = 17
POS_LAST_DIST = 18
POS_LAST_INTER = 19
POS_LAST_ACTION = 22

# State at the start of an episode: all zeros except the two running minima
# (set to UPP_LIMIT) and the last action (set to ACTION_ANGLE).
INITIAL_STATE = np.array([0,0,0,0,0,UPP_LIMIT,0,0,UPP_LIMIT,0,0,0,0,0,0,0,0,0,0,0,0,0, ACTION_ANGLE], dtype=np.float64)
# Per-entry maximum used by the observation spec; the distance sum gets a
# larger bound than the other entries.
MAXIMUM_STATE = np.full((NR_INFO,), UPP_LIMIT, dtype=np.float64)
MAXIMUM_STATE[POS_SUM_DIST] = 200.0 * UPP_LIMIT
# Inclusive maximum of the action spec.
# NOTE(review): convert_integer_to_action also decodes codes 39..44 (INTER and
# END); with the spec capped at 38 those codes are never requested from the
# agent -- confirm this restriction is intended.
MAX_ACTIONS = 38

def convert_integer_to_action(code):
    '''
    Decode an integer action code into an (action_type, value) pair.

    code <= 20   -> ACTION_DIST, value = code * 10 + a random offset in 0..9
    21 ... 38    -> ACTION_ANGLE, value = (code - 21) * 10 + a random offset
                    in 0..9, shifted by -90 (so 21 = [-90..-81], 22 = [-80..-71], ...)
    39 ... 43    -> ACTION_INTER, value = code - 36 (39 == 3 ... 43 == 7)
    44           -> ACTION_END, value = the code itself
    anything else -> (-1, -1)
    '''

    if code <= 20:
        return (ACTION_DIST, code * 10 + random.randint(0, 9))

    if code <= 38:
        bin_start = (code - 21) * 10
        return (ACTION_ANGLE, (bin_start + random.randint(0, 9)) - 90)

    if code <= 43:
        return (ACTION_INTER, code - 39 + 3)

    if code == 44:
        return (ACTION_END, code)

    # Codes above 44 do not map to any action.
    return (-1, -1)

class RoadNetworkEnv(py_environment.PyEnvironment):
    """Python environment in which an agent grows a road graph step by step.

    The environment keeps a stack of pending nodes (`_nodes_to_expand`), each a
    tuple (last_action, position, angle, steps). Every step pops the most
    recent node and applies the decoded action to it:

      * after DIST:  ANGLE, INTER and END are rewarded, anything else costs 100;
      * after ANGLE or INTER: only DIST is rewarded, anything else costs 100;
      * repeating the previous action type costs a further 1000 and re-queues
        the node with its angle nudged by 5 degrees.

    The episode terminates once the stack is empty. The observation is the
    NR_INFO-long statistics vector `_state`.

    NOTE(review): `_reward` is only zeroed on reset and is incremented on every
    step, so the reward emitted at each step is the running episode total
    rather than the per-step increment -- confirm this is intended.
    """

    def __init__(self, verbose=False):
        """Create the environment.

        Args:
          verbose: when True, print the node stack and the action types on
            every step (debug trace). Off by default to keep training output
            readable.
        """
        super().__init__()

        self._verbose = verbose
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=MAX_ACTIONS, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(NR_INFO,), dtype=np.float64,
            minimum=np.full((NR_INFO,), -1000.0, dtype=np.float64),
            maximum=MAXIMUM_STATE, name='observation')
        self._init_episode_state()

    def _init_episode_state(self):
        """(Re)initialise every per-episode attribute."""
        self._state = copy.deepcopy(INITIAL_STATE)
        # Timestamp at which each of the four action types was last taken.
        self._action_timeline = np.zeros(4,)
        self._episode_ended = False
        self._reward = 0
        self._timestamp = 0
        self._maxstep = 0

        self._edges = []
        self._nodes = [(255, 255)]
        self._nodes_to_expand = [(ACTION_ANGLE, (255, 255), 0, 0)]

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def give_ANGLE_reward(self, action_value, node):
        """Reward an ANGLE action and queue the node with its heading changed.

        Args:
          action_value: angle (degrees) added to the node's current angle.
          node: (last_action, position, angle, steps) tuple being expanded.
        """
        (_, position, angle, steps) = node

        nr_angles = self._state[POS_NR_ANGLES]

        self._reward += 20

        if steps < MAX_STEPS:
            self._state[POS_NR_ANGLES] += 1
            self._state[POS_MIN_ANGLE] = min(action_value, self._state[POS_MIN_ANGLE])
            self._state[POS_MAX_ANGLE] = max(action_value, self._state[POS_MAX_ANGLE])
            # Incremental mean over the angles seen so far.
            self._state[POS_MEAN_ANGLE] = (self._state[POS_MEAN_ANGLE] * nr_angles + action_value) / (nr_angles + 1)
            self._state[POS_LAST_ANGLE] = action_value

            self._nodes_to_expand.append((ACTION_ANGLE, position, (angle + action_value), steps + 1))

    def give_DIST_reward(self, action_value, node):
        """Reward a DIST action and, when possible, add the new edge.

        Args:
          action_value: length of the segment to draw from the node.
          node: (last_action, position, angle, steps) tuple being expanded.
        """
        (_, position, angle, steps) = node

        nr_dists = self._state[POS_NR_DIST]

        self._reward += 25

        new_position = get_position_of(position, 0, action_value, angle)

        is_oob = is_out_of_bounds(new_position, IMG_DIM)
        is_marginal = is_marginal_node(new_position, IMG_DIM)
        is_self_inter = intersects_any((position, new_position), self._edges)

        if is_oob:
            self._state[POS_NR_OOB] += 1
            self._reward -= 15

        if is_self_inter:
            self._state[POS_NR_SELFINTER] += 1
            self._reward -= 15

        if is_marginal:
            self._state[POS_NR_MARGINALS] += 1
            self._reward += 1

        if steps < MAX_STEPS:
            if not is_oob:
                self._state[POS_NR_DIST] += 1
                self._state[POS_SUM_DIST] += action_value
                self._state[POS_MIN_DIST] = min(action_value, self._state[POS_MIN_DIST])
                self._state[POS_MAX_DIST] = max(action_value, self._state[POS_MAX_DIST])
                # Incremental mean over the distances seen so far.
                self._state[POS_MEAN_DIST] = (self._state[POS_MEAN_DIST] * nr_dists + action_value) / (nr_dists + 1)
                self._state[POS_LAST_DIST] = action_value

                self._nodes_to_expand.append((ACTION_DIST, new_position, angle, steps + 1))
                # Skip zero-length edges and an immediate repeat of the last edge.
                if position != new_position and (len(self._edges) == 0 or (position, new_position) != self._edges[-1]):
                    self._edges.append((position, new_position))
                    self._nodes.append(new_position)
            else:
                # Out of bounds: stay on the same node and nudge the heading.
                self._nodes_to_expand.append((ACTION_DIST, position, angle + 5, steps + 1))

    def give_INTER_reward(self, action_value, node):
        """Reward an INTER action and queue one branch per intersection arm.

        Args:
          action_value: number of arms of the intersection (3..7).
          node: (last_action, position, angle, steps) tuple being expanded.
        """
        (_, position, angle, steps) = node
        time_diff = self._timestamp - self._action_timeline[ACTION_INTER]

        # The reward grows with the time elapsed since the last INTER action.
        self._reward += (time_diff - 6) * 25

        if steps < MAX_STEPS:
            self._state[POS_NR_INTER] += 1
            # Count the intersection once in the slot matching its arity.
            for arity, slot in ((3, POS_INTER3), (4, POS_INTER4), (5, POS_INTER5),
                                (6, POS_INTER6), (7, POS_INTER7)):
                if action_value == arity:
                    self._state[slot] += 1
            self._state[POS_LAST_INTER] = action_value

            # Spread the arms evenly around the node.
            for i in range(action_value):
                self._nodes_to_expand.append((ACTION_INTER, position, (angle + i * (360 // action_value)), steps + 1))

    def give_END_reward(self):
        """Reward an END action; the expanded node is not re-queued."""
        time_diff = self._timestamp - self._action_timeline[ACTION_END]

        # we dont want to encourage it to finish the road too fast
        self._reward += (time_diff - 9) * 50
        self._state[POS_NR_ENDS] += 1

    def apply_action(self, action):
        """Decode `action`, apply it to the most recent pending node and
        update reward, state and the node stack accordingly."""
        self._timestamp += 1

        (action_type, action_value) = convert_integer_to_action(action)
        if self._verbose:
            print("Nodes to expand: ", self._nodes_to_expand)

        node_to_expand = self._nodes_to_expand.pop(-1)
        (last_action, position, angle, steps) = node_to_expand

        self._maxstep = max(self._maxstep, steps)

        if self._verbose:
            print("Current action: ", action_type)
            print("Last action: ", last_action)

        if last_action == ACTION_DIST:
            if action_type == ACTION_ANGLE:
                self.give_ANGLE_reward(action_value, node_to_expand)
            elif action_type == ACTION_INTER:
                self.give_INTER_reward(action_value, node_to_expand)
            elif action_type == ACTION_END:
                self.give_END_reward()
            else:
                self._reward -= 100

        elif last_action == ACTION_ANGLE:
            if action_type == ACTION_DIST:
                self.give_DIST_reward(action_value, node_to_expand)
            else:
                self._reward -= 100

        elif last_action == ACTION_INTER:
            if action_type == ACTION_DIST:
                self.give_DIST_reward(action_value, node_to_expand)
            else:
                self._reward -= 100

        if last_action == action_type:
            # Repeating the previous action type is heavily penalised; the node
            # is put back (with a nudged angle) so the episode can continue.
            self._reward -= 1000
            if steps < MAX_STEPS:
                self._nodes_to_expand.append((last_action, position, angle + 5, steps + 1))
        else:
            self._state[POS_LAST_ACTION] = action_type

        if self._verbose:
            print("Nodes to expand after: ", self._nodes_to_expand)
            print("==============================================================")

        self._action_timeline[action_type] = self._timestamp

    def _reset(self):
        """Start a new episode and return its first time step."""
        self._init_episode_state()
        return ts.restart(np.array(self._state, dtype=np.float64))

    def _step(self, action):
        """Apply `action`; terminate when no node is left to expand."""
        if self._episode_ended == True:
            return self.reset()

        self.apply_action(action)

        if len(self._nodes_to_expand) == 0:
            self._episode_ended = True
            # Final bonus/penalty based on how many edges were drawn.
            self._reward += 100 * (len(self._edges) - 20)

        if self._episode_ended == True:
            return ts.termination(np.array(self._state, dtype=np.float64), self._reward)
        else:
            return ts.transition(np.array(self._state, dtype=np.float64), reward=self._reward, discount=0.9)

    def create_image_from_graph(self):
        """Rasterise the current edges into an IMG_DIM x IMG_DIM uint8 image."""
        img = np.zeros((IMG_DIM, IMG_DIM), dtype=np.uint8)
        for ((xA, yA), (xB, yB)) in self._edges:
            rr, cc = line(xA, yA, xB, yB)
            img[rr, cc] = 255

        return img

    def render(self, mode='human'):
        """Print the edges and state, then return the rasterised graph."""
        print(self._edges)
        print(self._state)
        return self.create_image_from_graph()

In [None]:
# Validate the environment against its specs, then drive it with random actions.
env = RoadNetworkEnv()
utils.validate_py_environment(env)

tl_env = wrappers.TimeLimit(env, duration=5000)

time_step = tl_env.reset()
rewards = time_step.reward

for i in range(100):
    # The action spec is inclusive on both ends (0..MAX_ACTIONS), so the upper
    # bound passed to randint (which is exclusive) must be MAX_ACTIONS + 1.
    action = np.random.randint(0, MAX_ACTIONS + 1)
    time_step = tl_env.step(action)
    rewards += time_step.reward

Nodes to expand:  [(1, (255, 255), 0, 0)]
Current action:  1
Last action:  1
Nodes to expand after:  [(1, (255, 255), 5, 1)]
Nodes to expand:  [(1, (255, 255), 5, 1)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (340, 263), 5, 2)]
Nodes to expand:  [(0, (340, 263), 5, 2)]
Current action:  0
Last action:  0
Nodes to expand after:  [(0, (340, 263), 10, 3)]
Nodes to expand:  [(0, (340, 263), 10, 3)]
Current action:  1
Last action:  0
Nodes to expand after:  [(1, (340, 263), -36, 4)]
Nodes to expand:  [(1, (340, 263), -36, 4)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (371, 241), -36, 5)]
Nodes to expand:  [(0, (371, 241), -36, 5)]
Current action:  1
Last action:  0
Nodes to expand after:  [(1, (371, 241), -80, 6)]
Nodes to expand:  [(1, (371, 241), -80, 6)]
Current action:  1
Last action:  1
Nodes to expand after:  [(1, (371, 241), -75, 7)]
Nodes to expand:  [(1, (371, 241), -75, 7)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (

In [None]:
# Two independent time-limited environments: one for training, one for evaluation.
train_py_env, eval_py_env = (
    wrappers.TimeLimit(RoadNetworkEnv(), duration=1000) for _ in range(2))

# Show the specs the agent will be built against.
_step_spec = train_py_env.time_step_spec()
for _label, _spec in (('Observation Spec:', _step_spec.observation),
                      ('Reward Spec:', _step_spec.reward),
                      ('Action Spec:', train_py_env.action_spec())):
    print(_label)
    print(_spec)

# Wrap the Python environments so they can be used from TensorFlow code.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
evaluation_env = tf_py_environment.TFPyEnvironment(eval_py_env)

Observation Spec:
BoundedArraySpec(shape=(23,), dtype=dtype('float64'), name='observation', minimum=[-1000. -1000. -1000. -1000. -1000. -1000. -1000. -1000. -1000. -1000.
 -1000. -1000. -1000. -1000. -1000. -1000. -1000. -1000. -1000. -1000.
 -1000. -1000. -1000.], maximum=[  2000.   2000.   2000.   2000.   2000.   2000.   2000.   2000.   2000.
   2000.   2000. 400000.   2000.   2000.   2000.   2000.   2000.   2000.
   2000.   2000.   2000.   2000.   2000.])
Reward Spec:
ArraySpec(shape=(), dtype=dtype('float32'), name='reward')
Action Spec:
BoundedArraySpec(shape=(), dtype=dtype('int32'), name='action', minimum=0, maximum=38)


In [None]:
# Sizes of the fully connected layers of the Q-network.
hidden_layers = (264, 512, 1024, 512, 264,)

# Q-network built from the training environment's observation and action specs.
q_network = QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=hidden_layers)

# Train step counter shared with the agent; read by the training loop to decide
# when to evaluate.
counter = tf.Variable(0)

# NOTE(review): epsilon_greedy = 0.99 presumably makes the collect policy pick
# a random action almost all of the time -- confirm this value is intended.
agent = DdqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network = q_network,
    epsilon_greedy = 0.99,
    debug_summaries = True,
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=LEARNING_RATE),
    td_errors_loss_fn = common.element_wise_squared_loss,
    train_step_counter = counter)

agent.initialize()
# Displayed as the cell output.
agent.action_spec

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(38, dtype=int32))

In [None]:
def get_average_return(environment, policy, episodes=20):
  """Compute the average undiscounted return of `policy` over several episodes.

  Args:
    environment: environment exposing `reset()` and `step(action)`; the reward
      of each returned time step must offer `.numpy()` yielding an indexable
      batch whose first element is the reward of the (single) environment.
    policy: policy exposing `action(time_step)`.
    episodes: number of episodes to average over; must be at least 1.

  Returns:
    The mean episode return as a Python float.

  Raises:
    ValueError: if `episodes` is smaller than 1.
  """
  if episodes < 1:
    raise ValueError('episodes must be at least 1, got {}'.format(episodes))

  total_return = 0.0

  for _ in range(episodes):
    time_step = environment.reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      # Accumulate plain floats so the result never depends on the running
      # total having been turned into a tensor.
      episode_return += float(time_step.reward.numpy()[0])

    total_return += episode_return

  return total_return / episodes

In [None]:
class ExperienceReplay(object):
    """Replay buffer plus the helpers that fill it and sample from it.

    On construction the buffer is seeded with INITIAL_COLLECT_STEPS transitions
    gathered by a random policy, and a dataset iterator yielding batches of
    BATCH_SIZE two-step trajectories is exposed as `iterator`.
    """

    def __init__(self, agent, enviroment):
        """Build and pre-fill the buffer.

        Args:
          agent: agent whose `collect_data_spec` defines the stored items.
          enviroment: environment used for the buffer's batch size, for the
            random policy's specs and for the initial random collection.
        """
        self._replay_buffer = TFUniformReplayBuffer(
            data_spec=agent.collect_data_spec,
            batch_size=enviroment.batch_size,
            max_length=REPLAY_BUFFER_MAX_LENGTH)

        # Use the environment that was passed in rather than a notebook global,
        # so the class works with any environment.
        self._random_policy = RandomTFPolicy(enviroment.time_step_spec(),
                                             enviroment.action_spec())

        self.collect_data(enviroment, self._random_policy, steps=INITIAL_COLLECT_STEPS)

        self.dataset = self._replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=BATCH_SIZE,
            num_steps=2).prefetch(3)

        self.iterator = iter(self.dataset)

    def collect_data(self, enviroment, policy, steps):
        """Collect `steps` transitions from `enviroment` using `policy`."""
        for _ in range(steps):
            self.collect_step(enviroment, policy)

    def collect_step(self, environment, policy):
        """Take one step with `policy` and store the transition in the buffer."""
        time_step = environment.current_time_step()
        action_step = policy.action(time_step)
        next_time_step = environment.step(action_step.action)
        timestamp_trajectory = trajectory.from_transition(time_step, action_step, next_time_step)
        self._replay_buffer.add_batch(timestamp_trajectory)

# Build the replay buffer for the training environment; construction already
# runs the initial random-policy collection.
experience_replay = ExperienceReplay(agent, train_env)

Nodes to expand:  [(1, (255, 255), 0, 0)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (431, 255), 0, 1)]
Nodes to expand:  [(0, (431, 255), 0, 1)]
Current action:  1
Last action:  0
Nodes to expand after:  [(1, (431, 255), 79, 2)]
Nodes to expand:  [(1, (431, 255), 79, 2)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (455, 383), 79, 3)]
Nodes to expand:  [(0, (455, 383), 79, 3)]
Current action:  0
Last action:  0
Nodes to expand after:  [(0, (455, 383), 84, 4)]
Nodes to expand:  [(0, (455, 383), 84, 4)]
Current action:  1
Last action:  0
Nodes to expand after:  [(1, (455, 383), 42, 5)]
Nodes to expand:  [(1, (455, 383), 42, 5)]
Current action:  1
Last action:  1
Nodes to expand after:  [(1, (455, 383), 47, 6)]
Nodes to expand:  [(1, (455, 383), 47, 6)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (455, 383), 52, 7)]
Nodes to expand:  [(0, (455, 383), 52, 7)]
Current action:  1
Last action:  0
Nodes to expand after:  [(1, (455, 3

In [None]:
# Wrap the train step with common.function before entering the loop.
agent.train = common.function(agent.train)

#initialize counter on the agent to 0
agent.train_step_counter.assign(0)


with tf.device('/device:GPU:0'):
  #get initial average return of reward
  avg_return = get_average_return(evaluation_env, agent.policy, EVAL_EPISODES)
  returns = [avg_return]

  for _ in range(NUMBER_ITERATION):

      # Collect fresh experience with the exploration policy, then train on one
      # sampled batch. (The former per-iteration copy of the first layer's
      # weights only fed a commented-out debug check and has been removed.)
      experience_replay.collect_data(train_env, agent.collect_policy, COLLECTION_STEPS_PER_ITERATION)

      experience, unused_info = next(experience_replay.iterator)
      train_loss = agent.train(experience).loss

      train_step = agent.train_step_counter.numpy()
      if train_step % EVAL_INTERVAL == 0:
           avg_return = get_average_return(evaluation_env, agent.policy, EVAL_EPISODES)
           print('Iteration {0} – Average Return = {1}, Loss = {2}.'.format(train_step, avg_return, train_loss))
           returns.append(avg_return)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Last action:  0
Nodes to expand after:  [(1, (277, 425), -39, 13)]
Nodes to expand:  [(1, (277, 425), -39, 13)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (438, 295), -39, 14)]
Nodes to expand:  [(0, (438, 295), -39, 14)]
Current action:  0
Last action:  0
Nodes to expand after:  [(0, (438, 295), -34, 15)]
Nodes to expand:  [(0, (438, 295), -34, 15)]
Current action:  0
Last action:  0
Nodes to expand after:  []
Nodes to expand:  [(1, (255, 255), 0, 0)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (331, 255), 0, 1)]
Nodes to expand:  [(0, (331, 255), 0, 1)]
Current action:  1
Last action:  0
Nodes to expand after:  [(1, (331, 255), 53, 2)]
Nodes to expand:  [(1, (331, 255), 53, 2)]
Current action:  0
Last action:  1
Nodes to expand after:  [(0, (340, 267), 53, 3)]
Nodes to expand:  [(0, (340, 267), 53, 3)]
Current action:  1
Last action:  0
Nodes to expand after:  [(1, (340, 267), 13

In [None]:
# One x value per recorded evaluation: the initial one (iteration 0) plus one
# every EVAL_INTERVAL iterations, matching the entries appended to `returns`.
iterations = range(0, NUMBER_ITERATION + 1, EVAL_INTERVAL)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')

In [None]:
def create_policy_eval(policy, num_episodes=5):
  """Roll out `policy` on the evaluation environment and show each result.

  For every episode the chosen action step is printed at each step; once the
  episode is over the rendered environment image is displayed, followed by
  the final time step and a separator line.

  Args:
    policy: policy exposing `action(time_step)`.
    num_episodes: number of episodes to roll out.
  """
  for _episode in range(num_episodes):
    current_step = evaluation_env.reset()
    while True:
      if current_step.is_last():
        break
      chosen = policy.action(current_step)
      print(chosen)
      current_step = evaluation_env.step(chosen.action)
    # render() returns a batched tensor; display the first (only) image.
    final_frame = evaluation_env.render().numpy()[0]
    cv2_imshow(final_frame)
    print(current_step)
    print("============================================")

In [None]:
# Visualise a few episodes played by the agent's greedy policy.
create_policy_eval(agent.policy)