In [1]:
%%capture
# Install ipython-autotime; %%capture hides pip's output.
!pip install ipython-autotime

# Once loaded, the extension prints a "time: ..." line after every executed cell.
%load_ext autotime

time: 2.7 ms (started: 2022-05-05 04:37:15 +00:00)


In [2]:
# Mount Google Drive so the project files under MyDrive are reachable from /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive
time: 22.3 s (started: 2022-05-05 04:37:15 +00:00)


In [3]:
# Copy the project's data directory from the mounted Drive into /content.
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/data/' '/content/'

time: 4.96 s (started: 2022-05-05 04:37:37 +00:00)


In [4]:
!rm -r '/content/Data'

rm: cannot remove '/content/Data': No such file or directory
time: 121 ms (started: 2022-05-05 04:37:42 +00:00)


In [5]:
import torch
from torch import nn
import torch.nn.functional as F

from tqdm import tqdm

import matplotlib.pyplot as plt


## scene loader 
import sys
import h5py
import json
import numpy as np
import random
import skimage.io
from skimage.transform import resize

import os

time: 3.9 s (started: 2022-05-05 04:37:42 +00:00)


In [6]:
def seed_everything(seed=42):
  """Seed every random number generator used in this notebook.

  Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
  exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic,
  non-benchmarking mode.

  Args:
    seed: Integer seed applied to every generator (default 42).

  Returns:
    None.
  """
  random.seed(seed)
  # Exported for child processes; the running interpreter's hash seed is fixed at start-up.
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False


seed_everything()

time: 19.3 ms (started: 2022-05-05 04:37:46 +00:00)


In [7]:
# Number of discrete actions the agent can take.
ACTION_SIZE = 4

# Screen size and number of stacked history frames used by the environment.
SCREEN_WIDTH, SCREEN_HEIGHT = 84, 84
HISTORY_LENGTH = 4

# Episodes run per target in the evaluation cells.
NUM_EVAL_EPISODES = 5

# No need to change.
TASK_TYPE = 'navigation'

# Scene name -> list of location ids (navigation targets) in that scene.
TASK_LIST = dict(
  bathroom_02=['26', '37', '43', '53', '69'],
  bedroom_04=['134', '264', '320', '384', '387'],
  kitchen_02=['90', '136', '157', '207', '329'],
  living_room_08=['92', '135', '193', '228', '254'],
)

time: 6.4 ms (started: 2022-05-05 04:37:46 +00:00)


In [8]:
## Scene Loader

# -*- coding: utf-8 -*-
class THORDiscreteEnvironment(object):
  """Discrete navigation environment replayed from a pre-computed HDF5 scene file.

  The scene file provides the datasets ``location``, ``rotation``, ``graph``,
  ``shortest_path_distance``, ``resnet_feature`` and ``observation``. The agent
  occupies one location id at a time; ``graph[k][a]`` is the location reached
  from ``k`` with action ``a``, or ``-1`` when the move is blocked.

  The state given to a policy is a window of ``HISTORY_LENGTH`` feature
  columns: ``s_t`` is the current window, ``s_t1`` the window produced by the
  latest ``step()``, and ``update()`` promotes ``s_t1`` to ``s_t``.

  Recognised ``config`` keys (all optional):
    scene_name: scene identifier, default ``'bedroom_04'``.
    random_start: stored on the instance; ``reset()`` always randomises the
      start location regardless of this flag.
    n_feat_per_locaiton: number of feature samples per location to draw from
      (the spelling is part of the public key), default 1.
    terminal_state_id: location id of the navigation target, default 0.
    h5_file_path: path of the scene file, default ``data/<scene_name>.h5``.

  The HDF5 handle stays open while the object is in use because features and
  observations are read on demand. Call ``close()`` or use the instance as a
  context manager to release it.
  """

  def __init__(self, config=None):
    # Build a fresh dict per call instead of sharing one mutable default.
    config = {} if config is None else config

    # configurations
    self.scene_name          = config.get('scene_name', 'bedroom_04')
    self.random_start        = config.get('random_start', True)
    self.n_feat_per_locaiton = config.get('n_feat_per_locaiton', 1) # 1 for no sampling
    self.terminal_state_id   = config.get('terminal_state_id', 0)

    self.h5_file_path = config.get('h5_file_path', 'data/%s.h5'%self.scene_name)
    self.h5_file      = h5py.File(self.h5_file_path, 'r')

    # [()] reads each dataset fully into memory.
    self.locations   = self.h5_file['location'][()]
    self.rotations   = self.h5_file['rotation'][()]
    self.n_locations = self.locations.shape[0]

    # One-hot mask over locations marking the terminal (target) state.
    self.terminals = np.zeros(self.n_locations)
    self.terminals[self.terminal_state_id] = 1
    self.terminal_states, = np.where(self.terminals)

    self.transition_graph = self.h5_file['graph'][()]
    self.shortest_path_distances = self.h5_file['shortest_path_distance'][()]

    self.history_length = HISTORY_LENGTH
    self.screen_height  = SCREEN_HEIGHT
    self.screen_width   = SCREEN_WIDTH

    # we use pre-computed fc7 features from ResNet-50
    self.s_t      = np.zeros([2048, self.history_length])
    self.s_t1     = np.zeros_like(self.s_t)
    self.s_target = self._tiled_state(self.terminal_state_id)

    self.reset()

  # resource management

  def close(self):
    """Close the HDF5 file; safe to call more than once.

    After closing, ``state``, ``observation`` and ``step()`` can no longer be
    used because they read from the file.
    """
    h5_file = getattr(self, 'h5_file', None)
    if h5_file is not None:
      h5_file.close()
      self.h5_file = None

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self.close()
    return False

  # public methods

  def reset(self):
    """Start a new episode from a random location with a positive distance to the target."""
    # randomize initial state
    while True:
      k = random.randrange(self.n_locations)
      # min_d = 0  if k is a terminal state
      # min_d = -1 if no terminal state is reachable from k
      min_d = min(
        (self.shortest_path_distances[k][t_state] for t_state in self.terminal_states),
        default=np.inf)
      if min_d > 0: break

    # reset parameters
    self.current_state_id = k
    self.s_t = self._tiled_state(self.current_state_id)

    self.reward   = 0
    self.collided = False
    self.terminal = False

  def step(self, action):
    """Apply ``action`` and update ``current_state_id``, ``terminal``, ``collided``, ``reward`` and ``s_t1``.

    Args:
      action: index into a row of the transition graph; anything ``int()``
        accepts (Python int, NumPy integer, 0-d tensor).

    Raises:
      AssertionError: if the environment is already in a terminal state.
    """
    assert not self.terminal, 'step() called in terminal state'
    action = int(action)
    next_state_id = self.transition_graph[self.current_state_id][action]
    if next_state_id != -1:
      self.current_state_id = next_state_id
      self.terminal = bool(self.terminals[next_state_id])
      self.collided = False
    else:
      # Blocked move: the agent stays where it is.
      self.terminal = False
      self.collided = True

    self.reward = self._reward(self.terminal, self.collided)
    # Drop the oldest column and append the feature of the (possibly unchanged) location.
    self.s_t1 = np.append(self.s_t[:,1:], self.state, axis=1)

  def update(self):
    """Promote the post-step window ``s_t1`` to the current window ``s_t``."""
    self.s_t = self.s_t1

  # private methods

  def _feature_column(self, state_id):
    """Return one randomly sampled feature of ``state_id`` as a column vector."""
    k = random.randrange(self.n_feat_per_locaiton)
    return self.h5_file['resnet_feature'][state_id][k][:,np.newaxis]

  def _tiled_state(self, state_id):
    """Return the feature of ``state_id`` repeated ``history_length`` times along axis 1."""
    return np.tile(self._feature_column(state_id), (1, self.history_length))

  def _reward(self, terminal, collided):
    """Return 10.0 on reaching the target, -0.1 on a collision, -0.01 otherwise."""
    # positive reward upon task completion
    if terminal: return 10.0
    # time penalty or collision penalty
    return -0.1 if collided else -0.01

  # properties

  @property
  def action_size(self):
    # move forward/backward, turn left/right for navigation
    return ACTION_SIZE

  @property
  def action_definitions(self):
    action_vocab = ["MoveForward", "RotateRight", "RotateLeft", "MoveBackward"]
    return action_vocab[:ACTION_SIZE]

  @property
  def observation(self):
    return self.h5_file['observation'][self.current_state_id]

  @property
  def state(self):
    # read from hdf5 cache
    return self._feature_column(self.current_state_id)

  @property
  def target(self):
    return self.s_target

  @property
  def x(self):
    return self.locations[self.current_state_id][0]

  @property
  def z(self):
    return self.locations[self.current_state_id][1]

  @property
  def r(self):
    return self.rotations[self.current_state_id]




time: 315 ms (started: 2022-05-05 04:37:46 +00:00)


In [9]:
class MyActorCritic(nn.Module):
    """Siamese actor-critic network over flattened image embeddings.

    The observation embedding and the target embedding pass through the same
    ``fc1`` layer, are concatenated, fused by ``fc2`` and refined by the
    scene-specific ``fc3``. Two heads then produce a probability distribution
    over 4 actions and a scalar state value.
    """

    def __init__(self):
        super(MyActorCritic, self).__init__()

        # generic siamese layers: fc1 is applied to both inputs, fc2 fuses the two 512-d outputs
        self.fc1 = nn.Linear(in_features=8192, out_features=512)
        self.fc2 = nn.Linear(in_features=1024, out_features=512)

        # Scene specific layers
        # for now we have only one scene specific layer and only one thread i.e only one target in one scene
        self.fc3 = nn.Linear(in_features=512, out_features=512)

        self.actions_fc = nn.Linear(in_features=512, out_features=4)
        self.value_fc  = nn.Linear(in_features=512, out_features=1)

    def forward(self, input_image_embedding, target_image_embedding):
        """Compute action probabilities and the state value.

        Args:
            input_image_embedding: tensor whose last dimension is 8192; either
                a single flattened embedding or a batch of them.
            target_image_embedding: tensor of the same shape for the target.

        Returns:
            Tuple ``(actions_prob, value)``: ``actions_prob`` has last
            dimension 4 and sums to 1 along it; ``value`` has last dimension 1.
        """
        # generic siamese layer (shared weights)
        x1 = F.relu(self.fc1(input_image_embedding))
        x2 = F.relu(self.fc1(target_image_embedding))

        # Work along the feature axis so both unbatched (8192,) and batched
        # (N, 8192) inputs are handled; for 1-D input this equals dim 0.
        x_combined = torch.cat((x1, x2), dim=-1)
        x = F.relu(self.fc2(x_combined))

        # scene specific layer
        x = F.relu(self.fc3(x))
        action_logits = self.actions_fc(x)
        actions_prob = F.softmax(action_logits, dim=-1)
        value = self.value_fc(x)

        return actions_prob, value




time: 56.5 ms (started: 2022-05-05 04:37:47 +00:00)


In [10]:
# Name of the zip archive that a later cell copies from Drive and unzips.
zip_file_name_open = 'h_eval_working.zip'

time: 2.16 ms (started: 2022-05-05 04:37:47 +00:00)


In [11]:
# Scene file that a later cell copies from Drive into /content.
scene_filename = 'bedroom_04.h5'

time: 881 µs (started: 2022-05-05 04:37:47 +00:00)


In [12]:
# Copy the archive from Drive into /content; {zip_file_name_open} is filled in from the Python variable.
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/{zip_file_name_open}' '/content/' 

time: 3.06 s (started: 2022-05-05 04:37:47 +00:00)


In [13]:
# Copy the scene file from Drive into /content; {scene_filename} is filled in from the Python variable.
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/{scene_filename}' '/content/' 

time: 3.17 s (started: 2022-05-05 04:37:50 +00:00)


In [14]:
%%capture
!unzip {zip_file_name_open}

time: 3.79 s (started: 2022-05-05 04:37:53 +00:00)


In [15]:
# The checkpoint holds a whole pickled model object (it is used directly as a
# module below), so MyActorCritic only needs to be defined, not instantiated.
# map_location='cpu' lets a checkpoint saved on GPU load on a CPU-only runtime;
# the model is moved to the evaluation device later with .to(device_name).
# SECURITY: torch.load unpickles arbitrary Python objects - only load trusted files.
# NOTE(review): newer PyTorch releases default to weights_only=True, which rejects
# full-model pickles; pass weights_only=False there if loading fails.
model_load = torch.load(
    '/content/gdrive/MyDrive/ESE_650_Project/models/bedroom_04_myA2C_model_wts_epoch_150000.pth',
    map_location='cpu')



time: 1.36 s (started: 2022-05-05 04:37:57 +00:00)


In [16]:
# Use the first GPU when one is available, otherwise fall back to the CPU so
# the evaluation cells still run on a runtime without CUDA.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'

time: 859 µs (started: 2022-05-05 04:37:58 +00:00)


# Evaluation of Generalization performance

In [17]:
# Copy the bathroom_02 scene file from Drive into /content.
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/bathroom_02.h5' '/content/' 

time: 1.73 s (started: 2022-05-05 04:37:58 +00:00)


In [18]:
# Copy the living_room_08 scene file from Drive into /content.
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/living_room_08.h5' '/content/' 

time: 3 s (started: 2022-05-05 04:38:00 +00:00)


In [19]:
# Copy the kitchen_02 scene file from Drive into /content.
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/kitchen_02.h5' '/content/' 


time: 4.34 s (started: 2022-05-05 04:38:03 +00:00)


In [20]:
# File name of the trained scene; the evaluation cells pass it as both 'scene_name' and 'h5_file_path'.
trained_scene = 'bedroom_04.h5'

time: 1.01 ms (started: 2022-05-05 04:38:07 +00:00)


In [21]:
# Scenes (keys of TASK_LIST) used by the cross-scene generalization evaluation.
other_scenes = ['kitchen_02', 'bathroom_02', 'living_room_08']

time: 1.1 ms (started: 2022-05-05 04:38:08 +00:00)


## Evaluation on trained targets in the scene

In [22]:
# Evaluate the greedy policy on the targets of the trained scene listed in TASK_LIST.

model_load.to(device_name)
model_load.eval()

target_list = TASK_LIST['bedroom_04']

# Step budget per episode. The check below runs before the step counter is
# incremented, so an episode that never reaches the target reports
# max_steps + 2 steps.
max_steps = int(1e4)

total_episode_len_scene_tr = []
total_episode_reward_scene_tr = []

for target_s in target_list:

    target_s_int = int(target_s)

    eval_env = THORDiscreteEnvironment({
        'random_start': True,
        'scene_name': trained_scene,
        'h5_file_path': '%s'%trained_scene,
        'terminal_state_id' : target_s_int
    })

    print()
    print('%s'%trained_scene)
    print(f'terminal state id: {eval_env.terminal_state_id}')

    total_episode_len_eps_tr = np.zeros(NUM_EVAL_EPISODES)
    total_episode_reward_eps_tr = np.zeros(NUM_EVAL_EPISODES)

    for episode in range(NUM_EVAL_EPISODES):

        eval_env.reset()

        actions = []
        inital_state_id = eval_env.current_state_id
        inital_state_to_target_dist = eval_env.shortest_path_distances[inital_state_id][eval_env.terminal_state_id]

        episode_length = 0
        episode_reward = 0
        terminal = False

        while not terminal:

            # flatten the 2048 x 4 state / target embeddings into 8192-vectors
            torch_s_t = torch.from_numpy(eval_env.s_t).float().flatten().to(device_name)
            torch_target = torch.from_numpy(eval_env.target).float().flatten().to(device_name)

            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                actions_prob, value = model_load(torch_s_t, torch_target)

            # choose best action; keep a plain int so no device tensors pile up in `actions`
            action = int(torch.argmax(actions_prob))
            actions.append(action)

            # process game
            eval_env.step(action)
            terminal = eval_env.terminal

            # ad-hoc reward for navigation (replaces the environment's own reward)
            reward = 10.0 if terminal else -0.01
            if episode_length > max_steps: terminal = True

            episode_reward += reward
            episode_length += 1

            # s_t1 -> s_t
            eval_env.update()

        final_state_to_target_dist = eval_env.shortest_path_distances[eval_env.current_state_id][eval_env.terminal_state_id]
        print(f'\n\t\tnum_iter : {episode}\n\t\tepisode_length : {episode_length}')
        print(f'\t\tepisode_reward : {episode_reward}')
        print(f'\t\tinital state to target state shortest dist: {inital_state_to_target_dist}')
        print(f'\t\tfinal state to target state shortest dist: {final_state_to_target_dist}')

        total_episode_len_eps_tr[episode] = episode_length
        total_episode_reward_eps_tr[episode] = episode_reward

    # Release this target's HDF5 handle before the next environment is opened.
    eval_env.h5_file.close()

    print()
    print(f'\taverage episode length = {total_episode_len_eps_tr.mean()}')
    print(f'\taverage episode reward = {total_episode_reward_eps_tr.mean()}')
    total_episode_len_scene_tr.append(total_episode_len_eps_tr.mean().item())
    total_episode_reward_scene_tr.append(total_episode_reward_eps_tr.mean().item())

print()
print(f'average scene length = {np.array(total_episode_len_scene_tr).mean()}')
print(f'average scene reward = {np.array(total_episode_reward_scene_tr).mean()}')


    


bedroom_04.h5
terminal state id: 134

		num_iter : 0
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 9
		final state to target state shortest dist: 9

		num_iter : 1
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 9
		final state to target state shortest dist: 9

		num_iter : 2
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 5
		final state to target state shortest dist: 5

		num_iter : 3
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 13
		final state to target state shortest dist: 13

		num_iter : 4
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 17
		final state to target state shortest dist: 17

	average episode length = 10002.0
	average episode reward = -100.02000000001426

b

#### Observe that the agent keeps rotating in place at its initial position, without any forward or backward movement

Action space

0 : Move forward

1 : Rotate right

2 : Rotate left

3 : Move backward

In [34]:
# Trace one greedy episode on the first trained target, printing every step,
# to show that the agent rotates about a fixed position.

from pprint import pprint

model_load.to(device_name)
model_load.eval()

target_list = TASK_LIST['bedroom_04']

total_episode_len_scene_tr = []
total_episode_reward_scene_tr = []

num_trace_episodes = 1
# Step budget for the trace. The check below runs before the step counter is
# incremented, so an unsuccessful episode reports max_steps + 2 steps.
max_steps = int(1e2)

# Only the first target is traced.
for target_s in target_list[:1]:

    target_s_int = int(target_s)

    eval_env = THORDiscreteEnvironment({
        'random_start': True,
        'scene_name': trained_scene,
        'h5_file_path': '%s'%trained_scene,
        'terminal_state_id' : target_s_int
    })

    print()
    print('%s'%trained_scene)
    print(f'terminal state id: {eval_env.terminal_state_id}')

    total_episode_len_eps_tr = np.zeros(num_trace_episodes)
    total_episode_reward_eps_tr = np.zeros(num_trace_episodes)

    for episode in range(num_trace_episodes):

        eval_env.reset()

        actions = []
        inital_state_id = eval_env.current_state_id
        inital_state_to_target_dist = eval_env.shortest_path_distances[inital_state_id][eval_env.terminal_state_id]

        episode_length = 0
        episode_reward = 0
        terminal = False

        while not terminal:

            print()
            print(f'current state before action: {eval_env.current_state_id}')

            # flatten the 2048 x 4 state / target embeddings into 8192-vectors
            torch_s_t = torch.from_numpy(eval_env.s_t).float().flatten().to(device_name)
            torch_target = torch.from_numpy(eval_env.target).float().flatten().to(device_name)

            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                actions_prob, value = model_load(torch_s_t, torch_target)

            # choose best action as a plain Python int
            action = int(torch.argmax(actions_prob))
            actions.append(action)

            # process game
            eval_env.step(action)
            terminal = eval_env.terminal

            # ad-hoc reward for navigation (replaces the environment's own reward)
            reward = 10.0 if terminal else -0.01
            if episode_length > max_steps: terminal = True

            episode_reward += reward
            episode_length += 1

            # s_t1 -> s_t
            eval_env.update()

            print(f'current action: {action}')
            print(f'agent current state id after action:{eval_env.current_state_id}')
            print(f'agent current position x {eval_env.x}')
            print(f'agent current position z {eval_env.z}')
            print(f'agent current orientation {eval_env.r}')

        final_state_to_target_dist = eval_env.shortest_path_distances[eval_env.current_state_id][eval_env.terminal_state_id]
        print(f'\nnum_iter : {episode}\nepisode_length : {episode_length}')
        print(f'episode_reward : {episode_reward}')
        print(f'inital state to target state shortest dist: {inital_state_to_target_dist}')
        print(f'final state to target state shortest dist: {final_state_to_target_dist}')

        total_episode_len_eps_tr[episode] = episode_length
        total_episode_reward_eps_tr[episode] = episode_reward

    # Release the HDF5 handle once the trace is finished.
    eval_env.h5_file.close()

    print()
    print('first 10 actions:')
    pprint(actions[:10])
    print()
    print('last 10 actions:')
    # [-10:] includes the final action; [-10:-1] would show only nine.
    pprint(actions[-10:])
    


bedroom_04.h5
terminal state id: 134

current state before action: 348
current action: 1
agent current state id after action:349
agent current position x 8.0
agent current position z 6.0
agent current orientation 90

current state before action: 349
current action: 1
agent current state id after action:350
agent current position x 8.0
agent current position z 6.0
agent current orientation 180

current state before action: 350
current action: 1
agent current state id after action:351
agent current position x 8.0
agent current position z 6.0
agent current orientation 270

current state before action: 351
current action: 1
agent current state id after action:348
agent current position x 8.0
agent current position z 6.0
agent current orientation 0

current state before action: 348
current action: 1
agent current state id after action:349
agent current position x 8.0
agent current position z 6.0
agent current orientation 90

current state before action: 349
current action: 1
agent current 

## Evaluation on targets different from trained targets in trained scene

In [35]:
## Build an environment for the trained scene with location 265 as the target
scene_name = 'bedroom_04'
h5_file_name = f'{scene_name}.h5'
print(h5_file_name)

env_config = dict(
    random_start=True,
    scene_name=scene_name,
    h5_file_path=h5_file_name,
    terminal_state_id=265,
)
eval_env = THORDiscreteEnvironment(env_config)

print(eval_env.scene_name)
print(eval_env.terminal_state_id)

bedroom_04.h5
bedroom_04
265
time: 12.1 ms (started: 2022-05-05 04:53:24 +00:00)


In [36]:
# Evaluate on untrained targets in the trained scene: for each of the first
# three trained targets, take the first location found at shortest-path
# distance 1 and at distance 2 from it.

print(eval_env.scene_name)
print(eval_env.terminal_state_id)

model_load.to(device_name)
model_load.eval()

nearest_dist = [1, 2]
target_list = TASK_LIST['bedroom_04'][:3] # check for 3 targets

# Step budget per episode. The check below runs before the step counter is
# incremented, so an unsuccessful episode reports max_steps + 2 steps.
max_steps = int(1e4)

# trained target id (str) -> nearby location ids, and the distance of each one
nearest_targets_dict = {}
nearest_target_dists = {}

for target in target_list:

    # One environment per trained target is enough to read the scene's distance table.
    probe_env = THORDiscreteEnvironment({
        'random_start': True,
        'scene_name': trained_scene,
        'h5_file_path': '%s'%trained_scene,
        'terminal_state_id' : int(target)
    })
    dists_from_target = probe_env.shortest_path_distances[int(target)]
    probe_env.h5_file.close()

    nearest_targets_dict[target] = []
    nearest_target_dists[target] = []

    for n_dist in nearest_dist:
        matches = np.flatnonzero(dists_from_target == n_dist)
        if matches.size:
            nearest_targets_dict[target].append(int(matches[0]))
            nearest_target_dists[target].append(n_dist)


avg_len_dict = {n_dist: [] for n_dist in nearest_dist}
avg_reward_dict = {n_dist: [] for n_dist in nearest_dist}

# Initialised here so the cell does not depend on lists left behind by earlier cells.
total_episode_len_scene_tr = []
total_episode_reward_scene_tr = []

for t in nearest_targets_dict.keys():

    # Pair every nearby target with the distance it was found at, so results
    # land in the right bucket even when one distance has no match.
    for n_dist, target_s in zip(nearest_target_dists[t], nearest_targets_dict[t]):

        target_s_int = int(target_s)

        eval_env = THORDiscreteEnvironment({
            'random_start': True,
            'scene_name': trained_scene,
            'h5_file_path': '%s'%trained_scene,
            'terminal_state_id' : target_s_int
        })

        print()
        print('%s'%trained_scene)
        print(f'terminal state id: {eval_env.terminal_state_id}')

        total_episode_len_eps_tr = np.zeros(NUM_EVAL_EPISODES)
        total_episode_reward_eps_tr = np.zeros(NUM_EVAL_EPISODES)

        for episode in range(NUM_EVAL_EPISODES):

            eval_env.reset()

            actions = []
            inital_state_id = eval_env.current_state_id
            inital_state_to_target_dist = eval_env.shortest_path_distances[inital_state_id][eval_env.terminal_state_id]

            episode_length = 0
            episode_reward = 0
            terminal = False

            while not terminal:

                # flatten the 2048 x 4 state / target embeddings into 8192-vectors
                torch_s_t = torch.from_numpy(eval_env.s_t).float().flatten().to(device_name)
                torch_target = torch.from_numpy(eval_env.target).float().flatten().to(device_name)

                # Inference only: skip building the autograd graph.
                with torch.no_grad():
                    actions_prob, value = model_load(torch_s_t, torch_target)

                # choose best action as a plain Python int
                action = int(torch.argmax(actions_prob))
                actions.append(action)

                # process game
                eval_env.step(action)
                terminal = eval_env.terminal

                # ad-hoc reward for navigation (replaces the environment's own reward)
                reward = 10.0 if terminal else -0.01
                if episode_length > max_steps: terminal = True

                episode_reward += reward
                episode_length += 1

                # s_t1 -> s_t
                eval_env.update()

            final_state_to_target_dist = eval_env.shortest_path_distances[eval_env.current_state_id][eval_env.terminal_state_id]
            print(f'\n\t\tnum_iter : {episode}\n\t\tepisode_length : {episode_length}')
            print(f'\t\tepisode_reward : {episode_reward}')
            print(f'\t\tinital state to target state shortest dist: {inital_state_to_target_dist}')
            print(f'\t\tfinal state to target state shortest dist: {final_state_to_target_dist}')

            total_episode_len_eps_tr[episode] = episode_length
            total_episode_reward_eps_tr[episode] = episode_reward

        # Release this target's HDF5 handle before the next environment is opened.
        eval_env.h5_file.close()

        print()
        print(f'\taverage episode length = {total_episode_len_eps_tr.mean()}')
        print(f'\taverage episode reward = {total_episode_reward_eps_tr.mean()}')
        total_episode_len_scene_tr.append(total_episode_len_eps_tr.mean().item())
        total_episode_reward_scene_tr.append(total_episode_reward_eps_tr.mean().item())

        avg_len_dict[n_dist].append(total_episode_len_eps_tr.mean())
        avg_reward_dict[n_dist].append(total_episode_reward_eps_tr.mean())


print()

for k in avg_len_dict.keys():
    print(f'dist from target : {k}')
    print(f'average episode length = {np.array(avg_len_dict[k]).mean()}')
    print(f'average episode reward = {np.array(avg_reward_dict[k]).mean()}')


bedroom_04
265

bedroom_04.h5
terminal state id: 133

		num_iter : 0
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 8
		final state to target state shortest dist: 8

		num_iter : 1
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 7
		final state to target state shortest dist: 7

		num_iter : 2
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 3
		final state to target state shortest dist: 3

		num_iter : 3
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 5
		final state to target state shortest dist: 5

		num_iter : 4
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 2
		final state to target state shortest dist: 4

	average episode length = 10002.0
	average episode reward = -100.020000

## Evaluation on targets in different scenes

In [37]:
# Cross-scene generalization: evaluate the greedy policy on the TASK_LIST
# targets of scenes it was not trained on.

model_load.to(device_name)
model_load.eval()

# Step budget per episode. The check below runs before the step counter is
# incremented, so an unsuccessful episode reports max_steps + 2 steps.
max_steps = int(1e4)

avg_len_scene_dict = {}
avg_reward_scene_dict = {}

for s in other_scenes:
    target_list = TASK_LIST[s]
    print(s)

    avg_len_scene_dict[s] = []
    avg_reward_scene_dict[s] = []

    # per-target mean episode length / reward for this scene
    total_episode_len_scene_tr = []
    total_episode_reward_scene_tr = []

    for target_s in target_list:

        target_s_int = int(target_s)

        eval_env = THORDiscreteEnvironment({
            'random_start': True,
            'scene_name': s,
            'h5_file_path': '%s.h5'%s,
            'terminal_state_id' : target_s_int
        })

        print()
        print('%s'%s)
        print(f'terminal state id: {eval_env.terminal_state_id}')

        total_episode_len_eps_tr = np.zeros(NUM_EVAL_EPISODES)
        total_episode_reward_eps_tr = np.zeros(NUM_EVAL_EPISODES)

        for episode in range(NUM_EVAL_EPISODES):

            eval_env.reset()

            actions = []
            inital_state_id = eval_env.current_state_id
            inital_state_to_target_dist = eval_env.shortest_path_distances[inital_state_id][eval_env.terminal_state_id]

            episode_length = 0
            episode_reward = 0
            terminal = False

            while not terminal:

                # flatten the 2048 x 4 state / target embeddings into 8192-vectors
                torch_s_t = torch.from_numpy(eval_env.s_t).float().flatten().to(device_name)
                torch_target = torch.from_numpy(eval_env.target).float().flatten().to(device_name)

                # Inference only: skip building the autograd graph.
                with torch.no_grad():
                    actions_prob, value = model_load(torch_s_t, torch_target)

                # choose best action as a plain Python int
                action = int(torch.argmax(actions_prob))
                actions.append(action)

                # process game
                eval_env.step(action)
                terminal = eval_env.terminal

                # ad-hoc reward for navigation (replaces the environment's own reward)
                reward = 10.0 if terminal else -0.01
                if episode_length > max_steps: terminal = True

                episode_reward += reward
                episode_length += 1

                # s_t1 -> s_t
                eval_env.update()

            final_state_to_target_dist = eval_env.shortest_path_distances[eval_env.current_state_id][eval_env.terminal_state_id]
            print(f'\n\t\tnum_iter : {episode}\n\t\tepisode_length : {episode_length}')
            print(f'\t\tepisode_reward : {episode_reward}')
            print(f'\t\tinital state to target state shortest dist: {inital_state_to_target_dist}')
            print(f'\t\tfinal state to target state shortest dist: {final_state_to_target_dist}')

            total_episode_len_eps_tr[episode] = episode_length
            total_episode_reward_eps_tr[episode] = episode_reward

        # Release this target's HDF5 handle before the next environment is opened.
        eval_env.h5_file.close()

        print()
        print(f'\taverage episode length = {total_episode_len_eps_tr.mean()}')
        print(f'\taverage episode reward = {total_episode_reward_eps_tr.mean()}')
        total_episode_len_scene_tr.append(total_episode_len_eps_tr.mean().item())
        total_episode_reward_scene_tr.append(total_episode_reward_eps_tr.mean().item())

    # Use every target's mean, not just the last target's, for the scene average.
    avg_len_scene_dict[s].extend(total_episode_len_scene_tr)
    avg_reward_scene_dict[s].extend(total_episode_reward_scene_tr)


for s in avg_len_scene_dict.keys():
    print()
    print(f'scene name:{s}')
    print(f'average scene length = {np.array(avg_len_scene_dict[s]).mean()}')
    print(f'average scene reward = {np.array(avg_reward_scene_dict[s]).mean()}')
    print()

kitchen_02

kitchen_02
terminal state id: 90

		num_iter : 0
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 18
		final state to target state shortest dist: 18

		num_iter : 1
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 4
		final state to target state shortest dist: 4

		num_iter : 2
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 13
		final state to target state shortest dist: 13

		num_iter : 3
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 17
		final state to target state shortest dist: 17

		num_iter : 4
		episode_length : 10002
		episode_reward : -100.02000000001426
		inital state to target state shortest dist: 15
		final state to target state shortest dist: 15

	average episode length = 10002.0
	average episode reward = -100.020000