In [1]:
%%capture
!pip install ipython-autotime

%load_ext autotime

time: 2.7 ms (started: 2022-05-06 23:12:08 +00:00)


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive
time: 18.4 s (started: 2022-05-06 23:12:08 +00:00)


In [3]:
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/data/' '/content/'

time: 1.74 s (started: 2022-05-06 23:12:27 +00:00)


In [4]:
!rm -r '/content/Data'

rm: cannot remove '/content/Data': No such file or directory
time: 111 ms (started: 2022-05-06 23:12:29 +00:00)


In [5]:
import torch
from torch import nn
import torch.nn.functional as F

from tqdm import tqdm

import matplotlib.pyplot as plt


## scene loader 
import sys
import h5py
import json
import numpy as np
import random
import skimage.io
from skimage.transform import resize

import os

time: 5.04 s (started: 2022-05-06 23:12:29 +00:00)


In [6]:
def seed_everything(seed=42):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False
  np.random.seed(seed)

  return

seed_everything()

time: 10.6 ms (started: 2022-05-06 23:12:34 +00:00)


In [7]:
## Constants.py

# -*- coding: utf-8 -*-

LOCAL_T_MAX = 5 # repeat step size
RMSP_ALPHA = 0.99 # decay parameter for RMSProp
RMSP_EPSILON = 0.1 # epsilon parameter for RMSProp
CHECKPOINT_DIR = 'checkpoints'
LOG_FILE = 'logs'
INITIAL_ALPHA_LOW = 1e-4    # log_uniform low limit for learning rate
INITIAL_ALPHA_HIGH = 1e-2   # log_uniform high limit for learning rate

PARALLEL_SIZE = 20 # parallel thread size
ACTION_SIZE = 4 # action size

INITIAL_ALPHA_LOG_RATE = 0.4226 # log_uniform interpolate rate for learning rate (around 7 * 10^-4)
GAMMA = 0.99 # discount factor for rewards
ENTROPY_BETA = 0.01 # entropy regurarlization constant
MAX_TIME_STEP = 10.0 * 10**6 # 10 million frames
GRAD_NORM_CLIP = 40.0 # gradient norm clipping
USE_GPU = True # To use GPU, set True
VERBOSE = True

SCREEN_WIDTH = 84
SCREEN_HEIGHT = 84
HISTORY_LENGTH = 4

NUM_EVAL_EPISODES = 5 # number of episodes for evaluation

TASK_TYPE = 'navigation' # no need to change
# keys are scene names, and values are a list of location ids (navigation targets)
TASK_LIST = {
  'bathroom_02'    : ['26', '37', '43', '53', '69'],
  'bedroom_04'     : ['134', '264', '320', '384', '387'],
  'kitchen_02'     : ['90', '136', '157', '207', '329'],
  'living_room_08' : ['92', '135', '193', '228', '254']
}


time: 22.1 ms (started: 2022-05-06 23:12:34 +00:00)


In [8]:
## Scene Loader

# -*- coding: utf-8 -*-
class THORDiscreteEnvironment(object):

  def __init__(self, config=dict()):

    # configurations
    self.scene_name          = config.get('scene_name', 'bedroom_04')
    self.random_start        = config.get('random_start', True)
    self.n_feat_per_locaiton = config.get('n_feat_per_locaiton', 1) # 1 for no sampling
    self.terminal_state_id   = config.get('terminal_state_id', 0)

    self.h5_file_path = config.get('h5_file_path', 'data/%s.h5'%self.scene_name)
    self.h5_file      = h5py.File(self.h5_file_path, 'r')

    self.locations   = self.h5_file['location'][()]
    self.rotations   = self.h5_file['rotation'][()]
    self.n_locations = self.locations.shape[0]

    self.terminals = np.zeros(self.n_locations)
    self.terminals[self.terminal_state_id] = 1
    self.terminal_states, = np.where(self.terminals)

    self.transition_graph = self.h5_file['graph'][()]
    self.shortest_path_distances = self.h5_file['shortest_path_distance'][()]

    self.history_length = HISTORY_LENGTH
    self.screen_height  = SCREEN_HEIGHT
    self.screen_width   = SCREEN_WIDTH

    # we use pre-computed fc7 features from ResNet-50
    # self.s_t = np.zeros([self.screen_height, self.screen_width, self.history_length])
    self.s_t      = np.zeros([2048, self.history_length])
    self.s_t1     = np.zeros_like(self.s_t)
    self.s_target = self._tiled_state(self.terminal_state_id)

    self.reset()

  # public methods

  def reset(self):
    # randomize initial state
    while True:
      k = random.randrange(self.n_locations)
      min_d = np.inf
      # check if target is reachable
      for t_state in self.terminal_states:
        dist = self.shortest_path_distances[k][t_state]
        min_d = min(min_d, dist)
      # min_d = 0  if k is a terminal state
      # min_d = -1 if no terminal state is reachable from k
      if min_d > 0: break

    # reset parameters
    self.current_state_id = k
    self.s_t = self._tiled_state(self.current_state_id)

    self.reward   = 0
    self.collided = False
    self.terminal = False

  def step(self, action):
    assert not self.terminal, 'step() called in terminal state'
    k = self.current_state_id
    if self.transition_graph[k][action] != -1:
      self.current_state_id = self.transition_graph[k][action]
      if self.terminals[self.current_state_id]:
        self.terminal = True
        self.collided = False
      else:
        self.terminal = False
        self.collided = False
    else:
      self.terminal = False
      self.collided = True

    self.reward = self._reward(self.terminal, self.collided)
    self.s_t1 = np.append(self.s_t[:,1:], self.state, axis=1)

  def update(self):
    self.s_t = self.s_t1

  # private methods

  def _tiled_state(self, state_id):
    k = random.randrange(self.n_feat_per_locaiton)
    f = self.h5_file['resnet_feature'][state_id][k][:,np.newaxis]
    return np.tile(f, (1, self.history_length))

  def _reward(self, terminal, collided):
    # positive reward upon task completion
    if terminal: return 10.0
    # time penalty or collision penalty
    return -0.1 if collided else -0.01

  # properties

  @property
  def action_size(self):
    # move forward/backward, turn left/right for navigation
    return ACTION_SIZE 

  @property
  def action_definitions(self):
    action_vocab = ["MoveForward", "RotateRight", "RotateLeft", "MoveBackward"]
    return action_vocab[:ACTION_SIZE]

  @property
  def observation(self):
    return self.h5_file['observation'][self.current_state_id]

  @property
  def state(self):
    # read from hdf5 cache
    k = random.randrange(self.n_feat_per_locaiton)
    return self.h5_file['resnet_feature'][self.current_state_id][k][:,np.newaxis]

  @property
  def target(self):
    return self.s_target

  @property
  def x(self):
    return self.locations[self.current_state_id][0]

  @property
  def z(self):
    return self.locations[self.current_state_id][1]

  @property
  def r(self):
    return self.rotations[self.current_state_id]




time: 236 ms (started: 2022-05-06 23:12:34 +00:00)


In [9]:
## load scene into environment
scene_name = 'bedroom_04'

env = THORDiscreteEnvironment({
'random_start': True,
'scene_name': scene_name,
'h5_file_path': 'data/%s.h5'%scene_name,
'terminal_state_id' : 134
})

print(env.scene_name)
print(env.terminal_state_id)

bedroom_04
134
time: 18.8 ms (started: 2022-05-06 23:12:34 +00:00)


In [10]:
## Constants.py

# -*- coding: utf-8 -*-

LOCAL_T_MAX = 5 # repeat step size
RMSP_ALPHA = 0.99 # decay parameter for RMSProp
RMSP_EPSILON = 0.1 # epsilon parameter for RMSProp
LEARNING_RATE = 1e-2
GAMMA = 0.99 # discount factor for rewards
ENTROPY_BETA = 0.01 # entropy regurarlization constant

SCREEN_WIDTH = 84
SCREEN_HEIGHT = 84
HISTORY_LENGTH = 4

ACTION_SIZE=4

# keys are scene names, and values are a list of location ids (navigation targets)
TASK_LIST = {
  'bathroom_02'    : ['26', '37', '43', '53', '69'],
  'bedroom_04'     : ['134', '264', '320', '384', '387'],
  'kitchen_02'     : ['90', '136', '157', '207', '329'],
  'living_room_08' : ['92', '135', '193', '228', '254']
}


time: 11.7 ms (started: 2022-05-06 23:12:34 +00:00)


In [12]:
class MyActorCritic(nn.Module):

    def __init__(self):
        super(MyActorCritic, self).__init__()

        # fully connected layer 1

        # generic siamese layer: <start>
        self.fc1 = nn.Linear(in_features=8192, out_features=512)
        self.fc2 = nn.Linear(in_features=1024, out_features=512)
        # generic siamese layer: <end>

        # Scene specific layer: <start> 
        # for now we have only one scene speicific layer and only one thread i.e only one target in one scene
        self.fc3 = nn.Linear(in_features=512, out_features=512)

        self.actions_fc = nn.Linear(in_features=512, out_features=4)
        self.value_fc  = nn.Linear(in_features=512, out_features=1)
        # Scene specific layer: <end>


    def forward(self, input_image_embedding, target_image_embedding):
        
        # generic siamese latyer
        x1 = F.relu(self.fc1(input_image_embedding))
        x2 = F.relu(self.fc1(target_image_embedding))

        global c_nan
        global t_img_embedding

        x_combined = torch.cat((x1, x2)) # combine input image emb and target image emb, may need to add axis for concat later

        x = F.relu(self.fc2(x_combined))

        

        # scene specific layer

        x = F.relu(self.fc3(x))
        x1 = self.actions_fc(x)
        # print(f' actions values before softmax:\n{x1}\n')
        actions_prob = F.softmax(x1, dim=0)
        value = self.value_fc(x)

        return actions_prob, value




time: 24.1 ms (started: 2022-05-06 23:12:39 +00:00)


In [13]:
zip_file_name_open = 'h_eval_working.zip'

time: 818 µs (started: 2022-05-06 23:12:39 +00:00)


In [14]:
scene_filename = 'bedroom_04.h5'

time: 810 µs (started: 2022-05-06 23:12:39 +00:00)


In [15]:
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/{zip_file_name_open}' '/content/' 

time: 1.13 s (started: 2022-05-06 23:12:40 +00:00)


In [16]:
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/{scene_filename}' '/content/' 

time: 1.43 s (started: 2022-05-06 23:12:41 +00:00)


In [17]:
%%capture
!unzip {zip_file_name_open}

time: 2.02 s (started: 2022-05-06 23:12:42 +00:00)


In [18]:
model_orig = torch.load('/content/visual-navigation-agent-pytorch-master/checkpoint-1600000.pth')

time: 38.4 ms (started: 2022-05-06 23:12:44 +00:00)


##Load checkpoint model

In [19]:
model_load = MyActorCritic()
with torch.no_grad():
    model_load.fc1.weight.copy_(model_orig['navigation']['fc_siemense.weight'])
    model_load.fc1.bias.copy_(model_orig['navigation']['fc_siemense.bias'])
    model_load.fc2.weight.copy_(model_orig['navigation']['fc_merge.weight'])
    model_load.fc2.bias.copy_(model_orig['navigation']['fc_merge.bias'])
    model_load.fc3.weight.copy_(model_orig['navigation/bedroom_04']['fc1.weight'])
    model_load.fc3.bias.copy_(model_orig['navigation/bedroom_04']['fc1.bias'])
    model_load.actions_fc.weight.copy_(model_orig['navigation/bedroom_04']['fc2_policy.weight'])
    model_load.actions_fc.bias.copy_(model_orig['navigation/bedroom_04']['fc2_policy.bias'])
    model_load.value_fc.weight.copy_(model_orig['navigation/bedroom_04']['fc2_value.weight'])
    model_load.value_fc.bias.copy_(model_orig['navigation/bedroom_04']['fc2_value.bias'])


time: 63.3 ms (started: 2022-05-06 23:12:44 +00:00)


In [20]:
num_epochs = 100000

model = MyActorCritic()
lr = LEARNING_RATE
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, alpha=RMSP_ALPHA, eps=RMSP_EPSILON)
device_name = 'cuda:0'
# device_name = 'cpu'

max_local_timestep = LOCAL_T_MAX
entropy_beta = ENTROPY_BETA

torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f97ef7c5f10>

time: 53.6 ms (started: 2022-05-06 23:12:45 +00:00)


## Train Model

In [21]:
model_load.to(device_name)

for epoch in tqdm(range(num_epochs)):

    epoch_loss = 0
    # model.double()
    model_load.train()

    env.reset()

    states = []
    actions = []
    policy = []
    rewards = []
    values = []
    targets = []

    terminal_reached = False

    episode_length = 0
    episode_reward = 0
    local_t = 0

    for t in range(max_local_timestep):

        torch_s_t = torch.from_numpy(env.s_t).float().flatten()  # state embedding in torch convertinng from 2048 x 4, to 8192 x 1
        torch_target = torch.from_numpy(env.target).float().flatten()  # target embedding in torch

        torch_s_t = torch_s_t.to(device_name)
        torch_target = torch_target.to(device_name)

        actions_prob, value = model_load(torch_s_t, torch_target)

        action = None
        with torch.no_grad():
            action = torch.multinomial(actions_prob, 1).item()  # sample action according to the probability returned by the network


        # store required items
        states.append(env.s_t)
        policy.append(actions_prob)
        actions.append(action)
        values.append(value)
        targets.append(env.target)

        # process game
        env.step(action)

        # receive game result
        reward = env.reward
        terminal = env.terminal

        # ad-hoc reward for navigation
        reward = 10.0 if terminal else -0.01
        if episode_length > 5e3: terminal = True

        episode_reward += reward
        episode_length += 1
        # episode_max_q = max(episode_max_q, np.max(value_)) !!!! DO Q VALUE CLIPPING LATER !!!!!!!

        # clip and append reward
        rewards.append(np.clip(reward, -1, 1))

        local_t += 1

        # s_t1 -> s_t
        env.update()

        if terminal:
            terminal_end = True
            # sys.stdout.write("time %d | thread #%d | scene %s | target #%s\n%s %s episode reward = %.3f\n%s %s episode length = %d\n%s %s episode max Q  = %.3f\n" % (global_t, self.thread_index, self.scene_scope, self.task_scope, self.scene_scope, self.task_scope, self.episode_reward, self.scene_scope, self.task_scope, self.episode_length, self.scene_scope, self.task_scope, self.episode_max_q))

            # summary_values = {
            # "episode_reward_input": episode_reward,
            # "episode_length_input": float(episode_length),
            # # "episode_max_q_input": episode_max_q,
            # # "learning_rate_input": _anneal_learning_rate(global_t)
            # }

            # self._record_score(sess, summary_writer, summary_op, summary_placeholders,
            #                 summary_values, global_t)
            # self.episode_reward = 0
            # self.episode_length = 0
            # self.episode_max_q = -np.inf
            # self.env.reset()
            # TODO: add logging
            # print('playout finished')
            # print(f'episode length: {episode_length}')
            # print(f'episode reward: {episode_reward}')
            # print(f'episode max_q: {episode_max_q}')


            break

    # writer.add_scalar("train/episode_reward", episode_reward, epoch)
    # writer.add_scalar("train/episode_length", episode_length, epoch)

    torch_s_t = torch.from_numpy(env.s_t).float().flatten()  # state embedding in torch
    torch_target = torch.from_numpy(env.target).float().flatten()  # target embedding in torch

    torch_s_t = torch_s_t.to(device_name)
    torch_target = torch_target.to(device_name)

    next_state_value = 0.0
    if not terminal:
        _, next_state_value = model_load(torch_s_t, torch_target)
        next_state_value = next_state_value.item()

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()
    policy.reverse()

    batch_si = []
    batch_policy = []
    batch_a = []
    batch_td = []
    batch_R = []
    batch_t = []
    batch_values = []

    # compute and accmulate gradients
    for (pi, ai, ri, si, Vi, ti) in zip(policy, actions, rewards, states, values, targets):
        
        next_state_value = ri + GAMMA * next_state_value
        td = next_state_value - Vi
        a = np.zeros([ACTION_SIZE])
        a[ai] = 1

        batch_si.append(si)
        batch_policy.append(pi)
        batch_a.append(a)
        batch_td.append(td)
        batch_R.append(next_state_value)
        batch_t.append(ti)
        batch_values.append(Vi)

    # COMPUTE LOSS + BACKPROP
    # !!!!!!!! compute actor/policy loss !!!!!!!!!!!!
    batch_policy_tensor = torch.stack(batch_policy, 0)

    batch_policy_tensor = torch.clip(batch_policy_tensor, 1e-15, 1.0)
    policy_log_prob = torch.log(batch_policy_tensor)
    policy_entropy = - torch.sum(batch_policy_tensor * policy_log_prob, axis=1)


    batch_a_tensor = torch.from_numpy(np.array(batch_a, dtype=np.int64)).to(device_name)
    nllLoss = F.nll_loss(policy_log_prob, torch.argmax(batch_a_tensor, dim=1), reduce=False)

    batch_td_tensor = torch.stack(batch_td)
    policy_loss = nllLoss * batch_td_tensor - policy_entropy * entropy_beta
    policy_loss = policy_loss.sum()

    # !!!!!!!!!!!! Value loss !!!!!!!!!!!!!!
    # learning rate for critic is half of actor's
    # Equivalent to 0.5 * l2 loss
    r_tensor = torch.from_numpy(np.array(batch_R, dtype=np.float32)).to(device_name)
    value_loss = (0.5 * 0.5) * F.mse_loss(torch.stack(batch_values), r_tensor, size_average=False)
    # value_loss = (0.5 * 0.5) * F.mse_loss(torch.stack(batch_values), r_tensor)

    total_loss = value_loss + policy_loss

    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()


    # writer.add_scalar("train/value_loss", value_loss.cpu().detach().item(), epoch)
    # writer.add_scalar("train/policy_loss", policy_loss.cpu().detach().item(), epoch)
    # writer.add_scalar("train/total_loss", total_loss.cpu().detach().item(), epoch)


    # #                {"epoch","train_episode_reward":[], "train_episode_length":[], "train_value_loss":[], "train_policy_loss":[], "train_total_loss":[]}
    # log_df.loc[len(log_df.index)] = [epoch, episode_reward, episode_length, value_loss.cpu().detach().item(), policy_loss.cpu().detach().item(), total_loss.cpu().detach().item()] # add row to df

100%|██████████| 100000/100000 [2:40:34<00:00, 10.38it/s]

time: 2h 40min 46s (started: 2022-05-06 23:12:46 +00:00)





##Saving Model

In [22]:
# save the model parameters
import time

timestr = time.strftime("%d_%m-%H_%M_%S")
save_model_filename = env.scene_name +'tf_model_wts_epoch_' + str(num_epochs) + '_' + timestr  + '.pth'
torch.save(model_load, save_model_filename)

time: 68 ms (started: 2022-05-07 01:53:40 +00:00)


In [23]:
!cp {save_model_filename} '/content/gdrive/MyDrive/ESE_650_Project/models/'

time: 159 ms (started: 2022-05-07 01:53:42 +00:00)


##Evaluation

In [24]:

# for episode in tqdm(range(NUM_EVAL_EPISODES)):
for episode in tqdm(range(5)):


    model_load.eval()
    env.reset()

    actions = []

    final_state = None
    inital_state_id = env.current_state_id

    terminal_reached = False

    episode_length = 0
    episode_reward = 0
    local_t = 0

    terminal = False
    inital_state_to_target_dist = env.shortest_path_distances[inital_state_id][env.terminal_state_id]
    

    while not terminal:

        torch_s_t = torch.from_numpy(env.s_t).float().flatten()  # state embedding in torch convertinng from 2048 x 4, to 8192 x 1
        torch_target = torch.from_numpy(env.target).float().flatten()  # target embedding in torch

        torch_s_t = torch_s_t.to(device_name)
        torch_target = torch_target.to(device_name)

        actions_prob, value = model_load(torch_s_t, torch_target)

        action = torch.argmax(actions_prob)  # choose best action

        # store required items
        actions.append(action)

        # process game
        env.step(action)

        # receive game result
        reward = env.reward
        terminal = env.terminal

        # ad-hoc reward for navigation
        reward = 10.0 if terminal else -0.01
        if episode_length > 5e3: terminal = True

        episode_reward += reward
        episode_length += 1
        # episode_max_q = max(episode_max_q, np.max(value_)) !!!! DO Q VALUE CLIPPING LATER !!!!!!!

        local_t += 1

        # s_t1 -> s_t
        env.update()

        if terminal:
            final_state = env.s_t[0]
            terminal_end = True
            break

    
    final_state_to_target_dist = env.shortest_path_distances[env.current_state_id][env.terminal_state_id]
    print(f'\nnum_iter : {episode}\nepisode_length : {episode_length}')
    print(f'inital state to target state shortest dist: {inital_state_to_target_dist}')
    # print(f'shortest dist from episode termination state to goal state: {dist}')
    print(f'final state to target state shortest dist: {final_state_to_target_dist}')
    print()

    

 20%|██        | 1/5 [00:53<03:34, 53.67s/it]


num_iter : 0
episode_length : 5002
inital state to target state shortest dist: 4
final state to target state shortest dist: 6



 60%|██████    | 3/5 [01:47<00:58, 29.19s/it]


num_iter : 1
episode_length : 5002
inital state to target state shortest dist: 18
final state to target state shortest dist: 18


num_iter : 2
episode_length : 13
inital state to target state shortest dist: 11
final state to target state shortest dist: 0



 80%|████████  | 4/5 [02:42<00:39, 39.36s/it]


num_iter : 3
episode_length : 5002
inital state to target state shortest dist: 11
final state to target state shortest dist: 7



100%|██████████| 5/5 [03:36<00:00, 43.35s/it]


num_iter : 4
episode_length : 5002
inital state to target state shortest dist: 9
final state to target state shortest dist: 7

time: 3min 36s (started: 2022-05-07 01:53:44 +00:00)



