In [None]:
%%capture
!pip install ipython-autotime

%load_ext autotime

time: 286 µs (started: 2022-04-30 20:46:33 +00:00)


In [None]:
# Mount Google Drive under /content/gdrive (Colab-only; prompts for authorisation).
# The data-copy cell below reads the project's data folder from this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive
time: 21.8 s (started: 2022-04-30 20:46:33 +00:00)


In [None]:
# !mkdir 'Data'

time: 1.32 ms (started: 2022-04-30 20:46:55 +00:00)


In [None]:
# Copy the project's `data` directory from the mounted Drive into /content, so the
# relative 'data/<scene>.h5' paths used later resolve (assumes the notebook's
# working directory is /content — TODO confirm).
!cp -r '/content/gdrive/MyDrive/ESE_650_Project/data/' '/content/'

time: 3.66 s (started: 2022-04-30 20:46:55 +00:00)


In [None]:
import torch
from torch import nn
import torch.nn.functional as F

from tqdm import tqdm

import matplotlib.pyplot as plt


## scene loader 
import sys
import h5py
import json
import numpy as np
import random
import skimage.io
from skimage.transform import resize

import os

time: 5.75 s (started: 2022-04-30 20:46:58 +00:00)


In [None]:
def set_seed(seed=None, seed_torch=True):
  """
  Seed every random number generator used by this notebook.

  Seeds Python's `random` module and NumPy's global generator; optionally also
  seeds PyTorch (CPU and all CUDA devices) and switches cuDNN to deterministic
  mode. The `random`, `np` and `torch` modules must already be imported.

  Args:
    seed : Integer
      A non-negative integer that defines the random state. When `None`
      (the default) a seed is drawn at random from [0, 2**32).
    seed_torch : Boolean
      If `True` also seeds PyTorch and disables cuDNN auto-tuning so that
      convolution algorithms are chosen deterministically. Default is `True`.

  Returns:
    Nothing. The seed that was applied is printed.
  """
  if seed is None:
    # np.random.choice returns a NumPy integer, which is not a built-in `int`;
    # random.seed() raises TypeError for such values on Python 3.11+, so cast.
    seed = int(np.random.choice(2 ** 32))
  random.seed(seed)
  np.random.seed(seed)
  if seed_torch:
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    # Trade speed for reproducibility in cuDNN.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

  print(f'Random seed {seed} has been set.')

time: 24 ms (started: 2022-04-30 20:47:04 +00:00)


In [None]:
# Fix all RNGs up front so that both training loops below start from the same random state.
SEED = 2021
set_seed(seed=SEED)

Random seed 2021 has been set.
time: 8.01 ms (started: 2022-04-30 20:47:04 +00:00)


In [None]:
## Constants.py
# Global hyper-parameters and task definitions.
# NOTE(review): several of these (e.g. USE_GPU, PARALLEL_SIZE, the RMSProp and
# learning-rate constants) are not referenced by the cells visible in this
# notebook; the training cells define their own `lr`, `entropy_beta` and
# `max_local_timestep` instead.

# -*- coding: utf-8 -*-

LOCAL_T_MAX = 5 # number of environment steps per rollout (training cells use max_local_timestep = 5)
RMSP_ALPHA = 0.99 # decay parameter for RMSProp
RMSP_EPSILON = 0.1 # epsilon parameter for RMSProp
CHECKPOINT_DIR = 'checkpoints'
LOG_FILE = 'logs'
INITIAL_ALPHA_LOW = 1e-4    # log_uniform low limit for learning rate
INITIAL_ALPHA_HIGH = 1e-2   # log_uniform high limit for learning rate

PARALLEL_SIZE = 20 # parallel thread size
ACTION_SIZE = 4 # number of discrete actions

INITIAL_ALPHA_LOG_RATE = 0.4226 # log_uniform interpolate rate for learning rate (around 7 * 10^-4)
GAMMA = 0.99 # discount factor for rewards
ENTROPY_BETA = 0.01 # entropy regularization constant (NOTE: training cells use entropy_beta = 1e-3 instead)
MAX_TIME_STEP = 10.0 * 10**6 # 10 million frames (stored as a float)
GRAD_NORM_CLIP = 40.0 # gradient norm clipping
USE_GPU = True # To use GPU, set True
VERBOSE = True

SCREEN_WIDTH = 84
SCREEN_HEIGHT = 84
HISTORY_LENGTH = 4 # number of stacked frames in the agent state

NUM_EVAL_EPISODES = 100 # number of episodes for evaluation

TASK_TYPE = 'navigation' # no need to change
# keys are scene names, and values are a list of location ids (navigation targets)
TASK_LIST = {
  'bathroom_02'    : ['26', '37', '43', '53', '69'],
  'bedroom_04'     : ['134', '264', '320', '384', '387'],
  'kitchen_02'     : ['90', '136', '157', '207', '329'],
  'living_room_08' : ['92', '135', '193', '228', '254']
}


time: 21.7 ms (started: 2022-04-30 20:47:04 +00:00)


In [None]:
## Scene Loader

# -*- coding: utf-8 -*-
class THORDiscreteEnvironment(object):
  """Discrete navigation environment replayed from a pre-computed HDF5 scene file.

  Locations, rotations, the transition graph, shortest-path distances and the
  per-location feature vectors are all read from the HDF5 file. The agent
  state `s_t` stacks the feature vectors of the last `history_length` frames
  column-wise.

  Recognised `config` keys (all optional): 'scene_name', 'random_start',
  'n_feat_per_locaiton' (sic), 'terminal_state_id' and 'h5_file_path'.

  Features and observations are read lazily, so the HDF5 file stays open for
  the lifetime of the object. Call `close()` or use the instance as a context
  manager to release the handle; `state`, `observation` and `reset()` cannot
  be used afterwards.
  """

  def __init__(self, config=None):
    """Open the scene file, cache its static arrays and reset the episode.

    Args:
      config: optional dict of settings (see the class docstring). `None`
        means "use every default".
    """
    # Avoid sharing one mutable default dict between all instances.
    config = {} if config is None else config

    # configurations
    self.scene_name          = config.get('scene_name', 'bedroom_04')
    # NOTE: stored but currently ignored by the debugging reset() below.
    self.random_start        = config.get('random_start', True)
    self.n_feat_per_locaiton = config.get('n_feat_per_locaiton', 1) # 1 for no sampling
    self.terminal_state_id   = config.get('terminal_state_id', 0)

    self.h5_file_path = config.get('h5_file_path', 'data/%s.h5'%self.scene_name)
    self.h5_file      = h5py.File(self.h5_file_path, 'r')

    self.locations   = self.h5_file['location'][()]
    self.rotations   = self.h5_file['rotation'][()]
    self.n_locations = self.locations.shape[0]

    # Indicator vector over locations: 1 marks the navigation target.
    self.terminals = np.zeros(self.n_locations)
    self.terminals[self.terminal_state_id] = 1
    self.terminal_states, = np.where(self.terminals)

    self.transition_graph = self.h5_file['graph'][()]
    self.shortest_path_distances = self.h5_file['shortest_path_distance'][()]

    self.history_length = HISTORY_LENGTH
    self.screen_height  = SCREEN_HEIGHT
    self.screen_width   = SCREEN_WIDTH

    # we use pre-computed fc7 features from ResNet-50
    # self.s_t = np.zeros([self.screen_height, self.screen_width, self.history_length])
    self.s_t      = np.zeros([2048, self.history_length])
    self.s_t1     = np.zeros_like(self.s_t)
    self.s_target = self._tiled_state(self.terminal_state_id)

    self.reset()

  # resource management

  def close(self):
    """Close the underlying HDF5 file."""
    self.h5_file.close()

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self.close()
    return False  # never swallow exceptions

  # public methods

  # Original randomized reset, kept for reference; restore it once debugging
  # is finished:
  #
  #   def reset(self):
  #     # randomize initial state
  #     while True:
  #       k = random.randrange(self.n_locations)
  #       min_d = np.inf
  #       # check if target is reachable
  #       for t_state in self.terminal_states:
  #         dist = self.shortest_path_distances[k][t_state]
  #         min_d = min(min_d, dist)
  #       # min_d = 0  if k is a terminal state
  #       # min_d = -1 if no terminal state is reachable from k
  #       if min_d > 0: break
  #
  #     # reset parameters
  #     self.current_state_id = k
  #     self.s_t = self._tiled_state(self.current_state_id)
  #
  #     self.reward   = 0
  #     self.collided = False
  #     self.terminal = False

  # FOR DEBUGGING ONLY COMMENT LATER
  def reset(self):
    """Start a new episode from the fixed location 1 (debugging version).

    The start state is pinned so that separate training loops can be compared
    on identical rollouts; `random_start` is not consulted.
    """
    k = 1
    print(self.terminal_states[0])

    # reset parameters
    self.current_state_id = k
    self.s_t = self._tiled_state(self.current_state_id)

    self.reward   = 0
    self.collided = False
    self.terminal = False

  def step(self, action):
    """Apply `action` and compute the reward and the next stacked state.

    A transition-graph entry of -1 means the move is blocked: the agent stays
    where it is and `collided` is set. The new state is written to `s_t1`;
    call `update()` to make it current.
    """
    assert not self.terminal, 'step() called in terminal state'
    k = self.current_state_id
    next_state_id = self.transition_graph[k][action]
    if next_state_id != -1:
      self.current_state_id = next_state_id
      self.terminal = bool(self.terminals[next_state_id])
      self.collided = False
    else:
      self.terminal = False
      self.collided = True

    self.reward = self._reward(self.terminal, self.collided)
    # Drop the oldest frame and append the features of the current location.
    self.s_t1 = np.append(self.s_t[:,1:], self.state, axis=1)

  def update(self):
    """Promote the state produced by the last `step()` to the current state."""
    self.s_t = self.s_t1

  # private methods

  def _tiled_state(self, state_id):
    """Return the features of `state_id` repeated `history_length` times column-wise."""
    k = random.randrange(self.n_feat_per_locaiton)
    f = self.h5_file['resnet_feature'][state_id][k][:,np.newaxis]
    return np.tile(f, (1, self.history_length))

  def _reward(self, terminal, collided):
    """Reward: +10 on reaching the target, -0.1 on collision, -0.01 per ordinary step."""
    # positive reward upon task completion
    if terminal: return 10.0
    # time penalty or collision penalty
    return -0.1 if collided else -0.01

  # properties

  @property
  def action_size(self):
    """Number of discrete actions."""
    # move forward/backward, turn left/right for navigation
    return ACTION_SIZE 

  @property
  def action_definitions(self):
    """Names of the first `ACTION_SIZE` actions."""
    action_vocab = ["MoveForward", "RotateRight", "RotateLeft", "MoveBackward"]
    return action_vocab[:ACTION_SIZE]

  @property
  def observation(self):
    """Entry of the 'observation' dataset for the current location (read from disk)."""
    return self.h5_file['observation'][self.current_state_id]

  @property
  def state(self):
    """Feature column vector of the current location (read from disk)."""
    # read from hdf5 cache
    k = random.randrange(self.n_feat_per_locaiton)
    return self.h5_file['resnet_feature'][self.current_state_id][k][:,np.newaxis]

  @property
  def target(self):
    """Stacked feature state of the target location."""
    return self.s_target

  @property
  def x(self):
    """First coordinate of the current location."""
    return self.locations[self.current_state_id][0]

  @property
  def z(self):
    """Second coordinate of the current location."""
    return self.locations[self.current_state_id][1]

  @property
  def r(self):
    """Rotation entry of the current location."""
    return self.rotations[self.current_state_id]




time: 326 ms (started: 2022-04-30 20:47:04 +00:00)


In [None]:
## load scene into environment
scene_name = 'bedroom_04'

# Build the environment for one (scene, target) pair. The constructor opens the
# HDF5 file and calls reset(), which prints the target id.
# NOTE: 'random_start' is stored by the class, but the debugging reset()
# currently in use always starts from location 1.
env = THORDiscreteEnvironment({
'random_start': True,
'scene_name': scene_name,
'h5_file_path': 'data/%s.h5'%scene_name,  # relative to the current working directory
'terminal_state_id' : 134  # navigation target (listed as '134' for bedroom_04 in TASK_LIST)
})

print(env.scene_name)
print(env.terminal_state_id)

134
bedroom_04
134
time: 12.4 ms (started: 2022-04-30 22:09:30 +00:00)


In [None]:
# Sanity check: target location id and the start location chosen by reset().
print(env.terminal_states[0])
print(env.current_state_id)

134
1
time: 1.72 ms (started: 2022-04-30 22:09:31 +00:00)


In [None]:
# Shortest-path distance entry from the start location (1) to the target (134).
env.shortest_path_distances[1][134]

15

time: 5.31 ms (started: 2022-04-30 22:09:31 +00:00)


In [None]:
# Placeholders; not referenced by any other cell visible in this notebook.
model_1_var = None
model_2_var = None

time: 1.42 ms (started: 2022-04-30 22:09:31 +00:00)


# PyTorch original

In [None]:
class A2COriginalNetwork(nn.Module):
    """Siamese actor-critic network for a single, un-batched (observation, goal) pair.

    Both inputs pass through the same embedding layer, the two embeddings are
    concatenated and fused, and a scene-specific trunk feeds a policy head
    (4 logits) and a scalar value head.
    """

    def __init__(self):
        super().__init__()

        # Layers are registered in this exact order: shared Siamese embedding,
        # fusion layer, scene-specific trunk, then the policy and value heads.
        self.fc_siemense = nn.Linear(8192, 512)
        self.fc_merge = nn.Linear(1024, 512)
        self.fc1 = nn.Linear(512, 512)
        self.fc2_policy = nn.Linear(512, 4)
        self.fc2_value = nn.Linear(512, 1)

    def forward(self, inp):
        """Return `(policy_logits, value)` for `inp = (observation, goal)`.

        Each input is flattened to a single vector, so it must contain exactly
        8192 elements. The policy output is raw logits (no softmax applied);
        the value is a 0-dim tensor.
        """
        observation, goal = inp

        # Shared-weight (Siamese) embedding of both inputs.
        observation_embedding = F.relu(self.fc_siemense(observation.view(-1)), inplace=True)
        goal_embedding = F.relu(self.fc_siemense(goal.view(-1)), inplace=True)

        # Fuse the two 512-d embeddings into one 512-d representation.
        fused = torch.cat((observation_embedding, goal_embedding), dim=0)
        fused = F.relu(self.fc_merge(fused), inplace=True)

        # Scene-specific trunk and the two heads.
        hidden = F.relu(self.fc1(fused))
        policy_logits = self.fc2_policy(hidden)
        state_value = self.fc2_value(hidden)[0]
        return (policy_logits, state_value)
   

class ActorCriticLoss(nn.Module):
    """Combined actor-critic loss: policy-gradient term, entropy bonus and value loss.

    Args:
        entropy_beta: weight of the entropy bonus subtracted from the policy loss.
        verbose: when `True` (default) print the action probabilities, the
            return targets and both loss terms on every call.
    """

    def __init__(self, entropy_beta, verbose=True):
        # Without this call the module has no hook/parameter registries and
        # invoking it as `loss_model(...)` raises AttributeError.
        super().__init__()
        self.entropy_beta = entropy_beta
        self.verbose = verbose

    def forward(self, policy, value, action_taken, temporary_difference, r):
        """Return the scalar loss for a batch of rollout steps.

        Args:
            policy: policy logits, one row per step (softmax is taken over dim 1).
            value: critic outputs, one per step.
            action_taken: int64 indices of the actions that were executed.
            temporary_difference: advantage estimates (return minus value), one per step.
            r: discounted return targets for the critic, one per step.
        """
        # Calculate policy entropy
        log_softmax_policy = F.log_softmax(policy, dim=1)
        softmax_policy = F.softmax(policy, dim=1)
        if self.verbose:
            print(softmax_policy)
        policy_entropy = -torch.sum(softmax_policy * log_softmax_policy, dim=1)

        # Policy loss: -log pi(a|s) * advantage, minus the weighted entropy bonus.
        nll_loss = F.nll_loss(log_softmax_policy, action_taken, reduction='none')
        policy_loss = nll_loss * temporary_difference - policy_entropy * self.entropy_beta
        policy_loss = policy_loss.sum(0)

        # Value loss
        # learning rate for critic is half of actor's
        # Equivalent to 0.5 * l2 loss
        value_loss = (0.5 * 0.5) * F.mse_loss(value, r, reduction='sum')

        if self.verbose:
            print(r)

            print()
            print(f'value_loss: {value_loss}')
            print(f'policy_loss: {policy_loss}')

        return value_loss + policy_loss

time: 68.8 ms (started: 2022-04-30 22:09:31 +00:00)


In [None]:
# Hyper-parameters and objects for the reference ("original") training loop.
num_epochs = 1
model_org = A2COriginalNetwork()

lr = 1e-2
optimizer = torch.optim.Adam(model_org.parameters(), lr=lr)
# Use the first GPU when one is available, otherwise fall back to the CPU so the
# notebook also runs on CPU-only runtimes.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'

max_local_timestep = 5   # rollout length before the loss is computed
entropy_beta = 1e-3      # weight of the entropy bonus

# Debugging aid: makes autograd report the forward op behind NaN/inf gradients
# (slows training down).
torch.autograd.set_detect_anomaly(True)

loss_model = ActorCriticLoss(entropy_beta)

time: 50.6 ms (started: 2022-04-30 22:09:31 +00:00)


In [None]:
# Reference rollout + loss computation using `model_org` and `loss_model`.
# For every epoch: play at most `max_local_timestep` steps, bootstrap the return
# from the critic when the episode did not terminate, build n-step returns and
# advantages backwards in time, and evaluate the actor-critic loss.
# The optimizer step at the bottom is commented out, so no weights change.
from collections import namedtuple

# NOTE: defined but not used by any cell visible in this notebook.
TrainingSample = namedtuple('TrainingSample', ('state', 'policy', 'value', 'action_taken', 'goal', 'R', 'temporary_difference'))

model_org.to(device_name)

for epoch in tqdm(range(num_epochs)):

    epoch_loss = 0
    # model.train()

    env.reset()

    terminal_end= False

    episode_length = 0
    episode_reward = 0
    local_t = 0

    # `results` keeps the graph-attached network outputs needed for backprop;
    # `rollout_path` keeps the plain Python/NumPy transition data.
    results = { "policy":[], "value": []}
    rollout_path = {"state": [], "action": [], "rewards": [], "done": []}

    # Plays out one game to end or max_t
    for t in range(max_local_timestep):
    # for t in range(2):

        # print(f'env current state id : {env.current_state_id}')

        # Despite the `torch_` prefix these are still NumPy arrays here.
        torch_s_t = env.s_t  # state embedding 2048 x 4
        torch_target = env.target  # target embedding 

        state = {"current": torch_s_t,
                  "goal": torch_target,
                }

        x_processed = torch.from_numpy(torch_s_t)
        goal_processed = torch.from_numpy(torch_target)

        policy, value = model_org((x_processed.to(device_name), goal_processed.to(device_name)))

        # Store raw network output to use in backprop
        results["policy"].append(policy)
        # print(results['policy'][-1])
        results["value"].append(value)

        # Action selection. Sampling from the policy is disabled; the action is
        # pinned to 1 so that the two training loops can be compared step by step.
        with torch.no_grad():
            # (_, action,) = policy.max(0)
            # action = F.softmax(policy, dim=0).multinomial(1).item()
            action = 1
        
        # Detached NumPy copies; neither is read again before being reassigned.
        policy = policy.data.cpu().numpy()
        value = value.data.cpu().numpy()
        
        
        # Makes the step in the environment
        env.step(action)
        # print(env.current_state_id)
        # print(action)

        # Receives the game reward
        is_terminal = env.terminal

        # ad-hoc reward for navigation
        # (overrides env.reward, so the collision penalty is not applied here)
        reward = 10.0 if is_terminal else -0.01

        # Max episode length
        if episode_length > 5e3: is_terminal = True

        # Update episode stats
        episode_length += 1
        episode_reward += reward
        # self.episode_max_q = max(self.episode_max_q, np.max(value))

        # clip reward
        # (the terminal reward of 10.0 becomes 1.0 for the return computation)
        reward = np.clip(reward, -1, 1)

        # Increase local time
        local_t += 1

        rollout_path["state"].append(state)
        rollout_path["action"].append(action)
        rollout_path["rewards"].append(reward)
        rollout_path["done"].append(is_terminal)

        # Promote s_t1 to s_t inside the environment.
        env.update()

        # print()
        # print(f'results len : {len(results["policy"])}')
        # print(f'rollout_path: {len(rollout_path["state"])}')
        if is_terminal:
            # TODO: add logging
            print('playout finished')
            print(f'episode length: {episode_length}')
            print(f'episode reward: {episode_reward}')
            # print(f'episode max_q: {episode_max_q}')

            terminal_end = True
            break


    # Bootstrap value for the return recursion: 0 after a terminal state,
    # otherwise the critic's estimate for the state reached last.
    if terminal_end:
        # playout_reward, results, rollout_path =  0.0, results, rollout_path
        playout_reward =  0.0

    else:

        torch_s_t = env.s_t  # state embedding 2048 x 4
        torch_target = env.target  # target embedding 

        state = {"current": torch_s_t,
                  "goal": torch_target,
                }

        x_processed = torch.from_numpy(state["current"])
        goal_processed = torch.from_numpy(state["goal"])

        (_, value) = model_org((x_processed.to(device_name), goal_processed.to(device_name)))

        # playout_reward, results, rollout_path = value.data.item(), results, rollout_path
        playout_reward = value.data.item()


    policy_batch = []
    value_batch = []
    action_batch = []
    temporary_difference_batch = []
    playout_reward_batch = []

    # Walk the rollout backwards: R_i = r_i + GAMMA * R_{i+1}. The advantage uses
    # the critic value as a plain float, so it carries no gradient. All batches
    # end up in reverse time order.
    for i in reversed(range(len(results["value"]))):
        # print(f'i:{i}')
        reward = rollout_path["rewards"][i]
        value = results["value"][i]
        action = rollout_path["action"][i]

        print(f'playout reward begin : {playout_reward}')

        playout_reward = reward + GAMMA * playout_reward
        temporary_difference = playout_reward - value.data.item()

        print(f'playout reward end : {playout_reward}')

        policy_batch.append(results['policy'][i])
        value_batch.append(results['value'][i])
        action_batch.append(action)
        temporary_difference_batch.append(temporary_difference)
        playout_reward_batch.append(playout_reward)
    
    print([value_batch[i].detach().cpu().numpy() for i in range(len(value_batch))])
    
    # Stack per-step tensors into batches; policy and value keep their autograd graph.
    policy_batch = torch.stack(policy_batch, 0)
    value_batch = torch.stack(value_batch, 0)
    action_batch = torch.from_numpy(np.array(action_batch, dtype=np.int64)).to(device_name)
    temporary_difference_batch = torch.from_numpy(np.array(temporary_difference_batch, dtype=np.float32)).to(device_name)
    playout_reward_batch = torch.from_numpy(np.array(playout_reward_batch, dtype=np.float32)).to(device_name)
    
    # Compute loss
    # (.forward is called directly rather than through the module's __call__)
    loss = loss_model.forward(policy_batch, value_batch, action_batch, temporary_difference_batch, playout_reward_batch)
    loss = loss.sum()

    print(loss)

    # Parameter update intentionally disabled while the two loops are compared.
    # loss_value = loss.detach().numpy()
    # optimizer.zero_grad()
    # loss.backward()
    # optimizer.step()

  0%|          | 0/1 [00:00<?, ?it/s]

134

100%|██████████| 1/1 [00:00<00:00,  9.12it/s]


playout reward begin : 0.06402208656072617
playout reward end : 0.0533818656951189
playout reward begin : 0.0533818656951189
playout reward end : 0.04284804703816771
playout reward begin : 0.04284804703816771
playout reward end : 0.03241956656778603
playout reward begin : 0.03241956656778603
playout reward end : 0.02209537090210817
playout reward begin : 0.02209537090210817
playout reward end : 0.011874417193087088
[array(0.05018353, dtype=float32), array(0.05900423, dtype=float32), array(0.06619567, dtype=float32), array(0.05242595, dtype=float32), array(0.04702532, dtype=float32)]
tensor([[0.2442, 0.2607, 0.2646, 0.2306],
        [0.2427, 0.2572, 0.2718, 0.2283],
        [0.2452, 0.2585, 0.2667, 0.2295],
        [0.2450, 0.2556, 0.2669, 0.2326],
        [0.2459, 0.2590, 0.2653, 0.2299]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([0.0534, 0.0428, 0.0324, 0.0221, 0.0119], device='cuda:0')

value_loss: 0.0008919016690924764
policy_loss: -0.15912076830863953
tensor(-0.1




# My Model

In [None]:
# class MyActorCritic(nn.Module):

#     def __init__(self):
#         super(MyActorCritic, self).__init__()

#         # fully connected layer 1

#         # generic siamese layer: <start>
#         self.fc1 = nn.Linear(in_features=8192, out_features=512)
#         self.fc2 = nn.Linear(in_features=1024, out_features=512)
#         # generic siamese layer: <end>

#         # Scene specific layer: <start> 
#         # for now we have only one scene speicific layer and only one thread i.e only one target in one scene
#         self.fc3 = nn.Linear(in_features=512, out_features=512)

#         self.actions_fc = nn.Linear(in_features=512, out_features=4)
#         self.value_fc  = nn.Linear(in_features=512, out_features=1)
#         # Scene specific layer: <end>


#     def forward(self, input_image_embedding, target_image_embedding):

#         # if torch.isnan(input_image_embedding).any():
#         #     print(torch.isnan(input_image_embedding))

#         # generic siamese latyer
#         x1 = F.relu(self.fc1(input_image_embedding))
#         x2 = F.relu(self.fc1(target_image_embedding))

#         global c_nan
#         global t_img_embedding

#         # if torch.isnan(x1).any():
#         #     print('x1 contains nan')
#         #     c_nan = x1

#         # if torch.isnan(x2).any():
#         #     print('x2 contains nan')
#         #     c_nan = x2
#         #     t_img_embedding = target_image_embedding

#         x_combined = torch.cat((x1, x2)) # combine input image emb and target image emb, may need to add axis for concat later

#         x = F.relu(self.fc2(x_combined))

        

#         # scene specific layer

#         x = F.relu(self.fc3(x))
#         x1 = self.actions_fc(x)
#         # print(f' actions values before softmax:\n{x1}\n')
#         actions_prob = F.softmax(x1, dim=0)
#         value = self.value_fc(x)

#         return actions_prob, value




time: 11.4 ms (started: 2022-04-30 22:09:33 +00:00)


In [None]:
from copy import deepcopy

time: 1.04 ms (started: 2022-04-30 22:09:33 +00:00)


In [None]:
# Hyper-parameters and objects for the re-implemented ("my model") training loop.
num_epochs = 1
# model_my = MyActorCritic()
# Start from an exact copy of the reference network so both loops can be
# compared on identical weights.
model_my = deepcopy(model_org)

lr = 1e-2
optimizer = torch.optim.Adam(model_my.parameters(), lr=lr)
# Use the first GPU when one is available, otherwise fall back to the CPU so the
# notebook also runs on CPU-only runtimes.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'

max_local_timestep = 5   # rollout length before the loss is computed
entropy_beta = 1e-3      # weight of the entropy bonus

# Debugging aid (slows training down); the trailing `;` keeps the returned
# object's repr out of the cell output.
torch.autograd.set_detect_anomaly(True);

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f61a530f6d0>

time: 13.6 ms (started: 2022-04-30 22:09:33 +00:00)


In [None]:
# Re-implementation of the reference rollout + loss computation using `model_my`.
# For every epoch: play at most `max_local_timestep` steps, bootstrap the return
# from the critic when the episode did not terminate, build n-step returns and
# advantages backwards in time, and evaluate the actor-critic loss.
# The optimizer step at the bottom is still disabled, so no weights change.
model_my.to(device_name)

for epoch in tqdm(range(num_epochs)):

    epoch_loss = 0
    # model.double()
    # model_my.train()
    # take 4 random actions intially for warm start

    env.reset()

    # Per-step rollout buffers (forward time order while collecting).
    states = []
    actions = []
    policy = []    # raw policy logits, still attached to the autograd graph
    rewards = []
    values = []    # critic outputs, still attached to the autograd graph
    targets = []

    terminal_reached = False
    terminal = False

    episode_length = 0
    episode_reward = 0
    local_t = 0

    for t in range(max_local_timestep):

        # Flatten the 2048 x 4 stacked features into the 8192-vector the network expects.
        torch_s_t = torch.from_numpy(env.s_t).float().flatten().to(device_name)
        torch_target = torch.from_numpy(env.target).float().flatten().to(device_name)

        # The network returns raw logits; softmax is applied in the loss section.
        action_logits, value = model_my((torch_s_t, torch_target))

        # !!!!!!!!!!!! UNCOMMENT LATER !!!!!!!!!!
        # with torch.no_grad():
        #     action = F.softmax(action_logits, dim=0).multinomial(1).item()
        action = 1     # !!!!!!!!!!!! COMMENT LATER !!!!!!!!!!

        # store required items
        states.append(env.s_t)
        policy.append(action_logits)
        actions.append(action)
        values.append(value)
        targets.append(env.target)

        # process game
        env.step(action)
        # print(env.current_state_id)

        # receive game result
        terminal = env.terminal

        # ad-hoc reward for navigation (overrides env.reward)
        reward = 10.0 if terminal else -0.01
        if episode_length > 5e3: terminal = True

        episode_reward += reward
        episode_length += 1
        # episode_max_q = max(episode_max_q, np.max(value_)) !!!! DO Q VALUE CLIPPING LATER !!!!!!!

        # clip and append reward
        rewards.append(np.clip(reward, -1, 1))

        local_t += 1

        # s_t1 -> s_t
        env.update()

        if terminal:
            terminal_reached = True
            # TODO: log episode reward / length / max Q and reset the
            # per-episode statistics here.
            break

    # Bootstrap value for the return recursion: 0 after a terminal state,
    # otherwise the critic's estimate for the state reached last. Only the
    # number is needed, so no autograd graph is built for it.
    next_state_value = 0.0
    if not terminal:
        torch_s_t = torch.from_numpy(env.s_t).float().flatten().to(device_name)
        torch_target = torch.from_numpy(env.target).float().flatten().to(device_name)
        with torch.no_grad():
            _, bootstrap_value = model_my((torch_s_t, torch_target))
        next_state_value = bootstrap_value.item()

    # Process the rollout backwards in time. Every buffer is reversed (including
    # `targets`) so the zipped tuples stay aligned.
    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()
    policy.reverse()
    targets.reverse()

    batch_si = []
    batch_policy = []
    batch_a = []
    batch_td = []
    batch_R = []
    batch_t = []
    batch_values = []

    # compute n-step returns and advantages
    for (pi, ai, ri, si, Vi, ti) in zip(policy, actions, rewards, states, values, targets):

        print(f'next state value begin:{next_state_value}')
        next_state_value = ri + GAMMA * next_state_value
        # The advantage is a constant weight for the policy gradient; detach the
        # critic value so the actor term does not back-propagate into the critic
        # (the reference loop uses a plain float here).
        td = next_state_value - Vi.detach()

        print(f'next state value end:{next_state_value}')

        batch_si.append(si)
        batch_policy.append(pi)
        batch_a.append(ai)
        batch_td.append(td)
        batch_R.append(next_state_value)
        batch_t.append(ti)
        batch_values.append(Vi)

    print([v.detach().cpu().numpy() for v in batch_values])

    # COMPUTE LOSS + BACKPROP
    # !!!!!!!! compute actor/policy loss !!!!!!!!!!!!
    batch_logits_tensor = torch.stack(batch_policy, 0)

    # log_softmax on the logits is numerically stable; log(softmax(x)) can
    # produce -inf/NaN when a probability underflows to zero.
    policy_log_prob = F.log_softmax(batch_logits_tensor, dim=1)
    batch_policy_tensor = F.softmax(batch_logits_tensor, dim=1)

    policy_entropy = - torch.sum(batch_policy_tensor * policy_log_prob, dim=1)

    batch_a_tensor = torch.from_numpy(np.array(batch_a, dtype=np.int64)).to(device_name)

    nllLoss = F.nll_loss(policy_log_prob, batch_a_tensor, reduction='none')

    batch_td_tensor = torch.stack(batch_td)
    policy_loss = nllLoss * batch_td_tensor - policy_entropy * entropy_beta
    policy_loss = policy_loss.sum()

    r_tensor = torch.from_numpy(np.array(batch_R, dtype=np.float32)).to(device_name)

    print(r_tensor)
    # 0.5 * L2 loss, with the critic learning rate set to half of the actor's.
    value_loss = (0.5 * 0.5) * F.mse_loss(torch.stack(batch_values), r_tensor, reduction='sum')
    total_loss = value_loss + policy_loss


    print()
    print(f'value_loss : {value_loss}')
    print(f'policy_loss : {policy_loss}')
    print(f' my model loss:{total_loss}')
    print()

    # if torch.isnan(total_loss).any():
    #     print('\n\n\ntotal loss contains nans\n\n\n')
    #     break

    # Parameter update intentionally disabled while the two loops are compared.
    # optimizer.zero_grad()
    # total_loss.backward()
    # optimizer.step()

100%|██████████| 1/1 [00:00<00:00,  9.07it/s]

134
next state value begin:0.06402208656072617
next state value end:0.0533818656951189
next state value begin:0.0533818656951189
next state value end:0.04284804703816771
next state value begin:0.04284804703816771
next state value end:0.03241956656778603
next state value begin:0.03241956656778603
next state value end:0.02209537090210817
next state value begin:0.02209537090210817
next state value end:0.011874417193087088
[array(0.05018353, dtype=float32), array(0.05900423, dtype=float32), array(0.06619567, dtype=float32), array(0.05242595, dtype=float32), array(0.04702532, dtype=float32)]
tensor([0.0534, 0.0428, 0.0324, 0.0221, 0.0119], device='cuda:0')

value_loss : 0.0008919016690924764
policy_loss : -0.15912076830863953
 my model loss:-0.15822885930538177

time: 341 ms (started: 2022-04-30 22:18:58 +00:00)





In [None]:
# Walk both models' parameters in lockstep and report the first pair that
# differs in at least one element; prints nothing when all pairs match.
for org_param, my_param in zip(model_org.parameters(), model_my.parameters()):
    has_mismatch = bool((org_param.data != my_param.data).any())
    if has_mismatch:
        print('not equal')
        break

time: 9.05 ms (started: 2022-04-30 22:09:33 +00:00)


In [None]:
# Print True once for every parameter tensor that is identical in both models.
for org_param, my_param in zip(model_org.parameters(), model_my.parameters()):
    is_identical = torch.equal(org_param.data, my_param.data)
    if is_identical:
        print(True)
        # print(f'layer : {org_param}')

True
True
True
True
True
True
True
True
True
True
time: 7.45 ms (started: 2022-04-30 22:09:33 +00:00)


# Cells not used

In [None]:
#@title
# model = torch.load('model.pth')

time: 918 µs (started: 2022-04-30 22:09:33 +00:00)


In [None]:
#@title
# policy_log_prob

time: 792 µs (started: 2022-04-30 22:09:33 +00:00)


In [None]:
# print(c_nan)
# print(torch.isnan(t_img_embedding).any())

time: 1 ms (started: 2022-04-30 22:09:33 +00:00)


In [None]:
# for i, k in enumerate(model.parameters()):
#     if i == 2:
#         print(k[(torch.logical_not(torch.isnan(k)))].min())
#         print()

In [None]:
# # save the model parameters
# import time

# timestr = time.strftime("%d_%m-%H_%M_%S")
# save_model_filename = env.scene_name +'ActorCritic_model_weights_epoch_' + str(num_epochs) + '_' + timestr  + '.pth'
# torch.save(model, save_model_filename)

time: 46.7 ms (started: 2022-04-30 02:38:06 +00:00)


In [None]:
# !cp {save_model_filename} '/content/gdrive/MyDrive//ESE_650_Project/models/'

time: 159 ms (started: 2022-04-30 02:38:09 +00:00)


In [None]:


# # from https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f

# for episode in range(max_episodes):
#         log_probs = []
#         values = []
#         rewards = []

#         state = env.reset()
#         for steps in range(num_steps):
#             value, policy_dist = actor_critic.forward(state)
#             value = value.detach().numpy()[0,0]
#             dist = policy_dist.detach().numpy() 

#             action = np.random.choice(num_outputs, p=np.squeeze(dist))
#             log_prob = torch.log(policy_dist.squeeze(0)[action])
#             entropy = -np.sum(np.mean(dist) * np.log(dist))
#             new_state, reward, done, _ = env.step(action)

#             rewards.append(reward)
#             values.append(value)
#             log_probs.append(log_prob)
#             entropy_term += entropy
#             state = new_state
            
#             if done or steps == num_steps-1:
#                 Qval, _ = actor_critic.forward(new_state)
#                 Qval = Qval.detach().numpy()[0,0]
#                 all_rewards.append(np.sum(rewards))
#                 all_lengths.append(steps)
#                 average_lengths.append(np.mean(all_lengths[-10:]))
#                 if episode % 10 == 0:                    
#                     sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
#                 break
        
#         # compute Q values
#         Qvals = np.zeros_like(values)
#         for t in reversed(range(len(rewards))):
#             Qval = rewards[t] + GAMMA * Qval
#             Qvals[t] = Qval
  
#         #update actor critic
#         values = torch.FloatTensor(values)
#         Qvals = torch.FloatTensor(Qvals)
#         log_probs = torch.stack(log_probs)
        
#         advantage = Qvals - values
#         actor_loss = (-log_probs * advantage).mean()
#         critic_loss = 0.5 * advantage.pow(2).mean()
#         ac_loss = actor_loss + critic_loss + 0.001 * entropy_term

#         ac_optimizer.zero_grad()
#         ac_loss.backward()
#         ac_optimizer.step()

time: 12.4 ms (started: 2022-04-18 19:50:09 +00:00)


# Evaluation

In [None]:

# # for episode in tqdm(range(NUM_EVAL_EPISODES)):
# for episode in tqdm(range(5)):


#     model.eval()
#     env.reset()

#     actions = []

#     final_state = None
#     inital_state_id = env.current_state_id

#     terminal_reached = False

#     episode_length = 0
#     episode_reward = 0
#     local_t = 0

#     terminal = False
#     inital_state_to_target_dist = env.shortest_path_distances[inital_state_id][env.terminal_state_id]
    

#     while not terminal:

#         torch_s_t = torch.from_numpy(env.s_t).float().flatten()  # state embedding in torch convertinng from 2048 x 4, to 8192 x 1
#         torch_target = torch.from_numpy(env.target).float().flatten()  # target embedding in torch

#         torch_s_t = torch_s_t.to(device_name)
#         torch_target = torch_target.to(device_name)

#         actions_prob, value = model((torch_s_t, torch_target))

#         action = torch.argmax(actions_prob)  # choose best action

#         # store required items
#         actions.append(action)

#         # process game
#         env.step(action)

#         # receive game result
#         reward = env.reward
#         terminal = env.terminal

#         # ad-hoc reward for navigation
#         reward = 10.0 if terminal else -0.01
#         if episode_length > 5e3: terminal = True

#         episode_reward += reward
#         episode_length += 1
#         # episode_max_q = max(episode_max_q, np.max(value_)) !!!! DO Q VALUE CLIPPING LATER !!!!!!!

#         local_t += 1

#         # s_t1 -> s_t
#         env.update()

#         if terminal:
#             final_state = env.s_t[0]
#             terminal_end = True
#             break

    
#     final_state_to_target_dist = env.shortest_path_distances[env.current_state_id][env.terminal_state_id]
#     print(f'\nnum_iter : {episode}\nepisode_length : {episode_length}')
#     print(f'inital state to target state shortest dist: {inital_state_to_target_dist}')
#     # print(f'shortest dist from episode termination state to goal state: {dist}')
#     print(f'final state to target state shortest dist: {final_state_to_target_dist}')
#     print()

    

 20%|██        | 1/5 [01:00<04:03, 60.90s/it]


num_iter : 0
episode_length : 5002
inital state to target state shortest dist: 15
final state to target state shortest dist: 15



 40%|████      | 2/5 [01:59<02:58, 59.52s/it]


num_iter : 1
episode_length : 5002
inital state to target state shortest dist: 14
final state to target state shortest dist: 14



 60%|██████    | 3/5 [02:57<01:57, 58.90s/it]


num_iter : 2
episode_length : 5002
inital state to target state shortest dist: 21
final state to target state shortest dist: 21



 80%|████████  | 4/5 [03:55<00:58, 58.51s/it]


num_iter : 3
episode_length : 5002
inital state to target state shortest dist: 13
final state to target state shortest dist: 13



100%|██████████| 5/5 [04:53<00:00, 58.76s/it]


num_iter : 4
episode_length : 5002
inital state to target state shortest dist: 3
final state to target state shortest dist: 3

time: 4min 53s (started: 2022-04-30 07:40:08 +00:00)



