In [59]:
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as opt

from torch.utils.tensorboard import SummaryWriter
from torch.distributions.multivariate_normal import MultivariateNormal

In [60]:
# define the policy model
class MLP_policy(nn.Module):
  '''
  MLP policy with either a categorical (discrete) or a diagonal-Gaussian (continuous) action head.

  n_input: size of the observation vector
  n_hiddens: list with the width of every hidden layer (needs at least one entry)
  n_actions: number of discrete actions, or dimension of the continuous action vector
  action_type: "discrete" selects the categorical head; any other value selects the Gaussian head
  '''
  def __init__(self, n_input, n_hiddens, n_actions, action_type="discrete"):
    super().__init__()
    self.action_type = action_type
    # distribution info cached by the last sample_action() call and read back by entropy()
    self.actions_probs = None
    self.actions_dist = None

    layers = []
    layers.append(nn.Linear(n_input, n_hiddens[0]))
    layers.append(nn.ReLU())
    for i in range(len(n_hiddens)-1):
      layers.append( nn.Linear(n_hiddens[i], n_hiddens[i+1]) )
      layers.append( nn.ReLU() )

    self.model = nn.Sequential(*layers)
    if action_type == "discrete":
      self.dis_actions = nn.Linear(n_hiddens[-1], n_actions)
    else:
      self.mean = nn.Linear(n_hiddens[-1], n_actions)
      self.log_var = nn.Linear(n_hiddens[-1], n_actions)


  def forward(self, x):
    '''
    Returns the action logits (discrete head) or the tuple (mean, log_var) (continuous head).
    '''
    if self.action_type == "discrete":
      return self.dis_actions(self.model(x))
    else:
      x = self.model(x)
      mean = self.mean(x)
      log_var = self.log_var(x)
      return mean, log_var

  def sample_action(self, x):
    '''
    Samples an action for the state(s) x and returns (action, log_prob).
    Works for a single state (1-D x) and for a batch of states (2-D x).
    The distribution is cached on the module so entropy() can be called afterwards.
    '''
    if self.action_type == "discrete":
      logits = self.forward(x)
      self.actions_probs = F.softmax(logits, dim=-1)
      action = torch.multinomial(self.actions_probs, 1)
      # explicit dim avoids the deprecated implicit-dim log_softmax; gather picks the
      # sampled action's log-probability along the last axis (same result as
      # indexing for 1-D input, and also correct for batched input)
      log_prob = F.log_softmax(logits, dim=-1).gather(-1, action)
      return action, log_prob
    else:
      mean, log_var = self.forward(x)
      var = torch.exp(log_var)
      # diag_embed builds one diagonal covariance matrix per batch element
      # (identical to torch.diag for a single 1-D variance vector)
      self.actions_dist = MultivariateNormal(mean, torch.diag_embed(var))
      action = self.actions_dist.sample()
      log = self.actions_dist.log_prob(action)
      return action, log

  def entropy(self):
    '''
    Entropy of the distribution used by the most recent sample_action() call.
    Raises RuntimeError when sample_action() has not been called yet.
    '''
    if self.action_type == "discrete":
      if self.actions_probs is None:
        raise RuntimeError("entropy() called before sample_action()")
      # fully qualified: Categorical is not imported by name in this notebook
      return torch.distributions.Categorical(probs=self.actions_probs).entropy()
    else:
      if self.actions_dist is None:
        raise RuntimeError("entropy() called before sample_action()")
      return self.actions_dist.entropy()

In [61]:
# environment ids: the source env is the one the checkpoint was trained on,
# the target env is the one the training cells below run on
source_env_name = "CartPole-v0"
target_env_name = "InvertedPendulum-v2"

# instantiate both environments
source_env = gym.make(source_env_name)
target_env = gym.make(target_env_name)

In [62]:
# build the policy network sized for the source env (its weights are loaded in a later cell)
action_type = "discrete"
model = MLP_policy(
    source_env.observation_space.shape[0],
    [64, 32],
    # a discrete space exposes its size as .n, a continuous (Box) space through .shape
    source_env.action_space.n if action_type == "discrete" else source_env.action_space.shape[0],
    "discrete" if action_type == "discrete" else "continuous",
)

In [63]:
# load the model trained on the source env
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a CPU-only
# machine; the policy above is created without being moved to another device
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source
model.load_state_dict(torch.load("best_model.ckpt", map_location="cpu"))

<All keys matched successfully>

In [64]:
# echo the policy so the cell output shows its module structure
model

MLP_policy(
  (model): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
  )
  (dis_actions): Linear(in_features=32, out_features=2, bias=True)
)

In [65]:
# change the last layer of the model: replace the discrete head with a Gaussian
# (mean / log-variance) head sized for the target env's action space
# use the same lowercase "continuous" spelling as the rest of the notebook
model.action_type = "continuous"
model.dis_actions = None
# width of the features feeding the head = output size of the backbone's last Linear
# (the backbone ends with Linear -> ReLU, so the Linear sits at index -2)
hidden_width = model.model[-2].out_features
model.mean = nn.Linear(hidden_width, target_env.action_space.shape[0])
model.log_var = nn.Linear(hidden_width, target_env.action_space.shape[0])
model

MLP_policy(
  (model): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
  )
  (dis_actions): None
  (mean): Linear(in_features=32, out_features=1, bias=True)
  (log_var): Linear(in_features=32, out_features=1, bias=True)
)

In [66]:
# load the tensorboard notebook extension
%load_ext tensorboard
# start (or reuse) the dashboard inline, reading logs from the current directory
%tensorboard --logdir ./

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 51202), started 0:06:23 ago. (Use '!kill 51202' to kill it.)

In [67]:
# helper method for the reward to go
def reward_to_go(rewards, gamma=0.99):
  '''
  Discounted reward-to-go of one trajectory.
  rewards: sequence of per-step rewards
  gamma: discount factor
  returns: tensor whose entry t is rewards[t] + gamma*rewards[t+1] + gamma^2*rewards[t+2] + ...
  '''
  n_steps = len(rewards)
  returns = [0] * n_steps
  running = 0
  # walk backwards so each entry reuses the discounted sum of the steps after it
  for t in range(n_steps - 1, -1, -1):
    running = gamma * running + rewards[t]
    returns[t] = running
  return torch.tensor(returns)

In [68]:
def gradient_norm(net):
  '''
  gradient_norm(net)
  This function calculates the gradient norm of a neural network model:
  the square root of the sum of the squared L2 norms of every parameter gradient.
  Parameters whose .grad is None (no backward pass yet, or not part of the loss)
  are skipped instead of raising an AttributeError.
  net: network model
  returns: the norm as a Python float (0.0 when no parameter has a gradient)
  '''
  total_norm = 0.0
  for param in net.parameters():
    if param.grad is None:
      continue
    param_norm = param.grad.detach().norm(2)
    total_norm += param_norm.item() ** 2
  return total_norm**0.5

In [69]:
def log_values(sw, iter, **kwargs):
  '''
  Helper function to add logs to tensorboard.
  sw: writer object; its add_scalar(tag, value, step) is called once per keyword argument
  iter: zero-based iteration index; the values are written at step iter+1
  kwargs: tag=value pairs to log
  '''
  for tag, value in kwargs.items():
    sw.add_scalar(tag, value, iter+1)

In [70]:
# configurations
# policy learning rate
lr = 1e-3

# batch size (number of trajectories the training loop collects per policy update)
batch_size = 64

# iterations (number of policy updates the training loop runs)
iterations = 87

# gamma (discount factor passed to reward_to_go)
gamma = 0.99

# baseline type
# baseline type 1: average reward, 2: value function (None: the baseline branch is skipped)
baseline = None

# use entropy regularization
entropy = False

# seed applied to the random, numpy and torch RNGs
seed = 42

In [71]:
import random
import numpy as np

# seed Python's, NumPy's and PyTorch's global RNGs with the configured seed
# NOTE(review): no seeding of the gym environments is visible in this notebook - confirm
random.seed(seed)
np.random.seed(seed)
# last expression of the cell, so the returned torch Generator is echoed as the output
torch.random.manual_seed(seed)

<torch._C.Generator at 0x7f999a67d110>

In [76]:
# run without transfer
# training loop: policy gradient (REINFORCE-style) on the target env, one update per batch of trajectories
# create the environment
target_env = gym.make(target_env_name)

# create the policy model
# inspect the env that was just created instead of building a second one only to read its action space
action_type = "discrete" if isinstance(target_env.action_space, gym.spaces.Discrete) else "continuous"
# a discrete space exposes its size as .n, a continuous (Box) space through .shape
n_actions = target_env.action_space.n if action_type == "discrete" else target_env.action_space.shape[0]
model = MLP_policy(target_env.observation_space.shape[0], [64, 32], n_actions, action_type)
optimizer = opt.Adam(model.parameters(), lr=lr)

# NOTE(review): baseline == 2 needs `model_v` / `value_optimizer` and baseline == 1 needs
# `compute_baseline`; none of them is defined in the visible part of this notebook, so only
# baseline = None can run as-is. The disabled setup for the value baseline is kept below.
# if baseline == 2:
#   # create the value model
#   model_v = MLP_value(env.observation_space.shape[0], [64])
#   # create the optimizer
#   value_optimizer = opt.Adam(model_v.parameters(), lr=lr)

# create a summaryWriter
sw = SummaryWriter(comment=f"env_name:{target_env_name}, lr_={lr}, batch_size={batch_size}, baseline: {baseline}, entropy: {entropy}")
# weight of the entropy bonus added to the return when `entropy` is enabled
temp = 0.1

avg_reward_baseline = None

try:
  # `iteration` rather than `iter`, so the builtin iter() is not shadowed in the kernel
  for iteration in range(iterations):
    rewards_avg = 0
    entropy_avg = 0

    batch_loss = torch.zeros(1)

    if baseline == 2:
      value_batch_loss = torch.zeros(1)

    max_reward = float("-inf")
    min_reward = float("inf")

    for traj in range(batch_size):
      rewards = []
      log_probs = []
      entropies = []
      value_baseline = []
      state = target_env.reset()
      done = False
      # roll out one full episode with the current policy
      while not done:
        state_t = torch.FloatTensor(state)
        action, log_prob = model.sample_action(state_t)

        # actions with two or more components go to the env as an array, otherwise as a scalar
        if action.dim() >= 1 and len(action) >= 2:
          next_state, reward, done, info = target_env.step(action.cpu().numpy())
        else:
          next_state, reward, done, info = target_env.step(action.cpu().item())

        rewards.append(reward)
        # the entropy is only used as a detached bonus and for logging, so detach it
        # right away instead of keeping every step's autograd graph alive
        entropies.append(model.entropy().detach())
        if baseline == 2:
          value_baseline.append(model_v(state_t))

        log_probs.append(log_prob)
        state = next_state

      cum_reward = reward_to_go(rewards, gamma=gamma)
      episode_return = sum(rewards)

      loss = torch.zeros(1)
      value_loss = torch.zeros(1)

      for i in range(len(log_probs)):
        log_prob = log_probs[i]
        r = cum_reward[i]
        if baseline:
          if baseline == 1:
            b = compute_baseline(cum_reward, btype=baseline)
          else:
            b = value_baseline[i]
          r = r - b
        if entropy:
          r = r + (temp * entropies[i])
        # the weight is detached: the gradient flows through log_prob only
        loss -= (log_prob * r.detach())
        if baseline == 2:
          # regress the value estimate onto the reward-to-go itself; `r` already has
          # the baseline subtracted, so (r - b) would subtract it twice
          value_loss += (cum_reward[i] - b) ** 2
      batch_loss += loss
      if baseline == 2:
        value_batch_loss += value_loss

      rewards_avg += episode_return
      entropy_avg += float(sum(entropies))
      max_reward = max(max_reward, episode_return)
      min_reward = min(min_reward, episode_return)

    batch_loss /= batch_size
    rewards_avg /= batch_size
    entropy_avg /= batch_size

    # optimize the value
    if baseline == 2:
      value_batch_loss /= batch_size
      value_optimizer.zero_grad()
      value_batch_loss.backward()
      value_optimizer.step()

    # optimize the policy
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    grad_norm = gradient_norm(model)
    # log plain Python numbers rather than tensors that still carry an autograd graph
    scalars = dict(reward=rewards_avg, loss=batch_loss.item(), gradient_norm=grad_norm, entropy=entropy_avg, maximum_reward=max_reward, minimum_reward=min_reward)
    if baseline == 2:
      scalars["value_loss"] = value_batch_loss.item()
    elif baseline == 1:
      scalars["avg_reward_baseline"] = compute_baseline(cum_reward, btype=baseline)
    log_values(sw, iteration, **scalars)
    print(f"Iteration:{iteration+1}, reward: {rewards_avg}, batch loss: {batch_loss.item()}")
finally:
  # flush and close the event file even when the run is interrupted from the keyboard
  sw.close()

Iteration:1, reward: 6.890625, batch loss: 44.40943908691406
Iteration:2, reward: 7.75, batch loss: 57.23524475097656
Iteration:3, reward: 8.703125, batch loss: 76.3541030883789
Iteration:4, reward: 7.59375, batch loss: 56.46784210205078
Iteration:5, reward: 8.015625, batch loss: 58.748802185058594
Iteration:6, reward: 8.65625, batch loss: 60.22014617919922
Iteration:7, reward: 7.671875, batch loss: 53.97249984741211
Iteration:8, reward: 9.21875, batch loss: 80.33733367919922
Iteration:9, reward: 9.3125, batch loss: 82.07076263427734
Iteration:10, reward: 9.46875, batch loss: 85.37443542480469
Iteration:11, reward: 9.359375, batch loss: 83.05612182617188
Iteration:12, reward: 9.421875, batch loss: 81.64006042480469
Iteration:13, reward: 10.78125, batch loss: 99.58782958984375
Iteration:14, reward: 10.328125, batch loss: 95.90492248535156
Iteration:15, reward: 10.109375, batch loss: 88.2119369506836
Iteration:16, reward: 9.59375, batch loss: 82.52635955810547
Iteration:17, reward: 10.37

KeyboardInterrupt: 

In [None]:
# run without transfer
# NOTE(review): this cell repeats the earlier "run without transfer" training cell of this
# notebook; consider moving the loop into one shared function and calling it from both places.
# training loop: policy gradient (REINFORCE-style) on the target env, one update per batch of trajectories
# create the environment
target_env = gym.make(target_env_name)

# create the policy model
# inspect the env that was just created instead of building a second one only to read its action space
action_type = "discrete" if isinstance(target_env.action_space, gym.spaces.Discrete) else "continuous"
# a discrete space exposes its size as .n, a continuous (Box) space through .shape
n_actions = target_env.action_space.n if action_type == "discrete" else target_env.action_space.shape[0]
model = MLP_policy(target_env.observation_space.shape[0], [64, 32], n_actions, action_type)
optimizer = opt.Adam(model.parameters(), lr=lr)

# NOTE(review): baseline == 2 needs `model_v` / `value_optimizer` and baseline == 1 needs
# `compute_baseline`; none of them is defined in the visible part of this notebook, so only
# baseline = None can run as-is. The disabled setup for the value baseline is kept below.
# if baseline == 2:
#   # create the value model
#   model_v = MLP_value(env.observation_space.shape[0], [64])
#   # create the optimizer
#   value_optimizer = opt.Adam(model_v.parameters(), lr=lr)

# create a summaryWriter
sw = SummaryWriter(comment=f"env_name:{target_env_name}, lr_={lr}, batch_size={batch_size}, baseline: {baseline}, entropy: {entropy}")
# weight of the entropy bonus added to the return when `entropy` is enabled
temp = 0.1

avg_reward_baseline = None

try:
  # `iteration` rather than `iter`, so the builtin iter() is not shadowed in the kernel
  for iteration in range(iterations):
    rewards_avg = 0
    entropy_avg = 0

    batch_loss = torch.zeros(1)

    if baseline == 2:
      value_batch_loss = torch.zeros(1)

    max_reward = float("-inf")
    min_reward = float("inf")

    for traj in range(batch_size):
      rewards = []
      log_probs = []
      entropies = []
      value_baseline = []
      state = target_env.reset()
      done = False
      # roll out one full episode with the current policy
      while not done:
        state_t = torch.FloatTensor(state)
        action, log_prob = model.sample_action(state_t)

        # actions with two or more components go to the env as an array, otherwise as a scalar
        if action.dim() >= 1 and len(action) >= 2:
          next_state, reward, done, info = target_env.step(action.cpu().numpy())
        else:
          next_state, reward, done, info = target_env.step(action.cpu().item())

        rewards.append(reward)
        # the entropy is only used as a detached bonus and for logging, so detach it
        # right away instead of keeping every step's autograd graph alive
        entropies.append(model.entropy().detach())
        if baseline == 2:
          value_baseline.append(model_v(state_t))

        log_probs.append(log_prob)
        state = next_state

      cum_reward = reward_to_go(rewards, gamma=gamma)
      episode_return = sum(rewards)

      loss = torch.zeros(1)
      value_loss = torch.zeros(1)

      for i in range(len(log_probs)):
        log_prob = log_probs[i]
        r = cum_reward[i]
        if baseline:
          if baseline == 1:
            b = compute_baseline(cum_reward, btype=baseline)
          else:
            b = value_baseline[i]
          r = r - b
        if entropy:
          r = r + (temp * entropies[i])
        # the weight is detached: the gradient flows through log_prob only
        loss -= (log_prob * r.detach())
        if baseline == 2:
          # regress the value estimate onto the reward-to-go itself; `r` already has
          # the baseline subtracted, so (r - b) would subtract it twice
          value_loss += (cum_reward[i] - b) ** 2
      batch_loss += loss
      if baseline == 2:
        value_batch_loss += value_loss

      rewards_avg += episode_return
      entropy_avg += float(sum(entropies))
      max_reward = max(max_reward, episode_return)
      min_reward = min(min_reward, episode_return)

    batch_loss /= batch_size
    rewards_avg /= batch_size
    entropy_avg /= batch_size

    # optimize the value
    if baseline == 2:
      value_batch_loss /= batch_size
      value_optimizer.zero_grad()
      value_batch_loss.backward()
      value_optimizer.step()

    # optimize the policy
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    grad_norm = gradient_norm(model)
    # log plain Python numbers rather than tensors that still carry an autograd graph
    scalars = dict(reward=rewards_avg, loss=batch_loss.item(), gradient_norm=grad_norm, entropy=entropy_avg, maximum_reward=max_reward, minimum_reward=min_reward)
    if baseline == 2:
      scalars["value_loss"] = value_batch_loss.item()
    elif baseline == 1:
      scalars["avg_reward_baseline"] = compute_baseline(cum_reward, btype=baseline)
    log_values(sw, iteration, **scalars)
    print(f"Iteration:{iteration+1}, reward: {rewards_avg}, batch loss: {batch_loss.item()}")
finally:
  # flush and close the event file even when the run is interrupted from the keyboard
  sw.close()