In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
from ActorCritic import ActorCritic
from Memory import Memory
import torch
import torch.optim as optim
import torch.nn.functional as F
import copy
import pdb
from torch.distributions import Categorical

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class PPO:
  """Proximal Policy Optimisation agent using the clipped surrogate objective.

  Two actor-critic networks are kept: `model` is the one being optimised and
  `model_old` is a frozen copy used to collect experience. After every call to
  `learn` the frozen copy is re-synchronised with the trained weights.

  Transitions are read from `self.mem`, which must expose the lists `states`,
  `actions`, `log_probs`, `rewards` and `dones`.
  """

  def __init__(self, state_space, action_space, hidden_size=64, epsilon=0.2, entropy_beta=0.01, gamma=0.99):
    """Build the networks, the rollout memory and the optimiser.

    Args:
      state_space: forwarded to ActorCritic as its first argument.
      action_space: forwarded to ActorCritic as its second argument.
      hidden_size: forwarded to ActorCritic as its third argument.
      epsilon: clipping range; the probability ratio is clamped to
        [1 - epsilon, 1 + epsilon].
      entropy_beta: weight of the entropy term subtracted from the loss.
      gamma: discount factor used when computing returns.
    """
    self.mem = Memory()

    self.gamma = gamma
    self.epsilon = epsilon
    self.entropy_beta = entropy_beta

    self.model = ActorCritic(state_space, action_space, hidden_size)
    self.model_old = ActorCritic(state_space, action_space, hidden_size)

    # Start both networks from identical weights so the first ratio is 1.
    self.model_old.load_state_dict(self.model.state_dict())

    self.optimiser = optim.Adam(self.model.parameters(), lr=1e-3)

  def act(self, x):
    """Choose an action for state `x` using the frozen (old) policy."""
    return self.model_old.act(x)

  def test(self):
    """Debug helper: print, for every stored (state, action) pair, the second
    value returned by `evaluate` for the current and for the old network."""
    for s, a in zip(self.mem.states, self.mem.actions):
      _, lp, _, _ = self.model.evaluate(s, a)
      _, lp2, _, _ = self.model_old.evaluate(s, a)
      print("eval second time: {}, old: {}".format(lp, lp2))

  def learn(self, num_learn):
    """Run `num_learn` optimisation steps on the transitions stored in memory.

    Args:
      num_learn: number of gradient steps taken on the same stored batch.

    The memory is only read here, never cleared. torch.stack raises if the
    memory is empty.
    """
    # Discounted returns, accumulated backwards through the rollout and reset
    # at every episode boundary.
    discounted_returns = []
    running_reward = 0
    for reward, done in zip(reversed(self.mem.rewards), reversed(self.mem.dones)):
      if done:
        running_reward = 0
      running_reward = reward + self.gamma * running_reward
      discounted_returns.append(running_reward)
    discounted_returns.reverse()

    # Keep the returns on the same device as the other batch tensors.
    discounted_returns = torch.FloatTensor(discounted_returns).to(device)
    # Normalise; skipped for a single sample, whose std is NaN.
    if discounted_returns.numel() > 1:
      discounted_returns = (discounted_returns - discounted_returns.mean()) / (discounted_returns.std() + 1e-5)

    prev_states = torch.stack(self.mem.states).to(device).detach()
    prev_actions = torch.stack(self.mem.actions).to(device).detach()
    prev_log_probs = torch.stack(self.mem.log_probs).to(device).detach()

    for _ in range(num_learn):
      # Evaluate the stored batch with the network being trained. These values
      # must not be replaced by model_old outputs, otherwise the value loss
      # carries no gradient for self.model.
      _, log_probs, values, entropy = self.model.evaluate(prev_states, prev_actions)

      # NOTE(review): evaluate may return values as (N, 1) - not visible from
      # here. Flattening avoids an (N, N) broadcast against the (N,) returns
      # and is a no-op when the values are already one-dimensional.
      values = values.reshape(-1)

      # Probability ratio pi_new / pi_old.
      ratio = torch.exp(log_probs - prev_log_probs)

      # Advantage estimate; detached so the policy term does not train the critic.
      advantage = discounted_returns - values.detach()

      # Clipped surrogate: the *ratio* is clipped, then scaled by the advantage.
      surrogate_1 = ratio * advantage
      surrogate_2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantage
      loss = -torch.min(surrogate_1, surrogate_2) + F.mse_loss(values, discounted_returns) - self.entropy_beta * entropy

      loss = loss.mean()

      # Gradient step on the trained network only.
      self.optimiser.zero_grad()
      loss.backward()
      self.optimiser.step()

    # Publish the updated weights to the acting policy.
    self.model_old.load_state_dict(self.model.state_dict())

In [10]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from PPO import PPO

env = gym.make('CartPole-v1')
# Seed the environment and torch so the rollout below is reproducible.
# torch is seeded as well because action sampling presumably draws from its RNG.
env.seed(10)
torch.manual_seed(10)

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# PPO Settings
update_every = 10   # number of environment steps collected in this cell
num_learn = 4

steps = 0
scores_deque = deque(maxlen=100)
scores = []
max_score = -np.Inf

agent = PPO(state_size, action_size)

state = env.reset()
score = 0

# Raw copies of the visited states and chosen actions, kept for inspection in later cells.
sl = []
al = []


for t in range(update_every):
    steps += 1

    state = torch.from_numpy(state).float()
    # Act through the agent's public entry point (delegates to the old policy).
    action, log_prob = agent.act(state)
    next_state, reward, done, _ = env.step(action.item())

    sl.append(state)
    al.append(action)

    agent.mem.add(state, action, reward, log_prob, done)

    score += reward

    if done:
        # Book keeping: record the finished episode's return once, then reset
        # the accumulator so the next episode does not inherit this one's score.
        scores_deque.append(score)
        scores.append(score)
        score = 0
        state = env.reset()
    else:
        state = next_state

In [14]:
# Batch the collected actions and states into single tensors on the compute device.
alt, slt = (torch.stack(batch).to(device) for batch in (al, sl))

# Forward every collected state through the current network one at a time and
# show the raw outputs; the paired action is not needed for the forward pass.
for obs, _act in zip(sl, al):
    logits, value = agent.model(obs)
    print("Logits: {}, values: {}".format(logits, value))

Logits: tensor([ 0.0090, -0.0493], grad_fn=<AddBackward0>), values: tensor([0.0524], grad_fn=<AddBackward0>)
Logits: tensor([ 0.0138, -0.0107], grad_fn=<AddBackward0>), values: tensor([0.0280], grad_fn=<AddBackward0>)
Logits: tensor([0.0092, 0.0239], grad_fn=<AddBackward0>), values: tensor([0.0130], grad_fn=<AddBackward0>)
Logits: tensor([ 0.0138, -0.0071], grad_fn=<AddBackward0>), values: tensor([0.0271], grad_fn=<AddBackward0>)
Logits: tensor([ 0.0102, -0.0454], grad_fn=<AddBackward0>), values: tensor([0.0491], grad_fn=<AddBackward0>)
Logits: tensor([ 0.0127, -0.0034], grad_fn=<AddBackward0>), values: tensor([0.0260], grad_fn=<AddBackward0>)
Logits: tensor([ 0.0108, -0.0428], grad_fn=<AddBackward0>), values: tensor([0.0468], grad_fn=<AddBackward0>)
Logits: tensor([-0.0001, -0.0598], grad_fn=<AddBackward0>), values: tensor([0.0703], grad_fn=<AddBackward0>)
Logits: tensor([ 0.0111, -0.0398], grad_fn=<AddBackward0>), values: tensor([0.0436], grad_fn=<AddBackward0>)
Logits: tensor([0.009

In [15]:
# Evaluate the whole collected batch at once with the network being trained.
# NOTE(review): the second return value is bound to `probs` here, while PPO.learn
# unpacks the same position as `log_probs` - presumably log-probabilities; confirm.
action, probs, value, entropy = agent.model.evaluate(slt, alt)
# Manual re-derivation of the same quantities, kept disabled for reference:
# logits, value = agent.model(slt)
# print(logits)

# logits = F.softmax(logits, dim=0) 
# print(logits)
# probs = Categorical(logits)
# print(probs.log_prob(alt))

logits: tensor([[ 0.0090, -0.0493],
        [ 0.0138, -0.0107],
        [ 0.0092,  0.0239],
        [ 0.0138, -0.0071],
        [ 0.0102, -0.0454],
        [ 0.0127, -0.0034],
        [ 0.0108, -0.0428],
        [-0.0001, -0.0598],
        [ 0.0111, -0.0398],
        [ 0.0099,  0.0045]], grad_fn=<AddmmBackward>)
softmax: tensor([[0.5146, 0.4854],
        [0.5061, 0.4939],
        [0.4963, 0.5037],
        [0.5052, 0.4948],
        [0.5139, 0.4861],
        [0.5040, 0.4960],
        [0.5134, 0.4866],
        [0.5149, 0.4851],
        [0.5127, 0.4873],
        [0.5014, 0.4986]], grad_fn=<SoftmaxBackward>)


In [None]:
# Display the actions currently stored in the agent's memory.
agent.mem.actions

In [None]:
# For every stored transition, evaluate it with both networks and print the
# state followed by the second value each network's `evaluate` returns.
for stored_state, stored_action in zip(agent.mem.states, agent.mem.actions):
    _, current_lp, _, _ = agent.model.evaluate(stored_state, stored_action)
    _, old_lp, _, _ = agent.model_old.evaluate(stored_state, stored_action)
    print(stored_state)
    print("eval second time: {}, old: {}".format(current_lp, old_lp))

# Stack each memory field into one detached tensor on the compute device.
prev_actions, prev_states, prev_log_probs = (
    torch.stack(field).to(device).detach()
    for field in (agent.mem.actions, agent.mem.states, agent.mem.log_probs)
)

In [None]:
# Display the stacked action tensor.
prev_actions

In [None]:
# Display the stacked state tensor.
prev_states

In [None]:
# Evaluate only the first stored transition with the current network and show
# the second value returned by `evaluate` (unpacked as log_probs in PPO.learn).
_, lg, _, _ = agent.model.evaluate(prev_states[0], prev_actions[0])
lg

In [None]:
# Evaluate the full stacked batch with the current network and show the second
# value returned by `evaluate`, for comparison with the single-sample call.
_, lg, _, _ = agent.model.evaluate(prev_states, prev_actions)
lg

In [None]:
# Same batched evaluation, but with the frozen old network, to compare its
# output against the current network's.
_, lg, _, _ = agent.model_old.evaluate(prev_states, prev_actions)
lg