In [52]:
from replay_buffer2 import  ReplayBuffer
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.distributions import Categorical
import numpy as np

In [21]:
import gym
from models import QNetwork, RNetwork, PolicyNetwork

In [40]:
class QNetwork(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size + action_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state, action):
        """Build a network that maps state -> action values."""
        sa = torch.cat((state, action), dim=1)
        x = F.relu(self.fc1(sa))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [11]:
env = gym.make("LunarLander-v2")

In [12]:
env.action_space.sample()

3

In [13]:
env.reset()
env.step(1)

(array([ 0.01038485,  1.4179726 ,  0.5190571 ,  0.14405067, -0.00992668,
        -0.07739462,  0.        ,  0.        ], dtype=float32),
 1.0344570947313525,
 False,
 {})

In [14]:
memory = ReplayBuffer((8,), (1,), 20000, 'cpu')
memory.load_memory("expert_policy/")
memory.idx = 20000
batch_size = 64

In [33]:
states, actions, rewards, next_states, not_done = memory.sample(batch_size)

In [34]:
states.shape

torch.Size([64, 8])

In [41]:
seed = 0
q_shift_local = QNetwork(8, 1, seed).to('cpu')
q_shift_target = QNetwork(8, 1, seed).to('cuda')
Q_local = QNetwork(8, 1, seed).to('cuda')
Q_target = QNetwork(8, 1, seed).to('cuda')
R_local = RNetwork(8,1,seed).to('cuda')
R_target = RNetwork(8,1, seed).to('cuda')


In [8]:
policy = PolicyNetwork(8,4,seed).to("cpu")

In [27]:
action_0 = torch.Tensor(1) * 0 +  0
action_1 = torch.Tensor(1) * 0 +  1
action_2 = torch.Tensor(1) * 0 +  2
action_3 = torch.Tensor(1) * 0 +  3
all_actions = [action_0, action_1, action_2, action_3]

In [39]:
states.shape, actions.shape

(torch.Size([64, 8]), torch.Size([64, 1]))

In [42]:
q_shift_values = q_shift_local(states, actions)

In [44]:
q_shift_values.shape

torch.Size([64, 1])

In [46]:
s = states[0]
s

tensor([-0.1234,  1.3636, -0.4401, -0.4256, -0.0976, -0.1259,  0.0000,  0.0000])

In [53]:
y_sh = np.empty((batch_size,1), dtype=np.float32)

In [61]:
for idx, s in enumerate(states):
    q = []
    for action in all_actions:
        q.append(q_shift_local(s.unsqueeze(0), action.unsqueeze(0)))
    q_max = max(q)
    np.copyto(y_sh[idx], q_max.detach().numpy())
    
    

In [63]:
y_sh = torch.Tensor(y_sh)

In [64]:
q_shift_loss = F.mse_loss(y_sh, q_shift_values)
q_shift_loss

tensor(0.0004, grad_fn=<MeanBackward0>)

In [50]:
q = []
for action in all_actions:
    q.append(q_shift_local(s.unsqueeze(0), action.unsqueeze(0)))
q_max = max(q)
q_max, q

(tensor([[-0.1007]], grad_fn=<AddmmBackward>),
 [tensor([[-0.1007]], grad_fn=<AddmmBackward>),
  tensor([[-0.1240]], grad_fn=<AddmmBackward>),
  tensor([[-0.1161]], grad_fn=<AddmmBackward>),
  tensor([[-0.1020]], grad_fn=<AddmmBackward>)])

In [19]:
s = obses[0]
a = actions[0]

In [9]:
policy(torch.Tensor(s).unsqueeze(0))

NameError: name 's' is not defined

In [12]:
s.shape, a.shape

(torch.Size([8]), torch.Size([4]))

In [58]:
actions[0]

tensor([0., 0., 0., 0.])

In [17]:
s = obses[0]
a = actions[0]
q_shift_local(s,a)

tensor([-1.7882e+36,  8.7725e+34,  3.2616e+36,  2.7237e+36],
       grad_fn=<AddBackward0>)

In [27]:
s = s.to("cpu")
s

tensor([ 0.1434,  0.3018,  0.0592, -0.0687,  0.0365,  0.0072,  0.0000,  0.0000])

In [23]:
next(policy.parameters()).is_cuda

True

In [28]:
policy(torch.Tensor(s).unsqueeze(0))

RuntimeError: copy_if failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered

In [44]:
Q_local = QNetwork(8, 1, 0).to('cpu')

In [7]:
reward_est = RNetwork(8,4,0).to('cuda')

In [10]:
class Agent():
    def __init__(self, state_size, action_size, config):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = 0
        self.device = 'cuda'
        self.q_shift_local = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.Q_local = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.Q_target = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.R_local = RNetwork(state_size,action_size, self.seed).to(self.device)
        self.R_target = RNetwork(state_size, action_size, self.seed).to(self.device)
        self.policy = PolicyNetwork(state_size, action_size,self.seed).to(self.device)
    
    def act(self, state):
        dis, action, log_probs, ent = self.policy.sample_action(torch.Tensor(state).unsqueeze(0))
        return dis, action, log_probs, ent
    
    def learn(self, memory, batch_size):
        obses, actions, rewards, next_obses, not_dones_no_max = memory.sample(batch_size)
        

In [13]:
action_0 = torch.Tensor(1) * 0 +  0
action_1 = torch.Tensor(1) * 0 +  1
action_2 = torch.Tensor(1) * 0 +  2
action_3 = torch.Tensor(1) * 0 +  3
all_actions = [action_0, action_1, action_2, action_3]

In [16]:
a

tensor([0., 0., 0., 0.])

In [15]:
action_0 = action_0.

tensor([0.])

In [14]:
q_shift_local(s, action_0)

RuntimeError: size mismatch, m1: [1 x 9], m2: [12 x 64] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:41

In [59]:
s = obses[0]
s = s.unsqueeze(0)
action_0 = action_0.unsqueeze(0)
s = s.to("cpu")
action_0 = action_0.to("cpu")
s.shape, action_0.shape

RuntimeError: CUDA error: an illegal memory access was encountered

In [54]:
Q_target(s,action_0)

RuntimeError: All input tensors must be on the same device. Received cuda:0 and cpu

In [None]:
def learn(batch_size):
    

In [11]:
agent = Agent(8,4,None)

In [8]:
policy= PolicyNetwork(8,4,0).to("cuda")

In [11]:
policy.sample_action(s)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [10]:
s = obses[0]
a = actions[0]

In [20]:
agent.act(s.to("cpu"))

NameError: name 'x' is not defined

In [14]:
s = obses[0]
a = actions[0]
q_shift_local(s,a)

NameError: name 'q_shift_local' is not defined

In [14]:
reward_est(s,a)

tensor([0.0133], device='cuda:0', grad_fn=<AddBackward0>)

In [30]:
s

RuntimeError: copy_if failed to synchronize: cudaErrorIllegalAddress: an illegal memory access was encountered

In [15]:
policy = PolicyNetwork(8,4,seed=1).cuda()
policy.sample_action(s)

RuntimeError: CUDA error: an illegal memory access was encountered

In [15]:
reward_est(s,a)

tensor([0.0133], device='cuda:0', grad_fn=<AddBackward0>)

In [12]:

class PolicyNetwork(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(PolicyNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        """Build a network that maps state -> action values."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        logits = torch.softmax(x, dim=1)
        return logits



    def sample_action(self, state):
        output = self.forward(state)
        dis = Categorical(output)
        action = dis.sample()
        log_probs  = dis.log_prob(action)
        ent = dis.entropy()
        return output, action, log_probs, ent


In [37]:
action_0 = torch.Tensor(1)
action_1 = torch.Tensor(1)
action_2 = torch.Tensor(1)
action_3 = torch.Tensor(1)
all_actions = [action_0, action_1, action_2, action_3]

In [40]:
action_0  = action_0 * 0
action_1 = action_1 * 0 + 1
action_2 = action_2 * 0 + 2
action_3 = action_3 * 0 + 3

action_0, action_1, action_2, action_3

(tensor([-0.]), tensor([1.]), tensor([2.]), tensor([3.]))

In [23]:
for idx, action in enumerate(all_actions):
    print(idx, action)

0 tensor([])
1 tensor([-1.2245e+33])
2 tensor([-1.3333e-06,  3.0716e-41])
3 tensor([-1.2245e+33,  4.5879e-41, -1.2245e+33])
