In [0]:
#%cd drive/My\ Drive/Colab\ Notebooks/drlp

In [0]:
#%pip install -r requirements.txt 

In [0]:
import numpy as np
import ptan

In [0]:
q_vals = np.array([[1, 2, 3], [1, -1, 0]])

In [58]:
q_vals

array([[ 1,  2,  3],
       [ 1, -1,  0]])

In [0]:
selector = ptan.actions.ArgmaxActionSelector()

In [60]:
selector(q_vals)

array([2, 0])

In [0]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0)

In [62]:
selector(q_vals)

array([2, 0])

In [0]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)

In [64]:
selector(q_vals)

array([0, 1])

In [0]:
selector = ptan.actions.ProbabilityActionSelector()

In [66]:
# Fixed per-row action probabilities: every row sums to 1, the second row
# puts all of its mass on action 2 and the third row never allows action 2.
probs = np.array([
    [0.1, 0.8, 0.1],
    [0.0, 0.0, 1.0],
    [0.5, 0.5, 0.0],
])

# Call the selector ten times on the same matrix and print each result.
for _ in range(10):
  acts = selector(probs)
  print(acts)

[1 2 1]
[1 2 0]
[1 2 1]
[1 2 1]
[0 2 1]
[1 2 1]
[1 2 0]
[1 2 0]
[1 2 0]
[1 2 0]


In [0]:
import torch
import torch.nn as nn

In [0]:
class DQNNet(nn.Module):
  """Stub Q-network whose output does not depend on the input values.

  For an input with N rows, forward() returns an N x `actions` matrix with
  ones on the main diagonal and zeros everywhere else.
  """

  def __init__(self, actions: int):
    super().__init__()
    # Number of columns in the matrix produced by forward().
    self.actions = actions

  def forward(self, x):
    # Only the size of the first dimension of x is used; its contents are
    # never read.
    batch_size = x.shape[0]
    return torch.eye(batch_size, self.actions)

In [0]:
net = DQNNet(actions=3)

In [70]:
net(torch.zeros(2, 10))

tensor([[1., 0., 0.],
        [0., 1., 0.]])

In [0]:
selector = ptan.actions.ArgmaxActionSelector()

In [0]:
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector, device='cpu')

In [73]:
agent(torch.zeros(2, 5))

(array([0, 1]), [None, None])

In [0]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)

In [0]:
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)

In [76]:
agent(torch.zeros(10, 5))[0]

array([1, 2, 2, 2, 0, 1, 2, 1, 1, 2])

In [0]:
selector.epsilon = 0.5

In [78]:
agent(torch.zeros(10, 5))[0]

array([0, 0, 2, 0, 2, 2, 0, 0, 0, 2])

In [0]:
selector.epsilon = 0.1

In [80]:
agent(torch.zeros(10, 5))[0]

array([0, 1, 2, 0, 0, 0, 0, 0, 0, 0])

In [0]:
class PolicyNet(nn.Module):
  """Stub policy network that returns the same row for every input sample.

  forward() produces a float32 tensor with one row per input row and
  `actions` columns; columns 0 and 1 are set to 1, all others stay 0.
  """

  def __init__(self, actions: int):
    super().__init__()
    # Number of columns in the tensor produced by forward().
    self.actions = actions

  def forward(self, x):
    # Only the size of the first dimension of x is used.
    rows = x.shape[0]
    out = torch.zeros((rows, self.actions), dtype=torch.float32)
    # Fill the first two columns one at a time.
    for col in (0, 1):
      out[:, col] = 1
    return out

In [0]:
net = PolicyNet(actions=5)

In [83]:
net(torch.zeros(6, 10))

tensor([[1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.]])

In [0]:
selector = ptan.actions.ProbabilityActionSelector()

In [0]:
agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True)

In [86]:
agent(torch.zeros(6, 5))[0]

array([0, 1, 0, 2, 1, 0])

In [87]:
  torch.nn.functional.softmax(net(torch.zeros(1, 10)), dim=1)

tensor([[0.3222, 0.3222, 0.1185, 0.1185, 0.1185]])

In [88]:
!mv '/content/drive/My Drive/requirements.txt' .

mv: cannot stat '/content/drive/My Drive/requirements.txt': No such file or directory


In [0]:
import gym

In [0]:
class ToyEnv(gym.Env):
  """Tiny deterministic environment driven by an internal step counter.

  The observation is the step counter modulo the size of the observation
  space (5), the reward is the action cast to float, and the episode is
  reported as done once the counter reaches 10.
  """

  def __init__(self):
    super().__init__()
    self.observation_space = gym.spaces.Discrete(n=5)
    self.action_space = gym.spaces.Discrete(n=3)

    # Number of steps taken since the last reset().
    self.step_index = 0

  def reset(self):
    """Reset the step counter to zero and return it as the observation."""
    self.step_index = 0
    return self.step_index

  def step(self, action):
    """Advance one step; returns (observation, reward, done, info)."""
    obs_count = self.observation_space.n

    # Episode already finished: do not advance, give zero reward.
    if self.step_index == 10:
      return self.step_index % obs_count, 0.0, True, {}

    self.step_index += 1
    finished = self.step_index == 10
    return self.step_index % obs_count, float(action), finished, {}
    

In [0]:
from typing import *
from typing import Tuple

In [0]:
class DullAgent(ptan.agent.BaseAgent):
  """Agent that ignores observations and always answers with one action."""

  def __init__(self, action: int):
    # The action returned for every observation.
    self.action = action

  def __call__(self, observations: List[Any], state: Optional[List] = None):
    """Return one copy of the fixed action per observation.

    The `state` argument is passed back to the caller unchanged.
    """
    actions = []
    for _ in observations:
      actions.append(self.action)
    return actions, state

In [0]:
env = ToyEnv()
agent = DullAgent(action=1)


In [0]:
exp_source = ptan.experience.ExperienceSource(env=env, agent=agent, steps_count=2)

In [95]:
# Print the first three items yielded by the experience source, then stop.
for idx, exp in enumerate(exp_source):
  if idx >= 3:
    break
  print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))


In [96]:
# Print the first sixteen items yielded by the experience source, then stop.
for idx, exp in enumerate(exp_source):
  if idx >= 16:
    break
  print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(Experience(state=4, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=True))
(E

In [0]:
exp_source = ptan.experience.ExperienceSource(env=env, agent=agent, steps_count=4)

In [98]:
next(iter(exp_source))

(Experience(state=0, action=1, reward=1.0, done=False),
 Experience(state=1, action=1, reward=1.0, done=False),
 Experience(state=2, action=1, reward=1.0, done=False),
 Experience(state=3, action=1, reward=1.0, done=False))

In [0]:
exp_source = ptan.experience.ExperienceSource(env=[env, env], agent=agent, steps_count=4)

In [100]:
# Print the first five items yielded by the experience source, then stop.
for idx, exp in enumerate(exp_source):
  if idx >= 5:
    break
  print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=True))
(Experience(state=4, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experi

In [0]:
import ptan

In [0]:
exp_source = ptan.experience.ExperienceSourceFirstLast(env=env, agent=agent, gamma=1.0, steps_count=1)

In [107]:
# Print the first eleven items yielded by the experience source, then stop.
for idx, exp in enumerate(exp_source):
  if idx >= 11:
    break
  print(exp)

ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)


In [0]:
%cat javascript.js

In [0]:
exp_source = ptan.experience.ExperienceSourceFirstLast(env=env, agent=agent, gamma=1.0, steps_count=2)

In [110]:
# Print the first eleven items yielded by the experience source, then stop.
for idx, exp in enumerate(exp_source):
  if idx >= 11:
    break
  print(exp)

ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=0)
ExperienceFirstLast(state=4, action=1, reward=2.0, last_state=1)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=None)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)


In [0]:
env = ToyEnv()

In [0]:
agent = DullAgent(action=1)

In [0]:
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1.0, steps_count=1)

In [0]:
buffer = ptan.experience.ExperienceReplayBuffer(experience_source=exp_source, buffer_size=100)

In [122]:
len(buffer)

0

In [123]:
# Six iterations; each one calls buffer.populate(1) and then samples.
for step in range(6):
  buffer.populate(1)

  if len(buffer) >= 5:
    # Enough items stored: draw a batch of four and print each sample.
    batch = buffer.sample(4)
    print('Train time, %d batch samples:' % len(batch))
    for s in batch:
      print(s)
  else:
    # Fewer than five items stored: draw and show a single sample.
    batch = buffer.sample(1)
    print(' This is experiement->', batch)

 This is experiement-> [ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)]
 This is experiement-> [ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)]
 This is experiement-> [ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)]
 This is experiement-> [ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)]
Train time, 4 batch samples:
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
Train time, 4 batch samples:
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)


In [0]:
 # The TargetNet Class

In [0]:
class DQNNet(nn.Module):
  """Minimal network: a single linear layer mapping 5 inputs to 3 outputs."""

  def __init__(self):
    super().__init__()
    # The only layer of the network.
    self.ff = nn.Linear(in_features=5, out_features=3)

  def forward(self, x):
    out = self.ff(x)
    return out

In [0]:
net = DQNNet()

In [127]:
net

DQNNet(
  (ff): Linear(in_features=5, out_features=3, bias=True)
)

In [0]:
tgt_net = ptan.agent.TargetNet(net)

In [130]:
net.ff.weight

Parameter containing:
tensor([[ 0.0539,  0.4224,  0.1722,  0.2179, -0.3346],
        [-0.4162,  0.0744, -0.1779,  0.0076, -0.2441],
        [-0.3310,  0.1990, -0.2176, -0.1843, -0.0221]], requires_grad=True)

In [131]:
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[ 0.0539,  0.4224,  0.1722,  0.2179, -0.3346],
        [-0.4162,  0.0744, -0.1779,  0.0076, -0.2441],
        [-0.3310,  0.1990, -0.2176, -0.1843, -0.0221]], requires_grad=True)

In [0]:
net.ff.weight.data += 1.0

In [134]:
net.ff.weight

Parameter containing:
tensor([[1.0539, 1.4224, 1.1722, 1.2179, 0.6654],
        [0.5838, 1.0744, 0.8221, 1.0076, 0.7559],
        [0.6690, 1.1990, 0.7824, 0.8157, 0.9779]], requires_grad=True)

In [135]:
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[ 0.0539,  0.4224,  0.1722,  0.2179, -0.3346],
        [-0.4162,  0.0744, -0.1779,  0.0076, -0.2441],
        [-0.3310,  0.1990, -0.2176, -0.1843, -0.0221]], requires_grad=True)

In [0]:
tgt_net.sync()

In [137]:
net.ff.weight

Parameter containing:
tensor([[1.0539, 1.4224, 1.1722, 1.2179, 0.6654],
        [0.5838, 1.0744, 0.8221, 1.0076, 0.7559],
        [0.6690, 1.1990, 0.7824, 0.8157, 0.9779]], requires_grad=True)

In [139]:
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[1.0539, 1.4224, 1.1722, 1.2179, 0.6654],
        [0.5838, 1.0744, 0.8221, 1.0076, 0.7559],
        [0.6690, 1.1990, 0.7824, 0.8157, 0.9779]], requires_grad=True)

In [0]:
tgt_net.alpha_sync(alpha=0.1)

In [142]:
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[1.0539, 1.4224, 1.1722, 1.2179, 0.6654],
        [0.5838, 1.0744, 0.8221, 1.0076, 0.7559],
        [0.6690, 1.1990, 0.7824, 0.8157, 0.9779]], requires_grad=True)