In [1]:
import numpy as np

In [6]:
a = [1,2]

In [9]:
b = np.array(a)
b.shape

(2,)

In [10]:
c = np.ndarray(a)
c.shape

(1, 2)

In [5]:
a

[1, 2, 3]

In [14]:
d = np.mean(a[-10:])
d

1.5

In [1]:
from IMPORTS import *

In [14]:
env_id = 'CartPole-v1'
env = gym.make(env_id)
obs = env.reset()
obs

array([ 1.08632864e-02, -3.04061511e-02, -3.11447187e-02, -4.99648067e-06])

In [15]:
obs.astype('double')

array([ 1.08632864e-02, -3.04061511e-02, -3.11447187e-02, -4.99648067e-06])

In [16]:
torch.tensor(obs, dtype=torch.double)

tensor([ 1.0863e-02, -3.0406e-02, -3.1145e-02, -4.9965e-06],
       dtype=torch.float64)

In [19]:
class Model(nn.Module):
    def __init__(self, dim_observation, n_actions):
        super(Model, self).__init__()

        self.n_actions = n_actions
        self.dim_observation = dim_observation

        self.net = nn.Sequential(
            nn.Linear(in_features=self.dim_observation, out_features=16),
            nn.ReLU(),
            nn.Linear(in_features=16, out_features=8),
            nn.ReLU(),
            nn.Linear(in_features=8, out_features=self.n_actions),
            nn.Softmax(dim=0)
        )

    def forward(self, state):
        return self.net(state)

    def select_action(self, state):
        # action = torch.multinomial(self.forward(state), 1)
        # return action

        # state = torch.from_numpy(state).float().unsqueeze(0)
        # probs = self.forward(Variable(state))

        tnsr = torch.tensor(state, dtype=torch.double)
        self.net.double()
        curr_out = self(tnsr)
        highest_prob_action = np.random.choice(self.n_actions, p=np.squeeze(curr_out.detach().numpy()))
        log_prob = torch.log(tnsr.squeeze(0)[highest_prob_action])
        return highest_prob_action, log_prob
    


In [20]:
class RLDataset(torch.utils.data.IterableDataset):
    """
    Iterable Dataset containing the ReplayBuffer
    which will be updated with new experiences during training
    Args:
        model: model of the agent
        sample_size: number of experiences to sample at a time
    """

    def __init__(self, model: Model, env, sample_size: int = 1) -> None:
        self.model = model
        self.env = env
        self.sample_size = sample_size

    def __iter__(self):  # -> Tuple:
        max_steps = 10000
        state = self.env.reset()
        log_probs = []
        rewards = []

        for steps in range(max_steps):
            action, log_prob = self.model.select_action(state)
            new_state, reward, done, _ = self.env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            if done:
                break
        yield rewards, log_probs

In [22]:
env_id = 'CartPole-v1'
env = gym.make(env_id)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
net = Model(obs_size, n_actions)
dataset = RLDataset(net, env)
dataloader = DataLoader(dataset=dataset)

In [35]:
for i_batch, sample_batched in enumerate(dataloader):
    # print(i_batch, sample_batched['image'].size(),
    #       sample_batched['landmarks'].size())
    print(f'num of batch = {i_batch}')
    print(f'len of sample in batch = {len(sample_batched[0])}')
    print('-------------------')
    print(i_batch, sample_batched[0])
    print('-------------------')
    print(i_batch, sample_batched[1])
    print('-------------------')

num of batch = 0
len of sample in batch = 25
-------------------
0 [tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64), tensor([1.], dtype=torch.float64)]
-------------------
0 [tensor([-3.0295], dtype=torch.floa