In [2]:
import gym
import numpy as np
import torch
from cartpole import RandomPolicy

from gym.envs.classic_control.cartpole import CartPoleEnv

In [11]:
class InfiniteCartPole(CartPoleEnv):
    """CartPole variant whose episodes never end.

    The underlying environment's termination signal is turned into a reward
    signal instead of an episode boundary: every step on which the base
    environment reports termination yields -1.0, every other step yields +1.0.
    The caller is responsible for bounding the episode length.
    """

    def step(self, action):
        """Advance the base environment by one step and rewrite reward/flags.

        Args:
            action: action forwarded unchanged to ``CartPoleEnv.step``.

        Returns:
            A 5-tuple ``(obs, reward, terminated, truncated, info)`` where
            ``terminated`` and ``truncated`` are always ``False``, ``reward``
            is -1.0 if the base environment reported termination on this step
            and 1.0 otherwise, and ``info`` is the base environment's info.
        """
        # The base reward is discarded; only the termination flag is used.
        obs, _, terminated, _, info = super().step(action)
        reward = -1.0 if terminated else 1.0
        # Keep the (obs, reward, terminated, truncated, info) slot order so
        # generic wrappers/consumers see a bool in the truncated slot and the
        # info mapping in the last slot.
        return obs, reward, False, False, info


# first_states, states, actions, next_states, rewards, step_num
def evaluate_policy_on_cartpole(
    policy: RandomPolicy, episodes=200, max_steps=250, gamma=0.99
):
    """Roll out ``policy`` on ``InfiniteCartPole`` and record every transition.

    Every episode runs for exactly ``max_steps`` steps; the termination flag
    returned by the environment is ignored. The average undiscounted and
    discounted episode returns are printed once all episodes have finished.

    Args:
        policy: object providing ``select_action(state_tensor)`` (whose result
            supports ``.item()``) and a ``_device`` attribute used to place
            the state tensor.
        episodes: number of episodes to run. The averages divide by this
            value, so it must be non-zero.
        max_steps: number of environment steps per episode (default 250).
        gamma: discount factor applied per step to the discounted return
            (default 0.99).

    Returns:
        A list with one row per step, laid out as
        ``[first_state, state, action, next_state, reward, step_num]`` where
        ``first_state`` is the episode's initial observation and ``step_num``
        is the zero-based step index within the episode.
    """
    env = InfiniteCartPole()
    total_reward = 0
    total_disc_reward = 0
    ds = []

    # Make sure the environment is released even if the policy or env raises.
    try:
        for _ in range(episodes):
            # reset() returns (observation, info); only the observation is used.
            state = env.reset()[0]
            first_state = state
            episode_reward = 0
            episode_disc_reward = 0

            for step in range(max_steps):
                old_state = state
                # Add a leading batch dimension of size 1 for the policy.
                state_tensor = (
                    torch.tensor(state, dtype=torch.float32)
                    .to(policy._device)
                    .unsqueeze(0)
                )
                action = policy.select_action(state_tensor).item()

                state, reward, _, _, _ = env.step(action)
                episode_disc_reward += reward * (gamma**step)
                episode_reward += reward
                ds.append([first_state, old_state, action, state, reward, step])

            total_reward += episode_reward
            total_disc_reward += episode_disc_reward
    finally:
        env.close()

    average_reward = total_reward / episodes
    average_disc_reward = total_disc_reward / episodes
    print(f"Average reward over {episodes} episodes: {average_reward}")
    print(f"Average discounted reward over {episodes} episodes: {average_disc_reward}")

    return ds

In [12]:
# Roll out a RandomPolicy with the default number of episodes and keep the
# returned list of per-step transition rows.
ds = evaluate_policy_on_cartpole(RandomPolicy())

  logger.warn(


Average reward over 200 episodes: -207.36
Average discounted reward over 200 episodes: -55.32634929046574


In [13]:
# Each row mixes arrays and scalars, so convert to an explicit object array
# before saving; handing np.save the raw nested list relies on implicit ragged
# array creation (deprecated, and an error on newer NumPy releases).
np.save("cartpole_ds.npy", np.array(ds, dtype=object), allow_pickle=True)
# NOTE: allow_pickle=True unpickles arbitrary objects on load; only load files
# produced by this notebook, never files from an untrusted source.
new_ds = np.load("cartpole_ds.npy", allow_pickle=True)

  arr = np.asanyarray(arr)


In [65]:
import numpy as np

# Column layout of each row:
# first_state, state, action, next_state, reward, step_num.
# Work on an object-array view so column slicing works whether `ds` is still
# the raw list of rows or has already been converted to an array.
ds_arr = np.asarray(ds, dtype=object)

first_states = torch.tensor(np.stack(ds_arr[:, 0]))
states = torch.tensor(np.stack(ds_arr[:, 1]))
# Actions get a trailing dimension of size 1.
actions = torch.tensor(ds_arr[:, 2].astype(np.int64)).unsqueeze(1)
# Converted to a tensor like the other state columns.
next_states = torch.tensor(np.stack(ds_arr[:, 3]))
rewards = torch.tensor(ds_arr[:, 4].astype(np.float32))
step_num = torch.tensor(ds_arr[:, 5].astype(np.int64))

array([1.0, 1.0, 1.0, ..., 1.0, 1.0, 1.0], dtype=object)

In [59]:
# NOTE(review): scratch check, presumably from an earlier kernel state in which
# `actions` was still a NumPy array (`.astype` is an ndarray method). The
# conversion cell above binds `actions` to a torch tensor — confirm this cell
# is still needed before keeping it.
torch.LongTensor(actions.astype(int))

tensor([1, 0, 0,  ..., 1, 0, 1])

In [36]:
import numpy as np

# Rebind ds to a NumPy object array so columns can be sliced as ds[:, k].
# NOTE(review): the same conversion appears again in a later cell (In [17]);
# one copy, placed before the first ds[:, k] use, would be enough.
ds = np.array(ds, dtype=object)

In [48]:
# Stack the first column of ds into a single array and inspect its shape.
np.stack(ds[:, 0]).shape

(22563, 4)

In [50]:
# Shape of the first column of ds before stacking (one entry per row).
ds[:, 0].shape

(22563,)

In [33]:
# Preview only the first few rows; displaying the whole dataset floods the
# notebook output and bloats the saved file.
ds[:5]

[[tensor([-0.0051, -0.0212,  0.0258,  0.0236]),
  tensor([-0.0051, -0.0212,  0.0258,  0.0236]),
  tensor([1]),
  tensor([-0.0055,  0.1736,  0.0263, -0.2608]),
  tensor(1.),
  tensor(0)],
 [tensor([-0.0051, -0.0212,  0.0258,  0.0236]),
  tensor([-0.0055,  0.1736,  0.0263, -0.2608]),
  tensor([0]),
  tensor([-0.0020, -0.0219,  0.0211,  0.0401]),
  tensor(1.),
  tensor(1)],
 [tensor([-0.0051, -0.0212,  0.0258,  0.0236]),
  tensor([-0.0020, -0.0219,  0.0211,  0.0401]),
  tensor([1]),
  tensor([-0.0025,  0.1729,  0.0219, -0.2459]),
  tensor(1.),
  tensor(2)],
 [tensor([-0.0051, -0.0212,  0.0258,  0.0236]),
  tensor([-0.0025,  0.1729,  0.0219, -0.2459]),
  tensor([0]),
  tensor([ 0.0010, -0.0225,  0.0169,  0.0536]),
  tensor(1.),
  tensor(3)],
 [tensor([-0.0051, -0.0212,  0.0258,  0.0236]),
  tensor([ 0.0010, -0.0225,  0.0169,  0.0536]),
  tensor([1]),
  tensor([ 0.0005,  0.1723,  0.0180, -0.2337]),
  tensor(1.),
  tensor(4)],
 [tensor([-0.0051, -0.0212,  0.0258,  0.0236]),
  tensor([ 0.0005

In [17]:
import numpy as np

# Rebind ds to a NumPy object array.
# NOTE(review): duplicate of the conversion in an earlier cell (In [36]);
# consider keeping only one.
ds = np.array(ds, dtype=object)

In [23]:
# Convert the first field of the first row of ds to a tensor for inspection.
torch.tensor(ds[0][0])

tensor([-0.0171, -0.0190, -0.0340,  0.0469])

In [None]:
# np.save requires the array to write as its second argument; without it the
# call raises TypeError. Save the collected transitions as an object array.
# NOTE(review): assumes `ds` is the dataset meant to be written here — confirm.
np.save("cartpole.npy", np.asarray(ds, dtype=object), allow_pickle=True)

22878