<a href="https://colab.research.google.com/github/AiProcess/CartPole_RL/blob/main/cart_pole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torchrl
!pip install tensordict

In [2]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Default device is {device}")

Default device is cpu


# Environment

In [3]:
from torchrl.envs import GymEnv
from torchrl.envs.transforms import StepCounter, TransformedEnv
from torchrl.record import VideoRecorder, CSVLogger

## Main Env

In [None]:
# Training environment: the Gym CartPole-v1 task wrapped with a StepCounter
# transform, then moved to the selected device.
cartpole_env = GymEnv(env_name="CartPole-v1")
env = TransformedEnv(env=cartpole_env, transform=StepCounter()).to(device)

## Test Env

In [None]:
# Evaluation environment: CartPole-v1 rendered from pixels (while keeping the
# non-pixel observations, since pixels_only=False) and wrapped with a
# VideoRecorder that writes mp4 videos through a CSV logger under "cart_pole".
csv_logger = CSVLogger(exp_name="test", log_dir="cart_pole", video_format="mp4")
video_recorder = VideoRecorder(logger=csv_logger, tag="video")

pixel_env = GymEnv(env_name="CartPole-v1", from_pixels=True, pixels_only=False)
test_env = TransformedEnv(env=pixel_env, transform=video_recorder).to(device)

In [None]:
# env.rollout(max_steps=5)

# Agent

In [7]:
from torch import nn
import torch.nn.functional as F
from tensordict.nn import TensorDictModule, TensorDictSequential
from torchrl.modules import EGreedyModule, QValueModule

## Policy

In [8]:
class MLP_Model(nn.Module):
    """Three-layer fully connected network with 64 hidden units per layer.

    Used below as the action-value (Q) network: it maps an observation vector
    to one value per action.

    Args:
        in_features: Size of the last dimension of the input tensor.
        out_features: Number of output values produced per input.
    """

    def __init__(self, in_features, out_features) -> None:
        super().__init__()
        self.layer1 = nn.Linear(in_features=in_features, out_features=64)
        self.layer2 = nn.Linear(in_features=64, out_features=64)
        self.layer3 = nn.Linear(in_features=64, out_features=out_features)

    def forward(self, x):
        """Run the network on ``x`` and return the raw (unbounded) outputs.

        Args:
            x: Tensor whose last dimension is ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``out_features``.
        """
        y = F.relu(self.layer1(x))
        y = F.relu(self.layer2(y))
        # No activation on the output layer: action values must be free to
        # take any sign. A trailing ReLU would clamp them at zero and block
        # gradients through the head whenever an output saturates.
        return self.layer3(y)

In [9]:
# Network sizes are read from the environment specs. The last spec dimension
# is used (rather than the first) so the sizes stay correct if the specs ever
# carry leading batch dimensions; for this unbatched env both are identical.
n_observations = env.observation_spec["observation"].shape[-1]
n_actions = env.action_spec.shape[-1]

value_mlp = MLP_Model(in_features=n_observations, out_features=n_actions)

# Wrap the MLP so it reads "observation" from a TensorDict and writes its
# output under "action_value".
value_net = TensorDictModule(
    module=value_mlp,
    in_keys=["observation"],
    out_keys=["action_value"],
)

# Epsilon-greedy exploration starting at eps=0.5, annealed over 100k steps.
exploration_module = EGreedyModule(
    env.action_spec, annealing_num_steps=100_000, eps_init=0.5
)

# Full exploration policy: action values -> action selection -> epsilon-greedy.
policy_explore = TensorDictSequential(
    value_net,
    QValueModule(spec=env.action_spec),
    exploration_module,
).to(device)

## Data Collection and Replay Buffer

In [None]:
from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyTensorStorage, ReplayBuffer

In [None]:
# Tunable collection settings, grouped so they are easy to spot and change.
# NOTE(review): total_frames=-1 presumably means "collect indefinitely" and
# init_random_frames the number of initial frames gathered before the policy
# is used — confirm against the TorchRL collector documentation.
collector_settings = dict(
    frames_per_batch=128,
    total_frames=-1,
    init_random_frames=5000,
)

collector = SyncDataCollector(
    create_env_fn=env,
    policy=policy_explore,
    **collector_settings,
)

In [15]:
# Display the collector's repr to inspect the wrapped env and policy modules.
collector

SyncDataCollector(
    env=TransformedEnv(
        env=GymEnv(env=CartPole-v1, batch_size=torch.Size([]), device=cpu),
        transform=StepCounter(keys=[])),
    policy=TensorDictSequential(
        module=ModuleList(
          (0): TensorDictModule(
              module=MLP_Model(
                (layer1): Linear(in_features=4, out_features=64, bias=True)
                (layer2): Linear(in_features=64, out_features=64, bias=True)
                (layer3): Linear(in_features=64, out_features=2, bias=True)
              ),
              device=cpu,
              in_keys=['observation'],
              out_keys=['action_value'])
          (1): QValueModule()
          (2): EGreedyModule()
        ),
        device=cpu,
        in_keys=['observation'],
        out_keys=['action_value', 'chosen_action_value', 'action']),
    td_out=TensorDict(
        fields={
            action: Tensor(shape=torch.Size([128, 2]), device=cpu, dtype=torch.int64, is_shared=False),
            action_value:

In [None]:
# Replay buffer backed by a tensor storage capped at 100_000 entries.
storage = LazyTensorStorage(max_size=100_000)
rb = ReplayBuffer(storage=storage)

In [18]:
# Display the replay buffer's repr (storage, sampler and writer configuration).
rb

ReplayBuffer(
    storage=LazyTensorStorage(
        data=<empty>, 
        shape=None, 
        len=0, 
        max_size=100000), 
    sampler=RandomSampler(), 
    writer=RoundRobinWriter(cursor=0, full_storage=False), 
    batch_size=None, 
    collate_fn=<function _collate_id at 0x7a0413fdec20>)

## Loss module and Optimizer

# Training Loop

# Results